# Exercise 2: Data Schema Validator

### Importing libraries and schema

In [1]:
import json
import pandas as pd
import re

# Validation schema: one entry per table, each listing its columns together
# with the expected type, whether a value is mandatory, and value constraints.
schema = {
    "tables": [
        {
            "name": "customers",
            "columns": [
                {
                    "name": "customer_id",
                    "type": "string",
                    "required": True,
                    # Literal "CUS" followed by exactly six digits.
                    "validation": {"pattern": "^CUS[0-9]{6}$"},
                },
                {
                    "name": "purchase_amount",
                    "type": "decimal",
                    "required": True,
                    # Inclusive numeric bounds.
                    "validation": {"min": 0, "max": 1000000},
                },
            ],
        },
    ],
}



### 1. Create a function that extracts column details from the schema file:
- Use lambda functions to transform complex column definitions
- Create a mapping of table names to their column specifications

In [2]:
def column_details(schema):
    """Build a lookup from table name to a simplified list of column specs.

    Args:
        schema: Dict with a ``"tables"`` list; every table carries a ``"name"``
            and a ``"columns"`` list of column-definition dicts.

    Returns:
        dict: ``{table_name: [{'column_name', 'column_type', 'validation'}, ...]}``.
        A column without a ``"validation"`` entry gets an empty dict.
    """
    # Lambda that reshapes one raw column definition into the reported form.
    simplify = lambda spec: {
        'column_name': spec['name'],
        'column_type': spec['type'],
        'validation': spec.get('validation', {}),
    }

    details_by_table = {}
    for table in schema['tables']:
        details_by_table[table['name']] = list(map(simplify, table['columns']))

    return details_by_table


In [3]:
# Preview the table-name -> column-spec mapping extracted from the schema.
column_details(schema)

{'customers': [{'column_name': 'customer_id',
   'column_type': 'string',
   'validation': {'pattern': '^CUS[0-9]{6}$'}},
  {'column_name': 'purchase_amount',
   'column_type': 'decimal',
   'validation': {'min': 0, 'max': 1000000}}]}

### 2. Create a validation function that:
- Filters required columns using filter() and lambda
- Validates data types and constraints
- Returns validation errors in a structured format

In [4]:
def filter_column(columns):
    """Return only the column definitions that are marked as required.

    Args:
        columns: Iterable of column-definition dicts, as found under a table's
            ``"columns"`` key in the schema.

    Returns:
        list: The definitions whose ``"required"`` flag is truthy, in their
        original order. Empty or ``None`` entries, and columns that do not
        declare the flag, are dropped.
    """
    # An identity lambda would keep every non-empty definition; the predicate
    # has to look at the "required" flag. A missing flag means "optional",
    # mirroring the column.get('required', False) default used by validation().
    return list(filter(lambda col: bool(col) and bool(col.get('required', False)), columns))

In [5]:
def validation(schema, data):
    """Validate ``data`` against ``schema`` and return the errors as a table.

    For every column declared in the schema the function checks, value by
    value: presence (for required columns), Python data type, an optional
    regex ``pattern`` and optional numeric ``min`` / ``max`` bounds.

    Args:
        schema: Dict with a ``"tables"`` list. Each table has a ``"name"`` and
            a ``"columns"`` list; each column has ``"name"``, ``"type"``
            (``"string"`` or ``"decimal"``), an optional ``"required"`` flag
            and an optional ``"validation"`` dict with ``"pattern"`` and/or
            ``"min"`` / ``"max"``.
        data: ``pandas.DataFrame`` (or mapping of column name to a Series-like
            object exposing ``.items()``) holding the values to check.

    Returns:
        pandas.DataFrame: One row per error with the columns
        ``Error_severity``, ``Table_Name``, ``Column_Name`` and ``Error_Type``,
        in the order the errors were found. Empty when the data is valid.
    """
    error_fields = ['Error_severity', 'Table_Name', 'Column_Name', 'Error_Type']

    # Errors are collected as plain records and turned into a DataFrame once
    # at the end; concatenating a frame per error is quadratic.
    records = []

    def report(severity, table_name, column_name, error_type):
        records.append(dict(zip(error_fields, (severity, table_name, column_name, error_type))))

    for table in schema['tables']:
        table_name = table['name']

        for column in table['columns']:
            column_name = column['name']
            column_type = column['type']
            is_required = column.get('required', False)
            rules = column.get('validation') or {}

            # A column missing from the data is reported (when required) or
            # skipped (when optional) instead of raising KeyError.
            if column_name not in data:
                if is_required:
                    report('High', table_name, column_name, 'Missing Column')
                continue

            # Loop-invariant rule lookups; either bound may be absent.
            pattern = re.compile(rules['pattern']) if 'pattern' in rules else None
            lower = rules.get('min')
            upper = rules.get('max')

            for _, value in data[column_name].items():
                # Nulls are only an error for required columns; in both cases
                # there is nothing left to type- or range-check.
                if pd.isnull(value):
                    if is_required:
                        report('High', table_name, column_name, 'Missing Value')
                    continue

                is_number = isinstance(value, (int, float))

                # Data type check
                if column_type == "string" and not isinstance(value, str):
                    report('High', table_name, column_name, 'Data Type')
                elif column_type == "decimal" and not is_number:
                    report('High', table_name, column_name, 'Data Type')

                # Pattern rule
                if pattern is not None and not pattern.match(str(value)):
                    report('Medium', table_name, column_name, 'Pattern Mismatch')

                # Range rule: each bound is applied only when it is declared.
                if is_number:
                    below = lower is not None and value < lower
                    above = upper is not None and value > upper
                    if below or above:
                        report('Medium', table_name, column_name, 'Value Out of Range')

    return pd.DataFrame(records, columns=error_fields)


### 3. Create a function that sorts validation errors by:
- Error severity
- Table name
- Column name

In [6]:
def val_error_sort(val_errors, column_name='Error_severity'):
    """Sort validation errors by severity, then table name, then column name.

    Args:
        val_errors: DataFrame of validation errors with the columns
            ``Error_severity``, ``Table_Name`` and ``Column_Name``.
        column_name: Primary sort key (a column label or a list of labels).
            Defaults to ``'Error_severity'``. The remaining keys among
            severity, table name and column name are appended as tie-breakers
            when they exist in ``val_errors``.

    Returns:
        pandas.DataFrame: A sorted copy with a fresh 0..n-1 index.
    """
    # Plain alphabetical order would put "Low" between "High" and "Medium",
    # so severities are ranked explicitly; unknown labels sort last.
    severity_rank = {'High': 0, 'Medium': 1, 'Low': 2}

    sort_keys = [column_name] if isinstance(column_name, str) else list(column_name)
    for tie_breaker in ('Error_severity', 'Table_Name', 'Column_Name'):
        if tie_breaker not in sort_keys and tie_breaker in val_errors.columns:
            sort_keys.append(tie_breaker)

    def rank_severity(series):
        # sort_values applies the key to each sort column independently;
        # only the severity column is remapped.
        if series.name == 'Error_severity':
            return series.map(lambda label: severity_rank.get(label, len(severity_rank)))
        return series

    return val_errors.sort_values(
        by=sort_keys,
        ascending=True,
        ignore_index=True,
        key=rank_severity,
        kind='mergesort',  # stable: fully tied rows keep their input order
    )

### TEST CASES TO CHECK

In [7]:
# Valid fixture: every customer_id has the form "CUS" + six digits and every
# purchase_amount lies inside the schema's 0..1000000 range.
test_case_1 = pd.DataFrame({
    "customer_id": ["CUS123456", "CUS654321", "CUS000001"],
    "purchase_amount": [100.0, 999999.99, 500000.0]
})


In [8]:
# Invalid fixture: two customer_ids that do not match "CUS" + six digits and one
# missing id; one amount below the minimum, one above the maximum, one missing.
test_case_2 = pd.DataFrame({
    "customer_id": ["CUS12345", "INVALID_ID", None],  
    "purchase_amount": [-100.0, 2000000.0, None]    
})


In [9]:
# Validate the invalid fixture against the schema and display the error table.
df = validation(schema,test_case_2)
df

Unnamed: 0,Error_severity,Table_Name,Column_Name,Error_Type
0,Medium,customers,customer_id,Pattern Mismatch
1,Medium,customers,customer_id,Pattern Mismatch
2,High,customers,customer_id,Missing Value
3,Medium,customers,purchase_amount,Value Out of Range
4,Medium,customers,purchase_amount,Value Out of Range
5,High,customers,purchase_amount,Missing Value


In [10]:
# Display the validation errors sorted with the default key (Error_severity).
val_error_sort(df)

Unnamed: 0,Error_severity,Table_Name,Column_Name,Error_Type
0,High,customers,customer_id,Missing Value
1,High,customers,purchase_amount,Missing Value
2,Medium,customers,customer_id,Pattern Mismatch
3,Medium,customers,customer_id,Pattern Mismatch
4,Medium,customers,purchase_amount,Value Out of Range
5,Medium,customers,purchase_amount,Value Out of Range
