# Automated Data Quality Checks Implementation

## Objectives
1. Implement automated data quality checks
2. Set up validation rules for HR data
3. Create reporting mechanism for data quality issues

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import logging

# Configure the root logger once for the whole notebook/script:
#   - level INFO, so the pipeline's progress messages are emitted
#   - each record rendered as "<timestamp> - <LEVEL> - <message>"
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

In [2]:
# Load HR data files
def load_hr_data(data_files=None):
    """Load the HR extract CSV files into pandas DataFrames.

    Args:
        data_files: Optional mapping of dataset name -> CSV path (anything
            ``pandas.read_csv`` accepts). When omitted, the default dummy
            extracts dated 2024-12-16 are loaded from ``../../dummy_data``,
            resolved relative to the current working directory.

    Returns:
        dict mapping each dataset name to the DataFrame read from its file,
        in the same order as ``data_files``.

    Raises:
        Whatever ``pandas.read_csv`` raises for an unreadable or malformed
        file (e.g. ``FileNotFoundError``).
    """
    if data_files is None:
        data_files = {
            'people': '../../dummy_data/PER_ALL_PEOPLE_F_20241216.csv',
            'assignments': '../../dummy_data/PER_ALL_ASSIGNMENTS_F_20241216.csv',
            'addresses': '../../dummy_data/HR_ALL_ADDRESSES_20241216.csv',
        }

    return {name: pd.read_csv(path) for name, path in data_files.items()}

In [3]:
class HRDataValidator:
    """Stateless validation helpers for HR DataFrames and columns.

    The ``validate_*`` methods answer "is this value well-formed?" and never
    raise on missing data: a missing or non-text value is simply reported as
    not valid (``False``).
    """

    # Patterns are anchored so the whole value must match.
    EMAIL_PATTERN = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    # Two prefix letters (with the disallowed letters excluded), six digits,
    # and a suffix letter A-D; no embedded spaces are accepted.
    NINO_PATTERN = r'^[A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z][0-9]{6}[A-D]$'

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def _match_pattern(series, pattern):
        """Return a plain-bool Series: True where the value matches ``pattern``.

        The column is cast to pandas' string dtype first so that the ``.str``
        accessor is available even when the column is not text (for example an
        entirely empty CSV column, which is loaded as float64). ``na=False``
        makes missing values come back as False instead of NaN, so the result
        can be safely negated with ``~`` and summed.
        """
        as_text = series.astype("string")
        return as_text.str.match(pattern, na=False).astype(bool)

    def validate_date_format(self, date_str):
        """Return True if ``date_str`` parses as a ``YYYY-MM-DD`` date.

        Non-string input (``None``, NaN, numbers, ...) is reported as invalid
        rather than raising. Note that ``strptime`` treats the leading zero of
        month and day as optional, so ``'2024-1-5'`` is accepted as well.
        """
        try:
            datetime.strptime(date_str, '%Y-%m-%d')
        except (TypeError, ValueError):
            # ValueError: text that is not a valid date in this format.
            # TypeError: the value is not a string at all.
            return False
        return True

    def check_missing_values(self, df, threshold=0.1):
        """Return the columns whose missing-value fraction exceeds ``threshold``.

        Returns:
            Series indexed by column name holding the fraction (0.0-1.0) of
            missing cells, restricted to columns strictly above ``threshold``.
            A DataFrame with no rows yields an empty Series.
        """
        # Mean of the boolean null-mask is the per-column fraction missing.
        missing_fraction = df.isnull().mean()
        return missing_fraction[missing_fraction > threshold]

    def check_duplicates(self, df, subset=None):
        """Return how many rows repeat an earlier row.

        Args:
            df: DataFrame to inspect.
            subset: Optional column label(s) to compare on; all columns when
                None.
        """
        return df.duplicated(subset=subset).sum()

    def validate_numeric_range(self, series, min_val, max_val):
        """Return a boolean Series: True where ``min_val <= value <= max_val``.

        Both bounds are inclusive; missing values yield False.
        """
        return series.between(min_val, max_val)

    def validate_email_format(self, series):
        """Return a boolean Series: True where the value looks like an email.

        Uses a basic ``local@domain.tld`` pattern; missing values yield False.
        """
        return self._match_pattern(series, self.EMAIL_PATTERN)

    def validate_nino_format(self, series):
        """Return a boolean Series: True where the value is a well-formed
        National Insurance Number (e.g. ``AB123456C``); missing values yield
        False.
        """
        return self._match_pattern(series, self.NINO_PATTERN)

In [4]:
class HRDataQualityPipeline:
    """Runs the generic and dataset-specific quality checks over HR datasets."""

    def __init__(self):
        self.validator = HRDataValidator()
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def _invalid_mask(column, is_valid):
        """Return a bool Series: True where a value is present but not valid.

        ``is_valid`` may hold NaN for cells that could not be evaluated;
        ``eq(True)`` collapses those to False so the negation below is a
        well-defined boolean operation. Missing cells are excluded from the
        mask because they are already reported by the missing-values check.
        """
        return column.notna() & ~is_valid.eq(True)

    def _is_invalid_date(self, value):
        """Return True when ``value`` is present and is not a YYYY-MM-DD date."""
        if pd.isna(value):
            return False
        return not self.validator.validate_date_format(str(value))

    def run_quality_checks(self, data):
        """Run all quality checks on the HR data.

        Args:
            data: Mapping of dataset name -> DataFrame.

        Returns:
            dict keyed by dataset name. Each value is a dict with:
              - ``'missing_values'``: result of the validator's missing check
              - ``'duplicates'``: number of duplicate rows
              - ``'invalid_emails'`` / ``'invalid_ninos'``: boolean masks,
                only for the ``'people'`` dataset and only when the
                ``email`` / ``nino`` column exists
              - ``'invalid_dates_<column>'``: boolean mask for every
                object-dtype column whose name contains "date"
        """
        results = {}

        for dataset_name, df in data.items():
            self.logger.info("Running quality checks for %s", dataset_name)
            checks = {
                'missing_values': self.validator.check_missing_values(df),
                'duplicates': self.validator.check_duplicates(df),
            }

            # Specific checks for people dataset
            if dataset_name == 'people':
                if 'email' in df.columns:
                    checks['invalid_emails'] = self._invalid_mask(
                        df['email'], self.validator.validate_email_format(df['email'])
                    )
                if 'nino' in df.columns:
                    checks['invalid_ninos'] = self._invalid_mask(
                        df['nino'], self.validator.validate_nino_format(df['nino'])
                    )

            # Date checks: only text columns are inspected, since already
            # parsed datetime columns cannot be malformed strings.
            for col in df.select_dtypes(include=['object']).columns:
                # str() guards against non-string column labels.
                if 'date' in str(col).lower():
                    checks[f'invalid_dates_{col}'] = df[col].apply(self._is_invalid_date)

            results[dataset_name] = checks

        return results

In [5]:
class QualityReportGenerator:
    """Turns quality-check results into a human-readable text report."""

    # Key prefix under which per-column invalid-date masks are stored.
    _DATE_KEY_PREFIX = 'invalid_dates_'

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def generate_summary(self, quality_results):
        """Generate a summary report of data quality issues.

        Args:
            quality_results: Mapping of dataset name -> dict of check results.
                Each dict must contain ``'missing_values'`` (a Series of
                column -> missing fraction) and ``'duplicates'`` (a count);
                it may contain ``'invalid_emails'``, ``'invalid_ninos'`` and
                any number of ``'invalid_dates_<column>'`` boolean masks.

        Returns:
            The report as a single string. Every dataset gets a heading;
            a section is only written when it has at least one issue to show.
        """
        summary = []
        prefix = self._DATE_KEY_PREFIX

        for dataset_name, results in quality_results.items():
            summary.append(f"\nQuality Report for {dataset_name}:")
            summary.append("-" * 50)

            # Missing values
            missing = results['missing_values']
            if not missing.empty:
                summary.append("\nColumns with high missing values:")
                for col, pct in missing.items():
                    summary.append(f"  - {col}: {pct:.2%} missing")

            # Duplicates
            dupes = results['duplicates']
            if dupes > 0:
                summary.append(f"\nDuplicate records found: {dupes}")

            # Invalid emails
            if 'invalid_emails' in results:
                invalid_count = results['invalid_emails'].sum()
                if invalid_count > 0:
                    summary.append(f"\nInvalid email formats: {invalid_count}")

            # Invalid NINOs
            if 'invalid_ninos' in results:
                invalid_count = results['invalid_ninos'].sum()
                if invalid_count > 0:
                    summary.append(f"\nInvalid NINO formats: {invalid_count}")

            # Invalid dates: collect the per-column lines first so the section
            # header is only written when there is something to list under it.
            date_lines = []
            for key, invalid_mask in results.items():
                if not key.startswith(prefix):
                    continue
                invalid_count = invalid_mask.sum()
                if invalid_count > 0:
                    # Slice off only the leading prefix so a column name that
                    # itself contains the prefix text is reported intact.
                    col_name = key[len(prefix):]
                    date_lines.append(f"  - {col_name}: {invalid_count} invalid dates")
            if date_lines:
                summary.append("\nInvalid date formats:")
                summary.extend(date_lines)

        return '\n'.join(summary)

In [6]:
def main():
    """Load the HR extracts, run every quality check and print the report.

    Any failure while loading, checking or reporting is logged at ERROR level
    and then re-raised so the caller still sees the original exception.
    """
    # Build the collaborators up front; only the actual work is guarded.
    checker = HRDataQualityPipeline()
    reporter = QualityReportGenerator()

    try:
        datasets = load_hr_data()
        findings = checker.run_quality_checks(datasets)
        print(reporter.generate_summary(findings))
    except Exception as e:
        logging.error(f"Error in data quality pipeline: {str(e)}")
        raise


if __name__ == "__main__":
    main()

2024-12-17 14:28:37,145 - INFO - Running quality checks for people
2024-12-17 14:28:37,158 - INFO - Running quality checks for assignments
2024-12-17 14:28:37,168 - INFO - Running quality checks for addresses



Quality Report for people:
--------------------------------------------------

Invalid NINO formats: 989

Invalid date formats:

Quality Report for assignments:
--------------------------------------------------

Invalid date formats:

Quality Report for addresses:
--------------------------------------------------
