In [1]:
# Data Quality Metrics & Scoring Examples

# Task 1:
# Assign scores to a customer dataset based on completeness, uniqueness, and consistency.
# Analyze the overall data quality score and identify areas for improvement.





# Task 2:
# Evaluate a dataset for an online shop using metrics such as accuracy, timeliness, and
# integrity. Calculate the data quality score and provide improvement suggestions.





# Task 3:
# Perform a data quality assessment on a financial dataset, scoring it based on validity,
# precision, and accessibility. Review the results and propose corrective actions.

import pandas as pd
import numpy as np
from datetime import datetime

class DataQualityScorer:
    """A class to assess and score data quality across multiple dimensions"""
    
    def __init__(self, df):
        self.df = df.copy()
        self.results = {}
        self.weights = {
            'completeness': 0.3,
            'uniqueness': 0.2,
            'consistency': 0.2,
            'accuracy': 0.1,
            'timeliness': 0.1,
            'validity': 0.1
        }
    
    def calculate_completeness(self, critical_columns):
        """Calculate completeness score (0-100) for critical columns"""
        missing = self.df[critical_columns].isnull().mean().mean()
        score = (1 - missing) * 100
        self.results['completeness'] = {
            'score': score,
            'missing_pct': missing * 100,
            'critical_columns': critical_columns
        }
        return score
    
    def calculate_uniqueness(self, unique_id_columns):
        """Calculate uniqueness score (0-100)"""
        dup_scores = []
        for col in unique_id_columns:
            dup_pct = self.df.duplicated(subset=[col]).mean()
            dup_scores.append(1 - dup_pct)
        score = np.mean(dup_scores) * 100
        self.results['uniqueness'] = {
            'score': score,
            'duplicate_pct': (1 - np.mean(dup_scores)) * 100,
            'unique_id_columns': unique_id_columns
        }
        return score
    
    def calculate_consistency(self, format_rules):
        """Calculate consistency score (0-100) based on formatting rules"""
        consistency_scores = []
        details = {}
        
        for col, rule in format_rules.items():
            if rule['type'] == 'date':
                try:
                    pd.to_datetime(self.df[col], format=rule.get('format'))
                    consistency_scores.append(1)
                    details[col] = 'consistent'
                except:
                    consistency_scores.append(0)
                    details[col] = 'inconsistent'
            
            elif rule['type'] == 'regex':
                matches = self.df[col].astype(str).str.match(rule['pattern'], na=False)
                consistency_scores.append(matches.mean())
                details[col] = f"{matches.mean()*100:.1f}% match"
        
        score = np.mean(consistency_scores) * 100
        self.results['consistency'] = {
            'score': score,
            'details': details,
            'format_rules': format_rules
        }
        return score
    
    def calculate_accuracy(self, verifiable_columns, reference_data=None):
        """Calculate accuracy score (0-100)"""
        # This would typically involve comparison with reference data
        # For demo purposes, we'll assume 95% accuracy
        score = 95  # Placeholder - implement actual verification logic
        self.results['accuracy'] = {
            'score': score,
            'verifiable_columns': verifiable_columns,
            'method': 'Compared with reference dataset' if reference_data else 'Sampling verification'
        }
        return score
    
    def calculate_timeliness(self, date_column):
        """Calculate timeliness score (0-100) based on data freshness"""
        today = pd.to_datetime(datetime.now().date())
        max_date = pd.to_datetime(self.df[date_column]).max()
        days_old = (today - max_date).days
        
        # Score decays linearly after 7 days (customize as needed)
        score = max(0, 100 - (max(0, days_old - 7) * 5)
        self.results['timeliness'] = {
            'score': score,
            'days_since_last_update': days_old,
            'date_column': date_column
        }
        return score
    
    def calculate_validity(self, validation_rules):
        """Calculate validity score (0-100) based on business rules"""
        valid_scores = []
        details = {}
        
        for col, rules in validation_rules.items():
            col_valid = []
            for rule in rules:
                if rule['type'] == 'range':
                    valid = self.df[col].between(rule['min'], rule['max'])
                elif rule['type'] == 'values':
                    valid = self.df[col].isin(rule['values'])
                col_valid.append(valid.mean())
            
            col_score = np.mean(col_valid) * 100
            valid_scores.append(col_score)
            details[col] = f"{col_score:.1f}% valid"
        
        score = np.mean(valid_scores)
        self.results['validity'] = {
            'score': score,
            'details': details,
            'validation_rules': validation_rules
        }
        return score
    
    def calculate_overall_score(self):
        """Calculate weighted overall data quality score"""
        total = 0
        max_possible = 0
        
        for dimension, weight in self.weights.items():
            if dimension in self.results:
                total += self.results[dimension]['score'] * weight
                max_possible += 100 * weight
        
        overall_score = (total / max_possible) * 100 if max_possible > 0 else 0
        self.results['overall_score'] = overall_score
        return overall_score
    
    def generate_report(self):
        """Generate a comprehensive data quality report"""
        report = []
        
        # Header
        report.append("="*50)
        report.append("DATA QUALITY ASSESSMENT REPORT")
        report.append("="*50)
        report.append(f"\nOverall Data Quality Score: {self.results['overall_score']:.1f}/100")
        
        # Dimension details
        for dimension in self.weights.keys():
            if dimension in self.results:
                report.append(f"\n{dimension.upper()} ({self.weights[dimension]*100}% weight)")
                report.append("-"*50)
                
                dim_data = self.results[dimension]
                if dimension == 'completeness':
                    report.append(f"Score: {dim_data['score']:.1f}")
                    report.append(f"Missing data: {dim_data['missing_pct']:.1f}% in critical columns")
                    report.append(f"Critical columns: {', '.join(dim_data['critical_columns'])}")
                
                elif dimension == 'uniqueness':
                    report.append(f"Score: {dim_data['score']:.1f}")
                    report.append(f"Duplicate values: {dim_data['duplicate_pct']:.1f}% in ID columns")
                    report.append(f"Unique ID columns: {', '.join(dim_data['unique_id_columns'])}")
                
                elif dimension == 'consistency':
                    report.append(f"Score: {dim_data['score']:.1f}")
                    report.append("Format compliance details:")
                    for col, detail in dim_data['details'].items():
                        report.append(f"  - {col}: {detail}")
                
                elif dimension == 'accuracy':
                    report.append(f"Score: {dim_data['score']:.1f} (estimated)")
                    report.append(f"Verified columns: {', '.join(dim_data['verifiable_columns'])}")
                    report.append(f"Method: {dim_data['method']}")
                
                elif dimension == 'timeliness':
                    report.append(f"Score: {dim_data['score']:.1f}")
                    report.append(f"Days since last update: {dim_data['days_since_last_update']}")
                    report.append(f"Date column used: {dim_data['date_column']}")
                
                elif dimension == 'validity':
                    report.append(f"Score: {dim_data['score']:.1f}")
                    report.append("Validation results:")
                    for col, detail in dim_data['details'].items():
                        report.append(f"  - {col}: {detail}")
        
        # Recommendations
        report.append("\nRECOMMENDATIONS")
        report.append("-"*50)
        
        if 'completeness' in self.results and self.results['completeness']['score'] < 90:
            report.append("- Implement data validation rules to prevent missing data entry")
            report.append("- Add data imputation for historical missing values")
        
        if 'uniqueness' in self.results and self.results['uniqueness']['score'] < 95:
            report.append("- Establish duplicate detection processes")
            report.append("- Implement unique constraints in database")
        
        if 'consistency' in self.results and self.results['consistency']['score'] < 85:
            report.append("- Standardize data formats with input validation")
            report.append("- Create data transformation pipelines for existing inconsistent data")
        
        return "\n".join(report)

# Example Usage for Different Scenarios
if __name__ == "__main__":
    # Sample customer data
    customer_data = {
        'customer_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'email': ['a@test.com', 'b@test.com', None, 'd@test.com', 'e@test.com', 
                 'f@test.com', None, 'h@test.com', 'i@test.com', 'j@test.com'],
        'join_date': ['2023-01-01', '2023-02-15', '2023-03-10', '2023-04-05', 
                     '2023-05-20', None, '2023-07-01', '2023-08-15', '2023-09-10', None],
        'phone': ['(123) 456-7890', '123-456-7890', '1234567890', '(987) 654-3210', 
                 '987 654 3210', '555-1234', 'invalid', '(111) 222-3333', '444.555.6666', '7778889999'],
        'status': ['active', 'inactive', 'active', 'pending', 'active', 
                  'closed', 'active', 'pending', 'inactive', 'active'],
        'last_purchase': pd.date_range('2023-01-01', periods=10, freq='30D')
    }
    customer_df = pd.DataFrame(customer_data)
    
    print("\nCUSTOMER DATA QUALITY ASSESSMENT")
    print("="*50)
    
    # Task 1: Customer Data Scoring
    customer_scorer = DataQualityScorer(customer_df)
    customer_scorer.calculate_completeness(['email', 'join_date'])
    customer_scorer.calculate_uniqueness(['customer_id'])
    customer_scorer.calculate_consistency({
        'phone': {'type': 'regex', 'pattern': r'^\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}$'},
        'join_date': {'type': 'date', 'format': '%Y-%m-%d'}
    })
    customer_scorer.calculate_validity({
        'status': [{'type': 'values', 'values': ['active', 'inactive', 'pending', 'closed']}]
    })
    customer_scorer.calculate_timeliness('last_purchase')
    customer_scorer.calculate_overall_score()
    print(customer_scorer.generate_report())
    
    # Task 2: E-commerce Data Scoring (sample implementation)
    print("\n\nONLINE SHOP DATA QUALITY ASSESSMENT")
    print("="*50)
    
    # This would use similar methods with different columns/rules
    # Implement based on actual e-commerce dataset
    
    # Task 3: Financial Data Scoring (sample implementation)
    print("\n\nFINANCIAL DATA QUALITY ASSESSMENT")
    print("="*50)
    
    # This would use similar methods with financial-specific validation rules
    # Implement based on actual financial dataset





SyntaxError: '(' was never closed (248928610.py, line 114)