In [3]:
import pandas as pd
import numpy as np
import json
import os
import re
import logging
from typing import Dict, List, Tuple, Any, Optional
from great_expectations.core import ExpectationSuite
from great_expectations.dataset import PandasDataset
import unittest
from unittest.mock import patch, MagicMock

# Configure logging once at import time: INFO level, timestamped records,
# emitted through a single stream handler.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, handlers=[logging.StreamHandler()])

# Module-level logger used by every function below.
logger = logging.getLogger(__name__)

class DataQualityError(Exception):
    """Raised when a step of the data quality workflow cannot be completed."""

# Task 1: Understanding and Defining Data Quality Metrics
def calculate_basic_metrics(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Calculate basic data quality metrics: completeness, validity, uniqueness

    Completeness is computed for every column; validity and uniqueness are
    computed for the 'Email' column only and default to 0 (with a warning)
    when that column is absent.

    Args:
        df: Input DataFrame to analyze

    Returns:
        Dictionary with three keys:
            'completeness': {column: percentage of non-null values}
            'email_validity': percentage of rows whose Email is a string containing '@'
            'email_uniqueness': distinct Email values (null counted as one value)
                                as a percentage of the row count

    Raises:
        DataQualityError: If the DataFrame has no rows or the calculation fails
    """
    try:
        total_count = len(df)
        # Every metric divides by the row count, so an empty frame is rejected up front
        if total_count == 0:
            raise DataQualityError("Cannot calculate metrics on empty DataFrame")

        # Series.count() ignores nulls, so count / rows is the non-null share
        completeness = {
            column: (df[column].count() / total_count) * 100
            for column in df.columns
        }

        if 'Email' in df.columns:
            emails = df['Email']
            # Checked value by value instead of through the .str accessor: the
            # accessor raises on non-string columns (e.g. an all-null column
            # that was loaded as float), which used to abort the whole call.
            valid_emails = sum(isinstance(value, str) and '@' in value for value in emails)
            email_validity = (valid_emails / total_count) * 100
            # dropna=False keeps null as one distinct value
            email_uniqueness = (emails.nunique(dropna=False) / total_count) * 100
        else:
            email_validity = 0
            logger.warning("Email column not found in DataFrame. Setting email_validity to 0.")
            email_uniqueness = 0
            logger.warning("Email column not found in DataFrame. Setting email_uniqueness to 0.")

        return {
            'completeness': completeness,
            'email_validity': email_validity,
            'email_uniqueness': email_uniqueness
        }

    except DataQualityError:
        # Already the documented exception type; do not wrap it a second time
        raise
    except Exception as e:
        logger.error(f"Error calculating metrics: {str(e)}")
        raise DataQualityError(f"Failed to calculate data quality metrics: {str(e)}") from e

# Task 2: Calculating Data Quality Score
def calculate_quality_score(metrics: Dict[str, Any]) -> float:
    """
    Combine the individual metrics into one overall data quality score

    The score is the plain mean of three components: the average per-column
    completeness, the email validity and the email uniqueness.

    Args:
        metrics: Dictionary with 'completeness' (mapping of column to
            percentage), 'email_validity' and 'email_uniqueness'

    Returns:
        Overall data quality score as a percentage

    Raises:
        DataQualityError: If the completeness mapping is empty, a key is
            missing, or the calculation otherwise fails
    """
    try:
        per_column = metrics['completeness']
        if not per_column:
            raise DataQualityError("Completeness metrics are missing")

        # Mean completeness over all columns
        mean_completeness = sum(per_column.values()) / len(per_column)

        validity = metrics['email_validity']
        uniqueness = metrics['email_uniqueness']

        # Each of the three components carries equal weight
        return (mean_completeness + validity + uniqueness) / 3

    except Exception as e:
        logger.error(f"Error calculating quality score: {str(e)}")
        raise DataQualityError(f"Failed to calculate quality score: {str(e)}")

# Function to load or create sample data
def load_or_create_data(file_path: str = 'sample_data.csv', create_if_missing: bool = True) -> pd.DataFrame:
    """
    Read a CSV file, or build (and try to persist) a small sample dataset

    Args:
        file_path: Path to the CSV file
        create_if_missing: When the file is absent, create sample data
            instead of raising

    Returns:
        DataFrame read from the file, or the generated sample data

    Raises:
        FileNotFoundError: If the file is absent and create_if_missing is False
        DataQualityError: For any other failure while loading or creating data
    """
    try:
        if os.path.exists(file_path):
            logger.info(f"Loading data from {file_path}")
            return pd.read_csv(file_path)

        if not create_if_missing:
            raise FileNotFoundError(f"File {file_path} not found")

        logger.info(f"File {file_path} not found. Creating sample data.")
        # The sample deliberately contains nulls, an invalid email, a duplicate
        # email and a non-numeric age so the later steps have something to flag
        sample = pd.DataFrame({
            'Name': ['John Doe', 'Jane Smith', None, 'Robert Brown', 'Emily White', 'David Green'],
            'Email': ['john@example.com', 'jane.smith@company.org', 'invalid-email', None, 'emily@domain.com', 'emily@domain.com'],
            'Age': [34, 28, 45, None, 22, 'thirty']
        })

        # Persisting the sample is best effort: a failed write is only logged
        try:
            sample.to_csv(file_path, index=False)
            logger.info(f"Sample data saved to {file_path}")
        except Exception as e:
            logger.warning(f"Failed to save sample data to {file_path}: {str(e)}")

        return sample

    except FileNotFoundError:
        # Documented for callers; propagate unchanged
        raise
    except Exception as e:
        logger.error(f"Error loading or creating data: {str(e)}")
        raise DataQualityError(f"Failed to load or create data: {str(e)}")

# Task 3: Creating Expectations for a CSV
def create_expectations(df: Optional[pd.DataFrame] = None,
                       file_path: str = 'sample_data.csv',
                       suite_name: str = "sample_data_suite",
                       output_path: str = 'expectation_suite.json') -> ExpectationSuite:
    """
    Build a Great Expectations suite covering completeness, validity and uniqueness

    Expectations are only registered for the columns ('Name', 'Email', 'Age')
    that are actually present in the data.

    Args:
        df: Optional DataFrame to use (loaded from file_path when None)
        file_path: Path to the CSV file (used only when df is None)
        suite_name: Name for the expectation suite
        output_path: Where to save the expectation suite JSON

    Returns:
        ExpectationSuite object

    Raises:
        DataQualityError: If expectations creation fails
    """
    try:
        suite = ExpectationSuite(expectation_suite_name=suite_name)

        if df is None:
            df = load_or_create_data(file_path)

        # Expectations called on the dataset are recorded against the suite
        dataset = PandasDataset(df, expectation_suite=suite)

        # (column, expectation method, extra positional args, extra keyword args),
        # listed in the order they are registered
        planned = (
            # completeness
            ('Name', 'expect_column_values_to_not_be_null', (), {}),
            ('Email', 'expect_column_values_to_not_be_null', (), {}),
            ('Age', 'expect_column_values_to_not_be_null', (), {}),
            # validity
            ('Email', 'expect_column_values_to_match_regex', (r'.+@.+\..+',), {}),
            ('Age', 'expect_column_values_to_be_of_type', ('int64',), {'mostly': 0.9}),
            # uniqueness
            ('Email', 'expect_column_values_to_be_unique', (), {}),
        )
        for column, method_name, extra_args, extra_kwargs in planned:
            if column in df.columns:
                getattr(dataset, method_name)(column, *extra_args, **extra_kwargs)

        # Writing the JSON copy is best effort: a failure is only logged
        try:
            with open(output_path, 'w') as f:
                json.dump(dataset.get_expectation_suite().to_json_dict(), f, indent=2)
            logger.info(f"Expectation suite saved to {output_path}")
        except Exception as e:
            logger.warning(f"Failed to save expectation suite to {output_path}: {str(e)}")

        return dataset.get_expectation_suite()

    except Exception as e:
        logger.error(f"Error creating expectations: {str(e)}")
        raise DataQualityError(f"Failed to create expectations: {str(e)}")

# Task 4: Running and Validating Expectations
def _build_validation_report_html(validation_result) -> str:
    """Render the validation results as one standalone HTML page (returned as a string)."""
    from html import escape  # local import: only needed for report rendering

    rows = []
    for result in validation_result.results:
        expectation = result.expectation_config
        kwargs = expectation.kwargs

        details = f"Column: {kwargs.get('column', 'N/A')}"
        # 'mostly' may be stored as None; only show a threshold when one is set
        mostly = kwargs.get('mostly')
        if mostly is not None:
            details += f", Threshold: {mostly * 100}%"

        # Expectation and column text originates from the suite/data, so it is
        # escaped before being embedded in markup
        rows.append(
            "<tr>"
            f"<td>{escape(str(expectation.expectation_type))}</td>"
            f"<td>{'✅' if result.success else '❌'}</td>"
            f"<td>{escape(details)}</td>"
            "</tr>"
        )

    return "".join([
        "<html><head><meta charset='utf-8'></head><body>",
        "<h1>Data Validation Report</h1>",
        f"<p>Validation successful: {validation_result.success}</p>",
        "<h2>Results</h2>",
        "<table border='1'>",
        "<tr><th>Expectation</th><th>Success</th><th>Details</th></tr>",
        *rows,
        "</table>",
        "</body></html>",
    ])


def validate_data(df: pd.DataFrame, 
                 expectation_suite: ExpectationSuite,
                 output_path: str = 'validation_report.html') -> dict:
    """
    Run the created expectations and generate an output report

    The HTML report is best effort: if it cannot be rendered or written, a
    warning is logged and the validation result is still returned.

    Args:
        df: DataFrame to validate
        expectation_suite: ExpectationSuite to validate against
        output_path: Where to save the HTML report

    Returns:
        The object returned by the dataset's validate() call (exposes
        .success and .results)

    Raises:
        DataQualityError: If validation fails
    """
    try:
        # Convert the DataFrame to a PandasDataset with the expectation suite
        dataset = PandasDataset(df, expectation_suite=expectation_suite)

        # Validate the data against the expectations
        validation_result = dataset.validate()

        passed = sum(1 for result in validation_result.results if result.success)
        logger.info(f"Validation successful: {validation_result.success}")
        logger.info(f"Total expectations: {len(validation_result.results)}")
        logger.info(f"Passed expectations: {passed}")

        try:
            # Render fully in memory first so a rendering error cannot leave a
            # half-written file behind
            report_html = _build_validation_report_html(validation_result)
            # Explicit UTF-8: the report contains non-ASCII status symbols that
            # a platform-default encoding may be unable to encode
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(report_html)
            logger.info(f"Validation report saved to {output_path}")
        except Exception as e:
            logger.warning(f"Failed to save validation report to {output_path}: {str(e)}")

        return validation_result

    except Exception as e:
        logger.error(f"Error validating data: {str(e)}")
        raise DataQualityError(f"Failed to validate data: {str(e)}") from e

# Task 5: Automating Data Quality Score Calculation
def automate_quality_score(file_path: str, expectation_suite_path: str = 'expectation_suite.json') -> Tuple[float, dict]:
    """
    Score a CSV file and validate it against a stored expectation suite

    The data file must already exist. When the expectation suite JSON is not
    found, a fresh suite is created from the loaded data instead.

    Args:
        file_path: Path to the CSV file
        expectation_suite_path: Path to the expectation suite JSON

    Returns:
        Tuple of (quality_score, validation_result)

    Raises:
        DataQualityError: If any step of the automation fails
    """
    try:
        # create_if_missing=False: a missing input file is an error here
        df = load_or_create_data(file_path, create_if_missing=False)

        # Metrics first, then the aggregated score
        score = calculate_quality_score(calculate_basic_metrics(df))

        try:
            with open(expectation_suite_path, 'r') as f:
                suite_spec = json.load(f)

            # Rebuild the suite object from its JSON form
            expectation_suite = ExpectationSuite(
                expectation_suite_name=suite_spec["expectation_suite_name"],
                expectations=suite_spec["expectations"]
            )
        except FileNotFoundError:
            logger.warning(f"Expectation suite not found at {expectation_suite_path}. Creating new expectations.")
            expectation_suite = create_expectations(df)

        validation_result = validate_data(df, expectation_suite)
        return score, validation_result

    except Exception as e:
        logger.error(f"Error automating quality score: {str(e)}")
        raise DataQualityError(f"Failed to automate quality score: {str(e)}")

# Task 6: Leveraging Data Quality Metrics for Automated Data Cleaning
_UNKNOWN_NAME = 'Unknown User'  # placeholder written into missing Name cells
_DEFAULT_AGE = 30               # fallback age when no numeric age is available


def _email_from_name(name: Any, idx: Any, domain: str) -> str:
    """Return '<first>.<last>@<domain>' from a usable name, else 'user<idx>@<domain>'."""
    if isinstance(name, str) and name != _UNKNOWN_NAME:
        parts = name.lower().split()
        if parts:  # a blank/whitespace-only name has no parts to build from
            return f"{parts[0]}.{parts[-1]}@{domain}"
    return f"user{idx}@{domain}"


def _name_for_row(frame: pd.DataFrame, idx: Any) -> Any:
    """Return the Name cell for a row label, or None when there is no Name column."""
    return frame.loc[idx, 'Name'] if 'Name' in frame.columns else None


def _allow_text(frame: pd.DataFrame, column: str) -> None:
    """Cast a numeric-dtype column (e.g. an all-null one) to object so strings can be stored."""
    if pd.api.types.is_numeric_dtype(frame[column]):
        frame[column] = frame[column].astype(object)


def _record(stats: Dict[str, Any], touched_rows: set, labels, action: str) -> None:
    """Add one cleaning action to the stats; rows are counted once however many cells changed."""
    stats['cells_cleaned'] += len(labels)
    touched_rows.update(labels)
    stats['rows_affected'] = len(touched_rows)
    stats['cleaning_actions'].append(action)


def _fill_missing(frame: pd.DataFrame, column: str, stats: Dict[str, Any], touched_rows: set) -> None:
    """Fill null cells of Name / Email / Age; other columns are left untouched."""
    missing_mask = frame[column].isna()
    labels = frame.index[missing_mask.to_numpy()]
    count = len(labels)
    if count == 0:
        return

    if column == 'Name':
        _allow_text(frame, column)
        frame.loc[missing_mask, column] = _UNKNOWN_NAME
        action = f"Filled {count} missing values in {column}"
    elif column == 'Email':
        _allow_text(frame, column)
        for idx in labels:
            frame.loc[idx, column] = _email_from_name(_name_for_row(frame, idx), idx, 'placeholder.com')
        action = f"Generated {count} missing emails"
    elif column == 'Age':
        # Median of the values that parse as numbers; non-numeric entries are ignored
        median_age = pd.to_numeric(frame[column], errors='coerce').median()
        if pd.isna(median_age):  # all values null or non-numeric
            median_age = _DEFAULT_AGE
        frame.loc[missing_mask, column] = median_age
        action = f"Filled {count} missing ages with median: {median_age}"
    else:
        return

    _record(stats, touched_rows, labels, action)


def _fix_invalid_emails(frame: pd.DataFrame, regex: Any, stats: Dict[str, Any], touched_rows: set) -> None:
    """Replace emails that fail the expectation's regex (or lack '@' when no regex is given)."""
    pattern = re.compile(regex if regex else '@')
    # Nulls and non-string values count as invalid
    invalid = [
        not (isinstance(value, str) and pattern.search(value) is not None)
        for value in frame['Email']
    ]
    labels = frame.index[pd.Series(invalid, dtype=bool).to_numpy()]
    if len(labels) == 0:
        return

    _allow_text(frame, 'Email')
    for idx in labels:
        frame.loc[idx, 'Email'] = _email_from_name(_name_for_row(frame, idx), idx, 'corrected.com')
    _record(stats, touched_rows, labels, f"Corrected {len(labels)} invalid email formats")


def _coerce_ages(frame: pd.DataFrame, stats: Dict[str, Any], touched_rows: set) -> None:
    """Convert Age to numeric; values that cannot be parsed become the default age."""
    before = frame['Age']
    numeric = pd.to_numeric(before, errors='coerce')
    # Cells that were present but turned into NaN by the coercion
    lost_mask = numeric.isna() & before.notna()
    frame['Age'] = numeric

    labels = frame.index[lost_mask.to_numpy()]
    if len(labels) > 0:
        frame.loc[lost_mask, 'Age'] = _DEFAULT_AGE
        _record(stats, touched_rows, labels,
                f"Converted {len(labels)} non-numeric ages to default value ({_DEFAULT_AGE})")


def _dedupe_emails(frame: pd.DataFrame, stats: Dict[str, Any], touched_rows: set) -> None:
    """Rewrite every repeated email as '<local>+<n>@<domain>'; the first occurrence is kept."""
    duplicate_mask = frame.duplicated(subset=['Email'], keep='first')
    taken = set(frame['Email'].dropna())  # addresses already in use
    next_suffix: Dict[str, int] = {}
    changed = []

    for idx in frame.index[duplicate_mask.to_numpy()]:
        email = frame.loc[idx, 'Email']
        if not isinstance(email, str) or '@' not in email:
            continue  # nulls / malformed values cannot be suffixed meaningfully

        # Split on the last '@' so the domain is preserved intact
        local_part, _, domain = email.rpartition('@')
        suffix = next_suffix.get(email, 1)
        candidate = f"{local_part}+{suffix}@{domain}"
        while candidate in taken:  # never collide with an existing address
            suffix += 1
            candidate = f"{local_part}+{suffix}@{domain}"

        next_suffix[email] = suffix + 1
        taken.add(candidate)
        frame.loc[idx, 'Email'] = candidate
        changed.append(idx)

    if changed:
        _record(stats, touched_rows, changed, f"Made {len(changed)} duplicate emails unique")


def clean_data(df: pd.DataFrame, validation_result: dict) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """
    Clean data based on the expectations that failed validation

    Failed expectations are handled in this order: missing values
    (Name / Email / Age), invalid Email formats, non-numeric Age values,
    duplicate Email values. The input DataFrame is not modified.

    Args:
        df: DataFrame to clean
        validation_result: Validation result from Great Expectations; must
            expose .results, each with .success and .expectation_config
            (.expectation_type, .kwargs)

    Returns:
        Tuple of (cleaned_dataframe, cleaning_stats) where cleaning_stats has
        'rows_affected' (distinct rows changed), 'cells_cleaned' (cells
        changed) and 'cleaning_actions' (one message per action)

    Raises:
        DataQualityError: If cleaning fails
    """
    try:
        cleaned_df = df.copy()
        cleaning_stats = {
            'rows_affected': 0,
            'cells_cleaned': 0,
            'cleaning_actions': []
        }
        touched_rows: set = set()  # row labels changed so far, for a distinct row count

        failed = [
            result.expectation_config
            for result in validation_result.results
            if not result.success
        ]

        def failed_of(expectation_type: str, column: Optional[str] = None) -> list:
            # Failed expectations of one type, optionally limited to one column
            return [
                e for e in failed
                if e.expectation_type == expectation_type
                and (column is None or e.kwargs.get('column') == column)
            ]

        for expectation in failed_of('expect_column_values_to_not_be_null'):
            column = expectation.kwargs.get('column')
            if column and column in cleaned_df.columns:
                _fill_missing(cleaned_df, column, cleaning_stats, touched_rows)

        for expectation in failed_of('expect_column_values_to_match_regex', 'Email'):
            if 'Email' in cleaned_df.columns:
                # Clean against the same pattern that validation used
                _fix_invalid_emails(cleaned_df, expectation.kwargs.get('regex'),
                                    cleaning_stats, touched_rows)

        for _ in failed_of('expect_column_values_to_be_of_type', 'Age'):
            if 'Age' in cleaned_df.columns:
                _coerce_ages(cleaned_df, cleaning_stats, touched_rows)

        for _ in failed_of('expect_column_values_to_be_unique', 'Email'):
            if 'Email' in cleaned_df.columns:
                _dedupe_emails(cleaned_df, cleaning_stats, touched_rows)

        return cleaned_df, cleaning_stats

    except Exception as e:
        logger.error(f"Error cleaning data: {str(e)}")
        raise DataQualityError(f"Failed to clean data: {str(e)}") from e

# Function to check data quality and trigger cleaning if needed
def check_quality_and_clean(df: pd.DataFrame,
                          quality_score: float,
                          validation_result: dict,
                          threshold: float = 80.0) -> Tuple[pd.DataFrame, bool, Dict[str, Any]]:
    """
    Clean the data only when its quality score falls below the threshold

    Args:
        df: DataFrame to check and potentially clean
        quality_score: The calculated quality score
        validation_result: Validation result from Great Expectations
        threshold: Scores strictly below this value trigger cleaning

    Returns:
        Tuple of (dataframe, was_cleaned, cleaning_stats); when no cleaning
        happens the input DataFrame is returned with zeroed statistics

    Raises:
        DataQualityError: If quality check or cleaning fails
    """
    try:
        if quality_score < threshold:
            logger.info(f"Data quality score ({quality_score:.2f}%) is below threshold ({threshold}%).")
            logger.info("Initiating automated data cleaning...")

            cleaned_df, cleaning_stats = clean_data(df, validation_result)
            return cleaned_df, True, cleaning_stats

        logger.info(f"Data quality score ({quality_score:.2f}%) is above threshold ({threshold}%).")
        logger.info("No cleaning needed.")

        # Nothing was changed, so report empty statistics
        untouched_stats = {
            'rows_affected': 0,
            'cells_cleaned': 0,
            'cleaning_actions': []
        }
        return df, False, untouched_stats

    except Exception as e:
        logger.error(f"Error checking quality and cleaning: {str(e)}")
        raise DataQualityError(f"Failed to check quality and clean data: {str(e)}")

# Function to save cleaned data
def save_cleaned_data(df: pd.DataFrame, file_path: str = 'cleaned_data.csv') -> bool:
    """
    Write the cleaned data to a CSV file without raising on failure

    Args:
        df: DataFrame to save
        file_path: Path to save the cleaned data

    Returns:
        True if saved successfully, False otherwise (the error is logged)
    """
    saved = False
    try:
        df.to_csv(file_path, index=False)
        logger.info(f"Cleaned data saved to {file_path}")
        saved = True
    except Exception as e:
        logger.error(f"Error saving cleaned data: {str(e)}")
    return saved

# Main function to execute the full data quality workflow
def main(data_file: str = 'sample_data.csv',
        output_dir: str = '.',
        quality_threshold: float = 80.0,
        validate_only: bool = False):
    """
    Run the complete workflow: load, measure, score, validate and optionally clean

    Any error is logged and re-raised to the caller.

    Args:
        data_file: Path to the input CSV file
        output_dir: Directory for output files (created when missing)
        quality_threshold: Threshold for triggering data cleaning
        validate_only: If True, stop after validation without cleaning
    """
    try:
        logger.info("Starting data quality workflow")

        os.makedirs(output_dir, exist_ok=True)

        def in_output_dir(filename: str) -> str:
            # All generated artifacts are placed inside output_dir
            return os.path.join(output_dir, filename)

        # Step 1: load the input (sample data is created when the file is missing)
        logger.info("Loading data...")
        df = load_or_create_data(data_file)
        logger.info(f"Loaded data with {len(df)} rows and {len(df.columns)} columns")

        # Step 2: per-column and email metrics
        logger.info("Calculating basic data quality metrics...")
        metrics = calculate_basic_metrics(df)
        for column_name, pct_complete in metrics['completeness'].items():
            logger.info(f"Completeness ({column_name}): {pct_complete:.2f}%")
        logger.info(f"Email Validity: {metrics['email_validity']:.2f}%")
        logger.info(f"Email Uniqueness: {metrics['email_uniqueness']:.2f}%")

        # Step 3: aggregate score
        logger.info("Calculating overall data quality score...")
        score = calculate_quality_score(metrics)
        logger.info(f"Overall Data Quality Score: {score:.2f}%")

        # Step 4: expectation suite
        logger.info("Creating expectations using Great Expectations...")
        expectation_suite = create_expectations(
            df,
            file_path=data_file,
            output_path=in_output_dir('expectation_suite.json')
        )
        logger.info("Expectations created successfully")

        # Step 5: validation plus HTML report
        logger.info("Validating data against expectations...")
        validation_result = validate_data(
            df,
            expectation_suite,
            output_path=in_output_dir('validation_report.html')
        )

        if validate_only:
            logger.info("Validation complete. Skipping cleaning as validate_only=True")
            return

        # Step 6: clean only when the score is below the threshold
        cleaned_df, was_cleaned, cleaning_stats = check_quality_and_clean(
            df, score, validation_result, threshold=quality_threshold
        )

        # Step 7: persist, report and re-measure the cleaned data
        if was_cleaned:
            save_cleaned_data(cleaned_df, in_output_dir('cleaned_data.csv'))

            logger.info("\nCleaning Statistics:")
            logger.info(f"Rows affected: {cleaning_stats['rows_affected']}")
            logger.info(f"Cells cleaned: {cleaning_stats['cells_cleaned']}")
            logger.info("\nCleaning actions performed:")
            for performed_action in cleaning_stats['cleaning_actions']:
                logger.info(f"- {performed_action}")

            logger.info("\nRecalculating metrics after cleaning...")
            new_score = calculate_quality_score(calculate_basic_metrics(cleaned_df))

            logger.info(f"New Data Quality Score after cleaning: {new_score:.2f}%")

            # Second report, this time for the cleaned data
            logger.info("Validating cleaned data...")
            validate_data(
                cleaned_df,
                expectation_suite,
                output_path=in_output_dir('cleaned_validation_report.html')
            )

        logger.info("Data quality workflow completed successfully")

    except Exception as e:
        logger.error(f"Error in data quality workflow: {str(e)}")
        raise


# Unit tests for the data quality functions
class TestDataQualityFunctions(unittest.TestCase):
    """Unit tests for the data quality functions"""

    def setUp(self):
        """Build a three-row frame with one missing name, one invalid email and two bad ages"""
        self.test_data = pd.DataFrame({
            'Name': ['Test User', None, 'Another User'],
            'Email': ['test@example.com', 'invalid', 'another@example.com'],
            'Age': [25, None, 'invalid']
        })

    def test_calculate_basic_metrics(self):
        """Completeness, validity and uniqueness are computed as percentages"""
        metrics = calculate_basic_metrics(self.test_data)

        # Check completeness
        self.assertAlmostEqual(metrics['completeness']['Name'], 2/3 * 100)
        self.assertAlmostEqual(metrics['completeness']['Email'], 3/3 * 100)
        self.assertAlmostEqual(metrics['completeness']['Age'], 2/3 * 100)

        # Check email validity
        self.assertAlmostEqual(metrics['email_validity'], 2/3 * 100)

        # Check email uniqueness
        self.assertAlmostEqual(metrics['email_uniqueness'], 3/3 * 100)

    def test_calculate_quality_score(self):
        """The score is the mean of average completeness, validity and uniqueness"""
        metrics = {
            'completeness': {'Name': 80, 'Email': 90, 'Age': 70},
            'email_validity': 60,
            'email_uniqueness': 100
        }

        score = calculate_quality_score(metrics)
        expected_score = ((80 + 90 + 70) / 3 + 60 + 100) / 3
        self.assertAlmostEqual(score, expected_score)

    def test_clean_data(self):
        """A failed not-null expectation on Name fills the missing name"""
        # Stand-in for a validation result with one failed expectation
        mock_expectation = MagicMock()
        mock_expectation.expectation_type = 'expect_column_values_to_not_be_null'
        mock_expectation.kwargs = {'column': 'Name'}

        mock_result = MagicMock()
        mock_result.expectation_config = mock_expectation
        mock_result.success = False

        mock_validation = MagicMock()
        mock_validation.results = [mock_result]

        cleaned_df, stats = clean_data(self.test_data, mock_validation)

        # Check if missing name was filled
        self.assertEqual(cleaned_df.loc[1, 'Name'], 'Unknown User')

        # Check if stats were recorded
        self.assertEqual(stats['cells_cleaned'], 1)
        self.assertEqual(stats['rows_affected'], 1)
        self.assertEqual(len(stats['cleaning_actions']), 1)

    def test_empty_dataframe(self):
        """An empty DataFrame is rejected with DataQualityError"""
        empty_df = pd.DataFrame()
        with self.assertRaises(DataQualityError):
            calculate_basic_metrics(empty_df)

    def test_validate_data(self):
        """Validation against an empty suite returns a result"""
        suite = ExpectationSuite(expectation_suite_name="test_suite")

        with patch('builtins.open', MagicMock()):  # keep the report off disk
            result = validate_data(self.test_data, suite)
            self.assertIsNotNone(result)

    def test_check_quality_and_clean(self):
        """Cleaning is triggered only for scores below the threshold"""
        mock_validation = MagicMock()

        # Score above threshold: data is passed through untouched
        df, was_cleaned, stats = check_quality_and_clean(
            self.test_data, 90.0, mock_validation, threshold=80.0
        )
        self.assertFalse(was_cleaned)
        self.assertIs(df, self.test_data)

        # Score below threshold: clean_data is invoked.
        # The patch target is built from __name__ so it resolves to this module
        # whether the code runs as '__main__' or is imported by a test runner.
        fake_stats = {'rows_affected': 1, 'cells_cleaned': 2, 'cleaning_actions': ['test']}
        with patch(f'{__name__}.clean_data', return_value=(self.test_data, fake_stats)):
            df, was_cleaned, stats = check_quality_and_clean(
                self.test_data, 70.0, mock_validation, threshold=80.0
            )
            self.assertTrue(was_cleaned)
            self.assertEqual(stats, fake_stats)


# Function to run the full workflow with command-line arguments
if __name__ == "__main__":
    import argparse

    # Command line options for the workflow
    parser = argparse.ArgumentParser(description='Data Quality Workflow')
    parser.add_argument('--data', type=str, default='sample_data.csv', help='Path to input data file')
    parser.add_argument('--output', type=str, default='.', help='Directory for output files')
    parser.add_argument('--threshold', type=float, default=80.0, help='Quality threshold for cleaning')
    parser.add_argument('--validate-only', action='store_true', help='Only validate data without cleaning')
    parser.add_argument('--test', action='store_true', help='Run unit tests')

    # parse_known_args ignores arguments this script does not define, so the
    # code also starts when the host process adds its own arguments (as a
    # notebook kernel does) instead of aborting with a usage error
    args, _unrecognized = parser.parse_known_args()

    if args.test:
        import unittest
        # exit=False keeps unittest from raising SystemExit on success;
        # a failing run still ends with a non-zero exit status
        test_program = unittest.main(argv=['first-arg-is-ignored'], exit=False)
        if not test_program.result.wasSuccessful():
            raise SystemExit(1)
    else:
        # Run the main workflow
        main(data_file=args.data, output_dir=args.output,
             quality_threshold=args.threshold, validate_only=args.validate_only)

ModuleNotFoundError: No module named 'great_expectations.dataset'

In [4]:
import os
import sys
import json
import logging
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Any, Optional
from great_expectations.core import ExpectationSuite
from great_expectations.data_context import BaseDataContext
from great_expectations.data_context.types.base import DataContextConfig
from great_expectations.core.batch import RuntimeBatchRequest
from great_expectations.checkpoint import SimpleCheckpoint
from great_expectations.exceptions import DataContextError

# Configure root logging once for this cell: INFO and above, written to the
# default stream handler as "timestamp - logger name - level - message".
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger used by everything defined below
logger = logging.getLogger(__name__)

class AutomatedDataCleaner:
    """
    A class to handle automated data cleaning based on Great Expectations validations.
    Implements Task 6: Leveraging Data Quality Metrics for Automated Data Cleaning.
    """
    
    def __init__(
        self, 
        data_file: str,
        context_root_dir: str = "great_expectations",
        quality_threshold: float = 80.0,
    ):
        """
        Initialize the AutomatedDataCleaner
        
        Args:
            data_file: Path to the CSV file to process
            context_root_dir: Directory for Great Expectations context
            quality_threshold: Quality threshold to trigger cleaning
        """
        self.data_file = data_file
        self.context_root_dir = context_root_dir
        self.quality_threshold = quality_threshold
        self.context = self._initialize_context()
        
    def _initialize_context(self) -> BaseDataContext:
        """
        Build the Great Expectations context used by this cleaner
        
        The context root directory is created when missing, then a context is
        constructed from a programmatic DataContextConfig.
        
        Returns:
            The newly constructed Great Expectations context
            
        Raises:
            Exception: Any failure is logged and re-raised unchanged
        """
        try:
            # Make sure the configured root directory exists
            os.makedirs(self.context_root_dir, exist_ok=True)
            
            # NOTE(review): no datasource is registered in this config, while
            # other methods of this class request "my_pandas_datasource" --
            # confirm those batch requests can be resolved.
            project_config = DataContextConfig(
                config_version=3.0,
                datasources={},
                store_backend_defaults={"class_name": "InMemoryStoreBackend"},
                expectations_store_name="expectations_store",
                validations_store_name="validations_store",
                evaluation_parameter_store_name="evaluation_parameter_store",
                checkpoint_store_name="checkpoint_store",
            )
            
            return BaseDataContext(project_config=project_config)
            
        except Exception as e:
            logger.error(f"Error initializing Great Expectations context: {str(e)}")
            raise
    
    def create_expectation_suite(self, suite_name: str = "data_quality_suite") -> ExpectationSuite:
        """
        Create an expectation suite with standard data quality expectations
        
        The CSV at self.data_file is loaded and expectations are attached to
        a batch built from it: a not-null expectation (mostly=0.8) for every
        column, a type-list expectation for numeric columns, and extra
        expectations for columns named email, name and age. Those three
        columns are located case-insensitively and the label actually present
        in the data is the one passed to the expectations.
        
        Args:
            suite_name: Name for the expectation suite
            
        Returns:
            ExpectationSuite object
            
        Raises:
            Exception: Any failure is logged and re-raised unchanged
        """
        try:
            # Register (or overwrite) the suite in the context
            self.context.create_expectation_suite(
                expectation_suite_name=suite_name,
                overwrite_existing=True
            )
            
            # Load the data to infer appropriate expectations
            df = pd.read_csv(self.data_file)
            
            # Create a batch from the dataframe
            # NOTE(review): confirm that get_batch returns an object exposing
            # the expect_* methods in the installed Great Expectations version.
            batch = self.context.get_batch(
                batch_request=RuntimeBatchRequest(
                    datasource_name="my_pandas_datasource",
                    data_connector_name="default_runtime_data_connector_name",
                    data_asset_name="my_data_asset",
                    runtime_parameters={"batch_data": df},
                    batch_identifiers={"default_identifier_name": "default_identifier"},
                )
            )
            
            # Add general expectations for all columns
            for column in df.columns:
                # Completeness - expect not null
                batch.expect_column_values_to_not_be_null(column, mostly=0.8)
                
                # Type consistency
                # NOTE(review): these are SQL-style type names -- confirm the
                # pandas-backed validation recognises them.
                if pd.api.types.is_numeric_dtype(df[column]):
                    batch.expect_column_values_to_be_in_type_list(
                        column, ["INTEGER", "FLOAT", "DOUBLE", "DECIMAL", "NUMERIC"]
                    )
            
            # Add specific expectations for email columns
            email_col = self._find_column(df.columns, "email")
            if email_col is not None:
                batch.expect_column_values_to_match_regex(
                    email_col, r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$', mostly=0.9
                )
                batch.expect_column_values_to_be_unique(email_col)
            
            # Add specific expectations for name columns
            name_col = self._find_column(df.columns, "name")
            if name_col is not None:
                batch.expect_column_values_to_not_be_null(name_col, mostly=0.9)
                batch.expect_column_value_lengths_to_be_between(name_col, min_value=2)
            
            # Add specific expectations for age columns
            age_col = self._find_column(df.columns, "age")
            if age_col is not None:
                batch.expect_column_values_to_be_between(age_col, min_value=0, max_value=120)
            
            # Save the expectation suite and return the same object
            suite = batch.get_expectation_suite()
            self.context.save_expectation_suite(suite, suite_name)
            logger.info(f"Created expectation suite: {suite_name}")
            
            return suite
            
        except Exception as e:
            logger.error(f"Error creating expectation suite: {str(e)}")
            raise
    
    @staticmethod
    def _find_column(columns, wanted: str):
        """
        Locate a column by name, ignoring case
        
        Args:
            columns: Column labels to search (e.g. df.columns)
            wanted: Lowercase column name to look for
            
        Returns:
            The matching label as it appears in columns, or None. The exact
            lowercase spelling wins, then the capitalized spelling, then the
            first label that matches case-insensitively.
        """
        if wanted in columns:
            return wanted
        capitalized = wanted.capitalize()
        if capitalized in columns:
            return capitalized
        for label in columns:
            # str() keeps non-string labels (e.g. integers) from raising
            if str(label).lower() == wanted:
                return label
        return None
    
    def setup_cleaning_checkpoint(self, 
                                 suite_name: str = "data_quality_suite",
                                 checkpoint_name: str = "data_cleaning_checkpoint") -> None:
        """
        Register a SimpleCheckpoint whose action list ends with a cleaning action
        
        Args:
            suite_name: Name of the expectation suite to use
            checkpoint_name: Name for the new checkpoint
            
        Raises:
            Exception: Any failure is logged and re-raised unchanged
        """
        try:
            # The single validation run by the checkpoint
            validations = [
                {
                    "batch_request": {
                        "datasource_name": "my_pandas_datasource",
                        "data_connector_name": "default_runtime_data_connector_name",
                        "data_asset_name": "my_data_asset",
                    },
                    "expectation_suite_name": suite_name
                }
            ]
            
            # Three stock actions first
            action_list = [
                {
                    "name": "store_validation_result",
                    "action": {"class_name": "StoreValidationResultAction"}
                },
                {
                    "name": "store_evaluation_params",
                    "action": {"class_name": "StoreEvaluationParametersAction"}
                },
                {
                    "name": "update_data_docs",
                    "action": {"class_name": "UpdateDataDocsAction"}
                },
            ]
            # Then the cleaning action, parameterised with this cleaner's threshold.
            # NOTE(review): "custom_actions" is a class in this file, not an
            # importable module -- confirm the checkpoint can resolve it.
            action_list.append(
                {
                    "name": "clean_data_on_failure",
                    "action": {
                        "class_name": "CustomAction",
                        "module_name": "custom_actions",
                        "function_name": "clean_data_on_failure",
                        "kwargs": {
                            "threshold": self.quality_threshold,
                            "output_path": "cleaned_data.csv"
                        }
                    }
                }
            )
            
            # Register the checkpoint
            self.context.add_checkpoint(
                name=checkpoint_name,
                config_version=1.0,
                class_name="SimpleCheckpoint",
                run_name_template="%Y%m%d-%H%M%S-cleaning-checkpoint",
                validations=validations,
                action_list=action_list,
            )
            logger.info(f"Created checkpoint: {checkpoint_name}")
            
        except Exception as e:
            logger.error(f"Error setting up cleaning checkpoint: {str(e)}")
            raise
    
    def calculate_quality_score(self, validation_result: dict) -> float:
        """
        Calculate a quality score from validation results
        
        Args:
            validation_result: Great Expectations validation result
            
        Returns:
            Quality score as a percentage
        """
        try:
            if not validation_result or "results" not in validation_result:
                return 0.0
                
            expectations_count = len(validation_result["results"])
            if expectations_count == 0:
                return 0.0
                
            passed_count = sum(1 for r in validation_result["results"] if r.get("success", False))
            quality_score = (passed_count / expectations_count) * 100
            
            return quality_score
            
        except Exception as e:
            logger.error(f"Error calculating quality score: {str(e)}")
            return 0.0
    
    def clean_data(self, df: pd.DataFrame, validation_result: dict) -> pd.DataFrame:
        """
        Clean data based on validation results with vectorized operations
        
        Args:
            df: DataFrame to clean
            validation_result: Validation result dictionary
            
        Returns:
            Cleaned DataFrame
        """
        cleaned_df = df.copy()
        
        try:
            # Extract failed expectations
            failed_expectations = []
            for result in validation_result.get("results", []):
                if not result.get("success", True):
                    failed_expectations.append(result.get("expectation_config", {}))
            
            # Process each failed expectation and apply appropriate cleaning
            for expectation in failed_expectations:
                expectation_type = expectation.get("expectation_type", "")
                column = expectation.get("kwargs", {}).get("column")
                
                if not column or column not in cleaned_df.columns:
                    continue
                
                # Handle missing values
                if expectation_type == "expect_column_values_to_not_be_null":
                    self._clean_missing_values(cleaned_df, column)
                
                # Handle type mismatches
                elif expectation_type in ["expect_column_values_to_be_of_type", 
                                         "expect_column_values_to_be_in_type_list"]:
                    self._clean_type_issues(cleaned_df, column, expectation)
                
                # Handle regex pattern mismatches (like email format)
                elif expectation_type == "expect_column_values_to_match_regex":
                    pattern = expectation.get("kwargs", {}).get("regex")
                    if pattern and "email" in column.lower():
                        self._clean_email_format(cleaned_df, column, pattern)
                
                # Handle value range issues
                elif expectation_type == "expect_column_values_to_be_between":
                    min_val = expectation.get("kwargs", {}).get("min_value")
                    max_val = expectation.get("kwargs", {}).get("max_value")
                    self._clean_value_ranges(cleaned_df, column, min_val, max_val)
                
                # Handle uniqueness issues
                elif expectation_type == "expect_column_values_to_be_unique":
                    self._clean_duplicate_values(cleaned_df, column)
            
            return cleaned_df
            
        except Exception as e:
            logger.error(f"Error cleaning data: {str(e)}")
            # Return the original dataframe if cleaning fails
            return df
    
    def _clean_missing_values(self, df: pd.DataFrame, column: str) -> None:
        """
        Clean missing values in a column with appropriate strategies
        
        Args:
            df: DataFrame to modify in-place
            column: Column to clean
        """
        missing_mask = df[column].isna()
        if not missing_mask.any():
            return
            
        # Different strategies based on column type
        if "name" in column.lower():
            # For name columns, use a placeholder
            df.loc[missing_mask, column] = "Unknown"
            
        elif "email" in column.lower():
            # For email columns, generate from index
            for idx in df[missing_mask].index:
                df.loc[idx, column] = f"user{idx}@example.com"
                
        elif df[column].dtype.kind in 'bifc':  # numeric types
            # For numeric columns, use median
            median_val = df[column].median()
            if pd.isna(median_val):
                median_val = 0
            df.loc[missing_mask, column] = median_val
            
        elif df[column].dtype == 'datetime64[ns]':
            # For datetime columns, use current date
            df.loc[missing_mask, column] = pd.Timestamp.now()
            
        else:
            # For other types, use a generic placeholder
            df.loc[missing_mask, column] = "Unknown"
    
    def _clean_type_issues(self, df: pd.DataFrame, column: str, expectation: dict) -> None:
        """
        Clean type issues in a column
        
        Args:
            df: DataFrame to modify in-place
            column: Column to clean
            expectation: The failed expectation details
        """
        expected_type = None
        if "type_list" in expectation.get("kwargs", {}):
            expected_types = expectation["kwargs"]["type_list"]
            if expected_types:
                expected_type = expected_types[0]
        elif "type_" in expectation.get("kwargs", {}):
            expected_type = expectation["kwargs"]["type_"]
        
        # Handle numeric conversions
        if expected_type in ["INTEGER", "FLOAT", "DOUBLE", "DECIMAL", "NUMERIC", "int", "float"]:
            # Convert to numeric, coercing errors to NaN
            df[column] = pd.to_numeric(df[column], errors='coerce')
            
            # Fill NaN values with appropriate defaults
            missing_mask = df[column].isna()
            if missing_mask.any():
                # Use median or 0
                median_val = df[column].median()
                if pd.isna(median_val):
                    df.loc[missing_mask, column] = 0
                else:
                    df.loc[missing_mask, column] = median_val
        
        # Handle string conversions
        elif expected_type in ["STRING", "TEXT", "VARCHAR", "str"]:
            # Convert to string
            non_null_mask = ~df[column].isna()
            df.loc[non_null_mask, column] = df.loc[non_null_mask, column].astype(str)
    
    def _clean_email_format(self, df: pd.DataFrame, column: str, pattern: str) -> None:
        """
        Clean email format issues
        
        Args:
            df: DataFrame to modify in-place
            column: Column to clean
            pattern: Regex pattern for valid emails
        """
        import re
        
        # Find invalid emails
        non_null_mask = ~df[column].isna()
        valid_pattern = re.compile(pattern)
        
        # Check each value against the pattern
        for idx in df[non_null_mask].index:
            email = str(df.loc[idx, column])
            if not valid_pattern.match(email):
                # Generate a replacement email
                if 'name' in df.columns:
                    name = df.loc[idx, 'name']
                    if pd.notna(name):
                        name_parts = str(name).lower().replace(' ', '.').replace('-', '.')
                        df.loc[idx, column] = f"{name_parts}@example.com"
                    else:
                        df.loc[idx, column] = f"user{idx}@example.com"
                else:
                    df.loc[idx, column] = f"user{idx}@example.com"
    
    def _clean_value_ranges(self, df: pd.DataFrame, column: str, min_val: Any, max_val: Any) -> None:
        """
        Clean values outside of valid ranges
        
        Args:
            df: DataFrame to modify in-place
            column: Column to clean
            min_val: Minimum allowed value
            max_val: Maximum allowed value
        """
        if pd.api.types.is_numeric_dtype(df[column]):
            # For numeric columns, clip to the valid range
            if min_val is not None and max_val is not None:
                df[column] = df[column].clip(min_val, max_val)
            elif min_val is not None:
                df[column] = df[column].clip(lower=min_val)
            elif max_val is not None:
                df[column] = df[column].clip(upper=max_val)
    
    def _clean_duplicate_values(self, df: pd.DataFrame, column: str) -> None:
        """
        Clean duplicate values in a column
        
        Args:
            df: DataFrame to modify in-place
            column: Column to clean
        """
        # Get duplicate values
        duplicates = df[df.duplicated(subset=[column], keep='first')][column]
        
        if not duplicates.empty:
            for idx in df[df.duplicated(subset=[column], keep='first')].index:
                original_value = df.loc[idx, column]
                
                if pd.isna(original_value):
                    # For NaN values, generate a unique placeholder
                    df.loc[idx, column] = f"unique_value_{idx}"
                elif isinstance(original_value, str):
                    # For string values (like emails), add a suffix
                    if '@' in original_value:
                        # Handle email specifically
                        username, domain = original_value.split('@', 1)
                        df.loc[idx, column] = f"{username}+{idx}@{domain}"
                    else:
                        # Add a suffix
                        df.loc[idx, column] = f"{original_value}_{idx}"
                else:
                    # For numeric or other types, add a small increment
                    try:
                        df.loc[idx, column] = original_value + (idx / 1000)
                    except:
                        # If incrementing fails, convert to string and add suffix
                        df.loc[idx, column] = f"{original_value}_{idx}"
    
    def run_validation_and_cleaning(self, 
                                   suite_name: str = "data_quality_suite",
                                   output_file: str = "cleaned_data.csv") -> Tuple[bool, float, Optional[str]]:
        """
        Validate the data file and clean it when the score is under the threshold
        
        The CSV at self.data_file is validated against the named suite (the
        suite is created when the context does not have it yet). When the
        resulting score is below self.quality_threshold, the data is cleaned,
        written to output_file and validated again.
        
        Args:
            suite_name: Name of the expectation suite to use
            output_file: Path to save cleaned data if needed
            
        Returns:
            Tuple of (was_cleaned, quality_score, cleaned_file_path). After
            cleaning, quality_score is the score of the cleaned data. Any
            error is logged and reported as (False, 0.0, None).
        """
        try:
            # Load the data
            df = pd.read_csv(self.data_file)
            logger.info(f"Loaded data from {self.data_file} with shape {df.shape}")
            
            # Reuse the suite when the context knows it, otherwise build it
            try:
                suite = self.context.get_expectation_suite(suite_name)
                logger.info(f"Using existing expectation suite: {suite_name}")
            except DataContextError:
                suite = self.create_expectation_suite(suite_name)
                logger.info(f"Created new expectation suite: {suite_name}")
            
            # Wrap the raw dataframe in a runtime batch and validate it
            raw_request = RuntimeBatchRequest(
                datasource_name="my_pandas_datasource",
                data_connector_name="default_runtime_data_connector_name",
                data_asset_name="my_data_asset",
                runtime_parameters={"batch_data": df},
                batch_identifiers={"default_identifier_name": "default_identifier"},
                batch_spec_passthrough={"reader_method": "pandas"},
            )
            batch = self.context.get_batch(batch_request=raw_request)
            validation_result = batch.validate(expectation_suite=suite)
            
            quality_score = self.calculate_quality_score(validation_result)
            logger.info(f"Data quality score: {quality_score:.2f}%")
            
            # Nothing to do when the score is not below the threshold
            if not (quality_score < self.quality_threshold):
                logger.info(f"Quality score {quality_score:.2f}% is above threshold {self.quality_threshold}%. No cleaning needed.")
                return False, quality_score, None
            
            logger.info(f"Quality score {quality_score:.2f}% is below threshold {self.quality_threshold}%. Cleaning data...")
            
            # Clean and persist the data
            cleaned_df = self.clean_data(df, validation_result)
            cleaned_df.to_csv(output_file, index=False)
            logger.info(f"Cleaned data saved to {output_file}")
            
            # Validate the cleaned data to report the score after cleaning
            cleaned_request = RuntimeBatchRequest(
                datasource_name="my_pandas_datasource",
                data_connector_name="default_runtime_data_connector_name",
                data_asset_name="cleaned_data_asset",
                runtime_parameters={"batch_data": cleaned_df},
                batch_identifiers={"default_identifier_name": "default_identifier"},
            )
            cleaned_batch = self.context.get_batch(batch_request=cleaned_request)
            new_validation_result = cleaned_batch.validate(expectation_suite=suite)
            new_quality_score = self.calculate_quality_score(new_validation_result)
            logger.info(f"New quality score after cleaning: {new_quality_score:.2f}%")
            
            return True, new_quality_score, output_file
                
        except Exception as e:
            logger.error(f"Error in validation and cleaning: {str(e)}")
            return False, 0.0, None


# Custom action module for Great Expectations
class custom_actions:
    """Custom actions for Great Expectations"""
    
    @staticmethod
    def clean_data_on_failure(context, validation_result, threshold=80.0, output_path="cleaned_data.csv"):
        """
        Custom action to clean data when validation fails
        
        The quality score is the percentage of entries in
        validation_result["results"] whose "success" flag is truthy. When it
        is below the threshold, the file named by the result's
        meta.batch_spec.path entry is loaded, cleaned with
        AutomatedDataCleaner.clean_data and written to output_path.
        
        Args:
            context: Great Expectations context (not used by this function)
            validation_result: Validation result data
            threshold: Quality threshold
            output_path: Path to save cleaned data
            
        Returns:
            Dictionary with results of cleaning action. "success" is False,
            with an explanatory "message", when the validation result is
            unusable, the batch id or data path is missing from
            meta.batch_spec, or any error occurs.
        """
        try:
            # Calculate quality score
            if not validation_result or "results" not in validation_result:
                return {"success": False, "message": "Invalid validation result"}
                
            results = validation_result["results"]
            expectations_count = len(results)
            if expectations_count == 0:
                return {"success": False, "message": "No expectations were validated"}
                
            passed_count = sum(1 for r in results if r.get("success", False))
            quality_score = (passed_count / expectations_count) * 100
            
            # Check if cleaning is needed
            if not (quality_score < threshold):
                return {
                    "success": True,
                    "message": f"No cleaning needed. Quality score {quality_score:.2f}% is above threshold {threshold}%.",
                    "quality_score": quality_score
                }
            
            # Both the batch id and the source path come from the batch spec
            batch_spec = validation_result.get("meta", {}).get("batch_spec", {})
            if not batch_spec.get("id"):
                return {"success": False, "message": "Could not identify batch"}
            
            data_path = batch_spec.get("path")
            if not data_path:
                # Without a path there is nothing to load; say so explicitly
                # instead of failing inside pd.read_csv
                return {
                    "success": False,
                    "message": "Could not determine data file path from batch spec"
                }
            
            # Load, clean and persist the data
            cleaner = AutomatedDataCleaner(
                data_file=data_path,
                quality_threshold=threshold
            )
            df = pd.read_csv(data_path)
            cleaned_df = cleaner.clean_data(df, validation_result)
            cleaned_df.to_csv(output_path, index=False)
            
            return {
                "success": True,
                "message": f"Data cleaned and saved to {output_path}",
                "quality_score_before": quality_score,
                "output_path": output_path
            }
                
        except Exception as e:
            return {"success": False, "message": f"Error in clean_data_on_failure: {str(e)}"}


if __name__ == "__main__":
    import argparse

    # Command-line interface: input CSV (mandatory), output CSV, cleaning
    # threshold and the Great Expectations directory
    parser = argparse.ArgumentParser(description='Automated Data Cleaning with Great Expectations')
    parser.add_argument(
        '--data', type=str, required=True,
        help='Path to input data file',
    )
    parser.add_argument(
        '--output', type=str, default='cleaned_data.csv',
        help='Path for cleaned data output',
    )
    parser.add_argument(
        '--threshold', type=float, default=80.0,
        help='Quality threshold for cleaning',
    )
    parser.add_argument(
        '--ge-dir', type=str, default='great_expectations',
        help='Great Expectations directory',
    )
    args = parser.parse_args()

    # Build the cleaner from the parsed options and run it once
    cleaner = AutomatedDataCleaner(
        data_file=args.data,
        context_root_dir=args.ge_dir,
        quality_threshold=args.threshold,
    )
    was_cleaned, quality_score, cleaned_file = cleaner.run_validation_and_cleaning(
        output_file=args.output
    )

    # Report the outcome
    if not was_cleaned:
        logger.info(f"No cleaning was needed. Quality score: {quality_score:.2f}%")
    else:
        logger.info(f"Data was cleaned to a quality score of {quality_score:.2f}% and saved to {cleaned_file}")

ImportError: cannot import name 'BaseDataContext' from 'great_expectations.data_context' (/home/vscode/.local/lib/python3.10/site-packages/great_expectations/data_context/__init__.py)