### Imputing Missing Data
**Description**: Impute missing data using various strategies like mean, median, or mode.

In [1]:
# Write your code from here
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ks_2samp
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset
import os
import sys

# ----------------------------- Configuration ----------------------------- #
# CSV files compared by this script; relative paths resolve against the
# current working directory.
reference_file = "reference_data.csv"
current_file = "current_data.csv"
feature = "age"  # Feature to analyze
# Columns that must exist in both files; rows with missing values in any of
# them are dropped before the comparison.
required_columns = [feature]
# ------------------------------------------------------------------------ #

def load_csv(file_path):
    """Read ``file_path`` into a DataFrame, terminating the program on failure.

    A path that does not exist, or any error raised while reading it, prints
    a message and exits with status 1, so a normal return always carries a
    loaded DataFrame.
    """
    if os.path.exists(file_path):
        try:
            return pd.read_csv(file_path)
        except Exception as exc:
            print(f"❌ Error reading {file_path}: {exc}")
    else:
        print(f"❌ Error: File not found: {file_path}")
    # Both failure paths above fall through to the same exit status.
    sys.exit(1)

def validate_columns(df, file_name):
    """Exit with status 1 at the first required column absent from ``df``.

    ``file_name`` is used only to label the error message.
    """
    available = df.columns
    for col in required_columns:
        if col in available:
            continue
        print(f"❌ Error: '{col}' column missing in {file_name}")
        sys.exit(1)

def handle_missing_values(df):
    """Return a new DataFrame without rows lacking a value in any required column."""
    must_have = required_columns
    # Spelled-out defaults: drop rows (axis 0) when any listed column is missing.
    return df.dropna(axis=0, how="any", subset=must_have)

def detect_data_drift(ref_data, current_data, feature, alpha=0.05):
    """Plot and statistically compare one feature across two datasets.

    Draws overlaid kernel-density estimates of ``feature`` for both inputs,
    then runs a two-sample Kolmogorov-Smirnov test and prints the statistic,
    the p-value and a drift verdict.

    Args:
        ref_data: Reference dataset, indexable by ``feature``.
        current_data: Dataset to compare against the reference, indexable
            by ``feature``.
        feature: Name of the column to analyze in both datasets.
        alpha: Significance level; drift is reported when the p-value is
            strictly below it. Defaults to 0.05.

    Returns:
        None. Results are printed and the plot is shown.
    """
    print(f"\n📊 Analyzing drift on feature: '{feature}'")

    ref_values = ref_data[feature]
    current_values = current_data[feature]

    # ks_2samp raises ValueError on an empty sample, which can happen once
    # rows with missing values have been dropped; report it instead.
    if len(ref_values) == 0 or len(current_values) == 0:
        print(f"⚠️ Cannot analyze drift: no data available for '{feature}'.")
        return

    # Visualization
    plt.figure(figsize=(10, 6))
    # `fill` replaces the deprecated `shade` argument of kdeplot.
    sns.kdeplot(ref_values, label="Reference", fill=True)
    sns.kdeplot(current_values, label="Current", fill=True)
    plt.title(f"Distribution of '{feature}' - Data Drift Detection")
    plt.xlabel(feature)
    plt.ylabel("Density")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Statistical test (KS Test)
    stat, p_value = ks_2samp(ref_values, current_values)
    print(f"🧪 KS Test → Statistic: {stat:.4f}, p-value: {p_value:.4f}")
    if p_value < alpha:
        print("🔺 Drift Detected: The distributions are significantly different.")
    else:
        print("✅ No significant drift detected.")

def generate_evidently_report(ref_data, current_data,
                              output_path="data_drift_report.html"):
    """Generate a detailed HTML drift report with Evidently.

    Runs a ``Report`` built from ``DataDriftPreset`` over both datasets and
    saves it as HTML. Any exception raised along the way is caught and
    printed as a warning instead of propagating.

    Args:
        ref_data: Reference dataset passed to the report as ``reference_data``.
        current_data: Dataset passed to the report as ``current_data``.
        output_path: Destination of the HTML file. Defaults to
            ``"data_drift_report.html"``.

    Returns:
        None.
    """
    print("\n📄 Generating Evidently data drift report...")
    try:
        report = Report(metrics=[DataDriftPreset()])
        report.run(reference_data=ref_data, current_data=current_data)
        report.save_html(output_path)
    except Exception as e:
        # The caller treats this step as optional, so warn and carry on.
        print(f"⚠️ Could not generate Evidently report: {e}")
    else:
        print(f"✅ Drift report saved as '{output_path}'")

def main():
    """Run the drift check: load, validate, clean, compare, then report."""
    sources = (reference_file, current_file)

    # Load both datasets before validating either one.
    frames = [load_csv(path) for path in sources]

    # Check required columns, reference first and current second.
    for frame, path in zip(frames, sources):
        validate_columns(frame, path)

    # Drop rows with missing values in the required columns.
    ref_data, current_data = (handle_missing_values(frame) for frame in frames)

    # Plot the distributions and run the statistical test.
    detect_data_drift(ref_data, current_data, feature)

    # Write the HTML report (optional step).
    generate_evidently_report(ref_data, current_data)


if __name__ == "__main__":
    main()

ModuleNotFoundError: No module named 'evidently'