### Detect Data Drift in ML Models
**Objective**: Monitor and detect changes in data distributions that impact ML model performance.

**Task**: Feature Correlation Drift

**Steps**:
1. Compute the correlation matrix of features in your training dataset.
2. Compute the correlation matrix of the same features in your production data.
3. Assess changes in the correlation matrix over time to identify any significant deviations.
4. Investigate any significant changes in correlation as they may indicate issues in the data collection process or model assumptions.

In [3]:
# write your code from here
import pandas as pd
import numpy as np

# === Step 1: Load Training and Production Data ===

def load_training_data():
    return pd.DataFrame({
        'feature_1': [1, 2, 3, 4, 5],
        'feature_2': [5, 4, 3, 2, 1],
        'feature_3': [2, 3, 4, 5, 6]
    })

def load_production_data():
    return pd.DataFrame({
        'feature_1': [2, 3, 4, 5, 6],
        'feature_2': [4, 3, 2, 1, 0],
        'feature_3': [3, 4, 5, 6, 7]
    })

# === Step 2: Validate Input DataFrames ===

def validate_features(df, required_features):
    if df.empty:
        raise ValueError("DataFrame is empty.")
    missing = [f for f in required_features if f not in df.columns]
    if missing:
        raise ValueError(f"Missing features: {missing}")
    # Optional: Check numeric dtype
    non_numeric = [f for f in required_features if not pd.api.types.is_numeric_dtype(df[f])]
    if non_numeric:
        raise ValueError(f"Non-numeric features found: {non_numeric}")
    return True

# === Step 3: Compute Correlation Matrices ===

def compute_correlation_matrix(df, features):
    return df[features].corr()

# === Step 4: Measure Correlation Drift ===

def correlation_drift_score(corr_train, corr_prod):
    # Compute absolute difference matrix
    diff = (corr_train - corr_prod).abs()
    # Sum of all differences (excluding diagonal)
    np.fill_diagonal(diff.values, 0)  # ignore diagonal
    drift_score = diff.values.sum() / 2  # symmetric matrix, so divide by 2
    return drift_score

# === Step 5: Run Detection ===

try:
    train_df = load_training_data()
    prod_df = load_production_data()

    features = ['feature_1', 'feature_2', 'feature_3']

    validate_features(train_df, features)
    validate_features(prod_df, features)

    corr_train = compute_correlation_matrix(train_df, features)
    corr_prod = compute_correlation_matrix(prod_df, features)

    drift_score = correlation_drift_score(corr_train, corr_prod)

    print("✅ Training Correlation Matrix:")
    print(corr_train)
    print("\n✅ Production Correlation Matrix:")
    print(corr_prod)
    print(f"\n🔍 Correlation Drift Score: {drift_score:.4f}")

    threshold = 0.5  # define threshold based on domain knowledge
    if drift_score > threshold:
        print("🚨 Significant correlation drift detected! Investigate data or model assumptions.")
    else:
        print("✅ No significant correlation drift detected.")

except Exception as e:
    print(f"❌ Error: {e}")


✅ Training Correlation Matrix:
           feature_1  feature_2  feature_3
feature_1        1.0       -1.0        1.0
feature_2       -1.0        1.0       -1.0
feature_3        1.0       -1.0        1.0

✅ Production Correlation Matrix:
           feature_1  feature_2  feature_3
feature_1        1.0       -1.0        1.0
feature_2       -1.0        1.0       -1.0
feature_3        1.0       -1.0        1.0

🔍 Correlation Drift Score: 0.0000
✅ No significant correlation drift detected.


In [4]:
import unittest

class TestCorrelationDrift(unittest.TestCase):

    def setUp(self):
        self.train_df = pd.DataFrame({
            'feature_1': [1, 2, 3, 4, 5],
            'feature_2': [5, 4, 3, 2, 1],
            'feature_3': [2, 3, 4, 5, 6]
        })
        self.prod_df_similar = pd.DataFrame({
            'feature_1': [2, 3, 4, 5, 6],
            'feature_2': [4, 3, 2, 1, 0],
            'feature_3': [3, 4, 5, 6, 7]
        })
        self.prod_df_drifted = pd.DataFrame({
            'feature_1': [10, 20, 30, 40, 50],
            'feature_2': [50, 40, 30, 20, 10],
            'feature_3': [100, 200, 300, 400, 500]
        })

    def test_validate_features_success(self):
        self.assertTrue(validate_features(self.train_df, ['feature_1', 'feature_2', 'feature_3']))

    def test_validate_features_missing_column(self):
        df = pd.DataFrame({'feature_1': [1, 2]})
        with self.assertRaises(ValueError):
            validate_features(df, ['feature_1', 'feature_2'])

    def test_correlation_drift_score_small(self):
        corr_train = compute_correlation_matrix(self.train_df, ['feature_1', 'feature_2', 'feature_3'])
        corr_prod = compute_correlation_matrix(self.prod_df_similar, ['feature_1', 'feature_2', 'feature_3'])
        score = correlation_drift_score(corr_train, corr_prod)
        self.assertLess(score, 0.5)

    def test_correlation_drift_score_large(self):
        corr_train = compute_correlation_matrix(self.train_df, ['feature_1', 'feature_2', 'feature_3'])
        corr_prod = compute_correlation_matrix(self.prod_df_drifted, ['feature_1', 'feature_2', 'feature_3'])
        score = correlation_drift_score(corr_train, corr_prod)
        self.assertGreater(score, 0.5)

if __name__ == "__main__":
    unittest.main(argv=[''], exit=False)


F...
FAIL: test_correlation_drift_score_large (__main__.TestCorrelationDrift)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_26263/3756738389.py", line 40, in test_correlation_drift_score_large
    self.assertGreater(score, 0.5)
AssertionError: 0.0 not greater than 0.5

----------------------------------------------------------------------
Ran 4 tests in 0.013s

FAILED (failures=1)
