# Tutorial 18: Data Drift and Model Degradation

## Module 7: Monitoring and Infrastructure

## Learning Objectives

1. Understand different types of data drift (covariate shift, label shift, concept drift)
2. Implement statistical tests for drift detection (KS test, chi-square, PSI)
3. Build automated drift detection pipelines
4. Design model retraining strategies
5. Monitor model degradation over time

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from typing import Dict, List, Tuple, Optional, Any
from dataclasses import dataclass
from enum import Enum
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import warnings
# Silence all warnings so the tutorial output stays uncluttered.
warnings.filterwarnings('ignore')

# Shared plot style, and a fixed seed for NumPy's global RNG so the random
# examples below are repeatable when the cells run in order.
plt.style.use('seaborn-v0_8-whitegrid')
np.random.seed(42)
print('Setup complete!')

## 1. Types of Data Drift

| Type | Definition | Detection |
|------|------------|-----------|
| **Covariate Shift** | P(X) changes, P(Y\|X) constant | Feature distribution tests |
| **Label Shift** | P(Y) changes, P(X\|Y) constant | Label distribution monitoring |
| **Concept Drift** | P(Y\|X) changes | Model performance monitoring |

In [None]:
class DriftType(Enum):
    """The three kinds of drift covered in this tutorial.

    Each member's value is a snake_case label used when printing the catalog.
    """
    COVARIATE = 'covariate_shift'  # P(X) changes
    LABEL = 'label_shift'          # P(Y) changes
    CONCEPT = 'concept_drift'      # P(Y|X) changes

@dataclass
class DriftInfo:
    """Human-readable catalog entry describing one kind of drift."""
    # Which kind of drift this entry documents.
    drift_type: DriftType
    # One-sentence explanation of what changes.
    description: str
    # Short summary of how this drift is typically detected.
    detection_method: str

# Catalog of drift types, keyed by DriftType and built from
# (kind, description, detection method) rows in display order.
DRIFT_CATALOG = {
    kind: DriftInfo(kind, summary, method)
    for kind, summary, method in (
        (
            DriftType.COVARIATE,
            'Input feature distribution changes while relationship stays same',
            'Statistical tests on features (KS, PSI, chi-square)',
        ),
        (
            DriftType.LABEL,
            'Target distribution changes (class imbalance shifts)',
            'Monitor label distribution over time',
        ),
        (
            DriftType.CONCEPT,
            'Relationship between features and target changes',
            'Monitor model accuracy, compare to holdout',
        ),
    )
}

# Print a heading, then one three-line entry per drift type.
print('TYPES OF DATA DRIFT')
print('=' * 60)
for dtype, info in DRIFT_CATALOG.items():
    print(
        f'\n{info.drift_type.value.upper()}',
        f'  {info.description}',
        f'  Detection: {info.detection_method}',
        sep='\n',
    )

In [None]:
# Visualize drift types
# One 2x2 figure: covariate shift, label shift, concept drift, and a
# performance-over-time curve. All data here is synthetic and drawn from
# NumPy's global RNG, so the statement order determines the exact plots.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Covariate Shift
# Production inputs are drawn with a shifted mean (0 -> 1.5) and a wider
# spread (1 -> 1.2) than the training inputs.
ax = axes[0, 0]
x_train = np.random.normal(0, 1, 1000)
x_prod = np.random.normal(1.5, 1.2, 1000)
ax.hist(x_train, bins=50, alpha=0.5, label='Training', color='blue', density=True)
ax.hist(x_prod, bins=50, alpha=0.5, label='Production', color='red', density=True)
ax.set_title('Covariate Shift: P(X) Changes')
ax.legend()

# Label Shift
# Hard-coded class proportions: 90/10 in training versus 70/30 in production.
ax = axes[0, 1]
ax.bar(['Train 0', 'Train 1', 'Prod 0', 'Prod 1'], [0.9, 0.1, 0.7, 0.3],
       color=['blue', 'blue', 'red', 'red'], alpha=0.7)
ax.set_title('Label Shift: P(Y) Changes')

# Concept Drift
# Same x values, but the linear relationship changes from y = 2x + 1 to
# y = 0.5x + 5 (each with unit-variance Gaussian noise).
ax = axes[1, 0]
x = np.linspace(0, 10, 100)
ax.scatter(x, 2*x + 1 + np.random.normal(0, 1, 100), alpha=0.5, label='Training', color='blue')
ax.scatter(x, 0.5*x + 5 + np.random.normal(0, 1, 100), alpha=0.5, label='Production', color='red')
ax.set_title('Concept Drift: P(Y|X) Changes')
ax.legend()

# Performance over time
# Slow linear decay plus an abrupt 0.1 drop for time steps after 50, with
# Gaussian noise on top.
ax = axes[1, 1]
time = np.arange(100)
perf = 0.95 - 0.002*time - 0.1*(time > 50) + np.random.normal(0, 0.02, 100)
ax.plot(time, perf, color='green')
ax.axvline(50, color='red', linestyle='--', label='Drift Point')
ax.set_title('Model Performance Over Time')
ax.legend()

plt.tight_layout()
plt.show()

## 2. Statistical Tests for Drift Detection

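- **KS Test** (Kolmogorov-Smirnov): Continuous features
- **Chi-Square**: Categorical features
- **PSI** (Population Stability Index): Binned comparison of two distributions; a widely used industry measure of drift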

In [None]:
class DriftDetector:
    """Two-sample drift tests comparing a reference sample with a current one.

    Args:
        significance_level: p-value below which the KS and chi-square tests
            report drift.
        psi_moderate: PSI at or above which drift is reported (severity
            'medium').
        psi_severe: PSI at or above which the severity becomes 'high'.
    """

    def __init__(self, significance_level: float = 0.05,
                 psi_moderate: float = 0.1, psi_severe: float = 0.2):
        self.significance_level = significance_level
        self.psi_moderate = psi_moderate
        self.psi_severe = psi_severe

    @staticmethod
    def _as_sample(values, name: str) -> np.ndarray:
        """Return *values* as a flat array, rejecting empty samples."""
        sample = np.asarray(values).ravel()
        if sample.size == 0:
            raise ValueError(f'{name} sample must not be empty')
        return sample

    def ks_test(self, reference: np.ndarray, current: np.ndarray) -> Dict:
        """Kolmogorov-Smirnov test for continuous features.

        Returns:
            Dict with 'test', 'statistic', 'p_value' and 'drift_detected'
            (True when the p-value is below ``significance_level``).

        Raises:
            ValueError: If either sample is empty.
        """
        reference = self._as_sample(reference, 'reference')
        current = self._as_sample(current, 'current')
        statistic, p_value = stats.ks_2samp(reference, current)
        return {
            'test': 'ks_test',
            'statistic': float(statistic),
            'p_value': float(p_value),
            'drift_detected': bool(p_value < self.significance_level)
        }

    def psi(self, reference: np.ndarray, current: np.ndarray, bins: int = 10) -> Dict:
        """Population Stability Index.

        Bin edges are the reference sample's percentiles, so each bin holds
        roughly the same share of the reference data.

        Returns:
            Dict with 'test', 'statistic' (the PSI), 'drift_detected'
            (PSI >= ``psi_moderate``) and 'severity' ('low'/'medium'/'high').

        Raises:
            ValueError: If either sample is empty or ``bins`` is below 1.
        """
        reference = self._as_sample(reference, 'reference')
        current = self._as_sample(current, 'current')
        if bins < 1:
            raise ValueError('bins must be a positive integer')

        bin_edges = np.percentile(reference, np.linspace(0, 100, bins + 1))
        # Open the outer bins so current values outside the reference range
        # are still counted.
        bin_edges[0], bin_edges[-1] = -np.inf, np.inf

        ref_counts, _ = np.histogram(reference, bins=bin_edges)
        cur_counts, _ = np.histogram(current, bins=bin_edges)

        # Clip away zero proportions; log(0) and division by zero would
        # otherwise make the index infinite or undefined.
        ref_pct = np.clip(ref_counts / reference.size, 0.0001, 1)
        cur_pct = np.clip(cur_counts / current.size, 0.0001, 1)

        psi_value = float(np.sum((cur_pct - ref_pct) * np.log(cur_pct / ref_pct)))

        if psi_value < self.psi_moderate:
            severity = 'low'
        elif psi_value < self.psi_severe:
            severity = 'medium'
        else:
            severity = 'high'
        return {
            'test': 'psi',
            'statistic': psi_value,
            'drift_detected': psi_value >= self.psi_moderate,
            'severity': severity
        }

    def chi_square_test(self, reference: np.ndarray, current: np.ndarray) -> Dict:
        """Chi-square test for categorical features.

        Expected counts come from the reference proportions scaled to the
        size of the current sample.

        Returns:
            Dict with 'test', 'statistic', 'p_value' and 'drift_detected'
            (True when the p-value is below ``significance_level``).

        Raises:
            ValueError: If either sample is empty.
        """
        reference = self._as_sample(reference, 'reference')
        current = self._as_sample(current, 'current')

        categories = np.unique(np.concatenate([reference, current]))
        ref_counts = np.array([np.count_nonzero(reference == c) for c in categories])
        cur_counts = np.array([np.count_nonzero(current == c) for c in categories])

        # Floor expected counts at 1 so categories unseen in the reference do
        # not produce a zero denominator ...
        expected = np.maximum((ref_counts / reference.size) * current.size, 1.0)
        # ... then rescale so the expected total matches the observed total
        # again; scipy's chisquare rejects inputs whose sums disagree.
        expected *= cur_counts.sum() / expected.sum()
        statistic, p_value = stats.chisquare(cur_counts, expected)

        return {
            'test': 'chi_square',
            'statistic': float(statistic),
            'p_value': float(p_value),
            'drift_detected': bool(p_value < self.significance_level)
        }

# Shared detector instance, built with the constructor defaults, reused by the
# demonstration cells below.
detector = DriftDetector()
print('DriftDetector created!')

In [None]:
# Demonstrate drift detection
# Re-seed so the four samples below are reproducible regardless of which
# earlier cells were executed.
np.random.seed(42)
reference = np.random.normal(50, 10, 1000)
no_drift = np.random.normal(50, 10, 1000)      # same distribution as reference
mild_drift = np.random.normal(55, 10, 1000)    # mean shifted by +5
severe_drift = np.random.normal(65, 15, 1000)  # mean shifted by +15, wider spread

print('DRIFT DETECTION RESULTS')
print('=' * 60)

# Compare each scenario against the reference with both the KS test and PSI.
for name, data in [('No Drift', no_drift), ('Mild Drift', mild_drift), ('Severe Drift', severe_drift)]:
    ks = detector.ks_test(reference, data)
    psi = detector.psi(reference, data)
    print(f'\n{name}:')
    print(f'  KS: {ks["statistic"]:.4f}, drift={ks["drift_detected"]}')
    print(f'  PSI: {psi["statistic"]:.4f} ({psi["severity"]})')

In [None]:
# Visualize drift detection
# 2x3 figure: the top row overlays each scenario on the reference histogram;
# the bottom row compares PSI values, KS statistics, and shows a PSI legend.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
datasets = [('No Drift', no_drift), ('Mild Drift', mild_drift), ('Severe Drift', severe_drift)]

for idx, (name, data) in enumerate(datasets):
    ax = axes[0, idx]
    ax.hist(reference, bins=50, alpha=0.5, label='Reference', color='blue', density=True)
    ax.hist(data, bins=50, alpha=0.5, label='Current', color='red', density=True)
    psi_val = detector.psi(reference, data)['statistic']
    ax.set_title(f'{name} (PSI: {psi_val:.3f})')
    ax.legend()

# PSI comparison
# Bars are coloured by the same 0.1 / 0.2 cut-offs drawn as dashed lines.
ax = axes[1, 0]
psi_vals = [detector.psi(reference, d)['statistic'] for _, d in datasets]
colors = ['green' if p < 0.1 else 'orange' if p < 0.2 else 'red' for p in psi_vals]
ax.bar([n for n, _ in datasets], psi_vals, color=colors)
ax.axhline(0.1, color='orange', linestyle='--', label='Moderate')
ax.axhline(0.2, color='red', linestyle='--', label='Severe')
ax.set_title('PSI Comparison')
ax.legend()

# KS comparison
ax = axes[1, 1]
ks_vals = [detector.ks_test(reference, d)['statistic'] for _, d in datasets]
ax.bar([n for n, _ in datasets], ks_vals, color='steelblue')
ax.set_title('KS Statistic')

# Thresholds guide
# Static legend: three equal-length bars, each labelled with its interpretation.
ax = axes[1, 2]
thresholds = ['PSI < 0.1', '0.1 <= PSI < 0.2', 'PSI >= 0.2']
interpretations = ['No Drift', 'Moderate Drift', 'Severe Drift']
colors = ['green', 'orange', 'red']
ax.barh(thresholds, [1, 1, 1], color=colors)
ax.set_title('PSI Interpretation Guide')
ax.set_xlim(0, 1.5)
for i, interp in enumerate(interpretations):
    ax.text(1.1, i, interp, va='center')

plt.tight_layout()
plt.show()

## 3. Feature-Level Drift Detection

In [None]:
class FeatureDriftMonitor:
    """Run the KS test and PSI on each monitored feature of a DataFrame.

    Args:
        feature_names: Columns to compare between reference and current data.
    """

    def __init__(self, feature_names: List[str]):
        self.feature_names = feature_names
        self.detector = DriftDetector()
        self.reference_data = None

    def set_reference(self, df: pd.DataFrame):
        """Store a copy of the monitored columns of *df* as the reference."""
        self.reference_data = df[self.feature_names].copy()

    def detect_drift(self, current_df: pd.DataFrame) -> Dict[str, Dict]:
        """Compare each monitored feature of *current_df* with the reference.

        Returns:
            Mapping of feature name to a dict with 'ks_statistic', 'psi',
            'severity' (from PSI) and 'drift_detected' (True when either the
            KS test or PSI reports drift).

        Raises:
            ValueError: If ``set_reference`` has not been called yet.
            KeyError: If *current_df* lacks any monitored feature.
        """
        if self.reference_data is None:
            raise ValueError('Reference data not set; call set_reference() first')
        # Report every missing column at once instead of failing on the first.
        missing = [f for f in self.feature_names if f not in current_df.columns]
        if missing:
            raise KeyError(f'Current data is missing monitored features: {missing}')

        results = {}
        for feature in self.feature_names:
            ref = self.reference_data[feature].values
            cur = current_df[feature].values

            ks = self.detector.ks_test(ref, cur)
            psi = self.detector.psi(ref, cur)

            results[feature] = {
                'ks_statistic': ks['statistic'],
                'psi': psi['statistic'],
                'severity': psi['severity'],
                'drift_detected': ks['drift_detected'] or psi['drift_detected']
            }
        return results

# Create sample data
# Re-seed so both synthetic frames are reproducible.
np.random.seed(42)
n = 2000

# Training (reference) data.
train_df = pd.DataFrame({
    'age': np.random.normal(35, 10, n),
    'income': np.random.normal(50000, 20000, n),
    'amount': np.random.exponential(100, n)
})

# Production data: 'age' and 'amount' are drawn from shifted distributions,
# 'income' from the same distribution as in training.
prod_df = pd.DataFrame({
    'age': np.random.normal(40, 12, n),  # Drift
    'income': np.random.normal(50000, 20000, n),  # No drift
    'amount': np.random.exponential(150, n)  # Drift
})

monitor = FeatureDriftMonitor(['age', 'income', 'amount'])
monitor.set_reference(train_df)
results = monitor.detect_drift(prod_df)

# One status line per feature.
print('FEATURE DRIFT ANALYSIS')
print('=' * 50)
for feat, res in results.items():
    status = 'DRIFT' if res['drift_detected'] else 'OK'
    print(f"{feat}: [{status}] PSI={res['psi']:.3f} ({res['severity']})")

In [None]:
# Visualize feature drift
# One panel per feature: training vs production histograms, titled with the
# drift status and PSI taken from `results` computed in the previous cell.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for idx, feat in enumerate(['age', 'income', 'amount']):
    ax = axes[idx]
    ax.hist(train_df[feat], bins=50, alpha=0.5, label='Train', color='blue', density=True)
    ax.hist(prod_df[feat], bins=50, alpha=0.5, label='Prod', color='red', density=True)
    status = 'DRIFT' if results[feat]['drift_detected'] else 'OK'
    ax.set_title(f"{feat} [{status}] - PSI: {results[feat]['psi']:.3f}")
    ax.legend()

plt.tight_layout()
plt.show()

## 4. Model Performance Degradation

In [None]:
class PerformanceTracker:
    """Keep a history of metric readings and flag drops below a baseline.

    Args:
        baseline: Reference value per metric.
        thresholds: Largest tolerated drop per metric; metrics without an
            entry fall back to 0.05.
    """

    def __init__(self, baseline: Dict[str, float], thresholds: Dict[str, float]):
        self.baseline = baseline
        self.thresholds = thresholds
        self.history = []

    def record(self, timestamp: datetime, metrics: Dict[str, float]):
        """Append one history entry for *timestamp*.

        Every metric is stored as given. Metrics that have a baseline also
        get '<metric>_drop' (baseline minus value) and '<metric>_degraded'
        (True when the drop exceeds that metric's threshold).
        """
        snapshot = {'timestamp': timestamp}
        for name, reading in metrics.items():
            snapshot[name] = reading
            if name not in self.baseline:
                continue
            drop = self.baseline[name] - reading
            allowed = self.thresholds.get(name, 0.05)
            snapshot[f'{name}_drop'] = drop
            snapshot[f'{name}_degraded'] = drop > allowed
        self.history.append(snapshot)

    def get_report(self) -> str:
        """Return a text report for the most recent entry, or 'No data'."""
        if not self.history:
            return 'No data'

        newest = self.history[-1]
        report = ['DEGRADATION REPORT', '=' * 40]
        for name in self.baseline:
            # Baseline metrics absent from the newest entry are skipped.
            if name not in newest:
                continue
            if newest.get(f'{name}_degraded', False):
                status = 'DEGRADED'
            else:
                status = 'OK'
            report.append(f"{name}: {newest[name]:.3f} (baseline: {self.baseline[name]:.3f}) [{status}]")
        return '\n'.join(report)

# Simulate performance over time
# Baseline metric values and the largest drop tolerated for each.
baseline = {'accuracy': 0.92, 'f1': 0.88}
thresholds = {'accuracy': 0.03, 'f1': 0.05}
tracker = PerformanceTracker(baseline, thresholds)

# 90 daily readings: a slow linear decay (0.001 per day) plus an abrupt 0.1
# drop from day 46 onwards, with Gaussian noise. Values are floored at
# 0.75 (accuracy) and 0.70 (f1).
start = datetime(2024, 1, 1)
for day in range(90):
    degradation = 0.001 * day + (0.1 if day > 45 else 0)
    metrics = {
        'accuracy': max(0.92 - degradation + np.random.normal(0, 0.01), 0.75),
        'f1': max(0.88 - degradation * 1.2 + np.random.normal(0, 0.01), 0.70)
    }
    tracker.record(start + timedelta(days=day), metrics)

# The report reflects only the most recent (day 89) entry.
print(tracker.get_report())

In [None]:
# Visualize degradation
# One row per recorded day; columns are the keys of each history entry.
df = pd.DataFrame(tracker.history)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: both metrics over time.
ax = axes[0]
ax.plot(df['timestamp'], df['accuracy'], label='Accuracy', color='blue')
ax.plot(df['timestamp'], df['f1'], label='F1', color='green')
# 2024-02-15 is day 45 of the simulation, the last day before the step drop.
ax.axvline(datetime(2024, 2, 15), color='red', linestyle='--', label='Drift')
# Dotted line: lowest accuracy still within the allowed drop.
ax.axhline(baseline['accuracy'] - thresholds['accuracy'], color='blue', linestyle=':', alpha=0.5)
ax.set_title('Performance Over Time')
ax.legend()
ax.tick_params(axis='x', rotation=45)

# Right panel: accuracy against shaded acceptable / degraded bands. The
# degraded band is drawn down to 0.7, a fixed lower edge for the plot.
ax = axes[1]
ax.fill_between(df['timestamp'], baseline['accuracy'], baseline['accuracy'] - thresholds['accuracy'],
                alpha=0.3, color='green', label='Acceptable')
ax.fill_between(df['timestamp'], baseline['accuracy'] - thresholds['accuracy'], 0.7,
                alpha=0.3, color='red', label='Degraded')
ax.plot(df['timestamp'], df['accuracy'], color='blue', linewidth=2)
ax.set_title('Accuracy with Thresholds')
ax.legend()
ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 5. Model Retraining Strategies

In [None]:
class RetrainingStrategy(Enum):
    """Policies that decide when a model should be retrained."""
    SCHEDULED = 'scheduled'
    PERFORMANCE_TRIGGERED = 'performance_triggered'
    DRIFT_TRIGGERED = 'drift_triggered'

@dataclass
class RetrainingConfig:
    """Settings for one retraining policy.

    Only the field matching ``strategy`` is consulted: ``schedule_days`` for
    SCHEDULED, ``performance_threshold`` for PERFORMANCE_TRIGGERED and
    ``drift_threshold`` for DRIFT_TRIGGERED.
    """
    strategy: RetrainingStrategy
    schedule_days: Optional[int] = None
    performance_threshold: Optional[float] = None
    drift_threshold: Optional[float] = None

class RetrainingManager:
    """Decide whether to retrain and keep a log of retraining events."""

    def __init__(self, config: RetrainingConfig):
        self.config = config
        self.last_retrain = None
        self.history = []

    def _require(self, field: str):
        """Return the named config value, rejecting an unset (None) one."""
        value = getattr(self.config, field)
        if value is None:
            raise ValueError(
                f"RetrainingConfig.{field} must be set for the "
                f"'{self.config.strategy.value}' strategy"
            )
        return value

    def should_retrain(self, metrics: Dict, drift_scores: Dict,
                       current_date: datetime) -> Tuple[bool, str]:
        """Apply the configured strategy to the latest observations.

        Args:
            metrics: Metric name -> current value (PERFORMANCE_TRIGGERED).
            drift_scores: Feature name -> drift score (DRIFT_TRIGGERED).
            current_date: Evaluation date (SCHEDULED).

        Returns:
            ``(decision, reason)``; the reason names the first metric or
            feature that crossed its threshold.

        Raises:
            ValueError: If the parameter the active strategy needs was left
                unset in the config.
        """
        strategy = self.config.strategy

        if strategy == RetrainingStrategy.SCHEDULED:
            # A manager with no recorded retrain always asks for one first.
            if self.last_retrain is None:
                return True, 'Initial training'
            interval = self._require('schedule_days')
            days = (current_date - self.last_retrain).days
            if days >= interval:
                return True, f'Scheduled ({days} days)'

        elif strategy == RetrainingStrategy.PERFORMANCE_TRIGGERED:
            floor = self._require('performance_threshold')
            for metric, value in metrics.items():
                if value < floor:
                    return True, f'Performance ({metric}={value:.3f})'

        elif strategy == RetrainingStrategy.DRIFT_TRIGGERED:
            ceiling = self._require('drift_threshold')
            for feature, score in drift_scores.items():
                if score > ceiling:
                    return True, f'Drift in {feature} (PSI={score:.3f})'

        return False, 'No retraining needed'

    def record_retrain(self, date: datetime, reason: str, before: Dict, after: Dict):
        """Log a retraining event and remember *date* as the latest retrain."""
        self.last_retrain = date
        self.history.append({'date': date, 'reason': reason, 'before': before, 'after': after})

# Demo
# Performance-triggered policy: retrain when any metric falls below 0.85.
config = RetrainingConfig(RetrainingStrategy.PERFORMANCE_TRIGGERED, performance_threshold=0.85)
manager = RetrainingManager(config)
manager.last_retrain = datetime(2024, 1, 1)

# Accuracy 0.82 is below the 0.85 threshold; no drift scores are supplied.
should, reason = manager.should_retrain({'accuracy': 0.82}, {}, datetime(2024, 2, 1))
print(f'Should retrain: {should}')
print(f'Reason: {reason}')

In [None]:
# Simulate retraining decisions
# Reuses the `manager` created in the previous cell.
manager.last_retrain = datetime(2024, 1, 1)
decisions = []

for day in range(90):
    date = datetime(2024, 1, 1) + timedelta(days=day)
    # Accuracy is a function of `day` only (linear decay plus a 0.1 step after
    # day 45, floored at 0.75); a recorded retrain does not reset it.
    accuracy = max(0.92 - 0.001 * day - (0.1 if day > 45 else 0), 0.75)
    
    should, reason = manager.should_retrain({'accuracy': accuracy}, {}, date)
    
    # Cool-down: record a retrain only if none has been recorded yet or the
    # last recorded one is more than 7 days old.
    if should and (not manager.history or (date - manager.history[-1]['date']).days > 7):
        manager.record_retrain(date, reason, {'accuracy': accuracy}, {'accuracy': 0.92})
    
    # 'retrain' stores the raw decision, not whether a retrain was recorded.
    decisions.append({'date': date, 'accuracy': accuracy, 'retrain': should})

print('RETRAINING EVENTS')
print('=' * 50)
for event in manager.history:
    print(f"{event['date'].strftime('%Y-%m-%d')}: {event['reason']}")
    print(f"  Accuracy: {event['before']['accuracy']:.3f} -> {event['after']['accuracy']:.3f}")

## 6. Complete Drift Detection Pipeline

In [None]:
class DriftPipeline:
    """Feature-drift and label-drift checks combined into one report.

    Args:
        features: Columns to monitor for feature drift.
        psi_threshold: PSI at or above which a feature raises an alert.
        label_shift_threshold: Absolute change in mean label above which a
            label-drift alert is raised.
    """

    def __init__(self, features: List[str], psi_threshold: float = 0.15,
                 label_shift_threshold: float = 0.05):
        self.monitor = FeatureDriftMonitor(features)
        self.psi_threshold = psi_threshold
        self.label_shift_threshold = label_shift_threshold
        self.reference_labels = None
        self.ref_positive_rate = None

    def set_reference(self, df: pd.DataFrame, labels: np.ndarray):
        """Store the reference features and labels (and their mean)."""
        self.monitor.set_reference(df)
        self.reference_labels = labels
        self.ref_positive_rate = np.mean(labels)

    def analyze(self, df: pd.DataFrame, labels: Optional[np.ndarray] = None) -> Dict:
        """Check *df* (and optionally *labels*) against the reference.

        Returns:
            Dict with 'timestamp', 'feature_drift' (per-feature results from
            the monitor), 'alerts' (list of messages) and, when labels are
            available for both reference and current data, 'label_drift'.
        """
        results = {'timestamp': datetime.now(), 'feature_drift': {}, 'alerts': []}

        # Feature drift: every feature's result is kept, but only features
        # whose PSI reaches the configured threshold raise an alert.
        feature_results = self.monitor.detect_drift(df)
        for feat, res in feature_results.items():
            results['feature_drift'][feat] = res
            if res['psi'] >= self.psi_threshold:
                results['alerts'].append(f'Feature drift: {feat} (PSI={res["psi"]:.3f})')

        # Label drift: compare the mean label with the reference mean.
        if labels is not None and self.reference_labels is not None:
            cur_rate = np.mean(labels)
            shift = abs(cur_rate - self.ref_positive_rate)
            results['label_drift'] = {'ref': self.ref_positive_rate, 'cur': cur_rate, 'shift': shift}
            if shift > self.label_shift_threshold:
                results['alerts'].append(f'Label drift: {self.ref_positive_rate:.2f} -> {cur_rate:.2f}')

        return results

    def report(self, results: Dict) -> str:
        """Format the dict returned by ``analyze`` as a multi-line report."""
        lines = ['=' * 50, 'DRIFT DETECTION REPORT', '=' * 50]
        lines.append('\nFeature Drift:')
        for feat, res in results['feature_drift'].items():
            status = 'DRIFT' if res['drift_detected'] else 'OK'
            lines.append(f"  {feat}: [{status}] PSI={res['psi']:.3f}")

        if 'label_drift' in results:
            ld = results['label_drift']
            lines.append(f"\nLabel Drift: {ld['ref']:.2f} -> {ld['cur']:.2f} (shift: {ld['shift']:.2f})")

        lines.append('\nAlerts:')
        if results['alerts']:
            lines.extend(f"  [!] {alert}" for alert in results['alerts'])
        else:
            lines.append('  No alerts')

        return '\n'.join(lines)

# Run pipeline
# Reuses train_df / prod_df from the feature-drift cell. Labels are synthetic:
# about 10% positives for the reference and about 15% for the current data.
pipeline = DriftPipeline(['age', 'income', 'amount'])
ref_labels = np.random.choice([0, 1], 2000, p=[0.9, 0.1])
pipeline.set_reference(train_df, ref_labels)

cur_labels = np.random.choice([0, 1], 2000, p=[0.85, 0.15])
results = pipeline.analyze(prod_df, cur_labels)
print(pipeline.report(results))

## 7. Hands-on Exercise

Build a fraud detection drift monitor:
1. Generate synthetic transaction data with drift
2. Train a classifier and track performance
3. Implement automatic drift alerts

In [None]:
class FraudDriftMonitor:
    """Train a fraud classifier, then watch new data for drift and accuracy loss.

    Args:
        accuracy_drop_threshold: Drop below the baseline accuracy that
            triggers an accuracy alert.
    """

    def __init__(self, accuracy_drop_threshold: float = 0.05):
        self.model = None
        self.reference = None
        self.detector = DriftDetector()
        self.baseline_acc = None
        self.accuracy_drop_threshold = accuracy_drop_threshold

    def train(self, X: pd.DataFrame, y: np.ndarray):
        """Fit the classifier and store *X* as the drift reference.

        The baseline accuracy is the forest's out-of-bag score rather than
        its accuracy on the training rows: in-sample accuracy is
        optimistically biased and would make later readings look like a
        degradation even without any drift.
        """
        self.reference = X.copy()
        self.model = RandomForestClassifier(n_estimators=50, random_state=42, oob_score=True)
        self.model.fit(X, y)
        self.baseline_acc = self.model.oob_score_
        print(f'Trained. Baseline accuracy: {self.baseline_acc:.3f}')

    def monitor(self, X: pd.DataFrame, y: np.ndarray) -> Dict:
        """Check *X* for per-feature drift and *y* for an accuracy drop.

        Returns:
            Dict with 'drift' (list of {'feature', 'psi'}), 'accuracy' and
            'alerts' (list of messages).

        Raises:
            RuntimeError: If ``train`` has not been called yet.
        """
        if self.model is None or self.reference is None:
            raise RuntimeError('Model not trained; call train() before monitor()')

        results = {'drift': [], 'accuracy': None, 'alerts': []}

        # Check drift per feature
        for col in self.reference.columns:
            psi = self.detector.psi(self.reference[col].values, X[col].values)
            results['drift'].append({'feature': col, 'psi': psi['statistic']})
            if psi['drift_detected']:
                results['alerts'].append(f'Drift in {col}: PSI={psi["statistic"]:.3f}')

        # Check accuracy
        pred = self.model.predict(X)
        results['accuracy'] = accuracy_score(y, pred)
        if results['accuracy'] < self.baseline_acc - self.accuracy_drop_threshold:
            results['alerts'].append(f'Accuracy drop: {self.baseline_acc:.3f} -> {results["accuracy"]:.3f}')

        return results

# Generate data
# Re-seed so the synthetic transactions are reproducible.
np.random.seed(42)
n = 1000
X_train = pd.DataFrame({
    'amount': np.random.exponential(100, n),
    'hour': np.random.randint(0, 24, n),
    'distance': np.random.exponential(50, n)
})
# Label is 1 when amount exceeds 150, OR-ed with a random 5% of rows.
y_train = (X_train['amount'] > 150).astype(int) | (np.random.random(n) < 0.05)

# Train
# NOTE: this rebinds `monitor`, which earlier held the FeatureDriftMonitor.
monitor = FraudDriftMonitor()
monitor.train(X_train, y_train)

# Production data with drift
X_prod = pd.DataFrame({
    'amount': np.random.exponential(150, n),  # Drift!
    'hour': np.random.randint(0, 24, n),
    'distance': np.random.exponential(50, n)
})
# Same labelling rule, but the random share of positives doubles to 10%.
y_prod = (X_prod['amount'] > 150).astype(int) | (np.random.random(n) < 0.1)

# NOTE: this rebinds `results` from the pipeline cell above.
results = monitor.monitor(X_prod, y_prod)
print('\nMonitoring Results:')
print(f'Accuracy: {results["accuracy"]:.3f}')
print('\nAlerts:')
for alert in results['alerts']:
    print(f'  [!] {alert}')

## 8. Summary

### Key Concepts

1. **Types of Drift**: Covariate (P(X) changes), Label (P(Y) changes), Concept (P(Y|X) changes)
2. **Statistical Tests**: KS test for continuous features, chi-square for categorical features, and PSI as a widely used industry measure of drift
3. **PSI Thresholds**: <0.1 (no drift), 0.1 to <0.2 (moderate), ≥0.2 (severe)
4. **Retraining Strategies**: Scheduled, performance-triggered, drift-triggered

### Best Practices

- Monitor ALL features, not just model inputs
- Use multiple detection methods (KS + PSI)
- Set up alerts before drift becomes critical
- Automate retraining pipelines
- Keep baseline/reference data versioned

### Next Steps

In the next module, we will explore **ML Infrastructure** including experiment tracking, feature stores, and MLOps practices.