# Step 5: Model Monitoring

## Objective
Evaluate model performance by comparing predictions (`scores.csv`) with actual outcomes (`ground_truth.csv`).

## Prerequisites
- Completed Step 4: API Inference Scoring
- `scores.csv` file with model predictions
- `ground_truth.csv` file with actual labels

## Deliverables
- Model performance metrics (accuracy, precision, recall, F1, AUC)
- Confusion matrix analysis
- Business impact assessment
- Model drift detection
- `monitoring_report.json` with comprehensive metrics

---
## 1. Setup and Imports

In [None]:
# Install required packages (run once)
# !pip install pandas numpy scikit-learn matplotlib seaborn

In [None]:
import pandas as pd
import numpy as np
import json
from datetime import datetime
import os

# Metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report,
    roc_curve, precision_recall_curve
)

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Plot appearance: load matplotlib's 'seaborn-v0_8' style sheet first, then
# set seaborn's 'husl' palette (same order as before).
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

# Announce when this run started.
notebook_started_at = datetime.now()
print(f"Notebook started: {notebook_started_at:%Y-%m-%d %H:%M:%S}")

---
## 2. Configuration

In [None]:
# Input/output locations, relative to the notebook's working directory.
_DATA_DIR = "../data"
_REPORTS_DIR = "../reports"

SCORES_PATH = f"{_DATA_DIR}/scores.csv"                      # model predictions
GROUND_TRUTH_PATH = f"{_DATA_DIR}/ground_truth.csv"          # actual outcomes
REPORT_OUTPUT_PATH = f"{_REPORTS_DIR}/monitoring_report.json"  # JSON report

# Thresholds used by the drift-detection checks later in the notebook.
EXPECTED_ACCURACY = 0.70     # alert when accuracy falls below this value
EXPECTED_AUC = 0.65          # alert when ROC AUC falls below this value
MAX_PREDICTION_BIAS = 0.10   # max |predicted default rate - actual default rate|

print("Configuration loaded.")

---
## 3. Load Data

In [None]:
def _print_frame_summary(heading, frame):
    """Print a heading followed by the row count and column names of *frame*."""
    print(heading)
    print(f"  - Samples: {len(frame):,}")
    print(f"  - Columns: {list(frame.columns)}")


# Model predictions.
scores_df = pd.read_csv(SCORES_PATH)
_print_frame_summary("Loaded scores:", scores_df)

# Actual outcomes to compare the predictions against.
ground_truth_df = pd.read_csv(GROUND_TRUTH_PATH)
_print_frame_summary("\nLoaded ground truth:", ground_truth_df)

In [None]:
# Inner join on the shared 'ID' column: only IDs present in both files
# remain available for evaluation.
merged_df = scores_df.merge(ground_truth_df, how='inner', on='ID')

print("\nMerged dataset:")
print(f"  - Matched samples: {len(merged_df):,}")

# Rows without a prediction cannot be scored, so report them and drop them.
null_preds = merged_df['prediction'].isna().sum()
if null_preds > 0:
    print(f"  - WARNING: {null_preds} null predictions found")
    merged_df = merged_df.dropna(subset=['prediction'])
    print(f"  - After removing nulls: {len(merged_df):,} samples")

# Last expression of the cell: the notebook displays the first rows.
merged_df.head()

---
## 4. Classification Performance Metrics

In [None]:
# Pull the evaluation arrays out of the merged frame:
#   y_true - actual outcome, y_pred - hard prediction cast to int,
#   y_prob - predicted probability of default.
y_true = merged_df['actual_default'].values
y_pred = merged_df['prediction'].values.astype(int)
y_prob = merged_df['probability_default'].values

print("Data prepared for evaluation")
print(f"  - Total samples: {len(y_true):,}")
for caption, arr in (("Actual defaults", y_true), ("Predicted defaults", y_pred)):
    # Count of positives and their share of all samples.
    print(f"  - {caption}: {arr.sum():,} ({arr.mean():.2%})")

In [None]:
# Classification metrics computed from the hard predictions (y_pred).
# zero_division=0 makes a metric evaluate to 0 when its denominator is empty.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
# Specificity is the recall of the negative class (label 0).
specificity = recall_score(y_true, y_pred, pos_label=0, zero_division=0)

# ROC AUC is computed from the probabilities and needs both classes in the
# ground truth; roc_auc is None when it cannot be computed.
if len(np.unique(y_true)) > 1:
    roc_auc = roc_auc_score(y_true, y_prob)
else:
    roc_auc = None
    print("Warning: Only one class present in ground truth")

print("\n" + "=" * 60)
print("CLASSIFICATION METRICS")
print("=" * 60)
print(f"Accuracy:             {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Precision:            {precision:.4f}")
print(f"Recall (Sensitivity): {recall:.4f}")
print(f"Specificity:          {specificity:.4f}")
print(f"F1 Score:             {f1:.4f}")
# Compare against None explicitly: an AUC of exactly 0.0 is a valid (and
# alarming) value that a plain truthiness test would silently hide.
if roc_auc is not None:
    print(f"ROC AUC:              {roc_auc:.4f}")

---
## 5. Confusion Matrix Analysis

In [None]:
# Confusion matrix with the label order pinned to [0, 1]. Without `labels`,
# scikit-learn sizes the matrix by the labels that actually occur, so data
# containing a single class yields a 1x1 matrix and the four-way unpacking
# below would fail.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

print("\n" + "=" * 60)
print("CONFUSION MATRIX")
print("=" * 60)
print("\n                    Predicted")
print("                    No Default    Default")
print(f"Actual  No Default    {tn:>6}       {fp:>6}")
print(f"        Default       {fn:>6}       {tp:>6}")

# Error rates derived from the matrix; 0 when the class has no samples.
fpr = fp / (fp + tn) if (fp + tn) > 0 else 0  # share of non-defaults flagged as default
fnr = fn / (fn + tp) if (fn + tp) > 0 else 0  # share of defaults that were missed

print(f"\nTrue Negatives:  {tn:,}")
print(f"False Positives: {fp:,} (False Alarm Rate: {fpr:.4f})")
print(f"False Negatives: {fn:,} (Miss Rate: {fnr:.4f})")
print(f"True Positives:  {tp:,}")

In [None]:
# Draw the confusion matrix as an annotated heatmap and save it as a PNG.
cm_plot_path = '../reports/confusion_matrix.png'
# savefig does not create missing directories; make sure the target exists
# so this cell also works on a fresh checkout.
os.makedirs(os.path.dirname(cm_plot_path), exist_ok=True)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['No Default', 'Default'],
            yticklabels=['No Default', 'Default'],
            ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
ax.set_title('Confusion Matrix')
plt.tight_layout()
plt.savefig(cm_plot_path, dpi=150)
plt.show()
print("Confusion matrix saved to: reports/confusion_matrix.png")

---
## 6. ROC Curve and Precision-Recall Curve

In [None]:
# Side-by-side ROC curve (left) and precision-recall curve (right).
curves_plot_path = '../reports/roc_pr_curves.png'
# savefig does not create missing directories; make sure the target exists.
os.makedirs(os.path.dirname(curves_plot_path), exist_ok=True)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# ROC curve. roc_auc is None when the ground truth holds a single class;
# test against None so a legitimate AUC of 0.0 is still plotted.
if roc_auc is not None:
    fpr_curve, tpr_curve, _ = roc_curve(y_true, y_prob)
    axes[0].plot(fpr_curve, tpr_curve, label=f'ROC (AUC = {roc_auc:.4f})')
    axes[0].plot([0, 1], [0, 1], 'k--', label='Random')
    axes[0].set_xlabel('False Positive Rate')
    axes[0].set_ylabel('True Positive Rate')
    axes[0].set_title('ROC Curve')
    axes[0].legend(loc='lower right')
    axes[0].grid(True, alpha=0.3)
else:
    # Say why the panel is empty instead of leaving blank axes.
    axes[0].set_title('ROC Curve')
    axes[0].text(0.5, 0.5, 'ROC unavailable:\nonly one class in ground truth',
                 ha='center', va='center', transform=axes[0].transAxes)

# Precision-recall curve; the dashed baseline is the actual default rate.
precision_curve, recall_curve, _ = precision_recall_curve(y_true, y_prob)
axes[1].plot(recall_curve, precision_curve, label='Precision-Recall')
axes[1].axhline(y=y_true.mean(), color='k', linestyle='--', label=f'Baseline ({y_true.mean():.4f})')
axes[1].set_xlabel('Recall')
axes[1].set_ylabel('Precision')
axes[1].set_title('Precision-Recall Curve')
axes[1].legend(loc='upper right')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(curves_plot_path, dpi=150)
plt.show()
print("Curves saved to: reports/roc_pr_curves.png")

---
## 7. Prediction Distribution Analysis

In [None]:
# Summary statistics of the predicted default probabilities.
print("\n" + "=" * 60)
print("PREDICTION DISTRIBUTION")
print("=" * 60)

# (label, callable) pairs; each statistic is computed right before it is
# printed. Labels are left-aligned in a 20-character column.
stat_getters = (
    ("Mean probability:", y_prob.mean),
    ("Std probability:", y_prob.std),
    ("Median probability:", lambda: np.median(y_prob)),
    ("Min probability:", y_prob.min),
    ("Max probability:", y_prob.max),
)
for stat_label, compute in stat_getters:
    print(f"{stat_label:<20}{compute():.4f}")

# Selected percentiles of the probability distribution.
percentiles = [10, 25, 50, 75, 90, 95, 99]
print("\nPercentiles:")
for pct in percentiles:
    pct_value = np.percentile(y_prob, pct)
    print(f"  P{pct:2d}: {pct_value:.4f}")

In [None]:
# Left panel: predicted-probability histograms split by actual class.
# Right panel: calibration scatter (mean predicted probability vs. actual
# default rate per probability bin).
distribution_plot_path = '../reports/probability_distribution.png'
# savefig does not create missing directories; make sure the target exists.
os.makedirs(os.path.dirname(distribution_plot_path), exist_ok=True)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram by actual class
for label, name in [(0, 'No Default'), (1, 'Default')]:
    mask = y_true == label
    axes[0].hist(y_prob[mask], bins=30, alpha=0.6, label=name, density=True)
# Draw the threshold line BEFORE building the legend; the legend only lists
# artists that exist when legend() is called, so the old order left the
# 'Threshold (0.5)' entry out.
axes[0].axvline(x=0.5, color='red', linestyle='--', label='Threshold (0.5)')
axes[0].set_xlabel('Predicted Probability of Default')
axes[0].set_ylabel('Density')
axes[0].set_title('Probability Distribution by Actual Class')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Calibration plot: 10 equal-width bins over the observed probability range.
prob_bins = pd.cut(y_prob, bins=10)
calibration_df = pd.DataFrame({
    'prob': y_prob,
    'actual': y_true,
    'bin': prob_bins
})
# observed=True keeps only bins that contain samples. The result matches the
# previous groupby + dropna() and avoids relying on the default for
# categorical keys, which newer pandas versions warn about.
calibration = calibration_df.groupby('bin', observed=True).agg({
    'prob': 'mean',
    'actual': 'mean'
}).dropna()

axes[1].plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')
axes[1].scatter(calibration['prob'], calibration['actual'], s=100, label='Model')
axes[1].set_xlabel('Mean Predicted Probability')
axes[1].set_ylabel('Actual Default Rate')
axes[1].set_title('Model Calibration Plot')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(distribution_plot_path, dpi=150)
plt.show()
print("Distribution plots saved to: reports/probability_distribution.png")

---
## 8. Business Impact Analysis

In [None]:
# Business cost analysis.
# Assumption: a missed default (false negative) is costlier than an
# unnecessary action on a non-defaulting customer (false positive).
COST_FALSE_NEGATIVE = 500  # Cost of missing a default
COST_FALSE_POSITIVE = 100  # Cost of unnecessary action on non-default

n_samples = len(y_true)
n_actual_defaults = y_true.sum()

total_cost = (fn * COST_FALSE_NEGATIVE) + (fp * COST_FALSE_POSITIVE)
# fn / fp come from the confusion matrix as NumPy integers, so the product
# is a NumPy scalar; convert it to a built-in number so that it can later be
# written with json.dump (which rejects NumPy integer types).
if isinstance(total_cost, np.generic):
    total_cost = total_cost.item()
# Guard the division so an empty evaluation set does not raise.
avg_cost_per_sample = total_cost / n_samples if n_samples else 0.0

# Share of actual defaults that were caught; undefined without any defaults.
if n_actual_defaults > 0:
    detection_rate_text = f"{tp/n_actual_defaults*100:.2f}% of actual defaults caught"
else:
    detection_rate_text = "N/A (no actual defaults in sample)"

print("\n" + "=" * 60)
print("BUSINESS IMPACT ANALYSIS")
print("=" * 60)
print(f"Total samples:                  {n_samples:,}")
print(f"Actual defaults:                {n_actual_defaults:,} ({y_true.mean()*100:.2f}%)")
print(f"Predicted defaults:             {y_pred.sum():,} ({y_pred.mean()*100:.2f}%)")
print(f"\nCorrectly identified defaults:  {tp:,}")
print(f"Missed defaults (FN):           {fn:,}")
print(f"False alarms (FP):              {fp:,}")
print(f"\nCost Analysis:")
print(f"  Cost per missed default:      ${COST_FALSE_NEGATIVE:,}")
print(f"  Cost per false alarm:         ${COST_FALSE_POSITIVE:,}")
print(f"  Total cost:                   ${total_cost:,}")
print(f"  Average cost per sample:      ${avg_cost_per_sample:.2f}")
print(f"\nDefault Detection Rate:         {detection_rate_text}")

---
## 9. Model Drift Detection

In [None]:
# Threshold checks on the metrics computed above. Every failed check adds a
# "WARNING: ..." string to `alerts`, which the report and summary cells reuse.
print("\n" + "=" * 60)
print("MODEL DRIFT DETECTION")
print("=" * 60)

# Ceilings for the error-rate checks (previously inline literals).
MAX_FALSE_NEGATIVE_RATE = 0.3
MAX_FALSE_POSITIVE_RATE = 0.3

alerts = []


def _record_check(passed, failure_text, success_text):
    """Print the outcome of one check and collect an alert when it fails."""
    if passed:
        print(f"[OK] {success_text}")
    else:
        alert = f"WARNING: {failure_text}"
        alerts.append(alert)
        print(f"[!] {alert}")


# Check accuracy
_record_check(
    not (accuracy < EXPECTED_ACCURACY),
    f"Accuracy ({accuracy:.4f}) below threshold ({EXPECTED_ACCURACY})",
    f"Accuracy ({accuracy:.4f}) meets threshold ({EXPECTED_ACCURACY})",
)

# Check AUC. roc_auc is None when it could not be computed; compare against
# None explicitly so an AUC of exactly 0.0 still raises an alert instead of
# being skipped by a truthiness test.
if roc_auc is not None:
    _record_check(
        not (roc_auc < EXPECTED_AUC),
        f"ROC AUC ({roc_auc:.4f}) below threshold ({EXPECTED_AUC})",
        f"ROC AUC ({roc_auc:.4f}) meets threshold ({EXPECTED_AUC})",
    )
else:
    print("[--] ROC AUC check skipped (ROC AUC not available)")

# Check for prediction bias: gap between predicted and actual default rates.
prediction_bias = abs(y_pred.mean() - y_true.mean())
_record_check(
    not (prediction_bias > MAX_PREDICTION_BIAS),
    f"Prediction bias ({prediction_bias:.4f}) exceeds threshold ({MAX_PREDICTION_BIAS})",
    f"Prediction bias ({prediction_bias:.4f}) within threshold ({MAX_PREDICTION_BIAS})",
)

# Check error rates
_record_check(
    not (fnr > MAX_FALSE_NEGATIVE_RATE),
    f"High False Negative Rate ({fnr:.4f})",
    f"False Negative Rate ({fnr:.4f}) acceptable",
)
_record_check(
    not (fpr > MAX_FALSE_POSITIVE_RATE),
    f"High False Positive Rate ({fpr:.4f})",
    f"False Positive Rate ({fpr:.4f}) acceptable",
)

if not alerts:
    print("\nNo model drift detected. Performance within acceptable thresholds.")
else:
    print(f"\n{len(alerts)} alert(s) raised. Model may require retraining.")

---
## 10. Generate Comprehensive Report

In [None]:
# Compile all metrics into one JSON-serialisable dictionary and write it to
# REPORT_OUTPUT_PATH.
def _native(value):
    """Return a built-in Python number for a NumPy scalar; pass others through.

    json.dump cannot serialise NumPy integer scalars (e.g. the np.int64
    values derived from the confusion matrix), so they are converted first.
    """
    return value.item() if isinstance(value, np.generic) else value


monitoring_report = {
    'timestamp': datetime.now().isoformat(),
    'samples_evaluated': len(y_true),
    'classification_metrics': {
        'accuracy': float(accuracy),
        'precision': float(precision),
        'recall': float(recall),
        'f1_score': float(f1),
        'specificity': float(specificity),
        # None (JSON null) when ROC AUC could not be computed.
        'roc_auc': float(roc_auc) if roc_auc is not None else None,
        'false_positive_rate': float(fpr),
        'false_negative_rate': float(fnr)
    },
    'confusion_matrix': {
        'true_negatives': int(tn),
        'false_positives': int(fp),
        'false_negatives': int(fn),
        'true_positives': int(tp)
    },
    'distribution': {
        'mean_probability': float(y_prob.mean()),
        'std_probability': float(y_prob.std()),
        'median_probability': float(np.median(y_prob)),
        'min_probability': float(y_prob.min()),
        'max_probability': float(y_prob.max()),
        'actual_default_rate': float(y_true.mean()),
        'predicted_default_rate': float(y_pred.mean())
    },
    'business_metrics': {
        'total_samples': len(y_true),
        'actual_defaults': int(y_true.sum()),
        'predicted_defaults': int(y_pred.sum()),
        'correctly_identified_defaults': int(tp),
        'missed_defaults': int(fn),
        'false_alarms': int(fp),
        'cost_per_missed_default': COST_FALSE_NEGATIVE,
        'cost_per_false_alarm': COST_FALSE_POSITIVE,
        # total_cost is built from NumPy integer counts; without conversion
        # json.dump raises "Object of type int64 is not JSON serializable".
        'total_cost': _native(total_cost),
        'avg_cost_per_sample': _native(avg_cost_per_sample)
    },
    'drift_detection': {
        'alerts': alerts,
        'expected_accuracy': EXPECTED_ACCURACY,
        'expected_auc': EXPECTED_AUC,
        'max_prediction_bias': MAX_PREDICTION_BIAS
    }
}

# Save report. os.makedirs('') raises, so only create the directory when the
# configured path actually has a directory component.
report_dir = os.path.dirname(REPORT_OUTPUT_PATH)
if report_dir:
    os.makedirs(report_dir, exist_ok=True)
with open(REPORT_OUTPUT_PATH, 'w', encoding='utf-8') as f:
    json.dump(monitoring_report, f, indent=2)

print(f"\nMonitoring report saved to: {REPORT_OUTPUT_PATH}")

---
## 11. Summary

Print a final summary of all key findings.

In [None]:
# Final console summary of the values computed in the earlier cells.
banner = "=" * 60

print("\n" + banner)
print("MODEL MONITORING SUMMARY")
print(banner)
print(f"Report Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Samples Evaluated: {len(y_true):,}\n")

print("Key Performance Indicators:")
print(f"  Accuracy:        {accuracy:.4f}")
print(f"  Precision:       {precision:.4f}")
print(f"  Recall:          {recall:.4f}")
print(f"  F1 Score:        {f1:.4f}")
# roc_auc is None when it could not be computed. Compare against None so a
# valid AUC of exactly 0.0 is still reported.
if roc_auc is not None:
    print(f"  ROC AUC:         {roc_auc:.4f}")

print(f"\nError Analysis:")
print(f"  False Positives: {fp:,}")
print(f"  False Negatives: {fn:,}")
print(f"  Total Cost:      ${total_cost:,}")

print(f"\nDrift Detection:")
if alerts:
    for alert in alerts:
        print(f"  - {alert}")
else:
    print("  No drift detected")

print("\n" + banner)
print("Files Generated:")
print(f"  - {REPORT_OUTPUT_PATH}")
print("  - reports/confusion_matrix.png")
print("  - reports/roc_pr_curves.png")
print("  - reports/probability_distribution.png")
print(banner)

---
## Next Steps

You have successfully completed model monitoring. Proceed to **Step 6: Report Generation** to compile your findings into a comprehensive project report.

### Key Insights to Include in Your Report:
1. Model accuracy and classification metrics
2. Confusion matrix interpretation
3. Business impact and cost analysis
4. Any model drift alerts
5. Recommendations for model improvement