# Advanced Model Evaluation for Pathology Classification

## Learning Objectives
- Implement comprehensive evaluation metrics for medical AI
- Handle class imbalance with appropriate metrics
- Create ROC curves and precision-recall curves
- Perform statistical significance testing
- Generate detailed performance reports

## Prerequisites
- Completed the Classical ML tutorial
- Understanding of classification metrics
- Familiarity with medical AI evaluation standards

Let's master advanced evaluation techniques for pathology AI!

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_curve, auc,
    precision_recall_curve, average_precision_score, matthews_corrcoef,
    balanced_accuracy_score, cohen_kappa_score
)
from sklearn.model_selection import StratifiedKFold, permutation_test_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import label_binarize
import scipy.stats as stats
from itertools import cycle
import warnings
warnings.filterwarnings('ignore')

print("✅ Libraries imported successfully!")
print("📊 Ready for advanced model evaluation in pathology!")

In [None]:
# Generate realistic pathology evaluation scenario
def create_evaluation_dataset():
    """Build a synthetic, class-imbalanced pathology dataset.

    The global NumPy seed is fixed at 42, so repeated calls return identical
    data. For every sample, 100 three-dimensional Gaussian draws are flattened
    into a 300-element feature vector.

    Returns:
        tuple: ``(X, y, class_names)`` where ``X`` has shape ``(1000, 300)``,
        ``y`` holds the integer class id of each row (0, 1 or 2) and
        ``class_names`` lists the label for each class id.
    """
    np.random.seed(42)

    class_names = ['Normal', 'Benign', 'Malignant']

    # One entry per class id, in order: (sample count, mean vector, covariance)
    class_specs = [
        (
            600,
            [0.2, 0.3, 0.1],
            [[0.1, 0.02, 0.01], [0.02, 0.1, 0.01], [0.01, 0.01, 0.1]],
        ),
        (
            300,
            [0.5, 0.4, 0.3],
            [[0.15, 0.03, 0.02], [0.03, 0.15, 0.02], [0.02, 0.02, 0.15]],
        ),
        (
            100,
            [0.8, 0.7, 0.6],
            [[0.2, 0.04, 0.03], [0.04, 0.2, 0.03], [0.03, 0.03, 0.2]],
        ),
    ]

    feature_blocks = []
    for n_samples, mean, cov in class_specs:
        draws = np.random.multivariate_normal(
            mean=mean, cov=cov, size=(n_samples, 100)
        )
        # (n_samples, 100, 3) -> (n_samples, 300)
        feature_blocks.append(draws.reshape(n_samples, -1))

    X = np.vstack(feature_blocks)
    y = np.repeat(
        np.arange(len(class_specs)),
        [n_samples for n_samples, _, _ in class_specs],
    )

    return X, y, class_names

# Build the dataset and report its dimensions
X_eval, y_eval, class_names = create_evaluation_dataset()
n_eval_samples, n_eval_features = X_eval.shape[0], X_eval.shape[1]
print(f"📊 Evaluation dataset: {n_eval_samples} samples, {n_eval_features} features")

# Report how many samples (and what share of the total) each class contributes
unique, counts = np.unique(y_eval, return_counts=True)
n_labels = len(y_eval)
for class_id, count in zip(unique, counts):
    share = count / n_labels * 100
    print(f"   {class_names[class_id]}: {count} samples ({share:.1f}%)")

In [None]:
# Train baseline model for evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 30% of the samples, stratified on the class labels
X_train, X_test, y_train, y_test = train_test_split(
    X_eval,
    y_eval,
    test_size=0.3,
    stratify=y_eval,
    random_state=42,
)

# Standardize features with statistics learned from the training split only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the baseline Random Forest on the scaled training split
rf_model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Per-class probabilities and hard labels for the held-out split
y_pred_proba = rf_model.predict_proba(X_test_scaled)
y_pred = rf_model.predict(X_test_scaled)

print("✅ Baseline model trained and predictions generated")

In [None]:
# Comprehensive evaluation metrics
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


def comprehensive_evaluation(y_true, y_pred, y_pred_proba, class_names):
    """Print a multi-metric evaluation report and return the key metrics.

    Labels are expected to be integer-encoded as ``0 .. len(class_names) - 1``
    so that label ``i`` corresponds to ``class_names[i]``. The confusion matrix
    is always built over that full label range, so its rows and columns stay
    aligned with ``class_names`` even when a class is absent from the data.

    Args:
        y_true: True class labels (any array-like).
        y_pred: Predicted class labels (any array-like, same length).
        y_pred_proba: Predicted class probabilities. Accepted for interface
            compatibility; not used by this report.
        class_names: Display name for each class, indexed by class label.

    Returns:
        dict: ``accuracy``, ``balanced_accuracy``, ``kappa``, ``mcc``,
        ``confusion_matrix`` and ``per_class`` (a mapping from class name to
        its ``sensitivity``, ``specificity``, ``ppv`` and ``npv``).
    """
    # Accept lists / Series as well as arrays: element-wise == needs ndarrays
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    labels = np.arange(len(class_names))

    print("📊 COMPREHENSIVE MODEL EVALUATION REPORT")
    print("="*60)

    # Basic metrics
    accuracy = np.mean(y_true == y_pred)
    balanced_acc = balanced_accuracy_score(y_true, y_pred)
    kappa = cohen_kappa_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)

    print(f"\n🎯 OVERALL PERFORMANCE:")
    print(f"   Accuracy: {accuracy:.4f}")
    print(f"   Balanced Accuracy: {balanced_acc:.4f}")
    print(f"   Cohen's Kappa: {kappa:.4f}")
    print(f"   Matthews Correlation Coefficient: {mcc:.4f}")

    # Classification report (explicit labels keep it aligned with class_names)
    print(f"\n📋 DETAILED CLASSIFICATION REPORT:")
    report = classification_report(
        y_true, y_pred, labels=labels, target_names=class_names
    )
    print(report)

    # Per-class one-vs-rest metrics derived from the confusion matrix
    print(f"\n🎯 PER-CLASS PERFORMANCE:")
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    true_pos = np.diag(cm)
    false_pos = cm.sum(axis=0) - true_pos  # column total minus the diagonal
    false_neg = cm.sum(axis=1) - true_pos  # row total minus the diagonal
    true_neg = cm.sum() - true_pos - false_pos - false_neg

    per_class = {}
    for i, class_name in enumerate(class_names):
        TP, FP, FN, TN = true_pos[i], false_pos[i], false_neg[i], true_neg[i]

        sensitivity = _safe_ratio(TP, TP + FN)
        specificity = _safe_ratio(TN, TN + FP)
        ppv = _safe_ratio(TP, TP + FP)
        npv = _safe_ratio(TN, TN + FN)

        print(f"\n   {class_name}:")
        print(f"      Sensitivity (Recall): {sensitivity:.4f}")
        print(f"      Specificity: {specificity:.4f}")
        print(f"      PPV (Precision): {ppv:.4f}")
        print(f"      NPV: {npv:.4f}")

        per_class[class_name] = {
            'sensitivity': sensitivity,
            'specificity': specificity,
            'ppv': ppv,
            'npv': npv,
        }

    return {
        'accuracy': accuracy,
        'balanced_accuracy': balanced_acc,
        'kappa': kappa,
        'mcc': mcc,
        'confusion_matrix': cm,
        'per_class': per_class,
    }

# Run the full evaluation on the held-out test predictions; the returned
# metrics dict is later passed to validate_evaluation_quality
eval_results = comprehensive_evaluation(y_test, y_pred, y_pred_proba, class_names)

In [None]:
# ROC Curves for multi-class classification
def plot_multiclass_roc_curves(y_true, y_pred_proba, class_names):
    """Plot one-vs-rest ROC curves for every class plus the micro-average.

    Labels are expected to be integer-encoded as ``0 .. len(class_names) - 1``,
    with column ``i`` of ``y_pred_proba`` holding the score for class ``i``.
    The number of classes is taken from ``class_names`` rather than being
    fixed at three.

    Args:
        y_true: True class labels (array-like).
        y_pred_proba: Array-like of shape ``(n_samples, n_classes)`` with the
            predicted score of each class.
        class_names: Display name for each class, indexed by class label.

    Returns:
        dict: AUC per class keyed by integer class index, plus the
        micro-average AUC under the key ``"micro"``.

    Raises:
        ValueError: If ``y_pred_proba`` does not have one column per class.
    """
    n_classes = len(class_names)
    y_pred_proba = np.asarray(y_pred_proba)
    if y_pred_proba.ndim != 2 or y_pred_proba.shape[1] != n_classes:
        raise ValueError(
            f"y_pred_proba must have shape (n_samples, {n_classes}); "
            f"got {y_pred_proba.shape}"
        )

    # Binarize labels for one-vs-rest ROC analysis
    y_bin = label_binarize(y_true, classes=list(range(n_classes)))
    if n_classes == 2:
        # label_binarize returns a single column (the positive class) for
        # binary problems; expand it so there is one column per class
        y_bin = np.hstack([1 - y_bin, y_bin])

    # Compute ROC curve and AUC for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], y_pred_proba[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Micro-average: pool every (sample, class) decision into one binary task
    fpr["micro"], tpr["micro"], _ = roc_curve(y_bin.ravel(), y_pred_proba.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Plot
    plt.figure(figsize=(12, 8))
    colors = cycle(['blue', 'red', 'green', 'orange', 'purple'])

    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=2,
                label=f'{class_names[i]} (AUC = {roc_auc[i]:.3f})')

    plt.plot(fpr["micro"], tpr["micro"], color='gold', lw=2, linestyle='--',
            label=f'Micro-average (AUC = {roc_auc["micro"]:.3f})')

    # Diagonal reference line: performance of a random classifier
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Multi-class ROC Curves', fontsize=14, fontweight='bold')
    plt.legend(loc="lower right")
    plt.grid(alpha=0.3)
    plt.show()

    return roc_auc

# Plot ROC curves for the test predictions; the returned AUC dict is later
# passed to validate_evaluation_quality
roc_scores = plot_multiclass_roc_curves(y_test, y_pred_proba, class_names)

In [None]:
# Precision-Recall curves
def plot_precision_recall_curves(y_true, y_pred_proba, class_names):
    """Plot one-vs-rest precision-recall curves for every class.

    Labels are expected to be integer-encoded as ``0 .. len(class_names) - 1``,
    with column ``i`` of ``y_pred_proba`` holding the score for class ``i``.
    The number of classes is taken from ``class_names`` rather than being
    fixed at three.

    Args:
        y_true: True class labels (array-like).
        y_pred_proba: Array-like of shape ``(n_samples, n_classes)`` with the
            predicted score of each class.
        class_names: Display name for each class, indexed by class label.

    Returns:
        dict: Average precision per class, keyed by integer class index.

    Raises:
        ValueError: If ``y_pred_proba`` does not have one column per class.
    """
    n_classes = len(class_names)
    y_pred_proba = np.asarray(y_pred_proba)
    if y_pred_proba.ndim != 2 or y_pred_proba.shape[1] != n_classes:
        raise ValueError(
            f"y_pred_proba must have shape (n_samples, {n_classes}); "
            f"got {y_pred_proba.shape}"
        )

    y_bin = label_binarize(y_true, classes=list(range(n_classes)))
    if n_classes == 2:
        # label_binarize returns a single column (the positive class) for
        # binary problems; expand it so there is one column per class
        y_bin = np.hstack([1 - y_bin, y_bin])

    # Compute PR curve for each class
    precision = dict()
    recall = dict()
    avg_precision = dict()

    plt.figure(figsize=(12, 8))
    colors = cycle(['blue', 'red', 'green', 'orange', 'purple'])

    for i, color in zip(range(n_classes), colors):
        precision[i], recall[i], _ = precision_recall_curve(y_bin[:, i], y_pred_proba[:, i])
        avg_precision[i] = average_precision_score(y_bin[:, i], y_pred_proba[:, i])

        plt.plot(recall[i], precision[i], color=color, lw=2,
                label=f'{class_names[i]} (AP = {avg_precision[i]:.3f})')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Multi-class Precision-Recall Curves', fontsize=14, fontweight='bold')
    plt.legend(loc="lower left")
    plt.grid(alpha=0.3)
    plt.show()

    return avg_precision

# Plot Precision-Recall curves for the test predictions and keep the
# per-class average precision scores
pr_scores = plot_precision_recall_curves(y_test, y_pred_proba, class_names)

In [None]:
# Enhanced confusion matrix visualization
def plot_enhanced_confusion_matrix(y_true, y_pred, class_names):
    """Show the confusion matrix as raw counts and as row percentages.

    Labels are expected to be integer-encoded as ``0 .. len(class_names) - 1``.
    The matrix is always built over that full label range so the tick labels
    line up with ``class_names`` even when a class is missing from the data.
    A class with no true samples is shown as 0% (and 0 accuracy) instead of
    producing NaN from a division by zero.

    Args:
        y_true: True class labels (array-like).
        y_pred: Predicted class labels (array-like, same length).
        class_names: Display name for each class, indexed by class label.

    Returns:
        None. Two heatmaps are displayed and per-class accuracies are printed.
    """
    cm = confusion_matrix(y_true, y_pred, labels=np.arange(len(class_names)))

    # Row-normalize: each row (true class) sums to 1 unless the row is empty
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_fraction = np.divide(
        cm,
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals > 0,
    )
    cm_percent = cm_fraction * 100

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Raw counts
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names, ax=ax1)
    ax1.set_title('Confusion Matrix (Counts)', fontweight='bold')
    ax1.set_xlabel('Predicted Label')
    ax1.set_ylabel('True Label')

    # Percentages
    sns.heatmap(cm_percent, annot=True, fmt='.1f', cmap='Oranges',
                xticklabels=class_names, yticklabels=class_names, ax=ax2)
    ax2.set_title('Confusion Matrix (Percentages)', fontweight='bold')
    ax2.set_xlabel('Predicted Label')
    ax2.set_ylabel('True Label')

    plt.tight_layout()
    plt.show()

    # Per-class accuracy is the diagonal of the row-normalized matrix
    class_accuracies = cm_fraction.diagonal()
    print("\n🎯 Per-class Accuracies:")
    for class_name, acc in zip(class_names, class_accuracies):
        print(f"   {class_name}: {acc:.3f} ({acc*100:.1f}%)")

# Show the count and row-percentage confusion matrices for the test predictions
plot_enhanced_confusion_matrix(y_test, y_pred, class_names)

In [None]:
# Statistical significance testing
def statistical_significance_test(model, X, y, n_permutations=100):
    """Run a label-permutation test on the model's cross-validated accuracy.

    The score is computed with 5-fold cross-validation and a fixed random
    state of 42. The result is printed, compared against a 0.05 significance
    level, and the distribution of permuted scores is plotted with the
    observed score marked.

    Args:
        model: Estimator passed to ``permutation_test_score``.
        X: Feature matrix.
        y: Class labels.
        n_permutations: Number of label permutations to evaluate.

    Returns:
        tuple: ``(score, pvalue)`` as returned by ``permutation_test_score``.
    """
    print("🧪 Performing statistical significance test...")

    observed_score, null_scores, p = permutation_test_score(
        model,
        X,
        y,
        scoring="accuracy",
        cv=5,
        n_permutations=n_permutations,
        random_state=42,
    )

    print(f"   Model Accuracy: {observed_score:.4f}")
    print(f"   Permutation Test p-value: {p:.6f}")

    verdict = (
        "   ✅ Model performance is statistically significant!"
        if p < 0.05
        else "   ⚠️ Model performance is NOT statistically significant"
    )
    print(verdict)

    # Histogram of the permuted-label scores with the observed score overlaid
    plt.figure(figsize=(10, 6))
    plt.hist(null_scores, bins=20, alpha=0.7, color='lightblue', edgecolor='black')
    plt.axvline(
        observed_score,
        color='red',
        linestyle='--',
        linewidth=2,
        label=f'Model Score: {observed_score:.3f}',
    )
    plt.xlabel('Accuracy Score')
    plt.ylabel('Frequency')
    plt.title('Permutation Test Results', fontweight='bold')
    plt.legend()
    plt.grid(alpha=0.3)
    plt.show()

    return observed_score, p

# Perform the permutation significance test on the scaled test split,
# using the function's default of 100 permutations
model_score, p_value = statistical_significance_test(rf_model, X_test_scaled, y_test)

In [None]:
# Bootstrap confidence intervals
def bootstrap_confidence_intervals(y_true, y_pred, n_bootstrap=1000, confidence=0.95,
                                   random_state=None, plot=True):
    """Estimate a percentile-bootstrap confidence interval for accuracy.

    Samples are redrawn with replacement ``n_bootstrap`` times and the accuracy
    of each resample is recorded; the interval is read off the percentiles of
    those accuracies.

    Args:
        y_true: True class labels. Any 1-D array-like (list, ndarray, pandas
            Series); it is converted to an ndarray so indexing is positional.
        y_pred: Predicted class labels, same shape as ``y_true``.
        n_bootstrap: Number of bootstrap resamples (at least 1).
        confidence: Confidence level, strictly between 0 and 1.
        random_state: Optional integer seed for a reproducible result. When
            ``None`` the global NumPy random state is used.
        plot: When ``True`` the bootstrap distribution is plotted; pass
            ``False`` to skip plotting.

    Returns:
        tuple: ``(mean_accuracy, lower_bound, upper_bound)``.

    Raises:
        ValueError: If the inputs are empty or differ in shape, if
            ``n_bootstrap`` is below 1, or if ``confidence`` is outside (0, 1).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape; "
            f"got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.ndim != 1 or y_true.size == 0:
        raise ValueError("y_true and y_pred must be non-empty 1-D sequences")
    if n_bootstrap < 1:
        raise ValueError(f"n_bootstrap must be at least 1; got {n_bootstrap}")
    if not 0 < confidence < 1:
        raise ValueError(f"confidence must be in (0, 1); got {confidence}")

    print(f"🔄 Calculating {confidence*100}% confidence intervals...")

    n_samples = len(y_true)
    # The np.random module exposes the same choice() API as a RandomState, so
    # the unseeded path keeps drawing from the global random state
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Per-sample correctness is loop-invariant: compute it once, resample it
    is_correct = y_true == y_pred
    bootstrap_accuracies = np.empty(n_bootstrap)
    for b in range(n_bootstrap):
        # Sample with replacement
        indices = rng.choice(n_samples, size=n_samples, replace=True)
        bootstrap_accuracies[b] = is_correct[indices].mean()

    # Calculate confidence intervals
    alpha = 1 - confidence
    lower = np.percentile(bootstrap_accuracies, (alpha/2) * 100)
    upper = np.percentile(bootstrap_accuracies, (1 - alpha/2) * 100)
    mean_acc = np.mean(bootstrap_accuracies)

    print(f"   Mean Accuracy: {mean_acc:.4f}")
    print(f"   {confidence*100}% CI: [{lower:.4f}, {upper:.4f}]")
    print(f"   CI Width: {upper - lower:.4f}")

    if plot:
        # Plot bootstrap distribution
        plt.figure(figsize=(10, 6))
        plt.hist(bootstrap_accuracies, bins=30, alpha=0.7, color='lightgreen', edgecolor='black')
        plt.axvline(mean_acc, color='blue', linestyle='-', linewidth=2, label=f'Mean: {mean_acc:.3f}')
        plt.axvline(lower, color='red', linestyle='--', linewidth=2, label=f'Lower CI: {lower:.3f}')
        plt.axvline(upper, color='red', linestyle='--', linewidth=2, label=f'Upper CI: {upper:.3f}')
        plt.xlabel('Accuracy')
        plt.ylabel('Frequency')
        plt.title('Bootstrap Distribution of Accuracy', fontweight='bold')
        plt.legend()
        plt.grid(alpha=0.3)
        plt.show()

    return mean_acc, lower, upper

# Bootstrap a confidence interval for accuracy on the test predictions,
# using the function defaults (1000 resamples, 95% confidence)
mean_accuracy, ci_lower, ci_upper = bootstrap_confidence_intervals(y_test, y_pred)

## 🎯 Exercise: Advanced Evaluation Challenge

Complete these advanced evaluation tasks:

1. **Medical AI Metrics**: Implement sensitivity, specificity, PPV, NPV for each class
2. **Cost-Sensitive Evaluation**: Define misclassification costs (e.g., missing cancer = high cost)
3. **Threshold Optimization**: Find optimal decision thresholds for each class
4. **Calibration Analysis**: Evaluate probability calibration using reliability diagrams

### Expected Outcomes
Your evaluation should demonstrate:
- **Statistical significance**: p-value < 0.05
- **Tight confidence intervals**: CI width < 0.1 (note: the validation cell below only enforces the looser bound of CI width < 0.15)
- **Balanced performance**: No class with <70% sensitivity
- **Clinical relevance**: High NPV for cancer detection

### Advanced Task
Create a clinical decision support visualization showing prediction confidence!

In [None]:
# 🎯 VALIDATION: Advanced evaluation requirements
def validate_evaluation_quality(eval_results, roc_scores, p_value, ci_width):
    """Check the evaluation results against fixed quality thresholds.

    The thresholds are: balanced accuracy > 0.70, Cohen's kappa > 0.60,
    minimum per-class AUC > 0.75, p-value < 0.05 and CI width < 0.15. Checks
    run in that order and the first one that fails raises.

    Args:
        eval_results: Mapping with at least the keys ``'balanced_accuracy'``
            and ``'kappa'``.
        roc_scores: Mapping of AUC values. Entries with non-string keys are
            treated as per-class AUCs; string-keyed aggregates such as
            ``"micro"`` are ignored.
        p_value: p-value of the significance test.
        ci_width: Width of the accuracy confidence interval.

    Returns:
        bool: ``True`` when every check passes.

    Raises:
        AssertionError: If any threshold is not met. Raised explicitly so the
            checks still run when Python is started with ``-O``.
        ValueError: If ``roc_scores`` contains no per-class entries.
    """
    print("🏥 Validating clinical evaluation standards...")

    # Clinical performance requirements
    balanced_acc = eval_results['balanced_accuracy']
    kappa = eval_results['kappa']

    # ROC requirements: read the per-class AUCs from the argument itself
    # instead of relying on a global list of class names
    per_class_auc = [
        score for key, score in roc_scores.items() if not isinstance(key, str)
    ]
    if not per_class_auc:
        raise ValueError("roc_scores contains no per-class AUC entries")
    min_auc = min(per_class_auc)

    print(f"📊 Evaluation Quality Check:")
    print(f"   Balanced Accuracy: {balanced_acc:.4f}")
    print(f"   Cohen's Kappa: {kappa:.4f}")
    print(f"   Minimum AUC: {min_auc:.4f}")
    print(f"   Statistical p-value: {p_value:.6f}")
    print(f"   CI Width: {ci_width:.4f}")

    # Validation checks, evaluated in order; the first failure raises
    checks = [
        (balanced_acc > 0.70, f"Balanced accuracy too low for clinical use: {balanced_acc:.4f}"),
        (kappa > 0.60, f"Cohen's Kappa indicates poor agreement: {kappa:.4f}"),
        (min_auc > 0.75, f"Minimum AUC too low: {min_auc:.4f}"),
        (p_value < 0.05, f"Results not statistically significant: {p_value:.6f}"),
        (ci_width < 0.15, f"Confidence interval too wide: {ci_width:.4f}"),
    ]
    for passed, message in checks:
        if not passed:
            raise AssertionError(message)

    print("\n🎉 All clinical evaluation standards met!")
    print("🏥 Model ready for clinical validation studies!")
    print("🚀 Ready for next tutorial: Cross-validation & Advanced Techniques")

    return True

# Run validation: the CI width is derived from the bootstrap bounds computed
# earlier; the call raises AssertionError on the first threshold not met
ci_width = ci_upper - ci_lower
validate_evaluation_quality(eval_results, roc_scores, p_value, ci_width)

## 📚 Summary

You've mastered advanced model evaluation techniques:

1. **Comprehensive Metrics**: Accuracy, balanced accuracy, Cohen's kappa, MCC
2. **Medical AI Standards**: Sensitivity, specificity, PPV, NPV for each class
3. **ROC & PR Analysis**: Multi-class curve analysis with AUC scores
4. **Statistical Validation**: Permutation testing for significance
5. **Confidence Intervals**: Bootstrap sampling for uncertainty quantification

### Clinical Relevance
- **Balanced Accuracy**: Addresses class imbalance common in medical data
- **Cohen's Kappa**: Measures agreement beyond chance
- **Statistical Testing**: Quantifies how unlikely the observed performance would be if the model had learned nothing (i.e., under random chance)
- **Confidence Intervals**: Quantifies uncertainty for clinical decisions

### Best Practices
✅ **Always use balanced metrics** for imbalanced medical datasets  
✅ **Report confidence intervals** for all performance measures  
✅ **Test statistical significance** before clinical deployment  
✅ **Analyze per-class performance** especially for rare diseases  

🎓 **Outstanding!** You're ready to evaluate medical AI systems professionally!