# Model Diagnostics

Comprehensive analysis of model performance, errors, and cross-validation results.


In [1]:
import sys
from pathlib import Path
import pickle
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, precision_recall_fscore_support

# Paths
DATA_DIR = Path("../data/interim")
RESULTS_DIR = Path("../results")

# Load metadata
with open(DATA_DIR / "metadata.pkl", "rb") as f:
    metadata = pickle.load(f)
idx_to_label = metadata["idx_to_label"]
label_to_idx = metadata["label_to_idx"]
categories = list(label_to_idx.keys())

# Load training results (saved from train.ipynb)
with open(RESULTS_DIR / "training_results.pkl", "rb") as f:
    training_results = pickle.load(f)

history = training_results["history"]
cv_results = training_results["cv_results"]
results = training_results["test_results"]
n_classes = training_results["n_classes"]

labels = [idx_to_label[i] for i in range(n_classes)]

print(f"Categories: {categories}")
print(f"Loaded results from {RESULTS_DIR / 'training_results.pkl'}")


FileNotFoundError: [Errno 2] No such file or directory: '../results/training_results.pkl'

In [None]:
# Load test data for spectrogram visualization
X_test = np.load(DATA_DIR / "X_test.npy")
y_test = np.load(DATA_DIR / "y_test.npy")

print(f"Test data loaded: {X_test.shape}")
print(f"Training history: {len(history['train_loss'])} epochs")
print(f"CV results: {len(cv_results['fold_results'])} folds")
print(f"Test accuracy: {results['accuracy']:.1%}")


## 1. Cross-Validation Analysis


In [None]:
# CV results loaded from training_results.pkl
print(f"CV Accuracy: {cv_results['mean_acc']:.1%} ¬± {cv_results['std_acc']:.1%}")
fold_accs = [f"{r['best_val_acc']:.1%}" for r in cv_results['fold_results']]
print(f"Folds: {fold_accs}")


In [None]:
# Cross-validation bar plot with error bars
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left: Individual fold accuracies
folds = [r["fold"] for r in cv_results["fold_results"]]
accs = cv_results["accuracies"]
colors = ['#3498db' if a < cv_results['mean_acc'] else '#2ecc71' for a in accs]

bars = axes[0].bar(folds, accs, color=colors, edgecolor='white', linewidth=2)
axes[0].axhline(cv_results["mean_acc"], color='#e74c3c', linestyle='--', linewidth=2,
                label=f'Mean: {cv_results["mean_acc"]:.3f}')
axes[0].fill_between([0.5, 5.5], 
                      cv_results["mean_acc"] - cv_results["std_acc"],
                      cv_results["mean_acc"] + cv_results["std_acc"],
                      color='#e74c3c', alpha=0.2, label=f'¬±1 Std: {cv_results["std_acc"]:.3f}')
axes[0].set_xlabel("Fold", fontsize=12)
axes[0].set_ylabel("Validation Accuracy", fontsize=12)
axes[0].set_title("Per-Fold Accuracy", fontsize=14)
axes[0].set_ylim(0, 1)
axes[0].set_xticks(folds)
axes[0].legend(loc='lower right')

# Add value labels on bars
for bar, acc in zip(bars, accs):
    axes[0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02,
                 f'{acc:.1%}', ha='center', va='bottom', fontsize=10, fontweight='bold')

# Right: Summary with error bar
axes[1].bar(['5-Fold CV'], [cv_results["mean_acc"]], 
            yerr=[cv_results["std_acc"]], capsize=10, color='#3498db',
            edgecolor='white', linewidth=2, error_kw={'linewidth': 2})
axes[1].set_ylabel("Accuracy", fontsize=12)
axes[1].set_title("Cross-Validation Summary", fontsize=14)
axes[1].set_ylim(0, 1)
axes[1].text(0, cv_results["mean_acc"] + cv_results["std_acc"] + 0.05,
             f'{cv_results["mean_acc"]:.1%} ¬± {cv_results["std_acc"]:.1%}',
             ha='center', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()


### Training Curves per Fold


In [None]:
# Training curves for each fold
n_folds = len(cv_results["fold_results"])
fig, axes = plt.subplots(n_folds, 2, figsize=(14, 3.5 * n_folds))

for i, fold_result in enumerate(cv_results["fold_results"]):
    history = fold_result["history"]
    fold_num = fold_result["fold"]
    best_acc = fold_result["best_val_acc"]
    
    # Loss curves
    axes[i, 0].plot(history["train_loss"], label="Train", linewidth=2)
    axes[i, 0].plot(history["val_loss"], label="Val", linewidth=2)
    axes[i, 0].set_xlabel("Epoch")
    axes[i, 0].set_ylabel("Loss")
    axes[i, 0].set_title(f"Fold {fold_num} - Loss")
    axes[i, 0].legend()
    axes[i, 0].grid(True, alpha=0.3)
    
    # Accuracy curves
    axes[i, 1].plot(history["train_acc"], label="Train", linewidth=2)
    axes[i, 1].plot(history["val_acc"], label="Val", linewidth=2)
    best_epoch = np.argmax(history["val_acc"])
    axes[i, 1].axvline(best_epoch, color='red', linestyle='--', alpha=0.5, label=f'Best epoch: {best_epoch+1}')
    axes[i, 1].set_xlabel("Epoch")
    axes[i, 1].set_ylabel("Accuracy")
    axes[i, 1].set_title(f"Fold {fold_num} - Accuracy (Best: {best_acc:.1%})")
    axes[i, 1].legend()
    axes[i, 1].grid(True, alpha=0.3)
    axes[i, 1].set_ylim(0, 1)

plt.tight_layout()
plt.show()


### Single Training Run Curves


In [None]:
# Training curves from single training run
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Loss
ax1.plot(history["train_loss"], label="Train", linewidth=2)
ax1.plot(history["val_loss"], label="Val", linewidth=2)
ax1.set_xlabel("Epoch")
ax1.set_ylabel("Loss")
ax1.set_title("Training Loss")
ax1.legend()
ax1.grid(True, alpha=0.3)

# Accuracy
ax2.plot(history["train_acc"], label="Train", linewidth=2)
ax2.plot(history["val_acc"], label="Val", linewidth=2)
best_epoch = np.argmax(history["val_acc"])
ax2.axvline(best_epoch, color='red', linestyle='--', alpha=0.5, label=f'Best: epoch {best_epoch+1}')
ax2.set_xlabel("Epoch")
ax2.set_ylabel("Accuracy")
ax2.set_title(f"Training Accuracy (Best Val: {max(history['val_acc']):.1%})")
ax2.legend()
ax2.grid(True, alpha=0.3)
ax2.set_ylim(0, 1)

plt.tight_layout()
plt.show()


## 2. Test Set Evaluation


In [None]:
# Test results loaded from training_results.pkl
print(f"Test Accuracy: {results['accuracy']:.1%}")
print(f"Test Loss: {results['loss']:.4f}")


### Confusion Matrix


In [None]:
# Confusion matrix
cm = confusion_matrix(results["labels"], results["predictions"])

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Raw counts
disp1 = ConfusionMatrixDisplay(cm, display_labels=labels)
disp1.plot(ax=axes[0], cmap="Blues", xticks_rotation=45)
axes[0].set_title(f"Confusion Matrix - Counts\n(Test Accuracy: {results['accuracy']:.1%})")

# Normalized (row-wise = recall per class)
cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
cm_norm = np.nan_to_num(cm_norm)  # Handle division by zero
disp2 = ConfusionMatrixDisplay(cm_norm, display_labels=labels)
disp2.plot(ax=axes[1], cmap="Blues", xticks_rotation=45, values_format='.2f')
axes[1].set_title("Confusion Matrix - Normalized (Recall)")

plt.tight_layout()
plt.show()


### Per-Class Metrics


In [None]:
# Classification report
y_true_labels = [idx_to_label[i] for i in results["labels"]]
y_pred_labels = [idx_to_label[i] for i in results["predictions"]]

print("Classification Report:")
print(classification_report(y_true_labels, y_pred_labels, labels=labels, zero_division=0))


In [None]:
# Per-class metrics
precision, recall, f1, support = precision_recall_fscore_support(
    results["labels"], results["predictions"], labels=range(n_classes), zero_division=0
)

x = np.arange(n_classes)
width = 0.25

fig, ax = plt.subplots(figsize=(14, 6))
bars1 = ax.bar(x - width, precision, width, label='Precision', color='#3498db')
bars2 = ax.bar(x, recall, width, label='Recall', color='#2ecc71')
bars3 = ax.bar(x + width, f1, width, label='F1-Score', color='#e74c3c')

ax.set_xlabel('Category', fontsize=12)
ax.set_ylabel('Score', fontsize=12)
ax.set_title('Per-Class Performance Metrics', fontsize=14)
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=45, ha='right')
ax.legend()
ax.set_ylim(0, 1.1)
ax.axhline(0.5, color='gray', linestyle='--', alpha=0.5)
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()


## 3. Error Analysis

Examine misclassified samples to understand model failures.


In [None]:
# Find misclassified samples
misclassified_idx = np.where(results["predictions"] != results["labels"])[0]
correct_idx = np.where(results["predictions"] == results["labels"])[0]

print(f"Correct: {len(correct_idx)}/{len(results['labels'])}")
print(f"Misclassified: {len(misclassified_idx)}/{len(results['labels'])}")

if len(misclassified_idx) > 0:
    print("\nMisclassified samples:")
    for idx in misclassified_idx:
        true_label = idx_to_label[results["labels"][idx]]
        pred_label = idx_to_label[results["predictions"][idx]]
        print(f"  Sample {idx}: True={true_label}, Predicted={pred_label}")


In [None]:
# Visualize misclassified spectrograms
if len(misclassified_idx) > 0:
    n_show = min(len(misclassified_idx), 8)
    fig, axes = plt.subplots(2, 4, figsize=(16, 8))
    axes = axes.flatten()
    
    for i in range(n_show):
        idx = misclassified_idx[i]
        true_label = idx_to_label[results["labels"][idx]]
        pred_label = idx_to_label[results["predictions"][idx]]
        
        axes[i].imshow(X_test[idx], aspect='auto', origin='lower', cmap='magma')
        axes[i].set_title(f"True: {true_label}\nPred: {pred_label}", fontsize=10)
        axes[i].set_xlabel("Time")
        axes[i].set_ylabel("Mel")
    
    # Hide unused subplots
    for i in range(n_show, 8):
        axes[i].axis('off')
    
    plt.suptitle("Misclassified Samples", fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()
else:
    print("No misclassified samples!")


### Most Confused Class Pairs


In [None]:
# Find most confused pairs (off-diagonal elements of confusion matrix)
confusion_pairs = []
for i in range(n_classes):
    for j in range(n_classes):
        if i != j and cm[i, j] > 0:
            confusion_pairs.append({
                'true': labels[i],
                'predicted': labels[j],
                'count': cm[i, j]
            })

# Sort by count
confusion_pairs = sorted(confusion_pairs, key=lambda x: -x['count'])

print("Most Confused Pairs (True ‚Üí Predicted):")
print("-" * 40)
for pair in confusion_pairs[:10]:
    print(f"  {pair['true']:12} ‚Üí {pair['predicted']:12}: {pair['count']} times")

if not confusion_pairs:
    print("  No confusion pairs (perfect classification!)")


## 4. Summary & Recommendations


In [None]:
# Summary
print("=" * 50)
print("MODEL DIAGNOSTICS SUMMARY")
print("=" * 50)
print(f"\nüìä Cross-Validation (5-fold):")
print(f"   Accuracy: {cv_results['mean_acc']:.1%} ¬± {cv_results['std_acc']:.1%}")
print(f"   Best fold: {max(cv_results['accuracies']):.1%}")
print(f"   Worst fold: {min(cv_results['accuracies']):.1%}")

print(f"\nüéØ Test Set Performance:")
print(f"   Accuracy: {results['accuracy']:.1%}")
print(f"   Correct: {len(correct_idx)}/{len(results['labels'])}")
print(f"   Errors: {len(misclassified_idx)}/{len(results['labels'])}")

if confusion_pairs:
    print(f"\n‚ö†Ô∏è  Most Confused Pair:")
    print(f"   {confusion_pairs[0]['true']} ‚Üí {confusion_pairs[0]['predicted']}")

# Find best and worst classes
best_class = labels[np.argmax(f1)]
worst_class = labels[np.argmin(f1)]
print(f"\n‚úÖ Best performing class: {best_class} (F1: {max(f1):.2f})")
print(f"‚ùå Worst performing class: {worst_class} (F1: {min(f1):.2f})")

print("\n" + "=" * 50)
