# Mridangam Model Comprehensive Evaluation

This notebook provides comprehensive evaluation tools for the mridangam transcription model.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    precision_recall_fscore_support, roc_curve, auc
)
from collections import Counter
from tqdm import tqdm

# Global plot styling, applied once so every figure created by the cells
# below picks it up: a matplotlib style sheet plus seaborn's "husl" palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## Load Your Trained Model and Data

Make sure you have already trained your model using the main training script.

In [None]:
# Pick the compute device, then bring in the objects produced by training.
# The commented-out calls in the cells below use `model`, `test_loader`
# and `label_encoder`, so define them here.

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Placeholders: swap in your own loading code.
# model = torch.load('mridangam_model_complete.pth', map_location=device)
# test_loader = your_test_loader
# label_encoder = your_label_encoder

## 1. Basic Model Performance

In [None]:
def get_predictions_and_labels(model, test_loader, device):
    """Run the model over a data loader and collect its outputs.

    The model is switched to eval mode (and left in it) and inference runs
    with gradient tracking disabled.

    Args:
        model: Callable torch module; ``model(inputs)`` must return per-class
            scores with the class axis at dimension 1.
        test_loader: Iterable yielding ``(inputs, labels)`` tensor pairs.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        Tuple ``(predictions, true_labels, probabilities)`` of NumPy arrays:
        the arg-max class per sample, the ground-truth labels, and the
        softmax probabilities. All three are empty 1-D arrays when the
        loader yields no batches.
    """
    model.eval()
    prediction_batches = []
    label_batches = []
    probability_batches = []

    with torch.no_grad():
        for inputs, labels in tqdm(test_loader, desc="Getting predictions"):
            outputs = model(inputs.to(device))

            prediction_batches.append(outputs.argmax(dim=1).cpu().numpy())
            probability_batches.append(torch.softmax(outputs, dim=1).cpu().numpy())
            # Labels are only collected, never fed to the model, so they do
            # not need a round trip through the compute device.
            label_batches.append(labels.long().cpu().numpy())

    if not prediction_batches:
        # np.concatenate rejects an empty list; mirror np.array([]) instead.
        return np.array([]), np.array([]), np.array([])

    # One concatenation per output instead of growing Python lists
    # element by element.
    return (
        np.concatenate(prediction_batches),
        np.concatenate(label_batches),
        np.concatenate(probability_batches),
    )

# Get predictions
# predictions, true_labels, probabilities = get_predictions_and_labels(model, test_loader, device)

## 2. Comprehensive Metrics Display

In [None]:
def display_comprehensive_metrics(true_labels, predictions, probabilities, class_names):
    """Print accuracy, the classification report, per-class and confidence stats.

    Args:
        true_labels: 1-D sequence of ground-truth class indices.
        predictions: 1-D sequence of predicted class indices.
        probabilities: 2-D array of per-sample class probabilities; only the
            row-wise maximum (the prediction confidence) is used here.
        class_names: Sequence of class names; position ``i`` names class ``i``.

    Returns:
        None. All results are written to stdout.
    """
    true_labels = np.asarray(true_labels)
    predictions = np.asarray(predictions)
    probabilities = np.asarray(probabilities)
    # Explicit label list keeps the report aligned with class_names even when
    # some class never occurs in the test data; without it sklearn rejects a
    # target_names list that is longer than the set of labels it observed.
    label_ids = np.arange(len(class_names))

    def _mean_percent(values):
        # The mean of an empty selection is undefined; report that explicitly
        # instead of printing "nan" alongside a RuntimeWarning.
        return f"{np.mean(values)*100:.2f}%" if values.size else "n/a"

    # Basic metrics
    accuracy = accuracy_score(true_labels, predictions)

    print("=" * 60)
    print("COMPREHENSIVE EVALUATION RESULTS")
    print("=" * 60)
    print(f"Overall Accuracy: {accuracy*100:.2f}%")
    print(f"Total Test Samples: {len(true_labels)}")

    # Classification report
    print("\n" + "=" * 40)
    print("CLASSIFICATION REPORT")
    print("=" * 40)
    print(classification_report(true_labels, predictions, labels=label_ids,
                                target_names=class_names, zero_division=0))

    # Per-class analysis
    print("\n" + "=" * 40)
    print("PER-CLASS ANALYSIS")
    print("=" * 40)

    correct_mask = predictions == true_labels
    for i, class_name in enumerate(class_names):
        class_mask = true_labels == i
        class_count = int(np.sum(class_mask))
        if class_count == 0:
            continue  # no true samples of this class, so nothing to score
        # Fraction of this class's true samples that were predicted correctly.
        class_accuracy = np.mean(correct_mask[class_mask])
        predicted_as_this_class = int(np.sum(predictions == i))

        print(f"{class_name:>15}: Accuracy={class_accuracy*100:6.2f}% | "
              f"True samples={class_count:3d} | Predicted as this={predicted_as_this_class:3d}")

    # Confidence analysis: confidence is the probability of the winning class.
    max_probs = np.max(probabilities, axis=1)

    print("\n" + "=" * 40)
    print("PREDICTION CONFIDENCE ANALYSIS")
    print("=" * 40)
    print(f"Average confidence (all): {_mean_percent(max_probs)}")
    print(f"Average confidence (correct): {_mean_percent(max_probs[correct_mask])}")
    print(f"Average confidence (incorrect): {_mean_percent(max_probs[~correct_mask])}")

    # Low confidence predictions
    low_conf_threshold = 0.6
    low_conf_count = int(np.sum(max_probs < low_conf_threshold))
    print(f"Predictions with confidence < {low_conf_threshold*100:.0f}%: {low_conf_count} "
          f"({low_conf_count/len(true_labels)*100:.1f}%)")

# Display metrics
# display_comprehensive_metrics(true_labels, predictions, probabilities, label_encoder.classes_)

## 3. Visualization Dashboard

In [None]:
def _plot_class_probability_hist(class_probs, is_true_class, class_name):
    """Draw one class's predicted-probability histograms on the current axes."""
    # Empty groups are skipped: a density-normalised histogram of zero
    # samples is undefined and only produces warnings.
    if np.any(is_true_class):
        plt.hist(class_probs[is_true_class], bins=20, alpha=0.7,
                 label=f'True {class_name}', color='green', density=True)
    if not np.all(is_true_class):
        plt.hist(class_probs[~is_true_class], bins=20, alpha=0.7,
                 label='Other classes', color='red', density=True)

    plt.title(f'{class_name} Probability Distribution', fontsize=10, fontweight='bold')
    plt.xlabel('Predicted Probability')
    plt.ylabel('Density')
    plt.legend()
    plt.grid(True, alpha=0.3)


def create_evaluation_dashboard(true_labels, predictions, probabilities, class_names,
                                save_path='model_evaluation_dashboard.png'):
    """Render a 4x4 grid of evaluation plots, save it as a PNG and show it.

    Panels 1-8 hold the confusion matrices, per-class metrics, class
    distribution, confidence plots and top misclassifications. Panels 9-16
    hold the predicted-probability histograms of the first eight classes,
    one panel per class.

    Args:
        true_labels: 1-D sequence of ground-truth class indices.
        predictions: 1-D sequence of predicted class indices.
        probabilities: 2-D array of per-sample class probabilities with one
            column per entry of ``class_names``.
        class_names: Sequence of class names; position ``i`` names class ``i``.
        save_path: Where the figure is written (default keeps the original
            file name).

    Returns:
        None.
    """
    true_labels = np.asarray(true_labels)
    predictions = np.asarray(predictions)
    probabilities = np.asarray(probabilities)
    class_names = list(class_names)
    # Explicit label list so every matrix/metric has one entry per class name
    # even when a class is missing from both labels and predictions.
    label_ids = np.arange(len(class_names))

    plt.figure(figsize=(20, 16))

    # 1. Confusion Matrix
    plt.subplot(4, 4, 1)
    cm = confusion_matrix(true_labels, predictions, labels=label_ids)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.title('Confusion Matrix', fontsize=12, fontweight='bold')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.xticks(rotation=45)

    # 2. Normalized Confusion Matrix (row-normalised; rows without any true
    # samples stay at 0 instead of becoming NaN through a division by zero).
    plt.subplot(4, 4, 2)
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_norm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float),
                        where=row_totals != 0)
    sns.heatmap(cm_norm, annot=True, fmt='.2f', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.title('Normalized Confusion Matrix', fontsize=12, fontweight='bold')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.xticks(rotation=45)

    # 3. Per-class metrics
    plt.subplot(4, 4, 3)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predictions, labels=label_ids, zero_division=0)

    x_pos = np.arange(len(class_names))
    width = 0.25

    plt.bar(x_pos - width, precision, width, label='Precision', alpha=0.8)
    plt.bar(x_pos, recall, width, label='Recall', alpha=0.8)
    plt.bar(x_pos + width, f1, width, label='F1-Score', alpha=0.8)

    plt.title('Per-Class Metrics', fontsize=12, fontweight='bold')
    plt.xlabel('Classes')
    plt.ylabel('Score')
    plt.xticks(x_pos, class_names, rotation=45)
    plt.legend()
    plt.ylim(0, 1)

    # 4. Class distribution
    plt.subplot(4, 4, 4)
    unique_labels, label_counts = np.unique(true_labels, return_counts=True)
    colors = plt.cm.Set3(np.linspace(0, 1, len(unique_labels)))
    plt.pie(label_counts, labels=[class_names[i] for i in unique_labels],
            autopct='%1.1f%%', colors=colors, startangle=90)
    plt.title('Test Set Class Distribution', fontsize=12, fontweight='bold')

    # 5. Prediction confidence distribution
    plt.subplot(4, 4, 5)
    max_probs = np.max(probabilities, axis=1)
    correct_mask = predictions == true_labels

    # Only draw non-empty groups (e.g. a perfect model has no incorrect ones).
    if np.any(correct_mask):
        plt.hist(max_probs[correct_mask], bins=30, alpha=0.7,
                 label='Correct', color='green', density=True)
    if not np.all(correct_mask):
        plt.hist(max_probs[~correct_mask], bins=30, alpha=0.7,
                 label='Incorrect', color='red', density=True)
    plt.title('Prediction Confidence Distribution', fontsize=12, fontweight='bold')
    plt.xlabel('Max Probability')
    plt.ylabel('Density')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # 6. Accuracy vs confidence threshold
    plt.subplot(4, 4, 6)
    thresholds = np.arange(0.1, 1.0, 0.05)
    accuracies = []
    for threshold in thresholds:
        high_conf_mask = max_probs >= threshold
        # NaN leaves a gap in the curve where no sample reaches the threshold;
        # plotting 0 there would wrongly suggest the model is always wrong.
        accuracies.append(np.mean(correct_mask[high_conf_mask])
                          if np.any(high_conf_mask) else np.nan)

    plt.plot(thresholds, accuracies, 'b-o', linewidth=2, markersize=4, label='Accuracy')
    plt.title('Accuracy vs Confidence Threshold', fontsize=12, fontweight='bold')
    plt.xlabel('Confidence Threshold')
    plt.ylabel('Accuracy')
    plt.grid(True, alpha=0.3)
    plt.legend()

    # 7. Top misclassifications
    plt.subplot(4, 4, 7)
    misclass_counts = Counter(
        f"{class_names[true_label]}→{class_names[pred_label]}"
        for true_label, pred_label in zip(true_labels, predictions)
        if true_label != pred_label
    )

    plt.title('Top Misclassifications', fontsize=12, fontweight='bold')
    if misclass_counts:
        pairs, counts = zip(*misclass_counts.most_common(8))
        y_pos = np.arange(len(pairs))
        plt.barh(y_pos, counts, color='salmon')
        plt.xlabel('Count')
        plt.yticks(y_pos, pairs)
        plt.gca().invert_yaxis()  # most frequent confusion at the top

    # 8. Prediction confidence by class
    plt.subplot(4, 4, 8)
    conf_by_class = []
    class_labels = []

    for i, class_name in enumerate(class_names):
        class_mask = predictions == i
        if np.any(class_mask):
            conf_by_class.append(max_probs[class_mask])
            class_labels.append(class_name)

    if conf_by_class:
        plt.boxplot(conf_by_class)
        # Boxes sit at positions 1..N by default. Labelling the ticks directly
        # avoids boxplot's `labels` keyword, which newer matplotlib deprecates.
        plt.xticks(np.arange(1, len(class_labels) + 1), class_labels, rotation=45)
        plt.title('Prediction Confidence by Class', fontsize=12, fontweight='bold')
        plt.xlabel('Predicted Class')
        plt.ylabel('Confidence')
        plt.grid(True, alpha=0.3)

    # 9-16. Probability distributions for the first eight classes, one panel
    # each, so classes 4-7 land in the last row rather than on top of the
    # panels already used by classes 0-3.
    for i in range(min(8, len(class_names))):
        plt.subplot(4, 4, 9 + i)
        _plot_class_probability_hist(probabilities[:, i], true_labels == i, class_names[i])

    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()

# Create dashboard
# create_evaluation_dashboard(true_labels, predictions, probabilities, label_encoder.classes_)

## 4. Error Analysis

In [None]:
def analyze_errors_detailed(true_labels, predictions, probabilities, class_names, top_n=15):
    """Print and return the misclassified samples, most confident first.

    Args:
        true_labels: 1-D sequence of ground-truth class indices.
        predictions: 1-D sequence of predicted class indices.
        probabilities: 2-D array; ``probabilities[k][c]`` is the probability
            assigned to class ``c`` for sample ``k``.
        class_names: Sequence of class names; position ``i`` names class ``i``.
        top_n: How many of the most confident errors are printed.

    Returns:
        A list with one dict per misclassified sample (keys ``index``,
        ``true_label``, ``predicted_label``, ``true_class``,
        ``predicted_class``, ``confidence``, ``true_class_prob``), sorted by
        descending ``confidence``. The list is empty when there are no
        errors, so callers always receive the same type.
    """
    # Accept plain Python sequences as well as arrays.
    true_labels = np.asarray(true_labels)
    predictions = np.asarray(predictions)
    probabilities = np.asarray(probabilities)

    error_indices = np.flatnonzero(predictions != true_labels)

    if error_indices.size == 0:
        print("No errors found! Perfect model!")
        return []

    error_details = [
        {
            'index': idx,
            'true_label': true_labels[idx],
            'predicted_label': predictions[idx],
            'true_class': class_names[true_labels[idx]],
            'predicted_class': class_names[predictions[idx]],
            # Probability given to the (wrong) predicted class ...
            'confidence': probabilities[idx][predictions[idx]],
            # ... versus the probability given to the correct class.
            'true_class_prob': probabilities[idx][true_labels[idx]],
        }
        for idx in error_indices
    ]

    # Most confident errors first; the sort is stable, so ties keep
    # their sample order.
    error_details.sort(key=lambda item: item['confidence'], reverse=True)

    print("\nERROR ANALYSIS")
    print(f"Found {len(error_indices)} errors out of {len(true_labels)} samples")
    print(f"Error rate: {len(error_indices)/len(true_labels)*100:.2f}%")

    print(f"\nTop {min(top_n, len(error_details))} most confident errors:")
    print("-" * 80)

    for rank, error in enumerate(error_details[:top_n], start=1):
        print(f"Error {rank} (Index {error['index']}):")
        print(f"  True: {error['true_class']} (prob: {error['true_class_prob']:.3f})")
        print(f"  Predicted: {error['predicted_class']} (confidence: {error['confidence']:.3f})")
        print(f"  Confidence difference: {error['confidence'] - error['true_class_prob']:.3f}")
        print()

    return error_details

# Analyze errors
# error_analysis = analyze_errors_detailed(true_labels, predictions, probabilities, label_encoder.classes_)

## 5. Model Performance Summary

In [None]:
def generate_performance_summary(true_labels, predictions, probabilities, class_names):
    """Print and return overall and per-class performance tables.

    Args:
        true_labels: 1-D sequence of ground-truth class indices.
        predictions: 1-D sequence of predicted class indices.
        probabilities: 2-D array of per-sample class probabilities; only the
            row-wise maximum is used (for the average confidence).
        class_names: Sequence of class names; position ``i`` names class ``i``.

    Returns:
        Tuple ``(summary_df, class_df)``: a Metric/Value table of overall
        figures and a table with one row of precision, recall, F1 and
        support per class. All values are pre-formatted strings.
    """
    accuracy = accuracy_score(true_labels, predictions)
    # zero_division=0 scores classes without predictions/samples as 0 without
    # emitting a warning for each of them.
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predictions, average='weighted', zero_division=0)

    avg_confidence = np.mean(np.max(probabilities, axis=1))

    # Create summary DataFrame
    summary_data = {
        'Metric': [
            'Overall Accuracy',
            'Weighted Precision',
            'Weighted Recall',
            'Weighted F1-Score',
            'Average Confidence',
            'Total Test Samples',
            'Number of Classes',
            'Error Rate',
        ],
        'Value': [
            f"{accuracy*100:.2f}%",
            f"{precision:.3f}",
            f"{recall:.3f}",
            f"{f1:.3f}",
            f"{avg_confidence*100:.2f}%",
            f"{len(true_labels)}",
            f"{len(class_names)}",
            f"{(1-accuracy)*100:.2f}%",
        ],
    }

    summary_df = pd.DataFrame(summary_data)

    print("\n" + "=" * 50)
    print("MODEL PERFORMANCE SUMMARY")
    print("=" * 50)
    print(summary_df.to_string(index=False))

    # Per-class summary. Passing the full label list keeps the report in step
    # with class_names even if a class is absent from the test data; without
    # it sklearn rejects the longer target_names list.
    report_keys = [str(name) for name in class_names]
    report = classification_report(
        true_labels, predictions,
        labels=np.arange(len(class_names)),
        target_names=report_keys,
        output_dict=True,
        zero_division=0,
    )

    class_summary = []
    for class_name, key in zip(class_names, report_keys):
        row = report.get(key)
        if row is None:
            continue
        class_summary.append({
            'Class': class_name,
            'Precision': f"{row['precision']:.3f}",
            'Recall': f"{row['recall']:.3f}",
            'F1-Score': f"{row['f1-score']:.3f}",
            'Support': f"{int(row['support'])}",
        })

    class_df = pd.DataFrame(class_summary)

    print("\n" + "=" * 50)
    print("PER-CLASS PERFORMANCE SUMMARY")
    print("=" * 50)
    print(class_df.to_string(index=False))

    return summary_df, class_df

# Generate summary
# overall_summary, class_summary = generate_performance_summary(true_labels, predictions, probabilities, label_encoder.classes_)

## 6. Save Results and Plots

In [None]:
def save_evaluation_results(true_labels, predictions, probabilities, class_names, model_name="mridangam_model"):
    """Write the classification report, confusion matrix and per-sample results to CSV.

    Three files are created in the current working directory, each prefixed
    with ``model_name``: ``_classification_report.csv``,
    ``_confusion_matrix.csv`` and ``_detailed_results.csv``.

    Args:
        true_labels: 1-D array of ground-truth class indices.
        predictions: 1-D array of predicted class indices.
        probabilities: 2-D array of per-sample class probabilities with one
            column per entry of ``class_names``.
        class_names: Sequence of class names; position ``i`` names class ``i``.
        model_name: Prefix used for the output file names.

    Returns:
        None.
    """
    import os  # local import: only needed for the existence check below

    # Explicit label list: without it the report/matrix only cover the classes
    # that actually occur, and pairing them with the full class_names list
    # fails whenever a class is absent from the test data.
    label_ids = np.arange(len(class_names))

    # Save detailed classification report
    report = classification_report(true_labels, predictions, labels=label_ids,
                                   target_names=class_names, output_dict=True,
                                   zero_division=0)
    pd.DataFrame(report).T.to_csv(f'{model_name}_classification_report.csv')

    # Save confusion matrix (rows = actual class, columns = predicted class)
    cm = confusion_matrix(true_labels, predictions, labels=label_ids)
    pd.DataFrame(cm, index=class_names, columns=class_names).to_csv(
        f'{model_name}_confusion_matrix.csv')

    # Save predictions and probabilities
    results_df = pd.DataFrame({
        'true_label': true_labels,
        'predicted_label': predictions,
        'true_class': [class_names[i] for i in true_labels],
        'predicted_class': [class_names[i] for i in predictions],
        'max_probability': np.max(probabilities, axis=1),
        'correct': true_labels == predictions,
    })

    # Add individual class probabilities
    for i, class_name in enumerate(class_names):
        results_df[f'prob_{class_name}'] = probabilities[:, i]

    results_df.to_csv(f'{model_name}_detailed_results.csv', index=False)

    print("\nEvaluation results saved:")
    print(f"- {model_name}_classification_report.csv")
    print(f"- {model_name}_confusion_matrix.csv")
    print(f"- {model_name}_detailed_results.csv")
    # This function does not write the dashboard image itself, so only list
    # it when the file is actually present.
    if os.path.isfile('model_evaluation_dashboard.png'):
        print("- model_evaluation_dashboard.png")

# Save results
# save_evaluation_results(true_labels, predictions, probabilities, label_encoder.classes_)

## 7. Complete Evaluation Pipeline

Run this cell to execute the complete evaluation pipeline:

In [None]:
def run_complete_evaluation(model, test_loader, label_encoder, device):
    """Execute every evaluation step in order and bundle the results.

    Steps: collect predictions, print metrics, draw the dashboard, analyse
    errors, build the summary tables and save the result files.

    Args:
        model: Trained model handed to ``get_predictions_and_labels``.
        test_loader: Test data loader handed to ``get_predictions_and_labels``.
        label_encoder: Object whose ``classes_`` attribute supplies the
            class names.
        device: Device handed to ``get_predictions_and_labels``.

    Returns:
        Dict with keys ``predictions``, ``true_labels``, ``probabilities``,
        ``overall_summary``, ``class_summary`` and ``error_analysis``.
    """
    print("Starting comprehensive model evaluation...")

    # Inference first, then the class names every later step needs.
    preds, targets, probs = get_predictions_and_labels(model, test_loader, device)
    names = label_encoder.classes_
    shared_args = (targets, preds, probs, names)

    # Console metrics, then the visual dashboard.
    display_comprehensive_metrics(*shared_args)
    create_evaluation_dashboard(*shared_args)

    # Error drill-down and summary tables.
    errors = analyze_errors_detailed(*shared_args)
    summary_table, per_class_table = generate_performance_summary(*shared_args)

    # Persist everything to disk.
    save_evaluation_results(*shared_args)

    print("\nEvaluation complete! Check the generated files and plots.")

    return dict(
        predictions=preds,
        true_labels=targets,
        probabilities=probs,
        overall_summary=summary_table,
        class_summary=per_class_table,
        error_analysis=errors,
    )

# Run complete evaluation
# evaluation_results = run_complete_evaluation(model, test_loader, label_encoder, device)

## Usage Instructions

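1. **Load your trained model**: Load the trained model into `model` and the fitted label encoder into `label_encoder`
2. **Prepare test data**: Make sure `test_loader` yields `(inputs, labels)` batches from your test set
3. **Run evaluation**: Call `run_complete_evaluation(model, test_loader, label_encoder, device)`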
4. **Review results**: Check the generated plots and CSV files

### Files Generated:
- `model_evaluation_dashboard.png`: Comprehensive visualization dashboard
- `{model_name}_classification_report.csv`: Detailed classification metrics
- `{model_name}_confusion_matrix.csv`: Confusion matrix data
- `{model_name}_detailed_results.csv`: Per-sample predictions and probabilities

### Key Metrics to Watch:
- **Overall Accuracy**: How often the model is correct
- **Per-class F1-scores**: Balanced performance measure for each class
- **Confidence distributions**: How certain the model is about its predictions
- **Common misclassifications**: Which classes are often confused
- **Low-confidence predictions**: Samples the model is uncertain about