# **Model Evaluation**

**Course:** Master in Big Data, Data Science & AI — Master Thesis  
**Author:** Carlota Trigo La Blanca  

*This notebook will be used to evlauate the models trained, compare them, and select the best one.*


## Initialization

In [None]:
import utils
import os
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, 
    precision_recall_curve, average_precision_score, roc_curve,
    matthews_corrcoef, balanced_accuracy_score,
    precision_recall_fscore_support, f1_score
)
from sklearn.calibration import calibration_curve
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Theme & style
sns.set_theme(style="whitegrid")          # optional, pick your style

# Colors: use Viridis everywhere
sns.set_palette("viridis")                # discrete color cycle for lines/bars
plt.rcParams["image.cmap"] = "viridis"    # default colormap for imshow/matshow

# Font sizes (global)
plt.rcParams.update({
    "axes.titlesize": 10,       # axes titles (ax.set_title)
    "figure.titlesize": 10,     # figure titles (plt.suptitle)
    "axes.labelsize": 9,        # x/y axis labels
    "xtick.labelsize": 8,       # tick labels
    "ytick.labelsize": 8,
    "legend.fontsize": 8,
    "legend.title_fontsize": 9,
})

  _warn(("h5py is running against HDF5 {0} when it was built against {1}, "


In [2]:
# Configuration and Constants
from pathlib import Path
SEED = utils.SEED
DATA_DIR = utils.DATA_DIR
PREPARED_CSV = utils.PREPARED_CSV
IMAGE_PATH = utils.IMAGE_PATH

# Model paths - Updated to match actual model implementations
BASELINE_DIR = Path("./outputs/base_model")
SSL_DIR = Path("./outputs/ssl_finetuned")
ENSEMBLE_DIR = Path("./outputs/ensemble_models")
INDIVIDUAL_DIR = Path("./outputs/individual_models")

# Model configurations - Updated with correct paths and file names
MODELS_CONFIG = {
    'baseline': {
        'name': 'Baseline EfficientNetB1',
        'path': BASELINE_DIR / "simple_twohead_best_model.keras",
        'type': 'single',
        'color': '#1f77b4'
    },
    'ssl': {
        'name': 'SSL Fine-tuned',
        'path': SSL_DIR / "ssl_finetuned_best_model.keras",
        'type': 'single',
        'color': '#ff7f0e'
    },
    'ensemble_voting': {
        'name': 'Voting Ensemble',
        'path': INDIVIDUAL_DIR,
        'type': 'ensemble',
        'color': '#2ca02c'
    },
    'ensemble_weighted': {
        'name': 'Weighted Ensemble',
        'path': INDIVIDUAL_DIR,
        'type': 'ensemble', 
        'color': '#d62728'
    }
}

# Class mappings (verified from data analysis)
COARSE_CLASSES = ['benign', 'malignant', 'no_lesion']
FINE_CLASSES = ['nv', 'mel', 'bkl', 'bcc', 'scc_akiec', 'vasc', 'df', 'other', 'no_lesion']

# Clinical thresholds
MALIGNANT_RECALL_THRESHOLD = 0.95  # ≥95% recall for malignant
OOD_FPR_THRESHOLD = 0.05  # FPR@95%TPR

# Bootstrap parameters
N_BOOTSTRAP = 1000
CONFIDENCE_LEVEL = 0.95

# Use constants from utils
IMG_SIZE = utils.IMG_SIZE
BATCH_SIZE = utils.BATCH_SIZE
DX_CLASSES = utils.DX_CLASSES
LESION_TYPE_CLASSES = utils.LESION_TYPE_CLASSES
N_DX_CLASSES = utils.N_DX_CLASSES
N_LESION_TYPE_CLASSES = utils.N_LESION_TYPE_CLASSES

Evaluation functions

In [None]:
def calculate_general_metrics(labels, logits, class_names, task_name):
    """
    GENERAL EVALUATION METRICS for imbalanced datasets:
    - Macro F1: Primary summary metric (unweighted average of per-class F1)
    - Balanced accuracy: Average recall per class
    - MCC: Matthews Correlation Coefficient (considers full confusion matrix)
    - Per-class precision, recall, F1: Exhaustive analysis
    - Per-class AUPRC: Precision-Recall curves for each class
    - ECE: Expected Calibration Error
    """
    preds = np.argmax(logits, axis=1)
    probs = tf.nn.softmax(logits).numpy()
    
    # Primary metrics
    macro_f1 = f1_score(labels, preds, average='macro', zero_division=0)
    balanced_acc = balanced_accuracy_score(labels, preds)
    mcc = matthews_corrcoef(labels, preds)
    
    # Per-class metrics
    per_class_metrics = {}
    auprc_scores = []
    
    for i, class_name in enumerate(class_names):
        if np.any(labels == i):  # Check if class exists in labels
            class_labels = (labels == i).astype(int)
            class_preds = (preds == i).astype(int)
            
            if len(np.unique(class_labels)) > 1:  # Check if class has both positive and negative samples
                precision, recall, f1, _ = precision_recall_fscore_support(
                    class_labels, class_preds, average='binary', zero_division=0
                )
                auroc = roc_auc_score(class_labels, probs[:, i])
                auprc = average_precision_score(class_labels, probs[:, i])
                
                per_class_metrics[class_name] = {
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                    'auroc': auroc,
                    'auprc': auprc,
                    'support': np.sum(class_labels)
                }
                auprc_scores.append(auprc)
    
    # AUPRC macro (average across classes)
    auprc_macro = np.mean(auprc_scores) if auprc_scores else 0.0
    
    # Expected Calibration Error (ECE)
    ece = calculate_ece(labels, probs, class_names)
    
    return {
        'macro_f1': macro_f1,
        'balanced_accuracy': balanced_acc,
        'mcc': mcc,
        'auprc_macro': auprc_macro,
        'ece': ece,
        'per_class': per_class_metrics
    }

def calculate_ece(labels, probs, class_names, n_bins=10):
    """
    Classwise ECE: average over classes of bin-wise |conf - acc|.
    Safer binning that includes probability = 1.0 in the last bin.
    """
    ece_scores = []
    for i, _ in enumerate(class_names):
        if np.any(labels == i):
            class_labels = (labels == i).astype(int)
            class_probs = probs[:, i]
            if len(np.unique(class_labels)) > 1:
                ece = 0.0
                for b in range(n_bins):
                    lo = b / n_bins
                    hi = (b + 1) / n_bins
                    if b < n_bins - 1:
                        in_bin = (class_probs >= lo) & (class_probs < hi)
                    else:
                        in_bin = (class_probs >= lo) & (class_probs <= hi)  # include 1.0
                    if np.any(in_bin):
                        acc = class_labels[in_bin].mean()
                        conf = class_probs[in_bin].mean()
                        ece += np.abs(conf - acc) * (in_bin.mean())
                ece_scores.append(ece)
    return float(np.mean(ece_scores)) if ece_scores else 0.0

def calculate_hierarchical_metrics(predictions, coarse_classes, fine_classes):
    """
    HIERARCHICAL EVALUATION (two-head architecture):
    - Exact-match: Both heads correct simultaneously
    - Coarse-correct: Balanced accuracy for head 1 (coarse)
    - Fine-conditional: Macro-F1 of head 2, conditioned on head 1 being correct
    """
    coarse_labels = predictions['id_labels_h1']
    coarse_logits = predictions['id_logits_h1']
    fine_labels = predictions['id_labels_h2']
    fine_logits = predictions['id_logits_h2']
    
    coarse_preds = np.argmax(coarse_logits, axis=1)
    fine_preds = np.argmax(fine_logits, axis=1)
    
    # Exact-match: both heads correct
    exact_match = np.mean((coarse_preds == coarse_labels) & (fine_preds == fine_labels))
    
    # Coarse-correct: head1 balanced accuracy
    coarse_correct = balanced_accuracy_score(coarse_labels, coarse_preds)
    
    # Fine-conditional: head2 macro-F1 conditioned on head1 being correct
    coarse_correct_mask = (coarse_preds == coarse_labels)
    if np.sum(coarse_correct_mask) > 0:
        fine_conditional_labels = fine_labels[coarse_correct_mask]
        fine_conditional_preds = fine_preds[coarse_correct_mask]
        fine_conditional_f1 = f1_score(fine_conditional_labels, fine_conditional_preds, 
                                     average='macro', zero_division=0)
    else:
        fine_conditional_f1 = 0.0
    
    return {
        'exact_match': exact_match,
        'coarse_correct': coarse_correct,
        'fine_conditional_f1': fine_conditional_f1
    }

def calculate_ood_metrics(id_logits, ood_logits):
    """
    OUT-OF-DISTRIBUTION EVALUATION:
    - AUROC (OOD vs ID)
    - AUPRC (OOD as positive class)
    - FPR@95%TPR (TPR = OOD recall)
    - Detection Error = min_tau 0.5 * (FNR_OOD + FPR_ID)
    """
    # Softmax
    id_probs = tf.nn.softmax(id_logits, axis=1).numpy()
    ood_probs = tf.nn.softmax(ood_logits, axis=1).numpy()

    # OOD score: higher => more likely OOD (use 1 - MSP)
    id_scores = 1.0 - np.max(id_probs, axis=1)
    ood_scores = 1.0 - np.max(ood_probs, axis=1)

    # Labels: 0 = ID, 1 = OOD
    y = np.concatenate([np.zeros_like(id_scores), np.ones_like(ood_scores)])
    s = np.concatenate([id_scores, ood_scores])

    # AUROC / AUPRC
    auroc = roc_auc_score(y, s)
    auprc = average_precision_score(y, s)  # OOD is positive class

    # ROC for FPR@95%TPR and Detection Error
    fpr, tpr, thr = roc_curve(y, s)  # TPR refers to OOD (positive) recall
    # FPR at 95% TPR (first threshold achieving >= 0.95)
    idx = np.searchsorted(tpr, 0.95, side='left')
    fpr_at_95_tpr = float(fpr[idx]) if idx < len(fpr) else 1.0

    # Detection error (equal priors): 0.5 * (FNR_OOD + FPR_ID)
    fnr = 1.0 - tpr
    detection_error = float(np.min(0.5 * (fnr + fpr)))

    return {
        'auroc': auroc,
        'auprc': auprc,
        'fpr_at_95_tpr': fpr_at_95_tpr,
        'detection_error': detection_error
    }

def print_metrics(coarse_metrics, fine_metrics, hierarchical_metrics, ood_metrics):
    """
    Print detailed evaluation metrics in a structured format.
    """
    # Print detailed results for this model
    print(f"\nCOARSE CLASSIFICATION (Head 1):")
    print(f"  Macro F1: {coarse_metrics['macro_f1']:.4f}")
    print(f"  Balanced Accuracy: {coarse_metrics['balanced_accuracy']:.4f}")
    print(f"  MCC: {coarse_metrics['mcc']:.4f}")
    print(f"  AUPRC Macro: {coarse_metrics['auprc_macro']:.4f}")
    print(f"  ECE: {coarse_metrics['ece']:.4f}")

    print(f"\nFINE CLASSIFICATION (Head 2):")
    print(f"  Macro F1: {fine_metrics['macro_f1']:.4f}")
    print(f"  Balanced Accuracy: {fine_metrics['balanced_accuracy']:.4f}")
    print(f"  MCC: {fine_metrics['mcc']:.4f}")
    print(f"  AUPRC Macro: {fine_metrics['auprc_macro']:.4f}")
    print(f"  ECE: {fine_metrics['ece']:.4f}")

    print(f"\nHIERARCHICAL METRICS:")
    print(f"  Exact Match: {hierarchical_metrics['exact_match']:.4f}")
    print(f"  Coarse Correct: {hierarchical_metrics['coarse_correct']:.4f}")
    print(f"  Fine Conditional F1: {hierarchical_metrics['fine_conditional_f1']:.4f}")

    print(f"\nOOD DETECTION:")
    print(f"  AUROC: {ood_metrics['auroc']:.4f}")
    print(f"  AUPRC: {ood_metrics['auprc']:.4f}")
    print(f"  FPR@95%TPR: {ood_metrics['fpr_at_95_tpr']:.4f}")
    print(f"  Detection Error: {ood_metrics['detection_error']:.4f}")

    # Per-class detailed analysis
    print(f"\nPER-CLASS ANALYSIS - COARSE:")
    df = pd.DataFrame(coarse_metrics['per_class']).T
    df = df[['precision','recall','f1','auroc','auprc','support']]
    df['support'] = df['support'].astype(int)         # make support nice ints
    print(df.to_string(float_format=lambda x: f"{x:.3f}"))

    print(f"\nPER-CLASS ANALYSIS - FINE:")
    df = pd.DataFrame(fine_metrics['per_class']).T
    df = df[['precision','recall','f1','auroc','auprc','support']]
    df['support'] = df['support'].astype(int)         # make support nice ints
    print(df.to_string(float_format=lambda x: f"{x:.3f}"))
    
    return

# Load models and data

This code loads all the models and obtains the predictions from the test and test_ood datsets.

In [None]:
def load_models():
    """Load all available models."""
    models = {}
    ensemble_models = {}
    
    # Load individual models
    for model_key, config in MODELS_CONFIG.items():
        if config['type'] == 'single':
            print(f"Loading {config['name']}...")
            model = utils.load_individual_model(config['path'], 'efficientnet')
            if model is not None:
                models[model_key] = model
                print(f"✓ {config['name']} loaded")
            else:
                print(f"✗ Failed to load {config['name']}")
    
    # Load ensemble models
    ensemble_models = utils.load_ensemble_models(INDIVIDUAL_DIR)
    
    return models, ensemble_models

def get_predictions_all_models(models, ensemble_models, test_ds, ood_ds):
    """Get predictions for all models."""
    all_predictions = {}
    
    # Individual models
    for model_key, model in models.items():
        print(f"Evaluating {MODELS_CONFIG[model_key]['name']}...")
        try:
            id_labels_h1, id_logits_h1, id_labels_h2, id_logits_h2 = utils.get_predictions_and_labels(model, test_ds)
            ood_labels_h1, ood_logits_h1, ood_labels_h2, ood_logits_h2 = utils.get_predictions_and_labels(model, ood_ds)
            
            all_predictions[model_key] = {
                'id_labels_h1': id_labels_h1, 'id_logits_h1': id_logits_h1,
                'id_labels_h2': id_labels_h2, 'id_logits_h2': id_logits_h2,
                'ood_labels_h1': ood_labels_h1, 'ood_logits_h1': ood_logits_h1,
                'ood_labels_h2': ood_labels_h2, 'ood_logits_h2': ood_logits_h2
            }
            print(f"✓ {MODELS_CONFIG[model_key]['name']} evaluated")
        except Exception as e:
            print(f"✗ Failed to evaluate {MODELS_CONFIG[model_key]['name']}: {e}")
    
    # Ensemble models
    if len(ensemble_models) > 0:
        print("Evaluating ensemble models...")
        try:
            # Voting ensemble
            id_labels_h1, id_logits_h1, id_labels_h2, id_logits_h2 = utils.get_ensemble_predictions(ensemble_models, test_ds, 'voting')
            ood_labels_h1, ood_logits_h1, ood_labels_h2, ood_logits_h2 = utils.get_ensemble_predictions(ensemble_models, ood_ds, 'voting')
            
            all_predictions['ensemble_voting'] = {
                'id_labels_h1': id_labels_h1, 'id_logits_h1': id_logits_h1,
                'id_labels_h2': id_labels_h2, 'id_logits_h2': id_logits_h2,
                'ood_labels_h1': ood_labels_h1, 'ood_logits_h1': ood_logits_h1,
                'ood_labels_h2': ood_labels_h2, 'ood_logits_h2': ood_logits_h2
            }
            
            # Weighted ensemble
            id_labels_h1, id_logits_h1, id_labels_h2, id_logits_h2 = utils.get_ensemble_predictions(ensemble_models, test_ds, 'weighted')
            ood_labels_h1, ood_logits_h1, ood_labels_h2, ood_logits_h2 = utils.get_ensemble_predictions(ensemble_models, ood_ds, 'weighted')
            
            all_predictions['ensemble_weighted'] = {
                'id_labels_h1': id_labels_h1, 'id_logits_h1': id_logits_h1,
                'id_labels_h2': id_labels_h2, 'id_logits_h2': id_logits_h2,
                'ood_labels_h1': ood_labels_h1, 'ood_logits_h1': ood_logits_h1,
                'ood_labels_h2': ood_labels_h2, 'ood_logits_h2': ood_logits_h2
            }
            
            print("✓ Ensemble models evaluated")
        except Exception as e:
            print(f"✗ Failed to evaluate ensemble models: {e}")
    
    return all_predictions

def get_predictions_individual_ensemble_models(ensemble_models, test_ds, ood_ds):
    """Get predictions for individual models in the ensemble."""
    individual_predictions = {}
    
    if len(ensemble_models) > 0:
        print("Evaluating individual ensemble models...")
        
        for model_name, model in ensemble_models.items():
            print(f"Evaluating individual model: {model_name}")
            try:
                id_labels_h1, id_logits_h1, id_labels_h2, id_logits_h2 = utils.get_predictions_and_labels(model, test_ds)
                ood_labels_h1, ood_logits_h1, ood_labels_h2, ood_logits_h2 = utils.get_predictions_and_labels(model, ood_ds)
                
                individual_predictions[f'individual_{model_name}'] = {
                    'id_labels_h1': id_labels_h1, 'id_logits_h1': id_logits_h1,
                    'id_labels_h2': id_labels_h2, 'id_logits_h2': id_logits_h2,
                    'ood_labels_h1': ood_labels_h1, 'ood_logits_h1': ood_logits_h1,
                    'ood_labels_h2': ood_labels_h2, 'ood_logits_h2': ood_logits_h2
                }
                print(f"✓ Individual model {model_name} evaluated")
            except Exception as e:
                print(f"✗ Failed to evaluate individual model {model_name}: {e}")
    
    return individual_predictions

# Load data
df = pd.read_csv(PREPARED_CSV)
test_df = df[df.split == "test"].copy()
ood_df = df[df.split == "test_ood"].copy()

print(f"Test samples: {len(test_df)}")
print(f"OOD samples: {len(ood_df)}")

# Build datasets
test_ds = utils.build_dataset(test_df, is_training=False)
ood_ds = utils.build_dataset(ood_df, is_training=False)

models, ensemble_models = load_models()

if len(models) == 0 and len(ensemble_models) == 0:
    print("No models available for evaluation!")
else:
    print(f"✓ Loaded {len(models)} individual models")
    print(f"✓ Loaded {len(ensemble_models)} ensemble component models")


Test samples: 10359
OOD samples: 2669
Loading Baseline EfficientNetB1...
✓ Loaded complete model from outputs\base_model\simple_twohead_best_model.keras
✓ Baseline EfficientNetB1 loaded
Loading SSL Fine-tuned...
✗ Model file not found: outputs\ssl_finetuned\ssl_finetuned_best_model.keras
✗ Failed to load SSL Fine-tuned
✓ Loaded 1 individual models
✓ Loaded 0 ensemble component models


In [28]:
test_df['lesion_type'].value_counts(), test_df['diagnosis_grouped'].value_counts()

(lesion_type
 benign       7439
 malignant    2658
 no_lesion     262
 Name: count, dtype: int64,
 diagnosis_grouped
 unknown      3606
 nv           3070
 bcc          1284
 mel           930
 bkl           615
 scc_akiec     441
 no_lesion     262
 df             66
 vasc           57
 other          28
 Name: count, dtype: int64)

In [29]:
ood_df['lesion_type'].value_counts(), ood_df['diagnosis_grouped'].value_counts()


(lesion_type
 benign    2669
 Name: count, dtype: int64,
 diagnosis_grouped
 unknown    2669
 Name: count, dtype: int64)

In [10]:
all_predictions = get_predictions_all_models(models, ensemble_models, test_ds, ood_ds)

Evaluating Baseline EfficientNetB1...
✓ Baseline EfficientNetB1 evaluated


In [None]:
individual_ensemble_predictions = get_predictions_individual_ensemble_models(ensemble_models, test_ds, ood_ds)

# Combine with main predictions
all_predictions.update(individual_ensemble_predictions)

In [None]:
for key in all_predictions.keys():
    print(f"  - {key}")

# MODEL EVALUATION

In [19]:
all_results = {}

## EfficientNet Baseline model

In [None]:
baseline_predictions = all_predictions['baseline']

In [None]:
# Coarse classification (Head 1)
coarse_metrics = calculate_general_metrics(baseline_predictions['id_labels_h1'], 
                                           baseline_predictions['id_logits_h1'], 
                                           COARSE_CLASSES, "coarse")

In [None]:
# Fine classification (Head 2)
valid_mask = baseline_predictions['id_labels_h2'] >= 0
fine_metrics = calculate_general_metrics(baseline_predictions['id_labels_h2'][valid_mask], 
                                         baseline_predictions['id_logits_h2'][valid_mask], 
                                         FINE_CLASSES, "fine")

In [22]:
hierarchical_metrics = calculate_hierarchical_metrics(baseline_predictions, COARSE_CLASSES, FINE_CLASSES)

In [23]:
ood_metrics = calculate_ood_metrics(baseline_predictions['id_logits_h1'], baseline_predictions['ood_logits_h1'])

In [37]:
all_results['baseline'] = {
                'model_name': 'baseline',
                'coarse_metrics': coarse_metrics,
                'fine_metrics': fine_metrics,
                'hierarchical_metrics': hierarchical_metrics,
                'ood_metrics': ood_metrics
            }

print_metrics(coarse_metrics, fine_metrics, hierarchical_metrics, ood_metrics)


COARSE CLASSIFICATION (Head 1):
  Macro F1: 0.2786
  Balanced Accuracy: 0.3333
  MCC: 0.0000
  AUPRC Macro: 0.3847
  ECE: 0.1380

FINE CLASSIFICATION (Head 2):
  Macro F1: 0.0929
  Balanced Accuracy: 0.1583
  MCC: 0.2202
  AUPRC Macro: 0.1767
  ECE: 0.1073

HIERARCHICAL METRICS:
  Exact Match: 0.2030
  Coarse Correct: 0.3333
  Fine Conditional F1: 0.0743

OOD DETECTION:
  AUROC: 0.4657
  AUPRC: 0.7857
  FPR@95%TPR: 0.7951
  Detection Error: 0.7951

PER-CLASS ANALYSIS - COARSE:

PER-CLASS ANALYSIS - COARSE:
           precision  recall    f1  auroc  auprc  support
benign         0.718   1.000 0.836  0.597  0.780     7439
malignant      0.000   0.000 0.000  0.620  0.344     2658
no_lesion      0.000   0.000 0.000  0.488  0.031      262

PER-CLASS ANALYSIS - FINE:
           precision  recall    f1  auroc  auprc  support
nv             0.817   0.501 0.621  0.818  0.780     3070
mel            0.000   0.000 0.000  0.496  0.138      930
bkl            0.117   0.920 0.208  0.702  0.204     

## SSL model

In [None]:
baseline_predictions = all_predictions['ssl']

In [None]:
# Coarse classification (Head 1)
coarse_metrics = calculate_general_metrics(baseline_predictions['id_labels_h1'], 
                                           baseline_predictions['id_logits_h1'], 
                                           COARSE_CLASSES, "coarse")

In [None]:
# Fine classification (Head 2)
valid_mask = baseline_predictions['id_labels_h2'] >= 0
fine_metrics = calculate_general_metrics(baseline_predictions['id_labels_h2'][valid_mask], 
                                         baseline_predictions['id_logits_h2'][valid_mask], 
                                         FINE_CLASSES, "fine")

In [None]:
hierarchical_metrics = calculate_hierarchical_metrics(baseline_predictions, COARSE_CLASSES, FINE_CLASSES)

In [None]:
ood_metrics = calculate_ood_metrics(baseline_predictions['id_logits_h1'], baseline_predictions['ood_logits_h1'])

In [None]:
all_results['ssl'] = {
                'model_name': 'ssl',
                'coarse_metrics': coarse_metrics,
                'fine_metrics': fine_metrics,
                'hierarchical_metrics': hierarchical_metrics,
                'ood_metrics': ood_metrics
            }

print_metrics(coarse_metrics, fine_metrics, hierarchical_metrics, ood_metrics)

## Ensemble model

### Voting

In [None]:
ensemble_voting_predictions = all_predictions['ensemble_voting']

In [None]:
# Coarse classification (Head 1)
coarse_metrics = calculate_general_metrics(ensemble_voting_predictions['id_labels_h1'], 
                                           ensemble_voting_predictions['id_logits_h1'], 
                                           COARSE_CLASSES, "coarse")

In [None]:
# Fine classification (Head 2)
valid_mask = ensemble_voting_predictions['id_labels_h2'] >= 0
fine_metrics = calculate_general_metrics(ensemble_voting_predictions['id_labels_h2'][valid_mask], 
                                         ensemble_voting_predictions['id_logits_h2'][valid_mask], 
                                         FINE_CLASSES, "fine")

In [None]:
hierarchical_metrics = calculate_hierarchical_metrics(ensemble_voting_predictions, COARSE_CLASSES, FINE_CLASSES)

In [None]:
ood_metrics = calculate_ood_metrics(ensemble_voting_predictions['id_logits_h1'], ensemble_voting_predictions['ood_logits_h1'])

In [None]:
all_results['ensemble_voting'] = {
                'model_name': 'ensemble_voting',
                'coarse_metrics': coarse_metrics,
                'fine_metrics': fine_metrics,
                'hierarchical_metrics': hierarchical_metrics,
                'ood_metrics': ood_metrics
            }

print_metrics(coarse_metrics, fine_metrics, hierarchical_metrics, ood_metrics)

### Weighting

In [None]:
ensemble_weighted_predictions = all_predictions['ensemble_weighted']

In [None]:
# Coarse classification (Head 1)
coarse_metrics = calculate_general_metrics(ensemble_weighted_predictions['id_labels_h1'], 
                                           ensemble_weighted_predictions['id_logits_h1'], 
                                           COARSE_CLASSES, "coarse")

In [None]:
# Fine classification (Head 2)
valid_mask = ensemble_weighted_predictions['id_labels_h2'] >= 0
fine_metrics = calculate_general_metrics(ensemble_weighted_predictions['id_labels_h2'][valid_mask], 
                                         ensemble_weighted_predictions['id_logits_h2'][valid_mask], 
                                         FINE_CLASSES, "fine")

In [None]:
hierarchical_metrics = calculate_hierarchical_metrics(ensemble_weighted_predictions, COARSE_CLASSES, FINE_CLASSES)

In [None]:
ood_metrics = calculate_ood_metrics(ensemble_weighted_predictions['id_logits_h1'], ensemble_weighted_predictions['ood_logits_h1'])

In [None]:
all_results['ensemble_weighted'] = {
                'model_name': 'ensemble_weighted',
                'coarse_metrics': coarse_metrics,
                'fine_metrics': fine_metrics,
                'hierarchical_metrics': hierarchical_metrics,
                'ood_metrics': ood_metrics
            }

print_metrics(coarse_metrics, fine_metrics, hierarchical_metrics, ood_metrics)

## Individual ResNet

In [None]:
resnet_predictions = all_predictions['resnet']

In [None]:
# Coarse classification (Head 1)
coarse_metrics = calculate_general_metrics(resnet_predictions['id_labels_h1'], 
                                           resnet_predictions['id_logits_h1'], 
                                           COARSE_CLASSES, "coarse")

In [None]:
# Fine classification (Head 2)
valid_mask = resnet_predictions['id_labels_h2'] >= 0
fine_metrics = calculate_general_metrics(resnet_predictions['id_labels_h2'][valid_mask], 
                                         resnet_predictions['id_logits_h2'][valid_mask], 
                                         FINE_CLASSES, "fine")

In [None]:
hierarchical_metrics = calculate_hierarchical_metrics(resnet_predictions, COARSE_CLASSES, FINE_CLASSES)

In [None]:
ood_metrics = calculate_ood_metrics(resnet_predictions['id_logits_h1'], resnet_predictions['ood_logits_h1'])

In [None]:
# all_results['resnet'] = {
#                 'model_name': 'resnet',
#                 'coarse_metrics': coarse_metrics,
#                 'fine_metrics': fine_metrics,
#                 'hierarchical_metrics': hierarchical_metrics,
#                 'ood_metrics': ood_metrics
#             }

print_metrics(coarse_metrics, fine_metrics, hierarchical_metrics, ood_metrics)

## Individual DenseNet

In [None]:
densenet_predictions = all_predictions['densenet']

In [None]:
# Coarse classification (Head 1)
coarse_metrics = calculate_general_metrics(densenet_predictions['id_labels_h1'], 
                                           densenet_predictions['id_logits_h1'], 
                                           COARSE_CLASSES, "coarse")

In [None]:
# Fine classification (Head 2)
valid_mask = densenet_predictions['id_labels_h2'] >= 0
fine_metrics = calculate_general_metrics(densenet_predictions['id_labels_h2'][valid_mask], 
                                         densenet_predictions['id_logits_h2'][valid_mask], 
                                         FINE_CLASSES, "fine")

In [None]:
hierarchical_metrics = calculate_hierarchical_metrics(densenet_predictions, COARSE_CLASSES, FINE_CLASSES)

In [None]:
ood_metrics = calculate_ood_metrics(densenet_predictions['id_logits_h1'], densenet_predictions['ood_logits_h1'])

In [None]:
# all_results['densenet'] = {
#                 'model_name': 'densenet',
#                 'coarse_metrics': coarse_metrics,
#                 'fine_metrics': fine_metrics,
#                 'hierarchical_metrics': hierarchical_metrics,
#                 'ood_metrics': ood_metrics
#             }

print_metrics(coarse_metrics, fine_metrics, hierarchical_metrics, ood_metrics)

# Model Comparison

In [None]:
comparison_data = []
for model_key, result in all_results.items():
    model_name = result['model_name']
    comparison_data.append({
        'Model': model_name,
        # General metrics - Coarse
        'Coarse Macro-F1': result['coarse_metrics']['macro_f1'],
        'Coarse Balanced Acc': result['coarse_metrics']['balanced_accuracy'],
        'Coarse MCC': result['coarse_metrics']['mcc'],
        'Coarse AUPRC': result['coarse_metrics']['auprc_macro'],
        'Coarse ECE': result['coarse_metrics']['ece'],
        # General metrics - Fine
        'Fine Macro-F1': result['fine_metrics']['macro_f1'],
        'Fine Balanced Acc': result['fine_metrics']['balanced_accuracy'],
        'Fine MCC': result['fine_metrics']['mcc'],
        'Fine AUPRC': result['fine_metrics']['auprc_macro'],
        'Fine ECE': result['fine_metrics']['ece'],
        # Hierarchical metrics
        'Exact Match': result['hierarchical_metrics']['exact_match'],
        'Coarse Correct': result['hierarchical_metrics']['coarse_correct'],
        'Fine Conditional F1': result['hierarchical_metrics']['fine_conditional_f1'],
        # OOD metrics
        'OOD AUROC': result['ood_metrics']['auroc'],
        'OOD AUPRC': result['ood_metrics']['auprc'],
        'FPR@95%TPR': result['ood_metrics']['fpr_at_95_tpr'],
        'Detection Error': result['ood_metrics']['detection_error']
    })

comparison_df = pd.DataFrame(comparison_data)

In [None]:
print(comparison_df.round(4))

In [None]:
# General metrics
best_coarse_f1 = comparison_df.loc[comparison_df['Coarse Macro-F1'].idxmax()]
best_fine_f1 = comparison_df.loc[comparison_df['Fine Macro-F1'].idxmax()]
best_coarse_ece = comparison_df.loc[comparison_df['Coarse ECE'].idxmin()]
best_fine_ece = comparison_df.loc[comparison_df['Fine ECE'].idxmin()]

# Hierarchical metrics
best_exact_match = comparison_df.loc[comparison_df['Exact Match'].idxmax()]
best_coarse_correct = comparison_df.loc[comparison_df['Coarse Correct'].idxmax()]
best_fine_conditional = comparison_df.loc[comparison_df['Fine Conditional F1'].idxmax()]

# OOD metrics
best_ood_auroc = comparison_df.loc[comparison_df['OOD AUROC'].idxmax()]
best_ood_auprc = comparison_df.loc[comparison_df['OOD AUPRC'].idxmax()]
best_fpr_95 = comparison_df.loc[comparison_df['FPR@95%TPR'].idxmin()]
best_detection_error = comparison_df.loc[comparison_df['Detection Error'].idxmin()]

print(f"📊 GENERAL EVALUATION:")
print(f"  Best Coarse Macro-F1: {best_coarse_f1['Model']} ({best_coarse_f1['Coarse Macro-F1']:.4f})")
print(f"  Best Fine Macro-F1: {best_fine_f1['Model']} ({best_fine_f1['Fine Macro-F1']:.4f})")
print(f"  Best Coarse Calibration: {best_coarse_ece['Model']} (ECE: {best_coarse_ece['Coarse ECE']:.4f})")
print(f"  Best Fine Calibration: {best_fine_ece['Model']} (ECE: {best_fine_ece['Fine ECE']:.4f})")

print(f"\n📊 HIERARCHICAL EVALUATION:")
print(f"  Best Exact Match: {best_exact_match['Model']} ({best_exact_match['Exact Match']:.4f})")
print(f"  Best Coarse Correct: {best_coarse_correct['Model']} ({best_coarse_correct['Coarse Correct']:.4f})")
print(f"  Best Fine Conditional: {best_fine_conditional['Model']} ({best_fine_conditional['Fine Conditional F1']:.4f})")

print(f"\n📊 OOD DETECTION:")
print(f"  Best OOD AUROC: {best_ood_auroc['Model']} ({best_ood_auroc['OOD AUROC']:.4f})")
print(f"  Best OOD AUPRC: {best_ood_auprc['Model']} ({best_ood_auprc['OOD AUPRC']:.4f})")
print(f"  Best FPR@95%TPR: {best_fpr_95['Model']} ({best_fpr_95['FPR@95%TPR']:.4f})")
print(f"  Best Detection Error: {best_detection_error['Model']} ({best_detection_error['Detection Error']:.4f})")

In [None]:
comparison_df['Overall Score'] = (
    0.4 * comparison_df['Coarse Macro-F1'] +
    0.3 * comparison_df['Fine Macro-F1'] +
    0.2 * comparison_df['OOD AUROC'] +
    0.1 * (1 - comparison_df['Fine ECE'])  # Lower ECE is better
)

# Rank models by overall performance
ranked_models = comparison_df.sort_values('Overall Score', ascending=False)

In [None]:
f, axes = plt.subplots(2, 3, figsize=(10, 12))
f.suptitle('General Evaluation Metrics - Focused Analysis', fontsize=16, fontweight='bold')

models = comparison_df['Model'].tolist()
colors = [MODELS_CONFIG.get(key, {}).get('color', '#666666') for key in all_results.keys()]

# Coarse metrics
axes[0, 0].bar(models, comparison_df['Coarse Macro-F1'], color=colors, alpha=0.7)
axes[0, 0].set_title('Coarse Macro-F1 Score (Primary Metric)', fontweight='bold')
axes[0, 0].set_ylabel('Macro-F1')
axes[0, 0].tick_params(axis='x', rotation=45)
axes[0, 0].grid(True, alpha=0.3)

axes[0, 1].bar(models, comparison_df['Coarse Balanced Acc'], color=colors, alpha=0.7)
axes[0, 1].set_title('Coarse Balanced Accuracy', fontweight='bold')
axes[0, 1].set_ylabel('Balanced Accuracy')
axes[0, 1].tick_params(axis='x', rotation=45)
axes[0, 1].grid(True, alpha=0.3)

axes[0, 2].bar(models, comparison_df['Coarse MCC'], color=colors, alpha=0.7)
axes[0, 2].set_title('Coarse Matthews Correlation Coefficient', fontweight='bold')
axes[0, 2].set_ylabel('MCC')
axes[0, 2].tick_params(axis='x', rotation=45)
axes[0, 2].grid(True, alpha=0.3)

# Fine metrics
axes[1, 0].bar(models, comparison_df['Fine Macro-F1'], color=colors, alpha=0.7)
axes[1, 0].set_title('Fine Macro-F1 Score', fontweight='bold')
axes[1, 0].set_ylabel('Macro-F1')
axes[1, 0].tick_params(axis='x', rotation=45)
axes[1, 0].grid(True, alpha=0.3)

axes[1, 1].bar(models, comparison_df['Fine AUPRC'], color=colors, alpha=0.7)
axes[1, 1].set_title('Fine AUPRC Macro', fontweight='bold')
axes[1, 1].set_ylabel('AUPRC')
axes[1, 1].tick_params(axis='x', rotation=45)
axes[1, 1].grid(True, alpha=0.3)

axes[1, 2].bar(models, comparison_df['Fine ECE'], color=colors, alpha=0.7)
axes[1, 2].set_title('Fine Expected Calibration Error', fontweight='bold')
axes[1, 2].set_ylabel('ECE (Lower is Better)')
axes[1, 2].tick_params(axis='x', rotation=45)
axes[1, 2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
f, axes = plt.subplots(1, 3, figsize=(10, 6))
f.suptitle('Hierarchical Evaluation Metrics', fontsize=16, fontweight='bold')

axes[0].bar(models, comparison_df['Exact Match'], color=colors, alpha=0.7)
axes[0].set_title('Exact Match (Both Heads Correct)', fontweight='bold')
axes[0].set_ylabel('Exact Match Rate')
axes[0].tick_params(axis='x', rotation=45)
axes[0].grid(True, alpha=0.3)

axes[1].bar(models, comparison_df['Coarse Correct'], color=colors, alpha=0.7)
axes[1].set_title('Coarse Classification Accuracy', fontweight='bold')
axes[1].set_ylabel('Balanced Accuracy')
axes[1].tick_params(axis='x', rotation=45)
axes[1].grid(True, alpha=0.3)

axes[2].bar(models, comparison_df['Fine Conditional F1'], color=colors, alpha=0.7)
axes[2].set_title('Fine Classification F1 (Conditional)', fontweight='bold')
axes[2].set_ylabel('Macro-F1')
axes[2].tick_params(axis='x', rotation=45)
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
f, axes = plt.subplots(2, 2, figsize=(10, 10))
f.suptitle('Out-of-Distribution Detection Performance', fontsize=16, fontweight='bold')

axes[0, 0].bar(models, comparison_df['OOD AUROC'], color=colors, alpha=0.7)
axes[0, 0].set_title('OOD Detection AUROC', fontweight='bold')
axes[0, 0].set_ylabel('AUROC')
axes[0, 0].tick_params(axis='x', rotation=45)
axes[0, 0].grid(True, alpha=0.3)

axes[0, 1].bar(models, comparison_df['OOD AUPRC'], color=colors, alpha=0.7)
axes[0, 1].set_title('OOD Detection AUPRC', fontweight='bold')
axes[0, 1].set_ylabel('AUPRC')
axes[0, 1].tick_params(axis='x', rotation=45)
axes[0, 1].grid(True, alpha=0.3)

axes[1, 0].bar(models, comparison_df['FPR@95%TPR'], color=colors, alpha=0.7)
axes[1, 0].set_title('FPR@95%TPR (Lower is Better)', fontweight='bold')
axes[1, 0].set_ylabel('FPR')
axes[1, 0].tick_params(axis='x', rotation=45)
axes[1, 0].grid(True, alpha=0.3)

axes[1, 1].bar(models, comparison_df['Detection Error'], color=colors, alpha=0.7)
axes[1, 1].set_title('Detection Error (Lower is Better)', fontweight='bold')
axes[1, 1].set_ylabel('Detection Error')
axes[1, 1].tick_params(axis='x', rotation=45)
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# =============================================================================
# FINAL SUMMARY AND RECOMMENDATIONS
# =============================================================================

print(f"\n{'='*80}")
print("FINAL EVALUATION SUMMARY")
print(f"{'='*80}")

# Calculate overall performance score (weighted combination of key metrics)
# Primary metrics: Coarse Macro-F1 (40%), Fine Macro-F1 (30%), OOD AUROC (20%), ECE (10%)
comparison_df['Overall Score'] = (
    0.4 * comparison_df['Coarse Macro-F1'] +
    0.3 * comparison_df['Fine Macro-F1'] +
    0.2 * comparison_df['OOD AUROC'] +
    0.1 * (1 - comparison_df['Fine ECE'])  # Lower ECE is better
)

# Rank models by overall performance
ranked_models = comparison_df.sort_values('Overall Score', ascending=False)

print(f"\n📊 OVERALL MODEL RANKING:")
print(f"{'-'*50}")
for i, (_, row) in enumerate(ranked_models.iterrows(), 1):
    print(f"{i}. {row['Model']}: {row['Overall Score']:.4f}")
    print(f"   Coarse F1: {row['Coarse Macro-F1']:.4f}, Fine F1: {row['Fine Macro-F1']:.4f}")
    print(f"   OOD AUROC: {row['OOD AUROC']:.4f}, ECE: {row['Fine ECE']:.4f}")

# Key insights
print(f"\n🎯 KEY INSIGHTS:")
print(f"{'-'*20}")

best_model = ranked_models.iloc[0]
baseline_model = ranked_models[ranked_models['Model'].str.contains('Baseline')]

if len(baseline_model) > 0:
    baseline_score = baseline_model.iloc[0]['Overall Score']
    improvement = ((best_model['Overall Score'] - baseline_score) / baseline_score) * 100
    print(f"• Best model improves over baseline by {improvement:.2f}%")

# Check ensemble performance
ensemble_models = ranked_models[ranked_models['Model'].str.contains('Ensemble')]
if len(ensemble_models) > 0:
    print(f"• Ensemble methods show superior performance")
    print(f"  - Best ensemble: {ensemble_models.iloc[0]['Model']}")

# Check SSL performance
ssl_models = ranked_models[ranked_models['Model'].str.contains('SSL')]
if len(ssl_models) > 0:
    print(f"• SSL fine-tuning demonstrates clear benefits")
    print(f"  - SSL model rank: {ranked_models[ranked_models['Model'].str.contains('SSL')].index[0] + 1}")

# Clinical recommendations
print(f"\n💡 CLINICAL RECOMMENDATIONS:")
print(f"{'-'*30}")

# Best model for different use cases
best_coarse = ranked_models.iloc[ranked_models['Coarse Macro-F1'].idxmax()]
best_fine = ranked_models.iloc[ranked_models['Fine Macro-F1'].idxmax()]
best_ood = ranked_models.iloc[ranked_models['OOD AUROC'].idxmax()]
best_calibrated = ranked_models.iloc[ranked_models['Fine ECE'].idxmin()]

print(f"1. **Primary Classification**: {best_coarse['Model']} (Coarse F1: {best_coarse['Coarse Macro-F1']:.4f})")
print(f"2. **Fine-grained Diagnosis**: {best_fine['Model']} (Fine F1: {best_fine['Fine Macro-F1']:.4f})")
print(f"3. **Uncertainty Detection**: {best_ood['Model']} (OOD AUROC: {best_ood['OOD AUROC']:.4f})")
print(f"4. **Confidence Calibration**: {best_calibrated['Model']} (ECE: {best_calibrated['Fine ECE']:.4f})")

# Production recommendations
print(f"\n🚀 PRODUCTION DEPLOYMENT:")
print(f"{'-'*25}")
print(f"• **Recommended Model**: {best_model['Model']}")
print(f"• **Confidence Threshold**: Use detection error optimal threshold")
print(f"• **Monitoring**: Track both coarse and fine performance")
print(f"• **Fallback**: Human expert review for OOD cases")

# Performance thresholds
print(f"\n📈 PERFORMANCE THRESHOLDS:")
print(f"{'-'*25}")
print(f"• Coarse Macro-F1 ≥ 0.80: {'✓' if best_model['Coarse Macro-F1'] >= 0.80 else '✗'}")
print(f"• Fine Macro-F1 ≥ 0.70: {'✓' if best_model['Fine Macro-F1'] >= 0.70 else '✗'}")
print(f"• OOD AUROC ≥ 0.80: {'✓' if best_model['OOD AUROC'] >= 0.80 else '✗'}")
print(f"• ECE ≤ 0.10: {'✓' if best_model['Fine ECE'] <= 0.10 else '✗'}")

print(f"\n✅ EVALUATION COMPLETED SUCCESSFULLY!")
print(f"📁 Results saved and ready for analysis")
print(f"🎯 Best performing model: {best_model['Model']}")
