# Notebook 3: Model Evaluation and Comparative Analysis

**Goal:**
1.  Evaluate trained models on the test set.
2.  Calculate performance metrics.
3.  Load and compare results from multiple experimental setups.
4.  Perform qualitative analysis by visualizing predictions.
5.  Synthesize findings and discuss project structure.

In [1]:
import torch
import torch.nn as nn
import time
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from tqdm.notebook import tqdm # For progress bars

# Import utility functions
import utils

# --- Define Setup for SINGLE Model Evaluation (initially) ---
# This should match a *completed* training run from Notebook 2

# Unique identifier for the experimental setup to evaluate
SETUP_ID_TO_EVAL = 'resnet_mid_aug' # <<< CHANGE THIS TO THE DESIRED SETUP ID >>>

MODEL_NAME = 'resnet'        # 'resnet' or 'mobilenet'  (must match SETUP_ID_TO_EVAL)
UNFREEZE_STRATEGY = 'mid'    # 'head', 'mid', or 'deep' (must match SETUP_ID_TO_EVAL)
# AUGMENT_TRAIN is not directly used for evaluation, but kept for consistency if model loading needs it

# --- Device Setup ---
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# --- Unfreeze Strategy Mapping ---
resnet_unfreeze_map = {
    'head': ['fc.'],
    'mid':  ['fc.', 'layer4.'],
    'deep': ['fc.', 'layer4.', 'layer3.']
}
mobilenet_unfreeze_map = {
    'head': ['classifier.3.'],
    'mid':  ['classifier.3.', 'features.10.', 'features.11.', 'features.12.'],
    'deep': ['classifier.3.', 'features.8.', 'features.9.', 'features.10.', 'features.11.', 'features.12.']
}
if MODEL_NAME == 'resnet':
    layers_to_unfreeze = resnet_unfreeze_map[UNFREEZE_STRATEGY]
elif MODEL_NAME == 'mobilenet':
    layers_to_unfreeze = mobilenet_unfreeze_map[UNFREEZE_STRATEGY]

# --- Filenames ---
# Input file from Notebook 2
best_model_filename = f"best_model_{SETUP_ID_TO_EVAL}.pth"
# Output files for this notebook
predictions_filename = f"predictions_{SETUP_ID_TO_EVAL}.json" # Store as list of dicts
eval_metrics_filename = f"eval_metrics_{SETUP_ID_TO_EVAL}.json"

print(f"Evaluating Setup ID: {SETUP_ID_TO_EVAL}")
print(f"Model: {MODEL_NAME}, Unfreeze: {UNFREEZE_STRATEGY}")

Using device: mps
Evaluating Setup ID: resnet_mid_aug
Model: resnet, Unfreeze: mid


In [3]:
# 1. Instantiate Model Architecture
model = utils.get_model(MODEL_NAME)
model = utils.adapt_model_head(model, MODEL_NAME)
model = utils.apply_unfreeze_logic(model, layers_to_unfreeze) # Apply same unfreeze logic
print(f"Model '{MODEL_NAME}' architecture instantiated for evaluation.")

AttributeError: 'tuple' object has no attribute 'parameters'

In [4]:
model

(ResNet(
   (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
   (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (relu): ReLU(inplace=True)
   (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
   (layer1): Sequential(
     (0): BasicBlock(
       (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
       (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
       (relu): ReLU(inplace=True)
       (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
       (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
     )
     (1): BasicBlock(
       (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
       (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
       (relu): ReLU

In [None]:


# 2. Load Saved Best Model Weights
print(f"Loading best model weights from: {best_model_filename}")
# Add check if file exists? No error handling per request.
model.load_state_dict(torch.load(best_model_filename, map_location=device))
model.to(device)
model.eval() # Set to evaluation mode
print("Best model weights loaded and model set to evaluation mode.")

# 3. Load Test Dataset and DataLoader
# Test data is *never* augmented
_ , _, test_dataset = utils.get_datasets(augment_train=False) # Pass dummy augment_train
# The first two datasets (train, val) are not needed here
# weights_obj is not needed for simplified get_datasets

_, _, test_loader = utils.get_dataloaders(
    None, # train_dataset not needed
    None, # val_dataset not needed
    test_dataset,
    batch_size=64 # Can use a reasonable batch size for inference
)
print(f"Test DataLoader created. Test batches: {len(test_loader)}")

# Get class names for OxfordIIITPet (for confusion matrix and reports)
# This is a bit of a hack to get class names if not stored elsewhere.
# Assumes the test_dataset is an instance of OxfordIIITPet or a Subset of it.
if isinstance(test_dataset, torch.utils.data.Subset):
    # Access the underlying dataset if it's a Subset
    if hasattr(test_dataset.dataset, 'classes'):
         class_names = test_dataset.dataset.classes
    elif hasattr(test_dataset.dataset, '_breeds'): # OxfordIIITPet specific
         class_names = test_dataset.dataset._breeds
    else: # Fallback if classes are not directly accessible
         class_names = [str(i) for i in range(utils.NUM_CLASSES)]

elif hasattr(test_dataset, 'classes'):
    class_names = test_dataset.classes
elif hasattr(test_dataset, '_breeds'): # OxfordIIITPet specific
    class_names = test_dataset._breeds
else:
    class_names = [str(i) for i in range(utils.NUM_CLASSES)]

print(f"Number of classes: {len(class_names)}")
# print(class_names) # Uncomment to see class names