# **1. Objective of This Notebook**

This notebook answers four critical questions:

* How well does the model perform per class?

* What visual regions does the model attend to?

* Are learned representations well separated?

* Can the model’s confidence be trusted?

Together, these analyses help assess model reliability, not just accuracy. A final section exports the misclassified validation samples for failure analysis.

## **3. Quantitative Evaluation (Metrics & Confusion Matrix)**

3.1 Collect Predictions Safely

In [None]:
def collect_logits_labels(model, loader, device):
    """Run ``model`` over ``loader`` and gather softmax outputs.

    The model is switched to eval mode and evaluated without gradients;
    CUDA autocast is enabled only when ``device`` is a CUDA device. Logits
    are cast to float32 and moved to the CPU batch by batch.

    Args:
        model: callable module mapping an input batch to class logits.
        loader: iterable yielding ``(inputs, targets)`` tensor pairs.
        device: ``torch.device`` the inputs are moved to.

    Returns:
        Tuple ``(probs, preds, labels)`` of NumPy arrays: per-class softmax
        probabilities, arg-max class indices, and the ground-truth labels.
    """
    model.eval()
    use_amp = device.type == 'cuda'
    batch_logits = []
    batch_targets = []
    with torch.no_grad():
        with torch.amp.autocast(device_type='cuda', enabled=use_amp):
            for inputs, targets in loader:
                inputs = inputs.to(device, non_blocking=True)
                # Upcast before leaving the device so softmax runs in float32.
                batch_logits.append(model(inputs).float().cpu())
                batch_targets.append(targets.cpu())
    all_logits = torch.cat(batch_logits)
    all_targets = torch.cat(batch_targets)
    probs = torch.softmax(all_logits, dim=1).numpy()
    preds = probs.argmax(1)
    return probs, preds, all_targets.numpy()


In [None]:
# LabelSmoothingCE and validate are defined outside this section; 0.1 is
# presumably the label-smoothing factor — TODO confirm against its definition.
criterion = LabelSmoothingCE(0.1)
# Aggregate validation metrics; the fifth value (va_cm) is plotted later as
# the confusion matrix.
va_loss, va_acc, va_f1, va_top3, va_cm = validate(model, val_loader, criterion, DEVICE)
# Second pass over the same loader to get per-sample probabilities,
# predictions and labels, which the report, ECE and export cells reuse.
probs, preds, labels = collect_logits_labels(model, val_loader, DEVICE)

print(f"Val loss: {va_loss:.4f} | Acc: {va_acc:.4f} | F1: {va_f1:.4f} | Top-3: {va_top3:.4f}")


3.2 Per-Class Performance Report

In [None]:
from sklearn.metrics import classification_report
import seaborn as sns, matplotlib.pyplot as plt

# Class names in index order, so entry i of target_names describes class i.
class_ids = list(range(len(idx2label)))
target_names = [idx2label[i] for i in class_ids]
# Pass `labels=` explicitly. Without it scikit-learn infers the class set from
# the data, and the call fails with a target_names length mismatch as soon as
# one class is absent from both `labels` and `preds`.
report = classification_report(
    labels,
    preds,
    labels=class_ids,
    target_names=target_names,
    output_dict=True,
    zero_division=0,
)
# One row per class plus the aggregate rows, written next to the figures.
pd.DataFrame(report).transpose().to_csv(FIG_DIR / "per_class_report.csv")
print("Saved per_class_report.csv")


In [None]:
# Draw the validation confusion matrix (va_cm) as a heatmap and save it as PNG.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(va_cm, cmap="Blues", cbar=True, ax=ax)
ax.set_title("Confusion Matrix (Validation)")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
fig.tight_layout()
fig.savefig(FIG_DIR / "confusion_matrix_val.png", dpi=200)
# Close the figure so it is not rendered inline and its memory is released.
plt.close(fig)


## **4. Spatial Explainability — Grad-CAM**
4.1 Motivation

Grad-CAM highlights where the model looks when making predictions, helping detect:

* background bias
* shortcut learning
* disease-region focus

4.2 Grad-CAM Visualization

In [None]:
!pip -q install torchcam==0.4.0
from torchcam.methods import GradCAM
import cv2, matplotlib.pyplot as plt

In [None]:
def overlay_cam(img_rgb, cam, alpha=0.4):
    """Blend a class-activation map over an image as a JET heatmap.

    Args:
        img_rgb: image array whose first two axes are height and width; it
            is blended with a 3-channel RGB heatmap of the same size.
        cam: activation map as a NumPy array. A 2-D ``(h, w)`` map is
            expected; leading singleton axes such as ``(1, h, w)`` are
            dropped so a batched CAM of size one can be passed directly.
        alpha: weight of the heatmap in the blend (the image gets
            ``1 - alpha``).

    Returns:
        ``uint8`` array with the heatmap blended over ``img_rgb``.
    """
    cam = np.asarray(cam)
    # cv2.resize would read a (1, h, w) array as a 1-pixel-high multi-channel
    # image, so strip leading batch axes of size one first.
    while cam.ndim > 2 and cam.shape[0] == 1:
        cam = cam[0]
    # cv2.resize takes the target size as (width, height).
    cam = cv2.resize(cam, (img_rgb.shape[1], img_rgb.shape[0]))
    # Min-max normalise to [0, 1]; the epsilon guards a constant map.
    cam = (cam - cam.min()) / (cam.max() - cam.min() + 1e-6)
    heat = cv2.applyColorMap((255*cam).astype(np.uint8), cv2.COLORMAP_JET)
    # OpenCV colour maps are BGR; flip the channel axis to match the RGB image.
    heat = heat[..., ::-1]
    return (alpha*heat + (1-alpha)*img_rgb).astype(np.uint8)


In [None]:
# Find a late conv block for EfficientNet-B3: keep the LAST module whose name
# contains "blocks.6" and that exposes a `conv_pw` attribute (no `break` on
# purpose, so later matches overwrite earlier ones).
# NOTE(review): if nothing matches, target_layer stays None and torchcam
# presumably falls back to its own layer selection — confirm.
target_layer = None
for n, m in model.named_modules():
    if "blocks.6" in n and hasattr(m, "conv_pw"):
        target_layer = n

# Grad-CAM needs deterministic eval-mode behaviour for single-image batches.
model.eval()
cam_extractor = GradCAM(model, target_layer)
samples = val_df.sample(8, random_state=42)
tf = get_val_transform()

fig, axes = plt.subplots(4, 2, figsize=(10,18))
axes = axes.flatten()

try:
    for ax, (_, row) in zip(axes, samples.iterrows()):
        ax.axis("off")
        img = cv2.imread(row.filepath)
        if img is None:
            # cv2.imread returns None for unreadable/missing files; leave the
            # panel empty instead of crashing inside cvtColor.
            ax.set_title("unreadable image")
            continue
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        x = tf(image=img)["image"].unsqueeze(0).to(DEVICE)
        # Gradients must stay enabled here: Grad-CAM back-propagates the score.
        out = model(x)
        cls = out.argmax(1).item()
        # torchcam returns one tensor per target layer with a leading batch
        # axis; drop it so overlay_cam receives a 2-D (h, w) map.
        cam = cam_extractor(cls, out)[0].squeeze(0).detach().cpu().numpy()
        ax.imshow(overlay_cam(img, cam))
        ax.set_title(f"Pred: {idx2label[cls]}")
finally:
    # Detach torchcam's hooks so they do not keep firing (and accumulating
    # state) on every later forward pass through the model.
    cam_extractor.remove_hooks()

plt.tight_layout()
plt.savefig(FIG_DIR / "gradcam_samples.png", dpi=200)
plt.close()

## **5. Representation Analysis — UMAP**
5.1 Motivation

UMAP reveals:

* class separability
* overlapping disease clusters
* representation quality beyond accuracy

5.2 Safe Feature Extraction

In [None]:
import umap, gc

def extract_features(model, loader, device):
    """Collect pooled backbone features and labels for every batch in ``loader``.

    Features are ``model.global_pool(model.forward_features(x))``, computed
    under CUDA autocast when ``device`` is a CUDA device, then detached and
    returned as float32 so downstream NumPy code never sees half precision.

    Args:
        model: module exposing ``forward_features`` and ``global_pool``.
        loader: iterable yielding ``(inputs, targets)`` tensor pairs.
        device: ``torch.device`` the inputs are moved to.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays concatenated over batches.
    """
    model.eval()
    use_amp = device.type == 'cuda'
    feats, labs = [], []
    # NOTE(review): autograd is left enabled and the result is detached
    # instead. The Grad-CAM extractor created earlier in this notebook hooks
    # the model, and those hooks may need grad-enabled outputs — confirm they
    # are removed before switching this loop to torch.no_grad().
    for x, y in loader:
        x = x.to(device, non_blocking=True)
        with torch.amp.autocast(device_type='cuda', enabled=use_amp):
            f = model.global_pool(model.forward_features(x)).detach()
        # Upcast: autocast can yield float16 activations on CUDA.
        feats.append(f.float().cpu().numpy())
        labs.append(y.cpu().numpy())
    return np.concatenate(feats), np.concatenate(labs)

# Pooled features and labels for val_loader; the UMAP cell projects `feats`
# and colours the points by `labs`.
feats, labs = extract_features(model, val_loader, DEVICE)


In [None]:
# Project the extracted features to 2-D with UMAP (cosine metric, fixed seed).
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric="cosine", random_state=42)
emb = reducer.fit_transform(feats)

# One scatter series per class so each gets its own colour and legend entry.
fig, ax = plt.subplots(figsize=(10, 8))
for i in range(len(idx2label)):
    mask = labs == i
    if not mask.sum():
        # Classes with no samples are skipped to keep the legend clean.
        continue
    ax.scatter(emb[mask, 0], emb[mask, 1], s=6, alpha=0.6, label=idx2label[i])
# Anchor the legend outside the axes so it does not cover the points.
ax.legend(markerscale=4, bbox_to_anchor=(1.05, 1), fontsize=7)
ax.set_title("UMAP Projection (Validation)")
fig.tight_layout()
fig.savefig(FIG_DIR / "umap_val.png", dpi=200)
plt.close(fig)

## **6. Confidence Calibration — Temperature Scaling**
6.1 Why Calibration Matters

High accuracy ≠ reliable confidence.

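We quantify the gap between confidence and accuracy with the Expected Calibration Error (ECE): predictions are grouped into equal-width confidence bins $B_b$, and ECE is the sample-weighted average of the per-bin gap, $\mathrm{ECE} = \sum_b \frac{|B_b|}{N}\,\lvert \mathrm{acc}(B_b) - \mathrm{conf}(B_b) \rvert$. Lower is better; 0 means confidence matches accuracy in every bin.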

6.2 Compute ECE & Apply Temperature Scaling

In [None]:
def compute_ece(probs, labels, n_bins=15):
    """Expected Calibration Error of the top-1 predictions.

    Confidences (row-wise max of ``probs``) are grouped into ``n_bins``
    equal-width, right-closed bins over ``[0, 1]``; each non-empty bin
    contributes its share of samples times ``|accuracy - mean confidence|``.

    Args:
        probs: 2-D NumPy array of per-class probabilities, one row per sample.
        labels: 1-D NumPy array of true class indices.
        n_bins: number of confidence bins.

    Returns:
        The ECE as a float (``0.0`` when no bin is populated).
    """
    confidence = probs.max(1)
    correct = (probs.argmax(1) == labels).astype(float)
    edges = np.linspace(0, 1, n_bins + 1)
    total = 0.0
    for lower, upper in zip(edges[:-1], edges[1:]):
        in_bin = (confidence > lower) & (confidence <= upper)
        if not in_bin.any():
            continue
        gap = abs(correct[in_bin].mean() - confidence[in_bin].mean())
        # in_bin.mean() is the fraction of all samples that fall in this bin.
        total += in_bin.mean() * gap
    return total

# Baseline calibration error of the unscaled softmax probabilities, kept for
# comparison with the value printed after temperature scaling.
ece_before = compute_ece(probs, labels)
print("ECE before:", round(ece_before, 4))

In [None]:
# Single scalar temperature, initialised at 1 (i.e. no rescaling).
T = torch.nn.Parameter(torch.ones(1), requires_grad=True)
opt = torch.optim.LBFGS([T], lr=0.1, max_iter=50)

# Range the temperature is restricted to, both while fitting and when applied.
T_MIN, T_MAX = 0.05, 10.0

# Only probabilities were kept, so log-probabilities stand in for the logits:
# they differ from the true logits by a per-row constant, which softmax
# ignores even after dividing by T. Clamp before the log so a probability that
# underflowed to exactly 0 does not become -inf (inf loss / NaN gradients).
logits = torch.tensor(probs).clamp_min(1e-12).log()
# cross_entropy requires int64 class indices.
labels_t = torch.as_tensor(labels, dtype=torch.long)

def closure():
    """Re-evaluate the NLL at the current temperature for L-BFGS."""
    opt.zero_grad()
    loss = torch.nn.functional.cross_entropy(logits / T.clamp(T_MIN, T_MAX), labels_t)
    loss.backward()
    return loss

opt.step(closure)

# Read back the temperature that the loss actually used: the raw parameter may
# sit outside [T_MIN, T_MAX] even though the loss only ever saw the clamped value.
T_final = float(T.detach().clamp(T_MIN, T_MAX).item())
probs_scaled = torch.softmax(logits / T_final, dim=1).numpy()
# NOTE(review): T is fitted and evaluated on the same samples, so this ECE is
# an optimistic estimate — confirm on a held-out split.
ece_after = compute_ece(probs_scaled, labels)

print("ECE after:", round(ece_after, 4))

In [None]:
# Persist the fitted temperature as plain text under CKPT_DIR.
(CKPT_DIR / "temperature.txt").write_text(str(T_final))

## **7. Failure Analysis — Misclassification Export**
7.1 Motivation

We export misclassifications with contextual cues to identify:

* lighting issues
* aspect ratio bias
* black borders

7.2 Export Misclassified Samples

In [None]:
import cv2

def context_tags(fp):
    """Compute coarse image-quality tags for the image file at ``fp``.

    Args:
        fp: path of the image, passed to ``cv2.imread``.

    Returns:
        Dict with keys ``black_bars`` (bool), ``aspect`` (``"tall"``,
        ``"wide"``, ``"square"`` or ``"unknown"``), ``bright`` and ``sat``
        (mean HSV value / saturation scaled to ``[0, 1]``, 3 decimals).
        Neutral defaults are returned when the file cannot be read.
    """
    img = cv2.imread(fp)
    if img is None:
        return {"black_bars": False, "aspect": "unknown", "bright": 0.0, "sat": 0.0}
    h, w = img.shape[:2]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    # A border counts as a black bar when more than 90% of its outermost
    # pixels are near-black (< 5). All four edges are inspected so bars at the
    # top, bottom or right are caught as well as one on the left.
    edges = (gray[:, 0], gray[:, -1], gray[0, :], gray[-1, :])
    black_bars = any((edge < 5).mean() > 0.9 for edge in edges)

    # More than 20% longer on one side than the other is "tall" / "wide".
    if h / w > 1.2:
        aspect = "tall"
    elif w / h > 1.2:
        aspect = "wide"
    else:
        aspect = "square"

    return {
        # Plain Python types, matching the fallback dict above.
        "black_bars": bool(black_bars),
        "aspect": aspect,
        "bright": round(float(hsv[..., 2].mean()) / 255, 3),
        "sat": round(float(hsv[..., 1].mean()) / 255, 3),
    }

In [None]:
rows = []
paths = val_df["filepath"].tolist()
confs = probs.max(1)

# zip() silently stops at the shortest input, so a length mismatch would pair
# predictions with the wrong files without any error. Fail loudly instead.
# NOTE(review): equal lengths do not prove equal order — this also assumes the
# loader iterates val_df in row order (no shuffling); confirm.
if not (len(paths) == len(labels) == len(preds) == len(confs)):
    raise ValueError(
        f"Length mismatch: {len(paths)} filepaths vs {len(labels)} labels, "
        f"{len(preds)} predictions, {len(confs)} confidences"
    )

for fp, y, p, conf in zip(paths, labels, preds, confs):
    if y != p:
        rows.append({
            "filepath": fp,
            "true": idx2label[int(y)],
            "pred": idx2label[int(p)],
            "confidence": float(conf),
            # Image-quality cues (black bars, aspect, brightness, saturation).
            **context_tags(fp)
        })

if rows:
    misclassified = pd.DataFrame(rows)
else:
    # With no misclassifications, still write the header row so downstream
    # readers find the expected columns instead of an empty file.
    misclassified = pd.DataFrame(columns=[
        "filepath", "true", "pred", "confidence",
        "black_bars", "aspect", "bright", "sat",
    ])

misclassified.to_csv(FIG_DIR / "misclassifications_val.csv", index=False)
print("Saved misclassifications_val.csv")

## **8. Key Takeaways**

* The model attends to disease regions, not backgrounds (Grad-CAM)

* Some disease classes overlap in feature space (UMAP)

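* Calibration (temperature scaling) significantly improves confidence reliability, as measured by ECE before vs. after; note that the temperature is fitted and evaluated on the same validation set, so the gain should be confirmed on held-out data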

* Misclassifications correlate with image quality issues