
# Capstone — CIFAR‑100 (QC'd, Sequential)  
**Models:** ResNet‑18 • WideResNet‑28×10 • ConvNeXt‑Tiny • ViT‑Tiny • ViT‑Hybrid (ResNet‑26 + ViT Small)  

This notebook is **QC'd** and follows **capstone best practices**. It trains and compares multiple architectures on **CIFAR‑100**, producing:
- Metrics: **Top‑1 / Top‑5**, loss (val/test)
- **Confusion matrix**, **per‑class accuracy** (with class names)
- **Calibration**: **ECE** (raw) and **ECE after temperature scaling** with **reliability diagrams**
- **Best checkpoint** per model (by validation Top‑1)
- **Per‑epoch CSV logs**, plots, and **aggregate summary**
- **Early stopping** with patience/min_delta/min_epochs and **target Top‑1** thresholds
- **Strong augmentations**, optional **Mixup/CutMix**, **EMA**, **AMP**, **channels‑last**, grad accumulation
- Optional **TensorBoard/W&B** logging and **one‑click ZIP export**

> Runs **sequentially with a dedicated cell for each model** so you can execute them one by one or all together.


## 0) (Optional) Installs

In [1]:
# !pip -q install torch torchvision timm scikit-learn matplotlib pandas tensorboard wandb

## 1) Environment & Reproducibility

In [2]:

import os, math, json, random, time, sys, platform, copy, shutil
from datetime import datetime
from typing import Tuple, List, Dict, Optional

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
import torchvision
from torchvision import datasets, transforms
import timm
from sklearn.metrics import confusion_matrix
import pandas as pd

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

print("Python:", sys.version)
print("OS:", platform.platform())
print("PyTorch:", torch.__version__)
print("Torchvision:", torchvision.__version__)
print("timm:", timm.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU count:", torch.cuda.device_count())
    print("GPU name[0]:", torch.cuda.get_device_name(0))
    torch.set_float32_matmul_precision('high')


  from .autonotebook import tqdm as notebook_tqdm


Python: 3.12.10 (tags/v3.12.10:0cc8128, Apr  8 2025, 12:21:36) [MSC v.1943 64 bit (AMD64)]
OS: Windows-11-10.0.26100-SP0
PyTorch: 2.5.1+cu121
Torchvision: 0.20.1+cu121
timm: 1.0.22
CUDA available: True
GPU count: 1
GPU name[0]: NVIDIA RTX 2000 Ada Generation Laptop GPU


## 2) Utilities: Seeding, Metrics, ECE, Temperature Scaling

In [3]:

# Normalization stats
CIFAR_MEAN = [0.5071, 0.4867, 0.4408]
CIFAR_STD  = [0.2675, 0.2565, 0.2761]
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD  = [0.229, 0.224, 0.225]

def seed_everything(seed: int = 42, deterministic: bool = False):
    random.seed(seed); np.random.seed(seed)
    torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
    if deterministic:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    else:
        torch.backends.cudnn.benchmark = True

def ensure_dir(path: str): os.makedirs(path, exist_ok=True)

def topk_accuracy(logits: torch.Tensor, target: torch.Tensor, ks=(1,5)) -> List[float]:
    maxk = max(ks)
    with torch.no_grad():
        _, pred = logits.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in ks:
            correct_k = correct[:k].reshape(-1).float().sum(0)
            res.append((correct_k / target.size(0)).item())
        return res

def one_hot(targets: torch.Tensor, num_classes: int, smoothing: float = 0.0) -> torch.Tensor:
    with torch.no_grad():
        y = torch.zeros(targets.size(0), num_classes, device=targets.device)
        y.scatter_(1, targets.view(-1,1), 1.0)
        if smoothing > 0.0:
            y = y * (1.0 - smoothing) + smoothing / num_classes
    return y

def soft_cross_entropy(logits: torch.Tensor, soft_targets: torch.Tensor) -> torch.Tensor:
    log_probs = F.log_softmax(logits, dim=1)
    return -(soft_targets * log_probs).sum(dim=1).mean()

def compute_ece(confidences: np.ndarray, correctness: np.ndarray, n_bins: int = 15) -> Tuple[float, Dict]:
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    stats = {"bin_left": [], "bin_right": [], "bin_acc": [], "bin_conf": [], "bin_count": []}
    N = len(confidences)
    for i in range(n_bins):
        l, r = bins[i], bins[i+1]
        idx = (confidences > l) & (confidences <= r) if i > 0 else (confidences >= l) & (confidences <= r)
        if idx.sum() > 0:
            acc = correctness[idx].mean(); conf = confidences[idx].mean()
            ece += (idx.sum() / N) * abs(acc - conf)
            stats["bin_left"].append(float(l)); stats["bin_right"].append(float(r))
            stats["bin_acc"].append(float(acc)); stats["bin_conf"].append(float(conf))
            stats["bin_count"].append(int(idx.sum()))
        else:
            stats["bin_left"].append(float(l)); stats["bin_right"].append(float(r))
            stats["bin_acc"].append(0.0); stats["bin_conf"].append(0.0); stats["bin_count"].append(0)
    return float(ece), stats

@torch.no_grad()
def collect_logits(model: nn.Module, dl: DataLoader, device, use_channels_last: bool = True) -> Tuple[torch.Tensor, torch.Tensor]:
    model.eval()
    logits_all, targets_all = [], []
    for images, targets in dl:
        images = images.to(device, non_blocking=True)
        if use_channels_last and torch.cuda.is_available():
            images = images.to(memory_format=torch.channels_last)
        targets = targets.to(device, non_blocking=True)
        logits = model(images)
        logits_all.append(logits.detach().cpu())
        targets_all.append(targets.detach().cpu())
    return torch.cat(logits_all,0), torch.cat(targets_all,0)

def fit_temperature(logits: torch.Tensor, labels: torch.Tensor, max_iter: int = 200) -> float:
    T = torch.ones(1, requires_grad=True)
    optimizer = torch.optim.LBFGS([T], lr=0.5, max_iter=50)
    nll_criterion = nn.CrossEntropyLoss()
    def _eval():
        optimizer.zero_grad()
        loss = nll_criterion(logits / T.clamp_min(1e-6), labels)
        loss.backward()
        return loss
    for _ in range(max_iter):
        optimizer.step(_eval)
    return float(T.detach().clamp_min(1e-6).item())


## 3) WideResNet‑28×10 (reference implementation)

In [4]:

class BasicBlock(nn.Module):
    def __init__(self, in_planes, out_planes, stride, drop_rate=0.0):
        super().__init__()
        self.equalInOut = (in_planes == out_planes)
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_planes)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_planes, out_planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.drop_rate = drop_rate
        self.shortcut = (not self.equalInOut) and nn.Conv2d(in_planes, out_planes, kernel_size=1,
                                                            stride=stride, padding=0, bias=False) or None
    def forward(self, x):
        out = self.relu1(self.bn1(x))
        out = self.conv1(out)
        out = self.relu2(self.bn2(out))
        if self.drop_rate > 0.0:
            out = F.dropout(out, p=self.drop_rate, training=self.training)
        out = self.conv2(out)
        return out + (x if self.equalInOut else self.shortcut(x))

class NetworkBlock(nn.Module):
    def __init__(self, nb_layers, in_planes, out_planes, block, stride, drop_rate):
        super().__init__()
        layers = []
        for i in range(nb_layers):
            layers.append(block(i == 0 and in_planes or out_planes, out_planes, i == 0 and stride or 1, drop_rate))
        self.layer = nn.Sequential(*layers)
    def forward(self, x): return self.layer(x)

class WideResNet(nn.Module):
    def __init__(self, depth=28, widen_factor=10, num_classes=100, drop_rate=0.3):
        super().__init__()
        assert ((depth - 4) % 6 == 0), "Depth should be 6n+4"
        n = (depth - 4) // 6; k = widen_factor
        nStages = [16, 16*k, 32*k, 64*k]
        self.conv1 = nn.Conv2d(3, nStages[0], kernel_size=3, stride=1, padding=1, bias=False)
        self.block1 = NetworkBlock(n, nStages[0], nStages[1], BasicBlock, 1, drop_rate)
        self.block2 = NetworkBlock(n, nStages[1], nStages[2], BasicBlock, 2, drop_rate)
        self.block3 = NetworkBlock(n, nStages[2], nStages[3], BasicBlock, 2, drop_rate)
        self.bn = nn.BatchNorm2d(nStages[3]); self.relu = nn.ReLU(inplace=True)
        self.pool = nn.AdaptiveAvgPool2d((1,1)); self.fc = nn.Linear(nStages[3], num_classes)
        for m in self.modules():
            if isinstance(m, nn.Conv2d): nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
            elif isinstance(m, nn.BatchNorm2d): nn.init.constant_(m.weight, 1); nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear): nn.init.xavier_normal_(m.weight); nn.init.constant_(m.bias, 0)
    def forward(self, x):
        out = self.conv1(x); out = self.block1(out); out = self.block2(out); out = self.block3(out)
        out = self.relu(self.bn(out)); out = self.pool(out).flatten(1)
        return self.fc(out)


## 4) Data Pipeline: CIFAR‑100 with per‑model transforms

In [5]:

def build_transforms_32(train: bool = True, use_randaugment: bool = True, random_erasing: bool = True):
    aug = []
    if train:
        aug += [transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip()]
        if use_randaugment and hasattr(transforms, "RandAugment"):
            aug += [transforms.RandAugment(num_ops=2, magnitude=9)]
    aug += [transforms.ToTensor(), transforms.Normalize(CIFAR_MEAN, CIFAR_STD)]
    if train and random_erasing:
        aug += [transforms.RandomErasing(p=0.25, scale=(0.02, 0.2))]
    return transforms.Compose(aug)

def build_transforms_224(img_size: int = 224, train: bool = True, use_randaugment: bool = True):
    if train:
        ops = [transforms.RandomResizedCrop(img_size, scale=(0.5, 1.0), ratio=(3/4, 4/3)),
               transforms.RandomHorizontalFlip()]
        if use_randaugment and hasattr(transforms, "RandAugment"):
            ops += [transforms.RandAugment(num_ops=2, magnitude=9)]
    else:
        ops = [transforms.Resize(img_size), transforms.CenterCrop(img_size)]
    ops += [transforms.ToTensor(), transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)]
    return transforms.Compose(ops)

def get_dataloaders_for_model(model_name: str, img_size: int, batch_size: int,
                              data_root: str, val_split: float = 0.1, num_workers: int = 0, seed: int = 42):
    small_32 = {"wrn28x10", "wide_resnet_28x10"}
    if model_name in small_32:
        train_tf = build_transforms_32(train=True,  use_randaugment=True, random_erasing=True)
        test_tf  = build_transforms_32(train=False, use_randaugment=False, random_erasing=False)
    else:
        train_tf = build_transforms_224(img_size, train=True,  use_randaugment=True)
        test_tf  = build_transforms_224(img_size, train=False, use_randaugment=False)

    full_train = datasets.CIFAR100(root=data_root, train=True, download=True, transform=train_tf)
    test_set   = datasets.CIFAR100(root=data_root, train=False, download=True, transform=test_tf)
    val_len = int(len(full_train) * val_split); train_len = len(full_train) - val_len
    g = torch.Generator().manual_seed(seed)
    train_set, val_set = random_split(full_train, [train_len, val_len], generator=g)

    persistent = False
    dl_train = DataLoader(train_set, batch_size=batch_size, shuffle=True,
                          num_workers=num_workers, pin_memory=True, drop_last=True,
                          persistent_workers=persistent)
    dl_val   = DataLoader(val_set, batch_size=batch_size, shuffle=False,
                          num_workers=num_workers, pin_memory=True,
                          persistent_workers=persistent)
    dl_test  = DataLoader(test_set, batch_size=batch_size, shuffle=False,
                          num_workers=num_workers, pin_memory=True,
                          persistent_workers=persistent)
    class_names = getattr(test_set, 'classes', [str(i) for i in range(100)])
    return dl_train, dl_val, dl_test, class_names


## 5) Model Factory (torchvision + timm, plus ViT‑Hybrid)

In [6]:

def create_model(name: str, num_classes: int, pretrained: bool = False) -> Tuple[nn.Module, int]:
    name = name.lower()
    if name in ["resnet18", "resnet34", "resnet50"]:
        import torchvision.models as tvm
        if name == "resnet18":
            model = tvm.resnet18(weights=tvm.ResNet18_Weights.IMAGENET1K_V1 if pretrained else None)
        elif name == "resnet34":
            model = tvm.resnet34(weights=tvm.ResNet34_Weights.IMAGENET1K_V1 if pretrained else None)
        else:
            model = tvm.resnet50(weights=tvm.ResNet50_Weights.IMAGENET1K_V1 if pretrained else None)
        model.fc = nn.Linear(model.fc.in_features, num_classes)
        return model, 224

    if name in ["wrn28x10", "wide_resnet_28x10"]:
        model = WideResNet(depth=28, widen_factor=10, num_classes=num_classes, drop_rate=0.3)
        return model, 32

    if name in ["convnext_tiny", "convnext_t"]:
        model = timm.create_model("convnext_tiny", pretrained=pretrained, num_classes=num_classes)
        return model, 224

    if name in ["vit_tiny", "vit_tiny_patch16_224", "deit_tiny"]:
        model_name = "deit_tiny_patch16_224" if name in ["deit_tiny"] else "vit_tiny_patch16_224"
        model = timm.create_model(model_name, pretrained=pretrained, num_classes=num_classes)
        return model, 224

    if name in ["vit_hybrid", "vit_small_r26_s32_224"]:
        model = timm.create_model("vit_small_r26_s32_224", pretrained=pretrained, num_classes=num_classes)
        return model, 224

    raise ValueError(f"Unknown model name: {name}")


In [7]:

# === Patch: wrap original create_model to inject classifier head Dropout across backbones ===
try:
    _original_create_model
except NameError:
    _original_create_model = create_model  # keep reference to the original

import torch.nn as nn

def _wrap_linear_with_dropout(module, p, num_classes):
    if isinstance(module, nn.Sequential):
        # if a Dropout already present, keep as is
        if any(isinstance(m, nn.Dropout) for m in module.modules()):
            return module
        # if ends with Linear, wrap last Linear
        if isinstance(module[-1], nn.Linear):
            in_feats = module[-1].in_features
            return nn.Sequential(*list(module[:-1]), nn.Dropout(p), nn.Linear(in_feats, num_classes))
        return nn.Sequential(nn.Dropout(p), *list(module))

    if isinstance(module, nn.Linear):
        in_feats = module.in_features
        return nn.Sequential(nn.Dropout(p), nn.Linear(in_feats, num_classes))

    return module

def create_model(name: str, num_classes: int, pretrained: bool = False):
    model, img_size = _original_create_model(name, num_classes, pretrained)
    key = name.lower()
    cfg_local = MODEL_CFG.get(key, {}) if 'MODEL_CFG' in globals() else {}
    head_p = float(cfg_local.get('head_dropout', 0.2))

    # Torchvision ResNet family: model.fc
    if hasattr(model, "fc"):
        try:
            model.fc = _wrap_linear_with_dropout(model.fc, head_p, num_classes)
        except Exception:
            pass

    # ConvNeXt variants: classifier
    if hasattr(model, "classifier"):
        try:
            model.classifier = _wrap_linear_with_dropout(model.classifier, head_p, num_classes)
        except Exception:
            pass

    # timm/torchvision heads
    if hasattr(model, "head"):
        try:
            model.head = _wrap_linear_with_dropout(model.head, head_p, num_classes)
        except Exception:
            pass

    # torchvision ViT: 'heads' (Sequential([...], Linear))
    if hasattr(model, "heads"):
        try:
            if isinstance(model.heads, nn.Sequential) and isinstance(model.heads[-1], nn.Linear):
                in_feats = model.heads[-1].in_features
                has_dropout = any(isinstance(m, nn.Dropout) for m in model.heads.modules())
                if not has_dropout:
                    new_seq = []
                    for m in model.heads[:-1]:
                        new_seq.append(m)
                    new_seq.append(nn.Dropout(head_p))
                    new_seq.append(nn.Linear(in_feats, num_classes))
                    model.heads = nn.Sequential(*new_seq)
        except Exception:
            pass

    return model, img_size

print("[Patch] create_model() wrapped to inject head Dropout (default p=0.2; override via MODEL_CFG['head_dropout']).")


[Patch] create_model() wrapped to inject head Dropout (default p=0.2; override via MODEL_CFG['head_dropout']).


## 6) SOTA Add‑ons: Mixup/CutMix + EMA

In [8]:

def rand_bbox(W: int, H: int, lam: float):
    cut_rat = math.sqrt(1. - lam)
    cut_w, cut_h = int(W * cut_rat), int(H * cut_rat)
    cx, cy = np.random.randint(W), np.random.randint(H)
    x1 = np.clip(cx - cut_w // 2, 0, W); y1 = np.clip(cy - cut_h // 2, 0, H)
    x2 = np.clip(cx + cut_w // 2, 0, W); y2 = np.clip(cy + cut_h // 2, 0, H)
    return x1, y1, x2, y2

def apply_mixup_cutmix(images: torch.Tensor, targets: torch.Tensor, num_classes: int,
                       mixup_alpha: float = 0.2, cutmix_alpha: float = 1.0,
                       prob: float = 1.0, switch_prob: float = 0.5) -> Tuple[torch.Tensor, torch.Tensor]:
    if np.random.rand() > prob:
        return images, one_hot(targets, num_classes, smoothing=0.0)

    use_cutmix = np.random.rand() < switch_prob
    lam = np.random.beta(cutmix_alpha, cutmix_alpha) if use_cutmix else np.random.beta(mixup_alpha, mixup_alpha)
    batch_size = images.size(0)
    index = torch.randperm(batch_size, device=images.device)

    if use_cutmix:
        _, H, W = images.size(1), images.size(2), images.size(3)
        x1, y1, x2, y2 = rand_bbox(W, H, lam)
        images[:, :, y1:y2, x1:x2] = images[index, :, y1:y2, x1:x2]
        lam = 1 - ((x2 - x1) * (y2 - y1) / (W * H + 1e-6))

    mixed_targets = lam * one_hot(targets, num_classes) + (1 - lam) * one_hot(targets[index], num_classes)
    if not use_cutmix:
        images = lam * images + (1 - lam) * images[index]
    return images, mixed_targets

class ModelEMA:
    def __init__(self, model: nn.Module, decay: float = 0.9999, device: Optional[torch.device] = None):
        self.decay = decay
        self.device = device
        self.shadow = {}
        self.backup = {}
        for name, param in model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.data.clone().detach()
                if device is not None:
                    self.shadow[name] = self.shadow[name].to(device)

    def update(self, model: nn.Module):
        with torch.no_grad():
            for name, param in model.named_parameters():
                if param.requires_grad:
                    self.shadow[name].mul_(self.decay).add_(param.data, alpha=1.0 - self.decay)

    def apply_shadow(self, model: nn.Module):
        self.backup = {}
        for name, param in model.named_parameters():
            if param.requires_grad:
                self.backup[name] = param.data.clone()
                param.data.copy_(self.shadow[name].to(param.data.device))

    def restore(self, model: nn.Module):
        for name, param in model.named_parameters():
            if param.requires_grad and name in self.backup:
                param.data.copy_(self.backup[name])
        self.backup = {}


## 7) Train/Eval: AMP + channels‑last + accumulation + cosine LR + EMA/Mix

In [9]:

def train_one_epoch(model, dl, optimizer, scheduler, scaler, device, criterion,
                    max_grad_norm=None, accum_steps: int = 1, use_channels_last: bool = True,
                    use_mix: bool = False, num_classes: int = 100, mix_params: Dict = None, ema_obj: Optional[ModelEMA] = None):
    model.train()
    running_loss = 0.0; correct1 = 0; total = 0
    optimizer.zero_grad(set_to_none=True)
    mix_params = mix_params or {}
    for step, (images, targets) in enumerate(dl, start=1):
        images = images.to(device, non_blocking=True)
        if use_channels_last and torch.cuda.is_available():
            images = images.to(memory_format=torch.channels_last)
        targets = targets.to(device, non_blocking=True)

        soft_targets = None
        if use_mix:
            images, soft_targets = apply_mixup_cutmix(
                images, targets, num_classes,
                mixup_alpha=mix_params.get('mixup_alpha', 0.2),
                cutmix_alpha=mix_params.get('cutmix_alpha', 1.0),
                prob=mix_params.get('prob', 1.0),
                switch_prob=mix_params.get('switch_prob', 0.5)
            )

        with torch.amp.autocast('cuda', enabled=torch.cuda.is_available()):
            logits = model(images)
            if soft_targets is not None:
                loss = soft_cross_entropy(logits, soft_targets) / accum_steps
            else:
                loss = criterion(logits, targets) / accum_steps

        scaler.scale(loss).backward()

        if step % accum_steps == 0:
            if max_grad_norm is not None:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
            if scheduler is not None:
                scheduler.step()
            if ema_obj is not None:
                ema_obj.update(model)

        running_loss += loss.item() * images.size(0) * accum_steps
        acc1, = topk_accuracy(logits, targets, ks=(1,))
        correct1 += int(acc1 * images.size(0))
        total += images.size(0)
    return running_loss / total, correct1 / total

@torch.no_grad()
def evaluate(model, dl, device, criterion, use_channels_last: bool = True,
             ema_obj: Optional[ModelEMA] = None, use_ema_eval: bool = False):
    # Use EMA weights if requested
    if ema_obj is not None and use_ema_eval:
        ema_obj.apply_shadow(model)

    model.eval()
    total = 0
    running_loss = 0.0
    correct1 = 0
    correct5 = 0
    all_probs, all_preds, all_targets, all_logits = [], [], [], []

    ce = torch.nn.CrossEntropyLoss()  # plain CE for evaluation

    for images, targets in dl:
        images = images.to(device, non_blocking=True)
        if use_channels_last and torch.cuda.is_available():
            images = images.to(memory_format=torch.channels_last)
        targets = targets.to(device, non_blocking=True).long()

        with torch.amp.autocast('cuda', enabled=False):
            logits = model(images).float()
            loss = ce(logits, targets)

        running_loss += float(loss.item()) * images.size(0)

        acc1, acc5 = topk_accuracy(logits, targets, ks=(1, 5))
        correct1 += int(acc1 * images.size(0))
        correct5 += int(acc5 * images.size(0))

        probs = F.softmax(logits, dim=1)
        conf, pred = probs.max(dim=1)
        all_probs.append(conf.detach().cpu().numpy())
        all_preds.append(pred.detach().cpu().numpy())
        all_targets.append(targets.detach().cpu().numpy())
        all_logits.append(logits.detach().cpu())

        total += images.size(0)

    if ema_obj is not None and use_ema_eval:
        ema_obj.restore(model)

    avg_loss = running_loss / max(1, total)
    top1 = correct1 / max(1, total)
    top5 = correct5 / max(1, total)

    all_probs = np.concatenate(all_probs, axis=0) if len(all_probs) else np.array([])
    all_preds = np.concatenate(all_preds, axis=0) if len(all_preds) else np.array([])
    all_targets = np.concatenate(all_targets, axis=0) if len(all_targets) else np.array([])
    all_logits = torch.cat(all_logits, 0) if len(all_logits) else torch.empty(0)

    correctness = (all_preds == all_targets).astype(np.float32) if len(all_preds) else np.array([])
    return avg_loss, top1, top5, all_probs, all_preds, all_targets, correctness, all_logits


## 8) Plotting Helpers

In [10]:

def plot_training_curves(history: Dict[str, List[float]], out_path: str, title: str):
    plt.figure(figsize=(7,5))
    plt.plot(history["train_acc"], label="train_acc")
    plt.plot(history["val_acc"], label="val_acc")
    plt.xlabel("Epoch"); plt.ylabel("Accuracy"); plt.title(f"{title} - Accuracy")
    plt.legend(); plt.grid(True, alpha=0.3)
    plt.tight_layout(); plt.savefig(os.path.join(out_path, "training_accuracy.png"), dpi=150); plt.close()

    plt.figure(figsize=(7,5))
    plt.plot(history["train_loss"], label="train_loss")
    plt.plot(history["val_loss"], label="val_loss")
    plt.xlabel("Epoch"); plt.ylabel("Loss"); plt.title(f"{title} - Loss")
    plt.legend(); plt.grid(True, alpha=0.3)
    plt.tight_layout(); plt.savefig(os.path.join(out_path, "training_loss.png"), dpi=150); plt.close()

def plot_confusion_matrix(cm: np.ndarray, out_path: str, title: str):
    plt.figure(figsize=(9,8))
    im = plt.imshow(cm, interpolation='nearest', aspect='auto')
    plt.title(title); plt.xlabel("Predicted"); plt.ylabel("True")
    plt.colorbar(im, fraction=0.046, pad=0.04)
    plt.tight_layout(); plt.savefig(os.path.join(out_path, "confusion_matrix.png"), dpi=160); plt.close()

def plot_per_class_accuracy(cm: np.ndarray, out_path: str, title: str, class_names: List[str], topn: int = 25):
    per_class = cm.diagonal() / cm.sum(axis=1).clip(min=1)
    idx_sorted = np.argsort(per_class); worst_idx = idx_sorted[:topn]
    labels = [class_names[i] if i < len(class_names) else str(i) for i in worst_idx]
    plt.figure(figsize=(10,5))
    plt.bar(np.arange(topn), per_class[worst_idx])
    plt.ylim(0, 1.0); plt.xticks(np.arange(topn), labels, rotation=90)
    plt.ylabel("Per-class accuracy"); plt.title(f"{title} - {topn} Lowest Classes")
    plt.tight_layout(); plt.savefig(os.path.join(out_path, "per_class_accuracy_worst.png"), dpi=150); plt.close()

def plot_reliability(bin_stats: Dict, out_path: str, title: str, suffix: str):
    left = np.array(bin_stats["bin_left"]); right = np.array(bin_stats["bin_right"])
    acc = np.array(bin_stats["bin_acc"]); centers = (left + right) / 2.0
    width = (right - left) * 0.9
    plt.figure(figsize=(6.5,6))
    plt.bar(centers, acc, align='center', width=width, label="Accuracy per bin")
    plt.plot([0,1],[0,1], linestyle='--', label="Perfect calibration")
    plt.xlabel("Confidence"); plt.ylabel("Accuracy"); plt.title(f"{title} - Reliability Diagram ({suffix})")
    plt.legend(); plt.grid(True, alpha=0.3)
    plt.tight_layout(); plt.savefig(os.path.join(out_path, f"reliability_diagram_{suffix}.png"), dpi=160); plt.close()


## 9) Configuration & Logging

In [11]:

DATA_ROOT = './data'
VAL_SPLIT = 0.1
NUM_WORKERS = 2 if os.name == 'nt' else 4
SEED = 42
DETERMINISTIC = False
LABEL_SMOOTHING = 0.1
USE_CHANNELS_LAST = True
REQUIRE_GPU = True

LOG_TENSORBOARD = True
LOG_WANDB = False
WANDB_PROJECT = "cifar100-capstone"
WANDB_ENTITY = None

EARLY_STOP_DEFAULT = {'enabled': True, 'patience': 12, 'min_delta': 5e-4, 'min_epochs': 15, 'target_top1': None}

MODEL_CFG = {
    'resnet18': {
        'pretrained': True, 'epochs': 100, 'batch': 32, 'accum': 2,
        'opt': 'sgd', 'lr': 0.05, 'wd': 1e-4, 'max_grad_norm': None,
        'mix': {'enabled': True, 'prob': 0.8, 'mixup_alpha': 0.2, 'cutmix_alpha': 1.0, 'switch_prob': 0.5},
        'ema': {'enabled': True, 'decay': 0.9999, 'eval': True},
        'early_stop': {'patience': 15, 'min_epochs': 20, 'target_top1': 0.72}
    },
    'wrn28x10': {
        'pretrained': False, 'epochs': 120, 'batch': 128, 'accum': 1,
        'opt': 'sgd', 'lr': 0.1, 'wd': 5e-4, 'max_grad_norm': None, 'drop_rate': 0.3,
        'mix': {'enabled': True, 'prob': 0.8, 'mixup_alpha': 0.2, 'cutmix_alpha': 1.0, 'switch_prob': 0.5},
        'ema': {'enabled': True, 'decay': 0.9999, 'eval': True},
        'early_stop': {'patience': 18, 'min_epochs': 25, 'target_top1': 0.78}
    },
    'convnext_tiny': {
        'pretrained': True, 'epochs': 80, 'batch': 32, 'accum': 2,
        'opt': 'adamw', 'lr': 2e-4, 'wd': 0.05, 'max_grad_norm': 1.0,
        'mix': {'enabled': True, 'prob': 0.8, 'mixup_alpha': 0.2, 'cutmix_alpha': 1.0, 'switch_prob': 0.5},
        'ema': {'enabled': True, 'decay': 0.9999, 'eval': True},
        'early_stop': {'patience': 12, 'min_epochs': 15, 'target_top1': 0.78}
    },
    'vit_tiny': {
        'pretrained': True, 'epochs': 100, 'batch': 32, 'accum': 2,
        'opt': 'adamw', 'lr': 5e-4, 'wd': 0.05, 'max_grad_norm': 1.0,
        'mix': {'enabled': True, 'prob': 0.8, 'mixup_alpha': 0.2, 'cutmix_alpha': 1.0, 'switch_prob': 0.5},
        'ema': {'enabled': True, 'decay': 0.9999, 'eval': True},
        'early_stop': {'patience': 15, 'min_epochs': 20, 'target_top1': 0.75}
    },
    'vit_hybrid': {
        'pretrained': True, 'epochs': 100, 'batch': 32, 'accum': 2,
        'opt': 'adamw', 'lr': 5e-4, 'wd': 0.05, 'max_grad_norm': 1.0,
        'mix': {'enabled': True, 'prob': 0.8, 'mixup_alpha': 0.2, 'cutmix_alpha': 1.0, 'switch_prob': 0.5},
        'ema': {'enabled': True, 'decay': 0.9999, 'eval': True},
        'early_stop': {'patience': 15, 'min_epochs': 20, 'target_top1': 0.78}
    },
}

OUTPUT_ROOT = "./outputs"

if REQUIRE_GPU:
    assert torch.cuda.is_available(), "No GPU detected. Set REQUIRE_GPU=False to allow CPU (very slow)."
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", DEVICE)
seed_everything(SEED, deterministic=DETERMINISTIC)


Using device: cuda


In [12]:

# === CONFIG OVERRIDES (added) ===
# Apply recommended updates without editing the original MODEL_CFG literal.

# Safer early-stop for all models: remove hard target gate.
for _name in ['resnet18', 'wrn28x10', 'convnext_tiny', 'vit_tiny', 'vit_hybrid']:
    if _name in MODEL_CFG:
        es = MODEL_CFG[_name].get('early_stop', {})
        es.update({'target_top1': None})
        MODEL_CFG[_name]['early_stop'] = es

# ResNet-18 tweaks
if 'resnet18' in MODEL_CFG:
    MODEL_CFG['resnet18'].update({
        'wd': 5e-4,
        'head_dropout': 0.1,
        'mix': {'enabled': True, 'prob': 0.6, 'mixup_alpha': 0.2, 'cutmix_alpha': 1.0, 'switch_prob': 0.5},
    })
    es = MODEL_CFG['resnet18'].get('early_stop', {})
    es.update({'patience': 15, 'min_epochs': 60})
    MODEL_CFG['resnet18']['early_stop'] = es

# WRN-28x10 tweaks
if 'wrn28x10' in MODEL_CFG:
    MODEL_CFG['wrn28x10'].update({
        'epochs': 160,
        'mix': {'enabled': True, 'prob': 0.5, 'mixup_alpha': 0.2, 'cutmix_alpha': 1.0, 'switch_prob': 0.5},
    })
    es = MODEL_CFG['wrn28x10'].get('early_stop', {})
    es.update({'patience': 20, 'min_epochs': 120})
    MODEL_CFG['wrn28x10']['early_stop'] = es

# ConvNeXt / ViT early-stop smoothing
for _name, _me in [('convnext_tiny', 50), ('vit_tiny', 60), ('vit_hybrid', 60)]:
    if _name in MODEL_CFG:
        es = MODEL_CFG[_name].get('early_stop', {})
        es.update({'patience': 15, 'min_epochs': _me})
        MODEL_CFG[_name]['early_stop'] = es

print("[Overrides] Applied config updates for resnet18, wrn28x10, convnext_tiny, vit_tiny, vit_hybrid.")


# === CONFIG OVERRIDES (final) ===
# Extend prior overrides: explicitly turn off EMA for evaluation to simplify early validation behavior.
for _name, _cfg in MODEL_CFG.items():
    if 'ema' in _cfg:
        _ema = _cfg['ema']
        _ema['eval'] = False
        _cfg['ema'] = _ema
print("[Overrides] Set EMA.eval=False for validation across all models (you can re-enable later).")


[Overrides] Applied config updates for resnet18, wrn28x10, convnext_tiny, vit_tiny, vit_hybrid.
[Overrides] Set EMA.eval=False for validation across all models (you can re-enable later).


## 10) Orchestrator

In [13]:

def train_and_evaluate_one_model(model_name: str):
    cfg = MODEL_CFG[model_name]
    pretrained = cfg.get('pretrained', False)
    model, IMG_SIZE = create_model(model_name, num_classes=100, pretrained=pretrained)
    if isinstance(model, WideResNet) and ('drop_rate' in cfg):
        model = WideResNet(depth=28, widen_factor=10, num_classes=100, drop_rate=cfg['drop_rate'])
    model = model.to(DEVICE)
    if USE_CHANNELS_LAST and torch.cuda.is_available():
        model = model.to(memory_format=torch.channels_last)
    if hasattr(model, "set_grad_checkpointing"):
        model.set_grad_checkpointing(True)

    dl_train, dl_val, dl_test, class_names = get_dataloaders_for_model(
        model_name, IMG_SIZE, cfg['batch'], data_root=DATA_ROOT, val_split=VAL_SPLIT,
        num_workers=NUM_WORKERS, seed=SEED
    )
    criterion = nn.CrossEntropyLoss(label_smoothing=(0.0 if cfg.get('mix',{}).get('enabled', False) else LABEL_SMOOTHING)).to(DEVICE)

    opt_name = cfg['opt']; lr = cfg['lr']; wd = cfg['wd']
    if opt_name == "sgd":
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=wd, nesterov=True)
    elif opt_name == "adamw":
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
    else:
        raise ValueError("Unsupported optimizer")

    EPOCHS = cfg['epochs']; ACCUM_STEPS = max(1, int(cfg['accum']))
    warmup_epochs = max(1, int(0.03 * EPOCHS))
    total_steps = math.ceil(EPOCHS * len(dl_train) / ACCUM_STEPS)
    warmup_steps = math.ceil(warmup_epochs * len(dl_train) / ACCUM_STEPS)
    def cosine_lr_lambda(step):
        if step < warmup_steps: return float(step) / float(max(1, warmup_steps))
        progress = float(step - warmup_steps) / float(max(1, total_steps - warmup_steps))
        return 0.5 * (1.0 + math.cos(math.pi * progress))
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=cosine_lr_lambda)

    early = {**EARLY_STOP_DEFAULT, **cfg.get('early_stop', {})}
    patience = int(early.get('patience', 10))
    min_delta = float(early.get('min_delta', 0.0))
    min_epochs = int(early.get('min_epochs', 0))
    target_top1 = early.get('target_top1', None)
    early_enabled = bool(early.get('enabled', True))

    ema = None
    ema_cfg = cfg.get('ema', {'enabled': False})
    if ema_cfg.get('enabled', False):
        ema = ModelEMA(model, decay=ema_cfg.get('decay', 0.9999), device=None)

    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    run_name = f"{model_name}_e{EPOCHS}_bs{cfg['batch']}_acc{ACCUM_STEPS}_{timestamp}"
    OUT_DIR = os.path.join(OUTPUT_ROOT, run_name)
    CKPT_DIR = os.path.join(OUT_DIR, "checkpoints")
    PLOTS_DIR = os.path.join(OUT_DIR, "plots")
    RESULTS_DIR = os.path.join(OUT_DIR, "results")
    for d in [OUT_DIR, CKPT_DIR, PLOTS_DIR, RESULTS_DIR]: ensure_dir(d)

    tb_writer = None
    if LOG_TENSORBOARD:
        try:
            from torch.utils.tensorboard import SummaryWriter
            tb_logdir = os.path.join(OUT_DIR, "tb"); ensure_dir(tb_logdir)
            tb_writer = SummaryWriter(log_dir=tb_logdir)
        except Exception as e:
            print("TensorBoard unavailable:", e); tb_writer = None

    wb = None
    if LOG_WANDB:
        try:
            import wandb
            wb = wandb.init(project=WANDB_PROJECT, entity=WANDB_ENTITY, name=run_name,
                            config={
                                "model": model_name, "epochs": EPOCHS, "batch": cfg['batch'], "accum": ACCUM_STEPS,
                                "optimizer": opt_name, "lr": lr, "weight_decay": wd,
                                "img_size": IMG_SIZE, "mix": cfg.get('mix', {}), "ema": ema_cfg,
                                "early_stop": early,
                            })
        except Exception as e:
            print("W&B unavailable or not logged in:", e); wb = None

    hist_csv_path = os.path.join(RESULTS_DIR, "history.csv")
    with open(hist_csv_path, "w") as f:
        f.write("epoch,train_loss,train_acc,val_loss,val_acc,lr\n")

    scaler = torch.amp.GradScaler('cuda', enabled=torch.cuda.is_available())
    history = {"train_loss": [], "val_loss": [], "train_acc": [], "val_acc": []}
    best_val = -1.0
    epochs_no_improve = 0
    BEST_PATH = os.path.join(CKPT_DIR, f"{model_name}_best.pt")

    print(f"\n==> [{model_name}] Training for {EPOCHS} epochs | opt={opt_name} lr={lr} wd={wd} | "
          f"img={IMG_SIZE} | bs={cfg['batch']} | accum={ACCUM_STEPS}")
    for epoch in range(1, EPOCHS + 1):
        t0 = time.time()
        train_loss, train_acc = train_one_epoch(
            model, dl_train, optimizer, scheduler, scaler, DEVICE, criterion,
            max_grad_norm=cfg.get('max_grad_norm', None), accum_steps=ACCUM_STEPS,
            use_channels_last=USE_CHANNELS_LAST,
            use_mix=cfg.get('mix',{}).get('enabled', False),
            num_classes=100, mix_params=cfg.get('mix', {}), ema_obj=ema
        )

        val_loss, val_top1, _, _, _, _, _, _ = evaluate(
            model, dl_val, DEVICE, criterion, use_channels_last=USE_CHANNELS_LAST,
            ema_obj=ema, use_ema_eval=ema_cfg.get('eval', True)
        )

        history["train_loss"].append(train_loss); history["val_loss"].append(val_loss)
        history["train_acc"].append(train_acc);   history["val_acc"].append(val_top1)

        lr_now = optimizer.param_groups[0]["lr"]
        with open(hist_csv_path, "a") as f:
            f.write(f"{epoch},{train_loss:.6f},{train_acc:.6f},{val_loss:.6f},{val_top1:.6f},{lr_now:.8f}\n")

        if tb_writer is not None:
            tb_writer.add_scalar("train/loss", train_loss, epoch)
            tb_writer.add_scalar("train/acc1", train_acc, epoch)
            tb_writer.add_scalar("val/loss", val_loss, epoch)
            tb_writer.add_scalar("val/acc1", val_top1, epoch)
            tb_writer.add_scalar("opt/lr", lr_now, epoch)

        if wb is not None:
            wb.log({"train/loss": train_loss, "train/acc1": train_acc, "val/loss": val_loss, "val/acc1": val_top1, "opt/lr": lr_now}, step=epoch)

        print(f"  Epoch {epoch:03d}/{EPOCHS} | train_loss={train_loss:.4f} acc={train_acc*100:.2f}% | "
              f"val_loss={val_loss:.4f} acc@1={val_top1*100:.2f}% | time={time.time()-t0:.1f}s")

        improved = val_top1 > (best_val + min_delta)
        if improved:
            best_val = val_top1
            epochs_no_improve = 0
            torch.save({'model': model.state_dict(), 'epoch': epoch, 'val_acc1': best_val}, BEST_PATH)
            if wb is not None:
                wb.log({"checkpoint/best_epoch": epoch, "checkpoint/best_val_acc1": best_val}, step=epoch)
        else:
            epochs_no_improve += 1

        if early_enabled and (target_top1 is not None) and (val_top1 >= float(target_top1)):
            print(f"  Early stop: target_top1 {float(target_top1):.3f} reached at epoch {epoch}.")
            break
        if early_enabled and (epoch >= min_epochs) and (epochs_no_improve >= patience):
            print(f"  Early stop: no improvement > {min_delta} for {patience} epochs (epoch {epoch}).")
            break

    try:
        ckpt = torch.load(BEST_PATH, map_location='cpu', weights_only=True)
    except TypeError:
        ckpt = torch.load(BEST_PATH, map_location='cpu')
    model.load_state_dict(ckpt['model'])

    use_ema_eval = ema_cfg.get('eval', True)
    val_loss, val_top1, val_top5, val_conf, val_pred, val_true, val_corr, val_logits = evaluate(
        model, dl_val, DEVICE, criterion, use_channels_last=USE_CHANNELS_LAST, ema_obj=ema, use_ema_eval=use_ema_eval
    )
    test_loss, test_top1, test_top5, test_conf, test_pred, test_true, test_corr, test_logits = evaluate(
        model, dl_test, DEVICE, criterion, use_channels_last=USE_CHANNELS_LAST, ema_obj=ema, use_ema_eval=use_ema_eval
    )

    cm = confusion_matrix(test_true, test_pred, labels=list(range(100)))

    T = fit_temperature(val_logits, torch.from_numpy(val_true))
    ece_raw, bin_stats_raw = compute_ece(test_conf, test_corr, n_bins=15)
    test_probs_scaled = F.softmax(test_logits / T, dim=1).numpy()
    test_conf_scaled = test_probs_scaled.max(axis=1)
    ece_temp, bin_stats_temp = compute_ece(test_conf_scaled, test_corr, n_bins=15)

    metrics = {
        'model': model_name,
        'epochs_run': len(history['val_acc']),
        'optimizer': opt_name, 'lr_final': float(optimizer.param_groups[0]['lr']), 'weight_decay': float(optimizer.param_groups[0]['weight_decay']),
        'batch_size': cfg['batch'], 'accum_steps': ACCUM_STEPS,
        'max_grad_norm': cfg.get('max_grad_norm', None),
        'label_smoothing': (0.0 if cfg.get('mix',{}).get('enabled', False) else float(LABEL_SMOOTHING)),
        'mixup_cutmix': cfg.get('mix', {}),
        'ema': ema_cfg,
        'val': {'loss': float(val_loss), 'top1': float(val_top1), 'top5': float(val_top5)},
        'test': {'loss': float(test_loss), 'top1': float(test_top1), 'top5': float(test_top5)},
        'ece_raw': float(ece_raw), 'ece_temp_scaled': float(ece_temp), 'temperature': float(T),
        'checkpoint': os.path.join(OUT_DIR, "checkpoints", f"{model_name}_best.pt"), 'run_dir': OUT_DIR
    }
    RESULTS_DIR = os.path.join(OUT_DIR, "results")
    with open(os.path.join(RESULTS_DIR, 'metrics.json'), 'w') as f: json.dump(metrics, f, indent=2)
    with open(os.path.join(RESULTS_DIR, 'reliability_bins_raw.json'), 'w') as f: json.dump(bin_stats_raw, f, indent=2)
    with open(os.path.join(RESULTS_DIR, 'reliability_bins_temp.json'), 'w') as f: json.dump(bin_stats_temp, f, indent=2)
    np = __import__('numpy')
    np.save(os.path.join(RESULTS_DIR, 'confusion_matrix.npy'), cm)

    PLOTS_DIR = os.path.join(OUT_DIR, "plots")
    plot_training_curves(history, PLOTS_DIR, title=model_name.upper())
    plot_confusion_matrix(cm, PLOTS_DIR, title=f"{model_name.upper()} - Confusion Matrix (Test)")
    plot_per_class_accuracy(cm, PLOTS_DIR, title=model_name.upper(), class_names=class_names, topn=25)
    plot_reliability(bin_stats_raw,  PLOTS_DIR, title=model_name.upper(), suffix="raw")
    plot_reliability(bin_stats_temp, PLOTS_DIR, title=model_name.upper(), suffix="temp_scaled")

    if tb_writer is not None:
        tb_writer.flush(); tb_writer.close()
    try:
        import wandb
        if wandb.run is not None:
            wandb.finish()
    except Exception:
        pass

    print(f"  -> Best checkpoint: {metrics['checkpoint']}")
    print(f"  -> Plots: {PLOTS_DIR}")
    print(f"  -> Results: {RESULTS_DIR}")
    return metrics


## 11) Train: ResNet‑18

In [14]:
resnet_metrics = train_and_evaluate_one_model('resnet18')
resnet_metrics

Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to ./data\cifar-100-python.tar.gz


100%|██████████| 169M/169M [01:23<00:00, 2.04MB/s]    


Extracting ./data\cifar-100-python.tar.gz to ./data
Files already downloaded and verified

==> [resnet18] Training for 100 epochs | opt=sgd lr=0.05 wd=0.0005 | img=224 | bs=32 | accum=2
  Epoch 001/100 | train_loss=3.1993 acc=22.82% | val_loss=2.0789 acc@1=44.14% | time=364.3s
  Epoch 002/100 | train_loss=2.7405 acc=31.05% | val_loss=2.2292 acc@1=43.52% | time=461.8s
  Epoch 003/100 | train_loss=2.7263 acc=31.39% | val_loss=2.1251 acc@1=44.18% | time=429.0s
  Epoch 004/100 | train_loss=2.5452 acc=34.42% | val_loss=1.9296 acc@1=48.96% | time=371.3s
  Epoch 005/100 | train_loss=2.3653 acc=37.37% | val_loss=1.7578 acc@1=53.24% | time=365.1s
  Epoch 006/100 | train_loss=2.3231 acc=38.98% | val_loss=1.7403 acc@1=54.12% | time=396.2s
  Epoch 007/100 | train_loss=2.3009 acc=38.86% | val_loss=1.8009 acc@1=52.20% | time=376.1s
  Epoch 008/100 | train_loss=2.2958 acc=40.77% | val_loss=1.7135 acc@1=54.56% | time=415.2s
  Epoch 009/100 | train_loss=2.2813 acc=40.39% | val_loss=1.7543 acc@1=53.74% 

{'model': 'resnet18',
 'epochs_run': 100,
 'optimizer': 'sgd',
 'lr_final': 0.0,
 'weight_decay': 0.0005,
 'batch_size': 32,
 'accum_steps': 2,
 'max_grad_norm': None,
 'label_smoothing': 0.0,
 'mixup_cutmix': {'enabled': True,
  'prob': 0.6,
  'mixup_alpha': 0.2,
  'cutmix_alpha': 1.0,
  'switch_prob': 0.5},
 'ema': {'enabled': True, 'decay': 0.9999, 'eval': False},
 'val': {'loss': 0.853527719783783, 'top1': 0.7734, 'top5': 0.9434},
 'test': {'loss': 0.7094504383087158, 'top1': 0.8057, 'top5': 0.9611},
 'ece_raw': 0.022068275982141495,
 'ece_temp_scaled': 0.028767734242975712,
 'temperature': 0.8810027241706848,
 'checkpoint': './outputs\\resnet18_e100_bs32_acc2_20251201-131328\\checkpoints\\resnet18_best.pt',
 'run_dir': './outputs\\resnet18_e100_bs32_acc2_20251201-131328'}

## 12) Train: WideResNet‑28×10

In [15]:
wrn_metrics = train_and_evaluate_one_model('wrn28x10')
wrn_metrics

Files already downloaded and verified
Files already downloaded and verified

==> [wrn28x10] Training for 160 epochs | opt=sgd lr=0.1 wd=0.0005 | img=32 | bs=128 | accum=1
  Epoch 001/160 | train_loss=4.4175 acc=3.67% | val_loss=4.1333 acc@1=7.14% | time=364.3s
  Epoch 002/160 | train_loss=4.0402 acc=7.73% | val_loss=3.7964 acc@1=11.38% | time=361.9s
  Epoch 003/160 | train_loss=3.7202 acc=11.97% | val_loss=3.3823 acc@1=17.32% | time=362.2s
  Epoch 004/160 | train_loss=3.4620 acc=16.03% | val_loss=3.1923 acc@1=21.86% | time=371.0s
  Epoch 005/160 | train_loss=3.1651 acc=21.35% | val_loss=3.4961 acc@1=20.02% | time=365.6s
  Epoch 006/160 | train_loss=3.0181 acc=24.65% | val_loss=2.8543 acc@1=28.20% | time=363.9s
  Epoch 007/160 | train_loss=2.8574 acc=25.85% | val_loss=2.5934 acc@1=33.62% | time=388.2s
  Epoch 008/160 | train_loss=2.7101 acc=29.85% | val_loss=2.5102 acc@1=35.74% | time=385.3s
  Epoch 009/160 | train_loss=2.6512 acc=31.62% | val_loss=2.4561 acc@1=36.12% | time=381.7s
  Ep

{'model': 'wrn28x10',
 'epochs_run': 160,
 'optimizer': 'sgd',
 'lr_final': 0.0,
 'weight_decay': 0.0005,
 'batch_size': 128,
 'accum_steps': 1,
 'max_grad_norm': None,
 'label_smoothing': 0.0,
 'mixup_cutmix': {'enabled': True,
  'prob': 0.5,
  'mixup_alpha': 0.2,
  'cutmix_alpha': 1.0,
  'switch_prob': 0.5},
 'ema': {'enabled': True, 'decay': 0.9999, 'eval': False},
 'val': {'loss': 0.761966065955162, 'top1': 0.798, 'top5': 0.9506},
 'test': {'loss': 0.6417224734306336, 'top1': 0.8271, 'top5': 0.9636},
 'ece_raw': 0.0201254317894578,
 'ece_temp_scaled': 0.0291562568038702,
 'temperature': 0.9658310413360596,
 'checkpoint': './outputs\\wrn28x10_e160_bs128_acc1_20251201-235123\\checkpoints\\wrn28x10_best.pt',
 'run_dir': './outputs\\wrn28x10_e160_bs128_acc1_20251201-235123'}

## 13) Train: ConvNeXt‑Tiny

In [16]:
convnext_metrics = train_and_evaluate_one_model('convnext_tiny')
convnext_metrics

Files already downloaded and verified
Files already downloaded and verified

==> [convnext_tiny] Training for 80 epochs | opt=adamw lr=0.0002 wd=0.05 | img=224 | bs=32 | accum=2




  Epoch 001/80 | train_loss=2.4948 acc=36.75% | val_loss=1.0260 acc@1=73.38% | time=573.1s
  Epoch 002/80 | train_loss=1.7750 acc=48.82% | val_loss=1.1076 acc@1=69.92% | time=550.4s
  Epoch 003/80 | train_loss=1.7487 acc=48.17% | val_loss=0.9991 acc@1=72.42% | time=577.9s
  Epoch 004/80 | train_loss=1.6000 acc=51.83% | val_loss=0.9590 acc@1=73.46% | time=584.8s
  Epoch 005/80 | train_loss=1.4756 acc=52.70% | val_loss=0.8885 acc@1=74.98% | time=685.6s
  Epoch 006/80 | train_loss=1.3632 acc=56.07% | val_loss=0.8800 acc@1=75.50% | time=682.3s
  Epoch 007/80 | train_loss=1.3126 acc=54.63% | val_loss=0.8405 acc@1=76.32% | time=659.5s
  Epoch 008/80 | train_loss=1.2640 acc=58.10% | val_loss=0.7986 acc@1=77.44% | time=614.9s
  Epoch 009/80 | train_loss=1.2256 acc=59.96% | val_loss=0.7603 acc@1=79.28% | time=716.0s
  Epoch 010/80 | train_loss=1.1485 acc=59.11% | val_loss=0.7808 acc@1=79.06% | time=479.7s
  Epoch 011/80 | train_loss=1.1240 acc=60.34% | val_loss=0.8376 acc@1=77.70% | time=397.3s

{'model': 'convnext_tiny',
 'epochs_run': 80,
 'optimizer': 'adamw',
 'lr_final': 0.0,
 'weight_decay': 0.05,
 'batch_size': 32,
 'accum_steps': 2,
 'max_grad_norm': 1.0,
 'label_smoothing': 0.0,
 'mixup_cutmix': {'enabled': True,
  'prob': 0.8,
  'mixup_alpha': 0.2,
  'cutmix_alpha': 1.0,
  'switch_prob': 0.5},
 'ema': {'enabled': True, 'decay': 0.9999, 'eval': False},
 'val': {'loss': 0.8627401709794998, 'top1': 0.8468, 'top5': 0.9638},
 'test': {'loss': 0.6566876657158136, 'top1': 0.8788, 'top5': 0.9755},
 'ece_raw': 0.08491486203968526,
 'ece_temp_scaled': 0.018031411035358904,
 'temperature': 1.5065102577209473,
 'checkpoint': './outputs\\convnext_tiny_e80_bs32_acc2_20251202-170955\\checkpoints\\convnext_tiny_best.pt',
 'run_dir': './outputs\\convnext_tiny_e80_bs32_acc2_20251202-170955'}

## 14) Train: ViT‑Tiny (DeiT‑Tiny)

In [17]:
vit_tiny_metrics = train_and_evaluate_one_model('vit_tiny')
vit_tiny_metrics

Files already downloaded and verified
Files already downloaded and verified

==> [vit_tiny] Training for 100 epochs | opt=adamw lr=0.0005 wd=0.05 | img=224 | bs=32 | accum=2
  Epoch 001/100 | train_loss=3.9940 acc=11.44% | val_loss=1.8894 acc@1=49.22% | time=106.4s
  Epoch 002/100 | train_loss=2.5405 acc=33.56% | val_loss=1.5944 acc@1=56.82% | time=106.9s
  Epoch 003/100 | train_loss=2.4715 acc=34.81% | val_loss=1.6897 acc@1=55.36% | time=106.2s
  Epoch 004/100 | train_loss=2.3813 acc=35.92% | val_loss=1.5153 acc@1=58.56% | time=107.3s
  Epoch 005/100 | train_loss=2.2971 acc=37.54% | val_loss=1.4419 acc@1=60.46% | time=106.9s
  Epoch 006/100 | train_loss=2.2002 acc=39.33% | val_loss=1.3748 acc@1=61.86% | time=107.2s
  Epoch 007/100 | train_loss=2.1940 acc=40.62% | val_loss=1.3898 acc@1=61.28% | time=107.7s
  Epoch 008/100 | train_loss=2.0829 acc=41.63% | val_loss=1.3359 acc@1=62.96% | time=106.7s
  Epoch 009/100 | train_loss=2.0265 acc=42.78% | val_loss=1.2797 acc@1=65.26% | time=107.3

{'model': 'vit_tiny',
 'epochs_run': 100,
 'optimizer': 'adamw',
 'lr_final': 0.0,
 'weight_decay': 0.05,
 'batch_size': 32,
 'accum_steps': 2,
 'max_grad_norm': 1.0,
 'label_smoothing': 0.0,
 'mixup_cutmix': {'enabled': True,
  'prob': 0.8,
  'mixup_alpha': 0.2,
  'cutmix_alpha': 1.0,
  'switch_prob': 0.5},
 'ema': {'enabled': True, 'decay': 0.9999, 'eval': False},
 'val': {'loss': 0.7983950404942036, 'top1': 0.8124, 'top5': 0.9526},
 'test': {'loss': 0.671459115076065, 'top1': 0.841, 'top5': 0.9637},
 'ece_raw': 0.057714962327480315,
 'ece_temp_scaled': 0.012789853978157045,
 'temperature': 1.1999342441558838,
 'checkpoint': './outputs\\vit_tiny_e100_bs32_acc2_20251203-023446\\checkpoints\\vit_tiny_best.pt',
 'run_dir': './outputs\\vit_tiny_e100_bs32_acc2_20251203-023446'}

## 15) Train: ViT‑Hybrid (ResNet‑26 + ViT Small)

In [None]:
vit_hybrid_metrics = train_and_evaluate_one_model('vit_hybrid')
vit_hybrid_metrics

Files already downloaded and verified
Files already downloaded and verified

==> [vit_hybrid] Training for 100 epochs | opt=adamw lr=0.0005 wd=0.05 | img=224 | bs=32 | accum=2
  Epoch 001/100 | train_loss=3.0226 acc=28.98% | val_loss=1.0108 acc@1=72.40% | time=383.1s
  Epoch 002/100 | train_loss=1.9644 acc=45.95% | val_loss=1.0977 acc@1=70.56% | time=378.8s
  Epoch 003/100 | train_loss=2.0293 acc=42.95% | val_loss=1.3751 acc@1=62.84% | time=378.4s
  Epoch 004/100 | train_loss=2.0624 acc=42.37% | val_loss=1.3398 acc@1=63.32% | time=378.1s
  Epoch 005/100 | train_loss=2.0233 acc=43.31% | val_loss=1.2118 acc@1=66.30% | time=377.7s
  Epoch 006/100 | train_loss=1.9054 acc=45.73% | val_loss=1.1450 acc@1=68.80% | time=379.3s
  Epoch 007/100 | train_loss=1.8399 acc=45.09% | val_loss=1.1074 acc@1=69.40% | time=378.3s
  Epoch 008/100 | train_loss=1.7718 acc=48.84% | val_loss=1.1348 acc@1=68.68% | time=379.4s
  Epoch 009/100 | train_loss=1.7474 acc=47.61% | val_loss=1.1220 acc@1=68.82% | time=378

## 16) Aggregate results across all runs (auto‑discover from ./outputs)

In [None]:

def discover_runs_and_build_df(outputs_root='./outputs'):
    rows = []
    for root, dirs, files in os.walk(outputs_root):
        if 'metrics.json' in files:
            path = os.path.join(root, 'metrics.json')
            try:
                with open(path, 'r') as f:
                    m = json.load(f)
                rows.append({
                    'model': m['model'], 'val_top1': m['val']['top1'], 'val_top5': m['val']['top5'],
                    'test_top1': m['test']['top1'], 'test_top5': m['test']['top5'],
                    'ece_raw': m.get('ece_raw', None), 'ece_temp': m.get('ece_temp_scaled', None),
                    'temperature': m.get('temperature', None),
                    'checkpoint': m['checkpoint'], 'run_dir': m['run_dir']
                })
            except Exception as e:
                print("Skip", path, e)
    if len(rows) == 0:
        print("No metrics.json found yet."); return None
    df = pd.DataFrame(rows).sort_values(['model','test_top1'], ascending=[True, False]).groupby('model', as_index=False).first()
    SUMMARY_DIR = os.path.join(outputs_root, 'summary'); ensure_dir(SUMMARY_DIR)
    df.to_csv(os.path.join(SUMMARY_DIR, 'aggregate_results.csv'), index=False)
    print("Aggregate saved to:", os.path.join(SUMMARY_DIR, 'aggregate_results.csv'))
    return df

df = discover_runs_and_build_df('./outputs')
df


## 17) Summary Plots

In [None]:

try:
    SUMMARY_DIR = os.path.join("./outputs", "summary"); ensure_dir(SUMMARY_DIR)
    csv_path = os.path.join(SUMMARY_DIR, "aggregate_results.csv")
    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)
        plt.figure(figsize=(7,5)); plt.bar(df["model"], df["test_top1"])
        plt.ylabel("Test Top-1 Accuracy"); plt.title("Model Comparison: Top-1")
        plt.grid(True, axis="y", alpha=0.3); plt.tight_layout()
        plt.savefig(os.path.join(SUMMARY_DIR, "summary_top1.png"), dpi=150); plt.close()

        plt.figure(figsize=(7,5)); plt.bar(df["model"], df["test_top5"])
        plt.ylabel("Test Top-5 Accuracy"); plt.title("Model Comparison: Top-5")
        plt.grid(True, axis="y", alpha=0.3); plt.tight_layout()
        plt.savefig(os.path.join(SUMMARY_DIR, "summary_top5.png"), dpi=150); plt.close()

        if "ece_raw" in df.columns and df["ece_raw"].notnull().any():
            plt.figure(figsize=(7,5)); plt.bar(df["model"], df["ece_raw"])
            plt.ylabel("ECE (raw)"); plt.title("Calibration (ECE raw)")
            plt.grid(True, axis="y", alpha=0.3); plt.tight_layout()
            plt.savefig(os.path.join(SUMMARY_DIR, "summary_ece_raw.png"), dpi=150); plt.close()

        if "ece_temp" in df.columns and df["ece_temp"].notnull().any():
            plt.figure(figsize=(7,5)); plt.bar(df["model"], df["ece_temp"])
            plt.ylabel("ECE (temp‑scaled)"); plt.title("Calibration (ECE after Temp Scaling)")
            plt.grid(True, axis="y", alpha=0.3); plt.tight_layout()
            plt.savefig(os.path.join(SUMMARY_DIR, "summary_ece_temp.png"), dpi=150); plt.close()

        print("Saved summary plots in:", SUMMARY_DIR)
    else:
        print("No aggregate_results.csv found; run the aggregation cell above first.")
except Exception as e:
    print("Summary plot error:", e)


## 18) One‑click ZIP export

In [None]:

zip_name = f"cifar100_capstone_outputs_{datetime.now().strftime('%Y%m%d-%H%M%S')}"
archive_path = shutil.make_archive(zip_name, "zip", "./outputs")
print("Created ZIP:", os.path.abspath(archive_path))



## 19) How to run & tips

- **GPU check**: The notebook asserts CUDA by default. Switch `REQUIRE_GPU=False` if you want CPU (very slow).
- **If you see CUDA OOM**: Lower `MODEL_CFG[model]['batch']` and increase `accum` accordingly.
- **Early stop targets** are preset (e.g., 0.78 for WRN/ConvNeXt). Adjust in `MODEL_CFG` or set to `None`.
- **TensorBoard**: logs in `./outputs/<run>/tb`. Launch: `tensorboard --logdir outputs`
- **W&B**: set `LOG_WANDB=True`, `pip install wandb`, `wandb login`, (optional) set `WANDB_ENTITY`.
- **Artifacts per model**: best checkpoint, plots, metrics.json, history.csv, confusion_matrix.npy, reliability bins.


In [None]:

# --- One-batch sanity check for validation (optional) ---
try:
    _m = model; _lv = dl_val
    _m.eval()
    _ce = torch.nn.CrossEntropyLoss()
    with torch.no_grad():
        x, y = next(iter(_lv))
        x = x.to(DEVICE); y = y.to(DEVICE).long()
        with torch.amp.autocast('cuda', enabled=False):
            logits = _m(x).float()
            loss = _ce(logits, y)
        acc1 = (logits.argmax(1) == y).float().mean().item()
    print(f"[Sanity] val batch loss={loss.item():.4f}  acc@1={acc1*100:.2f}%")
except Exception as e:
    print("[Sanity] Skipped:", e)
