In [None]:
!wget http://cs231n.stanford.edu/tiny-imagenet-200.zip
!unzip -q tiny-imagenet-200.zip


--2025-08-15 08:34:01--  http://cs231n.stanford.edu/tiny-imagenet-200.zip
Resolving cs231n.stanford.edu (cs231n.stanford.edu)... 171.64.64.64
Connecting to cs231n.stanford.edu (cs231n.stanford.edu)|171.64.64.64|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cs231n.stanford.edu/tiny-imagenet-200.zip [following]
--2025-08-15 08:34:01--  https://cs231n.stanford.edu/tiny-imagenet-200.zip
Connecting to cs231n.stanford.edu (cs231n.stanford.edu)|171.64.64.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 248100043 (237M) [application/zip]
Saving to: ‘tiny-imagenet-200.zip’


2025-08-15 08:34:07 (37.3 MB/s) - ‘tiny-imagenet-200.zip’ saved [248100043/248100043]



In [None]:
import os, math, random, time
from pathlib import Path
from dataclasses import dataclass
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms, utils

In [None]:
# scikit-learn is optional: HAS_SK records whether confusion_matrix could be
# imported so later cells can skip the confusion-matrix figures without it.
try:
    from sklearn.metrics import confusion_matrix
except Exception:
    HAS_SK = False
else:
    HAS_SK = True

### Set up configuration variables

In [None]:

# --- Reproducibility / hardware ---------------------------------------------
SEED = 42
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
AMP = True  # mixed precision; train_model only turns it on when the device is CUDA

# --- Data ---------------------------------------------------------------------
# The download cell unpacks the archive into the current working directory, so
# the training folder is resolved from there instead of hard-coding Colab's
# "/content" (on Colab, where the cwd is /content, the path is unchanged).
TRAIN_DIR = str(Path.cwd() / "tiny-imagenet-200" / "train")

NUM_CLASSES = 20     # number of classes sampled from the dataset
IMG_SIZE = 64        # images are resized to IMG_SIZE x IMG_SIZE
VAL_SPLIT = 0.15     # fraction of the selected samples held out for validation

# --- Optimisation -------------------------------------------------------------
BATCH_SIZE = 128
EPOCHS = 20
LR = 3e-4
WEIGHT_DECAY = 0.05
EARLY_STOP = 6       # epochs without validation-loss improvement before stopping

# --- Output folders -----------------------------------------------------------
OUT_DIR = Path("tri_compare_vit")
for _sub in ("figs", "grids", "curves"):
    (OUT_DIR / _sub).mkdir(parents=True, exist_ok=True)

# Seed every RNG the notebook draws from.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

### Data & selection of 20 classes

In [None]:
# Per-channel statistics handed to Normalize in both pipelines.
_NORM_MEAN = (0.4802, 0.4481, 0.3975)
_NORM_STD = (0.2770, 0.2691, 0.2821)
_TARGET_HW = (IMG_SIZE, IMG_SIZE)

# Training pipeline: resize, light augmentation (random horizontal flip and
# colour jitter), then tensor conversion and normalisation.
train_tfms = transforms.Compose(
    [
        transforms.Resize(_TARGET_HW),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(0.2, 0.2, 0.2, 0.1),
        transforms.ToTensor(),
        transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]
)

# Evaluation pipeline: same resize and normalisation, no augmentation.
eval_tfms = transforms.Compose(
    [
        transforms.Resize(_TARGET_HW),
        transforms.ToTensor(),
        transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]
)

In [None]:
# Index the whole training folder; no transform is attached here because
# WrappedSubset applies its own transform per split.
full_ds = datasets.ImageFolder(TRAIN_DIR)
all_classes = full_ds.classes
print(f"Found {len(all_classes)} classes; selecting {NUM_CLASSES}…")

# A dedicated RNG makes the class choice depend only on SEED.
rng = random.Random(SEED)
selected_classes = rng.sample(all_classes, NUM_CLASSES)
selected_classes.sort()
print("Selected classes:", selected_classes)

# Class name -> contiguous label 0..NUM_CLASSES-1, following the sorted order.
sel_to_new = dict(zip(selected_classes, range(len(selected_classes))))

# Positions in full_ds.samples that belong to one of the selected classes.
sel_indices = []
for _pos, (_, _old_label) in enumerate(full_ds.samples):
    if full_ds.classes[_old_label] in sel_to_new:
        sel_indices.append(_pos)

In [None]:
class WrappedSubset(torch.utils.data.Dataset):
    """Subset view of an ImageFolder-style dataset with remapped labels.

    Args:
        base: object exposing ``samples`` (sequence of ``(path, label)``),
            ``classes`` (label -> class name) and ``loader`` (path -> image).
        indices: positions into ``base.samples`` that make up this subset.
        transform: callable applied to every loaded image.
        sel_to_new: mapping from class name to the new label.
    """

    def __init__(self, base, indices, transform, sel_to_new):
        self.base, self.indices = base, indices
        self.transform, self.sel_to_new = transform, sel_to_new

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, i):
        """Return ``(transformed image, remapped label)`` for subset position ``i``."""
        sample_path, old_label = self.base.samples[self.indices[i]]
        image = self.transform(self.base.loader(sample_path))
        # Go through the class name because sel_to_new is keyed by name.
        class_name = self.base.classes[old_label]
        return image, self.sel_to_new[class_name]

In [None]:

# Hold out VAL_SPLIT of the selected samples; the seeded generator keeps the
# split the same on every run.
N = len(sel_indices)
n_val = int(round(VAL_SPLIT * N))
n_train = N - n_val
_split_gen = torch.Generator().manual_seed(SEED)
train_ids, val_ids = random_split(sel_indices, [n_train, n_val], generator=_split_gen)

# Both splits wrap the same base dataset; only the transform differs.
train_ds = WrappedSubset(full_ds, list(train_ids), transform=train_tfms, sel_to_new=sel_to_new)
val_ds = WrappedSubset(full_ds, list(val_ids), transform=eval_tfms, sel_to_new=sel_to_new)

num_workers = min(8, os.cpu_count() or 2)
_loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=num_workers, pin_memory=True)
train_loader = DataLoader(train_ds, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_ds, shuffle=False, **_loader_kwargs)

### Save example image grids for each class

In [None]:

def denorm(x):
    """Invert the per-channel normalisation: ``x * std + mean``.

    Args:
        x: tensor whose third-from-last dimension is the 3 colour channels,
            e.g. ``(3, H, W)`` or ``(B, 3, H, W)``.

    Returns:
        Tensor of the same shape on the same device as ``x``.
    """
    # Build the statistics on x's device so CUDA tensors work too.
    mean = torch.tensor([0.4802, 0.4481, 0.3975], device=x.device)[:, None, None]
    std = torch.tensor([0.2770, 0.2691, 0.2821], device=x.device)[:, None, None]
    return x * std + mean

def save_example_grids_per_class(dataset, per_class=20):
    """Save one image grid per class to ``OUT_DIR/grids/class_XX.png``.

    The first ``per_class`` samples of each label (in dataset order) are used.
    Images are fetched again through ``dataset[j]`` when the grid is built, so
    if the dataset applies random augmentation the grid shows augmented images.

    Args:
        dataset: indexable dataset yielding ``(image_tensor, label)`` with
            labels in ``range(NUM_CLASSES)``.
        per_class: maximum number of images per grid.
    """
    per = {i: [] for i in range(NUM_CLASSES)}
    if per_class > 0:
        unfilled = NUM_CLASSES  # classes that still need more examples
        for i in range(len(dataset)):
            _, y = dataset[i]
            bucket = per[y]
            if len(bucket) < per_class:
                bucket.append(i)
                if len(bucket) == per_class:
                    unfilled -= 1
                    # Every class is full: stop instead of decoding the rest
                    # of the dataset just to discard it.
                    if unfilled == 0:
                        break
    inv_map = {v: k for k, v in sel_to_new.items()}
    for ci, idxs in per.items():
        imgs = [dataset[j][0] for j in idxs]
        if not imgs:
            continue
        grid = utils.make_grid(imgs, nrow=5, padding=2)
        fig = plt.figure(figsize=(6, 5))
        # make_grid returns (C, H, W); imshow expects (H, W, C) in [0, 1].
        plt.imshow(np.transpose(denorm(grid).clamp(0, 1).numpy(), (1, 2, 0)))
        plt.axis("off")
        plt.title(f"class {ci}: {inv_map[ci]}")
        p = OUT_DIR / "grids" / f"class_{ci:02d}.png"
        fig.tight_layout()
        fig.savefig(p, dpi=200)
        plt.close(fig)
        print("Saved:", p)

print("Saving a 20-image grid for each class from the training set…")
save_example_grids_per_class(train_ds, per_class=20)

In [None]:

def save_one_per_class_grid(dataset, out_path, classes=NUM_CLASSES, nrow=5):
    """Save a single grid holding one example image for every class.

    The first sample of each label found in ``dataset`` is used; labels that
    never occur there are looked up in the module-level ``val_ds``. Tiles are
    laid out in class-index order (row-major, ``nrow`` per row) so that tile
    ``k`` always shows class ``k``.

    Args:
        dataset: indexable dataset yielding ``(image_tensor, label)`` with
            labels in ``range(classes)``.
        out_path: destination image path.
        classes: number of classes expected.
        nrow: tiles per grid row.

    Raises:
        AssertionError: if some class has no sample in either dataset.
    """
    chosen = {c: None for c in range(classes)}  # label -> first image seen
    n_found = 0
    for i in range(len(dataset)):
        x, y = dataset[i]
        if chosen[y] is None:
            chosen[y] = x
            n_found += 1
        if n_found == classes:
            break
    if n_found < classes:
        # Fallback: fill the still-missing classes from the validation split.
        for j in range(len(val_ds)):
            x, y = val_ds[j]
            if chosen[y] is None:
                chosen[y] = x
                n_found += 1
            if n_found == classes:
                break
    assert n_found == classes, "Could not find one sample for each class."
    images = [chosen[c] for c in range(classes)]
    grid = utils.make_grid(images, nrow=nrow, padding=2)
    fig = plt.figure(figsize=(8, 6))
    plt.imshow(np.transpose(denorm(grid).clamp(0, 1).numpy(), (1, 2, 0)))
    plt.axis("off")
    # Report the actual class count instead of a hard-coded 20.
    plt.title(f"One image per class ({classes} classes)")
    fig.tight_layout()
    fig.savefig(out_path, dpi=220)
    plt.close(fig)
    print("Saved:", out_path)

one_per_class_path = OUT_DIR / "figs" / "one_per_class_20_grid.png"
save_one_per_class_grid(train_ds, one_per_class_path, classes=NUM_CLASSES, nrow=5)

### Fully connected network (FCFNN)

In [None]:

class FCFNN(nn.Module):
    """Fully connected classifier over flattened 3-channel square images.

    Each hidden layer is ``Linear -> ReLU -> Dropout``; a final ``Linear``
    produces the class logits.

    Args:
        img_size: side length of the square input images.
        num_classes: number of output logits.
        widths: hidden-layer sizes, in order; may be empty.
        dropout: dropout probability after every hidden layer (default 0.2,
            the value that used to be hard-coded).
    """

    def __init__(self, img_size=64, num_classes=20, widths=(1024,512), dropout=0.2):
        super().__init__()
        C = 3
        H = W = img_size
        in_dim = C * H * W  # size of one flattened image
        layers = [nn.Flatten()]
        for w in widths:
            layers += [nn.Linear(in_dim, w), nn.ReLU(inplace=True), nn.Dropout(dropout)]
            in_dim = w
        layers += [nn.Linear(in_dim, num_classes)]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)``."""
        return self.net(x)

### Small CNN


In [None]:
class SmallCNN(nn.Module):
    """Three conv stages (3 -> 64 -> 128 -> 256 channels) plus a linear head.

    Every stage is two ``Conv3x3 -> BatchNorm -> ReLU`` units followed by a
    2x2 max-pool. The head global-average-pools the feature map and maps the
    256 features to ``num_classes`` logits.
    """

    @staticmethod
    def _stage(cin, cout):
        """Build one conv stage going from ``cin`` to ``cout`` channels."""
        return nn.Sequential(
            nn.Conv2d(cin, cout, 3, padding=1),
            nn.BatchNorm2d(cout),
            nn.ReLU(inplace=True),
            nn.Conv2d(cout, cout, 3, padding=1),
            nn.BatchNorm2d(cout),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
        )

    def __init__(self, num_classes=20):
        super().__init__()
        channels = (3, 64, 128, 256)
        stages = [self._stage(cin, cout) for cin, cout in zip(channels, channels[1:])]
        self.features = nn.Sequential(*stages)
        self.head = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Linear(channels[-1], num_classes),
        )

    def forward(self, x):
        """Return class logits for a batch of images."""
        return self.head(self.features(x))

### Patch and positional embeddings

In [None]:
class SinusoidalPositionalEmbedding(nn.Module):
    """Adds fixed sine/cosine position encodings to a token sequence.

    Even feature indices hold ``sin(pos * w_k)`` and odd indices hold
    ``cos(pos * w_k)`` with ``w_k = 10000 ** (-2k / dim)``. The table is stored
    as the non-trainable buffer ``pe`` of shape ``(1, max_len, dim)``.

    Args:
        dim: feature dimension of the tokens (odd values are supported).
        max_len: longest sequence the table can serve.
    """

    def __init__(self, dim, max_len=1024):
        super().__init__()
        pe = torch.zeros(max_len, dim)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, dim, 2).float() * (-math.log(10000.0) / dim))
        angles = position * div_term  # (max_len, ceil(dim / 2))
        pe[:, 0::2] = torch.sin(angles)
        # For odd dim there is one fewer cosine column than sine columns, so
        # trim the angles to fit; for even dim the slice keeps everything.
        pe[:, 1::2] = torch.cos(angles[:, : dim // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x + pe[:, :L]`` for ``x`` of shape ``(B, L, dim)``.

        Raises:
            ValueError: if the sequence is longer than the table.
        """
        L = x.size(1)
        if L > self.pe.size(1):
            raise ValueError(
                f"sequence length {L} exceeds max_len {self.pe.size(1)} of the positional table"
            )
        return x + self.pe[:, :L, :]

class PatchifyConv(nn.Module):
    """Patch embedding via a strided convolution (kernel = stride = patch).

    ``num_patches`` is ``(img_size // patch) ** 2``; ``img_size`` must be a
    multiple of ``patch``.
    """

    def __init__(self, img_size=64, patch=8, in_ch=3, embed_dim=192):
        super().__init__()
        assert img_size % patch == 0
        per_side = img_size // patch
        self.num_patches = per_side ** 2
        self.proj = nn.Conv2d(in_ch, embed_dim, kernel_size=patch, stride=patch)

    def forward(self, x):
        """Map ``(B, C, H, W)`` images to ``(B, N, embed_dim)`` patch tokens."""
        feat = self.proj(x)                        # (B, E, H/p, W/p)
        tokens = torch.flatten(feat, start_dim=2)  # (B, E, N)
        return tokens.permute(0, 2, 1)             # (B, N, E)

class PatchifyLinear(nn.Module):
    """Patch embedding that cuts non-overlapping patches and projects each one.

    Patches are extracted with ``F.unfold`` (kernel = stride = patch) and each
    flattened patch of ``in_ch * patch * patch`` values goes through one
    ``nn.Linear`` to ``embed_dim``.
    """

    def __init__(self, img_size=64, patch=8, in_ch=3, embed_dim=192):
        super().__init__()
        assert img_size % patch == 0
        self.patch = patch
        per_side = img_size // patch
        self.num_patches = per_side ** 2
        self.proj = nn.Linear(in_ch * patch * patch, embed_dim)

    def forward(self, x):
        """Map ``(B, C, H, W)`` images to ``(B, N, embed_dim)`` patch tokens."""
        _b, _c, _h, _w = x.shape  # the unpacking doubles as a 4-D input check
        size = self.patch
        columns = F.unfold(x, kernel_size=size, stride=size)  # (B, C*p*p, N)
        return self.proj(columns.permute(0, 2, 1))

### Transformer

In [None]:
class TransformerBlock(nn.Module):
    """Pre-norm transformer encoder block.

    ``x + drop1(attn(norm1(x)))`` followed by ``x + drop2(mlp(norm2(x)))``,
    where the MLP is ``Linear -> GELU -> Dropout -> Linear -> Dropout`` with a
    hidden width of ``int(dim * mlp_ratio)``.

    Args:
        dim: token feature dimension.
        heads: number of attention heads.
        mlp_ratio: hidden-width multiplier of the MLP.
        attn_drop: dropout inside the attention module.
        proj_drop: dropout used inside the MLP.
        drop: dropout applied to each branch before the residual sum.
    """

    def __init__(self, dim, heads, mlp_ratio=4.0, attn_drop=0.0, proj_drop=0.0, drop=0.0):
        super().__init__()
        hidden = int(dim * mlp_ratio)
        self.norm1 = nn.LayerNorm(dim)
        self.attn = nn.MultiheadAttention(dim, heads, dropout=attn_drop, batch_first=True)
        self.drop1 = nn.Dropout(drop)
        self.norm2 = nn.LayerNorm(dim)
        self.mlp = nn.Sequential(
            nn.Linear(dim, hidden),
            nn.GELU(),
            nn.Dropout(proj_drop),
            nn.Linear(hidden, dim),
            nn.Dropout(proj_drop),
        )
        self.drop2 = nn.Dropout(drop)

    def forward(self, x):
        """Apply self-attention and the MLP, each with a residual connection."""
        normed = self.norm1(x)
        attended, _ = self.attn(normed, normed, normed, need_weights=False)
        x = x + self.drop1(attended)
        return x + self.drop2(self.mlp(self.norm2(x)))

class ViT(nn.Module):
    """Vision Transformer classifier with a class token.

    Args:
        img_size, patch, in_ch: forwarded to the patch-embedding module.
        num_classes: number of output logits.
        embed_dim: token feature dimension.
        depth: number of ``TransformerBlock`` layers.
        heads: attention heads per block.
        mlp_ratio: MLP hidden-width multiplier per block.
        patch_type: ``"conv"`` (PatchifyConv) or ``"linear"`` (PatchifyLinear).
        pos_type: ``"learnable"``, ``"sinusoidal"`` or ``"none"``.

    Raises:
        ValueError: for an unknown ``patch_type`` or ``pos_type``.
    """

    def __init__(self,
                 img_size=64, patch=8, in_ch=3, num_classes=20,
                 embed_dim=192, depth=8, heads=6, mlp_ratio=4.0,
                 patch_type="conv",      # "conv" | "linear"
                 pos_type="learnable"    # "learnable" | "sinusoidal" | "none"
                 ):
        super().__init__()
        # Construction order is kept as-is: it determines the order in which
        # random initialisers consume the RNG.
        if patch_type == "conv":
            embedder_cls = PatchifyConv
        elif patch_type == "linear":
            embedder_cls = PatchifyLinear
        else:
            raise ValueError("patch_type must be 'conv' or 'linear'.")
        self.patch = embedder_cls(img_size, patch, in_ch, embed_dim)

        seq_len = 1 + self.patch.num_patches  # class token + patches
        self.cls = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_type = pos_type
        if pos_type == "none":
            self.pos = None
        elif pos_type == "learnable":
            self.pos = nn.Parameter(torch.zeros(1, seq_len, embed_dim))
            nn.init.trunc_normal_(self.pos, std=0.02)
        elif pos_type == "sinusoidal":
            self.pos = SinusoidalPositionalEmbedding(embed_dim, max_len=seq_len)
        else:
            raise ValueError("pos_type must be learnable/sinusoidal/none")

        encoder = [TransformerBlock(embed_dim, heads, mlp_ratio) for _ in range(depth)]
        self.blocks = nn.ModuleList(encoder)
        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)
        nn.init.trunc_normal_(self.cls, std=0.02)

    def forward(self, x):
        """Return logits computed from the class token after the final norm."""
        batch = x.size(0)
        tokens = self.patch(x)                            # (B, N, E)
        cls_tok = self.cls.expand(batch, -1, -1)          # (B, 1, E)
        tokens = torch.cat([cls_tok, tokens], dim=1)      # (B, 1+N, E)
        if self.pos_type == "learnable":
            tokens = tokens + self.pos[:, :tokens.size(1), :]
        elif self.pos_type == "sinusoidal":
            tokens = self.pos(tokens)
        for layer in self.blocks:
            tokens = layer(tokens)
        cls_out = self.norm(tokens)[:, 0]
        return self.head(cls_out)

### Training and Evaluation

In [None]:
def top1(logits, y):
    """Return the fraction of rows whose arg-max over dim 1 equals ``y``, as a float."""
    hits = logits.argmax(1) == y
    return hits.float().mean().item()

@torch.no_grad()
def evaluate(model, loader, device):
    """Run ``model`` over ``loader`` in eval mode without gradients.

    Returns:
        ``(mean_loss, mean_top1, logits, targets)``: the two means are
        weighted by batch size, and ``logits`` / ``targets`` are the per-batch
        results concatenated on the CPU.
    """
    model.eval()
    loss_sum, acc_sum, seen = 0.0, 0.0, 0
    logit_chunks, target_chunks = [], []
    for x, y in loader:
        x = x.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)
        logits = model(x)
        bs = x.size(0)
        # Batch means are re-weighted by batch size so a smaller last batch
        # does not skew the averages.
        loss_sum += F.cross_entropy(logits, y).item() * bs
        acc_sum += top1(logits, y) * bs
        seen += bs
        logit_chunks.append(logits.cpu())
        target_chunks.append(y.cpu())
    return loss_sum / seen, acc_sum / seen, torch.cat(logit_chunks), torch.cat(target_chunks)

def train_model(model, train_loader, val_loader, epochs=EPOCHS, lr=LR, wd=WEIGHT_DECAY, device=DEVICE, early_stop=EARLY_STOP, run_name="run"):
    """Train ``model`` with AdamW and cosine annealing, with early stopping.

    Mixed precision is used only when the module-level ``AMP`` flag is set and
    ``device`` is a CUDA device. Gradients are clipped to a global norm of 1.0.
    The weights with the lowest validation loss are restored before returning.

    Args:
        model: network to train (moved to ``device``).
        train_loader, val_loader: loaders yielding ``(images, labels)``.
        epochs: maximum number of epochs (also the cosine schedule length).
        lr, wd: AdamW learning rate and weight decay.
        device: device string or ``torch.device``.
        early_stop: epochs without a validation-loss improvement (> 1e-4)
            before training stops.
        run_name: prefix for the progress lines.

    Returns:
        ``(model, history)``, where ``history`` holds per-epoch lists under
        ``train_loss``, ``train_acc``, ``val_loss`` and ``val_acc``.
    """
    # Accept both "cuda:0"-style strings and torch.device objects.
    use_amp = bool(AMP) and torch.device(device).type == "cuda"

    model.to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
    sch = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=epochs)
    # torch.cuda.amp.GradScaler is deprecated in recent PyTorch; use the
    # device-agnostic class when this version provides it.
    if hasattr(torch, "amp") and hasattr(torch.amp, "GradScaler"):
        scaler = torch.amp.GradScaler("cuda", enabled=use_amp)
    else:
        scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    history = {"train_loss":[], "train_acc":[], "val_loss":[], "val_acc":[]}
    best = float("inf"); best_state=None; wait=0

    for ep in range(1, epochs+1):
        model.train()
        tr_loss=tr_acc=0.0; n=0
        t0=time.time()
        for x,y in train_loader:
            x=x.to(device, non_blocking=True); y=y.to(device, non_blocking=True)
            opt.zero_grad(set_to_none=True)
            with torch.autocast(device_type="cuda", enabled=use_amp):
                logits = model(x)
                loss = F.cross_entropy(logits, y)
            scaler.scale(loss).backward()
            # Gradients are still multiplied by the loss scale here; unscale
            # them first so the 1.0 threshold applies to the true gradient
            # norm. (No-op when the scaler is disabled.)
            scaler.unscale_(opt)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(opt); scaler.update()

            bs=x.size(0)
            tr_loss += loss.item()*bs
            tr_acc  += top1(logits, y)*bs
            n += bs

        tr_loss/=n; tr_acc/=n
        val_loss, val_acc, _, _ = evaluate(model, val_loader, device)
        # Record the lr this epoch actually trained with, before the scheduler
        # moves on to the next epoch's value.
        epoch_lr = opt.param_groups[0]["lr"]
        sch.step()

        history["train_loss"].append(tr_loss)
        history["train_acc"].append(tr_acc)
        history["val_loss"].append(val_loss)
        history["val_acc"].append(val_acc)

        print(f"[{run_name}] Epoch {ep:02d}: "
              f"train_loss={tr_loss:.4f} acc={tr_acc:.4f} | "
              f"val_loss={val_loss:.4f} acc={val_acc:.4f} | "
              f"lr={epoch_lr:.2e} time={time.time()-t0:.1f}s")

        if val_loss < best - 1e-4:
            best = val_loss
            # Snapshot on the CPU so the checkpoint does not take device memory.
            best_state = {k:v.detach().cpu().clone() for k,v in model.state_dict().items()}
            wait=0
        else:
            wait+=1
            if wait>=early_stop:
                print(f"[{run_name}] Early stop.")
                break

    if best_state is not None: model.load_state_dict(best_state)
    return model, history

### Plot curves

In [None]:
def plot_curves(history, title, out_png):
    """Save a two-panel figure (loss, accuracy) of train and val curves per epoch.

    Args:
        history: dict with ``train_loss``, ``val_loss``, ``train_acc`` and
            ``val_acc`` lists of equal length.
        title: figure super-title.
        out_png: destination path of the image.
    """
    epoch_axis = range(1, len(history["train_loss"]) + 1)
    fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(7.5, 4))
    panels = (
        (ax_loss, "loss", "Loss", "Loss"),
        (ax_acc, "acc", "Accuracy", "Acc"),
    )
    for ax, key, heading, ylabel in panels:
        ax.plot(epoch_axis, history[f"train_{key}"], label="Train")
        ax.plot(epoch_axis, history[f"val_{key}"], label="Val")
        ax.set_title(heading)
        ax.set_xlabel("Epoch")
        ax.set_ylabel(ylabel)
        ax.legend()
    fig.suptitle(title)
    fig.tight_layout()
    fig.savefig(out_png, dpi=220)
    plt.close(fig)
    print("Saved:", out_png)

### Experiment Registry

In [None]:
@dataclass
class RunSpec:
    """Description of one experiment: model family, display name and constructor kwargs."""
    family: str         # "ViT" | "CNN" | "FCFNN"; selects the class in build_model
    name: str           # short name used in plots, log lines and output file names
    params: dict        # kwargs for model ctor (build_model does not pass them to the CNN)

def build_model(spec: RunSpec):
    """Instantiate the model described by ``spec``.

    ``spec.params`` is forwarded as keyword arguments to ``ViT`` and ``FCFNN``;
    ``SmallCNN`` is built from ``NUM_CLASSES`` alone.

    Raises:
        ValueError: if ``spec.family`` is not "ViT", "CNN" or "FCFNN".
    """
    family = spec.family
    if family == "ViT":
        model = ViT(num_classes=NUM_CLASSES, img_size=IMG_SIZE, **spec.params)
    elif family == "CNN":
        model = SmallCNN(num_classes=NUM_CLASSES)
    elif family == "FCFNN":
        model = FCFNN(img_size=IMG_SIZE, num_classes=NUM_CLASSES, **spec.params)
    else:
        raise ValueError("Unknown family")
    return model

def _vit_cfg(heads, depth=6, patch_type="conv", pos_type="learnable"):
    """Return ViT kwargs using the patch size (8) and width (192) shared by all ViT runs."""
    return {
        "patch": 8,
        "embed_dim": 192,
        "depth": depth,
        "heads": heads,
        "patch_type": patch_type,
        "pos_type": pos_type,
    }


# Baselines: one model per family.
runs = [
    RunSpec("FCFNN", "FCFNN", {"widths": (1024,512)}),
    RunSpec("CNN",   "SmallCNN", {}),
    RunSpec("ViT",   "ViT(h=4,Conv,LearnPE)", _vit_cfg(4, depth=8)),
]

# ViT ablation: number of heads
runs.extend(RunSpec("ViT", f"ViT_heads={h}", _vit_cfg(h)) for h in [2,4,6,8])

# ViT ablation: patch embedding type
runs.extend([
    RunSpec("ViT", "ViT_PatchConv",   _vit_cfg(4, patch_type="conv")),
    RunSpec("ViT", "ViT_PatchLinear", _vit_cfg(4, patch_type="linear")),
])

# ViT ablation: positional embedding type
runs.extend([
    RunSpec("ViT", "ViT_PosLearn", _vit_cfg(4, pos_type="learnable")),
    RunSpec("ViT", "ViT_PosSine",  _vit_cfg(4, pos_type="sinusoidal")),
    RunSpec("ViT", "ViT_PosNone",  _vit_cfg(4, pos_type="none")),
])

### Run all experiments

In [None]:
# Train every registered run, save its curves (and a confusion matrix for the
# ViT/CNN families when scikit-learn is available) and collect the final
# validation metrics in `results`.
histories = {}
results = []

for spec in runs:
    run_name = spec.name
    print("\n" + "="*80)
    print("Training:", run_name, "|", spec.family, spec.params)

    # Re-seed before each run so weight initialisation and batch order do not
    # depend on how much randomness earlier runs consumed. Without this, each
    # ablation result changes with the order of `runs`, and runs with
    # identical settings start from different states.
    random.seed(SEED); np.random.seed(SEED)
    torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)

    model = build_model(spec)
    model, hist = train_model(model, train_loader, val_loader, run_name=run_name)
    histories[run_name] = hist

    curve_path = OUT_DIR / "curves" / f"{run_name}.png"
    plot_curves(hist, title=run_name, out_png=curve_path)

    # train_model restored the best checkpoint, so these are best-epoch metrics.
    val_loss, val_acc, logits, tgts = evaluate(model, val_loader, DEVICE)
    rec = {"name": run_name, "family": spec.family, "val_loss": val_loss, "val_acc": val_acc}
    # Hyper-parameters get a "p:" prefix to keep them apart from metric columns.
    rec.update({f"p:{k}":v for k,v in spec.params.items()})
    results.append(rec)

    if HAS_SK and spec.family in ["ViT","CNN"]:
        cm = confusion_matrix(tgts.numpy(), logits.argmax(1).numpy(), labels=list(range(NUM_CLASSES)))
        plt.figure(figsize=(6,5))
        plt.imshow(cm, interpolation='nearest'); plt.colorbar()
        plt.title(f"Confusion Matrix — {run_name}")
        plt.xlabel("Pred"); plt.ylabel("True"); plt.tight_layout()
        p = OUT_DIR / "figs" / f"cm_{run_name}.png"
        plt.savefig(p, dpi=200); plt.close()
        print("Saved:", p)

### Summaries & Standard Comparison Plots

In [None]:
import pandas as pd


def _save_val_acc_bars(labels, accs, title, out_name, xlabel=None, figsize=(7, 4), rotate=None):
    """Save a bar chart of validation accuracy under ``OUT_DIR/figs``.

    Each bar is annotated with its value (three decimals).

    Args:
        labels: bar labels (x axis categories).
        accs: validation accuracies, aligned with ``labels``.
        title: plot title.
        out_name: file name inside ``OUT_DIR / "figs"``.
        xlabel: optional x-axis label.
        figsize: figure size in inches.
        rotate: optional tick-label rotation in degrees.

    Returns:
        Path of the saved image.
    """
    labels, accs = list(labels), list(accs)
    fig, ax = plt.subplots(figsize=figsize)
    ax.bar(labels, accs)
    for x, a in zip(labels, accs):
        ax.text(x, a + 0.002, f"{a:.3f}", ha="center", va="bottom")
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    ax.set_ylabel("Val Acc")
    ax.set_title(title)
    if rotate is not None:
        ax.tick_params(axis="x", rotation=rotate)
    fig.tight_layout()
    out_path = OUT_DIR / "figs" / out_name
    fig.savefig(out_path, dpi=220)
    plt.close(fig)
    print("Saved:", out_path)
    return out_path


df = pd.DataFrame(results)
csv_path = OUT_DIR / "summary.csv"
df.to_csv(csv_path, index=False)
print("\nSaved summary:", csv_path)
print(df.sort_values("val_acc", ascending=False)[["name","family","val_acc","val_loss"]].to_string(index=False))

# Best run of each family (first row per family after sorting by accuracy).
best_per_family = df.sort_values("val_acc", ascending=False).groupby("family").head(1)
p = _save_val_acc_bars(
    best_per_family["family"], best_per_family["val_acc"],
    "Best Validation Accuracy per Family", "best_per_family.png",
)

# Heads ablation.
vit_heads = df[df["name"].str.startswith("ViT_heads")]
if not vit_heads.empty:
    vit_heads_sorted = vit_heads.sort_values("p:heads")
    # "p:heads" is a float column (it holds NaN for non-ViT rows), so cast to
    # int before labelling; otherwise the ticks read "2.0", "4.0", ...
    xs = [str(int(h)) for h in vit_heads_sorted["p:heads"]]
    p = _save_val_acc_bars(
        xs, vit_heads_sorted["val_acc"],
        "Effect of Number of Heads (ViT)", "vit_heads_ablation.png",
        xlabel="#Heads",
    )

# Patch-embedding ablation.
vit_patch = df[df["name"].str.startswith("ViT_Patch")]
if len(vit_patch) >= 2:
    vit_patch = vit_patch.sort_values("name")
    xs = vit_patch["name"].tolist()
    p = _save_val_acc_bars(
        xs, vit_patch["val_acc"],
        "Effect of Patch Embedding (ViT)", "vit_patch_ablation.png",
        rotate=15,
    )

# Positional-embedding ablation.
vit_pos = df[df["name"].str.startswith("ViT_Pos")]
if len(vit_pos) >= 2:
    vit_pos = vit_pos.sort_values("name")
    xs = vit_pos["name"].tolist()
    p = _save_val_acc_bars(
        xs, vit_pos["val_acc"],
        "Effect of Positional Embedding (ViT)", "vit_pos_ablation.png",
        figsize=(7.5, 4), rotate=15,
    )

### Combined table + dashboard + combined curves + PDF report (disabled: the cell below is commented out)


In [None]:






# from matplotlib.backends.backend_pdf import PdfPages
# import matplotlib.image as mpimg

# FIG_DIR = OUT_DIR / "figs"
# FIG_DIR.mkdir(parents=True, exist_ok=True)

# def save_combined_table(df, out_png):
#     display_df = df.copy()
#     display_df["val_acc"] = display_df["val_acc"].map(lambda v: f"{v:.3f}")
#     display_df["val_loss"] = display_df["val_loss"].map(lambda v: f"{v:.3f}")
#     for col in ["p:heads", "p:patch_type", "p:pos_type", "p:embed_dim", "p:depth", "p:patch"]:
#         if col not in display_df.columns:
#             display_df[col] = ""
#     tmp = df.copy()
#     tmp["__ord"] = -tmp["val_acc"]
#     ord_idx = tmp.sort_values(["family","__ord"]).index
#     display_df = display_df.loc[ord_idx]
#     show_cols = ["name","family","val_acc","val_loss","p:heads","p:patch_type","p:pos_type","p:patch","p:embed_dim","p:depth"]
#     display_df = display_df[show_cols].rename(columns={
#         "name":"Run","family":"Family","val_acc":"Val Acc","val_loss":"Val Loss",
#         "p:heads":"Heads","p:patch_type":"Patch Emb","p:pos_type":"Pos Emb",
#         "p:patch":"PatchSz","p:embed_dim":"Embed","p:depth":"Depth",
#     })
#     rows = len(display_df)
#     fig_h = 1.0 + rows * 0.38
#     fig, ax = plt.subplots(figsize=(13, fig_h))
#     ax.axis("off")
#     the_table = ax.table(
#         cellText=display_df.values,
#         colLabels=display_df.columns.tolist(),
#         cellLoc="center",
#         loc="upper left",
#     )
#     the_table.auto_set_font_size(False)
#     the_table.set_fontsize(9)
#     the_table.scale(1, 1.2)
#     ax.set_title("All Runs — Validation Results & Key Params", pad=10, fontsize=12, weight="bold")
#     fig.tight_layout()
#     fig.savefig(out_png, dpi=240, bbox_inches="tight")
#     plt.close(fig)
#     print("Saved:", out_png)

# def save_dashboard(df, out_png):
#     fig, axs = plt.subplots(2, 2, figsize=(12, 8))
#     best_per_family = df.sort_values("val_acc", ascending=False).groupby("family").head(1)
#     axs[0,0].bar(best_per_family["family"], best_per_family["val_acc"])
#     for x, a in zip(best_per_family["family"], best_per_family["val_acc"]):
#         axs[0,0].text(x, a+0.003, f"{a:.3f}", ha="center", va="bottom", fontsize=9)
#     axs[0,0].set_title("Best Validation Accuracy per Family"); axs[0,0].set_ylabel("Val Acc")
#     axs[0,0].set_ylim(0, max(0.01 + best_per_family["val_acc"].max(), 0.1))

#     vit_heads = df[df["name"].str.startswith("ViT_heads")]
#     axs[0,1].set_title("ViT — Effect of Number of Heads")
#     if not vit_heads.empty:
#         vit_heads = vit_heads.sort_values("p:heads")
#         xs = vit_heads["p:heads"].astype(int).tolist()
#         axs[0,1].bar([str(x) for x in xs], vit_heads["val_acc"])
#         for x, a in zip(xs, vit_heads["val_acc"]):
#             axs[0,1].text(str(x), a+0.003, f"{a:.3f}", ha="center", va="bottom", fontsize=9)
#         axs[0,1].set_xlabel("#Heads"); axs[0,1].set_ylabel("Val Acc")
#     else:
#         axs[0,1].text(0.5, 0.5, "No heads sweep runs", ha="center", va="center")
#         axs[0,1].set_xticks([]); axs[0,1].set_yticks([])

#     vit_patch = df[df["name"].str.startswith("ViT_Patch")].sort_values("name")
#     axs[1,0].set_title("ViT — Patch Embedding Choice")
#     if len(vit_patch) >= 1:
#         xs = vit_patch["name"].tolist()
#         axs[1,0].bar(xs, vit_patch["val_acc"])
#         for x, a in zip(xs, vit_patch["val_acc"]):
#             axs[1,0].text(x, a+0.003, f"{a:.3f}", ha="center", va="bottom", fontsize=9)
#         axs[1,0].set_ylabel("Val Acc"); axs[1,0].tick_params(axis="x", rotation=15)
#     else:
#         axs[1,0].text(0.5, 0.5, "No patch ablation runs", ha="center", va="center")
#         axs[1,0].set_xticks([]); axs[1,0].set_yticks([])

#     vit_pos = df[df["name"].str.startswith("ViT_Pos")].sort_values("name")
#     axs[1,1].set_title("ViT — Positional Embedding Choice")
#     if len(vit_pos) >= 1:
#         xs = vit_pos["name"].tolist()
#         axs[1,1].bar(xs, vit_pos["val_acc"])
#         for x, a in zip(xs, vit_pos["val_acc"]):
#             axs[1,1].text(x, a+0.003, f"{a:.3f}", ha="center", va="bottom", fontsize=9)
#         axs[1,1].set_ylabel("Val Acc"); axs[1,1].tick_params(axis="x", rotation=15)
#     else:
#         axs[1,1].text(0.5, 0.5, "No positional ablation runs", ha="center", va="center")
#         axs[1,1].set_xticks([]); axs[1,1].set_yticks([])
#     fig.tight_layout(); fig.savefig(out_png, dpi=240, bbox_inches="tight"); plt.close(fig)
#     print("Saved:", out_png)

# def plot_best_family_curves(df, histories_dict, out_png):
#     plt.figure(figsize=(10, 4.5))
#     ax1 = plt.subplot(1,2,1); ax1.set_title("Loss Curves — Best per Family")
#     ax2 = plt.subplot(1,2,2); ax2.set_title("Accuracy Curves — Best per Family")
#     fam_best = df.sort_values("val_acc", ascending=False).groupby("family").head(1)
#     colors = {"FCFNN":"tab:orange", "CNN":"tab:green", "ViT":"tab:blue"}
#     plotted = False
#     for _, row in fam_best.iterrows():
#         name = row["name"]; fam = row["family"]
#         if name not in histories_dict:
#             continue
#         H = histories_dict[name]; ep = range(1, len(H["train_loss"])+1); c = colors.get(fam, None)
#         ax1.plot(ep, H["train_loss"], label=f"{fam}-{name} (train)", linestyle="--", color=c)
#         ax1.plot(ep, H["val_loss"],   label=f"{fam}-{name} (val)",   linestyle="-",  color=c)
#         ax2.plot(ep, H["train_acc"],  label=f"{fam}-{name} (train)", linestyle="--", color=c)
#         ax2.plot(ep, H["val_acc"],    label=f"{fam}-{name} (val)",   linestyle="-",  color=c)
#         plotted = True
#     for ax in (ax1, ax2):
#         ax.set_xlabel("Epoch"); ax.grid(True, linestyle="--", linewidth=0.5); ax.legend(fontsize=8)
#     if not plotted:
#         plt.clf()
#         fig = plt.figure(figsize=(6,2)); plt.axis("off")
#         plt.text(0.5, 0.5, "Histories not available", ha="center", va="center")
#         fig.savefig(out_png, dpi=240, bbox_inches="tight"); plt.close(fig); print("Saved:", out_png);
#         return
#     plt.tight_layout(); plt.savefig(out_png, dpi=240, bbox_inches="tight"); plt.close(); print("Saved:", out_png)

# combined_table_png = FIG_DIR / "combined_table.png"
# dashboard_png      = FIG_DIR / "dashboard.png"
# best_curves_png    = FIG_DIR / "best_family_curves.png"

# save_combined_table(df, combined_table_png)
# save_dashboard(df, dashboard_png)
# plot_best_family_curves(df, histories, best_curves_png)

# pdf_path = FIG_DIR / "report.pdf"
# with PdfPages(pdf_path) as pdf:
#     for img_path in [combined_table_png, dashboard_png, best_curves_png]:
#         if os.path.exists(img_path):
#             fig = plt.figure(figsize=(11, 8.5))
#             plt.imshow(mpimg.imread(img_path)); plt.axis("off")
#             pdf.savefig(fig, bbox_inches="tight"); plt.close(fig)
# print("Saved PDF report:", pdf_path)


Found 200 classes; selecting 20…
Selected classes: ['n01768244', 'n01770393', 'n01774384', 'n02058221', 'n02074367', 'n02099601', 'n02106662', 'n02132136', 'n02481823', 'n02504458', 'n02666196', 'n02730930', 'n02814533', 'n03447447', 'n04067472', 'n04265275', 'n04456115', 'n04562935', 'n07753592', 'n07768694']
Saving a 20-image grid for each class from the training set…
Saved: tri_compare_vit/grids/class_00.png
Saved: tri_compare_vit/grids/class_01.png
Saved: tri_compare_vit/grids/class_02.png
Saved: tri_compare_vit/grids/class_03.png
Saved: tri_compare_vit/grids/class_04.png
Saved: tri_compare_vit/grids/class_05.png
Saved: tri_compare_vit/grids/class_06.png
Saved: tri_compare_vit/grids/class_07.png
Saved: tri_compare_vit/grids/class_08.png
Saved: tri_compare_vit/grids/class_09.png
Saved: tri_compare_vit/grids/class_10.png
Saved: tri_compare_vit/grids/class_11.png
Saved: tri_compare_vit/grids/class_12.png
Saved: tri_compare_vit/grids/class_13.png
Saved: tri_compare_vit/grids/class_14.p

  scaler = torch.cuda.amp.GradScaler(enabled=(AMP and device.startswith("cuda")))
  with torch.cuda.amp.autocast(enabled=(AMP and device.startswith("cuda"))):


[FCFNN] Epoch 01: train_loss=2.6309 acc=0.1984 | val_loss=2.4172 acc=0.2773 | lr=2.98e-04 time=11.7s
[FCFNN] Epoch 02: train_loss=2.4121 acc=0.2645 | val_loss=2.3404 acc=0.2880 | lr=2.93e-04 time=10.8s
[FCFNN] Epoch 03: train_loss=2.3172 acc=0.2992 | val_loss=2.3018 acc=0.3127 | lr=2.84e-04 time=10.1s
[FCFNN] Epoch 04: train_loss=2.2292 acc=0.3268 | val_loss=2.2651 acc=0.3147 | lr=2.71e-04 time=10.8s
[FCFNN] Epoch 05: train_loss=2.1331 acc=0.3572 | val_loss=2.2449 acc=0.3393 | lr=2.56e-04 time=11.9s
[FCFNN] Epoch 06: train_loss=2.0871 acc=0.3706 | val_loss=2.2183 acc=0.3407 | lr=2.38e-04 time=13.1s
[FCFNN] Epoch 07: train_loss=2.0074 acc=0.3958 | val_loss=2.2134 acc=0.3380 | lr=2.18e-04 time=10.9s
[FCFNN] Epoch 08: train_loss=1.9586 acc=0.4185 | val_loss=2.1913 acc=0.3453 | lr=1.96e-04 time=11.0s
[FCFNN] Epoch 09: train_loss=1.8815 acc=0.4387 | val_loss=2.1921 acc=0.3620 | lr=1.73e-04 time=11.1s
[FCFNN] Epoch 10: train_loss=1.8223 acc=0.4498 | val_loss=2.1878 acc=0.3460 | lr=1.50e-04 t

  scaler = torch.cuda.amp.GradScaler(enabled=(AMP and device.startswith("cuda")))
  with torch.cuda.amp.autocast(enabled=(AMP and device.startswith("cuda"))):


[ViT(h=4,Conv,LearnPE)] Epoch 01: train_loss=2.6740 acc=0.1709 | val_loss=2.4760 acc=0.2247 | lr=2.98e-04 time=12.5s
[ViT(h=4,Conv,LearnPE)] Epoch 02: train_loss=2.4714 acc=0.2342 | val_loss=2.3210 acc=0.2867 | lr=2.93e-04 time=12.4s
[ViT(h=4,Conv,LearnPE)] Epoch 03: train_loss=2.3021 acc=0.2840 | val_loss=2.1329 acc=0.3487 | lr=2.84e-04 time=12.4s
[ViT(h=4,Conv,LearnPE)] Epoch 04: train_loss=2.1856 acc=0.3225 | val_loss=2.0623 acc=0.3813 | lr=2.71e-04 time=12.4s
[ViT(h=4,Conv,LearnPE)] Epoch 05: train_loss=2.0888 acc=0.3552 | val_loss=2.0112 acc=0.3913 | lr=2.56e-04 time=12.4s
[ViT(h=4,Conv,LearnPE)] Epoch 06: train_loss=1.9898 acc=0.3846 | val_loss=1.9069 acc=0.4247 | lr=2.38e-04 time=12.2s
[ViT(h=4,Conv,LearnPE)] Epoch 07: train_loss=1.8984 acc=0.4189 | val_loss=1.8694 acc=0.4360 | lr=2.18e-04 time=12.3s
[ViT(h=4,Conv,LearnPE)] Epoch 08: train_loss=1.8370 acc=0.4431 | val_loss=1.8197 acc=0.4427 | lr=1.96e-04 time=12.1s
[ViT(h=4,Conv,LearnPE)] Epoch 09: train_loss=1.7636 acc=0.4598 |

  scaler = torch.cuda.amp.GradScaler(enabled=(AMP and device.startswith("cuda")))
  with torch.cuda.amp.autocast(enabled=(AMP and device.startswith("cuda"))):


[ViT_heads=2] Epoch 01: train_loss=2.6449 acc=0.1822 | val_loss=2.4156 acc=0.2460 | lr=2.98e-04 time=12.2s
[ViT_heads=2] Epoch 02: train_loss=2.3963 acc=0.2571 | val_loss=2.2147 acc=0.3360 | lr=2.93e-04 time=12.1s
[ViT_heads=2] Epoch 03: train_loss=2.2411 acc=0.3060 | val_loss=2.0743 acc=0.3713 | lr=2.84e-04 time=12.0s
[ViT_heads=2] Epoch 04: train_loss=2.1522 acc=0.3334 | val_loss=2.0201 acc=0.3833 | lr=2.71e-04 time=12.1s
[ViT_heads=2] Epoch 05: train_loss=2.0646 acc=0.3638 | val_loss=1.9472 acc=0.4160 | lr=2.56e-04 time=12.0s
[ViT_heads=2] Epoch 06: train_loss=1.9681 acc=0.3929 | val_loss=1.9158 acc=0.4153 | lr=2.38e-04 time=12.1s
[ViT_heads=2] Epoch 07: train_loss=1.8887 acc=0.4125 | val_loss=1.8484 acc=0.4447 | lr=2.18e-04 time=12.1s
[ViT_heads=2] Epoch 08: train_loss=1.8140 acc=0.4344 | val_loss=1.8227 acc=0.4600 | lr=1.96e-04 time=12.0s
[ViT_heads=2] Epoch 09: train_loss=1.7586 acc=0.4555 | val_loss=1.7748 acc=0.4693 | lr=1.73e-04 time=12.3s
[ViT_heads=2] Epoch 10: train_loss=1.

  scaler = torch.cuda.amp.GradScaler(enabled=(AMP and device.startswith("cuda")))
  with torch.cuda.amp.autocast(enabled=(AMP and device.startswith("cuda"))):


[ViT_heads=4] Epoch 01: train_loss=2.6377 acc=0.1824 | val_loss=2.4178 acc=0.2513 | lr=2.98e-04 time=12.3s
[ViT_heads=4] Epoch 02: train_loss=2.3879 acc=0.2664 | val_loss=2.1929 acc=0.3280 | lr=2.93e-04 time=12.2s
[ViT_heads=4] Epoch 03: train_loss=2.2400 acc=0.3102 | val_loss=2.1105 acc=0.3680 | lr=2.84e-04 time=12.1s
[ViT_heads=4] Epoch 04: train_loss=2.1253 acc=0.3420 | val_loss=2.0176 acc=0.3807 | lr=2.71e-04 time=12.2s
[ViT_heads=4] Epoch 05: train_loss=2.0409 acc=0.3716 | val_loss=1.9768 acc=0.3927 | lr=2.56e-04 time=12.0s
[ViT_heads=4] Epoch 06: train_loss=1.9664 acc=0.3920 | val_loss=1.9227 acc=0.4107 | lr=2.38e-04 time=12.2s
[ViT_heads=4] Epoch 07: train_loss=1.8974 acc=0.4167 | val_loss=1.8600 acc=0.4293 | lr=2.18e-04 time=12.1s
[ViT_heads=4] Epoch 08: train_loss=1.8065 acc=0.4391 | val_loss=1.8538 acc=0.4327 | lr=1.96e-04 time=12.1s
[ViT_heads=4] Epoch 09: train_loss=1.7643 acc=0.4560 | val_loss=1.7796 acc=0.4593 | lr=1.73e-04 time=12.2s
[ViT_heads=4] Epoch 10: train_loss=1.

  scaler = torch.cuda.amp.GradScaler(enabled=(AMP and device.startswith("cuda")))
  with torch.cuda.amp.autocast(enabled=(AMP and device.startswith("cuda"))):


[ViT_heads=6] Epoch 01: train_loss=2.6668 acc=0.1726 | val_loss=2.4655 acc=0.2267 | lr=2.98e-04 time=12.2s
[ViT_heads=6] Epoch 02: train_loss=2.4220 acc=0.2469 | val_loss=2.2570 acc=0.3020 | lr=2.93e-04 time=12.0s
[ViT_heads=6] Epoch 03: train_loss=2.2721 acc=0.2968 | val_loss=2.1247 acc=0.3467 | lr=2.84e-04 time=11.9s
[ViT_heads=6] Epoch 04: train_loss=2.1470 acc=0.3299 | val_loss=2.0004 acc=0.3933 | lr=2.71e-04 time=12.2s
[ViT_heads=6] Epoch 05: train_loss=2.0429 acc=0.3705 | val_loss=1.9971 acc=0.3820 | lr=2.56e-04 time=12.1s
[ViT_heads=6] Epoch 06: train_loss=1.9656 acc=0.3947 | val_loss=1.8787 acc=0.4340 | lr=2.38e-04 time=12.2s
[ViT_heads=6] Epoch 07: train_loss=1.8875 acc=0.4127 | val_loss=1.9292 acc=0.4120 | lr=2.18e-04 time=12.1s
[ViT_heads=6] Epoch 08: train_loss=1.8215 acc=0.4387 | val_loss=1.8562 acc=0.4520 | lr=1.96e-04 time=12.2s
[ViT_heads=6] Epoch 09: train_loss=1.7637 acc=0.4545 | val_loss=1.8131 acc=0.4553 | lr=1.73e-04 time=12.2s
[ViT_heads=6] Epoch 10: train_loss=1.

  scaler = torch.cuda.amp.GradScaler(enabled=(AMP and device.startswith("cuda")))
  with torch.cuda.amp.autocast(enabled=(AMP and device.startswith("cuda"))):


[ViT_heads=8] Epoch 01: train_loss=2.6507 acc=0.1799 | val_loss=2.4226 acc=0.2513 | lr=2.98e-04 time=12.2s
[ViT_heads=8] Epoch 02: train_loss=2.3858 acc=0.2585 | val_loss=2.2288 acc=0.3120 | lr=2.93e-04 time=12.0s
[ViT_heads=8] Epoch 03: train_loss=2.2481 acc=0.3052 | val_loss=2.1240 acc=0.3493 | lr=2.84e-04 time=12.1s
[ViT_heads=8] Epoch 04: train_loss=2.1272 acc=0.3408 | val_loss=2.0411 acc=0.3753 | lr=2.71e-04 time=12.2s
[ViT_heads=8] Epoch 05: train_loss=2.0259 acc=0.3719 | val_loss=1.9436 acc=0.4013 | lr=2.56e-04 time=12.0s
[ViT_heads=8] Epoch 06: train_loss=1.9429 acc=0.4022 | val_loss=1.8752 acc=0.4247 | lr=2.38e-04 time=12.1s
[ViT_heads=8] Epoch 07: train_loss=1.8563 acc=0.4226 | val_loss=1.7970 acc=0.4513 | lr=2.18e-04 time=12.0s
[ViT_heads=8] Epoch 08: train_loss=1.7786 acc=0.4484 | val_loss=1.7737 acc=0.4647 | lr=1.96e-04 time=11.9s
[ViT_heads=8] Epoch 09: train_loss=1.7168 acc=0.4740 | val_loss=1.7577 acc=0.4653 | lr=1.73e-04 time=12.0s
[ViT_heads=8] Epoch 10: train_loss=1.

  scaler = torch.cuda.amp.GradScaler(enabled=(AMP and device.startswith("cuda")))
  with torch.cuda.amp.autocast(enabled=(AMP and device.startswith("cuda"))):


[ViT_PatchConv] Epoch 01: train_loss=2.6515 acc=0.1819 | val_loss=2.4346 acc=0.2533 | lr=2.98e-04 time=12.1s
[ViT_PatchConv] Epoch 02: train_loss=2.4006 acc=0.2556 | val_loss=2.2250 acc=0.3313 | lr=2.93e-04 time=11.9s
[ViT_PatchConv] Epoch 03: train_loss=2.2569 acc=0.3000 | val_loss=2.1057 acc=0.3660 | lr=2.84e-04 time=12.0s
[ViT_PatchConv] Epoch 04: train_loss=2.1488 acc=0.3349 | val_loss=2.0319 acc=0.3853 | lr=2.71e-04 time=12.1s
[ViT_PatchConv] Epoch 05: train_loss=2.0677 acc=0.3678 | val_loss=1.9839 acc=0.4020 | lr=2.56e-04 time=12.0s
[ViT_PatchConv] Epoch 06: train_loss=1.9885 acc=0.3896 | val_loss=1.9494 acc=0.4027 | lr=2.38e-04 time=12.0s
[ViT_PatchConv] Epoch 07: train_loss=1.9292 acc=0.4056 | val_loss=1.9159 acc=0.4187 | lr=2.18e-04 time=11.8s
[ViT_PatchConv] Epoch 08: train_loss=1.8578 acc=0.4302 | val_loss=1.8348 acc=0.4447 | lr=1.96e-04 time=11.9s
[ViT_PatchConv] Epoch 09: train_loss=1.7932 acc=0.4548 | val_loss=1.8096 acc=0.4420 | lr=1.73e-04 time=11.7s
[ViT_PatchConv] Epo

  scaler = torch.cuda.amp.GradScaler(enabled=(AMP and device.startswith("cuda")))
  with torch.cuda.amp.autocast(enabled=(AMP and device.startswith("cuda"))):


[ViT_PatchLinear] Epoch 01: train_loss=2.6534 acc=0.1774 | val_loss=2.4376 acc=0.2480 | lr=2.98e-04 time=12.1s
[ViT_PatchLinear] Epoch 02: train_loss=2.4081 acc=0.2631 | val_loss=2.2341 acc=0.3200 | lr=2.93e-04 time=12.0s
[ViT_PatchLinear] Epoch 03: train_loss=2.2390 acc=0.3065 | val_loss=2.1317 acc=0.3533 | lr=2.84e-04 time=12.1s
[ViT_PatchLinear] Epoch 04: train_loss=2.1333 acc=0.3422 | val_loss=2.0288 acc=0.3973 | lr=2.71e-04 time=12.0s
[ViT_PatchLinear] Epoch 05: train_loss=2.0449 acc=0.3682 | val_loss=2.0501 acc=0.3860 | lr=2.56e-04 time=11.9s
[ViT_PatchLinear] Epoch 06: train_loss=1.9545 acc=0.4007 | val_loss=1.9421 acc=0.4167 | lr=2.38e-04 time=11.9s
[ViT_PatchLinear] Epoch 07: train_loss=1.8830 acc=0.4211 | val_loss=1.8838 acc=0.4347 | lr=2.18e-04 time=11.6s
[ViT_PatchLinear] Epoch 08: train_loss=1.8242 acc=0.4292 | val_loss=1.8564 acc=0.4293 | lr=1.96e-04 time=11.5s
[ViT_PatchLinear] Epoch 09: train_loss=1.7768 acc=0.4474 | val_loss=1.8319 acc=0.4453 | lr=1.73e-04 time=11.9s
[

  scaler = torch.cuda.amp.GradScaler(enabled=(AMP and device.startswith("cuda")))
  with torch.cuda.amp.autocast(enabled=(AMP and device.startswith("cuda"))):


[ViT_PosLearn] Epoch 01: train_loss=2.6638 acc=0.1741 | val_loss=2.4684 acc=0.2087 | lr=2.98e-04 time=12.0s
[ViT_PosLearn] Epoch 02: train_loss=2.4321 acc=0.2501 | val_loss=2.2959 acc=0.2660 | lr=2.93e-04 time=11.9s
[ViT_PosLearn] Epoch 03: train_loss=2.2853 acc=0.2931 | val_loss=2.1665 acc=0.3353 | lr=2.84e-04 time=12.0s
[ViT_PosLearn] Epoch 04: train_loss=2.1611 acc=0.3381 | val_loss=2.0354 acc=0.3793 | lr=2.71e-04 time=11.9s
[ViT_PosLearn] Epoch 05: train_loss=2.0500 acc=0.3673 | val_loss=1.9773 acc=0.4140 | lr=2.56e-04 time=11.9s
[ViT_PosLearn] Epoch 06: train_loss=1.9716 acc=0.3938 | val_loss=1.9068 acc=0.4267 | lr=2.38e-04 time=11.4s
[ViT_PosLearn] Epoch 07: train_loss=1.8936 acc=0.4175 | val_loss=1.8773 acc=0.4307 | lr=2.18e-04 time=11.8s
[ViT_PosLearn] Epoch 08: train_loss=1.8227 acc=0.4346 | val_loss=1.8413 acc=0.4447 | lr=1.96e-04 time=12.2s
[ViT_PosLearn] Epoch 09: train_loss=1.7677 acc=0.4533 | val_loss=1.7789 acc=0.4593 | lr=1.73e-04 time=12.0s
[ViT_PosLearn] Epoch 10: tra

  scaler = torch.cuda.amp.GradScaler(enabled=(AMP and device.startswith("cuda")))
  with torch.cuda.amp.autocast(enabled=(AMP and device.startswith("cuda"))):


[ViT_PosSine] Epoch 01: train_loss=2.7579 acc=0.1495 | val_loss=2.5880 acc=0.2040 | lr=2.98e-04 time=12.2s
[ViT_PosSine] Epoch 02: train_loss=2.5562 acc=0.2066 | val_loss=2.3394 acc=0.2747 | lr=2.93e-04 time=11.9s
[ViT_PosSine] Epoch 03: train_loss=2.3580 acc=0.2776 | val_loss=2.1427 acc=0.3507 | lr=2.84e-04 time=11.5s
[ViT_PosSine] Epoch 04: train_loss=2.1762 acc=0.3335 | val_loss=1.9881 acc=0.3933 | lr=2.71e-04 time=11.6s
[ViT_PosSine] Epoch 05: train_loss=2.0465 acc=0.3686 | val_loss=1.9252 acc=0.4027 | lr=2.56e-04 time=11.8s
[ViT_PosSine] Epoch 06: train_loss=1.9469 acc=0.4020 | val_loss=1.8733 acc=0.4167 | lr=2.38e-04 time=12.1s
[ViT_PosSine] Epoch 07: train_loss=1.8679 acc=0.4232 | val_loss=1.8243 acc=0.4360 | lr=2.18e-04 time=12.2s
[ViT_PosSine] Epoch 08: train_loss=1.7934 acc=0.4476 | val_loss=1.8120 acc=0.4513 | lr=1.96e-04 time=12.0s
[ViT_PosSine] Epoch 09: train_loss=1.7222 acc=0.4747 | val_loss=1.7549 acc=0.4640 | lr=1.73e-04 time=12.1s
[ViT_PosSine] Epoch 10: train_loss=1.

  scaler = torch.cuda.amp.GradScaler(enabled=(AMP and device.startswith("cuda")))
  with torch.cuda.amp.autocast(enabled=(AMP and device.startswith("cuda"))):


[ViT_PosNone] Epoch 01: train_loss=2.6679 acc=0.1741 | val_loss=2.4524 acc=0.2440 | lr=2.98e-04 time=11.5s
[ViT_PosNone] Epoch 02: train_loss=2.4476 acc=0.2467 | val_loss=2.2584 acc=0.3173 | lr=2.93e-04 time=11.4s
[ViT_PosNone] Epoch 03: train_loss=2.2751 acc=0.3005 | val_loss=2.1071 acc=0.3687 | lr=2.84e-04 time=11.7s
[ViT_PosNone] Epoch 04: train_loss=2.1642 acc=0.3278 | val_loss=2.0627 acc=0.3680 | lr=2.71e-04 time=12.1s
[ViT_PosNone] Epoch 05: train_loss=2.0661 acc=0.3602 | val_loss=1.9585 acc=0.4213 | lr=2.56e-04 time=12.1s
[ViT_PosNone] Epoch 06: train_loss=1.9707 acc=0.3888 | val_loss=1.9140 acc=0.4247 | lr=2.38e-04 time=12.0s
[ViT_PosNone] Epoch 07: train_loss=1.9055 acc=0.4162 | val_loss=1.8858 acc=0.4340 | lr=2.18e-04 time=12.1s
[ViT_PosNone] Epoch 08: train_loss=1.8332 acc=0.4325 | val_loss=1.8094 acc=0.4587 | lr=1.96e-04 time=12.1s
[ViT_PosNone] Epoch 09: train_loss=1.7848 acc=0.4474 | val_loss=1.7955 acc=0.4627 | lr=1.73e-04 time=12.1s
[ViT_PosNone] Epoch 10: train_loss=1.