# Name - Arnav Samal (NIT Rourkela)

Q1 — Vision Transformer on CIFAR-10 (PyTorch)
Goal. Implement a ViT and train on CIFAR-10 (10 classes). Your objective is to achieve the highest
possible test accuracy. You are free to try various improvements and tricks to improve performance.

- Note: You must only use Colab
- Paper: An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale (Dosovitskiy
et al., ICLR 2021).

Requirements
- Patchify images, add learnable positional embeddings, prepend a CLS token, stack
Transformer encoder blocks (MHSA + MLP with residual + norm), classify from CLS.
- In README.md, include: how to run in Colab, the config for your best model, and a tiny
results table (Overall classification test accuracy).

(Bonus — optional analysis)
Any concise analysis earns bonus marks (e.g., patch size choices, depth/width trade-offs,
augmentation effects, optimizer/schedule variants, overlapping vs. non-overlapping patches, etc.).
Keep it short and crisp. The analysis should also be the part of the README.md

# Imports & Installations

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms
import math
import copy
import time
import gc
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
config = {
    "img_size": 32,
    "patch_size": 4,
    "in_channels": 3,
    "num_classes": 10,
    "embed_dim": 256,
    "num_heads": 8,
    "mlp_dim": 512,
    "num_layers": 6,
    "dropout": 0.1,
    "lr": 3e-4,
    "batch_size": 512,
    "epochs": 20
}

# Model Building

In [3]:
class PatchEmbedding(nn.Module):
    def __init__(self, img_size, patch_size, in_channels, embed_dim, overlap=0):
        super().__init__()
        self.patch_size = patch_size
        self.num_patches = ((img_size - overlap) // (patch_size - overlap)) ** 2
        self.proj = nn.Conv2d(in_channels, embed_dim,
                              kernel_size=patch_size, stride=patch_size - overlap)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, self.num_patches + 1, embed_dim))

    def forward(self, x):
        B = x.size(0)
        x = self.proj(x).flatten(2).transpose(1, 2)
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x = x + self.pos_embed
        return x

In [None]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, mlp_dim, dropout):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout, batch_first=True)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, mlp_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_dim, embed_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        x = x + self.attn(self.norm1(x), self.norm1(x), self.norm1(x))[0]
        x = x + self.mlp(self.norm2(x))
        return x

In [5]:
class ViT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.patch_embed = PatchEmbedding(config["img_size"], config["patch_size"],
                                          config["in_channels"], config["embed_dim"])
        self.transformer = nn.Sequential(
            *[TransformerBlock(config["embed_dim"], config["num_heads"], config["mlp_dim"], config["dropout"])
              for _ in range(config["num_layers"])]
        )
        self.norm = nn.LayerNorm(config["embed_dim"])
        self.head = nn.Linear(config["embed_dim"], config["num_classes"])

    def forward(self, x):
        x = self.patch_embed(x)
        x = self.transformer(x)
        cls_token = self.norm(x[:, 0])
        x = self.head(cls_token)
        return x

# Data Loading

In [6]:
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261))
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261))
])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
trainloader = DataLoader(trainset, batch_size=config["batch_size"], shuffle=True, num_workers=2, pin_memory=False)

testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)
testloader = DataLoader(testset, batch_size=config["batch_size"], shuffle=False, num_workers=2, pin_memory=False)

100%|██████████| 170M/170M [00:03<00:00, 53.8MB/s]


In [None]:
def train(model, trainloader, testloader, config, optimizer=None):
    criterion = nn.CrossEntropyLoss().to(device)
    model.to(device)

    if optimizer is None:
        optimizer = torch.optim.AdamW(model.parameters(), lr=config["lr"])

    for epoch in range(config["epochs"]):
        model.train()
        running_loss = 0.0
        for imgs, labels in tqdm(trainloader):
            imgs, labels = imgs.to(device, non_blocking=True), labels.to(device, non_blocking=True)

            optimizer.zero_grad(set_to_none=True)  
            outputs = model(imgs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            del imgs, labels, outputs, loss
            torch.cuda.empty_cache()

        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for imgs, labels in testloader:
                imgs, labels = imgs.to(device, non_blocking=True), labels.to(device, non_blocking=True)
                outputs = model(imgs)
                _, predicted = outputs.max(1)

                total += labels.size(0)
                correct += predicted.eq(labels).sum().item()

                del imgs, labels, outputs, predicted 
            torch.cuda.empty_cache()

        acc = 100.0 * correct / total
        print(f"Epoch {epoch+1}, Loss: {running_loss/len(trainloader):.4f}, Test Acc: {acc:.2f}%")

    return float(acc)

model = ViT(config)
train(model, trainloader, testloader, config)

model.to("cpu") 
del model         
gc.collect()      
torch.cuda.empty_cache() 

100%|██████████| 98/98 [00:38<00:00,  2.57it/s]


Epoch 1, Loss: 1.8573, Test Acc: 38.72%


100%|██████████| 98/98 [00:36<00:00,  2.65it/s]


Epoch 2, Loss: 1.5661, Test Acc: 47.03%


100%|██████████| 98/98 [00:36<00:00,  2.65it/s]


Epoch 3, Loss: 1.3925, Test Acc: 52.37%


100%|██████████| 98/98 [00:36<00:00,  2.65it/s]


Epoch 4, Loss: 1.2830, Test Acc: 55.77%


100%|██████████| 98/98 [00:36<00:00,  2.65it/s]


Epoch 5, Loss: 1.1926, Test Acc: 56.88%


100%|██████████| 98/98 [00:37<00:00,  2.65it/s]


Epoch 6, Loss: 1.1300, Test Acc: 60.31%


100%|██████████| 98/98 [00:36<00:00,  2.65it/s]


Epoch 7, Loss: 1.0837, Test Acc: 61.84%


100%|██████████| 98/98 [00:36<00:00,  2.65it/s]


Epoch 8, Loss: 1.0305, Test Acc: 64.30%


100%|██████████| 98/98 [00:36<00:00,  2.65it/s]


Epoch 9, Loss: 0.9901, Test Acc: 65.93%


100%|██████████| 98/98 [00:36<00:00,  2.65it/s]


Epoch 10, Loss: 0.9526, Test Acc: 67.27%


100%|██████████| 98/98 [00:37<00:00,  2.65it/s]


Epoch 11, Loss: 0.9225, Test Acc: 68.75%


100%|██████████| 98/98 [00:37<00:00,  2.65it/s]


Epoch 12, Loss: 0.8916, Test Acc: 68.15%


100%|██████████| 98/98 [00:36<00:00,  2.65it/s]


Epoch 13, Loss: 0.8569, Test Acc: 69.27%


100%|██████████| 98/98 [00:36<00:00,  2.65it/s]


Epoch 14, Loss: 0.8332, Test Acc: 70.17%


100%|██████████| 98/98 [00:37<00:00,  2.65it/s]


Epoch 15, Loss: 0.8021, Test Acc: 71.25%


100%|██████████| 98/98 [00:37<00:00,  2.65it/s]


Epoch 16, Loss: 0.7849, Test Acc: 72.96%


100%|██████████| 98/98 [00:37<00:00,  2.65it/s]


Epoch 17, Loss: 0.7496, Test Acc: 72.72%


100%|██████████| 98/98 [00:36<00:00,  2.65it/s]


Epoch 18, Loss: 0.7378, Test Acc: 73.74%


100%|██████████| 98/98 [00:37<00:00,  2.65it/s]


Epoch 19, Loss: 0.7106, Test Acc: 74.07%


100%|██████████| 98/98 [00:37<00:00,  2.64it/s]


Epoch 20, Loss: 0.6945, Test Acc: 73.50%


# Extra Analysis

In [None]:
def get_optimizer(model, optimizer_type="AdamW", lr=3e-4):
    if optimizer_type == "AdamW":
        return torch.optim.AdamW(model.parameters(), lr=lr)
    else:  
        return torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
def run_experiment(config, desc="", trainloader_override=None, optimizer_type="AdamW", overlap=0):
    print(f"\n=== Experiment: {desc} ===")

    model = ViT(config).to(device)
    model.patch_embed = PatchEmbedding(
        img_size=config["img_size"],
        patch_size=config["patch_size"],
        in_channels=config["in_channels"],
        embed_dim=config["embed_dim"],
        overlap=overlap
    ).to(device)

    optimizer = get_optimizer(model, optimizer_type, lr=config["lr"])
    loader = trainloader_override if trainloader_override else trainloader
    acc = train(model, loader, testloader, config, optimizer=optimizer)
    model.to("cpu")   
    del optimizer
    del model         
    gc.collect()      
    torch.cuda.empty_cache() 
    print(torch.cuda.memory_summary())
    return acc

In [10]:
config["patch_size"] = 4
config["num_layers"] = 6
config["embed_dim"] = 256
baseline_acc = run_experiment(config, "Baseline: patch=4, layers=6, embed=256")

config["patch_size"] = 8
patch8_acc = run_experiment(config, "Patch size 8x8")

config["num_layers"] = 4
config["patch_size"] = 4
shallow_acc = run_experiment(config, "Shallow: layers=4")

config["embed_dim"] = 512
wide_acc = run_experiment(config, "Wider embedding: 512")

trainset_no_aug = torchvision.datasets.CIFAR10(root='./data', train=True, download=True,
                                               transform=transform_test)
trainloader_no_aug = DataLoader(trainset_no_aug, batch_size=config["batch_size"], shuffle=True)
no_aug_acc = run_experiment(config, "No data augmentation", trainloader_override=trainloader_no_aug)

experiments = [
    {"patch_size":8, "overlap":0, "optimizer":"AdamW", "desc":"Baseline, non-overlapping, AdamW"},
    {"patch_size":8, "overlap":4, "optimizer":"AdamW", "desc":"Overlapping patches, AdamW"},
    {"patch_size":8, "overlap":0, "optimizer":"Adam", "desc":"Non-overlapping, Adam"},
]

results = {}
for exp in experiments:
    config["patch_size"] = exp["patch_size"]
    acc = run_experiment(config, exp["desc"], optimizer_type=exp["optimizer"], overlap=exp["overlap"])
    results[exp["desc"]] = acc


=== Experiment: Baseline: patch=4, layers=6, embed=256 ===


100%|██████████| 98/98 [00:37<00:00,  2.64it/s]


Epoch 1, Loss: 1.8546, Test Acc: 38.10%


100%|██████████| 98/98 [00:37<00:00,  2.65it/s]


Epoch 2, Loss: 1.5657, Test Acc: 46.35%


100%|██████████| 98/98 [00:37<00:00,  2.65it/s]


Epoch 3, Loss: 1.3959, Test Acc: 51.81%


100%|██████████| 98/98 [00:37<00:00,  2.65it/s]


Epoch 4, Loss: 1.2898, Test Acc: 54.62%


100%|██████████| 98/98 [00:36<00:00,  2.65it/s]


Epoch 5, Loss: 1.2168, Test Acc: 56.42%


100%|██████████| 98/98 [00:36<00:00,  2.65it/s]


Epoch 6, Loss: 1.1422, Test Acc: 59.74%


100%|██████████| 98/98 [00:36<00:00,  2.65it/s]


Epoch 7, Loss: 1.0911, Test Acc: 62.30%


100%|██████████| 98/98 [00:36<00:00,  2.65it/s]


Epoch 8, Loss: 1.0390, Test Acc: 64.21%


100%|██████████| 98/98 [00:37<00:00,  2.65it/s]


Epoch 9, Loss: 1.0047, Test Acc: 64.73%


100%|██████████| 98/98 [00:36<00:00,  2.65it/s]


Epoch 10, Loss: 0.9636, Test Acc: 65.84%


100%|██████████| 98/98 [00:37<00:00,  2.65it/s]


Epoch 11, Loss: 0.9358, Test Acc: 68.34%


100%|██████████| 98/98 [00:36<00:00,  2.65it/s]


Epoch 12, Loss: 0.8984, Test Acc: 69.07%


100%|██████████| 98/98 [00:37<00:00,  2.65it/s]


Epoch 13, Loss: 0.8734, Test Acc: 68.73%


100%|██████████| 98/98 [00:36<00:00,  2.65it/s]


Epoch 14, Loss: 0.8486, Test Acc: 70.70%


100%|██████████| 98/98 [00:37<00:00,  2.65it/s]


Epoch 15, Loss: 0.8022, Test Acc: 71.39%


100%|██████████| 98/98 [00:36<00:00,  2.65it/s]


Epoch 16, Loss: 0.7977, Test Acc: 71.11%


100%|██████████| 98/98 [00:36<00:00,  2.65it/s]


Epoch 17, Loss: 0.7617, Test Acc: 72.62%


100%|██████████| 98/98 [00:36<00:00,  2.65it/s]


Epoch 18, Loss: 0.7428, Test Acc: 72.34%


100%|██████████| 98/98 [00:36<00:00,  2.65it/s]


Epoch 19, Loss: 0.7168, Test Acc: 73.64%


100%|██████████| 98/98 [00:36<00:00,  2.65it/s]


Epoch 20, Loss: 0.6977, Test Acc: 73.38%
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  16640 KiB |   4027 MiB |  61341 GiB |  61341 GiB |
|       from large pool |  16640 KiB |   3984 MiB |  61104 GiB |  61104 GiB |
|       from small pool |      0 KiB |     60 MiB |    236 GiB |    236 GiB |
|---------------------------------------------------------------------------|
| Active memory         |  16640 KiB |   4027 MiB |  61341 GiB |  61341 GiB |
|       from large pool |  16640 KiB |   3984 MiB |  61104 GiB |  61104 GiB |
|       from small pool |      0 KiB |     60 MiB |    236 GiB |    236 GiB |
|----------------------

100%|██████████| 98/98 [00:12<00:00,  7.85it/s]


Epoch 1, Loss: 1.8886, Test Acc: 35.89%


100%|██████████| 98/98 [00:12<00:00,  7.81it/s]


Epoch 2, Loss: 1.6717, Test Acc: 40.50%


100%|██████████| 98/98 [00:12<00:00,  7.91it/s]


Epoch 3, Loss: 1.5561, Test Acc: 45.80%


100%|██████████| 98/98 [00:12<00:00,  7.77it/s]


Epoch 4, Loss: 1.4671, Test Acc: 47.22%


100%|██████████| 98/98 [00:12<00:00,  7.76it/s]


Epoch 5, Loss: 1.3974, Test Acc: 48.66%


100%|██████████| 98/98 [00:12<00:00,  7.71it/s]


Epoch 6, Loss: 1.3480, Test Acc: 52.40%


100%|██████████| 98/98 [00:12<00:00,  7.67it/s]


Epoch 7, Loss: 1.2938, Test Acc: 53.66%


100%|██████████| 98/98 [00:12<00:00,  7.83it/s]


Epoch 8, Loss: 1.2516, Test Acc: 56.65%


100%|██████████| 98/98 [00:12<00:00,  7.70it/s]


Epoch 9, Loss: 1.2182, Test Acc: 57.04%


100%|██████████| 98/98 [00:12<00:00,  7.58it/s]


Epoch 10, Loss: 1.1813, Test Acc: 57.57%


100%|██████████| 98/98 [00:12<00:00,  7.72it/s]


Epoch 11, Loss: 1.1575, Test Acc: 59.43%


100%|██████████| 98/98 [00:12<00:00,  7.63it/s]


Epoch 12, Loss: 1.1256, Test Acc: 61.11%


100%|██████████| 98/98 [00:12<00:00,  7.69it/s]


Epoch 13, Loss: 1.0950, Test Acc: 61.82%


100%|██████████| 98/98 [00:12<00:00,  7.67it/s]


Epoch 14, Loss: 1.0673, Test Acc: 62.55%


100%|██████████| 98/98 [00:12<00:00,  7.62it/s]


Epoch 15, Loss: 1.0461, Test Acc: 62.45%


100%|██████████| 98/98 [00:12<00:00,  7.71it/s]


Epoch 16, Loss: 1.0184, Test Acc: 64.32%


100%|██████████| 98/98 [00:12<00:00,  7.83it/s]


Epoch 17, Loss: 1.0028, Test Acc: 64.82%


100%|██████████| 98/98 [00:12<00:00,  7.78it/s]


Epoch 18, Loss: 0.9749, Test Acc: 65.54%


100%|██████████| 98/98 [00:12<00:00,  7.79it/s]


Epoch 19, Loss: 0.9590, Test Acc: 66.44%


100%|██████████| 98/98 [00:12<00:00,  7.66it/s]


Epoch 20, Loss: 0.9371, Test Acc: 65.40%
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  16640 KiB |   4027 MiB |  68805 GiB |  68805 GiB |
|       from large pool |  16640 KiB |   3984 MiB |  68451 GiB |  68451 GiB |
|       from small pool |      0 KiB |     61 MiB |    353 GiB |    353 GiB |
|---------------------------------------------------------------------------|
| Active memory         |  16640 KiB |   4027 MiB |  68805 GiB |  68805 GiB |
|       from large pool |  16640 KiB |   3984 MiB |  68451 GiB |  68451 GiB |
|       from small pool |      0 KiB |     61 MiB |    353 GiB |    353 GiB |
|----------------------

100%|██████████| 98/98 [00:25<00:00,  3.85it/s]


Epoch 1, Loss: 1.8519, Test Acc: 38.19%


100%|██████████| 98/98 [00:25<00:00,  3.86it/s]


Epoch 2, Loss: 1.5373, Test Acc: 46.77%


100%|██████████| 98/98 [00:25<00:00,  3.85it/s]


Epoch 3, Loss: 1.3825, Test Acc: 50.81%


100%|██████████| 98/98 [00:25<00:00,  3.85it/s]


Epoch 4, Loss: 1.2789, Test Acc: 57.27%


100%|██████████| 98/98 [00:25<00:00,  3.85it/s]


Epoch 5, Loss: 1.1899, Test Acc: 58.44%


100%|██████████| 98/98 [00:25<00:00,  3.85it/s]


Epoch 6, Loss: 1.1418, Test Acc: 61.50%


100%|██████████| 98/98 [00:25<00:00,  3.85it/s]


Epoch 7, Loss: 1.0972, Test Acc: 62.81%


100%|██████████| 98/98 [00:25<00:00,  3.84it/s]


Epoch 8, Loss: 1.0439, Test Acc: 64.08%


100%|██████████| 98/98 [00:25<00:00,  3.85it/s]


Epoch 9, Loss: 1.0117, Test Acc: 65.18%


100%|██████████| 98/98 [00:25<00:00,  3.85it/s]


Epoch 10, Loss: 0.9699, Test Acc: 64.93%


100%|██████████| 98/98 [00:25<00:00,  3.85it/s]


Epoch 11, Loss: 0.9485, Test Acc: 67.10%


100%|██████████| 98/98 [00:25<00:00,  3.85it/s]


Epoch 12, Loss: 0.9190, Test Acc: 67.44%


100%|██████████| 98/98 [00:25<00:00,  3.85it/s]


Epoch 13, Loss: 0.8865, Test Acc: 68.38%


100%|██████████| 98/98 [00:25<00:00,  3.86it/s]


Epoch 14, Loss: 0.8608, Test Acc: 70.18%


100%|██████████| 98/98 [00:25<00:00,  3.85it/s]


Epoch 15, Loss: 0.8369, Test Acc: 70.36%


100%|██████████| 98/98 [00:25<00:00,  3.85it/s]


Epoch 16, Loss: 0.8148, Test Acc: 71.47%


100%|██████████| 98/98 [00:25<00:00,  3.85it/s]


Epoch 17, Loss: 0.7973, Test Acc: 71.54%


100%|██████████| 98/98 [00:25<00:00,  3.85it/s]


Epoch 18, Loss: 0.7733, Test Acc: 72.27%


100%|██████████| 98/98 [00:25<00:00,  3.85it/s]


Epoch 19, Loss: 0.7542, Test Acc: 73.23%


100%|██████████| 98/98 [00:25<00:00,  3.85it/s]


Epoch 20, Loss: 0.7414, Test Acc: 73.06%
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  16640 KiB |   4027 MiB |  89425 GiB |  89425 GiB |
|       from large pool |  16640 KiB |   3984 MiB |  88990 GiB |  88990 GiB |
|       from small pool |      0 KiB |     61 MiB |    435 GiB |    435 GiB |
|---------------------------------------------------------------------------|
| Active memory         |  16640 KiB |   4027 MiB |  89425 GiB |  89425 GiB |
|       from large pool |  16640 KiB |   3984 MiB |  88990 GiB |  88990 GiB |
|       from small pool |      0 KiB |     61 MiB |    435 GiB |    435 GiB |
|----------------------

100%|██████████| 98/98 [00:43<00:00,  2.24it/s]


Epoch 1, Loss: 1.8920, Test Acc: 35.77%


100%|██████████| 98/98 [00:43<00:00,  2.23it/s]


Epoch 2, Loss: 1.5463, Test Acc: 48.34%


100%|██████████| 98/98 [00:43<00:00,  2.23it/s]


Epoch 3, Loss: 1.3864, Test Acc: 52.03%


100%|██████████| 98/98 [00:43<00:00,  2.23it/s]


Epoch 4, Loss: 1.2722, Test Acc: 56.21%


100%|██████████| 98/98 [00:43<00:00,  2.23it/s]


Epoch 5, Loss: 1.1917, Test Acc: 59.37%


100%|██████████| 98/98 [00:43<00:00,  2.23it/s]


Epoch 6, Loss: 1.1251, Test Acc: 59.81%


100%|██████████| 98/98 [00:43<00:00,  2.24it/s]


Epoch 7, Loss: 1.0765, Test Acc: 62.99%


100%|██████████| 98/98 [00:43<00:00,  2.24it/s]


Epoch 8, Loss: 1.0287, Test Acc: 64.61%


100%|██████████| 98/98 [00:43<00:00,  2.24it/s]


Epoch 9, Loss: 0.9939, Test Acc: 63.43%


100%|██████████| 98/98 [00:43<00:00,  2.24it/s]


Epoch 10, Loss: 0.9534, Test Acc: 66.24%


100%|██████████| 98/98 [00:43<00:00,  2.24it/s]


Epoch 11, Loss: 0.9221, Test Acc: 68.31%


100%|██████████| 98/98 [00:43<00:00,  2.24it/s]


Epoch 12, Loss: 0.8941, Test Acc: 67.89%


100%|██████████| 98/98 [00:43<00:00,  2.23it/s]


Epoch 13, Loss: 0.8685, Test Acc: 68.91%


100%|██████████| 98/98 [00:43<00:00,  2.23it/s]


Epoch 14, Loss: 0.8405, Test Acc: 69.38%


100%|██████████| 98/98 [00:43<00:00,  2.24it/s]


Epoch 15, Loss: 0.8278, Test Acc: 70.55%


100%|██████████| 98/98 [00:43<00:00,  2.24it/s]


Epoch 16, Loss: 0.8020, Test Acc: 71.40%


100%|██████████| 98/98 [00:43<00:00,  2.24it/s]


Epoch 17, Loss: 0.7752, Test Acc: 71.99%


100%|██████████| 98/98 [00:43<00:00,  2.24it/s]


Epoch 18, Loss: 0.7618, Test Acc: 71.56%


100%|██████████| 98/98 [00:43<00:00,  2.24it/s]


Epoch 19, Loss: 0.7372, Test Acc: 73.38%


100%|██████████| 98/98 [00:43<00:00,  2.23it/s]


Epoch 20, Loss: 0.7205, Test Acc: 73.40%
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  16640 KiB |   4230 MiB | 121423 GiB | 121423 GiB |
|       from large pool |  16640 KiB |   4188 MiB | 120868 GiB | 120868 GiB |
|       from small pool |      0 KiB |     61 MiB |    555 GiB |    555 GiB |
|---------------------------------------------------------------------------|
| Active memory         |  16640 KiB |   4230 MiB | 121423 GiB | 121423 GiB |
|       from large pool |  16640 KiB |   4188 MiB | 120868 GiB | 120868 GiB |
|       from small pool |      0 KiB |     61 MiB |    555 GiB |    555 GiB |
|----------------------

100%|██████████| 98/98 [00:54<00:00,  1.80it/s]


Epoch 1, Loss: 1.8034, Test Acc: 41.38%


100%|██████████| 98/98 [00:54<00:00,  1.80it/s]


Epoch 2, Loss: 1.4549, Test Acc: 50.88%


100%|██████████| 98/98 [00:54<00:00,  1.81it/s]


Epoch 3, Loss: 1.2560, Test Acc: 55.60%


100%|██████████| 98/98 [00:54<00:00,  1.81it/s]


Epoch 4, Loss: 1.1203, Test Acc: 58.52%


100%|██████████| 98/98 [00:54<00:00,  1.81it/s]


Epoch 5, Loss: 1.0057, Test Acc: 61.16%


100%|██████████| 98/98 [00:54<00:00,  1.81it/s]


Epoch 6, Loss: 0.9115, Test Acc: 62.48%


100%|██████████| 98/98 [00:54<00:00,  1.81it/s]


Epoch 7, Loss: 0.8288, Test Acc: 63.02%


100%|██████████| 98/98 [00:54<00:00,  1.81it/s]


Epoch 8, Loss: 0.7429, Test Acc: 63.90%


100%|██████████| 98/98 [00:54<00:00,  1.81it/s]


Epoch 9, Loss: 0.6610, Test Acc: 64.01%


100%|██████████| 98/98 [00:54<00:00,  1.81it/s]


Epoch 10, Loss: 0.5852, Test Acc: 64.48%


100%|██████████| 98/98 [00:53<00:00,  1.82it/s]


Epoch 11, Loss: 0.5231, Test Acc: 63.56%


100%|██████████| 98/98 [00:54<00:00,  1.81it/s]


Epoch 12, Loss: 0.4522, Test Acc: 63.69%


100%|██████████| 98/98 [00:54<00:00,  1.80it/s]


Epoch 13, Loss: 0.3944, Test Acc: 65.05%


100%|██████████| 98/98 [00:54<00:00,  1.81it/s]


Epoch 14, Loss: 0.3459, Test Acc: 64.53%


100%|██████████| 98/98 [00:54<00:00,  1.80it/s]


Epoch 15, Loss: 0.2842, Test Acc: 64.63%


100%|██████████| 98/98 [00:54<00:00,  1.81it/s]


Epoch 16, Loss: 0.2685, Test Acc: 63.90%


100%|██████████| 98/98 [00:54<00:00,  1.80it/s]


Epoch 17, Loss: 0.2207, Test Acc: 65.00%


100%|██████████| 98/98 [00:54<00:00,  1.80it/s]


Epoch 18, Loss: 0.2018, Test Acc: 64.51%


100%|██████████| 98/98 [00:54<00:00,  1.81it/s]


Epoch 19, Loss: 0.1782, Test Acc: 64.44%


100%|██████████| 98/98 [00:54<00:00,  1.80it/s]


Epoch 20, Loss: 0.1665, Test Acc: 63.94%
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  16640 KiB |   4230 MiB | 153422 GiB | 153422 GiB |
|       from large pool |  16640 KiB |   4188 MiB | 152747 GiB | 152747 GiB |
|       from small pool |      0 KiB |     61 MiB |    675 GiB |    675 GiB |
|---------------------------------------------------------------------------|
| Active memory         |  16640 KiB |   4230 MiB | 153422 GiB | 153422 GiB |
|       from large pool |  16640 KiB |   4188 MiB | 152747 GiB | 152747 GiB |
|       from small pool |      0 KiB |     61 MiB |    675 GiB |    675 GiB |
|----------------------

100%|██████████| 98/98 [00:13<00:00,  7.32it/s]


Epoch 1, Loss: 1.9215, Test Acc: 35.03%


100%|██████████| 98/98 [00:12<00:00,  7.58it/s]


Epoch 2, Loss: 1.6512, Test Acc: 42.47%


100%|██████████| 98/98 [00:13<00:00,  7.33it/s]


Epoch 3, Loss: 1.5312, Test Acc: 44.56%


100%|██████████| 98/98 [00:13<00:00,  7.30it/s]


Epoch 4, Loss: 1.4492, Test Acc: 47.54%


100%|██████████| 98/98 [00:13<00:00,  7.40it/s]


Epoch 5, Loss: 1.3852, Test Acc: 50.02%


100%|██████████| 98/98 [00:13<00:00,  7.41it/s]


Epoch 6, Loss: 1.3264, Test Acc: 52.05%


100%|██████████| 98/98 [00:13<00:00,  7.25it/s]


Epoch 7, Loss: 1.2948, Test Acc: 53.62%


100%|██████████| 98/98 [00:13<00:00,  7.28it/s]


Epoch 8, Loss: 1.2583, Test Acc: 56.44%


100%|██████████| 98/98 [00:13<00:00,  7.43it/s]


Epoch 9, Loss: 1.2200, Test Acc: 57.62%


100%|██████████| 98/98 [00:13<00:00,  7.50it/s]


Epoch 10, Loss: 1.1887, Test Acc: 57.33%


100%|██████████| 98/98 [00:13<00:00,  7.31it/s]


Epoch 11, Loss: 1.1613, Test Acc: 59.48%


100%|██████████| 98/98 [00:13<00:00,  7.32it/s]


Epoch 12, Loss: 1.1402, Test Acc: 59.81%


100%|██████████| 98/98 [00:13<00:00,  7.32it/s]


Epoch 13, Loss: 1.1074, Test Acc: 60.59%


100%|██████████| 98/98 [00:12<00:00,  7.54it/s]


Epoch 14, Loss: 1.0839, Test Acc: 60.63%


100%|██████████| 98/98 [00:12<00:00,  7.55it/s]


Epoch 15, Loss: 1.0637, Test Acc: 61.73%


100%|██████████| 98/98 [00:13<00:00,  7.47it/s]


Epoch 16, Loss: 1.0354, Test Acc: 63.08%


100%|██████████| 98/98 [00:13<00:00,  7.39it/s]


Epoch 17, Loss: 1.0188, Test Acc: 63.16%


100%|██████████| 98/98 [00:12<00:00,  7.62it/s]


Epoch 18, Loss: 1.0017, Test Acc: 64.59%


100%|██████████| 98/98 [00:13<00:00,  7.54it/s]


Epoch 19, Loss: 0.9802, Test Acc: 64.93%


100%|██████████| 98/98 [00:13<00:00,  7.33it/s]


Epoch 20, Loss: 0.9653, Test Acc: 66.10%
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  16640 KiB |   4230 MiB | 162178 GiB | 162178 GiB |
|       from large pool |  16640 KiB |   4188 MiB | 161383 GiB | 161383 GiB |
|       from small pool |      0 KiB |     62 MiB |    794 GiB |    794 GiB |
|---------------------------------------------------------------------------|
| Active memory         |  16640 KiB |   4230 MiB | 162178 GiB | 162178 GiB |
|       from large pool |  16640 KiB |   4188 MiB | 161383 GiB | 161383 GiB |
|       from small pool |      0 KiB |     62 MiB |    794 GiB |    794 GiB |
|----------------------

100%|██████████| 98/98 [00:32<00:00,  3.00it/s]


Epoch 1, Loss: 1.7958, Test Acc: 43.98%


100%|██████████| 98/98 [00:32<00:00,  2.98it/s]


Epoch 2, Loss: 1.4608, Test Acc: 51.36%


100%|██████████| 98/98 [00:32<00:00,  2.98it/s]


Epoch 3, Loss: 1.3213, Test Acc: 54.50%


100%|██████████| 98/98 [00:32<00:00,  2.99it/s]


Epoch 4, Loss: 1.2255, Test Acc: 57.22%


100%|██████████| 98/98 [00:32<00:00,  2.98it/s]


Epoch 5, Loss: 1.1593, Test Acc: 60.10%


100%|██████████| 98/98 [00:32<00:00,  2.97it/s]


Epoch 6, Loss: 1.0978, Test Acc: 62.92%


100%|██████████| 98/98 [00:32<00:00,  2.98it/s]


Epoch 7, Loss: 1.0588, Test Acc: 62.39%


100%|██████████| 98/98 [00:32<00:00,  2.99it/s]


Epoch 8, Loss: 1.0196, Test Acc: 64.46%


100%|██████████| 98/98 [00:32<00:00,  2.98it/s]


Epoch 9, Loss: 0.9901, Test Acc: 65.71%


100%|██████████| 98/98 [00:32<00:00,  2.99it/s]


Epoch 10, Loss: 0.9543, Test Acc: 66.96%


100%|██████████| 98/98 [00:32<00:00,  2.98it/s]


Epoch 11, Loss: 0.9326, Test Acc: 66.77%


100%|██████████| 98/98 [00:32<00:00,  2.98it/s]


Epoch 12, Loss: 0.9102, Test Acc: 68.29%


100%|██████████| 98/98 [00:32<00:00,  2.98it/s]


Epoch 13, Loss: 0.8804, Test Acc: 67.90%


100%|██████████| 98/98 [00:32<00:00,  2.98it/s]


Epoch 14, Loss: 0.8517, Test Acc: 69.24%


100%|██████████| 98/98 [00:33<00:00,  2.97it/s]


Epoch 15, Loss: 0.8368, Test Acc: 69.50%


100%|██████████| 98/98 [00:32<00:00,  2.98it/s]


Epoch 16, Loss: 0.8143, Test Acc: 70.94%


100%|██████████| 98/98 [00:32<00:00,  2.98it/s]


Epoch 17, Loss: 0.7889, Test Acc: 69.92%


100%|██████████| 98/98 [00:32<00:00,  2.98it/s]


Epoch 18, Loss: 0.7764, Test Acc: 71.49%


100%|██████████| 98/98 [00:32<00:00,  2.98it/s]


Epoch 19, Loss: 0.7568, Test Acc: 71.93%


100%|██████████| 98/98 [00:32<00:00,  2.98it/s]


Epoch 20, Loss: 0.7401, Test Acc: 72.54%
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  16640 KiB |   4230 MiB | 186421 GiB | 186421 GiB |
|       from large pool |  16640 KiB |   4188 MiB | 185508 GiB | 185508 GiB |
|       from small pool |      0 KiB |     62 MiB |    913 GiB |    913 GiB |
|---------------------------------------------------------------------------|
| Active memory         |  16640 KiB |   4230 MiB | 186421 GiB | 186421 GiB |
|       from large pool |  16640 KiB |   4188 MiB | 185508 GiB | 185508 GiB |
|       from small pool |      0 KiB |     62 MiB |    913 GiB |    913 GiB |
|----------------------

100%|██████████| 98/98 [00:13<00:00,  7.49it/s]


Epoch 1, Loss: 1.8855, Test Acc: 36.71%


100%|██████████| 98/98 [00:13<00:00,  7.53it/s]


Epoch 2, Loss: 1.6479, Test Acc: 42.15%


100%|██████████| 98/98 [00:13<00:00,  7.36it/s]


Epoch 3, Loss: 1.5201, Test Acc: 47.13%


100%|██████████| 98/98 [00:12<00:00,  7.64it/s]


Epoch 4, Loss: 1.4362, Test Acc: 48.82%


100%|██████████| 98/98 [00:13<00:00,  7.48it/s]


Epoch 5, Loss: 1.3719, Test Acc: 50.79%


100%|██████████| 98/98 [00:13<00:00,  7.46it/s]


Epoch 6, Loss: 1.3332, Test Acc: 52.26%


100%|██████████| 98/98 [00:13<00:00,  7.52it/s]


Epoch 7, Loss: 1.2797, Test Acc: 54.59%


100%|██████████| 98/98 [00:13<00:00,  7.43it/s]


Epoch 8, Loss: 1.2414, Test Acc: 56.04%


100%|██████████| 98/98 [00:12<00:00,  7.56it/s]


Epoch 9, Loss: 1.2128, Test Acc: 57.33%


100%|██████████| 98/98 [00:13<00:00,  7.46it/s]


Epoch 10, Loss: 1.1861, Test Acc: 58.67%


100%|██████████| 98/98 [00:13<00:00,  7.51it/s]


Epoch 11, Loss: 1.1486, Test Acc: 59.36%


100%|██████████| 98/98 [00:13<00:00,  7.48it/s]


Epoch 12, Loss: 1.1295, Test Acc: 61.04%


100%|██████████| 98/98 [00:12<00:00,  7.64it/s]


Epoch 13, Loss: 1.0925, Test Acc: 61.48%


100%|██████████| 98/98 [00:13<00:00,  7.45it/s]


Epoch 14, Loss: 1.0661, Test Acc: 62.66%


100%|██████████| 98/98 [00:13<00:00,  7.43it/s]


Epoch 15, Loss: 1.0501, Test Acc: 63.20%


100%|██████████| 98/98 [00:13<00:00,  7.32it/s]


Epoch 16, Loss: 1.0292, Test Acc: 63.79%


100%|██████████| 98/98 [00:13<00:00,  7.53it/s]


Epoch 17, Loss: 1.0052, Test Acc: 64.63%


100%|██████████| 98/98 [00:12<00:00,  7.57it/s]


Epoch 18, Loss: 0.9832, Test Acc: 65.14%


100%|██████████| 98/98 [00:13<00:00,  7.41it/s]


Epoch 19, Loss: 0.9721, Test Acc: 65.26%


100%|██████████| 98/98 [00:12<00:00,  7.58it/s]


Epoch 20, Loss: 0.9497, Test Acc: 65.61%
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  16640 KiB |   4230 MiB | 195177 GiB | 195177 GiB |
|       from large pool |  16640 KiB |   4188 MiB | 194144 GiB | 194144 GiB |
|       from small pool |      0 KiB |     62 MiB |   1032 GiB |   1032 GiB |
|---------------------------------------------------------------------------|
| Active memory         |  16640 KiB |   4230 MiB | 195177 GiB | 195177 GiB |
|       from large pool |  16640 KiB |   4188 MiB | 194144 GiB | 194144 GiB |
|       from small pool |      0 KiB |     62 MiB |   1032 GiB |   1032 GiB |
|----------------------

In [11]:
print("\n----- Summary of All Experiments -----")
summary = {
    "Baseline (4x4, 6 layers, 256 embed)": baseline_acc,
    "Patch size 8x8": patch8_acc,
    "Shallow (layers=4)": shallow_acc,
    "Wider embedding (512)": wide_acc,
    "No data augmentation": no_aug_acc,
}
summary.update(results)

for k, v in summary.items():
    print(f"{k}: {v:.2f}%")


----- Summary of All Experiments -----
Baseline (4x4, 6 layers, 256 embed): 73.38%
Patch size 8x8: 65.40%
Shallow (layers=4): 73.06%
Wider embedding (512): 73.40%
No data augmentation: 63.94%
Baseline, non-overlapping, AdamW: 66.10%
Overlapping patches, AdamW: 72.54%
Non-overlapping, Adam: 65.61%
