# Adapter Tuning Grid Search
This notebook runs a **small grid search** for ViT adapter tuning on CIFAR-100 under course-style constraints (≤5 epochs).

It evaluates configurations over adapter bottleneck dimension, learning rate, and (optionally) LayerNorm tuning, and summarizes results in a table.

# Install Dependencies

In [None]:

!pip -q install timm==0.9.16 torchvision --upgrade


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.7/2.2 MB[0m [31m21.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.0/8.0 MB[0m [31m124.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m899.7/899.7 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m594.3/594.3 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m110.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.0/88.0 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import time
from pathlib import Path
import random
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms
import timm

import pandas as pd


# Base configuration

In [None]:

# Settings shared by every grid-search run; the per-run grid values are
# layered on top of a copy of this dict.
base_cfg = dict(
    data_root="./data",                   # where CIFAR-100 is downloaded / cached
    batch_size=32,
    num_workers=2,
    epochs=5,                             # checked against the 5-epoch budget below
    weight_decay=0.1,                     # handed to AdamW
    model_name="vit_small_patch16_224",   # timm model identifier
    seed=42,
    use_amp=True,                         # mixed precision is only enabled on CUDA
    subset_ratio=0.5,                     # fraction of the training split that is kept
    out_dir="./grid_ckpts",               # directory that receives the results CSV
)
assert base_cfg["epochs"] <= 5


In [None]:
# Grid Search Definition

In [None]:
# Search space: one training run is launched for every combination of the
# value lists below.
grid = dict(
    # Adapter bottleneck dimension m
    adapter_dim=[32, 64],
    # Adapters are often trained with a higher LR than full fine-tuning
    lr=[3e-4, 5e-4],
    # Whether LayerNorm parameters are trained as well (often helps stability)
    train_layernorm=[True, False],
)


# Utilities

In [None]:
def set_seed(seed: int):
    """Seed Python's `random`, NumPy and PyTorch (CPU and every CUDA device) with `seed`."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

def get_device():
    """Return the string "cuda" when CUDA is available, otherwise "cpu"."""
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"

def count_params(model):
    """Count parameter elements of `model`.

    Returns:
        (trainable, total): number of elements in parameters that have
        `requires_grad` set, and number of elements in all parameters.
    """
    n_trainable = 0
    n_all = 0
    for param in model.parameters():
        size = param.numel()
        n_all += size
        if param.requires_grad:
            n_trainable += size
    return n_trainable, n_all


# Data loaders (CIFAR-100 -> 224x224 for ViT)


In [None]:
def get_dataloaders(data_root, batch_size, num_workers, subset_ratio=1.0, seed=42):
    """Build CIFAR-100 train and test loaders with images brought to 224x224.

    Args:
        data_root: directory the torchvision CIFAR-100 dataset is read from
            (it is downloaded there when missing).
        batch_size: batch size used by both loaders.
        num_workers: DataLoader worker count used by both loaders.
        subset_ratio: fraction of the training split to keep. Values below 1.0
            select a random subset (at least one sample); values of 1.0 or
            more keep the full training split.
        seed: seed of the NumPy RNG that picks the training subset.

    Returns:
        (train_loader, test_loader). The second loader iterates the CIFAR-100
        *test* split (train=False) without shuffling.

    Raises:
        ValueError: if `subset_ratio` is not strictly positive.
    """
    # Validate before any download/IO. A bare `assert` would vanish under
    # `python -O`, and a negative ratio would then silently slice the
    # permutation from the end instead of failing.
    if not subset_ratio > 0.0:
        raise ValueError(f"subset_ratio must be > 0, got {subset_ratio!r}")

    # Per-channel normalization constants (these match the commonly quoted
    # CIFAR-100 statistics).
    mean = (0.5071, 0.4867, 0.4408)
    std = (0.2675, 0.2565, 0.2761)

    train_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])

    test_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])

    train_set = datasets.CIFAR100(root=data_root, train=True, download=True, transform=train_transform)
    test_set  = datasets.CIFAR100(root=data_root, train=False, download=True, transform=test_transform)

    if subset_ratio < 1.0:
        rng = np.random.RandomState(seed)
        # Keep at least one sample: a tiny ratio would otherwise truncate to
        # k == 0 and produce an empty training set.
        k = max(1, int(len(train_set) * subset_ratio))
        indices = rng.permutation(len(train_set))[:k].tolist()
        train_set = Subset(train_set, indices)
        print(f"Using subset of train set: {k} samples ({subset_ratio*100:.1f}% of 50k)")

    # Pinned host memory only helps host-to-GPU copies; requesting it without
    # CUDA just triggers a warning.
    pin_memory = torch.cuda.is_available()

    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True,
                              num_workers=num_workers, pin_memory=pin_memory)
    test_loader  = DataLoader(test_set, batch_size=batch_size, shuffle=False,
                              num_workers=num_workers, pin_memory=pin_memory)
    return train_loader, test_loader


# Adapter injection

In [None]:
class BottleneckAdapter(nn.Module):
    """Residual bottleneck adapter: `x + dropout(up(GELU(down(x))))`.

    `down` projects from `dim` to `bottleneck` features and `up` projects
    back to `dim`. The up-projection is zero-initialised, so a freshly built
    adapter returns its input unchanged.
    """

    def __init__(self, dim: int, bottleneck: int = 64, dropout: float = 0.0):
        super().__init__()
        self.down = nn.Linear(dim, bottleneck)
        self.act = nn.GELU()
        self.up = nn.Linear(bottleneck, dim)
        self.drop = nn.Dropout(dropout)

        # Zero the up-projection so the residual branch contributes nothing at start.
        for tensor in (self.up.weight, self.up.bias):
            nn.init.zeros_(tensor)

    def forward(self, x):
        hidden = self.act(self.down(x))
        residual = self.drop(self.up(hidden))
        return x + residual

def _wrap_block_forward(blk, original_forward):
    """Return a forward that runs `original_forward` and then `blk.adapter`."""

    def forward_with_adapter(x, *args, **kwargs):
        # Extra arguments are passed through to the block's own forward.
        # `blk.adapter` is resolved at call time so a replaced adapter is used.
        return blk.adapter(original_forward(x, *args, **kwargs))

    return forward_with_adapter


def inject_adapters_timm_vit(model: nn.Module, bottleneck: int = 64, dropout: float = 0.0):
    """Attach a `BottleneckAdapter` after every block in `model.blocks`.

    Each block gets an `adapter` submodule, keeps its previous forward under
    `_forward_orig`, and has `forward` replaced by a wrapper that applies the
    adapter to the block output.

    Calling this again on an already-injected model only replaces the adapter
    modules. Re-wrapping would make the wrapper call itself through
    `_forward_orig` and recurse without end.

    Args:
        model: module exposing `.blocks`; the feature width is read from
            `model.embed_dim`, falling back to the first block's `norm1`.
        bottleneck: hidden width of each adapter.
        dropout: dropout probability inside each adapter.

    Returns:
        The same `model` instance, modified in place.

    Raises:
        AssertionError: if `model` has no `.blocks` attribute.
    """
    assert hasattr(model, "blocks"), "Model does not have .blocks; are you sure it's a timm ViT?"
    dim = getattr(model, "embed_dim", None)
    if dim is None:
        dim = model.blocks[0].norm1.normalized_shape[0]

    for blk in model.blocks:
        # `_forward_orig` lives in the instance dict once a block has been wrapped.
        already_wrapped = "_forward_orig" in vars(blk)
        blk.adapter = BottleneckAdapter(dim, bottleneck=bottleneck, dropout=dropout)
        if already_wrapped:
            continue

        blk._forward_orig = blk.forward
        blk.forward = _wrap_block_forward(blk, blk._forward_orig)

    return model

def create_adapter_model(model_name: str, num_classes: int, adapter_dim: int):
    """Create a pretrained timm model and inject adapters into its blocks.

    Args:
        model_name: timm model identifier.
        num_classes: size of the classification head requested from timm.
        adapter_dim: bottleneck width of the injected adapters (dropout is 0.0).

    Returns:
        Whatever `inject_adapters_timm_vit` returns for the created model.
    """
    backbone = timm.create_model(model_name, pretrained=True, num_classes=num_classes)
    return inject_adapters_timm_vit(backbone, bottleneck=adapter_dim, dropout=0.0)

def apply_adapter_freeze(model: nn.Module, train_layernorm: bool = False):
    """Freeze `model` except for its head and adapters (optionally norms too).

    All parameters are first marked non-trainable. Then the parameters of
    `model.head` and of every `blk.adapter` in `model.blocks` are re-enabled.
    With `train_layernorm=True`, every parameter whose name contains "norm"
    (case-insensitive) is re-enabled as well.
    """
    for param in model.parameters():
        param.requires_grad_(False)

    # Classification head plus the adapter of each block.
    to_train = list(model.head.parameters())
    for blk in model.blocks:
        to_train.extend(blk.adapter.parameters())

    # Optional: parameters selected purely by the "norm" substring in their name.
    if train_layernorm:
        to_train.extend(
            param for name, param in model.named_parameters() if "norm" in name.lower()
        )

    for param in to_train:
        param.requires_grad_(True)


# Train/Eval

In [None]:
def train_one_epoch(model, loader, criterion, optimizer, device, scaler=None):
    """Train `model` for one pass over `loader`.

    Args:
        model: module to train; it is switched to train mode.
        loader: iterable yielding `(images, targets)` batches.
        criterion: loss called as `criterion(outputs, targets)`.
        optimizer: optimizer stepped once per batch.
        device: device the batches are moved to.
        scaler: optional gradient scaler. When given, the forward pass runs
            under CUDA autocast and the scaler drives backward/step/update;
            when None, plain full-precision training is used.

    Returns:
        (loss, accuracy): loss averaged per sample and top-1 accuracy in
        percent over all samples seen.
    """
    model.train()
    running_loss, correct, total = 0.0, 0, 0
    for images, targets in loader:
        images = images.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)

        if scaler is not None:
            # torch.autocast(device_type="cuda") is the supported spelling of
            # the deprecated torch.cuda.amp.autocast(), which emits a
            # FutureWarning on recent PyTorch releases.
            with torch.autocast(device_type="cuda"):
                outputs = model(images)
                loss = criterion(outputs, targets)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            outputs = model(images)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

        # Weight the batch loss by batch size so the final value is a per-sample mean.
        running_loss += loss.item() * images.size(0)
        preds = outputs.argmax(dim=1)
        total += targets.size(0)
        correct += (preds == targets).sum().item()

    return running_loss / total, 100.0 * correct / total

@torch.no_grad()
def evaluate(model, loader, criterion, device):
    """Evaluate `model` on `loader` with gradients disabled.

    The model is switched to eval mode. Returns `(loss, accuracy)`: the loss
    averaged per sample and the top-1 accuracy in percent.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    for images, targets in loader:
        images = images.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        logits = model(images)
        batch_loss = criterion(logits, targets)

        # Batch loss is weighted by batch size to get a per-sample mean at the end.
        loss_sum += batch_loss.item() * images.size(0)
        hits = logits.argmax(dim=1) == targets
        n_seen += targets.size(0)
        n_correct += hits.sum().item()

    mean_loss = loss_sum / n_seen
    accuracy = 100.0 * n_correct / n_seen
    return mean_loss, accuracy


# Run Grid Search

In [None]:
from itertools import product

set_seed(base_cfg["seed"])
device = get_device()
print("Using device:", device)

# NOTE: the second loader returned by get_dataloaders wraps the CIFAR-100
# *test* split, so the "val" numbers below are test-set accuracies and the
# selected configuration is chosen on that same split.
train_loader, val_loader = get_dataloaders(
    base_cfg["data_root"],
    base_cfg["batch_size"],
    base_cfg["num_workers"],
    subset_ratio=base_cfg["subset_ratio"],
    seed=base_cfg["seed"],
)

out_dir = Path(base_cfg["out_dir"])
out_dir.mkdir(parents=True, exist_ok=True)

criterion = nn.CrossEntropyLoss()

results = []
keys = list(grid.keys())
values = [grid[k] for k in keys]

run_id = 0
t0_all = time.time()

# One training run per element of the Cartesian product of the grid values.
for combo in product(*values):
    run_id += 1
    cfg_run = dict(base_cfg)
    cfg_run.update(dict(zip(keys, combo)))

    print("\n" + "="*70)
    print(f"Run {run_id}: " + ", ".join([f"{k}={cfg_run[k]}" for k in keys]))

    # Re-seed at the start of every run so all configurations start from the
    # same RNG state. Seeding only once before the loop would give each run a
    # different initialisation and batch order, confounding the comparison.
    set_seed(cfg_run["seed"])

    # Build model
    model = create_adapter_model(cfg_run["model_name"], num_classes=100, adapter_dim=cfg_run["adapter_dim"]).to(device)
    apply_adapter_freeze(model, train_layernorm=cfg_run["train_layernorm"])
    trainable, total = count_params(model)
    print(f"Trainable params: {trainable}/{total} ({100*trainable/total:.2f}%)")

    # Only the parameters left trainable by the freeze are optimised.
    optimizer = torch.optim.AdamW(
        [p for p in model.parameters() if p.requires_grad],
        lr=cfg_run["lr"], weight_decay=cfg_run["weight_decay"]
    )
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=cfg_run["epochs"])
    # torch.amp.GradScaler("cuda") replaces the deprecated torch.cuda.amp.GradScaler().
    scaler = torch.amp.GradScaler("cuda") if (cfg_run["use_amp"] and device == "cuda") else None

    best_acc = 0.0
    best_epoch = 0
    t0 = time.time()
    for epoch in range(1, cfg_run["epochs"] + 1):
        train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, device, scaler)
        val_loss, val_acc = evaluate(model, val_loader, criterion, device)
        scheduler.step()
        if val_acc > best_acc:
            best_acc = val_acc
            best_epoch = epoch

        print(f"  Epoch {epoch}: train_acc={train_acc:.2f} val_acc={val_acc:.2f}")

    elapsed_min = (time.time() - t0) / 60.0

    # Record this run's metrics only; no model checkpoint is written.
    # The swept hyper-parameters are taken from `keys`, so a newly added grid
    # dimension shows up in the results table automatically.
    results.append({
        "run": run_id,
        **{k: cfg_run[k] for k in keys},
        "epochs": cfg_run["epochs"],
        "subset_ratio": cfg_run["subset_ratio"],
        "trainable_params": trainable,
        "total_params": total,
        "trainable_pct": 100*trainable/total,
        "best_val_acc": best_acc,
        "best_epoch": best_epoch,
        "time_min": elapsed_min,
    })

# Best configuration first.
df = pd.DataFrame(results).sort_values("best_val_acc", ascending=False).reset_index(drop=True)
print("\nGrid search finished in %.1f min" % ((time.time()-t0_all)/60.0))
df


Using device: cuda


100%|██████████| 169M/169M [00:03<00:00, 53.5MB/s]


Using subset of train set: 25000 samples (50.0% of 50k)

Run 1: adapter_dim=32, lr=0.0003, train_layernorm=True


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/88.2M [00:00<?, ?B/s]

Trainable params: 357604/22004068 (1.63%)


  scaler = torch.cuda.amp.GradScaler() if (cfg_run["use_amp"] and device == "cuda") else None
  with torch.cuda.amp.autocast():


  Epoch 1: train_acc=63.33 val_acc=86.15
  Epoch 2: train_acc=77.36 val_acc=87.45
  Epoch 3: train_acc=79.92 val_acc=88.79
  Epoch 4: train_acc=81.71 val_acc=89.34
  Epoch 5: train_acc=82.56 val_acc=89.38

Run 2: adapter_dim=32, lr=0.0005, train_layernorm=True
Trainable params: 357604/22004068 (1.63%)
  Epoch 1: train_acc=66.42 val_acc=86.42
  Epoch 2: train_acc=77.75 val_acc=87.85
  Epoch 3: train_acc=80.01 val_acc=88.10
  Epoch 4: train_acc=82.20 val_acc=89.58
  Epoch 5: train_acc=83.61 val_acc=89.91

Run 3: adapter_dim=64, lr=0.0003, train_layernorm=True
Trainable params: 652900/22299364 (2.93%)
  Epoch 1: train_acc=64.00 val_acc=86.23
  Epoch 2: train_acc=77.80 val_acc=87.27
  Epoch 3: train_acc=80.29 val_acc=88.64
  Epoch 4: train_acc=81.94 val_acc=89.42
  Epoch 5: train_acc=83.55 val_acc=89.90

Run 4: adapter_dim=64, lr=0.0005, train_layernorm=True
Trainable params: 652900/22299364 (2.93%)
  Epoch 1: train_acc=66.65 val_acc=86.11
  Epoch 2: train_acc=77.98 val_acc=87.96
  Epoch 3

Unnamed: 0,run,adapter_dim,lr,train_layernorm,epochs,subset_ratio,trainable_params,total_params,trainable_pct,best_val_acc,best_epoch,time_min
0,4,64,0.0005,True,5,0.5,652900,22299364,2.927886,90.05,5,3.85225
1,2,32,0.0005,True,5,0.5,357604,22004068,1.625172,89.91,5,3.808922
2,3,64,0.0003,True,5,0.5,652900,22299364,2.927886,89.9,5,3.819445
3,1,32,0.0003,True,5,0.5,357604,22004068,1.625172,89.38,5,3.810183


# Best Configuration

In [None]:
# Persist the sorted results table next to the other run artefacts.
csv_path = Path(base_cfg["out_dir"]).joinpath("adapter_grid_results.csv")
df.to_csv(csv_path, index=False)
print("Saved:", csv_path)

# The table is sorted by best_val_acc descending, so row 0 is the winner.
best = df.iloc[0].to_dict()
print("\nBest config:")
for field, value in best.items():
    print(f"  {field}: {value}")


Saved: grid_ckpts/adapter_grid_results.csv

Best config:
  run: 4
  adapter_dim: 64
  lr: 0.0005
  train_layernorm: True
  epochs: 5
  subset_ratio: 0.5
  trainable_params: 652900
  total_params: 22299364
  trainable_pct: 2.927886194422406
  best_val_acc: 90.05
  best_epoch: 5
  time_min: 3.852249503135681
