In [None]:
import argparse
import time
from pathlib import Path

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms
import numpy as np
import timm


In [None]:
from peft import LoraConfig, get_peft_model

In [None]:
def parse_args():
    """Parse the fine-tuning hyper-parameters from the command line.

    Unrecognised arguments are ignored (``parse_known_args``), so the function
    also works when extra arguments are present on ``sys.argv``.

    Returns:
        argparse.Namespace with the attributes ``data_root``, ``batch_size``,
        ``num_workers``, ``epochs``, ``lr``, ``weight_decay``, ``model_name``,
        ``out_dir``, ``seed``, ``use_amp`` and ``subset_ratio``.
    """
    parser = argparse.ArgumentParser(
        description="ViT fine-tuning on CIFAR-100 with time/epoch constraints"
    )

    # (flag, type, default) for every option that takes a value, listed in the
    # order the options are registered on the parser.
    leading_options = (
        ("--data_root", str, "./data"),
        ("--batch_size", int, 32),
        ("--num_workers", int, 4),
        ("--epochs", int, 5),
        ("--lr", float, 5e-5),
        ("--weight_decay", float, 0.1),
        ("--model_name", str, "vit_small_patch16_224"),
        ("--out_dir", str, "./ckpt_vit_cifar100"),
        ("--seed", int, 42),
    )
    for flag, value_type, default_value in leading_options:
        parser.add_argument(flag, type=value_type, default=default_value)

    # Boolean switch: mixed precision is off unless the flag is given.
    parser.add_argument("--use_amp", action="store_true")
    parser.add_argument("--subset_ratio", type=float, default=1.0)

    known_args, _ignored = parser.parse_known_args()
    return known_args


In [None]:


def set_seed(seed: int):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices).

    Args:
        seed: Integer seed handed unchanged to every random number generator.
    """
    import random

    # Same generators, same order: Python, NumPy, torch CPU, torch CUDA.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)


In [None]:
def get_dataloaders(data_root, batch_size, num_workers,
                    subset_ratio=1.0, seed=42):
    """Build the CIFAR-100 train and test DataLoaders at 224x224 resolution.

    Args:
        data_root: Directory where torchvision stores/downloads CIFAR-100.
        batch_size: Batch size used by both loaders.
        num_workers: Number of DataLoader worker processes for both loaders.
        subset_ratio: Fraction of the training set to keep. Values >= 1.0 use
            the full training set; values in (0, 1) select a random subset.
        seed: Seed for the subset selection only.

    Returns:
        Tuple ``(train_loader, test_loader)``. The train loader shuffles, the
        test loader does not.

    Raises:
        ValueError: If ``subset_ratio`` is below 1.0 but not strictly positive.
    """
    # CIFAR-100: 32x32 -> resize 224x224 for ViT
    # Normalization statistics (the commonly quoted CIFAR-100 channel values).
    mean = (0.5071, 0.4867, 0.4408)
    std = (0.2675, 0.2565, 0.2761)

    train_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])

    # Evaluation uses a deterministic resize + center crop (no augmentation).
    test_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])

    train_set = datasets.CIFAR100(
        root=data_root, train=True, download=True, transform=train_transform
    )
    test_set = datasets.CIFAR100(
        root=data_root, train=False, download=True, transform=test_transform
    )

    if subset_ratio < 1.0:
        # Explicit check instead of `assert`: asserts vanish under `python -O`,
        # and a negative ratio would then silently slice with a negative index.
        if not subset_ratio > 0.0:
            raise ValueError(
                f"subset_ratio must be in (0, 1], got {subset_ratio!r}"
            )
        num_train = len(train_set)
        # A private RandomState yields the same permutation as seeding the
        # global NumPy RNG with `seed`, without clobbering global RNG state.
        rng = np.random.RandomState(seed)
        # Keep at least one sample so the shuffling sampler never sees an
        # empty dataset when the ratio is very small.
        k = max(1, int(num_train * subset_ratio))
        indices = rng.permutation(num_train)[:k].tolist()
        train_set = Subset(train_set, indices)
        print(f"Using subset of train set: {k} samples "
              f"({subset_ratio*100:.1f}% of 50k)")

    train_loader = DataLoader(
        train_set, batch_size=batch_size, shuffle=True,
        num_workers=num_workers, pin_memory=True
    )
    test_loader = DataLoader(
        test_set, batch_size=batch_size, shuffle=False,
        num_workers=num_workers, pin_memory=True
    )
    return train_loader, test_loader


In [None]:
def create_model(model_name: str, num_classes: int = 100):
    """Create a pretrained timm model and wrap it with LoRA adapters.

    Args:
        model_name: timm model identifier passed to ``timm.create_model``.
        num_classes: Number of output classes requested for the classifier.

    Returns:
        The PEFT-wrapped model returned by ``get_peft_model``. The summary of
        trainable parameters is printed as a side effect.
    """
    # 1. Create the pretrained ViT with a classifier sized for `num_classes`.
    model = timm.create_model(
        model_name,
        pretrained=True,
        num_classes=num_classes
    )

    # 2. LoRA configuration.
    lora_config = LoraConfig(
        r=8,                     # LoRA rank
        lora_alpha=32,
        lora_dropout=0.1,
        bias="none",
        # NOTE(review): these names are matched as module-name suffixes, so
        # "proj" presumably also matches the patch-embedding projection in
        # timm ViTs, not only the attention output projection -- confirm
        # whether that is intended.
        target_modules=[
             "qkv",                # attention QKV projection
             "proj",               # attention output projection
             "fc1",                # MLP
             "fc2",
         ],
        # PEFT freezes every parameter that is not part of an adapter. The
        # classification head is new for this task, so it must be trained in
        # full; listing it here keeps it trainable and saved with the adapter.
        # Assumes the classifier attribute is named `head` (timm ViT naming);
        # adjust for other architectures.
        modules_to_save=["head"],
    )

    # 3. Inject the LoRA adapters.
    model = get_peft_model(model, lora_config)

    # 4. Print the share of trainable parameters (worth keeping as a sanity
    #    check that both the adapters and the head are trainable).
    model.print_trainable_parameters()

    return model

In [None]:
def train_one_epoch(model, loader, criterion, optimizer,
                    device, epoch, scaler=None):
    """Train ``model`` for one pass over ``loader`` and report the results.

    Args:
        model: Module to optimise; switched to training mode.
        loader: Iterable of ``(images, targets)`` batches.
        criterion: Loss callable applied to ``(outputs, targets)``. Its value
            is weighted by the batch size, i.e. it is treated as a batch mean.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches are moved to.
        epoch: Epoch number, used only in the log messages.
        scaler: Optional gradient scaler. When given, the forward pass runs
            under CUDA autocast and the scaler drives backward/step.

    Returns:
        Tuple ``(epoch_loss, acc)``: sample-weighted mean loss and accuracy in
        percent over all samples seen.
    """
    model.train()
    mixed_precision = scaler is not None
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    t_begin = time.time()

    for step, (images, targets) in enumerate(loader, start=1):
        images = images.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        optimizer.zero_grad()

        if mixed_precision:
            # Forward in reduced precision; the scaler handles the scaled
            # backward pass and the optimizer step.
            with torch.cuda.amp.autocast():
                outputs = model(images)
                loss = criterion(outputs, targets)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            outputs = model(images)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

        batch_loss = loss.item()
        loss_sum += batch_loss * images.size(0)
        predictions = outputs.max(1)[1]
        n_seen += targets.size(0)
        n_correct += predictions.eq(targets).sum().item()

        # Progress line every 50 optimisation steps.
        if step % 50 == 0:
            print(f"Epoch [{epoch}] Step [{step}/{len(loader)}] "
                  f"Loss: {batch_loss:.4f}")

    epoch_loss = loss_sum / n_seen
    acc = n_correct / n_seen * 100.0
    elapsed = time.time() - t_begin
    print(f"Train Epoch {epoch}: Loss {epoch_loss:.4f}, "
          f"Acc {acc:.2f}%, Time {elapsed/60:.2f} min")
    return epoch_loss, acc



In [None]:
def evaluate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Args:
        model: Module to evaluate; switched to eval mode.
        loader: Iterable of ``(images, targets)`` batches.
        criterion: Loss callable applied to ``(outputs, targets)``. Its value
            is weighted by the batch size, i.e. it is treated as a batch mean.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(epoch_loss, acc)``: sample-weighted mean loss and accuracy in
        percent over all samples seen.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    with torch.no_grad():
        for images, targets in loader:
            images = images.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)

            outputs = model(images)
            batch_loss = criterion(outputs, targets).item()

            loss_sum += batch_loss * images.size(0)
            predictions = outputs.max(1)[1]
            n_seen += targets.size(0)
            n_correct += predictions.eq(targets).sum().item()

    epoch_loss = loss_sum / n_seen
    acc = n_correct / n_seen * 100.0
    print(f"Val : Loss {epoch_loss:.4f}, Acc {acc:.2f}%")
    return epoch_loss, acc

In [None]:
def main():
    """Run the fine-tuning job end to end and keep the best checkpoint.

    Steps: parse arguments, seed the RNGs, build the data loaders and the
    model, then train for ``args.epochs`` epochs with AdamW and a cosine
    learning-rate schedule. After every epoch the model is evaluated, and
    whenever the validation accuracy improves a checkpoint is written to
    ``<out_dir>/best_vit_finetune.pth``.

    Raises:
        AssertionError: If more than 5 epochs are requested.
    """
    args = parse_args()
    assert args.epochs <= 5, "Requirement: epochs must be <= 5"

    set_seed(args.seed)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using device:", device)

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    ckpt_path = out_dir / "best_vit_finetune.pth"

    train_loader, val_loader = get_dataloaders(
        args.data_root,
        args.batch_size,
        args.num_workers,
        subset_ratio=args.subset_ratio,
        seed=args.seed,
    )

    model = create_model(args.model_name, num_classes=100)
    model.to(device)

    criterion = nn.CrossEntropyLoss()

    # Only parameters that still require gradients are handed to the optimizer.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(
        trainable_params,
        lr=args.lr,
        weight_decay=args.weight_decay,
    )
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=args.epochs
    )

    # Gradient scaling is only set up when AMP was requested and CUDA is used.
    amp_enabled = args.use_amp and device == "cuda"
    scaler = torch.cuda.amp.GradScaler() if amp_enabled else None

    best_acc = 0.0
    for epoch in range(1, args.epochs + 1):
        train_one_epoch(
            model, train_loader, criterion, optimizer, device, epoch, scaler
        )
        val_loss, val_acc = evaluate(model, val_loader, criterion, device)
        scheduler.step()

        # Nothing to save unless this epoch strictly improved on the best.
        if not val_acc > best_acc:
            continue

        best_acc = val_acc
        checkpoint = {
            "epoch": epoch,
            "model_state": model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
            "best_acc": best_acc,
        }
        torch.save(checkpoint, ckpt_path)
        print(f"Saved best model to {ckpt_path} "
              f"(acc={best_acc:.2f}%)")

    print(f"Training finished. Best val acc: {best_acc:.2f}%")


# Entry point: run the whole training pipeline. The guard also passes when this
# cell is executed in a notebook kernel, so running the cell starts training.
if __name__ == "__main__":
    main()

Using device: cuda


100%|██████████| 169M/169M [00:05<00:00, 29.3MB/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/88.2M [00:00<?, ?B/s]

trainable params: 599,040 || all params: 22,303,204 || trainable%: 2.6859
Epoch [1] Step [50/1563] Loss: 4.8672
Epoch [1] Step [100/1563] Loss: 4.7109
Epoch [1] Step [150/1563] Loss: 4.8424
Epoch [1] Step [200/1563] Loss: 4.6571
Epoch [1] Step [250/1563] Loss: 4.3957
Epoch [1] Step [300/1563] Loss: 4.4307
Epoch [1] Step [350/1563] Loss: 3.9899
Epoch [1] Step [400/1563] Loss: 3.7926
Epoch [1] Step [450/1563] Loss: 3.4201
Epoch [1] Step [500/1563] Loss: 3.4343
Epoch [1] Step [550/1563] Loss: 3.1962
Epoch [1] Step [600/1563] Loss: 2.8784
Epoch [1] Step [650/1563] Loss: 2.6581
Epoch [1] Step [700/1563] Loss: 2.2470
Epoch [1] Step [750/1563] Loss: 2.4545
Epoch [1] Step [800/1563] Loss: 2.4275
Epoch [1] Step [850/1563] Loss: 2.5353
Epoch [1] Step [900/1563] Loss: 2.1814
Epoch [1] Step [950/1563] Loss: 1.9161
Epoch [1] Step [1000/1563] Loss: 2.0632
Epoch [1] Step [1050/1563] Loss: 2.3556
Epoch [1] Step [1100/1563] Loss: 2.0974
Epoch [1] Step [1150/1563] Loss: 2.4786
Epoch [1] Step [1200/1563]