<a href="https://colab.research.google.com/github/aumkeshchaudhary/Hybrid_CNN-ViT_CIFAR-100-/blob/main/Hybrid_CNN%2BViT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# installs & imports
!pip install -q torch torchvision tqdm einops

import math, os, random, time
from pathlib import Path
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torchvision import transforms, datasets
from torchvision.transforms import AutoAugmentPolicy

from einops import rearrange

In [None]:
# config
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("device:", device)

# Single dictionary of hyper-parameters shared by the model and training cells.
cfg = dict(
    # input / task
    image_size=32,
    patch_size=4,
    in_channels=3,
    num_classes=100,
    # transformer size
    emb_dim=384,            # smaller width; author's note: regularizes better
    num_heads=6,
    depth=8,
    mlp_ratio=4.0,
    # regularization
    drop=0.1,
    drop_path=0.1,          # stochastic depth rate (author's note: important)
    # optimization
    batch_size=128,
    epochs=200,             # author's note: 200-300 is enough with cosine LR
    lr=3e-4,
    weight_decay=0.05,
    warmup_epochs=5,
    label_smoothing=0.1,
    ema_decay=0.9999,
    seed=42,
)

device: cuda


In [None]:
# reproducibility: seed Python's `random` module and PyTorch from the one
# seed stored in the config so both RNG streams start from a known state.
random.seed(cfg["seed"])
torch.manual_seed(cfg["seed"])

In [None]:
# Per-channel statistics passed to Normalize below
# (presumably computed on the CIFAR-100 training set — TODO confirm).
mean = (0.5071, 0.4867, 0.4408)
std  = (0.2675, 0.2565, 0.2761)

# Training-time augmentation; Compose applies the transforms in the order listed.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(32, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(0.4, 0.4, 0.4, 0.1),
    transforms.AutoAugment(policy=AutoAugmentPolicy.CIFAR10),  # CIFAR10 policy is reused for this CIFAR-100 data
    transforms.ToTensor(),  # ToTensor is deliberately placed before RandomErasing
    # NOTE(review): erasing runs before Normalize, so the erased fill values are
    # normalized as well — confirm this ordering is intended.
    transforms.RandomErasing(p=0.25, value='random'),
    transforms.Normalize(mean, std),
])

# Evaluation uses no augmentation: tensor conversion + the same normalization.
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])

# Datasets are stored under ./data and downloaded on first use.
train_ds = datasets.CIFAR100(root="data", train=True, download=True, transform=train_transform)
test_ds  = datasets.CIFAR100(root="data", train=False, download=True, transform=test_transform)

# Training batches are shuffled and sized from cfg; evaluation uses a fixed batch of 256 in dataset order.
train_loader = DataLoader(train_ds, batch_size=cfg["batch_size"], shuffle=True, num_workers=2, pin_memory=True)
test_loader  = DataLoader(test_ds, batch_size=256, shuffle=False, num_workers=2, pin_memory=True)

100%|██████████| 169M/169M [00:15<00:00, 10.8MB/s]


In [None]:
# ViT model implementation
# --- Conv stem (replace PatchEmbed) ---
class ConvPatchEmbed(nn.Module):
    """Convolutional stem that turns an image into a sequence of tokens.

    Three 3x3 Conv -> BatchNorm -> ReLU stages. The first keeps the spatial
    resolution; the second and third each use stride 2, so the feature map is
    4x smaller than the input along each side (32x32 -> 8x8).

    Args:
        in_chans: number of channels of the input image.
        embed_dim: output channels of the last conv, i.e. the token width.
        img_size: side length of the (square) input images. Only used to
            precompute ``n_patches``; defaults to 32 so existing callers
            get the same value as before.

    Attributes:
        n_patches: number of tokens produced for an ``img_size`` x
            ``img_size`` input.
    """

    def __init__(self, in_chans=3, embed_dim=384, img_size=32):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_chans, 64, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),

            nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),

            nn.Conv2d(128, embed_dim, kernel_size=3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(embed_dim),
            nn.ReLU(inplace=True),
        )
        # A stride-2 conv with kernel 3 / padding 1 maps a side of H to
        # ceil(H / 2); two of them in a row give ceil(H / 4).
        tokens_per_side = (img_size + 3) // 4
        self.n_patches = tokens_per_side ** 2

    def forward(self, x):
        """Map images (B, C, H, W) to tokens (B, N, E), N = feature-map area."""
        x = self.conv(x)                  # (B, E, H', W') with H' = ceil(H / 4)
        x = x.flatten(2)                  # (B, E, N)
        x = x.transpose(1, 2)             # (B, N, E)
        return x

class MLP(nn.Module):
    """Position-wise feed-forward network: Linear -> GELU -> Linear.

    The same dropout module is applied after the activation and after the
    output projection. The output width always equals ``in_features``.

    Args:
        in_features: width of the input (and output) features.
        hidden_features: width of the hidden layer; a falsy value
            (``None`` or 0) falls back to ``in_features``.
        drop: dropout probability.
    """

    def __init__(self, in_features, hidden_features=None, drop=0.):
        super().__init__()
        width = hidden_features if hidden_features else in_features
        self.fc1 = nn.Linear(in_features, width)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(width, in_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Apply the two-layer network along the last dimension of ``x``."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))

class Attention(nn.Module):
    """Multi-head self-attention over a token sequence.

    Args:
        dim: token width; must be divisible by ``num_heads``.
        num_heads: number of attention heads.
        qkv_bias: whether the fused q/k/v projection has a bias term.
        attn_drop: dropout probability applied to the attention weights.
        proj_drop: dropout probability applied after the output projection.

    Raises:
        ValueError: if ``dim`` is not a multiple of ``num_heads``. Without
            this check the mismatch only shows up later as a reshape error
            inside ``forward``.
    """

    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
        super().__init__()
        if dim % num_heads != 0:
            raise ValueError(
                f"dim ({dim}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # Standard 1/sqrt(head_dim) scaling of the dot products.
        self.scale = head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim*3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        """Attend over tokens; input and output are both (B, N, C)."""
        B, N, C = x.shape
        # One projection yields q, k and v; split into (3, B, heads, N, head_dim).
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]   # each: (B, heads, N, head_dim)
        attn = (q @ k.transpose(-2, -1)) * self.scale   # (B, heads, N, N)
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)
        # Merge the heads back into the channel dimension.
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

class Block(nn.Module):
    """Pre-norm transformer encoder block.

    Two residual branches, each preceded by LayerNorm: self-attention, then
    an MLP whose hidden width is ``int(dim * mlp_ratio)``. Both branch
    outputs go through the same drop-path module before being added back.

    Args:
        dim: token width.
        num_heads: number of attention heads.
        mlp_ratio: hidden-width multiplier for the MLP.
        drop: dropout used for the attention projection and inside the MLP.
        attn_drop: dropout on the attention weights.
        drop_path: stochastic-depth rate; exactly 0 disables it (Identity).
    """

    def __init__(self, dim, num_heads, mlp_ratio=4., drop=0., attn_drop=0., drop_path=0.):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.attn = Attention(dim, num_heads=num_heads, attn_drop=attn_drop, proj_drop=drop)
        if drop_path == 0.:
            self.drop_path = nn.Identity()
        else:
            self.drop_path = _StochasticDepth(drop_path)
        self.norm2 = nn.LayerNorm(dim)
        self.mlp = MLP(dim, int(dim*mlp_ratio), drop=drop)

    def forward(self, x):
        """Apply the attention branch, then the MLP branch, each as a residual."""
        attn_branch = self.attn(self.norm1(x))
        x = x + self.drop_path(attn_branch)
        mlp_branch = self.mlp(self.norm2(x))
        return x + self.drop_path(mlp_branch)

# Simple implementation of stochastic depth
class _StochasticDepth(nn.Module):
    def __init__(self, p):
        super().__init__()
        self.p = p
    def forward(self, x):
        if not self.training or self.p == 0.:
            return x
        keep = torch.rand(x.shape[0], 1, 1, device=x.device) >= self.p
        return x * keep / (1 - self.p)

class ViT(nn.Module):
    """Hybrid CNN + Vision Transformer image classifier.

    A convolutional stem produces patch tokens, a learnable class token is
    prepended, learnable position embeddings are added, and the sequence is
    passed through ``depth`` transformer blocks. The final class token is
    layer-normalized and fed to a linear classification head.

    Args:
        cfg: mapping with the keys ``in_channels``, ``emb_dim``,
            ``image_size``, ``drop``, ``depth``, ``num_heads``,
            ``mlp_ratio``, ``num_classes`` and optionally ``drop_path``
            (defaults to 0.2). ``patch_size`` is not read: the token grid is
            determined by the conv stem, which downsamples by 4.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        # Use ConvPatchEmbed (hybrid) instead of linear patch conv with kernel=patch_size
        self.patch_embed = ConvPatchEmbed(cfg["in_channels"], emb_dim)
        # The stem contains two stride-2 convs (kernel 3, padding 1), so a
        # side of H pixels becomes ceil(H / 4) tokens. Size the position
        # embedding from the configured image size so it matches what the
        # stem actually emits (64 tokens for 32x32 inputs).
        tokens_per_side = (cfg["image_size"] + 3) // 4
        n_patches = tokens_per_side ** 2

        self.cls_token = nn.Parameter(torch.zeros(1, 1, emb_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, 1 + n_patches, emb_dim))
        self.pos_drop = nn.Dropout(p=cfg["drop"])

        # transformer blocks; the stochastic-depth rate grows linearly from
        # 0 in the first block to cfg["drop_path"] in the last one
        dpr = [x.item() for x in torch.linspace(0, cfg.get("drop_path", 0.2), cfg["depth"])]
        self.blocks = nn.ModuleList([
            Block(emb_dim, num_heads=cfg["num_heads"], mlp_ratio=cfg["mlp_ratio"], drop=cfg["drop"], drop_path=dpr[i])
            for i in range(cfg["depth"])
        ])
        self.norm = nn.LayerNorm(emb_dim)
        self.head = nn.Linear(emb_dim, cfg["num_classes"])

        # init: truncated normal for the embeddings, then per-module init
        nn.init.trunc_normal_(self.pos_embed, std=.02)
        nn.init.trunc_normal_(self.cls_token, std=.02)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Per-module init: Xavier for Linear, ones/zeros for LayerNorm, Kaiming for Conv2d."""
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            nn.init.zeros_(m.bias)
            nn.init.ones_(m.weight)
        elif isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            if getattr(m, "bias", None) is not None:
                nn.init.zeros_(m.bias)

    def forward(self, x):
        """Return class logits of shape (B, num_classes) for images (B, C, H, W)."""
        B = x.shape[0]
        x = self.patch_embed(x)             # (B, N, E)
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)   # (B, 1+N, E)
        x = x + self.pos_embed
        x = self.pos_drop(x)

        for blk in self.blocks:
            x = blk(x)

        x = self.norm(x)
        cls = x[:, 0]                       # only the class token is classified
        out = self.head(cls)
        return out

In [None]:
# create model, optimizer, scheduler, loss
model = ViT(cfg).to(device)

# optional EMA for smoother training: only enabled when the torch_ema
# package is importable; otherwise `ema` is None and training runs without it.
# The decay comes from cfg so the config is the single source of truth.
try:
    from torch_ema import ExponentialMovingAverage
except ImportError:
    ema = None
else:
    ema = ExponentialMovingAverage(model.parameters(), decay=cfg["ema_decay"])

# optimizer + cosine scheduler (annealed over cfg["epochs"] scheduler steps).
# NOTE: cfg["warmup_epochs"] is not applied by this scheduler.
optimizer = AdamW(model.parameters(), lr=cfg["lr"], weight_decay=cfg["weight_decay"])
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=cfg["epochs"])

# loss with label smoothing (amount taken from cfg instead of a literal)
criterion = nn.CrossEntropyLoss(label_smoothing=cfg["label_smoothing"])

In [None]:
# train & eval loops
def train_one_epoch(model, loader, optimizer, epoch):
    """Train ``model`` for one pass over ``loader``.

    Uses the module-level ``device``, ``criterion`` and ``ema`` objects.
    Gradients are clipped to a total norm of 1.0 before every optimizer
    step, and the EMA (when present) is updated after it.

    Args:
        model: network to train; switched to train mode.
        loader: iterable yielding ``(images, targets)`` batches.
        optimizer: optimizer that updates ``model``'s parameters.
        epoch: epoch index, used only for the progress-bar label.

    Returns:
        ``(mean_loss, accuracy_percent)`` averaged over all samples seen,
        or ``(0.0, 0.0)`` when ``loader`` yields no batches.
    """
    model.train()
    running_loss = 0.0
    total = 0
    correct = 0
    pbar = tqdm(loader, desc=f"Train Epoch {epoch}")
    for images, targets in pbar:
        images, targets = images.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # Explicit None check: `ema` is either an EMA object or None.
        if ema is not None:
            ema.update()

        # criterion returns a batch mean, so weight it by the batch size.
        running_loss += loss.item() * images.size(0)
        _, preds = outputs.max(1)
        total += targets.size(0)
        correct += preds.eq(targets).sum().item()
        pbar.set_postfix(loss=running_loss/total, acc=100.*correct/total)

    # Guard against an empty loader, which would otherwise divide by zero.
    if total == 0:
        return 0.0, 0.0
    return running_loss/total, 100.*correct/total

@torch.no_grad()
def evaluate(model, loader):
    """Compute the mean loss and top-1 accuracy of ``model`` on ``loader``.

    Runs without gradient tracking and uses the module-level ``device`` and
    ``criterion``.

    Args:
        model: network to evaluate; switched to eval mode.
        loader: iterable yielding ``(images, targets)`` batches.

    Returns:
        ``(mean_loss, accuracy_percent)`` averaged over all samples, or
        ``(0.0, 0.0)`` when ``loader`` yields no batches.
    """
    model.eval()
    total, correct = 0, 0
    losses = 0.0
    for images, targets in loader:
        images, targets = images.to(device), targets.to(device)
        outputs = model(images)
        loss = criterion(outputs, targets)
        # criterion returns a batch mean, so weight it by the batch size.
        losses += loss.item() * images.size(0)
        _, preds = outputs.max(1)
        total += targets.size(0)
        correct += preds.eq(targets).sum().item()

    # Guard against an empty loader, which would otherwise divide by zero.
    if total == 0:
        return 0.0, 0.0
    return losses/total, 100.*correct/total

In [None]:
# main training loop
# Trains for cfg["epochs"] epochs, evaluates on the test loader after each one,
# and keeps the weights of the best-scoring epoch on disk.
best_acc = 0.0
total_epochs = cfg["epochs"]
for epoch in range(total_epochs):
    # Learning rate the optimizer will use during this epoch (for logging).
    lr = scheduler.get_last_lr()[0]

    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, epoch)
    val_loss, val_acc = evaluate(model, test_loader)
    print(f"Epoch {epoch}: train_loss={train_loss:.4f} train_acc={train_acc:.2f} val_loss={val_loss:.4f} val_acc={val_acc:.2f} lr={lr:.6f}")

    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), "best_vit_cifar100_small.pt")
        print("Saved best model:", best_acc)

    # Advance the cosine schedule exactly once per epoch, after the epoch's
    # optimizer updates. Stepping twice per epoch would finish the T_max-long
    # cosine in half the epochs and then make the learning rate climb again.
    scheduler.step()

print("Best test accuracy: %.2f%%" % (best_acc))

Train Epoch 0: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=5.08, loss=4.37]


Epoch 0: train_loss=4.3742 train_acc=5.08 val_loss=3.8915 val_acc=11.92 lr=0.000300
Saved best model: 11.92


Train Epoch 1: 100%|██████████| 391/391 [00:56<00:00,  6.87it/s, acc=10.2, loss=4.04]


Epoch 1: train_loss=4.0444 train_acc=10.16 val_loss=3.5421 val_acc=20.10 lr=0.000300
Saved best model: 20.1


Train Epoch 2: 100%|██████████| 391/391 [00:57<00:00,  6.80it/s, acc=14.3, loss=3.83]


Epoch 2: train_loss=3.8311 train_acc=14.26 val_loss=3.3319 val_acc=24.31 lr=0.000299
Saved best model: 24.31


Train Epoch 3: 100%|██████████| 391/391 [00:57<00:00,  6.81it/s, acc=17, loss=3.7]


Epoch 3: train_loss=3.6962 train_acc=16.98 val_loss=3.2220 val_acc=27.10 lr=0.000299
Saved best model: 27.1


Train Epoch 4: 100%|██████████| 391/391 [00:57<00:00,  6.80it/s, acc=18.9, loss=3.59]


Epoch 4: train_loss=3.5905 train_acc=18.94 val_loss=3.0478 val_acc=31.02 lr=0.000298
Saved best model: 31.02


Train Epoch 5: 100%|██████████| 391/391 [00:57<00:00,  6.80it/s, acc=21.1, loss=3.51]


Epoch 5: train_loss=3.5056 train_acc=21.08 val_loss=2.9983 val_acc=32.96 lr=0.000297
Saved best model: 32.96


Train Epoch 6: 100%|██████████| 391/391 [00:57<00:00,  6.81it/s, acc=22.6, loss=3.43]


Epoch 6: train_loss=3.4337 train_acc=22.58 val_loss=2.9316 val_acc=34.14 lr=0.000296
Saved best model: 34.14


Train Epoch 7: 100%|██████████| 391/391 [00:57<00:00,  6.80it/s, acc=24.4, loss=3.37]


Epoch 7: train_loss=3.3698 train_acc=24.38 val_loss=2.8450 val_acc=36.54 lr=0.000295
Saved best model: 36.54


Train Epoch 8: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=25.7, loss=3.31]


Epoch 8: train_loss=3.3077 train_acc=25.73 val_loss=2.8080 val_acc=37.40 lr=0.000294
Saved best model: 37.4


Train Epoch 9: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=27.2, loss=3.25]


Epoch 9: train_loss=3.2471 train_acc=27.18 val_loss=2.7476 val_acc=39.21 lr=0.000293
Saved best model: 39.21


Train Epoch 10: 100%|██████████| 391/391 [00:57<00:00,  6.81it/s, acc=28.5, loss=3.2]


Epoch 10: train_loss=3.2032 train_acc=28.53 val_loss=2.7418 val_acc=39.55 lr=0.000291
Saved best model: 39.55


Train Epoch 11: 100%|██████████| 391/391 [00:57<00:00,  6.80it/s, acc=29.3, loss=3.16]


Epoch 11: train_loss=3.1591 train_acc=29.28 val_loss=2.6495 val_acc=42.02 lr=0.000289
Saved best model: 42.02


Train Epoch 12: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=31, loss=3.1]


Epoch 12: train_loss=3.1036 train_acc=30.97 val_loss=2.5947 val_acc=43.30 lr=0.000288
Saved best model: 43.3


Train Epoch 13: 100%|██████████| 391/391 [00:57<00:00,  6.80it/s, acc=31.9, loss=3.06]


Epoch 13: train_loss=3.0609 train_acc=31.92 val_loss=2.5384 val_acc=45.48 lr=0.000286
Saved best model: 45.48


Train Epoch 14: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=33.3, loss=3]


Epoch 14: train_loss=3.0037 train_acc=33.33 val_loss=2.5211 val_acc=45.54 lr=0.000284
Saved best model: 45.54


Train Epoch 15: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=34.4, loss=2.97]


Epoch 15: train_loss=2.9741 train_acc=34.42 val_loss=2.4808 val_acc=46.47 lr=0.000281
Saved best model: 46.47


Train Epoch 16: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=35.3, loss=2.93]


Epoch 16: train_loss=2.9268 train_acc=35.30 val_loss=2.4669 val_acc=47.34 lr=0.000279
Saved best model: 47.34


Train Epoch 17: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=36.4, loss=2.9]


Epoch 17: train_loss=2.8976 train_acc=36.37 val_loss=2.4463 val_acc=47.87 lr=0.000277
Saved best model: 47.87


Train Epoch 18: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=37, loss=2.86]


Epoch 18: train_loss=2.8561 train_acc=37.01 val_loss=2.3564 val_acc=50.30 lr=0.000274
Saved best model: 50.3


Train Epoch 19: 100%|██████████| 391/391 [00:57<00:00,  6.80it/s, acc=37.9, loss=2.83]


Epoch 19: train_loss=2.8290 train_acc=37.90 val_loss=2.3579 val_acc=51.14 lr=0.000271
Saved best model: 51.14


Train Epoch 20: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=39.5, loss=2.78]


Epoch 20: train_loss=2.7804 train_acc=39.46 val_loss=2.3427 val_acc=51.14 lr=0.000269


Train Epoch 21: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=40.3, loss=2.74]


Epoch 21: train_loss=2.7448 train_acc=40.29 val_loss=2.3360 val_acc=51.10 lr=0.000266


Train Epoch 22: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=41.1, loss=2.71]


Epoch 22: train_loss=2.7121 train_acc=41.12 val_loss=2.3810 val_acc=50.17 lr=0.000263


Train Epoch 23: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=42.1, loss=2.68]


Epoch 23: train_loss=2.6834 train_acc=42.10 val_loss=2.2615 val_acc=53.27 lr=0.000259
Saved best model: 53.27


Train Epoch 24: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=42.8, loss=2.65]


Epoch 24: train_loss=2.6498 train_acc=42.83 val_loss=2.2522 val_acc=53.40 lr=0.000256
Saved best model: 53.4


Train Epoch 25: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=43.9, loss=2.62]


Epoch 25: train_loss=2.6193 train_acc=43.88 val_loss=2.2158 val_acc=54.71 lr=0.000253
Saved best model: 54.71


Train Epoch 26: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=44.3, loss=2.6]


Epoch 26: train_loss=2.5955 train_acc=44.30 val_loss=2.1961 val_acc=56.00 lr=0.000249
Saved best model: 56.0


Train Epoch 27: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=45.9, loss=2.55]


Epoch 27: train_loss=2.5489 train_acc=45.85 val_loss=2.1671 val_acc=56.64 lr=0.000246
Saved best model: 56.64


Train Epoch 28: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=46.1, loss=2.54]


Epoch 28: train_loss=2.5369 train_acc=46.05 val_loss=2.1839 val_acc=55.75 lr=0.000242


Train Epoch 29: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=47.3, loss=2.49]


Epoch 29: train_loss=2.4892 train_acc=47.31 val_loss=2.1556 val_acc=56.97 lr=0.000238
Saved best model: 56.97


Train Epoch 30: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=47.9, loss=2.47]


Epoch 30: train_loss=2.4655 train_acc=47.94 val_loss=2.1242 val_acc=57.41 lr=0.000234
Saved best model: 57.41


Train Epoch 31: 100%|██████████| 391/391 [00:57<00:00,  6.80it/s, acc=48.7, loss=2.44]


Epoch 31: train_loss=2.4353 train_acc=48.68 val_loss=2.1232 val_acc=57.90 lr=0.000230
Saved best model: 57.9


Train Epoch 32: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=49.6, loss=2.42]


Epoch 32: train_loss=2.4154 train_acc=49.57 val_loss=2.1256 val_acc=58.07 lr=0.000226
Saved best model: 58.07


Train Epoch 33: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=50.3, loss=2.38]


Epoch 33: train_loss=2.3839 train_acc=50.25 val_loss=2.1332 val_acc=57.71 lr=0.000222


Train Epoch 34: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=51.4, loss=2.35]


Epoch 34: train_loss=2.3524 train_acc=51.35 val_loss=2.0998 val_acc=58.57 lr=0.000218
Saved best model: 58.57


Train Epoch 35: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=51.9, loss=2.33]


Epoch 35: train_loss=2.3260 train_acc=51.95 val_loss=2.0701 val_acc=59.95 lr=0.000214
Saved best model: 59.95


Train Epoch 36: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=53.2, loss=2.3]


Epoch 36: train_loss=2.2985 train_acc=53.17 val_loss=2.0656 val_acc=60.10 lr=0.000210
Saved best model: 60.1


Train Epoch 37: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=53.5, loss=2.28]


Epoch 37: train_loss=2.2817 train_acc=53.48 val_loss=2.0812 val_acc=59.22 lr=0.000205


Train Epoch 38: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=54.3, loss=2.25]


Epoch 38: train_loss=2.2488 train_acc=54.30 val_loss=2.0559 val_acc=60.36 lr=0.000201
Saved best model: 60.36


Train Epoch 39: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=55.3, loss=2.22]


Epoch 39: train_loss=2.2191 train_acc=55.32 val_loss=2.0426 val_acc=60.56 lr=0.000196
Saved best model: 60.56


Train Epoch 40: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=55.8, loss=2.21]


Epoch 40: train_loss=2.2058 train_acc=55.83 val_loss=2.0319 val_acc=61.35 lr=0.000192
Saved best model: 61.35


Train Epoch 41: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=57, loss=2.17]


Epoch 41: train_loss=2.1697 train_acc=56.96 val_loss=2.0153 val_acc=61.61 lr=0.000187
Saved best model: 61.61


Train Epoch 42: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=57.6, loss=2.15]


Epoch 42: train_loss=2.1453 train_acc=57.60 val_loss=2.0322 val_acc=61.36 lr=0.000183


Train Epoch 43: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=58.1, loss=2.12]


Epoch 43: train_loss=2.1227 train_acc=58.15 val_loss=2.0238 val_acc=61.51 lr=0.000178


Train Epoch 44: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=59.1, loss=2.09]


Epoch 44: train_loss=2.0939 train_acc=59.12 val_loss=2.0207 val_acc=61.86 lr=0.000173
Saved best model: 61.86


Train Epoch 45: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=59.9, loss=2.07]


Epoch 45: train_loss=2.0750 train_acc=59.86 val_loss=2.0239 val_acc=61.62 lr=0.000169


Train Epoch 46: 100%|██████████| 391/391 [00:57<00:00,  6.80it/s, acc=60.7, loss=2.05]


Epoch 46: train_loss=2.0518 train_acc=60.67 val_loss=2.0057 val_acc=62.17 lr=0.000164
Saved best model: 62.17


Train Epoch 47: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=61.6, loss=2.03]


Epoch 47: train_loss=2.0327 train_acc=61.57 val_loss=1.9922 val_acc=62.73 lr=0.000159
Saved best model: 62.73


Train Epoch 48: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=62, loss=2.01]


Epoch 48: train_loss=2.0094 train_acc=61.96 val_loss=1.9859 val_acc=63.32 lr=0.000155
Saved best model: 63.32


Train Epoch 49: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=62.4, loss=1.99]


Epoch 49: train_loss=1.9916 train_acc=62.42 val_loss=1.9819 val_acc=63.75 lr=0.000150
Saved best model: 63.75


Train Epoch 50: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=63.3, loss=1.98]


Epoch 50: train_loss=1.9761 train_acc=63.33 val_loss=1.9900 val_acc=62.89 lr=0.000145


Train Epoch 51: 100%|██████████| 391/391 [00:57<00:00,  6.80it/s, acc=64, loss=1.95]


Epoch 51: train_loss=1.9463 train_acc=64.03 val_loss=1.9906 val_acc=63.30 lr=0.000141


Train Epoch 52: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=64.8, loss=1.92]


Epoch 52: train_loss=1.9242 train_acc=64.77 val_loss=1.9810 val_acc=63.79 lr=0.000136
Saved best model: 63.79


Train Epoch 53: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=65.3, loss=1.9]


Epoch 53: train_loss=1.9032 train_acc=65.35 val_loss=1.9932 val_acc=63.32 lr=0.000131


Train Epoch 54: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=66.1, loss=1.89]


Epoch 54: train_loss=1.8872 train_acc=66.11 val_loss=1.9690 val_acc=64.11 lr=0.000127
Saved best model: 64.11


Train Epoch 55: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=66.5, loss=1.87]


Epoch 55: train_loss=1.8727 train_acc=66.46 val_loss=1.9688 val_acc=64.54 lr=0.000122
Saved best model: 64.54


Train Epoch 56: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=67, loss=1.86]


Epoch 56: train_loss=1.8601 train_acc=67.03 val_loss=1.9687 val_acc=64.25 lr=0.000117


Train Epoch 57: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=67.6, loss=1.84]


Epoch 57: train_loss=1.8442 train_acc=67.60 val_loss=1.9789 val_acc=64.16 lr=0.000113


Train Epoch 58: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=68, loss=1.83]


Epoch 58: train_loss=1.8316 train_acc=67.96 val_loss=1.9613 val_acc=64.79 lr=0.000108
Saved best model: 64.79


Train Epoch 59: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=68.7, loss=1.81]


Epoch 59: train_loss=1.8058 train_acc=68.73 val_loss=1.9582 val_acc=65.06 lr=0.000104
Saved best model: 65.06


Train Epoch 60: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=69.4, loss=1.79]


Epoch 60: train_loss=1.7892 train_acc=69.38 val_loss=1.9651 val_acc=64.85 lr=0.000099


Train Epoch 61: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=69.6, loss=1.78]


Epoch 61: train_loss=1.7799 train_acc=69.59 val_loss=1.9732 val_acc=64.49 lr=0.000095


Train Epoch 62: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=70.2, loss=1.76]


Epoch 62: train_loss=1.7644 train_acc=70.20 val_loss=1.9705 val_acc=64.63 lr=0.000090


Train Epoch 63: 100%|██████████| 391/391 [00:57<00:00,  6.80it/s, acc=70.7, loss=1.76]


Epoch 63: train_loss=1.7569 train_acc=70.69 val_loss=1.9482 val_acc=65.32 lr=0.000086
Saved best model: 65.32


Train Epoch 64: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=71.4, loss=1.74]


Epoch 64: train_loss=1.7354 train_acc=71.36 val_loss=1.9507 val_acc=65.29 lr=0.000082


Train Epoch 65: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=71.4, loss=1.73]


Epoch 65: train_loss=1.7291 train_acc=71.40 val_loss=1.9541 val_acc=65.11 lr=0.000078


Train Epoch 66: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=72, loss=1.71]


Epoch 66: train_loss=1.7112 train_acc=71.98 val_loss=1.9672 val_acc=64.96 lr=0.000074


Train Epoch 67: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=72.5, loss=1.7]


Epoch 67: train_loss=1.6970 train_acc=72.45 val_loss=1.9485 val_acc=65.52 lr=0.000070
Saved best model: 65.52


Train Epoch 68: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=72.6, loss=1.69]


Epoch 68: train_loss=1.6919 train_acc=72.65 val_loss=1.9622 val_acc=65.46 lr=0.000066


Train Epoch 69: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=73, loss=1.68]


Epoch 69: train_loss=1.6815 train_acc=72.95 val_loss=1.9512 val_acc=65.36 lr=0.000062


Train Epoch 70: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=73.4, loss=1.67]


Epoch 70: train_loss=1.6702 train_acc=73.44 val_loss=1.9524 val_acc=65.60 lr=0.000058
Saved best model: 65.6


Train Epoch 71: 100%|██████████| 391/391 [00:57<00:00,  6.80it/s, acc=74, loss=1.65]


Epoch 71: train_loss=1.6529 train_acc=73.98 val_loss=1.9470 val_acc=65.87 lr=0.000054
Saved best model: 65.87


Train Epoch 72: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=74, loss=1.65]


Epoch 72: train_loss=1.6520 train_acc=73.97 val_loss=1.9479 val_acc=65.46 lr=0.000051


Train Epoch 73: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=74.3, loss=1.64]


Epoch 73: train_loss=1.6435 train_acc=74.31 val_loss=1.9576 val_acc=65.27 lr=0.000047


Train Epoch 74: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=75, loss=1.63]


Epoch 74: train_loss=1.6253 train_acc=75.05 val_loss=1.9518 val_acc=65.53 lr=0.000044


Train Epoch 75: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=75.2, loss=1.62]


Epoch 75: train_loss=1.6226 train_acc=75.16 val_loss=1.9447 val_acc=65.79 lr=0.000041


Train Epoch 76: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=75.3, loss=1.61]


Epoch 76: train_loss=1.6143 train_acc=75.35 val_loss=1.9401 val_acc=65.81 lr=0.000037


Train Epoch 77: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=75.7, loss=1.6]


Epoch 77: train_loss=1.6015 train_acc=75.66 val_loss=1.9350 val_acc=66.46 lr=0.000034
Saved best model: 66.46


Train Epoch 78: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=75.7, loss=1.6]


Epoch 78: train_loss=1.5998 train_acc=75.72 val_loss=1.9346 val_acc=66.02 lr=0.000031


Train Epoch 79: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=76.6, loss=1.58]


Epoch 79: train_loss=1.5805 train_acc=76.63 val_loss=1.9479 val_acc=65.94 lr=0.000029


Train Epoch 80: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=76.5, loss=1.58]


Epoch 80: train_loss=1.5780 train_acc=76.55 val_loss=1.9376 val_acc=66.05 lr=0.000026


Train Epoch 81: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=76.8, loss=1.57]


Epoch 81: train_loss=1.5735 train_acc=76.80 val_loss=1.9379 val_acc=66.06 lr=0.000023


Train Epoch 82: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=76.8, loss=1.57]


Epoch 82: train_loss=1.5728 train_acc=76.81 val_loss=1.9349 val_acc=66.37 lr=0.000021


Train Epoch 83: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=77.3, loss=1.56]


Epoch 83: train_loss=1.5593 train_acc=77.29 val_loss=1.9353 val_acc=66.64 lr=0.000019
Saved best model: 66.64


Train Epoch 84: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=77, loss=1.56]


Epoch 84: train_loss=1.5629 train_acc=77.03 val_loss=1.9369 val_acc=66.63 lr=0.000016


Train Epoch 85: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=77.4, loss=1.56]


Epoch 85: train_loss=1.5583 train_acc=77.38 val_loss=1.9346 val_acc=66.46 lr=0.000014


Train Epoch 86: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=77.2, loss=1.56]


Epoch 86: train_loss=1.5571 train_acc=77.21 val_loss=1.9326 val_acc=66.61 lr=0.000012


Train Epoch 87: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=77.1, loss=1.56]


Epoch 87: train_loss=1.5610 train_acc=77.14 val_loss=1.9321 val_acc=66.42 lr=0.000011


Train Epoch 88: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=77.6, loss=1.55]


Epoch 88: train_loss=1.5482 train_acc=77.65 val_loss=1.9294 val_acc=66.57 lr=0.000009


Train Epoch 89: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=77.8, loss=1.54]


Epoch 89: train_loss=1.5425 train_acc=77.76 val_loss=1.9314 val_acc=66.59 lr=0.000007


Train Epoch 90: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=77.5, loss=1.55]


Epoch 90: train_loss=1.5498 train_acc=77.48 val_loss=1.9376 val_acc=66.41 lr=0.000006


Train Epoch 91: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=77.9, loss=1.54]


Epoch 91: train_loss=1.5390 train_acc=77.91 val_loss=1.9369 val_acc=66.51 lr=0.000005


Train Epoch 92: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=77.9, loss=1.54]


Epoch 92: train_loss=1.5432 train_acc=77.89 val_loss=1.9309 val_acc=66.56 lr=0.000004


Train Epoch 93: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=77.9, loss=1.53]


Epoch 93: train_loss=1.5341 train_acc=77.88 val_loss=1.9296 val_acc=66.67 lr=0.000003
Saved best model: 66.67


Train Epoch 94: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=78, loss=1.54]


Epoch 94: train_loss=1.5391 train_acc=77.95 val_loss=1.9319 val_acc=66.46 lr=0.000002


Train Epoch 95: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=77.8, loss=1.54]


Epoch 95: train_loss=1.5372 train_acc=77.80 val_loss=1.9308 val_acc=66.41 lr=0.000001


Train Epoch 96: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=78.4, loss=1.53]


Epoch 96: train_loss=1.5263 train_acc=78.42 val_loss=1.9313 val_acc=66.50 lr=0.000001


Train Epoch 97: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=77.9, loss=1.54]


Epoch 97: train_loss=1.5390 train_acc=77.87 val_loss=1.9321 val_acc=66.42 lr=0.000000


Train Epoch 98: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=77.7, loss=1.54]


Epoch 98: train_loss=1.5428 train_acc=77.65 val_loss=1.9307 val_acc=66.40 lr=0.000000


Train Epoch 99: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=78.4, loss=1.53]


Epoch 99: train_loss=1.5265 train_acc=78.40 val_loss=1.9313 val_acc=66.53 lr=0.000000


Train Epoch 100: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=78, loss=1.54]


Epoch 100: train_loss=1.5369 train_acc=77.98 val_loss=1.9308 val_acc=66.45 lr=0.000000


Train Epoch 101: 100%|██████████| 391/391 [00:57<00:00,  6.79it/s, acc=78, loss=1.53]


Epoch 101: train_loss=1.5338 train_acc=78.01 val_loss=1.9312 val_acc=66.53 lr=0.000000


Train Epoch 102: 100%|██████████| 391/391 [00:57<00:00,  6.78it/s, acc=78.3, loss=1.53]


Epoch 102: train_loss=1.5289 train_acc=78.33 val_loss=1.9293 val_acc=66.55 lr=0.000001


Train Epoch 103:  40%|███▉      | 156/391 [00:23<00:34,  6.74it/s, acc=77.8, loss=1.53]


KeyboardInterrupt: 

In [None]:
# Persist the current model weights to disk. Only the model's state_dict is
# written; optimizer and scheduler state are not part of this checkpoint.
torch.save(model.state_dict(), "checkpoint_hybrid_epoch100.pth")

In [None]:
# Reset every parameter group to the initial learning rate and extend the
# scheduler's cosine horizon by 100 steps so training can continue.
# NOTE(review): this mutates the live scheduler in place; how the LR evolves
# afterwards depends on the scheduler's current step count — confirm the
# resulting curve is the one intended.
for g in optimizer.param_groups:
    g["lr"] = 3e-4  # restart cosine cycle
scheduler.T_max += 100  # was `100h`, which is not valid Python