<a href="https://colab.research.google.com/github/anpham1331372/ECGR5106/blob/main/ECGR5106_HW6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Config 1
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import time
from torch.utils.data import DataLoader
from torchinfo import summary
from torchvision.models import resnet18
from tqdm import tqdm

# ====================
# Configuration
# ====================
# Hyperparameters for the Config 1 run. The ViT constructor, the data loaders
# and both training loops in this cell read their settings from this dict.
# NOTE(review): the recorded output below this cell prints 'num_epochs': 10,
# so it was presumably produced before this value was changed to 20 — rerun
# or reconcile before quoting those numbers.
config = {
    "image_size": 32,
    "patch_size": 4,  # (32 // 4) ** 2 = 64 patches per image
    "embed_dim": 256,
    "num_heads": 4,
    "num_layers": 4,
    "mlp_dim": 512,
    "num_classes": 100,
    "num_epochs": 20,
    "batch_size": 64,
    "learning_rate": 0.001,
    "dropout": 0.1
}

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ====================
# Dataset
# ====================
# The same pipeline is applied to the train and test splits (no augmentation).
# NOTE(review): CIFAR-100 images are presumably already 32x32, which would make
# the Resize a no-op for this config — confirm.
transform = transforms.Compose([
    transforms.Resize((config["image_size"], config["image_size"])),
    transforms.ToTensor(),
    # Per-channel (x - 0.5) / 0.5.
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# download=True fetches the dataset into ./data when it is not already there.
train_dataset = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=transform)

# Only the training split is shuffled; the test order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=config["batch_size"], shuffle=False)

# ====================
# Vision Transformer Components
# ====================
class PatchEmbedding(nn.Module):
    """Turn an image batch into a sequence of patch embeddings.

    A convolution whose kernel size and stride both equal ``patch_size``
    projects every non-overlapping patch to an ``embed_dim``-wide vector.

    Args:
        image_size: Side length used to derive ``num_patches``.
        patch_size: Side length of one square patch.
        in_channels: Channel count of the input images.
        embed_dim: Output channels of the projection (token width).
    """

    def __init__(self, image_size, patch_size, in_channels, embed_dim):
        super().__init__()
        patches_per_side = image_size // patch_size
        self.num_patches = patches_per_side * patches_per_side
        self.proj = nn.Conv2d(
            in_channels,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x):
        """Project ``x`` and return the patches as a token sequence.

        The two spatial axes of the convolution output are merged into one,
        which is then swapped with the channel axis so that the embedding
        dimension comes last.
        """
        feature_map = self.proj(x)
        tokens = feature_map.flatten(start_dim=2)
        return tokens.permute(0, 2, 1)

class TransformerEncoder(nn.Module):
    """Pre-norm Transformer encoder block.

    Self-attention and a two-layer MLP are each applied to a layer-normalised
    copy of the input and added back through a residual connection.

    Args:
        embed_dim: Width of the token vectors entering and leaving the block.
        num_heads: Number of heads given to ``nn.MultiheadAttention``.
        mlp_dim: Hidden width of the feed-forward network.
        dropout: Dropout probability used inside attention and after each
            linear layer of the MLP.
    """

    def __init__(self, embed_dim, num_heads, mlp_dim, dropout):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        # batch_first=True: inputs are laid out as (batch, sequence, embed_dim).
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout, batch_first=True)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, mlp_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_dim, embed_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        """Apply the attention and MLP residual branches to ``x``.

        Args:
            x: Token tensor whose last dimension is ``embed_dim``.

        Returns:
            A tensor with the same shape as ``x``.
        """
        # Normalise once and reuse the result as query, key and value; the
        # previous code ran norm1 three times on the same input.
        normed = self.norm1(x)
        # The averaged attention map was always discarded, so skip computing it.
        attn_out, _ = self.attn(normed, normed, normed, need_weights=False)
        x = x + attn_out
        x = x + self.mlp(self.norm2(x))
        return x

class VisionTransformer(nn.Module):
    """Vision Transformer classifier built from a config dict.

    Images are split into patches, a learnable class token is prepended,
    learnable position embeddings are added, the sequence runs through a stack
    of ``TransformerEncoder`` blocks, and the normalised class token is fed to
    a linear classification head.

    Args:
        config: Mapping with the keys ``image_size``, ``patch_size``,
            ``embed_dim``, ``num_heads``, ``num_layers``, ``mlp_dim``,
            ``num_classes`` and ``dropout``. An optional ``in_channels`` key
            sets the number of input channels (defaults to 3).
    """

    def __init__(self, config):
        super().__init__()
        embed_dim = config["embed_dim"]
        # Optional key so existing configs keep working with 3-channel input.
        in_channels = config.get("in_channels", 3)
        self.patch_embed = PatchEmbedding(config["image_size"], config["patch_size"], in_channels, embed_dim)
        # Reuse the count computed by the embedding layer so the two cannot drift.
        num_patches = self.patch_embed.num_patches
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        # One position per patch plus one for the class token.
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
        self.dropout = nn.Dropout(config["dropout"])

        self.encoder = nn.ModuleList([
            TransformerEncoder(embed_dim, config["num_heads"], config["mlp_dim"], config["dropout"])
            for _ in range(config["num_layers"])
        ])
        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, config["num_classes"])

    def forward(self, x):
        """Return class logits for the image batch ``x``."""
        B = x.size(0)
        x = self.patch_embed(x)
        # Broadcast the single learned class token across the batch.
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x = x + self.pos_embed
        x = self.dropout(x)
        for layer in self.encoder:
            x = layer(x)
        x = self.norm(x)
        # Classify from the class token, which sits at sequence index 0.
        return self.head(x[:, 0])

# ====================
# Helpers
# ====================
def train(model, loader, optimizer, criterion):
    """Run one optimisation pass over ``loader``.

    Args:
        model: Module to train; it is switched to training mode first.
        loader: Iterable of ``(images, labels)`` batches that supports ``len``.
        optimizer: Optimizer that updates the model parameters.
        criterion: Callable mapping ``(outputs, labels)`` to a scalar loss.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    loss_sum = 0.0
    batches = tqdm(loader, desc="Training", leave=False)
    for inputs, targets in batches:
        # Move the batch to the module-level ``device``.
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        loss_sum = loss_sum + batch_loss.item()
    return loss_sum / len(loader)

def evaluate(model, loader):
    """Measure top-1 accuracy of ``model`` over ``loader``.

    Args:
        model: Module to evaluate; it is switched to eval mode first.
        loader: Iterable of ``(images, labels)`` batches.

    Returns:
        Accuracy as a percentage (``100 * correct / total``).
    """
    model.eval()
    hits, seen = 0, 0
    with torch.no_grad():  # inference only, no gradients are needed
        for inputs, targets in tqdm(loader, desc="Evaluating", leave=False):
            inputs = inputs.to(device)
            targets = targets.to(device)
            # Index of the largest logit along dim 1 is the predicted class.
            predictions = model(inputs).max(dim=1).indices
            seen += targets.size(0)
            hits += predictions.eq(targets).sum().item()
    return 100 * hits / seen

def get_model_stats(model, input_size):
    """Return ``(parameter_count, multiply_adds)`` for ``model``.

    Args:
        model: Module to inspect.
        input_size: Input shape forwarded to torchinfo's ``summary``.

    Returns:
        A tuple of the total number of parameter elements and the
        ``total_mult_adds`` figure reported by torchinfo.
    """
    report = summary(model, input_size=input_size, verbose=0)
    param_count = 0
    for tensor in model.parameters():
        param_count += tensor.numel()
    return param_count, report.total_mult_adds

# ====================
# Run ViT Experiment
# ====================
print("=== ViT Configuration ===")
print(config)

vit_model = VisionTransformer(config).to(device)
vit_criterion = nn.CrossEntropyLoss()
vit_optimizer = torch.optim.Adam(vit_model.parameters(), lr=config["learning_rate"])

# Each epoch time covers one training pass plus one full test-set evaluation.
vit_epoch_times = []
start_time = time.time()
for epoch in range(config["num_epochs"]):
    epoch_start = time.time()
    train_loss = train(vit_model, train_loader, vit_optimizer, vit_criterion)
    # vit_acc is overwritten every epoch; the summary reports the last value.
    vit_acc = evaluate(vit_model, test_loader)
    epoch_end = time.time()
    epoch_time = epoch_end - epoch_start
    vit_epoch_times.append(epoch_time)
    print(f"[ViT] Epoch {epoch+1}/{config['num_epochs']}: Loss={train_loss:.4f}, Test Accuracy={vit_acc:.2f}%, Time={epoch_time:.2f}s")
vit_total_time = time.time() - start_time

# Stats are gathered for a single image (batch size 1).
vit_params, vit_macs = get_model_stats(vit_model, input_size=(1, 3, config["image_size"], config["image_size"]))

# ====================
# ResNet-18 Baseline
# ====================
print("\n=== ResNet-18 Baseline ===")
# Randomly initialised ResNet-18 trained with the same optimizer settings,
# data loaders and epoch budget as the ViT.
resnet_model = resnet18(num_classes=config["num_classes"]).to(device)
# Dedicated loss object so the baseline does not borrow the ViT's criterion.
resnet_criterion = nn.CrossEntropyLoss()
resnet_optimizer = torch.optim.Adam(resnet_model.parameters(), lr=config["learning_rate"])

resnet_epoch_times = []
start_time = time.time()
for epoch in range(config["num_epochs"]):
    epoch_start = time.time()
    train_loss = train(resnet_model, train_loader, resnet_optimizer, resnet_criterion)
    resnet_acc = evaluate(resnet_model, test_loader)
    epoch_end = time.time()
    epoch_time = epoch_end - epoch_start
    resnet_epoch_times.append(epoch_time)
    print(f"[ResNet] Epoch {epoch+1}/{config['num_epochs']}: Loss={train_loss:.4f}, Test Accuracy={resnet_acc:.2f}%, Time={epoch_time:.2f}s")
resnet_total_time = time.time() - start_time

# Use the configured image size (was hard-coded to 32) so both models are
# profiled on the same input shape as the one they were trained on.
resnet_params, resnet_macs = get_model_stats(resnet_model, input_size=(1, 3, config["image_size"], config["image_size"]))

# ====================
# Final Summary
# ====================
# Accuracy is the test accuracy after the final epoch of each model.
print("\n=== Summary Table ===")
print(f"{'Model':<12} {'Params':>12} {'MACs':>12} {'Accuracy':>10} {'Total Time (s)':>15} {'Avg Epoch Time (s)':>20}")
print(f"{'ViT':<12} {vit_params:>12,} {vit_macs:>12,} {vit_acc:>10.2f} {vit_total_time:>15.2f} {sum(vit_epoch_times)/len(vit_epoch_times):>20.2f}")
print(f"{'ResNet-18':<12} {resnet_params:>12,} {resnet_macs:>12,} {resnet_acc:>10.2f} {resnet_total_time:>15.2f} {sum(resnet_epoch_times)/len(resnet_epoch_times):>20.2f}")


=== ViT Configuration ===
{'image_size': 32, 'patch_size': 4, 'embed_dim': 256, 'num_heads': 4, 'num_layers': 4, 'mlp_dim': 512, 'num_classes': 100, 'num_epochs': 10, 'batch_size': 64, 'learning_rate': 0.001, 'dropout': 0.1}




[ViT] Epoch 1/10: Loss=3.9023, Test Accuracy=13.02%




[ViT] Epoch 2/10: Loss=3.4436, Test Accuracy=19.55%




[ViT] Epoch 3/10: Loss=3.2495, Test Accuracy=21.24%




[ViT] Epoch 4/10: Loss=3.1157, Test Accuracy=24.56%




[ViT] Epoch 5/10: Loss=3.0184, Test Accuracy=25.14%




[ViT] Epoch 6/10: Loss=2.9389, Test Accuracy=25.56%




[ViT] Epoch 7/10: Loss=2.8523, Test Accuracy=28.17%




[ViT] Epoch 8/10: Loss=2.7902, Test Accuracy=29.06%




[ViT] Epoch 9/10: Loss=2.7091, Test Accuracy=29.94%




[ViT] Epoch 10/10: Loss=2.6300, Test Accuracy=31.48%

=== ResNet-18 Baseline ===




[ResNet] Epoch 1/10: Loss=3.5505, Test Accuracy=23.47%




[ResNet] Epoch 2/10: Loss=2.7920, Test Accuracy=31.05%




[ResNet] Epoch 3/10: Loss=2.3850, Test Accuracy=37.18%




[ResNet] Epoch 4/10: Loss=2.0767, Test Accuracy=40.24%




[ResNet] Epoch 5/10: Loss=1.8171, Test Accuracy=43.55%




[ResNet] Epoch 6/10: Loss=1.5590, Test Accuracy=44.72%




[ResNet] Epoch 7/10: Loss=1.3042, Test Accuracy=44.48%




[ResNet] Epoch 8/10: Loss=1.0535, Test Accuracy=45.26%




[ResNet] Epoch 9/10: Loss=0.8129, Test Accuracy=44.88%


                                                             

[ResNet] Epoch 10/10: Loss=0.6266, Test Accuracy=43.77%

=== Summary Table ===
Model              Params         MACs   Accuracy   Time (s)
ViT             2,164,068    1,888,868      31.48     367.66
ResNet-18      11,227,812   37,072,356      43.77     317.44




In [None]:
#Config 2
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import time
from torch.utils.data import DataLoader
from torchinfo import summary
from tqdm import tqdm

# ====================
# Configuration
# ====================
# Hyperparameters for the Config 2 run. The ViT constructor, the data loaders
# and the training loop in this cell read their settings from this dict.
config = {
    "image_size": 32,
    "patch_size": 4,  # (32 // 4) ** 2 = 64 patches per image
    "embed_dim": 512,
    "num_heads": 8,
    "num_layers": 8,
    "mlp_dim": 2048,
    "num_classes": 100,
    "num_epochs": 20,
    "batch_size": 64,
    "learning_rate": 0.001,
    "dropout": 0.1
}

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ====================
# Dataset
# ====================
# The same pipeline is applied to the train and test splits (no augmentation).
# NOTE(review): CIFAR-100 images are presumably already 32x32, which would make
# the Resize a no-op for this config — confirm.
transform = transforms.Compose([
    transforms.Resize((config["image_size"], config["image_size"])),
    transforms.ToTensor(),
    # Per-channel (x - 0.5) / 0.5.
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# download=True fetches the dataset into ./data when it is not already there.
train_dataset = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=transform)

# Only the training split is shuffled; the test order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=config["batch_size"], shuffle=False)

# ====================
# Vision Transformer Components
# ====================
class PatchEmbedding(nn.Module):
    """Turn an image batch into a sequence of patch embeddings.

    A convolution whose kernel size and stride both equal ``patch_size``
    projects every non-overlapping patch to an ``embed_dim``-wide vector.

    Args:
        image_size: Side length used to derive ``num_patches``.
        patch_size: Side length of one square patch.
        in_channels: Channel count of the input images.
        embed_dim: Output channels of the projection (token width).
    """

    def __init__(self, image_size, patch_size, in_channels, embed_dim):
        super().__init__()
        patches_per_side = image_size // patch_size
        self.num_patches = patches_per_side * patches_per_side
        self.proj = nn.Conv2d(
            in_channels,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x):
        """Project ``x`` and return the patches as a token sequence.

        The two spatial axes of the convolution output are merged into one,
        which is then swapped with the channel axis so that the embedding
        dimension comes last.
        """
        feature_map = self.proj(x)
        tokens = feature_map.flatten(start_dim=2)
        return tokens.permute(0, 2, 1)

class TransformerEncoder(nn.Module):
    """Pre-norm Transformer encoder block.

    Self-attention and a two-layer MLP are each applied to a layer-normalised
    copy of the input and added back through a residual connection.

    Args:
        embed_dim: Width of the token vectors entering and leaving the block.
        num_heads: Number of heads given to ``nn.MultiheadAttention``.
        mlp_dim: Hidden width of the feed-forward network.
        dropout: Dropout probability used inside attention and after each
            linear layer of the MLP.
    """

    def __init__(self, embed_dim, num_heads, mlp_dim, dropout):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        # batch_first=True: inputs are laid out as (batch, sequence, embed_dim).
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout, batch_first=True)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, mlp_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_dim, embed_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        """Apply the attention and MLP residual branches to ``x``.

        Args:
            x: Token tensor whose last dimension is ``embed_dim``.

        Returns:
            A tensor with the same shape as ``x``.
        """
        # Normalise once and reuse the result as query, key and value; the
        # previous code ran norm1 three times on the same input.
        normed = self.norm1(x)
        # The averaged attention map was always discarded, so skip computing it.
        attn_out, _ = self.attn(normed, normed, normed, need_weights=False)
        x = x + attn_out
        x = x + self.mlp(self.norm2(x))
        return x

class VisionTransformer(nn.Module):
    """Vision Transformer classifier built from a config dict.

    Images are split into patches, a learnable class token is prepended,
    learnable position embeddings are added, the sequence runs through a stack
    of ``TransformerEncoder`` blocks, and the normalised class token is fed to
    a linear classification head.

    Args:
        config: Mapping with the keys ``image_size``, ``patch_size``,
            ``embed_dim``, ``num_heads``, ``num_layers``, ``mlp_dim``,
            ``num_classes`` and ``dropout``. An optional ``in_channels`` key
            sets the number of input channels (defaults to 3).
    """

    def __init__(self, config):
        super().__init__()
        embed_dim = config["embed_dim"]
        # Optional key so existing configs keep working with 3-channel input.
        in_channels = config.get("in_channels", 3)
        self.patch_embed = PatchEmbedding(config["image_size"], config["patch_size"], in_channels, embed_dim)
        # Reuse the count computed by the embedding layer so the two cannot drift.
        num_patches = self.patch_embed.num_patches
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        # One position per patch plus one for the class token.
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
        self.dropout = nn.Dropout(config["dropout"])

        self.encoder = nn.ModuleList([
            TransformerEncoder(embed_dim, config["num_heads"], config["mlp_dim"], config["dropout"])
            for _ in range(config["num_layers"])
        ])
        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, config["num_classes"])

    def forward(self, x):
        """Return class logits for the image batch ``x``."""
        B = x.size(0)
        x = self.patch_embed(x)
        # Broadcast the single learned class token across the batch.
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x = x + self.pos_embed
        x = self.dropout(x)
        for layer in self.encoder:
            x = layer(x)
        x = self.norm(x)
        # Classify from the class token, which sits at sequence index 0.
        return self.head(x[:, 0])

# ====================
# Helpers
# ====================
def train(model, loader, optimizer, criterion):
    """Run one optimisation pass over ``loader``.

    Args:
        model: Module to train; it is switched to training mode first.
        loader: Iterable of ``(images, labels)`` batches that supports ``len``.
        optimizer: Optimizer that updates the model parameters.
        criterion: Callable mapping ``(outputs, labels)`` to a scalar loss.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    loss_sum = 0.0
    batches = tqdm(loader, desc="Training", leave=False)
    for inputs, targets in batches:
        # Move the batch to the module-level ``device``.
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        loss_sum = loss_sum + batch_loss.item()
    return loss_sum / len(loader)

def evaluate(model, loader):
    """Measure top-1 accuracy of ``model`` over ``loader``.

    Args:
        model: Module to evaluate; it is switched to eval mode first.
        loader: Iterable of ``(images, labels)`` batches.

    Returns:
        Accuracy as a percentage (``100 * correct / total``).
    """
    model.eval()
    hits, seen = 0, 0
    with torch.no_grad():  # inference only, no gradients are needed
        for inputs, targets in tqdm(loader, desc="Evaluating", leave=False):
            inputs = inputs.to(device)
            targets = targets.to(device)
            # Index of the largest logit along dim 1 is the predicted class.
            predictions = model(inputs).max(dim=1).indices
            seen += targets.size(0)
            hits += predictions.eq(targets).sum().item()
    return 100 * hits / seen

def get_model_stats(model, input_size):
    """Return ``(parameter_count, flops)`` for ``model``.

    Args:
        model: Module to inspect.
        input_size: Input shape forwarded to torchinfo's ``summary``.

    Returns:
        A tuple of the total number of parameter elements and a FLOP
        estimate, taken as twice torchinfo's ``total_mult_adds``.
    """
    report = summary(model, input_size=input_size, verbose=0)
    param_count = 0
    for tensor in model.parameters():
        param_count += tensor.numel()
    # One multiply-accumulate is counted as two floating-point operations.
    flops = report.total_mult_adds * 2
    return param_count, flops

# ====================
# Run ViT Training
# ====================
# Echo the hyperparameters so the cell output records which config was run.
print("=== ViT Configuration ===")
print(config)

vit_model = VisionTransformer(config).to(device)
vit_criterion = nn.CrossEntropyLoss()
vit_optimizer = torch.optim.Adam(vit_model.parameters(), lr=config["learning_rate"])

# Each epoch time covers one training pass plus one full test-set evaluation.
vit_epoch_times = []
start_time = time.time()
for epoch in range(config["num_epochs"]):
    epoch_start = time.time()
    train_loss = train(vit_model, train_loader, vit_optimizer, vit_criterion)
    # vit_acc is overwritten every epoch; the summary reports the last value.
    vit_acc = evaluate(vit_model, test_loader)
    epoch_end = time.time()
    epoch_time = epoch_end - epoch_start
    vit_epoch_times.append(epoch_time)
    print(f"[ViT] Epoch {epoch+1}/{config['num_epochs']}: Loss={train_loss:.4f}, Test Accuracy={vit_acc:.2f}%, Time={epoch_time:.2f}s")
vit_total_time = time.time() - start_time

# Stats are gathered for a single image (batch size 1); the second value is
# the FLOP estimate returned by get_model_stats.
vit_params, vit_flops = get_model_stats(vit_model, input_size=(1, 3, config["image_size"], config["image_size"]))

# ====================
# Final Summary
# ====================
# Accuracy is the test accuracy after the final epoch; the last column is the
# mean of the per-epoch times collected above.
print("\n=== ViT Summary Table ===")
print(f"{'Model':<12} {'Params':>12} {'FLOPs':>15} {'Accuracy':>10} {'Total Time (s)':>15} {'Avg Epoch Time (s)':>20}")
print(f"{'ViT':<12} {vit_params:>12,} {vit_flops:>15,} {vit_acc:>10.2f} {vit_total_time:>15.2f} {sum(vit_epoch_times)/len(vit_epoch_times):>20.2f}")


In [None]:
#Config 3
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import time
from torch.utils.data import DataLoader
from torchinfo import summary
from tqdm import tqdm

# ====================
# Configuration
# ====================
# Hyperparameters for the Config 3 run. The ViT constructor, the data loaders
# and the training loop in this cell read their settings from this dict.
config = {
    "image_size": 32,
    "patch_size": 8,  # (32 // 8) ** 2 = 16 patches per image
    "embed_dim": 256,
    "num_heads": 2,
    "num_layers": 4,
    "mlp_dim": 512,
    "num_classes": 100,
    "num_epochs": 20,
    "batch_size": 64,
    "learning_rate": 0.001,
    "dropout": 0.1
}

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ====================
# Dataset
# ====================
# The same pipeline is applied to the train and test splits (no augmentation).
# NOTE(review): CIFAR-100 images are presumably already 32x32, which would make
# the Resize a no-op for this config — confirm.
transform = transforms.Compose([
    transforms.Resize((config["image_size"], config["image_size"])),
    transforms.ToTensor(),
    # Per-channel (x - 0.5) / 0.5.
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# download=True fetches the dataset into ./data when it is not already there.
train_dataset = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=transform)

# Only the training split is shuffled; the test order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=config["batch_size"], shuffle=False)

# ====================
# Vision Transformer Components
# ====================
class PatchEmbedding(nn.Module):
    """Turn an image batch into a sequence of patch embeddings.

    A convolution whose kernel size and stride both equal ``patch_size``
    projects every non-overlapping patch to an ``embed_dim``-wide vector.

    Args:
        image_size: Side length used to derive ``num_patches``.
        patch_size: Side length of one square patch.
        in_channels: Channel count of the input images.
        embed_dim: Output channels of the projection (token width).
    """

    def __init__(self, image_size, patch_size, in_channels, embed_dim):
        super().__init__()
        patches_per_side = image_size // patch_size
        self.num_patches = patches_per_side * patches_per_side
        self.proj = nn.Conv2d(
            in_channels,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x):
        """Project ``x`` and return the patches as a token sequence.

        The two spatial axes of the convolution output are merged into one,
        which is then swapped with the channel axis so that the embedding
        dimension comes last.
        """
        feature_map = self.proj(x)
        tokens = feature_map.flatten(start_dim=2)
        return tokens.permute(0, 2, 1)

class TransformerEncoder(nn.Module):
    """Pre-norm Transformer encoder block.

    Self-attention and a two-layer MLP are each applied to a layer-normalised
    copy of the input and added back through a residual connection.

    Args:
        embed_dim: Width of the token vectors entering and leaving the block.
        num_heads: Number of heads given to ``nn.MultiheadAttention``.
        mlp_dim: Hidden width of the feed-forward network.
        dropout: Dropout probability used inside attention and after each
            linear layer of the MLP.
    """

    def __init__(self, embed_dim, num_heads, mlp_dim, dropout):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        # batch_first=True: inputs are laid out as (batch, sequence, embed_dim).
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout, batch_first=True)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, mlp_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_dim, embed_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        """Apply the attention and MLP residual branches to ``x``.

        Args:
            x: Token tensor whose last dimension is ``embed_dim``.

        Returns:
            A tensor with the same shape as ``x``.
        """
        # Normalise once and reuse the result as query, key and value; the
        # previous code ran norm1 three times on the same input.
        normed = self.norm1(x)
        # The averaged attention map was always discarded, so skip computing it.
        attn_out, _ = self.attn(normed, normed, normed, need_weights=False)
        x = x + attn_out
        x = x + self.mlp(self.norm2(x))
        return x

class VisionTransformer(nn.Module):
    """Vision Transformer classifier built from a config dict.

    Images are split into patches, a learnable class token is prepended,
    learnable position embeddings are added, the sequence runs through a stack
    of ``TransformerEncoder`` blocks, and the normalised class token is fed to
    a linear classification head.

    Args:
        config: Mapping with the keys ``image_size``, ``patch_size``,
            ``embed_dim``, ``num_heads``, ``num_layers``, ``mlp_dim``,
            ``num_classes`` and ``dropout``. An optional ``in_channels`` key
            sets the number of input channels (defaults to 3).
    """

    def __init__(self, config):
        super().__init__()
        embed_dim = config["embed_dim"]
        # Optional key so existing configs keep working with 3-channel input.
        in_channels = config.get("in_channels", 3)
        self.patch_embed = PatchEmbedding(config["image_size"], config["patch_size"], in_channels, embed_dim)
        # Reuse the count computed by the embedding layer so the two cannot drift.
        num_patches = self.patch_embed.num_patches
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        # One position per patch plus one for the class token.
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
        self.dropout = nn.Dropout(config["dropout"])

        self.encoder = nn.ModuleList([
            TransformerEncoder(embed_dim, config["num_heads"], config["mlp_dim"], config["dropout"])
            for _ in range(config["num_layers"])
        ])
        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, config["num_classes"])

    def forward(self, x):
        """Return class logits for the image batch ``x``."""
        B = x.size(0)
        x = self.patch_embed(x)
        # Broadcast the single learned class token across the batch.
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x = x + self.pos_embed
        x = self.dropout(x)
        for layer in self.encoder:
            x = layer(x)
        x = self.norm(x)
        # Classify from the class token, which sits at sequence index 0.
        return self.head(x[:, 0])

# ====================
# Helpers
# ====================
def train(model, loader, optimizer, criterion):
    """Run one optimisation pass over ``loader``.

    Args:
        model: Module to train; it is switched to training mode first.
        loader: Iterable of ``(images, labels)`` batches that supports ``len``.
        optimizer: Optimizer that updates the model parameters.
        criterion: Callable mapping ``(outputs, labels)`` to a scalar loss.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    loss_sum = 0.0
    batches = tqdm(loader, desc="Training", leave=False)
    for inputs, targets in batches:
        # Move the batch to the module-level ``device``.
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        loss_sum = loss_sum + batch_loss.item()
    return loss_sum / len(loader)

def evaluate(model, loader):
    """Measure top-1 accuracy of ``model`` over ``loader``.

    Args:
        model: Module to evaluate; it is switched to eval mode first.
        loader: Iterable of ``(images, labels)`` batches.

    Returns:
        Accuracy as a percentage (``100 * correct / total``).
    """
    model.eval()
    hits, seen = 0, 0
    with torch.no_grad():  # inference only, no gradients are needed
        for inputs, targets in tqdm(loader, desc="Evaluating", leave=False):
            inputs = inputs.to(device)
            targets = targets.to(device)
            # Index of the largest logit along dim 1 is the predicted class.
            predictions = model(inputs).max(dim=1).indices
            seen += targets.size(0)
            hits += predictions.eq(targets).sum().item()
    return 100 * hits / seen

def get_model_stats(model, input_size):
    """Return ``(parameter_count, flops)`` for ``model``.

    Args:
        model: Module to inspect.
        input_size: Input shape forwarded to torchinfo's ``summary``.

    Returns:
        A tuple of the total number of parameter elements and a FLOP
        estimate, taken as twice torchinfo's ``total_mult_adds``.
    """
    report = summary(model, input_size=input_size, verbose=0)
    param_count = 0
    for tensor in model.parameters():
        param_count += tensor.numel()
    # One multiply-accumulate is counted as two floating-point operations.
    flops = report.total_mult_adds * 2
    return param_count, flops

# ====================
# Run ViT Training
# ====================
# Echo the hyperparameters so the cell output records which config was run.
print("=== ViT Configuration ===")
print(config)

vit_model = VisionTransformer(config).to(device)
vit_criterion = nn.CrossEntropyLoss()
vit_optimizer = torch.optim.Adam(vit_model.parameters(), lr=config["learning_rate"])

# Each epoch time covers one training pass plus one full test-set evaluation.
vit_epoch_times = []
start_time = time.time()
for epoch in range(config["num_epochs"]):
    epoch_start = time.time()
    train_loss = train(vit_model, train_loader, vit_optimizer, vit_criterion)
    # vit_acc is overwritten every epoch; the summary reports the last value.
    vit_acc = evaluate(vit_model, test_loader)
    epoch_end = time.time()
    epoch_time = epoch_end - epoch_start
    vit_epoch_times.append(epoch_time)
    print(f"[ViT] Epoch {epoch+1}/{config['num_epochs']}: Loss={train_loss:.4f}, Test Accuracy={vit_acc:.2f}%, Time={epoch_time:.2f}s")
vit_total_time = time.time() - start_time

# Stats are gathered for a single image (batch size 1); the second value is
# the FLOP estimate returned by get_model_stats.
vit_params, vit_flops = get_model_stats(vit_model, input_size=(1, 3, config["image_size"], config["image_size"]))

# ====================
# Final Summary
# ====================
# Accuracy is the test accuracy after the final epoch; the last column is the
# mean of the per-epoch times collected above.
print("\n=== ViT Summary Table ===")
print(f"{'Model':<12} {'Params':>12} {'FLOPs':>15} {'Accuracy':>10} {'Total Time (s)':>15} {'Avg Epoch Time (s)':>20}")
print(f"{'ViT':<12} {vit_params:>12,} {vit_flops:>15,} {vit_acc:>10.2f} {vit_total_time:>15.2f} {sum(vit_epoch_times)/len(vit_epoch_times):>20.2f}")


In [None]:
#Config 4
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import time
from torch.utils.data import DataLoader
from torchinfo import summary
from tqdm import tqdm

# ====================
# Configuration
# ====================
# Hyperparameters for the Config 4 run. The ViT constructor, the data loaders
# and the training loop in this cell read their settings from this dict.
# NOTE(review): the recorded output below this cell prints 'num_epochs': 10,
# so it was presumably produced before this value was changed to 20 — rerun
# or reconcile before quoting those numbers.
config = {
    "image_size": 32,
    "patch_size": 8,  # (32 // 8) ** 2 = 16 patches per image
    "embed_dim": 256,
    "num_heads": 4,
    "num_layers": 8,
    # NOTE(review): 1028 is an unusual width; 1024 (4 x embed_dim) may have
    # been intended. The recorded run also used 1028 — confirm before changing.
    "mlp_dim": 1028,
    "num_classes": 100,
    "num_epochs": 20,
    "batch_size": 64,
    "learning_rate": 0.001,
    "dropout": 0.1
}

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ====================
# Dataset
# ====================
# The same pipeline is applied to the train and test splits (no augmentation).
# NOTE(review): CIFAR-100 images are presumably already 32x32, which would make
# the Resize a no-op for this config — confirm.
transform = transforms.Compose([
    transforms.Resize((config["image_size"], config["image_size"])),
    transforms.ToTensor(),
    # Per-channel (x - 0.5) / 0.5.
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# download=True fetches the dataset into ./data when it is not already there.
train_dataset = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=transform)

# Only the training split is shuffled; the test order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=config["batch_size"], shuffle=False)

# ====================
# Vision Transformer Components
# ====================
class PatchEmbedding(nn.Module):
    """Turn an image batch into a sequence of patch embeddings.

    A convolution whose kernel size and stride both equal ``patch_size``
    projects every non-overlapping patch to an ``embed_dim``-wide vector.

    Args:
        image_size: Side length used to derive ``num_patches``.
        patch_size: Side length of one square patch.
        in_channels: Channel count of the input images.
        embed_dim: Output channels of the projection (token width).
    """

    def __init__(self, image_size, patch_size, in_channels, embed_dim):
        super().__init__()
        patches_per_side = image_size // patch_size
        self.num_patches = patches_per_side * patches_per_side
        self.proj = nn.Conv2d(
            in_channels,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x):
        """Project ``x`` and return the patches as a token sequence.

        The two spatial axes of the convolution output are merged into one,
        which is then swapped with the channel axis so that the embedding
        dimension comes last.
        """
        feature_map = self.proj(x)
        tokens = feature_map.flatten(start_dim=2)
        return tokens.permute(0, 2, 1)

class TransformerEncoder(nn.Module):
    """Pre-norm Transformer encoder block.

    Self-attention and a two-layer MLP are each applied to a layer-normalised
    copy of the input and added back through a residual connection.

    Args:
        embed_dim: Width of the token vectors entering and leaving the block.
        num_heads: Number of heads given to ``nn.MultiheadAttention``.
        mlp_dim: Hidden width of the feed-forward network.
        dropout: Dropout probability used inside attention and after each
            linear layer of the MLP.
    """

    def __init__(self, embed_dim, num_heads, mlp_dim, dropout):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        # batch_first=True: inputs are laid out as (batch, sequence, embed_dim).
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout, batch_first=True)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, mlp_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_dim, embed_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        """Apply the attention and MLP residual branches to ``x``.

        Args:
            x: Token tensor whose last dimension is ``embed_dim``.

        Returns:
            A tensor with the same shape as ``x``.
        """
        # Normalise once and reuse the result as query, key and value; the
        # previous code ran norm1 three times on the same input.
        normed = self.norm1(x)
        # The averaged attention map was always discarded, so skip computing it.
        attn_out, _ = self.attn(normed, normed, normed, need_weights=False)
        x = x + attn_out
        x = x + self.mlp(self.norm2(x))
        return x

class VisionTransformer(nn.Module):
    """Vision Transformer classifier built from a config dict.

    Images are split into patches, a learnable class token is prepended,
    learnable position embeddings are added, the sequence runs through a stack
    of ``TransformerEncoder`` blocks, and the normalised class token is fed to
    a linear classification head.

    Args:
        config: Mapping with the keys ``image_size``, ``patch_size``,
            ``embed_dim``, ``num_heads``, ``num_layers``, ``mlp_dim``,
            ``num_classes`` and ``dropout``. An optional ``in_channels`` key
            sets the number of input channels (defaults to 3).
    """

    def __init__(self, config):
        super().__init__()
        embed_dim = config["embed_dim"]
        # Optional key so existing configs keep working with 3-channel input.
        in_channels = config.get("in_channels", 3)
        self.patch_embed = PatchEmbedding(config["image_size"], config["patch_size"], in_channels, embed_dim)
        # Reuse the count computed by the embedding layer so the two cannot drift.
        num_patches = self.patch_embed.num_patches
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        # One position per patch plus one for the class token.
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
        self.dropout = nn.Dropout(config["dropout"])

        self.encoder = nn.ModuleList([
            TransformerEncoder(embed_dim, config["num_heads"], config["mlp_dim"], config["dropout"])
            for _ in range(config["num_layers"])
        ])
        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, config["num_classes"])

    def forward(self, x):
        """Return class logits for the image batch ``x``."""
        B = x.size(0)
        x = self.patch_embed(x)
        # Broadcast the single learned class token across the batch.
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x = x + self.pos_embed
        x = self.dropout(x)
        for layer in self.encoder:
            x = layer(x)
        x = self.norm(x)
        # Classify from the class token, which sits at sequence index 0.
        return self.head(x[:, 0])

# ====================
# Helpers
# ====================
def train(model, loader, optimizer, criterion):
    """Run one optimisation pass over ``loader``.

    Args:
        model: Module to train; it is switched to training mode first.
        loader: Iterable of ``(images, labels)`` batches that supports ``len``.
        optimizer: Optimizer that updates the model parameters.
        criterion: Callable mapping ``(outputs, labels)`` to a scalar loss.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    loss_sum = 0.0
    batches = tqdm(loader, desc="Training", leave=False)
    for inputs, targets in batches:
        # Move the batch to the module-level ``device``.
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        loss_sum = loss_sum + batch_loss.item()
    return loss_sum / len(loader)

def evaluate(model, loader):
    """Measure top-1 accuracy of ``model`` over ``loader``.

    Args:
        model: Module to evaluate; it is switched to eval mode first.
        loader: Iterable of ``(images, labels)`` batches.

    Returns:
        Accuracy as a percentage (``100 * correct / total``).
    """
    model.eval()
    hits, seen = 0, 0
    with torch.no_grad():  # inference only, no gradients are needed
        for inputs, targets in tqdm(loader, desc="Evaluating", leave=False):
            inputs = inputs.to(device)
            targets = targets.to(device)
            # Index of the largest logit along dim 1 is the predicted class.
            predictions = model(inputs).max(dim=1).indices
            seen += targets.size(0)
            hits += predictions.eq(targets).sum().item()
    return 100 * hits / seen

def get_model_stats(model, input_size):
    """Return ``(parameter_count, flops)`` for ``model``.

    Args:
        model: Module to inspect.
        input_size: Input shape forwarded to torchinfo's ``summary``.

    Returns:
        A tuple of the total number of parameter elements and a FLOP
        estimate, taken as twice torchinfo's ``total_mult_adds``.
    """
    report = summary(model, input_size=input_size, verbose=0)
    param_count = 0
    for tensor in model.parameters():
        param_count += tensor.numel()
    # One multiply-accumulate is counted as two floating-point operations.
    flops = report.total_mult_adds * 2
    return param_count, flops

# ====================
# Run ViT Training
# ====================
# Echo the hyperparameters so the cell output records which config was run.
print("=== ViT Configuration ===")
print(config)

vit_model = VisionTransformer(config).to(device)
vit_criterion = nn.CrossEntropyLoss()
vit_optimizer = torch.optim.Adam(vit_model.parameters(), lr=config["learning_rate"])

# Each epoch time covers one training pass plus one full test-set evaluation.
vit_epoch_times = []
start_time = time.time()
for epoch in range(config["num_epochs"]):
    epoch_start = time.time()
    train_loss = train(vit_model, train_loader, vit_optimizer, vit_criterion)
    # vit_acc is overwritten every epoch; the summary reports the last value.
    vit_acc = evaluate(vit_model, test_loader)
    epoch_end = time.time()
    epoch_time = epoch_end - epoch_start
    vit_epoch_times.append(epoch_time)
    print(f"[ViT] Epoch {epoch+1}/{config['num_epochs']}: Loss={train_loss:.4f}, Test Accuracy={vit_acc:.2f}%, Time={epoch_time:.2f}s")
vit_total_time = time.time() - start_time

# Stats are gathered for a single image (batch size 1); the second value is
# the FLOP estimate returned by get_model_stats.
vit_params, vit_flops = get_model_stats(vit_model, input_size=(1, 3, config["image_size"], config["image_size"]))

# ====================
# Final Summary
# ====================
# Accuracy is the test accuracy after the final epoch; the last column is the
# mean of the per-epoch times collected above.
# NOTE(review): the recorded output below this cell has a different title and
# column set (it includes a MACs column), so it presumably came from an
# earlier version of these print statements — rerun to refresh.
print("\n=== ViT Summary Table ===")
print(f"{'Model':<12} {'Params':>12} {'FLOPs':>15} {'Accuracy':>10} {'Total Time (s)':>15} {'Avg Epoch Time (s)':>20}")
print(f"{'ViT':<12} {vit_params:>12,} {vit_flops:>15,} {vit_acc:>10.2f} {vit_total_time:>15.2f} {sum(vit_epoch_times)/len(vit_epoch_times):>20.2f}")


=== ViT Configuration ===
{'image_size': 32, 'patch_size': 8, 'embed_dim': 256, 'num_heads': 4, 'num_layers': 8, 'mlp_dim': 1028, 'num_classes': 100, 'num_epochs': 10, 'batch_size': 64, 'learning_rate': 0.001, 'dropout': 0.1}




[ViT] Epoch 1/10: Loss=4.1574, Test Accuracy=8.49%




[ViT] Epoch 2/10: Loss=3.9020, Test Accuracy=10.07%




[ViT] Epoch 3/10: Loss=3.8231, Test Accuracy=10.11%




[ViT] Epoch 4/10: Loss=3.8958, Test Accuracy=10.72%




[ViT] Epoch 5/10: Loss=3.8179, Test Accuracy=10.64%




[ViT] Epoch 6/10: Loss=3.9201, Test Accuracy=10.32%




[ViT] Epoch 7/10: Loss=3.8962, Test Accuracy=9.91%




[ViT] Epoch 8/10: Loss=3.8572, Test Accuracy=11.45%




[ViT] Epoch 9/10: Loss=3.8770, Test Accuracy=9.99%


                                                             

[ViT] Epoch 10/10: Loss=3.9540, Test Accuracy=9.93%

=== Summary Table ===
Model              Params            MACs           FLOPs   Accuracy   Time (s)
ViT             6,414,724       5,054,084      10,108,168       9.93     369.00




In [None]:
# Problem 2: train a new classification head on a pretrained Swin-Tiny for CIFAR-100
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from transformers import SwinForImageClassification, SwinConfig, AutoImageProcessor
from tqdm import tqdm
import time

# Device configuration: use the GPU when PyTorch can see one, else the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters
num_epochs = 5
batch_size = 32
learning_rate = 2e-5
image_size = 224  # Swin expects 224x224
num_classes = 100

# Load the image processor of the pretrained checkpoint; in this cell it is
# only used as the source of the normalisation mean and std below.
processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")

# Data transforms for Swin: resize to 224x224, convert to a tensor, then
# normalise with the checkpoint's own statistics.
transform = transforms.Compose([
    transforms.Resize((image_size, image_size)),
    transforms.ToTensor(),
    transforms.Normalize(mean=processor.image_mean, std=processor.image_std)
])

# CIFAR-100 dataset (downloaded into ./data when not already present)
train_dataset = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=transform)

# Only the training split is shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Load pretrained Swin Transformer (Tiny). num_labels=100 differs from the
# checkpoint's 1000-class head, so ignore_mismatched_sizes=True lets the
# classifier be created fresh (see the warning in the recorded output).
model = SwinForImageClassification.from_pretrained(
    "microsoft/swin-tiny-patch4-window7-224",
    num_labels=num_classes,
    ignore_mismatched_sizes=True
).to(device)

# Freeze backbone: no parameter under model.swin receives gradients.
for param in model.swin.parameters():
    param.requires_grad = False

# Only train classification head
# NOTE(review): the classifier is presumably not part of model.swin, in which
# case its parameters were never frozen and this loop is a no-op — confirm.
for param in model.classifier.parameters():
    param.requires_grad = True

# Loss and optimizer: Adam is given the classifier parameters only.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.classifier.parameters(), lr=learning_rate)

# Training function
def train():
    """Train the module-level ``model`` for ``num_epochs`` epochs.

    Reads ``model``, ``train_loader``, ``criterion``, ``optimizer``,
    ``device`` and ``num_epochs`` from module scope. After every epoch it
    prints the elapsed wall-clock time and the mean per-batch loss.

    NOTE(review): ``model.train()`` puts the whole model, including the frozen
    backbone, in training mode; any dropout-style layers inside the backbone
    would therefore stay active — confirm whether that is intended.
    """
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        start_time = time.time()
        progress_bar = tqdm(train_loader, desc=f'Epoch [{epoch+1}/{num_epochs}]')
        for images, labels in progress_bar:
            images, labels = images.to(device), labels.to(device)

            # Forward pass: the Hugging Face model returns an object whose
            # ``logits`` attribute holds the class scores.
            loss = criterion(model(images).logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Read the scalar once and use it for both the running total and
            # the progress-bar readout.
            batch_loss = loss.item()
            running_loss += batch_loss
            progress_bar.set_postfix({'loss': batch_loss})

        end_time = time.time()
        print(f"Epoch [{epoch+1}] Training Time: {end_time - start_time:.2f}s | Loss: {running_loss / len(train_loader):.4f}")

# Testing function
def test():
    """Evaluate the module-level ``model`` on ``test_loader`` and print top-1 accuracy.

    Relies on the module-level ``model``, ``test_loader`` and ``device``.

    Returns:
        float: accuracy in percent (0.0 when the loader yields no samples).
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in tqdm(test_loader, desc='Testing'):
            images = images.to(device)
            labels = labels.to(device)
            # argmax over the class dimension; no need for the legacy
            # ``.data`` attribute since gradients are already disabled.
            predicted = model(images).logits.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # An empty loader would otherwise raise ZeroDivisionError.
    accuracy = 100 * correct / total if total else 0.0
    print(f'Test Accuracy: {accuracy:.2f}%')
    return accuracy

# Run
def main():
    """Train the Swin-Tiny classification head, then print its test accuracy."""
    print("\nFine-tuning pretrained Swin Transformer (Tiny) on CIFAR-100...")
    train()
    print("\nEvaluating fine-tuned model...")
    test()


if __name__ == '__main__':
    main()


Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([100]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([100, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Fine-tuning pretrained Swin Transformer (Tiny) on CIFAR-100...


Epoch [1/5]: 100%|██████████| 1563/1563 [04:53<00:00,  5.32it/s, loss=3.54]


Epoch [1] Training Time: 293.80s | Loss: 4.0361


Epoch [2/5]: 100%|██████████| 1563/1563 [04:53<00:00,  5.33it/s, loss=2.17]


Epoch [2] Training Time: 293.36s | Loss: 3.0449


Epoch [3/5]: 100%|██████████| 1563/1563 [04:53<00:00,  5.32it/s, loss=1.22]


Epoch [3] Training Time: 293.73s | Loss: 2.3686


Epoch [4/5]: 100%|██████████| 1563/1563 [04:52<00:00,  5.35it/s, loss=1.48]


Epoch [4] Training Time: 292.22s | Loss: 1.9390


Epoch [5/5]: 100%|██████████| 1563/1563 [04:53<00:00,  5.33it/s, loss=1.66]


Epoch [5] Training Time: 293.30s | Loss: 1.6699

Evaluating fine-tuned model...


Testing: 100%|██████████| 313/313 [00:57<00:00,  5.42it/s]

Test Accuracy: 66.58%





In [None]:
#problem 2 small
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from transformers import SwinForImageClassification, AutoImageProcessor
from tqdm import tqdm
import time

# Device configuration: use the GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Hyperparameters
num_epochs = 5          # passes over the training split
batch_size = 32         # samples per batch for both loaders
learning_rate = 2e-5    # Adam step size for the classification head
image_size = 224        # side length images are resized to
num_classes = 100       # CIFAR-100 label count

# Hub checkpoint shared by the image processor and the classifier weights.
swin_checkpoint = "microsoft/swin-small-patch4-window7-224"
processor = AutoImageProcessor.from_pretrained(swin_checkpoint)

# Preprocessing pipeline: resize to a square of side image_size, convert to
# a tensor, then normalize with the mean/std exposed by the image processor.
preprocessing_steps = [
    transforms.Resize((image_size, image_size)),
    transforms.ToTensor(),
    transforms.Normalize(mean=processor.image_mean, std=processor.image_std),
]
transform = transforms.Compose(preprocessing_steps)

# CIFAR-100 train/test splits stored under ./data; both use the same transform.
cifar100_kwargs = dict(root='./data', download=True, transform=transform)
train_dataset = torchvision.datasets.CIFAR100(train=True, **cifar100_kwargs)
test_dataset = torchvision.datasets.CIFAR100(train=False, **cifar100_kwargs)

# Only the training split is shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Pretrained Swin Transformer (Small) with a num_classes-way head.
# ignore_mismatched_sizes=True lets the checkpoint load even when its
# classifier shape differs from the requested number of labels.
model = SwinForImageClassification.from_pretrained(
    swin_checkpoint,
    num_labels=num_classes,
    ignore_mismatched_sizes=True,
)
model = model.to(device)

# Freeze the backbone and leave only the classification head trainable.
model.swin.requires_grad_(False)
model.classifier.requires_grad_(True)

# Cross-entropy loss; Adam updates the classifier parameters only.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.classifier.parameters(), lr=learning_rate)

# Training function
def train():
    """Train the module-level ``model`` on ``train_loader`` for ``num_epochs`` epochs.

    Relies on the module-level ``model``, ``criterion``, ``optimizer`` and
    ``device``. After every epoch the wall-clock training time and the mean
    batch loss are printed.

    Returns:
        list[float]: the mean training loss of each epoch, in order.
    """
    model.train()
    # Sub-modules whose parameters are all frozen act as fixed feature
    # extractors; keep them in eval mode so that any stochastic layers they
    # contain (dropout / drop-path) do not add noise to the features.
    for child in model.children():
        child_params = list(child.parameters())
        if child_params and not any(p.requires_grad for p in child_params):
            child.eval()

    # Guard against an empty loader so the per-epoch average never divides by zero.
    num_batches = max(len(train_loader), 1)
    epoch_losses = []
    for epoch in range(num_epochs):
        running_loss = 0.0
        start_time = time.time()
        progress_bar = tqdm(train_loader, desc=f'Epoch [{epoch+1}/{num_epochs}]')
        for images, labels in progress_bar:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images).logits
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Read the scalar once: every .item() call forces a host sync.
            batch_loss = loss.item()
            running_loss += batch_loss
            progress_bar.set_postfix({'loss': batch_loss})

        end_time = time.time()
        mean_loss = running_loss / num_batches
        epoch_losses.append(mean_loss)
        print(f"Epoch [{epoch+1}] Training Time: {end_time - start_time:.2f}s | Loss: {mean_loss:.4f}")

    return epoch_losses

# Testing function
def test():
    """Evaluate the module-level ``model`` on ``test_loader`` and print top-1 accuracy.

    Relies on the module-level ``model``, ``test_loader`` and ``device``.

    Returns:
        float: accuracy in percent (0.0 when the loader yields no samples).
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in tqdm(test_loader, desc='Testing'):
            images = images.to(device)
            labels = labels.to(device)
            # argmax over the class dimension; no need for the legacy
            # ``.data`` attribute since gradients are already disabled.
            predicted = model(images).logits.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # An empty loader would otherwise raise ZeroDivisionError.
    accuracy = 100 * correct / total if total else 0.0
    print(f'Test Accuracy: {accuracy:.2f}%')
    return accuracy

# Run
def main():
    """Train the Swin-Small classification head, then print its test accuracy."""
    print("\nFine-tuning pretrained Swin Transformer (Small) on CIFAR-100...")
    train()
    print("\nEvaluating fine-tuned model...")
    test()


if __name__ == '__main__':
    main()
