<a href="https://colab.research.google.com/github/anmol9910/computer-vision-assessment-iisc/blob/main/q1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:


# Core PyTorch libraries for building and training models
import torch
import torch.nn as nn
import torch.optim as optim

# Torchvision for datasets and image transformations
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

# TQDM for a clean progress bar during training
from tqdm import tqdm
import os # For handling file paths (like saving models)

# - Device Configuration -
# Prefer a CUDA GPU whenever PyTorch reports one; otherwise run on the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
COMPUTE_DEVICE = torch.device(_device_name)
print(f"Device configured for training: {COMPUTE_DEVICE}")

Device configured for training: cuda


In [None]:


# - Training Configuration -
NUM_EPOCHS = 15          # Number of full passes over the training set
TRAIN_BATCH_SIZE = 256   # Samples per batch in the training DataLoader
VALID_BATCH_SIZE = 256   # Samples per batch in the evaluation DataLoader
INITIAL_LR = 3e-4        # Starting learning rate given to AdamW
WEIGHT_DECAY = 1e-4      # Weight-decay coefficient given to AdamW

# - Model Architecture Configuration -
# A reduced-size ViT for 32x32 RGB inputs with 10 output classes (CIFAR-10).
INPUT_CHANNELS = 3
IMG_DIM = 32
PATCH_DIM = 4            # 32 / 4 = 8 patches per side -> 64 patches per image
NUM_CLASSES_OUTPUT = 10

EMBEDDING_DIM = 384                 # d_model: width of every token embedding
NUM_ATTENTION_HEADS = 6             # Heads in each multi-head attention layer
TRANSFORMER_DEPTH = 7               # Number of stacked encoder blocks
MLP_HIDDEN_DIM = 4 * EMBEDDING_DIM  # Hidden width of the encoder MLP

# - Dropout Rates -
ATTN_DROPOUT_RATE = 0.1
MLP_DROPOUT_RATE = 0.1

In [None]:
# Mount Google Drive at /content/drive. The checkpoint path used later in this
# notebook points under this mount point, so this cell must run before training.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Cell 3: Data Preparation and Augmentation

# --- Per-channel normalisation statistics shared by both pipelines ---
_CHANNEL_MEAN = (0.4914, 0.4822, 0.4465)
_CHANNEL_STD = (0.2470, 0.2435, 0.2616)

# --- Transform pipelines ---
# Training: reflect-padded random crop and random horizontal flip, followed by
# tensor conversion and per-channel normalisation.
training_augmentations = transforms.Compose([
    transforms.RandomCrop(IMG_DIM, padding=4, padding_mode='reflect'),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(_CHANNEL_MEAN, _CHANNEL_STD),
])

# Evaluation: no random augmentation, only tensor conversion and normalisation.
validation_augmentations = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(_CHANNEL_MEAN, _CHANNEL_STD),
])

# --- Datasets ---
# Both splits are stored under ./data and downloaded on first use.
train_data = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=training_augmentations
)
test_data = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=validation_augmentations
)

# --- DataLoaders ---
# Only the training loader shuffles; worker count and memory pinning are shared.
_loader_options = dict(num_workers=2, pin_memory=True)
train_dataloader = DataLoader(train_data, batch_size=TRAIN_BATCH_SIZE, shuffle=True, **_loader_options)
test_dataloader = DataLoader(test_data, batch_size=VALID_BATCH_SIZE, shuffle=False, **_loader_options)

print(f"Data Loaded: {len(train_data)} training samples and {len(test_data)} test samples.")

Data Loaded: 50000 training samples and 10000 test samples.


In [None]:
# Cell 4: Image to Patches Module

# This class handles the initial patching and embedding of the input image.
class ImageToPatches(nn.Module):
    """Cut a square image into non-overlapping patches and embed each patch.

    A single convolution whose kernel size and stride both equal the patch size
    performs the patch extraction and the linear projection in one step.

    Args:
        image_size: Side length of the square input image, in pixels.
        patch_size: Side length of each square patch; must divide ``image_size``.
        in_channels: Number of channels in the input image.
        embedding_dim: Size of the embedding produced for every patch.

    Raises:
        ValueError: If ``image_size`` is not a multiple of ``patch_size``.
    """

    def __init__(self, image_size, patch_size, in_channels, embedding_dim):
        super().__init__()
        patches_per_side, leftover = divmod(image_size, patch_size)
        if leftover:
            raise ValueError("Image dimensions must be divisible by the patch size.")

        # Patches tile the image on a square grid.
        self.num_patches = patches_per_side * patches_per_side

        # kernel_size == stride means every patch is seen exactly once.
        self.patch_projection_layer = nn.Conv2d(
            in_channels=in_channels,
            out_channels=embedding_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, image_tensor):
        """Map images ``[B, C, H, W]`` to patch embeddings ``[B, Num_Patches, D]``."""
        feature_map = self.patch_projection_layer(image_tensor)  # [B, D, H/P, W/P]
        flat_patches = feature_map.flatten(start_dim=2)          # [B, D, Num_Patches]
        return flat_patches.transpose(1, 2)                      # [B, Num_Patches, D]

In [None]:
# Cell 5: Transformer Encoder Block Module

# This class implements a single block of the Transformer Encoder.
class ViTEncoderBlock(nn.Module):
    """One pre-norm Transformer encoder block: self-attention followed by an MLP.

    Each sub-layer runs on a LayerNorm-ed copy of its input, and its output is
    added back to the un-normalised input (residual connection).

    Args:
        embedding_dim: Size of each token embedding (input and output width).
        num_attention_heads: Number of heads given to ``nn.MultiheadAttention``.
        mlp_hidden_dim: Width of the hidden layer inside the feed-forward MLP.
        drop_rate_attn: Dropout probability given to ``nn.MultiheadAttention``.
        drop_rate_mlp: Dropout probability applied after the GELU and after the
            MLP's output projection.
    """

    def __init__(self, embedding_dim, num_attention_heads, mlp_hidden_dim, drop_rate_attn=0.1, drop_rate_mlp=0.1):
        super().__init__()
        # --- Self-Attention Mechanism ---
        self.pre_attn_norm = nn.LayerNorm(embedding_dim)
        # batch_first=True: inputs and outputs are laid out as [B, Tokens, D].
        self.attention = nn.MultiheadAttention(embedding_dim, num_attention_heads, dropout=drop_rate_attn, batch_first=True)

        # --- Feed-Forward MLP ---
        self.post_attn_norm = nn.LayerNorm(embedding_dim)
        self.feed_forward = nn.Sequential(
            nn.Linear(embedding_dim, mlp_hidden_dim),
            nn.GELU(),
            nn.Dropout(drop_rate_mlp),
            nn.Linear(mlp_hidden_dim, embedding_dim),
            nn.Dropout(drop_rate_mlp)
        )

    def forward(self, input_tensor):
        """Apply the block to a token sequence.

        Args:
            input_tensor: Token embeddings laid out as ``[B, Tokens, D]``.

        Returns:
            A tensor with the same shape as ``input_tensor``.
        """
        # First residual connection (around attention).
        norm_input = self.pre_attn_norm(input_tensor)
        # The attention map is never used, so skip computing the head-averaged
        # weights; this also allows PyTorch to pick its optimized attention path.
        attention_output, _ = self.attention(
            norm_input, norm_input, norm_input, need_weights=False
        )
        residual1 = input_tensor + attention_output

        # Second residual connection (around the MLP).
        norm_residual1 = self.post_attn_norm(residual1)
        mlp_output = self.feed_forward(norm_residual1)
        return residual1 + mlp_output

In [None]:
# Cell 6: Complete Vision Transformer Model

# This class assembles all the components into the final Vision Transformer model.
class MyVisionTransformer(nn.Module):
    """Vision Transformer classifier built from the notebook's patching and encoder modules.

    Pipeline: patch embedding -> prepend a learnable CLS token -> add learnable
    positional embeddings -> dropout -> stack of encoder blocks -> LayerNorm on
    the CLS position -> linear classification head.

    Args:
        image_size: Side length of the square input image.
        patch_size: Side length of each square patch.
        in_channels: Number of input image channels.
        num_classes: Number of logits produced by the classification head.
        embedding_dim: Width of every token embedding.
        depth: Number of encoder blocks in the stack.
        num_heads: Attention heads per encoder block.
        mlp_dim: Hidden width of the MLP inside each encoder block.
        dropout_rates: Mapping with keys ``'attn'`` and ``'mlp'``; the ``'mlp'``
            rate is also used for the dropout applied after the positional
            embeddings.
    """

    def __init__(self, image_size, patch_size, in_channels, num_classes,
                 embedding_dim, depth, num_heads, mlp_dim, dropout_rates):
        super().__init__()

        # Patch extraction + linear projection
        self.patch_creator = ImageToPatches(image_size, patch_size, in_channels, embedding_dim)
        sequence_length = self.patch_creator.num_patches + 1  # +1 slot for the CLS token

        # Learnable CLS token, broadcast over the batch in forward()
        self.cls_token = nn.Parameter(torch.randn(1, 1, embedding_dim))

        # One learnable position vector per token (CLS included)
        self.positional_embeddings = nn.Parameter(torch.randn(1, sequence_length, embedding_dim))
        self.pos_dropout = nn.Dropout(p=dropout_rates['mlp'])

        # Encoder stack
        self.encoder_layers = nn.ModuleList(
            ViTEncoderBlock(
                embedding_dim,
                num_heads,
                mlp_dim,
                dropout_rates['attn'],
                dropout_rates['mlp'],
            )
            for _ in range(depth)
        )

        # Classification head applied to the normalised CLS output
        self.final_norm = nn.LayerNorm(embedding_dim)
        self.classifier_head = nn.Linear(embedding_dim, num_classes)

    def forward(self, image):
        """Return class logits ``[B, num_classes]`` for a batch of images ``[B, C, H, W]``."""
        # Embed the patches and put one CLS token in front of each sequence.
        patch_tokens = self.patch_creator(image)
        cls_per_sample = self.cls_token.expand(image.shape[0], -1, -1)
        sequence = torch.cat((cls_per_sample, patch_tokens), dim=1)

        # Inject position information, then regularise with dropout.
        sequence = self.pos_dropout(sequence + self.positional_embeddings)

        # Run the encoder stack in order.
        for block in self.encoder_layers:
            sequence = block(sequence)

        # Classify from the CLS position only (index 0 along the token axis).
        cls_summary = self.final_norm(sequence[:, 0])
        return self.classifier_head(cls_summary)

In [None]:
# Cell 7: Model Initialization and Sanity Check

dropout_config = {'attn': ATTN_DROPOUT_RATE, 'mlp': MLP_DROPOUT_RATE}

# Instantiate the model with our defined hyperparameters
vit_model = MyVisionTransformer(
    image_size=IMG_DIM,
    patch_size=PATCH_DIM,
    in_channels=INPUT_CHANNELS,
    num_classes=NUM_CLASSES_OUTPUT,
    embedding_dim=EMBEDDING_DIM,
    depth=TRANSFORMER_DEPTH,
    num_heads=NUM_ATTENTION_HEADS,
    mlp_dim=MLP_HIDDEN_DIM,
    dropout_rates=dropout_config
).to(COMPUTE_DEVICE)

# --- Sanity Check ---
# Push a random batch through the model and verify the logits shape, so the
# "passed" message below is only printed when the check really holds.
dummy_tensor = torch.randn(5, INPUT_CHANNELS, IMG_DIM, IMG_DIM).to(COMPUTE_DEVICE)
with torch.no_grad():  # shape probe only; no autograd graph is needed
    prediction = vit_model(dummy_tensor)

expected_shape = (dummy_tensor.shape[0], NUM_CLASSES_OUTPUT)
assert tuple(prediction.shape) == expected_shape, (
    f"Unexpected output shape {tuple(prediction.shape)}; expected {expected_shape}"
)
print(f"Sanity Check Passed! Output shape for a batch of 5: {prediction.shape}") # Should be [5, 10]

Sanity Check Passed! Output shape for a batch of 5: torch.Size([5, 10])


In [None]:
# Cell 8: Loss, Optimizer, and Scheduler

# Multi-class classification objective, computed directly on the model's logits
loss_function = nn.CrossEntropyLoss()

# AdamW over every model parameter, using the learning rate and weight decay
# from the configuration cell
model_optimizer = optim.AdamW(
    params=vit_model.parameters(),
    lr=INITIAL_LR,
    weight_decay=WEIGHT_DECAY,
)

# Cosine learning-rate schedule with T_max equal to the number of epochs;
# the training loop steps it once per epoch
lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(
    optimizer=model_optimizer,
    T_max=NUM_EPOCHS,
)

In [None]:
# Cell 9: Training and Evaluation Loop (with Best Model Saving)

best_accuracy = 0.0
MODEL_SAVE_PATH = "/content/drive/MyDrive/best_vit_cifar10.pth"

# Fail fast: without this check a missing checkpoint folder (e.g. Drive not
# mounted) is only discovered by torch.save after a full epoch of training.
_checkpoint_dir = os.path.dirname(MODEL_SAVE_PATH)
if not os.path.isdir(_checkpoint_dir):
    raise FileNotFoundError(
        f"Checkpoint directory '{_checkpoint_dir}' does not exist; mount Drive or change MODEL_SAVE_PATH."
    )

# NOTE(review): the "best" checkpoint is chosen by accuracy on test_dataloader,
# so the test split doubles as the validation set and the reported best
# accuracy is optimistically biased. TODO: hold out a validation split from
# the training data for model selection.
for epoch in range(NUM_EPOCHS):
    # --- Training Phase ---
    vit_model.train()
    total_train_loss = 0.0
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS} [Training]")

    for images, labels in progress_bar:
        # The loaders use pinned memory, so the host-to-device copies can be
        # issued asynchronously.
        images = images.to(COMPUTE_DEVICE, non_blocking=True)
        labels = labels.to(COMPUTE_DEVICE, non_blocking=True)

        model_optimizer.zero_grad()
        outputs = vit_model(images)
        loss = loss_function(outputs, labels)
        loss.backward()
        model_optimizer.step()

        # Read the scalar once: every .item() call forces a device sync.
        batch_loss = loss.item()
        total_train_loss += batch_loss
        progress_bar.set_postfix(loss=batch_loss)

    # --- Validation Phase ---
    vit_model.eval()
    correct_predictions = 0
    total_samples = 0
    with torch.no_grad():
        for images, labels in test_dataloader:
            images = images.to(COMPUTE_DEVICE, non_blocking=True)
            labels = labels.to(COMPUTE_DEVICE, non_blocking=True)
            outputs = vit_model(images)
            predicted_class = outputs.argmax(dim=1)  # index of the largest logit
            total_samples += labels.size(0)
            correct_predictions += (predicted_class == labels).sum().item()

    current_accuracy = 100 * correct_predictions / total_samples

    # Print epoch summary (loss is the mean of the per-batch mean losses)
    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} Summary: Avg Train Loss: {avg_train_loss:.4f} | Test Accuracy: {current_accuracy:.2f}%")

    # Save the model if it has the best accuracy so far
    if current_accuracy > best_accuracy:
        best_accuracy = current_accuracy
        torch.save(vit_model.state_dict(), MODEL_SAVE_PATH)
        print(f" New best model saved with accuracy: {best_accuracy:.2f}%")

    # Update the learning rate (once per epoch, after the optimizer steps)
    lr_scheduler.step()

print("\n--- Training Finished ---")

Epoch 1/15 [Training]: 100%|██████████| 196/196 [01:34<00:00,  2.08it/s, loss=1.57]


Epoch 1 Summary: Avg Train Loss: 1.8278 | Test Accuracy: 45.16%
 New best model saved with accuracy: 45.16%


Epoch 2/15 [Training]: 100%|██████████| 196/196 [01:35<00:00,  2.04it/s, loss=1.48]


Epoch 2 Summary: Avg Train Loss: 1.4867 | Test Accuracy: 52.22%
 New best model saved with accuracy: 52.22%


Epoch 3/15 [Training]: 100%|██████████| 196/196 [01:36<00:00,  2.03it/s, loss=1.3]


Epoch 3 Summary: Avg Train Loss: 1.3576 | Test Accuracy: 54.64%
 New best model saved with accuracy: 54.64%


Epoch 4/15 [Training]: 100%|██████████| 196/196 [01:36<00:00,  2.04it/s, loss=1.18]


Epoch 4 Summary: Avg Train Loss: 1.2655 | Test Accuracy: 59.55%
 New best model saved with accuracy: 59.55%


Epoch 5/15 [Training]: 100%|██████████| 196/196 [01:36<00:00,  2.03it/s, loss=1.37]


Epoch 5 Summary: Avg Train Loss: 1.2081 | Test Accuracy: 60.51%
 New best model saved with accuracy: 60.51%


Epoch 6/15 [Training]: 100%|██████████| 196/196 [01:36<00:00,  2.03it/s, loss=1.31]


Epoch 6 Summary: Avg Train Loss: 1.1587 | Test Accuracy: 61.35%
 New best model saved with accuracy: 61.35%


Epoch 7/15 [Training]: 100%|██████████| 196/196 [01:36<00:00,  2.02it/s, loss=0.886]


Epoch 7 Summary: Avg Train Loss: 1.0958 | Test Accuracy: 63.42%
 New best model saved with accuracy: 63.42%


Epoch 8/15 [Training]: 100%|██████████| 196/196 [01:36<00:00,  2.03it/s, loss=1.12]


Epoch 8 Summary: Avg Train Loss: 1.0497 | Test Accuracy: 64.69%
 New best model saved with accuracy: 64.69%


Epoch 9/15 [Training]: 100%|██████████| 196/196 [01:36<00:00,  2.03it/s, loss=0.941]


Epoch 9 Summary: Avg Train Loss: 1.0075 | Test Accuracy: 66.53%
 New best model saved with accuracy: 66.53%


Epoch 10/15 [Training]: 100%|██████████| 196/196 [01:36<00:00,  2.04it/s, loss=1.09]


Epoch 10 Summary: Avg Train Loss: 0.9696 | Test Accuracy: 68.13%
 New best model saved with accuracy: 68.13%


Epoch 11/15 [Training]: 100%|██████████| 196/196 [01:36<00:00,  2.03it/s, loss=0.91]


Epoch 11 Summary: Avg Train Loss: 0.9333 | Test Accuracy: 68.85%
 New best model saved with accuracy: 68.85%


Epoch 12/15 [Training]: 100%|██████████| 196/196 [01:36<00:00,  2.04it/s, loss=0.962]


Epoch 12 Summary: Avg Train Loss: 0.8992 | Test Accuracy: 68.97%
 New best model saved with accuracy: 68.97%


Epoch 13/15 [Training]: 100%|██████████| 196/196 [01:36<00:00,  2.03it/s, loss=0.827]


Epoch 13 Summary: Avg Train Loss: 0.8746 | Test Accuracy: 70.07%
 New best model saved with accuracy: 70.07%


Epoch 14/15 [Training]: 100%|██████████| 196/196 [01:36<00:00,  2.04it/s, loss=0.785]


Epoch 14 Summary: Avg Train Loss: 0.8601 | Test Accuracy: 70.25%
 New best model saved with accuracy: 70.25%


Epoch 15/15 [Training]: 100%|██████████| 196/196 [01:36<00:00,  2.03it/s, loss=0.857]


Epoch 15 Summary: Avg Train Loss: 0.8518 | Test Accuracy: 70.68%
 New best model saved with accuracy: 70.68%

--- Training Finished ---


In [13]:
# Cell 10: Load Best Model and Report Final Accuracy

# Path and dropout settings are restated here so this cell can be run without
# re-running the training cell; they must match the values used for training.
MODEL_SAVE_PATH = "/content/drive/MyDrive/best_vit_cifar10.pth"

# Dropout configuration needed to rebuild the architecture
dropout_config = {'attn': ATTN_DROPOUT_RATE, 'mlp': MLP_DROPOUT_RATE}

# Build a fresh model instance with the same architecture as the trained one
final_model = MyVisionTransformer(
    image_size=IMG_DIM, patch_size=PATCH_DIM, in_channels=INPUT_CHANNELS, num_classes=NUM_CLASSES_OUTPUT,
    embedding_dim=EMBEDDING_DIM, depth=TRANSFORMER_DEPTH, num_heads=NUM_ATTENTION_HEADS,
    mlp_dim=MLP_HIDDEN_DIM, dropout_rates=dropout_config
).to(COMPUTE_DEVICE)

# Load the saved weights of the best model. map_location remaps the stored
# tensors onto the current device, so a checkpoint written on a GPU runtime
# still loads when only a CPU is available.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
best_state = torch.load(MODEL_SAVE_PATH, map_location=COMPUTE_DEVICE)
final_model.load_state_dict(best_state)
final_model.eval()

# --- Final Evaluation ---
final_correct = 0
final_total = 0
with torch.no_grad():
    for images, labels in tqdm(test_dataloader, desc="Final Evaluation"):
        images, labels = images.to(COMPUTE_DEVICE), labels.to(COMPUTE_DEVICE)
        outputs = final_model(images)
        # Under no_grad there is no need for the legacy `.data` accessor.
        predicted = outputs.argmax(dim=1)
        final_total += labels.size(0)
        final_correct += (predicted == labels).sum().item()

final_test_accuracy = 100 * final_correct / final_total
print("\n--- Final Results ---")
print(f"Best model loaded from '{MODEL_SAVE_PATH}'")
print(f"Final Overall Test Accuracy on the best model: {final_test_accuracy:.2f}%")

Final Evaluation: 100%|██████████| 40/40 [00:06<00:00,  5.74it/s]


--- Final Results ---
Best model loaded from '/content/drive/MyDrive/best_vit_cifar10.pth'
Final Overall Test Accuracy on the best model: 70.68%



