First, we install and import the required packages.

In [1]:
!pip install torchvision


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchvision)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.6.0->torchvision)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.6.0->torchvision)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt


Next we perform the preprocessing for the dataset

In [3]:
# Per-channel (R, G, B) mean and standard deviation of the CIFAR10 images, computed ahead of time
mean = (0.4914, 0.4822, 0.4465)
std = (0.2470, 0.2435, 0.2616)

# Pipeline applied to every image as it is read from the dataset:
#   1. ToTensor  - turns the image into a tensor of shape [3, 32, 32] ([Channel, Height, Width])
#                  and rescales pixel values from 0-255 to 0.0-1.0
#   2. Normalize - per channel: output[channel] = (input[channel] - mean[channel]) / std[channel],
#                  which centres each channel around 0 and makes training faster and easier
baseline_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std),
])

# Training split first, then the test split; both use baseline_transform
# and are downloaded into ./data when not already present
train_dataset, test_dataset = (
    datasets.CIFAR10(
        root="./data",
        train=is_train_split,
        download=True,
        transform=baseline_transform,
    )
    for is_train_split in (True, False)
)

# Loaders hand out mini-batches of 64 images together with their 64 labels.
# Only the training data is reshuffled; the test data keeps a fixed order.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)


100%|██████████| 170M/170M [00:01<00:00, 89.2MB/s] 


Next we have the baseline CNN model

In [4]:
class BaselineCNN(nn.Module):
    """Basic two-convolution CNN used as the baseline classifier.

    Layout: conv(3x3) -> ReLU -> 2x2 max-pool, twice, followed by a single
    fully connected layer. The fully connected layer is sized for 64 feature
    maps of 8x8, i.e. it expects 32x32 input images (two 2x2 poolings).

    Args:
        in_channels: number of channels of the input images (3 for RGB).
        num_classes: number of output scores produced per image.
    """

    def __init__(self, in_channels=3, num_classes=10):
        super().__init__()

        #First convolution layer
        #in_channels input channels (RGB by default) and 32 output feature maps (32 different filters)
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=3, padding=1)

        #Second convolution layer
        #Input 32 channels from 1st conv layer and 64 output feature maps (64 different filters)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)

        #Final fully connected layer (after flattening the 64 maps of 8x8)
        self.fc = nn.Linear(64 * 8 * 8, num_classes)

    def forward(self, x):
        """Map a batch of images (B, in_channels, 32, 32) to scores (B, num_classes)."""
        x = F.relu(self.conv1(x))  #First conv layer + ReLU activation -> (B,32,32,32)
        x = F.max_pool2d(x, 2)     #Reduces 32*32 to 16*16 -> (B,32,16,16)

        x = F.relu(self.conv2(x))  #Second conv layer + ReLU activation -> (B,64,16,16)
        x = F.max_pool2d(x, 2)     #Reduces 16*16 to 8*8 -> (B,64,8,8)

        #Flattens (B,64,8,8) into (B,64*8*8). flatten (unlike view) also works when the
        #activations are not stored contiguously, e.g. for channels_last inputs.
        x = x.flatten(1)
        return self.fc(x)          #Fully connected layer on the flattened features


Next we have the training and evaluation part

In [5]:
def train(model, loader, criterion, optimizer, device):
    """Train ``model`` for one pass over ``loader``.

    Args:
        model: network to optimise; it is switched to training mode.
        loader: iterable yielding ``(img, lbl)`` batches. It does not need to
            support ``len()``; batches are counted while iterating.
        criterion: loss function called as ``criterion(out, lbl)``.
        optimizer: optimizer that updates the parameters of ``model``.
        device: device the batches are moved to before the forward pass.

    Returns:
        Tuple ``(average loss per batch, accuracy)``. Both values are 0.0 when
        the loader yields no batches.
    """
    #Puts in training mode
    model.train()

    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0

    #Loops over the batches - img is the image tensor and lbl the category each image belongs to
    for img, lbl in loader:
        img, lbl = img.to(device), lbl.to(device)

        optimizer.zero_grad()       #Resets grads to 0
        out = model(img)            #Feeds images through model - one row of scores per image
        loss = criterion(out, lbl)  #Calculates loss
        loss.backward()             #Backprop
        optimizer.step()            #Updates model weights

        total_loss += loss.item()   #Keeps track of total loss
        pred = out.argmax(dim=1)    #Index of the highest score = predicted class
        total += lbl.size(0)        #Number of samples in the batch
        correct += (pred == lbl).sum().item()  #Counts matching predictions and labels
        num_batches += 1

    #Guard the divisions so an empty loader gives zeros instead of ZeroDivisionError
    avg_loss = total_loss / num_batches if num_batches else 0.0
    accuracy = correct / total if total else 0.0
    return avg_loss, accuracy

#Similar to above function but model is in eval mode - no backprop, updating weights and gradient tracking
def evaluate(model, loader, criterion, device):
    """Measure loss and accuracy of ``model`` on ``loader`` without training it.

    The model is switched to eval mode and gradient tracking is disabled, so
    no backprop happens and no weights are updated.

    Args:
        model: network to evaluate.
        loader: iterable yielding ``(img, lbl)`` batches. It does not need to
            support ``len()``; batches are counted while iterating.
        criterion: loss function called as ``criterion(out, lbl)``.
        device: device the batches are moved to before the forward pass.

    Returns:
        Tuple ``(average loss per batch, accuracy)``. Both values are 0.0 when
        the loader yields no batches.
    """
    model.eval()

    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0

    with torch.no_grad():
        for img, lbl in loader:
            img, lbl = img.to(device), lbl.to(device)
            out = model(img)
            loss = criterion(out, lbl)

            total_loss += loss.item()
            pred = out.argmax(dim=1)  #Index of the highest score = predicted class
            total += lbl.size(0)
            correct += (pred == lbl).sum().item()
            num_batches += 1

    #Guard the divisions so an empty loader gives zeros instead of ZeroDivisionError
    avg_loss = total_loss / num_batches if num_batches else 0.0
    accuracy = correct / total if total else 0.0
    return avg_loss, accuracy


Next we train the model

In [6]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Baseline network, moved onto the chosen device
model = BaselineCNN()
model = model.to(device)

# Cross-entropy loss between the model scores and the true labels
criterion = nn.CrossEntropyLoss()
# Stochastic gradient descent with momentum
optimizer = optim.SGD(params=model.parameters(), lr=0.01, momentum=0.9)

EPOCHS = 10

# Per-epoch accuracies, one entry appended to each list after every epoch
baseline_history = dict(train_acc=[], test_acc=[])

for epoch in range(EPOCHS):
    # One optimisation pass over the training data ...
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
    baseline_history["train_acc"].append(train_acc)

    # ... followed by a measurement on the test data
    test_loss, test_acc = evaluate(model, test_loader, criterion, device)
    baseline_history["test_acc"].append(test_acc)

    print(f"Epoch {epoch+1}/{EPOCHS} | Train Acc: {train_acc:.4f} | Test Acc: {test_acc:.4f}")


Epoch 1/10 | Train Acc: 0.5229 | Test Acc: 0.6223
Epoch 2/10 | Train Acc: 0.6601 | Test Acc: 0.6561
Epoch 3/10 | Train Acc: 0.7058 | Test Acc: 0.6947
Epoch 4/10 | Train Acc: 0.7383 | Test Acc: 0.6908
Epoch 5/10 | Train Acc: 0.7579 | Test Acc: 0.6989
Epoch 6/10 | Train Acc: 0.7782 | Test Acc: 0.7082
Epoch 7/10 | Train Acc: 0.7897 | Test Acc: 0.7073
Epoch 8/10 | Train Acc: 0.8029 | Test Acc: 0.7047
Epoch 9/10 | Train Acc: 0.8135 | Test Acc: 0.7236
Epoch 10/10 | Train Acc: 0.8227 | Test Acc: 0.7045


-----------IMPROVED SECTION----------
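Data augmentation - random horizontal flipping and random crop (with padding)
Improved CNN architecture - deeper stack of convolution blocks with batch norm, dropout and global average pooling - trained with the Adam optimizer and a step learning-rate scheduler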

In [11]:
# Training-time pipeline: two random augmentations followed by the same
# tensor conversion and normalization as the baseline transform
augmented_transform = transforms.Compose([
    # Flips random images left-right, so the model learns that features do not
    # depend on orientation (many images in the dataset look the same when flipped)
    transforms.RandomHorizontalFlip(),
    # Pads each image by 4 pixels and then crops a random 32*32 window, which simulates
    # slight translations and helps the model recognize shifted or off-centre images
    transforms.RandomCrop(size=32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std),
])

# Same CIFAR10 training split as before (already on disk, so no download), now with augmentation
train_dataset_aug = datasets.CIFAR10(
    root="./data",
    train=True,
    download=False,
    transform=augmented_transform,
)

# Batch size raised to 128 (the baseline loader uses 64); chosen to go with the Adam optimizer
train_loader_aug = DataLoader(dataset=train_dataset_aug, batch_size=128, shuffle=True)


In [8]:
class ImprovedCNN(nn.Module):
    """Deeper CNN with batch norm, dropout and global average pooling.

    ``features`` stacks five conv(3x3) -> BatchNorm -> ReLU blocks with three
    2x2 max-poolings (32 -> 16 -> 8 -> 4 for 32x32 inputs). The resulting 256
    feature maps are averaged to one value each and classified by a small
    two-layer head with dropout.

    Args:
        in_channels: number of channels of the input images (3 for RGB).
        num_classes: number of output scores produced per image.
        dropout: dropout probability used before each linear layer of the head.
    """

    def __init__(self, in_channels=3, num_classes=10, dropout=0.3):
        super().__init__()

        def conv_block(in_c, out_c):
            # 3x3 convolution (padding keeps height/width) -> batch norm -> ReLU
            return nn.Sequential(
                nn.Conv2d(in_c, out_c, 3, padding=1),
                nn.BatchNorm2d(out_c),
                nn.ReLU()
            )

        self.features = nn.Sequential(
            conv_block(in_channels, 64),
            conv_block(64, 64),
            nn.MaxPool2d(2),      # 32 -> 16

            conv_block(64, 128),
            conv_block(128, 128),
            nn.MaxPool2d(2),      # 16 -> 8

            conv_block(128, 256),
            nn.MaxPool2d(2)       # 8 -> 4
        )

        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        """Map a batch of images (B, in_channels, H, W) to scores (B, num_classes)."""
        x = self.features(x)
        x = F.adaptive_avg_pool2d(x, 1)   # Global average pool -> (B, 256, 1, 1)
        x = x.flatten(1)                  # Flatten -> (B, 256)
        return self.classifier(x)


In [9]:
# Improved network, placed on the same device as the baseline model
model2 = ImprovedCNN()
model2 = model2.to(device)

# Same cross-entropy loss as in the baseline run
criterion = nn.CrossEntropyLoss()
# Adam optimizer - builds upon SGD by adapting the step size of every parameter
# from running averages of its gradient and squared gradient
optimizer = optim.Adam(params=model2.parameters(), lr=1e-3)
# StepLR multiplies the learning rate by gamma (0.5) after every step_size (10) calls to
# scheduler.step(): the initial LR is higher for faster learning, then it gets
# smaller for more precise learning
scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=10, gamma=0.5)


In [10]:
EPOCHS = 25
# Per-epoch accuracies of the improved model, one entry appended to each list after every epoch
improved_history = dict(train_acc=[], test_acc=[])

for epoch in range(EPOCHS):
    # One optimisation pass over the augmented training data ...
    train_loss, train_acc = train(model2, train_loader_aug, criterion, optimizer, device)
    improved_history["train_acc"].append(train_acc)

    # ... followed by a measurement on the (non-augmented) test data
    test_loss, test_acc = evaluate(model2, test_loader, criterion, device)
    improved_history["test_acc"].append(test_acc)

    # Advance the learning-rate schedule once per epoch, after the optimizer updates
    scheduler.step()

    print(f"Epoch {epoch+1}/{EPOCHS} | Train Acc: {train_acc:.4f} | Test Acc: {test_acc:.4f}")

Epoch 1/25 | Train Acc: 0.4607 | Test Acc: 0.5753
Epoch 2/25 | Train Acc: 0.6160 | Test Acc: 0.6477
Epoch 3/25 | Train Acc: 0.6710 | Test Acc: 0.6591
Epoch 4/25 | Train Acc: 0.7085 | Test Acc: 0.6754
Epoch 5/25 | Train Acc: 0.7372 | Test Acc: 0.7285
Epoch 6/25 | Train Acc: 0.7601 | Test Acc: 0.7047
Epoch 7/25 | Train Acc: 0.7779 | Test Acc: 0.7694
Epoch 8/25 | Train Acc: 0.7944 | Test Acc: 0.7611
Epoch 9/25 | Train Acc: 0.8068 | Test Acc: 0.8046
Epoch 10/25 | Train Acc: 0.8155 | Test Acc: 0.8147
Epoch 11/25 | Train Acc: 0.8433 | Test Acc: 0.8369
Epoch 12/25 | Train Acc: 0.8507 | Test Acc: 0.8338
Epoch 13/25 | Train Acc: 0.8558 | Test Acc: 0.8446
Epoch 14/25 | Train Acc: 0.8595 | Test Acc: 0.8502
Epoch 15/25 | Train Acc: 0.8635 | Test Acc: 0.8467
Epoch 16/25 | Train Acc: 0.8706 | Test Acc: 0.8364
Epoch 17/25 | Train Acc: 0.8716 | Test Acc: 0.8576
Epoch 18/25 | Train Acc: 0.8776 | Test Acc: 0.8527
Epoch 19/25 | Train Acc: 0.8787 | Test Acc: 0.8481
Epoch 20/25 | Train Acc: 0.8791 | Test A