First, we install and import the required libraries.

In [1]:
!pip install torchvision


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchvision)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.6.0->torchvision)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.6.0->torchvision)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt


Next we perform the preprocessing for the dataset

In [3]:
#Settings shared by the train and test pipelines, named once so they cannot drift apart
DATA_ROOT = "./data"  #Directory the CIFAR10 files are downloaded to / read from
BATCH_SIZE = 64       #Number of (image, label) pairs per mini-batch

#Precalculated values for mean and standard deviation of the R,G,B channels in the CIFAR10 dataset
mean = (0.4914, 0.4822, 0.4465)
std  = (0.2470, 0.2435, 0.2616)

#transforms.Compose chains the operations below and applies them, in order, to each image from the dataset
baseline_transform = transforms.Compose([
    #ToTensor converts the image into a tensor of torch.Size([3, 32, 32]) ([Channel, Height, Width])
    #and scales the pixel values from 0-255 down to 0.0-1.0, which is the range Normalize expects here
    transforms.ToTensor(),
    #Normalize does channel-wise normalization: output[channel] = (input[channel] - mean[channel]) / std[channel]
    #Centering each channel around 0 makes training faster and easier
    transforms.Normalize(mean, std)
])

#Training and testing dataset objects; baseline_transform is applied to every image that is read from them
train_dataset = datasets.CIFAR10(root=DATA_ROOT, train=True, download=True, transform=baseline_transform)
test_dataset  = datasets.CIFAR10(root=DATA_ROOT, train=False, download=True, transform=baseline_transform)

#Loaders serve the datasets in mini-batches of BATCH_SIZE images and labels
#Only the training data is shuffled; the test data keeps a fixed order
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(test_dataset,  batch_size=BATCH_SIZE, shuffle=False)


100%|██████████| 170M/170M [00:04<00:00, 34.9MB/s] 


Next we have the baseline CNN model

In [4]:
class BaselineCNN(nn.Module):
    """Basic CNN classifier: two conv + ReLU + max-pool stages followed by one linear layer.

    The linear layer is sized for 64 feature maps of 8x8, so the network expects
    inputs of shape (B, 3, 32, 32) and returns raw class scores (logits) of shape
    (B, num_classes).

    Args:
        num_classes: Number of output classes. Defaults to 10.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        #First convolution layer
        #Input 3 channels(RGB) and 32 output feature maps(32 different filters)
        #kernel_size=3 with padding=1 keeps the spatial size unchanged
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)

        #Second convolution layer
        #Input 32 channels from 1st conv layer and 64 output feature maps(64 different filters)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)

        #Final fully connected layer(after flattening) - one output score per class
        self.fc = nn.Linear(64 * 8 * 8, num_classes)

    def forward(self, x):
        """Compute class logits of shape (B, num_classes) for a batch x of shape (B, 3, 32, 32)."""
        x = F.relu(self.conv1(x)) #First conv layer followed by the ReLU activation ((B,32,32,32))
        x = F.max_pool2d(x, 2)    #Halves height and width: 32*32 to 16*16 ((B,32,16,16))

        x = F.relu(self.conv2(x)) #Second conv layer followed by the ReLU activation ((B,64,16,16))
        x = F.max_pool2d(x, 2)    #Halves height and width: 16*16 to 8*8 ((B,64,8,8))

        #Flattens (B,64,8,8) into (B,64*8*8); unlike .view, torch.flatten also accepts
        #non-contiguous tensors (e.g. channels_last inputs), copying only when it has to
        x = torch.flatten(x, 1)
        return self.fc(x) #Fully connected layer maps the flattened features to class scores


Next, we define the training and evaluation functions

In [6]:
def train(model, loader, criterion, optimizer, device):
    """Train the model for one pass over loader and report its metrics.

    Args:
        model: Network to train; it is put in training mode and left there.
        loader: Iterable of (img, lbl) batches. It does not need to support len().
        criterion: Loss function called as criterion(out, lbl).
        optimizer: Optimizer that updates the model's weights after each batch.
        device: Device the batches are moved to before the forward pass.

    Returns:
        (average loss per batch, accuracy) where accuracy is the fraction of
        samples whose highest-scoring class equals the label.

    Raises:
        ValueError: If loader yields no samples, so the averages are undefined.
    """
    #Puts in training mode
    model.train()

    #Running totals for the epoch
    loss_sum = 0.0
    num_batches = 0
    num_correct = 0
    num_samples = 0

    #Loops over the batches - img is a batch of images and lbl holds the class index of each image
    for img, lbl in loader:
        img, lbl = img.to(device), lbl.to(device)

        optimizer.zero_grad()      #Resets grads to 0 so they do not accumulate across batches
        out = model(img)           #Forward pass - one score per class for every image
        loss = criterion(out, lbl) #Calculates loss
        loss.backward()            #Backprop
        optimizer.step()           #Updates model weights

        loss_sum += loss.item()
        num_batches += 1           #Counted here instead of len(loader) so loaders without a length also work
        pred = out.argmax(dim=1)   #Index of the highest score - (ie the prediction of the model)
        num_samples += lbl.size(0) #Number of samples in the batch
        num_correct += (pred == lbl).sum().item() #Counts number of times predictions and labels match

    if num_batches == 0 or num_samples == 0:
        raise ValueError("train() received an empty loader: no samples to average over")

    return loss_sum / num_batches, num_correct / num_samples #Average loss per batch and accuracy

def evaluate(model, loader, criterion, device):
    """Measure loss and accuracy of the model on loader without changing its weights.

    The model is put in eval mode (and left there) and gradient tracking is
    switched off, so there is no backprop and no weight update.

    Args:
        model: Network to evaluate.
        loader: Iterable of (img, lbl) batches. It does not need to support len().
        criterion: Loss function called as criterion(out, lbl).
        device: Device the batches are moved to before the forward pass.

    Returns:
        (average loss per batch, accuracy) where accuracy is the fraction of
        samples whose highest-scoring class equals the label.

    Raises:
        ValueError: If loader yields no samples, so the averages are undefined.
    """
    model.eval()

    #Running totals for the whole pass
    loss_sum = 0.0
    num_batches = 0
    num_correct = 0
    num_samples = 0

    with torch.no_grad():
        for img, lbl in loader:
            img, lbl = img.to(device), lbl.to(device)
            out = model(img)

            loss_sum += criterion(out, lbl).item()
            num_batches += 1           #Counted here instead of len(loader) so loaders without a length also work
            pred = out.argmax(dim=1)   #Index of the highest score - (ie the prediction of the model)
            num_samples += lbl.size(0)
            num_correct += (pred == lbl).sum().item()

    if num_batches == 0 or num_samples == 0:
        raise ValueError("evaluate() received an empty loader: no samples to average over")

    return loss_sum / num_batches, num_correct / num_samples


Next we train the model

In [7]:
#Seeds torch's random number generators before the model is created, so the initial weights
#and the order in which train_loader shuffles the data repeat from run to run
#(some CUDA kernels are non-deterministic, so GPU results may still differ slightly)
SEED = 42
torch.manual_seed(SEED)

#Defines device as cpu or gpu
device = "cuda" if torch.cuda.is_available() else "cpu"
model = BaselineCNN().to(device)

#Uses the CrossEntropy function for calculating loss
criterion = nn.CrossEntropyLoss()

#Optimizer is stochastic gradient descent with momentum
LEARNING_RATE = 0.01
MOMENTUM = 0.9
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

#Number of full passes over the training set
EPOCHS = 10

#Accuracy after every epoch, kept so the runs can be compared or plotted later
baseline_history = {"train_acc": [], "test_acc": []}

for epoch in range(EPOCHS):
    #One pass of training, then a measurement on the held-out test set
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
    test_loss, test_acc = evaluate(model, test_loader, criterion, device)

    baseline_history["train_acc"].append(train_acc)
    baseline_history["test_acc"].append(test_acc)

    print(f"Epoch {epoch+1}/{EPOCHS} | Train Acc: {train_acc:.4f} | Test Acc: {test_acc:.4f}")


Epoch 1/10 | Train Acc: 0.5248 | Test Acc: 0.6131
Epoch 2/10 | Train Acc: 0.6604 | Test Acc: 0.6645
Epoch 3/10 | Train Acc: 0.7087 | Test Acc: 0.6957
Epoch 4/10 | Train Acc: 0.7392 | Test Acc: 0.6937
Epoch 5/10 | Train Acc: 0.7608 | Test Acc: 0.6944
Epoch 6/10 | Train Acc: 0.7772 | Test Acc: 0.7024
Epoch 7/10 | Train Acc: 0.7942 | Test Acc: 0.7152
Epoch 8/10 | Train Acc: 0.8069 | Test Acc: 0.7095
Epoch 9/10 | Train Acc: 0.8162 | Test Acc: 0.7121
Epoch 10/10 | Train Acc: 0.8263 | Test Acc: 0.7036
