# 1. Setup

Pytorch needs pythont 3.10

## 1.1 Imports

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader, Dataset

import matplotlib.pyplot as plt

import numpy as np

import pandas as pd

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)

cuda


## 1.2 Load Dataset

In [3]:
train_transforms = transforms.Compose([
    transforms.RandomAffine(degrees=10, translate=(0.1, 0.1), scale=(0.9, 1.1)),
    transforms.RandomCrop(size=(28, 28), padding=4),
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)) # mean and std of MNIST dataset
])

test_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)) # mean and std of MNIST dataset
])

train_dataset = datasets.MNIST(root='./../data', train=True, transform=train_transforms, download=True)
test_dataset = datasets.MNIST(root='./../data', train=False, transform=test_transforms, download=True)

batch_size = 100
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

print(f'Number of training batches: {len(train_loader)}')
print(f'Number of testing batches: {len(test_loader)}')

Number of training batches: 600
Number of testing batches: 100


## 1.3 Print Model Summary Function

In [4]:
def print_model_summary(model):
    def layer_summary(layer):
        output_shape = None
        if hasattr(layer, 'out_channels'):
            output_shape = (layer.out_channels, "H_out", "W_out")
        elif hasattr(layer, 'out_features'):
            output_shape = (layer.out_features)
        elif isinstance(layer, torch.nn.modules.pooling._MaxPoolNd):
            output_shape = (layer.kernel_size, "H_out", "W_out")

        num_params = sum(p.numel() for p in layer.parameters() if p.requires_grad)
        return output_shape, num_params

    model_name = model.__class__.__name__
    print(f"'{model_name}' Model Summary:")

    print("="*75)
    print(f"{'Layer':<30} {'Output Shape':<30} {'Param #':<15}")
    print("="*75)
    
    for name, layer in model.named_children():
        output_shape, num_params = layer_summary(layer)
        print(f"{name:<30} {str(output_shape):<30} {num_params:<15}")
    
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print("="*75)
    print(f"Total params:          {total_params}")
    print(f"Trainable params:      {trainable_params}")
    print(f"Non-trainable params:  {total_params - trainable_params}")

## 1.4 Train and Test Functions 

In [5]:
def train(model, criterion, optimizer, epochs, scheduler=None):
    model.train()
    for epoch in range(epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if (i+1) % 100 == 0:
                print(f'Epoch {epoch+1}/{epochs}, Step {i+1}/{len(train_loader)}, Loss: {loss.item()}')

        if scheduler is not None:
            scheduler.step()

def test(model):
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        print(f'Accuracy: {correct / total * 100}%')

# 2. Singlelayer Softmax Classifier

from https://gist.github.com/dinhnguyenduc1994/b5881bf922054afb311b0c9a053c0357

In [6]:
class SoftMax(nn.Module):
    def __init__(self):
        super(SoftMax, self).__init__()
        self.linear = nn.Linear(28 * 28, 10)
        
    def forward(self, x):
        x = x.view(-1, 28 * 28)
        z = self.linear(x)
        return z
    
sl_model = SoftMax().to(device)
sl_optimizer = torch.optim.SGD(sl_model.parameters(), lr=0.1)
sl_criterion = nn.CrossEntropyLoss()

print_model_summary(sl_model)

'SoftMax' Model Summary:
Layer                          Output Shape                   Param #        
linear                         10                             7850           
Total params:          7850
Trainable params:      7850
Non-trainable params:  0


In [7]:
train(sl_model, sl_criterion, sl_optimizer, epochs=20)
test(sl_model)

Epoch 1/20, Step 100/600, Loss: 1.9170368909835815
Epoch 1/20, Step 200/600, Loss: 1.9333980083465576
Epoch 1/20, Step 300/600, Loss: 1.9247078895568848
Epoch 1/20, Step 400/600, Loss: 1.7628065347671509
Epoch 1/20, Step 500/600, Loss: 2.2719123363494873
Epoch 1/20, Step 600/600, Loss: 2.4166715145111084
Epoch 2/20, Step 100/600, Loss: 2.2451603412628174
Epoch 2/20, Step 200/600, Loss: 1.6888362169265747
Epoch 2/20, Step 300/600, Loss: 1.9403436183929443
Epoch 2/20, Step 400/600, Loss: 2.018605947494507
Epoch 2/20, Step 500/600, Loss: 1.9162933826446533
Epoch 2/20, Step 600/600, Loss: 1.8245468139648438
Epoch 3/20, Step 100/600, Loss: 1.8393642902374268
Epoch 3/20, Step 200/600, Loss: 1.7934350967407227
Epoch 3/20, Step 300/600, Loss: 1.7880862951278687
Epoch 3/20, Step 400/600, Loss: 1.7706981897354126
Epoch 3/20, Step 500/600, Loss: 2.1601662635803223
Epoch 3/20, Step 600/600, Loss: 2.026421546936035
Epoch 4/20, Step 100/600, Loss: 2.0866973400115967
Epoch 4/20, Step 200/600, Loss: 1

# 3. Simple CNN

In [8]:
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()

        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(32 * 14 * 14, 64)
        self.fc2 = nn.Linear(64, 10)

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))

        x = x.view(-1, 32 * 14 * 14) # Flatten the tensor
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x
    
simple_model = SimpleCNN().to(device)
simple_criterion = nn.CrossEntropyLoss()
simple_optimizer = optim.Adam(simple_model.parameters(), lr=0.001)

print_model_summary(simple_model)

'SimpleCNN' Model Summary:
Layer                          Output Shape                   Param #        
conv1                          (32, 'H_out', 'W_out')         320            
pool                           (2, 'H_out', 'W_out')          0              
fc1                            64                             401472         
fc2                            10                             650            
Total params:          402442
Trainable params:      402442
Non-trainable params:  0


In [9]:
train(simple_model, simple_criterion, simple_optimizer, epochs=20)
test(simple_model)

Epoch 1/20, Step 100/600, Loss: 1.3722882270812988
Epoch 1/20, Step 200/600, Loss: 0.7854834794998169
Epoch 1/20, Step 300/600, Loss: 0.6508156061172485
Epoch 1/20, Step 400/600, Loss: 0.5874542593955994
Epoch 1/20, Step 500/600, Loss: 0.5449044704437256
Epoch 1/20, Step 600/600, Loss: 0.446777880191803
Epoch 2/20, Step 100/600, Loss: 0.3448336124420166
Epoch 2/20, Step 200/600, Loss: 0.3206726014614105
Epoch 2/20, Step 300/600, Loss: 0.31378573179244995
Epoch 2/20, Step 400/600, Loss: 0.3655962347984314
Epoch 2/20, Step 500/600, Loss: 0.46703147888183594
Epoch 2/20, Step 600/600, Loss: 0.6107413172721863
Epoch 3/20, Step 100/600, Loss: 0.3311033248901367
Epoch 3/20, Step 200/600, Loss: 0.287320077419281
Epoch 3/20, Step 300/600, Loss: 0.4268115162849426
Epoch 3/20, Step 400/600, Loss: 0.26430168747901917
Epoch 3/20, Step 500/600, Loss: 0.425217866897583
Epoch 3/20, Step 600/600, Loss: 0.3057456314563751
Epoch 4/20, Step 100/600, Loss: 0.29623669385910034
Epoch 4/20, Step 200/600, Loss

# 4. Accuracy Optimized CNN

In [10]:
class AccuracyOptimCNN(nn.Module):
    def __init__(self):
        super(AccuracyOptimCNN, self).__init__()

        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.conv4 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(32)
        self.bn4 = nn.BatchNorm2d(64)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        
        self.dropout1 = nn.Dropout(0.25)
        self.fc1 = nn.Linear(64 * 7 * 7, 64)
        self.dropout2 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(64, 10)

    def forward(self, x):
        x = torch.relu(self.bn1(self.conv1(x)))
        x = torch.relu(self.bn2(self.conv2(x)))
        x = self.pool(x)
        x = torch.relu(self.bn3(self.conv3(x)))
        x = torch.relu(self.bn4(self.conv4(x)))
        x = self.pool(x)

        x = x.view(-1, 64 * 7 * 7) # Flatten the tensor
        x = self.dropout1(x)
        x = torch.relu(self.fc1(x))
        x = self.dropout2(x)
        x = self.fc2(x)
        return x
    
ao_model = AccuracyOptimCNN().to(device)
ao_criterion = nn.CrossEntropyLoss()
ao_optimizer = optim.Adam(ao_model.parameters(), lr=0.001)
ao_scheduler = optim.lr_scheduler.StepLR(ao_optimizer, step_size=5, gamma=0.5)

print_model_summary(ao_model)

'AccuracyOptimCNN' Model Summary:
Layer                          Output Shape                   Param #        
conv1                          (32, 'H_out', 'W_out')         320            
conv2                          (64, 'H_out', 'W_out')         18496          
conv3                          (32, 'H_out', 'W_out')         18464          
conv4                          (64, 'H_out', 'W_out')         18496          
bn1                            None                           64             
bn2                            None                           128            
bn3                            None                           64             
bn4                            None                           128            
pool                           (2, 'H_out', 'W_out')          0              
dropout1                       None                           0              
fc1                            64                             200768         
dropout2                      

In [11]:
train(ao_model, ao_criterion, ao_optimizer, epochs=20, scheduler=ao_scheduler)
test(ao_model)

Epoch 1/20, Step 100/600, Loss: 1.7012524604797363
Epoch 1/20, Step 200/600, Loss: 1.1781768798828125
Epoch 1/20, Step 300/600, Loss: 0.7821168303489685
Epoch 1/20, Step 400/600, Loss: 0.9377439618110657
Epoch 1/20, Step 500/600, Loss: 0.9829246997833252
Epoch 1/20, Step 600/600, Loss: 0.9114763736724854
Epoch 2/20, Step 100/600, Loss: 0.7381716966629028
Epoch 2/20, Step 200/600, Loss: 0.8888132572174072
Epoch 2/20, Step 300/600, Loss: 0.7536575794219971
Epoch 2/20, Step 400/600, Loss: 0.6089790463447571
Epoch 2/20, Step 500/600, Loss: 0.9550453424453735
Epoch 2/20, Step 600/600, Loss: 0.4485337734222412
Epoch 3/20, Step 100/600, Loss: 0.4952573776245117
Epoch 3/20, Step 200/600, Loss: 0.688212513923645
Epoch 3/20, Step 300/600, Loss: 0.6882531642913818
Epoch 3/20, Step 400/600, Loss: 0.7205285429954529
Epoch 3/20, Step 500/600, Loss: 0.5526673197746277
Epoch 3/20, Step 600/600, Loss: 0.665091872215271
Epoch 4/20, Step 100/600, Loss: 0.7612094879150391
Epoch 4/20, Step 200/600, Loss: 0

# 5. Parameters Optimized CNN

In [12]:
class LeastParamsCNN(nn.Module):
    def __init__(self):
        super(LeastParamsCNN, self).__init__()

        self.conv1 = nn.Conv2d(in_channels=1, out_channels=8, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=8, out_channels=8, kernel_size=3, stride=1, padding=1, groups=8)
        self.conv2_pointwise = nn.Conv2d(in_channels=8, out_channels=16, kernel_size=1) 
        self.conv3 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, stride=1, padding=1, groups=16)
        self.conv3_pointwise = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=1)
        self.bn1 = nn.BatchNorm2d(8)
        self.bn2 = nn.BatchNorm2d(16)
        self.bn3 = nn.BatchNorm2d(32)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        
        self.fc1 = nn.Linear(32 * 3 * 3, 16)
        self.fc2 = nn.Linear(16, 10)

    def forward(self, x):
        x = torch.relu(self.bn1(self.conv1(x)))
        x = self.pool(x)
        x = torch.relu(self.bn2(self.conv2_pointwise(self.conv2(x))))
        x = self.pool(x)
        x = torch.relu(self.bn3(self.conv3_pointwise(self.conv3(x))))
        x = self.pool(x)

        x = x.view(-1, 32 * 3 * 3) # Flatten the tensor
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x
    
lp_model = LeastParamsCNN().to(device)
lp_criterion = nn.CrossEntropyLoss()
lp_optimizer = optim.Adam(lp_model.parameters(), lr=0.001)
lp_scheduler = optim.lr_scheduler.StepLR(lp_optimizer, step_size=5, gamma=0.5)

print_model_summary(lp_model)

'LeastParamsCNN' Model Summary:
Layer                          Output Shape                   Param #        
conv1                          (8, 'H_out', 'W_out')          80             
conv2                          (8, 'H_out', 'W_out')          80             
conv2_pointwise                (16, 'H_out', 'W_out')         144            
conv3                          (16, 'H_out', 'W_out')         160            
conv3_pointwise                (32, 'H_out', 'W_out')         544            
bn1                            None                           16             
bn2                            None                           32             
bn3                            None                           64             
pool                           (2, 'H_out', 'W_out')          0              
fc1                            16                             4624           
fc2                            10                             170            
Total params:          5914
Trai

In [13]:
train(lp_model, lp_criterion, lp_optimizer, epochs=20, scheduler=lp_scheduler)
test(lp_model)

Epoch 1/20, Step 100/600, Loss: 1.3906863927841187
Epoch 1/20, Step 200/600, Loss: 0.8284105658531189
Epoch 1/20, Step 300/600, Loss: 0.8900453448295593
Epoch 1/20, Step 400/600, Loss: 0.5031629800796509
Epoch 1/20, Step 500/600, Loss: 0.4270351529121399
Epoch 1/20, Step 600/600, Loss: 0.4711875021457672
Epoch 2/20, Step 100/600, Loss: 0.3393630087375641
Epoch 2/20, Step 200/600, Loss: 0.26549991965293884
Epoch 2/20, Step 300/600, Loss: 0.2750595808029175
Epoch 2/20, Step 400/600, Loss: 0.33409905433654785
Epoch 2/20, Step 500/600, Loss: 0.32548826932907104
Epoch 2/20, Step 600/600, Loss: 0.24065403640270233
Epoch 3/20, Step 100/600, Loss: 0.23973405361175537
Epoch 3/20, Step 200/600, Loss: 0.20622731745243073
Epoch 3/20, Step 300/600, Loss: 0.20666471123695374
Epoch 3/20, Step 400/600, Loss: 0.26085391640663147
Epoch 3/20, Step 500/600, Loss: 0.3836088180541992
Epoch 3/20, Step 600/600, Loss: 0.5467283129692078
Epoch 4/20, Step 100/600, Loss: 0.24843847751617432
Epoch 4/20, Step 200/6