**Case 3:**

Target:

 * Reduce overfitting - Augmentation

 * Learning rate optimization

 * Increase accuracy 


Results:

~7k Parameters (Changed the network)

Best Train Accuracy: 99.14

Best Test Accuracy: 99.5

Observations:

1. Introduced transformations such as ShiftScaleRotate, RandomCrop, and RandomBrightnessContrast from the albumentations library to further reduce overfitting

2. Used an LR schedule to vary the learning rate over a search range of 0.01 to 0.1


**Case 2:**

Target:

 * Lighter model

 * Reduce overfitting

 * Increase Model efficiency with Batch Normalization

 * Use GAP


Results:

~4k Parameters

Best Train Accuracy: 98.98

Best Test Accuracy: 98.9

Observations:

1. Model's parameters are brought down
2. Overfitting has reduced though not completely
3. Accuracy is still around 98



**Case 1:**

Target:

 * Basic Network. 

Results:

~16k Parameters

Best Train Accuracy: 99.26

Best Test Accuracy: 98.92

Observations:

1. Model has a moderate number of parameters (~16k)
2. Overfitting can be seen happening after epoch 8

In [None]:

from __future__ import print_function
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms

import itertools
import numpy as np
import matplotlib.pyplot as plt

In [None]:
class Block(nn.Module):
    """Two 3x3 convolutions, each followed by ReLU and batch norm, plus an optional 2x2 max-pool.

    Args:
        input_size: Number of input channels of the first convolution.
        output_size: Number of output channels of both convolutions.
        padding: Padding passed to both convolutions.
        usepool: When true, a ``MaxPool2d(2, 2)`` is applied at the end.
    """

    def __init__(self, input_size, output_size, padding=1, usepool=True):
        super(Block, self).__init__()
        self.usepool = usepool
        self.conv1 = nn.Conv2d(input_size, output_size, 3, padding=padding)
        self.bn1 = nn.BatchNorm2d(output_size)
        self.conv2 = nn.Conv2d(output_size, output_size, 3, padding=padding)
        self.bn2 = nn.BatchNorm2d(output_size)
        if usepool:
            self.pool = nn.MaxPool2d(2, 2)

    # Implemented as ``forward`` rather than ``__call__``: nn.Module.__call__
    # is what dispatches to forward and runs registered hooks, so overriding
    # it directly silently disables forward hooks on this module.
    def forward(self, x):
        """Run conv -> ReLU -> BN twice, then max-pool if ``usepool`` is set."""
        # Note the ordering: the activation is applied before batch norm.
        x = self.conv1(x)
        x = F.relu(x)
        x = self.bn1(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = self.bn2(x)
        if self.usepool:
            x = self.pool(x)
        return x

class Net(nn.Module):
    """Small CNN: three ``Block`` stages, a 1x1 conv to 10 channels, then global average pooling.

    Dropout (p = ``self.drop``) follows the first two stages and can be
    bypassed per call through the ``dropout`` argument of ``forward``.
    """

    def __init__(self):
        super().__init__()

        # Dropout probability shared by both dropout layers.
        self.drop = 0.01

        # Feature extractor: the first two stages pool, the third does not.
        self.block1 = Block(1, 7)
        self.dropout1 = nn.Dropout(self.drop)
        self.block2 = Block(7, 14)
        self.dropout2 = nn.Dropout(self.drop)
        self.block3 = Block(14, 14, usepool=False)

        # Classifier head: 1x1 conv down to 10 channels, averaged spatially.
        self.flat = nn.Conv2d(14, 10, 1)
        self.gap = nn.AdaptiveAvgPool2d(1)

    def forward(self, x, dropout=True):
        """Return log-softmax scores of shape (-1, 10).

        Args:
            x: Input batch fed to the first block (single channel).
            dropout: When false, both dropout layers are skipped entirely.
        """
        features = self.block1(x)
        features = self.dropout1(features) if dropout else features
        features = self.block2(features)
        features = self.dropout2(features) if dropout else features
        features = self.block3(features)

        # Project to 10 channels, pool each map to 1x1, flatten to (N, 10).
        scores = self.gap(self.flat(features)).view(-1, 10)
        return F.log_softmax(scores, dim=1)

In [None]:
#!pip install torchsummary
from torchsummary import summary
# Pick the GPU when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
# Throwaway model instance used only to print the layer/parameter summary
# for a single-channel 28x28 input.
model = Net().to(device)
summary(model, input_size=(1, 28, 28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1            [-1, 7, 28, 28]              70
       BatchNorm2d-2            [-1, 7, 28, 28]              14
            Conv2d-3            [-1, 7, 28, 28]             448
       BatchNorm2d-4            [-1, 7, 28, 28]              14
         MaxPool2d-5            [-1, 7, 14, 14]               0
           Dropout-6            [-1, 7, 14, 14]               0
            Conv2d-7           [-1, 14, 14, 14]             896
       BatchNorm2d-8           [-1, 14, 14, 14]              28
            Conv2d-9           [-1, 14, 14, 14]           1,778
      BatchNorm2d-10           [-1, 14, 14, 14]              28
        MaxPool2d-11             [-1, 14, 7, 7]               0
          Dropout-12             [-1, 14, 7, 7]               0
           Conv2d-13             [-1, 14, 7, 7]           1,778
      BatchNorm2d-14             [-1, 1

In [None]:
!pip install -U albumentations

Collecting albumentations
[?25l  Downloading https://files.pythonhosted.org/packages/b0/be/3db3cd8af771988748f69eace42047d5edebf01eaa7e1293f3b3f75f989e/albumentations-1.0.0-py3-none-any.whl (98kB)
[K     |███▍                            | 10kB 23.7MB/s eta 0:00:01[K     |██████▊                         | 20kB 29.3MB/s eta 0:00:01[K     |██████████                      | 30kB 19.9MB/s eta 0:00:01[K     |█████████████▍                  | 40kB 16.2MB/s eta 0:00:01[K     |████████████████▊               | 51kB 8.5MB/s eta 0:00:01[K     |████████████████████            | 61kB 8.9MB/s eta 0:00:01[K     |███████████████████████▍        | 71kB 8.9MB/s eta 0:00:01[K     |██████████████████████████▊     | 81kB 9.1MB/s eta 0:00:01[K     |██████████████████████████████  | 92kB 9.9MB/s eta 0:00:01[K     |████████████████████████████████| 102kB 6.7MB/s 
Collecting opencv-python-headless>=4.1.1
[?25l  Downloading https://files.pythonhosted.org/packages/c8/84/72ec52fbac4775c2a5bf

In [None]:
# Fix the PyTorch RNG seed for this run.
torch.manual_seed(1)
# Mini-batch size used by both the train and test DataLoaders.
batch_size = 64

class MnistDataset(Dataset):
    """Wrap an indexable ``(image, label)`` dataset and apply a keyword-style transform.

    The transform, when given, is called as ``transforms(image=array)`` and
    must return a mapping holding the result under the ``"image"`` key.
    """

    def __init__(self, dataset, transforms=None):
        self.dataset = dataset
        self.transforms = transforms

    def __len__(self):
        """Number of samples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for ``idx``, with the image converted to a NumPy array."""
        sample, target = self.dataset[idx]
        pixels = np.array(sample)

        # Without a transform the raw array is handed back as-is.
        if self.transforms is None:
            return (pixels, target)

        return (self.transforms(image=pixels)["image"], target)
    
import albumentations as A
from albumentations.pytorch import ToTensorV2

# Training-time augmentation pipeline: random shift/scale/rotate, a 28x28
# random crop, random brightness/contrast, then normalisation and conversion
# to a tensor.
train_transform = A.Compose([
#     A.SmallestMaxSize(max_size=160),
    A.ShiftScaleRotate(shift_limit=0.09, scale_limit=0.09, rotate_limit=10, p=0.5),
    A.RandomCrop(height=28, width=28),
#     A.RGBShift(r_shift_limit=15, g_shift_limit=15, b_shift_limit=15, p=0.5),
    A.RandomBrightnessContrast(p=0.5),
    A.Normalize(mean=(0.1307,), std=(0.3081,)),
    ToTensorV2(),
])

# Evaluation pipeline: no augmentation, only the same normalisation and
# tensor conversion as in training.
test_transform = A.Compose([
    A.Normalize(mean=(0.1307,), std=(0.3081,)),
    ToTensorV2(),
])

# Worker processes and pinned memory are only requested when CUDA is in use.
kwargs = {'num_workers': 4, 'pin_memory': True} if use_cuda else {}

# The torchvision MNIST datasets are created without a transform of their own;
# MnistDataset converts each image to a NumPy array and applies the
# albumentations pipeline.
train_loader = DataLoader(
    MnistDataset(datasets.MNIST('../data', train=True, download=True), transforms=train_transform),
    batch_size=batch_size, shuffle=True, **kwargs)

# NOTE(review): the test loader is shuffled as well; this looks unnecessary
# for evaluation — confirm whether it is intentional.
test_loader = DataLoader(
    MnistDataset(datasets.MNIST('../data', train=False, download=True), transforms=test_transform),
    batch_size=batch_size, shuffle=True, **kwargs)


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ../data/MNIST/raw/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 503: Service Unavailable

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ../data/MNIST/raw/train-images-idx3-ubyte.gz


HBox(children=(FloatProgress(value=0.0, max=9912422.0), HTML(value='')))


Extracting ../data/MNIST/raw/train-images-idx3-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 503: Service Unavailable

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ../data/MNIST/raw/train-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=0.0, max=28881.0), HTML(value='')))


Extracting ../data/MNIST/raw/train-labels-idx1-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 503: Service Unavailable

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw/t10k-images-idx3-ubyte.gz


HBox(children=(FloatProgress(value=0.0, max=1648877.0), HTML(value='')))


Extracting ../data/MNIST/raw/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 503: Service Unavailable

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=0.0, max=4542.0), HTML(value='')))


Extracting ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw

Processing...
Done!


  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
  cpuset_checked))


In [None]:
def train(model, train_loader, optimizer, scheduler, epoch, dropout, device):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module called as ``model(batch, dropout)`` and returning
            log-probabilities.
        train_loader: Iterable of ``(data, target)`` batches that supports ``len``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Accepted for interface compatibility; not used here.
        epoch: Accepted for interface compatibility; not used here.
        dropout: Forwarded to the model as its second positional argument.
        device: Device every batch is moved to.

    Returns:
        Tuple ``(mean per-batch NLL loss, number of correct predictions)``.
    """
    model.train()
    running_loss = 0
    num_correct = 0

    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        log_probs = model(inputs, dropout)
        batch_loss = F.nll_loss(log_probs, labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        # The predicted class is the index of the largest log-probability.
        predictions = log_probs.argmax(dim=1)
        num_correct += (predictions == labels).sum().item()

    return running_loss / len(train_loader), num_correct


def test(model, test_loader, device):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    
    return test_loss, correct

In [None]:
import time
import math

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        Tuple ``(minutes, seconds)``, both truncated to integers.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

def initialize_weights(m):
    """Initialise one module in place; intended to be used with ``Module.apply``.

    ``BatchNorm2d`` layers get weight 1 and bias 0. ``Conv2d`` layers get
    Xavier-uniform weights and, when a bias exists, a zero bias. Every other
    module type is left untouched.
    """
    if isinstance(m, nn.BatchNorm2d):
        nn.init.constant_(m.weight.data, 1)
        nn.init.constant_(m.bias.data, 0)
        return

    if not isinstance(m, nn.Conv2d):
        return

    nn.init.xavier_uniform_(m.weight.data)
    if m.bias is not None:
        nn.init.constant_(m.bias.data, 0)

# Fresh model with explicit weight initialisation (see initialize_weights),
# moved to the selected device.
model = Net().apply(initialize_weights).to(device)

# model = Net().to(device)

# Train Params
epochs = 15
lr = 0.01        # starting learning rate of the schedule below
max_lr = 0.1     # peak learning rate of the schedule below
steps_per_epoch = len(train_loader)
dropout = True   # forwarded to Net.forward during training

print("Using Device:", device)
print("Epochs:", epochs)
print("Lr:", lr)
print("Max Lr:", max_lr)
print("Batch Size:", batch_size)
print("\n")

optimizer = optim.SGD(model.parameters(), lr=lr,momentum = 0.9)

# This OneCycleLR object is never stepped: `scheduler` is rebound to a NumPy
# array a few lines below.
# NOTE(review): constructing OneCycleLR may itself modify the optimizer's
# param groups (initial lr, and momentum when momentum cycling is enabled by
# default) — confirm against the PyTorch docs whether the effective momentum
# is still the 0.9 requested above.
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=max_lr, steps_per_epoch=steps_per_epoch, epochs=epochs, anneal_strategy='linear')

# One-cycle schedule with a custom function: one learning rate per epoch,
# piecewise-linear from `lr` at epoch 0 up to `max_lr` at epoch 2, down to
# `lr / 5` at epoch 5, then down to 0 at epoch `epochs`.
scheduler = np.interp(np.arange(epochs+1), [0, 2, 5, epochs], [lr, max_lr, lr/5.0, 0])

# A checkpoint is only written once validation accuracy (in percent) reaches
# this threshold; afterwards it tracks the best accuracy seen so far.
best_perc = 99.4
best_path = ""

def lr_schedules(epoch):
    # Look up the precomputed learning rate for the given 0-based epoch index.
    return scheduler[epoch]

# Main loop: one training pass and one evaluation pass per epoch, with the
# learning rate set by hand from the precomputed per-epoch schedule.
for epoch in range(epochs):
    # Manually assign lr
    optimizer.param_groups[0]['lr'] = lr_schedules(epoch)

    print(f'Epoch: {epoch+1:02}')
    print(f'\t Learning Rate: {optimizer.param_groups[0]["lr"]:.6f}')

    start_time = time.time()

    # `scheduler` is the NumPy schedule array at this point; train() accepts it
    # but does not use it.
    train_loss, train_correct = train(model, train_loader, optimizer, scheduler, epoch, dropout, device)
    valid_loss, valid_correct = test(model, test_loader, device)
    
    # Validation accuracy in percent.
    valid_perc = (100. * valid_correct / len(test_loader.dataset))
    
    # Save the weights whenever accuracy matches or beats the best so far
    # (initially the fixed threshold); the accuracy is embedded in the file name.
    if valid_perc >= best_perc:
        best_perc = valid_perc
        best_path = f'model_weights_{valid_perc:.2f}.pth'
        torch.save(model.state_dict(), best_path)

    # The measured time covers training, evaluation and any checkpoint save.
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'\t          Time: {epoch_mins}m {epoch_secs}s');
    print(f'\t    Train Loss: {train_loss:.6f}')
    print(f'\tTrain Accuracy: {train_correct:5d}/{len(train_loader.dataset):5d} | Percent: {(100. * train_correct / len(train_loader.dataset)):.2f}%')
    print(f'\t     Val. Loss: {valid_loss:.6f}')
    print(f'\t  Val Accuracy: {valid_correct:5d}/{len(test_loader.dataset):5d} | Percent: {(100. * valid_correct / len(test_loader.dataset)):.2f}%')

Using Device: cuda
Epochs: 15
Lr: 0.01
Max Lr: 0.1
Batch Size: 64


Epoch: 01
	 Learning Rate: 0.010000


  cpuset_checked))


	          Time: 0m 16s
	    Train Loss: 0.354249
	Train Accuracy: 53695/60000 | Percent: 89.49%
	     Val. Loss: 0.067935
	  Val Accuracy:  9796/10000 | Percent: 97.96%
Epoch: 02
	 Learning Rate: 0.055000
	          Time: 0m 16s
	    Train Loss: 0.114741
	Train Accuracy: 57902/60000 | Percent: 96.50%
	     Val. Loss: 0.063230
	  Val Accuracy:  9806/10000 | Percent: 98.06%
Epoch: 03
	 Learning Rate: 0.100000
	          Time: 0m 16s
	    Train Loss: 0.082678
	Train Accuracy: 58453/60000 | Percent: 97.42%
	     Val. Loss: 0.044848
	  Val Accuracy:  9852/10000 | Percent: 98.52%
Epoch: 04
	 Learning Rate: 0.067333
	          Time: 0m 16s
	    Train Loss: 0.051603
	Train Accuracy: 59054/60000 | Percent: 98.42%
	     Val. Loss: 0.046088
	  Val Accuracy:  9848/10000 | Percent: 98.48%
Epoch: 05
	 Learning Rate: 0.034667
	          Time: 0m 16s
	    Train Loss: 0.037320
	Train Accuracy: 59306/60000 | Percent: 98.84%
	     Val. Loss: 0.018956
	  Val Accuracy:  9939/10000 | Percent: 99.39%
Epoch: