In [1]:
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

In [2]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        #Input will be a 28*28*1 image and output of this Sequential block wil be 22*22*32, RF is 3*3
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 16, 3, bias=False), #After convolving, this will become 26*26
            nn.ReLU(),
            nn.BatchNorm2d(16),
            nn.Dropout2d(0.1),

            #Since we have convoluted once using stride 1, RF has become 5*5
            nn.Conv2d(16, 16, 3, bias=False), 
            nn.ReLU(),
            nn.BatchNorm2d(16),
            nn.Dropout2d(0.1),

            #RF here will be 7*7
            nn.Conv2d(16, 32, 3, bias=False), #After convolving, this will become 24*24
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.Dropout2d(0.1),
        )

        #Input will be a 22*22*32 and output of this Sequential block wil be 11*11*16, RF is 7*7
        self.trans1 = nn.Sequential(
            nn.Conv2d(32, 16, 1, bias=False), 
            nn.ReLU(),

             #RF here will be 14*14
            nn.MaxPool2d(2, 2), #After Max Pooling this become 14*14
        )
        
        #Input will be a 11*11*16 and output of this Sequential block wil be 7*7*16, RF is 16*16
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 16, 3, bias=False),
            nn.ReLU(),
            nn.BatchNorm2d(16),
            nn.Dropout2d(0.1),

            #RF here will be 18*18
            nn.Conv2d(16, 16, 3, bias=False),
            nn.ReLU(),
            nn.BatchNorm2d(16),
            nn.Dropout2d(0.1),
        )

        #Input will be 7*7*16 and output of this Sequential block wil be 5*5*16, RF is 20*20
        self.conv3 = nn.Sequential(
            nn.Conv2d(16, 16, 3, padding=1, bias=False),
            nn.ReLU(),
            nn.BatchNorm2d(16),
            nn.Dropout2d(0.1),

            #RF here will be 22*22
            nn.Conv2d(16, 16, 3, bias=False),
            nn.ReLU(),
            nn.BatchNorm2d(16),
            nn.Dropout2d(0.1),
        )

        # GAP Layer
        # input here will be 5*5*16; output - 1*1*10
        self.avg_pool = nn.Sequential(
            #RF here is 22*22
            nn.Conv2d(16, 10, 1, bias=False),
            nn.AvgPool2d(5)
        )

   
    def forward(self, x): # Forward function defines the computation at every call( Takes x: input, returns log_softmax(x) as output)
        x = self.conv1(x)
        x = self.trans1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.avg_pool(x)

        x = x.view(-1, 10)
        return F.log_softmax(x)



* Batch normalization is used after every convolution.
* Bias is set as False to remove bias terms.
* Dropout of 0.1 is applied on every convolution. It is not applied before last layer since, last layer is our CEO and we cannot hide anything from it. 









In [3]:
!pip install torchsummary #This package provides information complementary to what is provided by print(your_model) in PyTorch
from torchsummary import summary

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:

use_cuda = torch.cuda.is_available() #checking if CUDA is available
device = torch.device("cuda" if use_cuda else "cpu")
model = Net().to(device) # This transfers the model to the device
summary(model, input_size=(1, 28, 28))# This prints the model summary

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 26, 26]             144
              ReLU-2           [-1, 16, 26, 26]               0
       BatchNorm2d-3           [-1, 16, 26, 26]              32
         Dropout2d-4           [-1, 16, 26, 26]               0
            Conv2d-5           [-1, 16, 24, 24]           2,304
              ReLU-6           [-1, 16, 24, 24]               0
       BatchNorm2d-7           [-1, 16, 24, 24]              32
         Dropout2d-8           [-1, 16, 24, 24]               0
            Conv2d-9           [-1, 32, 22, 22]           4,608
             ReLU-10           [-1, 32, 22, 22]               0
      BatchNorm2d-11           [-1, 32, 22, 22]              64
        Dropout2d-12           [-1, 32, 22, 22]               0
           Conv2d-13           [-1, 16, 22, 22]             512
             ReLU-14           [-1, 16,

  return F.log_softmax(x)


In [5]:

torch.manual_seed(1) #Fixes the seed to obtain consistent results on every iterations
batch_size = 128 #This sets the batch size, majorly 2^x values as GPU stores on 128 or 256 bits at a time

# Pin memory is used to reduce data transfer. Ff you load your samples in the Dataset on CPU and would like to push it during training to the GPU, you can speed up the host to device transfer by enabling pin_memory. 
# This lets your DataLoader allocate the samples in page-locked memory, which speeds-up the transfer.
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} 

# This loads the training data and performs normalization 
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                    transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])),
    batch_size=batch_size, shuffle=True, **kwargs)

# This loads the test data and performs normalization 
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])),
    batch_size=batch_size, shuffle=True, **kwargs)


Reference: [Pin_memory](https://discuss.pytorch.org/t/when-to-set-pin-memory-to-true/19723/2)

In [6]:
from tqdm import tqdm# #make your loops show a smart progress meter

def train(model, device, train_loader, optimizer, epoch):
  
  """ Training the model
  Args
  model: the model which will be trained
  device: the device on which model will be trained cpu/gpu
  train_loader: the train data loader from torch.utils.data.DataLoader
  epoch: the number of epochs for which model runs
  optimizer: the optimizer to be used for training
  """
  model.train() # This sets the model on training mode
  pbar = tqdm(train_loader)
  for batch_idx, (data, target) in enumerate(pbar):
    data, target = data.to(device), target.to(device) #This move the data to device
    optimizer.zero_grad() #This optimizer zeroes the gradients
    output = model(data) #To obtain the output for the data
    loss = F.nll_loss(output, target) #loss is negative log likelihood
    loss.backward() #This makes the gradients flow backward
    optimizer.step() #This performs a parameter update on the current gradient which is stored in .grad attribute of a parameter
    pbar.set_description(desc= f'loss={loss.item()} batch_id={batch_idx}') #this is just for beautification of printing


def test(model, device, test_loader):
  """ Testing the model
  Args
  model: the model to be tested
  device: the device to use for testing
  test_loader: the test data loader from torch.utils.data.DataLoader
  """
  model.eval() #This sets the model on eval mode
  test_loss = 0 #This sets the test loss to 0
  correct = 0 #This signifies number of correct classifications
  with torch.no_grad(): #This turns off gradients sicne we are in test mode
    
    for data, target in test_loader:

      data, target = data.to(device), target.to(device)#This moves the data to device
      output = model(data) #To obtain the model output
      test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
      pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
      correct += pred.eq(target.view_as(pred)).sum().item()

  test_loss /= len(test_loader.dataset)

  print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [7]:

model = Net().to(device) ## move the model to device
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9) #Stochastic gradient descent optimizer with model params, learning rate and momentum

num_epoch=20 #defining the epochs
for epoch in range(1, num_epoch+1):
  print('\nEpoch {} : '.format(epoch))
  train(model, device, train_loader, optimizer, epoch) #Training the model
  test(model, device, test_loader) #Testing the model


Epoch 1 : 


  return F.log_softmax(x)
loss=0.24384133517742157 batch_id=468: 100%|██████████| 469/469 [00:20<00:00, 23.25it/s]



Test set: Average loss: 0.0787, Accuracy: 9760/10000 (98%)


Epoch 2 : 


loss=0.11260246485471725 batch_id=468: 100%|██████████| 469/469 [00:16<00:00, 28.42it/s]



Test set: Average loss: 0.0421, Accuracy: 9862/10000 (99%)


Epoch 3 : 


loss=0.06747380644083023 batch_id=468: 100%|██████████| 469/469 [00:17<00:00, 27.35it/s]



Test set: Average loss: 0.0326, Accuracy: 9889/10000 (99%)


Epoch 4 : 


loss=0.026839254423975945 batch_id=468: 100%|██████████| 469/469 [00:16<00:00, 27.85it/s]



Test set: Average loss: 0.0292, Accuracy: 9908/10000 (99%)


Epoch 5 : 


loss=0.04351365566253662 batch_id=468: 100%|██████████| 469/469 [00:17<00:00, 27.55it/s]



Test set: Average loss: 0.0259, Accuracy: 9919/10000 (99%)


Epoch 6 : 


loss=0.030737513676285744 batch_id=468: 100%|██████████| 469/469 [00:16<00:00, 28.16it/s]



Test set: Average loss: 0.0253, Accuracy: 9912/10000 (99%)


Epoch 7 : 


loss=0.01626867987215519 batch_id=468: 100%|██████████| 469/469 [00:16<00:00, 27.93it/s]



Test set: Average loss: 0.0249, Accuracy: 9909/10000 (99%)


Epoch 8 : 


loss=0.04324956238269806 batch_id=468: 100%|██████████| 469/469 [00:16<00:00, 28.28it/s]



Test set: Average loss: 0.0230, Accuracy: 9930/10000 (99%)


Epoch 9 : 


loss=0.04249363765120506 batch_id=468: 100%|██████████| 469/469 [00:16<00:00, 28.57it/s]



Test set: Average loss: 0.0218, Accuracy: 9924/10000 (99%)


Epoch 10 : 


loss=0.056269872933626175 batch_id=468: 100%|██████████| 469/469 [00:16<00:00, 28.18it/s]



Test set: Average loss: 0.0238, Accuracy: 9921/10000 (99%)


Epoch 11 : 


loss=0.11305997520685196 batch_id=468: 100%|██████████| 469/469 [00:16<00:00, 28.88it/s]



Test set: Average loss: 0.0226, Accuracy: 9931/10000 (99%)


Epoch 12 : 


loss=0.045498643070459366 batch_id=468: 100%|██████████| 469/469 [00:16<00:00, 28.66it/s]



Test set: Average loss: 0.0218, Accuracy: 9929/10000 (99%)


Epoch 13 : 


loss=0.08380056172609329 batch_id=468: 100%|██████████| 469/469 [00:16<00:00, 28.44it/s]



Test set: Average loss: 0.0214, Accuracy: 9924/10000 (99%)


Epoch 14 : 


loss=0.031593725085258484 batch_id=468: 100%|██████████| 469/469 [00:16<00:00, 28.89it/s]



Test set: Average loss: 0.0210, Accuracy: 9932/10000 (99%)


Epoch 15 : 


loss=0.028566718101501465 batch_id=468: 100%|██████████| 469/469 [00:16<00:00, 28.63it/s]



Test set: Average loss: 0.0212, Accuracy: 9929/10000 (99%)


Epoch 16 : 


loss=0.06716521829366684 batch_id=468: 100%|██████████| 469/469 [00:16<00:00, 28.25it/s]



Test set: Average loss: 0.0210, Accuracy: 9928/10000 (99%)


Epoch 17 : 


loss=0.022780269384384155 batch_id=468: 100%|██████████| 469/469 [00:16<00:00, 28.29it/s]



Test set: Average loss: 0.0213, Accuracy: 9932/10000 (99%)


Epoch 18 : 


loss=0.049581289291381836 batch_id=468: 100%|██████████| 469/469 [00:16<00:00, 28.52it/s]



Test set: Average loss: 0.0191, Accuracy: 9935/10000 (99%)


Epoch 19 : 


loss=0.05828255042433739 batch_id=468: 100%|██████████| 469/469 [00:16<00:00, 28.59it/s]



Test set: Average loss: 0.0181, Accuracy: 9939/10000 (99%)


Epoch 20 : 


loss=0.00918623898178339 batch_id=468: 100%|██████████| 469/469 [00:17<00:00, 27.07it/s]



Test set: Average loss: 0.0172, Accuracy: 9938/10000 (99%)



### The architecture defined above uses 17200 params and achieves validation accuracy of 99.39% in 19th epoch.

## **Experimentation**

The architecutre defined below is an experiment to use less than 5000 params and achieve val accuracy of 99%.



In [14]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 8, 3, bias=False),
            nn.ReLU(),
            nn.BatchNorm2d(8),
            nn.Dropout2d(0.1),

            nn.Conv2d(8, 8, 3, bias=False), 
            nn.ReLU(),
            nn.BatchNorm2d(8),
            nn.Dropout2d(0.1),
        )

        self.trans1 = nn.Sequential(
            nn.Conv2d(8, 8, 1, bias=False), 
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        )
        
        self.conv2 = nn.Sequential(
            nn.Conv2d(8, 8, 3, bias=False),
            nn.ReLU(),
            nn.BatchNorm2d(8),
            nn.Dropout2d(0.1),

            nn.Conv2d(8, 8, 3, bias=False),
            nn.ReLU(),
            nn.BatchNorm2d(8),
            nn.Dropout2d(0.1),
        )

        self.conv3 = nn.Sequential(
            nn.Conv2d(8, 8, 3, padding=1, bias=False),
            nn.ReLU(),
            nn.BatchNorm2d(8),
            nn.Dropout2d(0.1),
        )

        self.avg_pool = nn.Sequential(
            nn.Conv2d(8, 10, 1, bias=False),
            nn.AvgPool2d(5)
        )

    def forward(self, x): 
        x = self.conv1(x)
        x = self.trans1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.avg_pool(x)

        x = x.view(-1, 10)
        return F.log_softmax(x)


In [15]:
use_cuda = torch.cuda.is_available() #checking if CUDA is available
device = torch.device("cuda" if use_cuda else "cpu")
model = Net().to(device) # This transfers the model to the device
summary(model, input_size=(1, 28, 28))# This prints the model summary

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1            [-1, 8, 26, 26]              72
              ReLU-2            [-1, 8, 26, 26]               0
       BatchNorm2d-3            [-1, 8, 26, 26]              16
         Dropout2d-4            [-1, 8, 26, 26]               0
            Conv2d-5            [-1, 8, 24, 24]             576
              ReLU-6            [-1, 8, 24, 24]               0
       BatchNorm2d-7            [-1, 8, 24, 24]              16
         Dropout2d-8            [-1, 8, 24, 24]               0
            Conv2d-9            [-1, 8, 24, 24]              64
             ReLU-10            [-1, 8, 24, 24]               0
        MaxPool2d-11            [-1, 8, 12, 12]               0
           Conv2d-12            [-1, 8, 10, 10]             576
             ReLU-13            [-1, 8, 10, 10]               0
      BatchNorm2d-14            [-1, 8,

  return F.log_softmax(x)


In [16]:
torch.manual_seed(1) #Fixes the seed to obtain consistent results on every iterations
batch_size = 128 #This sets the batch size, majorly 2^x values as GPU stores on 128 or 256 bits at a time

# Pin memory is used to reduce data transfer. Ff you load your samples in the Dataset on CPU and would like to push it during training to the GPU, you can speed up the host to device transfer by enabling pin_memory. 
# This lets your DataLoader allocate the samples in page-locked memory, which speeds-up the transfer.
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} 

# This loads the training data and performs normalization 
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                    transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])),
    batch_size=batch_size, shuffle=True, **kwargs)

# This loads the test data and performs normalization 
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])),
    batch_size=batch_size, shuffle=True, **kwargs)


In [17]:
from tqdm import tqdm# #make your loops show a smart progress meter

def train(model, device, train_loader, optimizer, epoch):
  
  """ Training the model
  Args
  model: the model which will be trained
  device: the device on which model will be trained cpu/gpu
  train_loader: the train data loader from torch.utils.data.DataLoader
  epoch: the number of epochs for which model runs
  optimizer: the optimizer to be used for training
  """
  model.train() # This sets the model on training mode
  pbar = tqdm(train_loader)
  for batch_idx, (data, target) in enumerate(pbar):
    data, target = data.to(device), target.to(device) #This move the data to device
    optimizer.zero_grad() #This optimizer zeroes the gradients
    output = model(data) #To obtain the output for the data
    loss = F.nll_loss(output, target) #loss is negative log likelihood
    loss.backward() #This makes the gradients flow backward
    optimizer.step() #This performs a parameter update on the current gradient which is stored in .grad attribute of a parameter
    pbar.set_description(desc= f'loss={loss.item()} batch_id={batch_idx}') #this is just for beautification of printing


def test(model, device, test_loader):
  """ Testing the model
  Args
  model: the model to be tested
  device: the device to use for testing
  test_loader: the test data loader from torch.utils.data.DataLoader
  """
  model.eval() #This sets the model on eval mode
  test_loss = 0 #This sets the test loss to 0
  correct = 0 #This signifies number of correct classifications
  with torch.no_grad(): #This turns off gradients sicne we are in test mode
    
    for data, target in test_loader:

      data, target = data.to(device), target.to(device)#This moves the data to device
      output = model(data) #To obtain the model output
      test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
      pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
      correct += pred.eq(target.view_as(pred)).sum().item()

  test_loss /= len(test_loader.dataset)

  print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [20]:
model = Net().to(device) ## move the model to device
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9) #Stochastic gradient descent optimizer with model params, learning rate and momentum

num_epoch=20 #defining the epochs
for epoch in range(1, num_epoch+1):
  print('\nEpoch {} : '.format(epoch))
  train(model, device, train_loader, optimizer, epoch) #Training the model
  test(model, device, test_loader) #Testing the model


Epoch 1 : 


  return F.log_softmax(x)
loss=0.47719481587409973 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 30.56it/s]



Test set: Average loss: 0.2480, Accuracy: 9318/10000 (93%)


Epoch 2 : 


loss=0.3515276610851288 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 29.96it/s]



Test set: Average loss: 0.1351, Accuracy: 9588/10000 (96%)


Epoch 3 : 


loss=0.22162796556949615 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 30.64it/s]



Test set: Average loss: 0.1236, Accuracy: 9646/10000 (96%)


Epoch 4 : 


loss=0.2028311938047409 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 30.12it/s]



Test set: Average loss: 0.0990, Accuracy: 9699/10000 (97%)


Epoch 5 : 


loss=0.21560150384902954 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 30.31it/s]



Test set: Average loss: 0.0890, Accuracy: 9712/10000 (97%)


Epoch 6 : 


loss=0.24940691888332367 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 30.31it/s]



Test set: Average loss: 0.0841, Accuracy: 9747/10000 (97%)


Epoch 7 : 


loss=0.26770034432411194 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 30.68it/s]



Test set: Average loss: 0.0769, Accuracy: 9760/10000 (98%)


Epoch 8 : 


loss=0.1762668639421463 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 30.44it/s]



Test set: Average loss: 0.0765, Accuracy: 9756/10000 (98%)


Epoch 9 : 


loss=0.34397467970848083 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 29.44it/s]



Test set: Average loss: 0.0757, Accuracy: 9778/10000 (98%)


Epoch 10 : 


loss=0.17334377765655518 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 30.57it/s]



Test set: Average loss: 0.0741, Accuracy: 9775/10000 (98%)


Epoch 11 : 


loss=0.11125566810369492 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 30.51it/s]



Test set: Average loss: 0.0688, Accuracy: 9788/10000 (98%)


Epoch 12 : 


loss=0.1741122156381607 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 30.13it/s]



Test set: Average loss: 0.0734, Accuracy: 9775/10000 (98%)


Epoch 13 : 


loss=0.11284058541059494 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 30.34it/s]



Test set: Average loss: 0.0657, Accuracy: 9801/10000 (98%)


Epoch 14 : 


loss=0.135837122797966 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 30.74it/s]



Test set: Average loss: 0.0678, Accuracy: 9793/10000 (98%)


Epoch 15 : 


loss=0.10651921480894089 batch_id=468: 100%|██████████| 469/469 [00:16<00:00, 28.82it/s]



Test set: Average loss: 0.0673, Accuracy: 9792/10000 (98%)


Epoch 16 : 


loss=0.2212749570608139 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 30.05it/s]



Test set: Average loss: 0.0688, Accuracy: 9775/10000 (98%)


Epoch 17 : 


loss=0.17073588073253632 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 30.00it/s]



Test set: Average loss: 0.0706, Accuracy: 9782/10000 (98%)


Epoch 18 : 


loss=0.24766004085540771 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 30.22it/s]



Test set: Average loss: 0.0655, Accuracy: 9798/10000 (98%)


Epoch 19 : 


loss=0.15622377395629883 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 29.97it/s]



Test set: Average loss: 0.0642, Accuracy: 9806/10000 (98%)


Epoch 20 : 


loss=0.18333594501018524 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 30.50it/s]



Test set: Average loss: 0.0625, Accuracy: 9802/10000 (98%)



### **The architecture defined for experimentation uses 2600 params and achieve an accuracy of 98.06% in 19th epoch.**