In [1]:
import torch ## importing torch
import torch.nn as nn
import torch.nn.functional as F ## importing torch neural network module
import torch.optim as optim
from torchvision import datasets, transforms
!pip install torchsummary
from torchsummary import summary ## To see the model summary

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
device  # last expression in the cell, so the notebook displays the chosen device

device(type='cuda')

In [3]:
# Data loading: build the MNIST training and test DataLoaders.

# The batch size must be between 1 and the number of training samples.
# Start with a guess, check how much memory one run uses, then adjust.
# A power of two is the conventional choice.
batch_size = 128

# Compose chains transforms like a pipeline.
# ToTensor converts each image to a float tensor scaled to [0.0, 1.0].
# Normalize then standardises it with the widely used MNIST mean (0.1307)
# and standard deviation (0.3081).
# The same pipeline is shared by both splits so they are preprocessed identically.
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

# Training data: reshuffled every epoch so batches differ between epochs.
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                   transform=mnist_transform),
    batch_size=batch_size, shuffle=True)

# Test data: evaluation sums over the whole split, so order is irrelevant and
# shuffling is unnecessary. download=True makes this loader usable on its own
# instead of relying on the training dataset having fetched the files first.
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, download=True,
                   transform=mnist_transform),
    batch_size=batch_size, shuffle=False)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ../data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 220735282.77it/s]

Extracting ../data/MNIST/raw/train-images-idx3-ubyte.gz to ../data/MNIST/raw






Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ../data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 85366944.20it/s]


Extracting ../data/MNIST/raw/train-labels-idx1-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 115863484.61it/s]


Extracting ../data/MNIST/raw/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 7063599.84it/s]


Extracting ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw



# Some Notes on our naive model

We are going to write a network based on what we have learnt so far. 

The input image is 28x28x1. We are going to add as many layers as required to reach a receptive field (RF) of at least 32.

In [4]:
class FirstDNN(nn.Module):
  """Plain seven-convolution CNN for 28x28 single-channel images (MNIST).

  Layout: two 3x3 convolutions + max-pool, two more 3x3 convolutions +
  max-pool, then three unpadded 3x3 convolutions that shrink the 7x7 map to
  1x1 with 10 output channels (one per class).

  The per-layer comments track the receptive field using:
    r_in / r_out : receptive field before / after the layer
    n_in / n_out : spatial size before / after the layer
    j_in / j_out : jump (cumulative stride) before / after the layer
    s            : stride of the layer
  """

  def __init__(self):
    super().__init__()
    # Edges and gradients
    # r_in:1, n_in:28, j_in:1, s:1, r_out:3, n_out:28, j_out:1
    self.conv1 = nn.Conv2d(1, 32, 3, padding=1)  # 1 -> 32 channels, 3x3 kernel

    # r_in:3, n_in:28, j_in:1, s:1, r_out:5, n_out:28, j_out:1
    self.conv2 = nn.Conv2d(32, 64, 3, padding=1)  # 32 -> 64 channels, 3x3 kernel

    # r_in:5, n_in:28, j_in:1, s:2, r_out:6, n_out:14, j_out:2
    self.pool1 = nn.MaxPool2d(2, 2)  # 2x2 max pooling, stride 2

    # Textures and patterns
    # r_in:6, n_in:14, j_in:2, s:1, r_out:10, n_out:14, j_out:2
    self.conv3 = nn.Conv2d(64, 128, 3, padding=1)  # 64 -> 128 channels, 3x3 kernel

    # r_in:10, n_in:14, j_in:2, s:1, r_out:14, n_out:14, j_out:2
    self.conv4 = nn.Conv2d(128, 256, 3, padding=1)  # 128 -> 256 channels, 3x3 kernel

    # r_in:14, n_in:14, j_in:2, s:2, r_out:16, n_out:7, j_out:4
    self.pool2 = nn.MaxPool2d(2, 2)  # 2x2 max pooling, stride 2

    # Objects (no padding from here on, so each layer shrinks the map by 2)
    # r_in:16, n_in:7, j_in:4, s:1, r_out:24, n_out:5, j_out:4
    self.conv5 = nn.Conv2d(256, 512, 3)  # 256 -> 512 channels, 3x3 kernel

    # r_in:24, n_in:5, j_in:4, s:1, r_out:32, n_out:3, j_out:4
    self.conv6 = nn.Conv2d(512, 1024, 3)  # 512 -> 1024 channels, 3x3 kernel

    # r_in:32, n_in:3, j_in:4, s:1, r_out:40, n_out:1, j_out:4
    self.conv7 = nn.Conv2d(1024, 10, 3)  # 1024 -> 10 channels (class scores), 3x3 kernel

  def forward(self, x):
    """Return per-class log-probabilities of shape (batch, 10).

    Args:
      x: image batch of shape (batch, 1, 28, 28).
    """
    x = self.pool1(F.relu(self.conv2(F.relu(self.conv1(x)))))  # ReLU(conv1) -> ReLU(conv2) -> max-pool
    x = self.pool2(F.relu(self.conv4(F.relu(self.conv3(x)))))  # ReLU(conv3) -> ReLU(conv4) -> max-pool
    x = F.relu(self.conv6(F.relu(self.conv5(x))))  # ReLU(conv5) -> ReLU(conv6)

    # Deliberately no ReLU after the last convolution: clamping the class scores
    # to be non-negative before the softmax hurts accuracy.
    x = self.conv7(x)

    # Collapse (batch, 10, 1, 1) to (batch, 10); -1 lets view infer the batch size.
    x = x.view(-1, 10)

    # Log-softmax over the class dimension: the exponentiated outputs lie in
    # [0, 1] and sum to 1 per sample, which is the input NLLLoss expects.
    # dim is passed explicitly because the implicit choice is deprecated and
    # emits a UserWarning on every forward pass.
    return F.log_softmax(x, dim=1)

'''
Setting the feed for the different layers. Relu is the activation function. It stands for Rectified Linear Unit
'''

'\nSetting the feed for the different layers. Relu is the activation function. It stands for Rectified Linear Unit\n'

In [5]:
# Instantiate the network and move its parameters to the selected device
# (the GPU when CUDA is available, otherwise the CPU).
model = FirstDNN().to(device)

In [6]:
# Model summary: per-layer output shapes and parameter counts for a single
# 1x28x28 input (input_size excludes the batch dimension).
summary(model, input_size=(1, 28, 28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 32, 28, 28]             320
            Conv2d-2           [-1, 64, 28, 28]          18,496
         MaxPool2d-3           [-1, 64, 14, 14]               0
            Conv2d-4          [-1, 128, 14, 14]          73,856
            Conv2d-5          [-1, 256, 14, 14]         295,168
         MaxPool2d-6            [-1, 256, 7, 7]               0
            Conv2d-7            [-1, 512, 5, 5]       1,180,160
            Conv2d-8           [-1, 1024, 3, 3]       4,719,616
            Conv2d-9             [-1, 10, 1, 1]          92,170
Total params: 6,379,786
Trainable params: 6,379,786
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 1.51
Params size (MB): 24.34
Estimated Total Size (MB): 25.85
-------------------------------------

  return F.log_softmax(x) ## Log softmax normalize the output of a network to a probability distribution over predicted output classes,


In [7]:
from tqdm import tqdm ## TQDM is a function which is used for time calculation
def train(model, device, train_loader, optimizer, epoch):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: network to optimise; put into training mode before the loop.
        device: device every batch is moved to before the forward pass.
        train_loader: iterable yielding ``(data, target)`` batches.
        optimizer: optimizer whose gradients are cleared and stepped per batch.
        epoch: accepted for the caller's convenience; not used in the body.
    """
    model.train()
    pbar = tqdm(train_loader)  # progress bar wrapped around the loader
    for batch_idx, batch in enumerate(pbar):
        data, target = batch
        data = data.to(device)
        target = target.to(device)

        # Clear gradients left over from the previous batch.
        optimizer.zero_grad()

        # Negative log-likelihood on the model's log-probabilities: minimising
        # it maximises the probability assigned to the correct class.
        loss = F.nll_loss(model(data), target)

        loss.backward()   # back-propagate to compute gradients
        optimizer.step()  # update the weights from those gradients

        # Show the running loss and batch index on the progress bar.
        pbar.set_description(desc= f'loss={loss.item()} batch_id={batch_idx}')

'''
Function for prediction of the model
'''
def test(model, device, test_loader):
    model.eval() # Model evaluation on the test data
    test_loss = 0
    correct = 0
    with torch.no_grad(): # For test data, there is no requirement for no gradient calculation
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data) # Output for the model
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()  

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset))) ## Calculating the accuracy of the model based on the loss

In [8]:
# torch.optim implements a range of optimisation algorithms. Stochastic
# gradient descent (SGD), optionally with momentum, is used here: it
# iteratively improves a sufficiently smooth objective function.
# The learning rate sets the step size taken at each iteration.
# Momentum accumulates past gradients, which typically shortens convergence time.
LEARNING_RATE = 0.01
MOMENTUM = 0.9
NUM_EPOCHS = 1  # one epoch = one full pass over every image in the training set

optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

# Epochs are numbered from 1; evaluate on the test set after each one.
for epoch in range(1, NUM_EPOCHS + 1):
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)

  return F.log_softmax(x) ## Log softmax normalize the output of a network to a probability distribution over predicted output classes,
loss=0.10209507495164871 batch_id=468: 100%|██████████| 469/469 [00:32<00:00, 14.61it/s]



Test set: Average loss: 0.0552, Accuracy: 9812/10000 (98%)



In [None]:
# Observations from the hyperparameter experiments:
## Smaller batch size: accuracy went up or down (no clear trend), and loss increased.
## More epochs: total run time increases.
## Momentum changed from 0.9 to 0.3: accuracy decreases and loss increases.
## lr changed from 0.01 to 0.1: accuracy drops sharply and loss increases substantially.
## lr=0.05, momentum=0.9: accuracy = 39%.
## Larger batch size: time per epoch decreases.