<a href="https://colab.research.google.com/github/amanjain487/assembler/blob/master/Assignment_1/Assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from __future__ import print_function 
# this is not a import statement actually. it tells python compiler to use functions from advanced versions.
# example - python 3.0+ uses print as function, so if we want to use print as function in lower releases, say python 2.6, then above statement does that.
 
import torch 
# a Tensor library like NumPy, with strong GPU support
 
import torch.nn as nn 
# a neural networks library deeply integrated with autograd designed for maximum flexibility
 
import torch.nn.functional as F 
# functional operations required for NN
# like, convolution operation, activation function, los function, pooling and so on...
 
import torch.optim as optim 
# package which implements various optimization algorithms - specifically for updating parameter values
 
from torchvision import datasets 
# contains various famous datasets, which can be loaded to train and test to model
 
from torchvision import transforms 
# common image transformation functions - for pre and post processing images
# like normalization, convert to grayscale, flip, crop and so on..

In [3]:
class Net(nn.Module): 
  # define a class named "Net" which takes nn.Module as parameter
  # nn.Module is a base module class upon which our model builds
  # nn.Module is parent class and Net is child class in this case.
 
    def __init__(self): 
      # this function is called as soon as an object is created of this class
      # consider it as constructor of this class
      # it takes itself as parameter
      # it is a constructor of Net class
 
        super(Net, self).__init__() 
        # lets you avoid referring to base class explicitly.
        # .init() is a constructor of base/parent class
        # in our case, this line is constructor of class "nn.Module"
 
                                                         #input - 28x28x1                        RF = 1x1
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)      #input - 28x28x1    Output - 28x28x32   RF - 3x3
        # create 2d convolutional layer number 1
        # 1st parameter = number of channels in input
        # 2nd parameter = number of channels in output
        # 3rd paramter = size of kernel (always square)
        # padding is specified
 
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)     #input - 28x28x1    Output - 28x28x64   RF - 5x5
        # create 2d convolutional layer number 2
 
        self.pool1 = nn.MaxPool2d(2, 2)                  #input - 28x28x64   Output - 1414x64    RF - 10x10
        # create pooling layer number 1
        # max pooling is done
        # parameters are kernel size and stride
        # 1st parameter = kernel size
        # 2nd parameter = stride
        # padding can also be specified
 
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)    #input - 14x14x64   Output - 14x14x128  RF - 12x12
        # create 2d convolutional layer number 3
 
        self.conv4 = nn.Conv2d(128, 256, 3, padding=1)   #input - 14x14x128  Output - 14x14x256  RF - 14x14
        # create 2d convolutional layer number 4
 
        self.pool2 = nn.MaxPool2d(2, 2)                  #input - 14x14x256  Output - 7x7x256    RF - 28x28
        # create pooling layer number 2
        # max pooling is done
 
        self.conv5 = nn.Conv2d(256, 512, 3)              #input - 7x7x256    Output - 5x5x512    RF - 30x30
        # create 2d convolutional layer number 5
 
        self.conv6 = nn.Conv2d(512, 1024, 3)             #input - 5x5x512    Output - 3x3x1024   RF - 32x32
        # create 2d convolutional layer number 63
 
        self.conv7 = nn.Conv2d(1024, 10, 3)              #input - 3x3x1024   Output - 1x1x10     RF - 34x34
        # create 2d convolutional layer number 7
 
 
    def forward(self, x): 
      # defines the network structure
      # define how model is going to run from input to output
      # input is passed as parameter
      # consider it as forward pass which predicts output at last layer
      # initially, x is a vector/matrix of input pixels
 
        x = self.pool1(F.relu(self.conv2(F.relu(self.conv1(x))))) 
        # apply kernel 1 on input
        # followed by relu activation function
        # followed by convolution with kernel 2 in layer 2
        # followed by relu activation function
        # finally, apply max pooling
 
        x = self.pool2(F.relu(self.conv4(F.relu(self.conv3(x))))) 
        # apply convolution with kernel 3 in layer 3
        # followed by relu activation function
        # followed by convolution with kernel 4 in layer 4
        # followed by relu activation function
        # finally, apply max pooling
 
        x = F.relu(self.conv6(F.relu(self.conv5(x)))) 
        # apply convolution with kernel 5 in layer 5
        # followed by relu activation function
        # followed by convolution with kernel 6 in layer 6
        # followed by relu activation function
 
        x = F.relu(self.conv7(x)) 
        # apply convolution with kernel 7 in layer 7
        # followed by relu activation function
 
        x = x.view(-1, 10) 
        # behaves like -1 in numpy.reshape
        # arrange x such that it can have any number of rows but 10 columns
 
        return F.log_softmax(x) 
        # apply softmax function
        # followed by log
        # finally return the output.

In [4]:
!pip install torchsummary 
# install torchsummary which provides functions like summary

from torchsummary import summary 
# summary provides information complementary to what is provided by print(your_model) 

use_cuda = torch.cuda.is_available() 
# check if CUDA is available or not
# CUDA is a parallel computing platform for general computing on GPUs (graphics processing units)
# CUDA enables developers to speed up compute-intensive applications by using the power of GPUs for the parallelizable part of the computation.

device = torch.device("cuda" if use_cuda else "cpu") 
# use gpu if cuda is available
# else use cpu

model = Net().to(device) 
# pass the entire model to gpu if available or cpu if gpu is not available

summary(model, input_size=(1, 28, 28)) 
#print all the details of model
# input shape must be passed, based on which
# it will show at each layer what wil be the output shape
# it also displays number of parameters at each layer, and how many of those are actually trainable

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 32, 28, 28]             320
            Conv2d-2           [-1, 64, 28, 28]          18,496
         MaxPool2d-3           [-1, 64, 14, 14]               0
            Conv2d-4          [-1, 128, 14, 14]          73,856
            Conv2d-5          [-1, 256, 14, 14]         295,168
         MaxPool2d-6            [-1, 256, 7, 7]               0
            Conv2d-7            [-1, 512, 5, 5]       1,180,160
            Conv2d-8           [-1, 1024, 3, 3]       4,719,616
            Conv2d-9             [-1, 10, 1, 1]          92,170
Total params: 6,379,786
Trainable params: 6,379,786
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 1.51
Params size (MB): 24.34
Estimated Total Size (MB): 25.85
-------------------------------------



In [5]:


torch.manual_seed(1) 
# Sets the seed for generating random numbers

batch_size = 128 
# how many instances of training examples utilized in one iteration

kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} 
# number_of_workers means how many process are there to load data into RAM
# to speed up transfer between cpu and gpu, we set pin memory as true

train_loader = torch.utils.data.DataLoader(# load training data
  datasets.MNIST('../data', # path to MNIST dataset in torchvision
                 train=True, # bool denoting whether it is training data or not
                 download=True, # to download data or not
                 transform=transforms.Compose([
                                               transforms.ToTensor(),
                                               # The output of torchvision datasets are PILImage images of range [0, 1].
                                               # We transform them to Tensors of normalized range [-1, 1]
                                               
                                               transforms.Normalize((0.1307,), (0.3081,)) 
                                               # 0.1307 is mean of MNIST dataset
                                               # 0.3081 is std deviation of MNIST Dataset
                                               ])
                 ),
                 batch_size=batch_size, # assign batch size 
                 shuffle=True, # should the data be shuffled at every iteration or not
                 **kwargs # parameters such as num_workers, pin_memory if we use cuda
                 )
 
test_loader = torch.utils.data.DataLoader( #load test data
    datasets.MNIST('../data', #path to MNIST dataset in torchvision
                   train=False, # bool denoting whether it is training data or not
                   transform=transforms.Compose([
                                                 transforms.ToTensor(), 
                                                 # The output of torchvision datasets are PILImage images of range [0, 1].
                                                 # We transform them to Tensors of normalized range [-1, 1]
                                                 
                                                 transforms.Normalize((0.1307,), (0.3081,)) 
                                                 # 0.1307 is mean of MNIST dataset
                                                 # 0.3081 is std deviation of MNIST Dataset
                                                 ])
                   ),
                   batch_size=batch_size, # assign batch size 
                   shuffle=True, # should the data be shuffled at every iteration or not
                   **kwargs # parameters such as num_workers, pin_memory if we use cuda
                   )


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ../data/MNIST/raw/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 503: Service Unavailable

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ../data/MNIST/raw/train-images-idx3-ubyte.gz


HBox(children=(FloatProgress(value=0.0, max=9912422.0), HTML(value='')))


Extracting ../data/MNIST/raw/train-images-idx3-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 503: Service Unavailable

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ../data/MNIST/raw/train-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=0.0, max=28881.0), HTML(value='')))


Extracting ../data/MNIST/raw/train-labels-idx1-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 503: Service Unavailable

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw/t10k-images-idx3-ubyte.gz


HBox(children=(FloatProgress(value=0.0, max=1648877.0), HTML(value='')))


Extracting ../data/MNIST/raw/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 503: Service Unavailable

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=0.0, max=4542.0), HTML(value='')))


Extracting ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw

Processing...
Done!


  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [6]:
from tqdm import tqdm 
# lets any process or loop show smart progress meter

def train(model, device, train_loader, optimizer, epoch): 
  # function to train the model

    model.train() 
    # tell the model that we are training the model

    pbar = tqdm(train_loader) 
    # create progress bar for training

    for batch_idx, (data, target) in enumerate(pbar): 
      # enumerate all batches
      # batch_idx = batch number
      # data = x_train
      # target = y_train

        data, target = data.to(device), target.to(device) 
        # send x_train and y_train to gpu if available else to cpu

        optimizer.zero_grad() 
        # set the gradients to zero before starting to do backpropragation
        # zero_grad clears old gradients from the last step (otherwise you’d just accumulate the gradients from all loss.backward() calls).

        output = model(data) 
        # pass data to model for forward pass

        loss = F.nll_loss(output, target) 
        # calaculate loss function
        # nll_loss = negative log likelihood loss
        # output = prediction by model
        # target = real value

        loss.backward() 
        # computes the derivative of the loss w.r.t. the parameters (or anything requiring gradients) using backpropagation.

        optimizer.step() 
        # causes the optimizer to take a step based on the gradients of the parameters.

        pbar.set_description(desc= f'loss={loss.item()} batch_id={batch_idx}') 
        # format how progress bar should look and what should it display


def test(model, device, test_loader): 
  # function to test the model

    model.eval() 
    # tell the model that we are testing/evaluating the model

    test_loss = 0 
    # to find average loss

    correct = 0 
    # required for calculating accuracy
    # accuracy = correct predictions / total predictions

    with torch.no_grad(): 
      # temporarily disabling gradient calculation

        for data, target in test_loader: 
          # data = x_test, target = y_test

            data, target = data.to(device), target.to(device)
             # send x_text and y_test to gpu if available else to cpu

            output = model(data) 
            # pass data to model for forward pass

            test_loss += F.nll_loss(output, target, reduction='sum').item()
            # sum up batch loss
            # in train function, it computes average loss for each batch
            # but here, it computes and add loss of each entry
            # that is why we use reduction='sum' -> adds the loss for each test data

            pred = output.argmax(dim=1, keepdim=True)  
            # get the index of the max log-probability
            # get index of largest value which is probability that the test data is that number

            correct += pred.eq( # computes element wise equality 
                               target.view_as(pred) # view target as pred
                               ).sum().item() # compare predictions and target
                               # count number of correct predictions and fetch that number and add it to variable correct

    test_loss /= len(test_loader.dataset) 
    # calculate average loss

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset))) 
    #print loss and accuracy of predictions

In [7]:

model = Net().to(device) 
# initialize model and load it in gpu if available, else cpu

optimizer = optim.SGD(# Implements stochastic gradient descent
                      # SGD randomly picks one data point from the whole data set to compute derivatives at each iteration to reduce the computations
                      
                      model.parameters(), 
                      # fetches all the parameters from model

                      lr=0.01, 
                      # learning rate -> determines the step size at each iteration while moving toward a minimum of a loss function

                      momentum=0.9 # technique to improve training speed and accuracy
                      ) 

for epoch in range(1, 2): 
  #epoch means how many times entire dataset is passed to model for training

    train(model, device, train_loader, optimizer, epoch) 
    # train model defined by us in gpu if available, else cpu
    # train loader loads mnist dataset, optimizer used here is SGD, epoch is 1

    test(model, device, test_loader) # test model trained in above line
    # device is gpu if available, else cpu
    # test loader loads test data from mnist dataset

loss=1.9762245416641235 batch_id=468: 100%|██████████| 469/469 [00:17<00:00, 26.16it/s]



Test set: Average loss: 1.8785, Accuracy: 2929/10000 (29%)

