# Convolutional Neural Networks
---
CNNs work so well because of 3 properties observed in data:
* __Stationarity of Statistics:__ This means that, given small local patches, there is a high probability of finding recurring patterns in the locality. Put more simply, some motifs tend to reoccur within the (image) data. This allows for parameter sharing in CNNs, which reduces the number of parameters in the model and hence allows shorter training times compared to, say, fully connected layers.

* __Locality of Pixel Dependencies:__ This principle states that pixels that are close to each other tend to be more correlated and dependent on each other than those far apart. In simpler words, pixels that are closer to each other tend to be of similar color. This also means that related data tends to be concentrated into small patches. Locality affects the sparsity of the connections.

* __Compositionality:__ In terms of image data, images are composed of smaller, simpler patterns. In fact, all data is composed of simpler data. Thus, instead of looking for a certain object within the image, the network can focus on discovering these patterns within the image.

In this notebook, we will see how CNNs perform compared to FC networks.

In [1]:
# Plotting helpers come from the project-local `res.plot_lib` module.
# NOTE(review): `set_default` is presumably provided by this star import — confirm.
from res.plot_lib import *
set_default() # setting the default plot style

In [2]:
# importing dependencies
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import dataset, DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import numpy

In [3]:
def count_parameters(model):
    """Return the total number of scalar elements across a model's parameters.

    Args:
        model: Any object exposing ``parameters()`` that yields objects with
            an ``nelement()`` method (e.g. a ``torch.nn.Module``).

    Returns:
        int: Sum of ``nelement()`` over every parameter; ``0`` when the model
        yields no parameters.
    """
    return sum(tensor.nelement() for tensor in model.parameters())

In [4]:
# Prefer the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
device  # bare expression: echoes the chosen device as the notebook cell output

device(type='cuda', index=0)

In [5]:
# creating the dataloaders to feed data to the models
# Both splits read from the same dataset root and go through the same
# preprocessing, so each is defined once here; this keeps the training and
# validation inputs from silently drifting apart.
_MNIST_ROOT = "E-Learning/NYU-DL/data/MNIST"
_mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    # Fixed per-channel mean / std used to standardise the single image channel.
    transforms.Normalize((0.1307,), (0.3081,)),
])

# Training split: small batches, reshuffled on every pass.
train_loader = DataLoader(
    datasets.MNIST(_MNIST_ROOT, train=True, download=True,
                   transform=_mnist_transform),
    batch_size=64,
    shuffle=True,
)

# Held-out split: larger batches in a fixed order (no gradient step depends on it).
validation_loader = DataLoader(
    datasets.MNIST(_MNIST_ROOT, train=False, download=True,
                   transform=_mnist_transform),
    batch_size=256,
    shuffle=False,
)

## Modeling
---
### FC Model:

In [6]:
# Number of features per sample once a 28 x 28 image is flattened to a vector.
input_size  = 28*28  
# Number of output classes (presumably the ten MNIST digits 0-9).
output_size = 10  

class FCModel(nn.Module):
    """Fully connected classifier with two hidden layers of equal width.

    Layer stack: Linear -> ReLU -> Linear -> ReLU -> Linear -> LogSoftmax,
    so the output holds per-class log-probabilities.

    Args:
        input_size: Number of features per sample after flattening.
        n_hidden: Width of both hidden layers.
        output_size: Number of output classes.
    """

    def __init__(self, input_size, n_hidden, output_size):
        super().__init__()
        self.input_size = input_size
        self.n_hidden = n_hidden
        self.output_size = output_size

        # Layers are created in forward order so that parameter initialisation
        # and the state-dict keys (network.0, network.2, network.4) are stable.
        layers = [
            nn.Linear(input_size, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, output_size),
            nn.LogSoftmax(dim=1),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten ``x`` to rows of ``input_size`` features and classify them.

        Returns:
            Tensor of shape (rows, output_size) containing log-probabilities.
        """
        flat = x.view(-1, self.input_size)
        return self.network(flat)

### CNN Model:

In [7]:
class CNNModel(nn.Module):
    """Small ConvNet for single-channel 28 x 28 images.

    Two (5x5 convolution -> ReLU -> 2x2 max-pool) stages are followed by two
    fully connected layers and a log-softmax, so the output holds per-class
    log-probabilities.

    Args:
        n_channels: Number of feature maps produced by the second convolution;
            the first convolution produces ``n_channels // 2``.
        output_size: Number of output classes.
    """

    def __init__(self, n_channels, output_size):
        super().__init__()
        self.n_channels = n_channels
        self.output_size = output_size
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=n_channels//2, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=n_channels//2, out_channels=n_channels, kernel_size=5)
        # 4 x 4 is the spatial size left after conv(5) -> pool(2) -> conv(5) -> pool(2)
        # on a 28 x 28 input, which is why the input resolution is fixed.
        self.fc1 = nn.Linear(n_channels*4*4, 50)
        # Size the classifier head from `output_size` rather than a hard-coded 10,
        # so the constructor argument is actually honoured.
        self.fc2 = nn.Linear(50, output_size)

    def forward(self, x):
        """Classify a batch of images.

        Args:
            x: Tensor of shape (m, 1, 28, 28) in PyTorch's (N, C, H, W) layout.

        Returns:
            Tensor of shape (m, output_size) containing log-probabilities.
        """
        # Pooling and flattening have no parameters, so the functional forms are
        # used instead of constructing new modules on every forward pass.
        x = F.relu(self.conv1(x))            # m x n_channels//2 x 24 x 24
        x = F.max_pool2d(x, kernel_size=2)   # m x n_channels//2 x 12 x 12
        x = F.relu(self.conv2(x))            # m x n_channels x 8 x 8
        x = F.max_pool2d(x, kernel_size=2)   # m x n_channels x 4 x 4
        x = x.flatten(start_dim=1)           # m x (n_channels * 4 * 4)
        x = F.relu(self.fc1(x))              # m x 50
        x = self.fc2(x)                      # m x output_size
        return F.log_softmax(x, dim=1)

Now that we have defined the models, let us define the training and validation functions.

In [8]:
# Accuracy (in percent) appended by each call to test(), in call order.
accuracy_list = []

def train(epochs, model, perm=torch.arange(0, 784).long()):
    """Run one pass over ``train_loader``, updating ``model`` with NLL loss.

    Relies on the module-level ``train_loader``, ``device`` and ``optimizer``;
    ``optimizer`` must already have been built over ``model``'s parameters.

    Args:
        epochs: Index of the current epoch. It only labels the progress
            output; the function always performs exactly one pass.
        model: Network called on (N, 1, 28, 28) batches; its output is fed to
            ``F.nll_loss``, so it is expected to return log-probabilities.
        perm: 1-D index tensor applied to the 784 flattened pixels of every
            image. The default is the identity ordering.
    """
    model.train()
    # The permutation is loop-invariant: move it to the device once instead of
    # letting every batch's indexing transfer it again.
    perm = perm.to(device)
    n_batches = len(train_loader)
    n_samples = len(train_loader.dataset)
    for batch_idx, (data, label) in enumerate(train_loader):
        # moving the data and label to device
        data, label = data.to(device), label.to(device)
        # permute pixels: flatten, reorder columns, restore the image shape
        data = data.view(-1, 28*28)
        data = data[:, perm]
        data = data.view(-1, 1, 28, 28)
        # resetting gradients
        optimizer.zero_grad()
        # training the model
        output = model(data)
        loss = F.nll_loss(output, label)
        loss.backward()
        optimizer.step()

        if batch_idx % 100 == 0:
            # Report the epoch that was passed in, not a global `epoch` that
            # may not exist outside the notebook's training loop.
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epochs, batch_idx * len(data), n_samples,
                100. * batch_idx / n_batches, loss.item()))
            

def test(model, perm=torch.arange(0, 784).long()):
    """Evaluate ``model`` on ``validation_loader`` and record its accuracy.

    Relies on the module-level ``validation_loader``, ``device`` and
    ``accuracy_list``. Prints the average loss and accuracy, and appends the
    accuracy (in percent) to ``accuracy_list``.

    Args:
        model: Network called on (N, 1, 28, 28) batches; its output is fed to
            ``F.nll_loss``, so it is expected to return log-probabilities.
        perm: 1-D index tensor applied to the 784 flattened pixels of every
            image. The default is the identity ordering.
    """
    model.eval()
    # Loop-invariant: move the permutation to the device once.
    perm = perm.to(device)
    test_loss = 0
    correct = 0
    # Evaluation never calls backward(), so skip building autograd graphs.
    with torch.no_grad():
        for data, label in validation_loader:
            # send to device
            data, label = data.to(device), label.to(device)

            # permute pixels: flatten, reorder columns, restore the image shape
            data = data.view(-1, 28*28)
            data = data[:, perm]
            data = data.view(-1, 1, 28, 28)

            output = model(data)
            # sum up batch loss; it is averaged over the whole dataset below
            test_loss += F.nll_loss(output, label, reduction='sum').item()
            # index of the max log-probability is the predicted class
            pred = output.argmax(dim=1)
            correct += (pred == label).sum().item()

    n_samples = len(validation_loader.dataset)
    test_loss /= n_samples
    accuracy = 100. * correct / n_samples
    accuracy_list.append(accuracy)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, n_samples,
        accuracy))

## Training the Models
---

### FC Model

In [9]:

# Width of each hidden layer in the fully connected model.
n_hidden = 8

# Build the network on the selected device. `optimizer` is deliberately a
# module-level name: train() looks it up globally rather than taking it as an argument.
fc_model = FCModel(input_size, n_hidden, output_size).to(device)
optimizer = optim.SGD(params=fc_model.parameters(), lr=0.01, momentum=0.5)
print('Number of parameters: {}'.format(count_parameters(fc_model)))

# A single epoch: one training pass followed by one validation pass.
for epoch in range(1):
    train(epoch, fc_model)
    test(fc_model)

Number of parameters: 6442

Test set: Average loss: 0.4281, Accuracy: 8719/10000 (87%)



### ConvNet Model

In [11]:
# Number of feature maps produced by the second convolution.
n_channels = 6

# Build the network on the selected device. `optimizer` is deliberately a
# module-level name: train() looks it up globally rather than taking it as an argument.
cnn_model = CNNModel(n_channels, output_size).to(device)
optimizer = optim.SGD(params=cnn_model.parameters(), lr=0.01, momentum=0.5)
print('Number of parameters: {}'.format(count_parameters(cnn_model)))

# A single epoch: one training pass followed by one validation pass.
for epoch in range(1):
    train(epoch, cnn_model)
    test(cnn_model)

Number of parameters: 5894

Test set: Average loss: 0.2056, Accuracy: 9361/10000 (94%)

