#### Preamble

In [75]:
# TODO: Import necessary libraries
import torch
from torchvision import transforms
from torchvision import datasets
from torch import nn
import numpy as np
from sklearn.metrics import f1_score  

# 7.5 Build your own regularized NN

In this exercise you get to use your previously built networks, but this time you need to add regularization in the form of dropout and $L_2$-regularization.

Each layer has the option of using dropout. Your code needs to allow for this flexibility.

Additionally, adding $L_2$-regularization should also be optional upon creation.

**NOTE**: You are allowed to use built-in functions from pytorch to incorporate this functionality.

### 7.5.1 Implement a regularized model (1 point)

Implement your own model (using `torch`) using the skeleton code provided.

In [86]:
class Model(nn.Module):
    """
    Implement a model that incorporates dropout and L2 regularization
    depending on arguments passed.
    
    Args:
    input_dim: dimensionality of the inputs
    hidden_dim: how many units each hidden layer will have
    out_dim: how many output units
    num_layers: how many hidden layers to create/use
    dropout: a list of booleans specifying which hidden layers will have dropout
    dropout_p: the probability used for the `Dropout` layers
    l2_reg: a boolean value that indicates whether L2 regularization should be used
    """
    # TODO: Implement
    def __init__(self,
                 input_dim: int,
                 hidden_dim: int,
                 out_dim: int,
                 num_layers: int,
                 dropout: list,
                 dropout_p: float,
                 l2_reg: bool):

        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.out_dim = out_dim
        self.num_layers = num_layers
        self.dropout_list = dropout
        self.dropout_p = dropout_p
        self.l2_reg = l2_reg
        self.flatten = nn.Flatten()
        
        # Define the layers of the network
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.hidden_layers = nn.ModuleList()
        for i in range(num_layers):
            self.hidden_layers.append(nn.Linear(hidden_dim, hidden_dim))
        self.fc_out = nn.Linear(hidden_dim, out_dim)
        
        # Use the dropout layer if specified
        if self.dropout_p > 0.0:
            self.dropout = nn.Dropout(p=self.dropout_p)
            
    def forward(self, x):
        x = self.flatten(x)
        x = self.fc1(x)
        x = torch.relu(x)
        
        i=0
        for layer in self.hidden_layers:
            x = layer(x)
            x = torch.relu(x)   
            if self.dropout_p > 0.0 and self.dropout_list[i]:
                x = self.dropout(x)
                
            i+=1
        
        x = self.fc_out(x)

        return x
    
def train(dataloader, model, loss_fn, optimizer, lambda_l2):

    # Total size of dataset for reference
    size = 0

    # places your model into training mode
    model.train()

    correct = 0
    _correct = 0
    
    y_true = torch.tensor(()).to(device)
    y_pred = torch.tensor(()).to(device)

    # Gives X , y for each batch
    for batch, (X, y) in enumerate(dataloader):

        # Converting device to cuda
        X, y = X.to(device), y.to(device)
        model.to(device)

        # Compute prediction error / loss
        # 1. Compute y_pred 
        # 2. Compute loss between y and y_pred using selectd loss function

        pred = model.forward(X)
        loss = loss_fn(pred, y)
        
        y_true = torch.cat((y_true, y), 0)
        y_pred = torch.cat((y_pred, pred.argmax(1)), 0)
     
    
        # L2 regularization
        if model.l2_reg:
            l2=0
            for p in model.parameters():
                l2 = l2 + (p**2).sum()
                loss = loss + lambda_l2 * l2

        # Backpropagation on optimizing for loss
        # 1. Sets gradients as 0 
        # 2. Compute the gradients using back_prop
        # 3. update the parameters using the gradients from step 2

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        _correct = (pred.argmax(1) == y).type(torch.float).sum().item()
        _batch_size = len(X)

        correct += _correct


        size += _batch_size

        if batch % 400 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}]")

    correct/=size
    print(f"Train : \nAccuracy: {(100*correct):>0.1f}%")

    f1 = f1_score(y_true.cpu(), y_pred.cpu(), average='macro')
    print(f"F1 score: {(100*f1):>0.1f}%")

def validation(dataloader, model, loss_fn):

    # Total size of dataset for reference
    size = 0

    # Setting the model under evaluation mode.
    model.eval()

    test_loss = 0
    correct = 0
    _correct = 0
    _batch_size = 0
    
    y_true = torch.tensor(()).to(device)
    y_pred = torch.tensor(()).to(device)

    with torch.no_grad():

        # Gives X , y for each batch
        for batch , (X, y) in enumerate(dataloader):

            X, y = X.to(device), y.to(device)
            model.to(device)
            pred = model.forward(X)
            
            y_true = torch.cat((y_true, y), 0)
            y_pred = torch.cat((y_pred, pred.argmax(1)), 0)

            loss_fn(pred, y).item()
            _batch_size = len(X)

            _correct = (pred.argmax(1) == y).type(torch.float).sum().item()
            correct += _correct

            size+=_batch_size


    ## Calculating Accuracy based on how many y match with y_pred
    correct /= size
    
    ## calculating f1 score
    f1 = f1_score(y_true.cpu(), y_pred.cpu(), average='macro')

    print(f"Validation : \nAccuracy: {(100*correct):>0.1f}%")
    print(f"F1 score: {(100*f1):>0.1f}%")
    print()

### 7.5.2 Experiment with your model (1 point)

Use the MNIST dataset and evaluation code from the previous assignment to run some experiments. Run the following experiments:

1. Shallow network (not more than 1 hidden layer)
1. Shallow regularized network
1. Deep network (at least 3 hidden layers)
1. Deep regularized network

Report Accuracy and $F_1$ metrics for your experiments and discuss your results. What did you expect to see and what did you end up seeing.

**NOTE**: You can choose how you use regularization. Ideally you would experiment with various parameters for this regularization, the 4 listed variants are merely what you must cover as a minimum. Report results for all your experiments concisely in a table.

**NOTE 2**: Make sure to report your metrics on the training and evaluation/heldout sets.

In [87]:
# Load the data
# DO NOT CHANGE THE CODE IN THIS CELL EXCEPT FOR THE BATCH SIZE IF NECESSARY
transform_fn = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.7,), (0.7,)),])

mnist_train = datasets.MNIST(root='./data', train=True, download=True, transform=transform_fn)
train_dl = torch.utils.data.DataLoader(mnist_train, batch_size=16, shuffle=True)

mnist_test = datasets.MNIST(root='./data', train=False, download=True, transform=transform_fn)
test_dl = torch.utils.data.DataLoader(mnist_test, batch_size=16, shuffle=False)

# Use the above data for your experiments

In [88]:
# TODO: Run your experiments

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

learning_rate = 0.01
loss_function = nn.CrossEntropyLoss()
hidden_dim = 512
dropout_p = 0.5
inp = 28*28
out = 10
lambda_l2 = 0.001
epochs = 5

initial_arr = [True, False]
arr = np.random.choice(initial_arr, size=hidden_dim)
bool_list = list(map(bool, arr))

Using cuda device


In [89]:
shallow = Model(inp, hidden_dim, out, 1, bool_list, 0, l2_reg = False).to(device)
optimizer = torch.optim.SGD(shallow.parameters(), lr=learning_rate)


print("Shallow Model \n")
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dl, shallow, loss_function, optimizer, lambda_l2)
    validation(test_dl, shallow, loss_function)     
print("Done!")

Shallow Model 

Epoch 1
-------------------------------
loss: 2.366033  [    0]
loss: 1.187685  [ 6400]
loss: 0.541177  [12800]
loss: 0.563532  [19200]
loss: 0.276086  [25600]
loss: 0.175800  [32000]
loss: 0.110824  [38400]
loss: 0.453283  [44800]
loss: 0.363207  [51200]
loss: 0.244357  [57600]
Train : 
Accuracy: 83.4%
F1 score: 83.2%
Validation : 
Accuracy: 90.5%
F1 score: 90.4%

Epoch 2
-------------------------------
loss: 0.053905  [    0]
loss: 0.236218  [ 6400]
loss: 0.102433  [12800]
loss: 0.152733  [19200]
loss: 0.029574  [25600]
loss: 0.080245  [32000]
loss: 0.359718  [38400]
loss: 0.490714  [44800]
loss: 0.184518  [51200]
loss: 0.568807  [57600]
Train : 
Accuracy: 91.6%
F1 score: 91.5%
Validation : 
Accuracy: 92.9%
F1 score: 92.8%

Epoch 3
-------------------------------
loss: 0.407561  [    0]
loss: 0.127263  [ 6400]
loss: 0.614773  [12800]
loss: 0.143562  [19200]
loss: 0.223976  [25600]
loss: 0.109319  [32000]
loss: 0.035222  [38400]
loss: 0.117323  [44800]
loss: 0.176518  

In [90]:
shallow_reg = Model(inp, hidden_dim, out, 1, bool_list, dropout_p, l2_reg = True).to(device)
optimizer = torch.optim.SGD(shallow_reg.parameters(), lr=learning_rate)

print("Shallow Regularization Model \n")
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dl, shallow, loss_function, optimizer, lambda_l2)
    validation(test_dl, shallow, loss_function)     
print("Done!")

Shallow Regularization Model 

Epoch 1
-------------------------------
loss: 0.082001  [    0]
loss: 0.094270  [ 6400]
loss: 0.203187  [12800]
loss: 0.217602  [19200]
loss: 0.701317  [25600]
loss: 0.155076  [32000]
loss: 0.039931  [38400]
loss: 0.033724  [44800]
loss: 0.117633  [51200]
loss: 0.013330  [57600]
Train : 
Accuracy: 96.7%
F1 score: 96.7%
Validation : 
Accuracy: 96.3%
F1 score: 96.2%

Epoch 2
-------------------------------
loss: 0.043046  [    0]
loss: 0.006882  [ 6400]
loss: 0.156729  [12800]
loss: 0.052657  [19200]
loss: 0.145447  [25600]
loss: 0.009354  [32000]
loss: 0.474807  [38400]
loss: 0.086060  [44800]
loss: 0.048467  [51200]
loss: 0.464221  [57600]
Train : 
Accuracy: 96.7%
F1 score: 96.7%
Validation : 
Accuracy: 96.3%
F1 score: 96.2%

Epoch 3
-------------------------------
loss: 0.050266  [    0]
loss: 0.034112  [ 6400]
loss: 0.104641  [12800]
loss: 0.024506  [19200]
loss: 0.117050  [25600]
loss: 0.117034  [32000]
loss: 0.157268  [38400]
loss: 0.050026  [44800]
l

In [93]:
deep = Model(inp, hidden_dim, out, 3, bool_list, 0, l2_reg = False).to(device)
optimizer = torch.optim.SGD(deep.parameters(), lr=learning_rate)

print("Deep Model \n")
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dl, deep, loss_function, optimizer, lambda_l2)
    validation(test_dl, deep, loss_function)     
print("Done!")

Deep Model 

Epoch 1
-------------------------------
loss: 2.312886  [    0]
loss: 2.208130  [ 6400]
loss: 1.695760  [12800]
loss: 1.027298  [19200]
loss: 0.623616  [25600]
loss: 0.999520  [32000]
loss: 0.413017  [38400]
loss: 0.548535  [44800]
loss: 0.384042  [51200]
loss: 0.367201  [57600]
Train : 
Accuracy: 70.1%
F1 score: 70.4%
Validation : 
Accuracy: 89.0%
F1 score: 88.9%

Epoch 2
-------------------------------
loss: 0.333760  [    0]
loss: 0.401197  [ 6400]
loss: 0.429575  [12800]
loss: 0.076977  [19200]
loss: 0.352355  [25600]
loss: 0.257197  [32000]
loss: 0.177038  [38400]
loss: 0.389601  [44800]
loss: 0.151102  [51200]
loss: 0.236277  [57600]
Train : 
Accuracy: 90.7%
F1 score: 90.6%
Validation : 
Accuracy: 92.0%
F1 score: 91.9%

Epoch 3
-------------------------------
loss: 0.430705  [    0]
loss: 0.115080  [ 6400]
loss: 0.092994  [12800]
loss: 0.085429  [19200]
loss: 0.048860  [25600]
loss: 0.024009  [32000]
loss: 0.096219  [38400]
loss: 0.159472  [44800]
loss: 0.443125  [51

In [92]:
deep_reg = Model(inp, hidden_dim, out, 3, bool_list, dropout_p, l2_reg = True).to(device)
optimizer = torch.optim.SGD(deep_reg.parameters(), lr=learning_rate)

print("Deep Regularization Model \n")
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dl, deep_reg, loss_function, optimizer, lambda_l2)
    validation(test_dl, deep_reg, loss_function)     
print("Done!")

Deep Regularization Model 

Epoch 1
-------------------------------
loss: 7.100754  [    0]
loss: 6.524984  [ 6400]
loss: 6.031408  [12800]
loss: 4.896721  [19200]
loss: 4.210218  [25600]
loss: 3.694695  [32000]
loss: 3.606270  [38400]
loss: 2.964967  [44800]
loss: 2.655102  [51200]
loss: 2.294011  [57600]
Train : 
Accuracy: 53.2%
F1 score: 52.7%
Validation : 
Accuracy: 84.5%
F1 score: 84.2%

Epoch 2
-------------------------------
loss: 2.456746  [    0]
loss: 2.366848  [ 6400]
loss: 2.029624  [12800]
loss: 1.724416  [19200]
loss: 1.575170  [25600]
loss: 1.855827  [32000]
loss: 1.664716  [38400]
loss: 1.446649  [44800]
loss: 1.354885  [51200]
loss: 1.444786  [57600]
Train : 
Accuracy: 84.8%
F1 score: 84.5%
Validation : 
Accuracy: 87.4%
F1 score: 87.2%

Epoch 3
-------------------------------
loss: 1.179937  [    0]
loss: 1.120596  [ 6400]
loss: 1.423028  [12800]
loss: 1.171690  [19200]
loss: 0.939265  [25600]
loss: 0.892074  [32000]
loss: 0.926099  [38400]
loss: 0.963740  [44800]
loss

<table>
  <tr>
    <th>Model</th>
    <th>Train Accuracy</th>
    <th>Train F1 Score</th>  
    <th>Validation Accuracy</th>
    <th>Validation F1 Score</th>
  </tr>
  <tr>
    <td>Shallow network (not more than 1 hidden layer)</td>
    <td>95.7%</td>
    <td>95.7%</td>
    <td>96.3%</td>
    <td>96.2%</td>
  </tr>
  <tr>
    <td>Shallow regularized network</td>
    <td>96.7%</td>
    <td>96.7%</td>
    <td>96.3%</td>
    <td>96.2%</td>
  </tr>
    <tr>
    <td>Deep network (at least 3 hidden layers)</td>
    <td>95.9%</td>
    <td>95.9%</td>
    <td>96.2%</td>
    <td>96.1%</td>
  </tr>
    <tr>
    <td>Deep regularized network</td>
    <td>91.0%</td>
    <td>90.9%</td>
    <td>92.3%</td>
    <td>92.2%</td>
  </tr>
</table>


Shallow network and shallow regularized network has almost same validation accuracy. We thought that after applying regularization to the model, it will improve generalisation and hence improve validation accuracy but it remains same.<br> 
Deep network and shallow network has almost same validation accuracy and there is no much difference observed after adding extra layers.

### 7.5.3 Get the best model! (1 + 1 point (bonus))

* Present your model during a tutorial session. Justify your decisions when designing your model/solution.
* If you achieve one of the top N results, you get yet another extra point!

In [None]:
## Following is the best model(Shallow Regularized Network) from our experiments with validation accuracy of 96.3%

hidden_dim = 512
dropout_p = 0.5
inp = 28*28
out = 10
shallow_reg = Model(inp, hidden_dim, out, 1, bool_list, dropout_p, l2_reg = True).to(device)