# Practice training a deep neural network on the CIFAR10 image dataset:

In [1]:
import os
os.chdir("..")

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

from PIL import Image
import time

import numpy as np
from pathlib import Path

In [2]:
def train(model, train_loader, val_loader, criterion, optimizer, device='cpu', scheduler=None, epochs=50, patience=5):
    best_val_loss = float('inf')
    patience_counter = 0
    start = time.time()
    for epoch in range(epochs):
        model.train()
        per_epoch_train_loss = 0.0
        time_per_epoch = time.time()
        for data in train_loader:
            inputs, targets = data[0].to(device), data[1].to(device)
            
            optimizer.zero_grad()
                
            outputs = model(inputs)
            loss = criterion(outputs, targets)            
            loss.backward()
            optimizer.step()
            if scheduler:
                scheduler.step()
            per_epoch_train_loss += loss.item()

        avg_per_epoch_train_loss = per_epoch_train_loss / len(train_loader)
        writer.add_scalar("Loss/Train", avg_per_epoch_train_loss, epoch+1)

        per_epoch_val_loss = 0.0
        total = 0
        correct = 0
        for data in val_loader:
            inputs, targets = data[0].to(device), data[1].to(device)
            
            with torch.no_grad():
                outputs = model(inputs)
                _, predicted = torch.max(outputs, 1)
                total += targets.size(0)
                correct += (predicted == targets).sum().item()
                loss = criterion(outputs, targets)                
                per_epoch_val_loss += loss.item()
                
        avg_per_epoch_val_loss = per_epoch_val_loss / len(val_loader)
        val_accuracy = 100 * correct / total
        writer.add_scalar("Loss/Val", avg_per_epoch_val_loss, epoch+1)
        writer.add_scalar("Accuracy/Val", val_accuracy, epoch+1)

        print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {avg_per_epoch_train_loss:.4f}, " 
              f"Val Loss: {avg_per_epoch_val_loss:.4f}, Val Acc: {val_accuracy:.2f}%, Time Elapsed {time.time() - time_per_epoch:.3f}s")

        if  avg_per_epoch_val_loss < best_val_loss:
            patience_counter = 0
            best_val_loss = avg_per_epoch_val_loss
            best_model_state = model.state_dict()
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered.")
                model.load_state_dict(best_model_state)
                break

    end = time.time()
    print(f"\nTotal Time for Training {(end-start)/60:.3f}m")
    return model

In [3]:
def eval(model, test_loader, device='cpu'):
    model.eval()
    correct = 0
    total = 0
    for inputs, targets in test_loader:
        inputs  = inputs.to(device)
        targets = targets.to(device)

        with torch.no_grad():
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()
            
    print(f"Test Accuracy: {100 * correct / total:.2f}%")

**A. Build a DNN with 20 hidden layers of 100 neurons each (that’s too many, but it’s the point of this exercise). Use He initialization and the Swish activation function.()**

In [4]:
class CIFAR10V1(nn.Module):
    def __init__(self, input_features=3*32*32, output_neurons=100, num_classes=10, hidden_layers=20):
        super(CIFAR10V1, self).__init__()
        layers = []
        
        layer_first = nn.Linear(input_features, output_neurons)
        layers.append(layer_first)
        layers.append(Swish())
        
        for i in range(hidden_layers-1):
            layer = nn.Linear(output_neurons, output_neurons)
            layers.append(layer)
            layers.append(Swish())

        layer_last = nn.Linear(output_neurons, num_classes)
        layers.append(layer_last)
        
        self.net = nn.Sequential(*layers)
        self.net.apply(weights_init)

    def forward(self, X):
        flatten = nn.Flatten()
        X = flatten(X)
        return self.net(X)

### He Initialization (Kaiming Initialization) in PyTorch

He initialization, also called **Kaiming initialization**, is used to initialize weights. According to this, the weight parameters should be sampled from a distribution with:

$$\text{mean} = 0, \;\; \text{variance} = \frac{2}{\text{fan\_in}}\;\;\;i.e.,\;\;\;W \sim \mathcal{N} \left( 0, \frac{2}{\text{fan\_in}} \right)$$  



In PyTorch, there is no direct way to set the variance explicitly to $$(\frac{2}{\text{fan\_in}})$$ using `torch.nn.init.kaiming_normal_()`, since the `nonlinearity` argument only accepts `linear`, `relu`, and `leaky_relu` as arguments.

For **Leaky ReLU**, the gain and std is computed as:  

$$\text{gain} = \sqrt{\frac{2}{1 + \text{negative\_slope}^2}} \;\;\;\;
\text{std} = \sqrt{\frac{\text{gain}}{\sqrt{\text{fan\_mode}}}},\;\;\;
\text{variance} = \frac{\text{gain}}{\sqrt{\text{fan\_mode}}}$$  


To ensure the variance matches $$(\frac{2}{\text{fan\_in}}),\;\;\; we\;need\;to\;set\;\;negative\_slope=0\;\;and\;\;fan\_mode=fan\_in$$ 

Since `torch.nn.init.kaiming_normal_()` takes a parameter **`a`**, which denotes `negative_slope`, setting `a=0` makes sure that He initialization is applied correctly.  


**I have also implemented `nn.SiLU` (Swish activation function) manually. I do not know why I did this.**

In [5]:
def weights_init(layer, nonlinearity='leaky_relu'):
    if isinstance(layer, nn.Linear):
        if nonlinearity=='leaky_relu': # param a is only used with leaky_relu
            torch.nn.init.kaiming_normal_(layer.weight, mode='fan_in', a=0, nonlinearity=nonlinearity)
        else:
            torch.nn.init.kaiming_normal_(layer.weight, mode='fan_in', nonlinearity=nonlinearity)

        torch.nn.init.constant_(layer.bias, 0)

class Swish(nn.Module):
    def forward(self, x):
        return x * F.sigmoid(x)

**B. Using Nadam optimization and early stopping, train the network on the CIFAR10 dataset. The dataset is composed of 60,000 32 × 32–pixel color images (50,000 for training, 10,000 for testing) with 10 classes. Remember to search for the right learning rate each time you change the model’s architecture or hyperparameters.**

In [6]:
batch_size = 64

transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]
)

# Load CIFAR-10 dataset
train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
test_dataset  = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

# Split training data into train and validation subsets (e.g., 80%/20%)
train_size = int(0.9 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_subset, val_subset = torch.utils.data.random_split(train_dataset, [train_size, val_size])

train_loader = torch.utils.data.DataLoader(train_subset, batch_size=batch_size, shuffle=True)
val_loader   = torch.utils.data.DataLoader(val_subset, batch_size=batch_size, shuffle=False)
test_loader  = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:01<00:00, 88724437.56it/s] 


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [8]:
# Training the model
epochs = 100
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
writer = SummaryWriter()

cifar10  = CIFAR10V1().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.NAdam(cifar10.parameters(), lr=0.001)
model_v1 = train(cifar10, train_loader, val_loader, criterion, optimizer, device=device, epochs=epochs)
writer.flush()
writer.close()

# Model Accuracy
eval(model_v1, test_loader, device=device)

Epoch [1/100], Train Loss: 1.8834, Val Loss: 2.0342, Val Acc: 28.66%, Time Elapsed 12.412s
Epoch [2/100], Train Loss: 1.6588, Val Loss: 1.7618, Val Acc: 37.16%, Time Elapsed 12.223s
Epoch [3/100], Train Loss: 1.5673, Val Loss: 1.7157, Val Acc: 39.76%, Time Elapsed 12.256s
Epoch [4/100], Train Loss: 1.4956, Val Loss: 1.6138, Val Acc: 42.76%, Time Elapsed 12.233s
Epoch [5/100], Train Loss: 1.4386, Val Loss: 1.5578, Val Acc: 45.02%, Time Elapsed 12.306s
Epoch [6/100], Train Loss: 1.3937, Val Loss: 1.5904, Val Acc: 42.98%, Time Elapsed 12.227s
Epoch [7/100], Train Loss: 1.3549, Val Loss: 1.5189, Val Acc: 46.06%, Time Elapsed 12.202s
Epoch [8/100], Train Loss: 1.3138, Val Loss: 1.5711, Val Acc: 44.90%, Time Elapsed 12.313s
Epoch [9/100], Train Loss: 1.2747, Val Loss: 1.5244, Val Acc: 46.40%, Time Elapsed 12.340s
Epoch [10/100], Train Loss: 1.2431, Val Loss: 1.5137, Val Acc: 47.56%, Time Elapsed 12.246s
Epoch [11/100], Train Loss: 1.2127, Val Loss: 1.5083, Val Acc: 48.00%, Time Elapsed 12.22

**C. Now try adding batch normalization and compare the learning curves: is it converging faster than before? Does it produce a better model? How does it affect training speed?**

In [11]:
class CIFAR10V2(nn.Module):
    def __init__(self, input_features=3*32*32, output_neurons=100, num_classes=10, hidden_layers=20):
        super(CIFAR10V2, self).__init__()
        layers = []
        
        layer_first = nn.Linear(input_features, output_neurons)
        layers.append(layer_first)
        layers.append(nn.BatchNorm1d(output_neurons))
        layers.append(Swish())
        
        for i in range(hidden_layers-1):
            layer = nn.Linear(output_neurons, output_neurons)
            layers.append(layer)
            layers.append(nn.BatchNorm1d(output_neurons))
            layers.append(Swish())

        layer_last = nn.Linear(output_neurons, num_classes)
        layers.append(layer_last)
        
        self.net = nn.Sequential(*layers)
        self.net.apply(weights_init)

    def forward(self, X):
        flatten = nn.Flatten()
        X = flatten(X)
        return self.net(X)

In [18]:
epochs = 100
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
writer = SummaryWriter()

cifar10  = CIFAR10V2().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.NAdam(cifar10.parameters(), lr=0.001)
model_v2 = train(cifar10, train_loader, val_loader, criterion, optimizer, device=device, epochs=epochs)
writer.flush()
writer.close()

Epoch [1/100], Train Loss: 1.9657, Val Loss: 1.8295, Val Acc: 34.22%, Time Elapsed 14.770s
Epoch [2/100], Train Loss: 1.6850, Val Loss: 1.7463, Val Acc: 37.30%, Time Elapsed 14.673s
Epoch [3/100], Train Loss: 1.5941, Val Loss: 1.6492, Val Acc: 41.38%, Time Elapsed 14.746s
Epoch [4/100], Train Loss: 1.5122, Val Loss: 1.7223, Val Acc: 39.36%, Time Elapsed 14.837s
Epoch [5/100], Train Loss: 1.4587, Val Loss: 1.6158, Val Acc: 43.58%, Time Elapsed 14.752s
Epoch [6/100], Train Loss: 1.4127, Val Loss: 1.5324, Val Acc: 45.90%, Time Elapsed 14.757s
Epoch [7/100], Train Loss: 1.3632, Val Loss: 1.5499, Val Acc: 45.28%, Time Elapsed 14.706s
Epoch [8/100], Train Loss: 1.3304, Val Loss: 1.4972, Val Acc: 47.10%, Time Elapsed 14.718s
Epoch [9/100], Train Loss: 1.2928, Val Loss: 1.6656, Val Acc: 42.26%, Time Elapsed 14.676s
Epoch [10/100], Train Loss: 1.2834, Val Loss: 1.5026, Val Acc: 47.88%, Time Elapsed 14.738s
Epoch [11/100], Train Loss: 1.2338, Val Loss: 1.4563, Val Acc: 49.58%, Time Elapsed 14.73

In [19]:
eval(model_v2, test_loader, device=device)

Test Accuracy: 51.26%


***Is the model converging faster than before? - Yes, to some extent (only by 2-3 epochs). Maybe experimenting with different learning rates can yield much faster convergence.***

***Does BN produce a better model?  - Yes, adding batch normalization boosted the accuracy by ~2%***

***How does BN affect training speed? - Yes and No, previously it took ~12s for each epoch, but this one's taking ~15s. But, can be said that total training time did reduce since the model also acheived a better accuracy.***

**D. Try replacing batch normalization with SELU, and make the necessary adjustments to ensure the network self-normalizes (i.e., standardize the input features, use LeCun normal initialization, make sure the DNN contains only a sequence of dense layers, etc.).**

*Note that input features were already normalized while downloading data*

In [22]:
class CIFAR10V3(nn.Module):
    def __init__(self, input_features=3*32*32, output_neurons=100, num_classes=10, hidden_layers=20):
        super(CIFAR10V3, self).__init__()
        layers = []
        
        layer_first = nn.Linear(input_features, output_neurons)
        layers.append(layer_first)
        layers.append(nn.SELU())
        
        for i in range(hidden_layers-1):
            layer = nn.Linear(output_neurons, output_neurons)
            layers.append(layer)
            layers.append(nn.SELU())

        layer_last = nn.Linear(output_neurons, num_classes)
        layers.append(layer_last)
        
        self.net = nn.Sequential(*layers)
        self.net.apply(weights_init_lecun)

    def forward(self, X):
        flatten = nn.Flatten()
        X = flatten(X)
        return self.net(X)

**LeCun Normal Initialization** is called **Xavier Normal Initialization** (`torch.nn.init.xavier_normal_`).  
But, LeCun Initialization is not directly supported in PyTorch, so here also we need to do some maths.  

The standard deviation for `torch.nn.init.xavier_normal_()` is given in the [documentation](https://shorturl.at/WxECS).  


LeCun Normal Initialization uses a standard deviation of:  

$$\text{std} = \sqrt{\frac{1}{\text{fan\_in}}}$$  

To match this, we need to set the **gain** as follows:  

$$\text{gain} = \sqrt{\frac{\text{fan\_in} + \text{fan\_out}}{\text{fan\_in}}}$$  

In [23]:
def weights_init_lecun(layer):
    if isinstance(layer, nn.Linear):
        gain = np.sqrt(sum(layer.weight.shape) / layer.weight.shape[1])
        torch.nn.init.xavier_normal_(layer.weight, gain=gain)
        torch.nn.init.constant_(layer.bias, 0)

In [25]:
epochs = 100
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
writer = SummaryWriter()

cifar10  = CIFAR10V3().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.NAdam(cifar10.parameters(), lr=0.001)
model_v3 = train(cifar10, train_loader, val_loader, criterion, optimizer, device=device, epochs=epochs)
writer.flush()
writer.close()
eval(model_v3, test_loader, device=device)

Epoch [1/100], Train Loss: 2.2196, Val Loss: 2.0358, Val Acc: 24.50%, Time Elapsed 11.658s
Epoch [2/100], Train Loss: 1.7954, Val Loss: 1.9671, Val Acc: 32.20%, Time Elapsed 11.678s
Epoch [3/100], Train Loss: 1.6761, Val Loss: 1.8195, Val Acc: 35.60%, Time Elapsed 11.648s
Epoch [4/100], Train Loss: 1.6029, Val Loss: 1.7836, Val Acc: 36.92%, Time Elapsed 11.693s
Epoch [5/100], Train Loss: 1.5488, Val Loss: 1.6671, Val Acc: 39.30%, Time Elapsed 11.745s
Epoch [6/100], Train Loss: 1.4995, Val Loss: 1.7507, Val Acc: 40.52%, Time Elapsed 11.770s
Epoch [7/100], Train Loss: 1.4641, Val Loss: 1.9306, Val Acc: 33.80%, Time Elapsed 11.700s
Epoch [8/100], Train Loss: 1.4162, Val Loss: 1.7136, Val Acc: 40.90%, Time Elapsed 11.664s
Epoch [9/100], Train Loss: 1.3872, Val Loss: 1.7040, Val Acc: 41.82%, Time Elapsed 11.731s
Epoch [10/100], Train Loss: 1.3541, Val Loss: 1.5268, Val Acc: 46.22%, Time Elapsed 11.685s
Epoch [11/100], Train Loss: 1.3193, Val Loss: 1.5018, Val Acc: 48.04%, Time Elapsed 11.64

***Though training time reduced, accuracy is worse then the first model. But lowest val loss was reached in just 11 epochs.***

**E. Try regularizing the model with alpha dropout. Then, without retraining your model, see if you can achieve better accuracy using MC dropout.**

In [26]:
class CIFAR10V4(nn.Module):
    def __init__(self, input_features=3*32*32, output_neurons=100, num_classes=10, hidden_layers=20):
        super(CIFAR10V4, self).__init__()
        layers = []
        
        layer_first = nn.Linear(input_features, output_neurons)
        layers.append(layer_first)
        layers.append(nn.SELU())
        
        for i in range(hidden_layers-1):
            layer = nn.Linear(output_neurons, output_neurons)
            layers.append(layer)
            layers.append(nn.SELU())

        layers.append(nn.AlphaDropout(p=0.1))
        layer_last = nn.Linear(output_neurons, num_classes)
        layers.append(layer_last)
        
        self.net = nn.Sequential(*layers)
        self.net.apply(weights_init_lecun)

    def forward(self, X):
        flatten = nn.Flatten()
        X = flatten(X)
        return self.net(X)

In [29]:
epochs = 100
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
writer = SummaryWriter()

cifar10  = CIFAR10V4().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.NAdam(cifar10.parameters(), lr=0.001)
model_v4 = train(cifar10, train_loader, val_loader, criterion, optimizer, device=device, epochs=epochs)
writer.flush()
writer.close()

eval(model_v4, test_loader, device=device)

Epoch [1/100], Train Loss: 2.1604, Val Loss: 1.9445, Val Acc: 28.74%, Time Elapsed 11.799s
Epoch [2/100], Train Loss: 1.7827, Val Loss: 1.9209, Val Acc: 31.60%, Time Elapsed 11.857s
Epoch [3/100], Train Loss: 1.6765, Val Loss: 2.0588, Val Acc: 28.84%, Time Elapsed 11.808s
Epoch [4/100], Train Loss: 1.6118, Val Loss: 1.6568, Val Acc: 39.78%, Time Elapsed 11.945s
Epoch [5/100], Train Loss: 1.5567, Val Loss: 1.7301, Val Acc: 41.60%, Time Elapsed 11.801s
Epoch [6/100], Train Loss: 1.5161, Val Loss: 1.6173, Val Acc: 42.48%, Time Elapsed 11.767s
Epoch [7/100], Train Loss: 1.4743, Val Loss: 1.6345, Val Acc: 43.50%, Time Elapsed 11.703s
Epoch [8/100], Train Loss: 1.4420, Val Loss: 1.5671, Val Acc: 45.00%, Time Elapsed 11.779s
Epoch [9/100], Train Loss: 1.4074, Val Loss: 1.6784, Val Acc: 44.80%, Time Elapsed 11.763s
Epoch [10/100], Train Loss: 1.3755, Val Loss: 1.6277, Val Acc: 43.80%, Time Elapsed 11.893s
Epoch [11/100], Train Loss: 1.3439, Val Loss: 1.5694, Val Acc: 46.58%, Time Elapsed 11.79

***Not as good as the previous one both in terms of accuracy and in terms of time taken***

In [30]:
class MCAlphaDropout(nn.AlphaDropout):
    def call(self, inputs):
        return super().call(inputs).train(True)

Note that I could have directly used MCAlphaDropout in the model creation itself, but, whatif I am using an already trained model and want to implement MCAlphaDropout. So, it makes sense to create an identical model only with MCAlphaDropout instead of AlphaDropout.

In [38]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

mc_model = nn.Sequential(*[
    MCAlphaDropout(layer.p)
    if isinstance(layer, nn.AlphaDropout)
    else layer
    for layer in model_v4.net
]).to(device)

In [42]:
flatten = nn.Flatten()

def mc_dropout_predictions(model, test_loader, device='cpu'):
    model.train(True) # important
    correct = 0
    total = 0
    y_probas = []
    for inputs, targets in test_loader:
        inputs  = inputs.to(device)
        inputs = flatten(inputs)
        targets = targets.to(device)

        with torch.no_grad():
            outputs = model(inputs)
            outputs = outputs.to('cpu').detach().numpy()
            y_probas.append(outputs)
            
    return np.vstack(y_probas)

In [43]:
# Training is set to true, each time model predicts a different output even for the 
# same inputs, since Dropout layer randomly drops out some neurons with probability p.
# Here we get predictions for the test set `mc_iterations` times and then stack it 
# -> [mc_iterations, 10000, 10] -> 10000 instances and 10 scores for each class.
# Then we average over dim=0 and get the prediction scores

# mc_model = mc_model.to(device)
mc_iterations = 100
y_probas = np.stack([mc_dropout_predictions(mc_model, test_loader, device=device) for i in range(mc_iterations)])
y_scores = y_probas.mean(axis=0)
y_scores = torch.tensor(y_scores)

In [46]:
# Make predictions

targets = np.hstack([targets for _, targets in test_loader])
targets = torch.tensor(targets)

_, predicted = torch.max(y_scores, 1)

In [47]:
# Accuracy without retraining

correct = 0
total = targets.size(0)
correct += (predicted == targets).sum().item()
        
print(f"Test Accuracy: {100 * correct / total:.2f}%")

Test Accuracy: 47.29%


***Exactly same? This should not happen. Something's off, have to investigate.***

**F. Retrain your model using 1cycle scheduling and see if it improves training speed and model accuracy.**

In [51]:
epochs = 100
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
writer = SummaryWriter()

criterion = nn.CrossEntropyLoss()
optimizer = optim.NAdam(cifar10.parameters(), lr=3e-2)

scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.05, steps_per_epoch=len(train_loader), epochs=epochs)
model = train(model_v4, train_loader, val_loader, criterion, optimizer, device=device, scheduler=scheduler, epochs=epochs)
writer.flush()
writer.close()

eval(model, test_loader, device=device)

Epoch [1/100], Train Loss: 1.5190, Val Loss: 1.7086, Val Acc: 41.98%, Time Elapsed 11.845s
Epoch [2/100], Train Loss: 1.4492, Val Loss: 1.6635, Val Acc: 41.02%, Time Elapsed 11.823s
Epoch [3/100], Train Loss: 1.4359, Val Loss: 2.1139, Val Acc: 27.62%, Time Elapsed 11.887s
Epoch [4/100], Train Loss: 77.3060, Val Loss: 2.4306, Val Acc: 9.96%, Time Elapsed 11.813s
Epoch [5/100], Train Loss: 2.3214, Val Loss: 2.3948, Val Acc: 9.96%, Time Elapsed 11.828s
Epoch [6/100], Train Loss: 2.3258, Val Loss: 2.3671, Val Acc: 11.42%, Time Elapsed 11.841s
Epoch [7/100], Train Loss: 2.3344, Val Loss: 2.4216, Val Acc: 9.96%, Time Elapsed 11.870s
Early stopping triggered.

Total Time for Training 1.382m
Test Accuracy: 10.00%


In [10]:
# %load_ext tensorboard
%tensorboard --logdir=runs

UsageError: Line magic function `%tensorboard` not found.
