# Deep Learning Practical Answers - Getting the Best Performance out of a Neural Network
---

## Author : Amir Atapour-Abarghouei, amir.atapour-abarghouei@durham.ac.uk

This notebook will provide you with the answer to the exercise for improved design and training of a neural network. Try to solve the problems yourself before looking at the answers.

Copyright (c) 2024 Amir Atapour-Abarghouei, UK.

License : LGPL - http://www.gnu.org/licenses/lgpl.html

For this practical, we will be using [Caltech 101](https://data.caltech.edu/records/mzrjq-6wc02). The dataset consists of objects belonging to 101 categories, with about 40 to 800 images per category. This dataset is generally considered to be challenging.

First, let's import what we need and set Torch to use the GPU:

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torchvision
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
from torch.utils.data import random_split, DataLoader
import torchvision.datasets as datasets

# use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA (same rule as the later cells)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('donso!')

Now we need to load the data. Write the code needed to download the dataset and set up the data loaders. The dataset is available in Torchvision:

https://pytorch.org/vision/main/generated/torchvision.datasets.Caltech101.html

Now we want to load the model. We will use off-the-shelf architectures in torchvision.

https://pytorch.org/vision/stable/models.html

Let's start with ResNet18, which is a good starting point, but you should load different ones and try to experiment to see which gives the best results. The answer for ResNet18 is provided. Once you know how to do this, it should be easy to apply this to lots of different models.

In [None]:
# define the model
class ResNet18Modified(nn.Module):
    """ResNet18 wrapper whose classification head is resized to `num_classes`.

    The backbone is created by `torchvision.models.resnet18()` with no
    weights argument, and its final fully connected layer is replaced by a
    new `nn.Linear` with `num_classes` outputs.
    """

    def __init__(self, num_classes=102):
        super().__init__()
        # build the backbone without passing any pre-trained weights
        backbone = torchvision.models.resnet18()
        # swap the classifier head for one with the requested number of outputs
        feature_dim = backbone.fc.in_features
        backbone.fc = nn.Linear(feature_dim, num_classes)
        self.model = backbone

    def forward(self, x):
        """Run `x` through the wrapped network and return its output."""
        out = self.model(x)
        return out

# pick the GPU when it is available, otherwise run on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# build the modified ResNet18 and move it onto the selected device
N = ResNet18Modified(num_classes=102)
N = N.to(device)

# report the total number of scalar parameters in the model
print(f'Number of model parameters is: {sum(w.numel() for w in N.parameters())}')

# Adam over every parameter of the model
optimiser = torch.optim.Adam(params=N.parameters(), lr=0.001)
epoch = 0

Now let's train the model:

In [None]:
# augmentation pipeline applied to training images, in the order listed
train_transform = transforms.Compose([
    # bring every image to a common 128x128 resolution
    transforms.Resize(size=(128, 128)),
    # mirror left-right at random
    transforms.RandomHorizontalFlip(),
    # make sure every image has three (RGB) channels
    transforms.Lambda(lambda img: img.convert("RGB")),
    # random rotation by up to 45 degrees
    transforms.RandomRotation(degrees=45),
    # random brightness / contrast / saturation / hue changes
    transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.2),
    # random affine warp (rotation and shear)
    transforms.RandomAffine(degrees=30, shear=15),
    # drop colour information for a fraction of the images
    transforms.RandomGrayscale(p=0.3),
    # blur with a 3x3 Gaussian kernel and a randomly chosen sigma
    transforms.GaussianBlur(kernel_size=(3, 3), sigma=(0.1, 2.0)),
    # random perspective distortion
    transforms.RandomPerspective(distortion_scale=0.5, p=0.5),
    # occasionally invert the colours
    transforms.RandomInvert(p=0.2),
    # occasionally auto-contrast the image
    transforms.RandomAutocontrast(p=0.3),
    # occasionally change the sharpness
    transforms.RandomAdjustSharpness(sharpness_factor=2, p=0.2),
    # PIL image -> tensor
    transforms.ToTensor(),
    # blank out a random rectangle (applied after the tensor conversion)
    transforms.RandomErasing(p=0.3, scale=(0.02, 0.2), ratio=(0.3, 3.3)),
])

# evaluation pipeline: no augmentation, only resize, force RGB and convert to a tensor
test_transform = transforms.Compose([
    transforms.Resize(size=(128, 128)),
    transforms.Lambda(lambda img: img.convert("RGB")),
    transforms.ToTensor(),
])

# load the dataset twice: one view with the training augmentations and one
# with the plain test transform. random_split returns subsets that all point
# at the SAME underlying dataset object, so overwriting `.dataset.transform`
# on the test subset would also switch the training subset to the test
# transform and silently disable every augmentation.
full_dataset = datasets.Caltech101(root='data', download=True, transform=train_transform, target_type='category')
# the files are already on disk after the call above, so no second download
eval_dataset = datasets.Caltech101(root='data', download=False, transform=test_transform, target_type='category')

# define the train-test split ratio
train_size = int(0.8 * len(full_dataset))  # 80% for training
test_size = len(full_dataset) - train_size  # 20% for testing

# split the indices once, then build the test subset from the same held-out
# indices on top of the un-augmented view of the data
train_dataset, held_out = random_split(full_dataset, [train_size, test_size])
test_dataset = torch.utils.data.Subset(eval_dataset, held_out.indices)

# create DataLoaders for the training and test sets
# (drop_last stays on for training to keep batch sizes constant; it is off for
# testing so that every held-out image is evaluated)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, drop_last=True, num_workers=2, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=128, drop_last=False, num_workers=2, pin_memory=True)

# print dataset sizes
print(f"There are {len(train_dataset)} images in the training set!")
print(f"There are {len(test_dataset)} images in the test set!")

In [None]:
# define the total number of training steps
total_steps = 500
step = 0

# the loop below is driven by the step counter, so a loader that yields no
# batches would make it spin forever - fail loudly instead
if len(train_loader) == 0:
    raise RuntimeError('train_loader yields no batches; cannot train')

# training mode
N.train()

# arrays for metrics
train_loss_arr = np.zeros(0)
train_acc_arr = np.zeros(0)
test_acc_arr = np.zeros(0)

# keep cycling through the training loader until the step budget is used up
while step < total_steps:
    for x, t in train_loader:
        if step >= total_steps:
            # stop if we've reached the total number of steps
            break

        # move the batch (inputs x, targets t) to the compute device
        x, t = x.to(device), t.to(device)

        # one optimisation step on the cross-entropy loss
        optimiser.zero_grad()
        p = N(x)
        loss = torch.nn.functional.cross_entropy(p, t)
        loss.backward()
        optimiser.step()

        # record the loss and the accuracy of this batch as plain floats
        pred = p.argmax(dim=1)
        train_loss_arr = np.append(train_loss_arr, loss.item())
        train_acc_arr = np.append(train_acc_arr, pred.eq(t).float().mean().item())

        # increment the step counter
        step += 1

        # print the running averages every 50 steps, then start a new window
        if step % 50 == 0:
            print(f'Step {step}: train loss: {train_loss_arr.mean():.3f}, train acc: {train_acc_arr.mean():.3f}')
            train_loss_arr = np.zeros(0)
            train_acc_arr = np.zeros(0)

# evaluation phase
N.eval()

# count correct predictions over all samples so that a smaller final batch is
# weighted by its size rather than counted as a full batch
num_correct = 0
num_seen = 0

# disable gradient computation
with torch.no_grad():
    for x, t in test_loader:
        x, t = x.to(device), t.to(device)
        pred = N(x).argmax(dim=1)
        hits = pred.eq(t)
        num_correct += hits.sum().item()
        num_seen += t.numel()
        # per-batch accuracy is still recorded in test_acc_arr
        test_acc_arr = np.append(test_acc_arr, hits.float().mean().item())

test_acc = num_correct / num_seen if num_seen else float('nan')
print(f'* Test Accuracy : {test_acc:.3f}')

Now, we want to play around with finetuning - let's implement the ability to freeze a number of layers and train others.

In [None]:
# define the model
class ResNet18Modified(nn.Module):
    """Pre-trained ResNet18 wrapper with a classification head of `num_classes` outputs.

    The backbone is created by `torchvision.models.resnet18` with
    `ResNet18_Weights.DEFAULT`, and its final fully connected layer is
    replaced by a new `nn.Linear` with `num_classes` outputs.
    """

    def __init__(self, num_classes=102):
        super().__init__()
        # start from torchvision's default pre-trained ResNet18 weights
        pretrained = torchvision.models.ResNet18_Weights.DEFAULT
        backbone = torchvision.models.resnet18(weights=pretrained)
        # swap the classifier head for one with the requested number of outputs
        feature_dim = backbone.fc.in_features
        backbone.fc = nn.Linear(feature_dim, num_classes)
        self.model = backbone

    def forward(self, x):
        """Run `x` through the wrapped network and return its output."""
        out = self.model(x)
        return out

def freeze_layers(model, num_layers_to_freeze):
    """Disable gradients for the first `num_layers_to_freeze` children of `model.model`.

    The children are taken in the order returned by `model.model.children()`;
    every parameter of the selected children gets `requires_grad = False`.
    Children beyond that count are left untouched.
    """
    # leading children of the wrapped network, selected by slicing
    to_freeze = list(model.model.children())[:num_layers_to_freeze]
    for child in to_freeze:
        # turns off requires_grad on every parameter of this child
        child.requires_grad_(False)

# pick the GPU when it is available, otherwise run on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# build the modified ResNet18 and move it onto the selected device
N = ResNet18Modified(num_classes=102)
N = N.to(device)

# how many leading layers to freeze (6 here) - vary this number to see its effect
num_layers_to_freeze = 6

freeze_layers(N, num_layers_to_freeze)

# report the total number of scalar parameters in the model (frozen ones included)
print(f'Number of model parameters is: {sum(w.numel() for w in N.parameters())}')

# hand the optimiser only the parameters that still require gradients
trainable_params = [w for w in N.parameters() if w.requires_grad]
optimiser = torch.optim.Adam(trainable_params, lr=0.001)
epoch = 0

and let's train again:

In [None]:
# define the total number of training steps
total_steps = 500
step = 0

# the loop below is driven by the step counter, so a loader that yields no
# batches would make it spin forever - fail loudly instead
if len(train_loader) == 0:
    raise RuntimeError('train_loader yields no batches; cannot train')

# training mode
N.train()

# arrays for metrics
train_loss_arr = np.zeros(0)
train_acc_arr = np.zeros(0)
test_acc_arr = np.zeros(0)

# keep cycling through the training loader until the step budget is used up
while step < total_steps:
    for x, t in train_loader:
        if step >= total_steps:
            # stop if we've reached the total number of steps
            break

        # move the batch (inputs x, targets t) to the compute device
        x, t = x.to(device), t.to(device)

        # one optimisation step on the cross-entropy loss
        optimiser.zero_grad()
        p = N(x)
        loss = torch.nn.functional.cross_entropy(p, t)
        loss.backward()
        optimiser.step()

        # record the loss and the accuracy of this batch as plain floats
        pred = p.argmax(dim=1)
        train_loss_arr = np.append(train_loss_arr, loss.item())
        train_acc_arr = np.append(train_acc_arr, pred.eq(t).float().mean().item())

        # increment the step counter
        step += 1

        # print the running averages every 50 steps, then start a new window
        if step % 50 == 0:
            print(f'Step {step}: train loss: {train_loss_arr.mean():.3f}, train acc: {train_acc_arr.mean():.3f}')
            train_loss_arr = np.zeros(0)
            train_acc_arr = np.zeros(0)

# evaluation phase
N.eval()

# count correct predictions over all samples so that a smaller final batch is
# weighted by its size rather than counted as a full batch
num_correct = 0
num_seen = 0

# disable gradient computation
with torch.no_grad():
    for x, t in test_loader:
        x, t = x.to(device), t.to(device)
        pred = N(x).argmax(dim=1)
        hits = pred.eq(t)
        num_correct += hits.sum().item()
        num_seen += t.numel()
        # per-batch accuracy is still recorded in test_acc_arr
        test_acc_arr = np.append(test_acc_arr, hits.float().mean().item())

test_acc = num_correct / num_seen if num_seen else float('nan')
print(f'* Test Accuracy : {test_acc:.3f}')

It is important to understand how ablation studies work.

We have achieved very good performance thanks to the changes we have made, but because we have made many changes at once, we need to understand, in a rigorous scientific manner, exactly which of them helped and by how much.

In an ablation study, individual components of the model or the training setup (e.g., layers, types of regularization, data augmentation techniques) are removed or modified to observe their impact on the overall performance.

https://en.wikipedia.org/wiki/Ablation_(artificial_intelligence)

Consider this for your coursework.