# ResNet-50: Transfer Learning

with CIFAR-10 dataset


- [PyTorch reference](https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html)

- [TensorFlow reference](https://www.tensorflow.org/guide/keras/transfer_learning)

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision import datasets, models, transforms

import torch.nn.utils.prune as prune

import matplotlib.pyplot as plt
import numpy as np
import os
import time
import copy


In [2]:
print(f"PyTorch Version: {torch.__version__}")
print(f"Torchvision Version: {torchvision.__version__}")

PyTorch Version: 1.8.1+cu101
Torchvision Version: 0.9.1+cu101


In [3]:
# Device configuration-
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"currently available device: {device}")

currently available device: cuda


In [4]:
# Define transformations for training and test sets-
transform_train = transforms.Compose(
    [
      transforms.RandomCrop(32, padding = 4),
      transforms.RandomHorizontalFlip(),
      transforms.ToTensor(),
      transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
     ]
     )

transform_test = transforms.Compose(
    [
      transforms.ToTensor(),
      transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
     ]
     )

In [None]:
# Change to directory containing CIFAR10 dataset-
os.chdir("/home/arjun/Documents/Programs/Python_Codes/PyTorch_Resources/Good_Codes/")

In [5]:
# Load dataset-
train_dataset = torchvision.datasets.CIFAR10(
        root = './data', train = True,
        download = True, transform = transform_train
        )

test_dataset = torchvision.datasets.CIFAR10(
        root = './data', train = False,
        download = True, transform = transform_test
        )

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


HBox(children=(FloatProgress(value=0.0, max=170498071.0), HTML(value='')))


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [6]:
print(f"len(train_dataset) = {len(train_dataset)} & len(test_dataset) = {len(test_dataset)}")

len(train_dataset) = 50000 & len(test_dataset) = 10000


In [7]:
batch_size = 128

In [8]:
# Create training and testing loaders-
train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size = batch_size,
        shuffle = True
        )

test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size = batch_size,
        shuffle = False
        )

In [9]:
print(f"len(train_loader) = {len(train_loader)} & len(test_loader) = {len(test_loader)}")

len(train_loader) = 391 & len(test_loader) = 79


In [10]:
# Sanity check-
len(train_dataset) / batch_size, len(test_dataset) / batch_size

(390.625, 78.125)

In [11]:
# Get some random training images-
# some_img = iter(train_loader)
# images, labels = some_img.next()
images, labels = next(iter(train_loader))

# You get 32 images due to our specified batch size-
print(f"images.shape: {images.shape} & labels.shape: {labels.shape}")

images.shape: torch.Size([128, 3, 32, 32]) & labels.shape: torch.Size([128])


In [None]:
def imshow(inp, title = None):
    """
    Imshow for Tensor
    """
    inp = inp.numpy().transpose((1, 2, 0))
    mean = np.array([0.4914, 0.4822, 0.4465])
    std = np.array([0.2023, 0.1994, 0.2010])
    inp = std * inp + mean
    inp = np.clip(inp, 0, 1)
    
    plt.figure(figsize = (9, 7))
    plt.imshow(inp)
    
    if title is not None:
        plt.title(title)
    plt.pause(0.001)  # pause a bit so that plots are updated
    
    return None


In [None]:
# class_names = image_datasets['train'].classes
class_names = labels

In [None]:
# Get a batch of training data
# inputs, classes = next(iter(dataloaders['train']))

# Make a grid from batch
out = torchvision.utils.make_grid(images)

imshow(out, title = [class_names[x] for x in labels])

### Finetuning a pre-trained ResNet-50 convnet:

Load a pretrained model and reset the final fully connected/dense layer.



In [12]:
# Load pre-trained ResNet-50 model-
model_ft = models.resnet50(pretrained = True)

Downloading: "https://download.pytorch.org/models/resnet50-19c8e357.pth" to /root/.cache/torch/hub/checkpoints/resnet50-19c8e357.pth


HBox(children=(FloatProgress(value=0.0, max=102502400.0), HTML(value='')))




In [13]:
# Compute number of features for defining last linear/dense layer-
num_ftrs = model_ft.fc.in_features
print(f"number of input features = {num_ftrs}")

number of input features = 2048


In [14]:
# Define last dense layer-
model_ft.fc = nn.Linear(in_features = num_ftrs, out_features = 10)

### Change the first _conv_ layer:

Use the hyper-parameters: ```kernel_size=3, stride=1 and padding=1``` instead of the original ResNet-50 hyper-parameters. This keeps the original input size for quite a while in forward propagation. In my experiments, doing this is leads to a higher validation accuracy and is probably more efficient for small input dataset(s) such as CIFAR-10.

[Refer to torch discussion](https://discuss.pytorch.org/t/resnet50-torchvision-implementation-gives-low-accuracy-on-cifar-10/82046/3)


In [17]:
# Change first conv layer of ResNet-50:
model_ft.conv1 = torch.nn.Conv2d(
    in_channels = 3, out_channels = 64,
    kernel_size = (3, 3), stride = (1, 1),
    padding = (1, 1), bias = False
)

In [18]:
# Sanity check-
model_ft.conv1

Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)

In [None]:
# Further sanity check-
print(model_ft)

In [20]:
# Place model on GPU-
model_ft = model_ft.to(device)

In [22]:
# Define cost function-
loss = nn.CrossEntropyLoss()

# Learning rate - Observe that all parameters are being optimized-
optimizer_ft = optim.SGD(model_ft.parameters(), lr = 0.01, momentum = 0.9)

In [23]:
# Learning rate scheduler - Decay LR by a factor of 0.1 every 10 epochs-
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size = 10, gamma = 0.1)

In [None]:
'''
for layer_name, param in model_ft.named_parameters():
    print(f"layer name: {layer_name} has {param.shape} parameters")
'''

In [None]:
# PATH = '/home/arjun/Deep_Learning_Resources/Computer_Vision_Resources/Transfer_Learning_resources/'

In [None]:
# Load trained weights-
# model_ft.load_state_dict(torch.load(PATH + 'ResNet50__finetuned_best_trained_loss.pth', map_location = device))

<All keys matched successfully>

### Fine-Tune ResNet-50 model:

In [None]:
# global step counter for the entire duration of training-
# step = 0

In [24]:
def train_model(model, train_loader):
    '''
    Function to perform one epoch of training by using 'train_loader'.
    Returns loss and number of correct predictions for this epoch.
    '''
    running_loss = 0.0
    running_corrects = 0.0

    for batch, (images, labels) in enumerate(train_loader):
        # Reshape image and place it on GPU-
        # images = images.reshape(-1, input_size).to(device)
        images = images.to(device)
        labels = labels.to(device) 
        outputs = model(images)   # forward pass
        J = loss(outputs, labels) # compute loss
        optimizer_ft.zero_grad()     # empty accumulated gradients
        J.backward()              # perform backpropagation
        optimizer_ft.step()          # update parameters

        '''
        global step
        optimizer.param_groups[0]['lr'] = custom_lr_scheduler.get_lr(step)
        # sgd.param_groups[0]['lr'] = sch.get_lr(step)

        step += 1
        '''
        
        # Compute model's performance statistics-
        running_loss += J.item() * images.size(0)
        _, predicted = torch.max(outputs, 1)
        running_corrects += torch.sum(predicted == labels.data)

        '''
        # Print information every 100 steps-
        if (batch + 1) % 100 == 0:
            print(f"epoch {epoch + 1}/{num_epochs}, step {batch + 1}/{num_steps}, loss = {J.item():.4f}")
        '''

    return running_loss, running_corrects


In [25]:
def test_model(model, test_loader):
    total = 0.0
    correct = 0.0
    running_loss_val = 0.0

    with torch.no_grad():
        for images, labels in test_loader:

            # Place features (images) and targets (labels) to GPU-
            # images = images.reshape(-1, input_size).to(device)
            images = images.to(device)
            labels = labels.to(device)
            # print(f"images.shape = {images.shape}, labels.shape = {labels.shape}")

            # Set model to evaluation mode-
            model.eval()
    
            # Make predictions using trained model-
            outputs = model(images)
            _, y_pred = torch.max(outputs, 1)

            # Compute validation loss-
            J_val = loss(outputs, labels)

            running_loss_val += J_val.item() * labels.size(0)
    
            # Total number of labels-
            total += labels.size(0)

            # Total number of correct predictions-
            correct += (y_pred == labels).sum()

    return (running_loss_val, correct, total)


In [26]:
# Python3 dict to contain model training metrics-
history_lr = {}

In [27]:
best_val_loss = 100

In [28]:
# Hyper-parameters-
num_epochs = 40
learning_rate = 0.01

In [29]:
print(f"batch size = {batch_size}, number of epochs = {num_epochs} & learning rate = {learning_rate:.4f}")

batch size = 128, number of epochs = 40 & learning rate = 0.0100


In [30]:
# Training loop-
for epoch in range(num_epochs):
    running_loss = 0.0
    running_corrects = 0.0
    
    '''
    if loc_patience >= patience:
        print("\n'EarlyStopping' called!\n")
        break
    '''
    
    running_loss, running_corrects = train_model(model_ft, train_loader)
  
    epoch_loss = running_loss / len(train_dataset)
    epoch_acc = running_corrects.double() / len(train_dataset)
    # epoch_acc = 100 * running_corrects / len(trainset)
    # print(f"\nepoch: {epoch + 1} training loss = {epoch_loss:.4f}, training accuracy = {epoch_acc * 100:.2f}%\n")

    running_loss_val, correct, total = test_model(model_ft, test_loader)

    epoch_val_loss = running_loss_val / len(test_dataset)
    val_acc = 100 * (correct / total)

    print(f"\nepoch: {epoch + 1} training loss = {epoch_loss:.4f}, training accuracy = {epoch_acc * 100:.2f}%,"
          f" val_loss = {epoch_val_loss:.4f} & val_accuracy = {val_acc:.2f}%, "
          f" LR = {optimizer_ft.param_groups[0]['lr']:.8f}\n")
    
    history_lr[epoch + 1] = {'loss': epoch_loss, 'acc': epoch_acc * 100, 'val_loss': epoch_val_loss, 'val_acc': val_acc,
                             'lr': optimizer_ft.param_groups[0]['lr']}
    
    exp_lr_scheduler.step()

    
    # Save best weights achieved until now-
    if (epoch_val_loss < best_val_loss):    
        # update 'best_val_loss' variable to lowest loss encountered so far-
        best_val_loss = epoch_val_loss
        
        print(f"Saving model with lowest val_loss = {epoch_val_loss:.4f}\n")
        
        # Save trained model with 'best' validation accuracy-
        torch.save(model_ft.state_dict(), "ResNet50__finetuned_best_trained_loss.pth")
    
    
    '''
    # Code for manual Early Stopping:
    # if np.abs(epoch_val_loss < best_val_loss) >= minimum_delta:
    if (epoch_val_loss < best_val_loss) and np.abs(epoch_val_loss - best_val_loss) >= minimum_delta:
        # print(f"epoch_val_loss = {epoch_val_loss:.4f}, best_val_loss = {best_val_loss:.4f}")
        
        # update 'best_val_loss' variable to lowest loss encountered so far-
        best_val_loss = epoch_val_loss
        
        # reset 'loc_patience' variable-
        loc_patience = 0
        
        print(f"\nSaving model with lowest val_loss = {epoch_val_loss:.4f}")
        
        # Save trained model with validation accuracy-
        # torch.save(model.state_dict, f"LeNet-300-100_Trained_{val_acc}.pth")
        torch.save(best_model.state_dict(), "LeNet-300-100_Trained.pth")
        
    else:  # there is no improvement in monitored metric 'val_loss'
        loc_patience += 1  # number of epochs without any improvement
    '''
    
    



epoch: 1 training loss = 0.9709, training accuracy = 66.39%, val_loss = 0.5264 & val_accuracy = 82.08%,  LR = 0.0100

Saving model with lowest val_loss = 0.5264


epoch: 2 training loss = 2.0591, training accuracy = 21.79%, val_loss = 1.5641 & val_accuracy = 41.78%,  LR = 0.0100


epoch: 3 training loss = 1.1587, training accuracy = 58.56%, val_loss = 0.6752 & val_accuracy = 76.59%,  LR = 0.0100


epoch: 4 training loss = 0.6257, training accuracy = 78.45%, val_loss = 0.5175 & val_accuracy = 82.27%,  LR = 0.0100

Saving model with lowest val_loss = 0.5175


epoch: 5 training loss = 0.4956, training accuracy = 83.07%, val_loss = 0.4673 & val_accuracy = 83.44%,  LR = 0.0100

Saving model with lowest val_loss = 0.4673


epoch: 6 training loss = 0.4214, training accuracy = 85.39%, val_loss = 0.3730 & val_accuracy = 87.48%,  LR = 0.0100

Saving model with lowest val_loss = 0.3730


epoch: 7 training loss = 0.3646, training accuracy = 87.44%, val_loss = 0.3576 & val_accuracy = 88.00%,  LR =

In [31]:
history_lr.keys()

dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40])

In [32]:
# Convert from torch cuda to float values-
for epoch in history_lr.keys():
    history_lr[epoch]['acc'] = history_lr[epoch]['acc'].cpu().numpy().item()
    history_lr[epoch]['val_acc'] = history_lr[epoch]['val_acc'].cpu().numpy().item()    

In [33]:
# Sanity check-
history_lr[1]

{'acc': 66.39,
 'loss': 0.9709232287979126,
 'lr': 0.01,
 'val_acc': 82.08000183105469,
 'val_loss': 0.5264009431362152}

### Accessing current learning rate:

You could use the internal ```scheduler._last_lr``` attribute, the ```scheduler.state_dict()``` or alternatively you could check the learning rate in the optimizer via ```optimizer.param_groups[0]['lr']```.

Note that the first two approaches would only work after the first ```scheduler.step()``` call.

Refer: [here](https://discuss.pytorch.org/t/how-to-retrieve-learning-rate-from-reducelronplateau-scheduler/54234/3)

In [34]:
import pickle

with open("ResNet50_finetuned_pretrained_exponential_scheduler_history.pkl", "wb") as file:
    pickle.dump(history_lr, file)

### Make predictions using 'best' trained weights:

In [36]:
# Initialize and load best weights-
best_model = models.resnet50(pretrained = False)

In [37]:
# Compute number of features for defining last linear/dense layer-
num_ftrs = best_model.fc.in_features

# Define last dense layer-
best_model.fc = nn.Linear(in_features = num_ftrs, out_features = 10)

# Change first conv layer of ResNet-50:
best_model.conv1 = torch.nn.Conv2d(
    in_channels = 3, out_channels = 64,
    kernel_size = (3, 3), stride = (1, 1),
    padding = (1, 1), bias = False
)

In [38]:
# Load trained weights from above-
best_model.load_state_dict(torch.load("ResNet50__finetuned_best_trained_loss.pth", map_location = device))

<All keys matched successfully>

In [42]:
# Make predictions-
running_loss_val, correct, total = test_model(best_model, test_loader)

val_loss = running_loss_val / len(test_dataset)
val_acc = 100 * (correct / total)

In [43]:
print("ResNet-50 trained model metrics on validation dataset:")
print(f"val_loss = {val_loss:.4f} and val_accuracy = {val_acc:.3f}%")

ResNet-50 trained model metrics on validation dataset:
val_loss = 0.2378 and val_accuracy = 92.580%
