# 6.3 Transfer Learning

In this notebook, we show how to perform transfer learning using a pretrained [ResNet18](https://pytorch.org/vision/main/models/generated/torchvision.models.resnet18.html) and the FashionMNIST dataset.



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.optim import lr_scheduler
import torchvision
from torchvision import datasets, models, transforms
import numpy as np
import matplotlib.pyplot as plt
import os
import time
import copy

# Pick the compute device: the first CUDA GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

The expected input shape for the ResNet18 network is (3, 224, 224) - (Channel, Height, Width).

On the other hand, the images of the FashionMNIST dataset have shape (1, 28, 28).

So we need to preprocess the FashionMNIST dataset in order to resize the images and expand them to three channels.

In [None]:
# Pre-processing pipeline applied to every image of both splits.
# NOTE(review): 0.1307 / 0.3081 look like the commonly quoted MNIST mean/std
# rather than FashionMNIST statistics -- confirm before relying on them.
my_transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),           # height and width expected by ResNet18
    transforms.Grayscale(num_output_channels=3),  # replicate the single channel three times
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.1307,) * 3, std=(0.3081,) * 3),
])

# Both splits share the same root folder and the same transform.
train_set = datasets.FashionMNIST(
    root='./data/FashionMNIST',
    train=True,
    transform=my_transform,
    download=True,
)
test_set = datasets.FashionMNIST(
    root='./data/FashionMNIST',
    train=False,
    transform=my_transform,
    download=True,
)

In [None]:
# Training batches: 256 samples each, shuffling enabled.
train_loader = DataLoader(train_set, batch_size=256, shuffle=True)
# Test batches: 16 samples each, shuffling disabled so the order stays fixed.
test_loader = DataLoader(test_set, batch_size=16, shuffle=False)

In [None]:
# Number of samples per split, and the label names taken from the training set.
dataset_sizes = dict(train=len(train_set), val=len(test_set))
class_names = train_set.classes
print(dataset_sizes, class_names, sep='\n')

Now that we have the datasets and dataloaders, let's just plot some examples.

In [None]:
# Take the first batch of the test loader and tile its images into a single grid image.
inputs, labels = next(iter(test_loader))
grid_images = torchvision.utils.make_grid(tensor=inputs)

def no_normalize(im):
    """Undo the dataset normalization so an image can be shown with ``plt.imshow``.

    Args:
        im: Normalized 3-channel image tensor laid out as
            (channel, height, width).

    Returns:
        Tensor laid out as (height, width, channel) with values clamped
        to the [0, 1] range.
    """
    # Build the constants on the same device/dtype as the input.
    mean = im.new_tensor([0.1307, 0.1307, 0.1307])
    std = im.new_tensor([0.3081, 0.3081, 0.3081])
    # Channels go last so the per-channel constants broadcast over the final axis.
    im = im.permute(1, 2, 0)
    # Normalization computed (x - mean) / std, so the inverse is x * std + mean.
    im = im * std + mean
    # Remove tiny floating-point overshoot outside the displayable range.
    return im.clamp(0.0, 1.0)

# Display the de-normalized grid, titled with the class name of each image in the batch.
grid_images = no_normalize(grid_images)
batch_class_names = [class_names[label] for label in labels]
plt.title(batch_class_names)
plt.imshow(grid_images)
plt.show()

There are two common methods of transfer learning:

1. **feature extraction**: uses a pre-trained model (in our case ResNet18) as a fixed feature extractor by keeping its convolutional layers **frozen** and only training a new classifier on top;
2. **fine-tuning**: instead of keeping the pre-trained model fixed, we unfreeze some or all layers and train them further on the new dataset.

Below, we will use these two methods to train our model, and finally conduct a comparative analysis.
Let's first define the training and validation functions below.

In [None]:
def validation(model, criterion):
    """Evaluate ``model`` on the global ``test_loader`` and print the metrics.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        criterion: Loss called as ``criterion(outputs, labels)``. Its result
            is treated as the mean loss of the batch.

    Returns:
        ``(loss, accuracy)`` as plain Python floats, averaged over every
        sample that was evaluated. Plain floats (rather than tensors that
        may live on the GPU) can be formatted and plotted directly.
    """
    model.eval()   # eval

    running_loss = 0.0
    running_corrects = 0
    seen = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            # forward
            outputs = model(inputs)
            preds = outputs.argmax(1)
            loss = criterion(outputs, labels)

            # count (the batch-mean loss is re-weighted by the batch size)
            batch_size = inputs.size(0)
            running_loss += loss.item() * batch_size
            running_corrects += (preds == labels).sum().item()
            seen += batch_size

    # Avoid a division by zero when the loader yields no batches.
    denom = max(seen, 1)
    epoch_loss = running_loss / denom
    epoch_acc = running_corrects / denom

    # print validation metrics
    print(f'VAL-LOSS: {epoch_loss:.4f}',  f'VAL-ACC: {epoch_acc:.4f} ', end='\n')
    return epoch_loss, epoch_acc

In [None]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=10, cut_train=False,
                cut_batches=11):
    """Train ``model`` on the global ``train_loader`` and validate after each epoch.

    Args:
        model: Network to train.
        criterion: Loss called as ``criterion(outputs, labels)``. Its result
            is treated as the mean loss of the batch.
        optimizer: Optimizer updating the trainable parameters.
        scheduler: Learning-rate scheduler, stepped once per epoch.
        num_epochs: Number of passes over the training loader.
        cut_train: When truthy, stop each epoch after ``cut_batches`` batches
            to speed things up (not the best approach; ideally train on a GPU
            with the entire dataset).
        cut_batches: Number of batches per epoch used when ``cut_train`` is set.

    Returns:
        ``(val_loss, val_acc)`` from the validation run of the last epoch,
        or ``(None, None)`` when ``num_epochs`` is 0.
    """
    val_loss, val_acc = None, None
    width = len(str(num_epochs))

    for epoch in range(num_epochs):
        lr = optimizer.param_groups[0]['lr']
        print(
            f'EPOCH: {epoch+1:0>{width}}/{num_epochs}',
            f'LR: {lr:.4f}',
            end=' '
        )
        model.train()  # train

        running_loss = 0.0
        running_corrects = 0
        seen = 0

        # data loop
        for i, (inputs, labels) in enumerate(train_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)

            # zero gradient
            optimizer.zero_grad()

            # forward
            outputs = model(inputs)
            preds = outputs.argmax(1)
            loss = criterion(outputs, labels)

            # backward + parameters update
            loss.backward()
            optimizer.step()

            # count (the batch-mean loss is re-weighted by the batch size)
            batch_size = inputs.size(0)
            running_loss += loss.item() * batch_size
            running_corrects += (preds == labels).sum().item()
            seen += batch_size

            if cut_train and i + 1 >= cut_batches:
                break

        # learning rate adjustment
        scheduler.step()

        # Average over the samples actually processed, so the metrics stay
        # meaningful when the epoch was cut short.
        denom = max(seen, 1)
        epoch_loss = running_loss / denom
        epoch_acc = running_corrects / denom

        # print training process
        print(
            f'LOSS: {epoch_loss:.4f}',
            f'ACC: {epoch_acc:.4f} ',
            end='\n'
        )
        val_loss, val_acc = validation(model, criterion)  # at the end of an epoch, validate
    return val_loss, val_acc

ResNet18 is relatively deep compared to ordinary one- or two-layer convolutional networks, and the FashionMNIST dataset is quite large, with a total of 70,000 images. We recommend using a GPU to train the model, but even then each epoch will still take about 5 minutes (you can reduce the number of training epochs to save time).


`Note: please run the code on a GPU Colab runtime, because the training process is time-consuming.`

`You can set the number of epochs (num_epochs) to different values. `


# 1 - Feature Extraction

This method freezes the weights of all layers except the fully connected layer: we replace the fully connected layer with a new one and train only that layer.

In [None]:
# Load ResNet18 with its ImageNet-pretrained weights.
# `weights=` replaces the deprecated `pretrained=True` flag (torchvision >= 0.13).
model_conv = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

# Freeze every pretrained parameter so that no gradients are computed for them in backpropagation
for param in model_conv.parameters():
    param.requires_grad = False

# Get the number of input features of the fully connected layer of resnet18
num_ftrs = model_conv.fc.in_features

# Replace the fully connected layer with a new one that has one output per class.
# The new layer is created after the freeze, so its parameters are trainable.
model_conv.fc = nn.Linear(num_ftrs, len(class_names))

# GPU/CPU
model_conv = model_conv.to(device)

# loss function
criterion = nn.CrossEntropyLoss()

# optimizer which only receives the parameters of the fully connected layer
optimizer_conv = optim.SGD(model_conv.fc.parameters(), lr=1e-3, momentum=0.9)

# Learning-rate schedule: multiply the learning rate by 0.1 after every 5 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_conv, step_size=5, gamma=0.1)

# train
fe_val_loss, fe_val_acc = train_model(
    model_conv,
    criterion,
    optimizer_conv,
    exp_lr_scheduler,
    num_epochs=1,     # 5, 10, ...
    cut_train=True  # comment this to use the entire dataset for training!!!
)

# 2 - Parameter fine-tuning
This method uses pre-trained parameters to initialize our network model, replaces the fully connected layer, and then trains all layers.



In [None]:
# Load ResNet18 with its ImageNet-pretrained weights.
# `weights=` replaces the deprecated `pretrained=True` flag (torchvision >= 0.13).
model_ft = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

# NOTE THE DIFFERENCE HERE: we do not freeze the layers!!!!!!

# Get the number of input features of the fully connected layer of resnet18
num_ftrs = model_ft.fc.in_features

# Replace the fully connected layer with a new one that has one output per class
model_ft.fc = nn.Linear(num_ftrs, len(class_names))

# GPU/CPU
model_ft = model_ft.to(device)

# loss function
criterion = nn.CrossEntropyLoss()

# optimizer over ALL parameters, since every layer is trained
optimizer_ft = optim.SGD(model_ft.parameters(), lr=1e-3, momentum=0.9)

# Learning-rate schedule: multiply the learning rate by 0.1 after every 5 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=5, gamma=0.1)

# call the train_model
ft_val_loss, ft_val_acc = train_model(
    model_ft,
    criterion,
    optimizer_ft,
    exp_lr_scheduler,
    num_epochs=1,  # 5, 10, 15....
    cut_train=True  # comment this to use the entire dataset for training!!!
)

# 3 - Trained from Scratch

In this part, we train the same model from scratch, that is, without using pre-trained weights.

In [None]:
# Build ResNet18 WITHOUT pretrained weights (random initialization).
# `weights=None` replaces the deprecated `pretrained=False` flag (torchvision >= 0.13).
model_ft = models.resnet18(weights=None)

# Get the number of input features of the fully connected layer of resnet18
num_ftrs = model_ft.fc.in_features

# Replace the fully connected layer with a new one that has one output per class
model_ft.fc = nn.Linear(num_ftrs, len(class_names))

# GPU/CPU
model_ft = model_ft.to(device)

# loss function
criterion = nn.CrossEntropyLoss()

# optimizer over all parameters
optimizer_ft = optim.SGD(model_ft.parameters(), lr=1e-3, momentum=0.9)

# Learning-rate schedule: multiply the learning rate by 0.1 after every 5 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=5, gamma=0.1)

# call the train_model
scr_val_loss, scr_val_acc = train_model(
    model_ft,
    criterion,
    optimizer_ft,
    exp_lr_scheduler,
    num_epochs=1,  # 5, 10, 15, ...
    cut_train=True  # comment this to use the entire dataset for training!!!
)

In [None]:
import matplotlib.pyplot as plt

# Bar chart comparing the final validation accuracy of the three training strategies.
strategy_names = ['Feature Extraction', 'Fine-Tuning', 'Trained from Scratch']
# float() turns 0-dim tensors (possibly living on the GPU) into plain numbers
# that matplotlib can plot; plain Python floats pass through unchanged.
accuracies = [float(acc) for acc in (fe_val_acc, ft_val_acc, scr_val_acc)]
positions = list(range(len(strategy_names)))

plt.clf()
plt.bar(positions, accuracies)
plt.xticks(positions, strategy_names)
plt.ylabel('Acc')

# Accuracy is a fraction in [0, 1]; a tighter upper limit would clip taller bars.
plt.ylim((0.0, 1.0))
plt.show()
plt.close()