### __Transfer Learning Assignment | Mohammed Asif Sahadh - 24MSD7061__

#### Import libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import datasets, models, transforms
import torch.utils.data as data
from torch.utils.data import DataLoader
import time

#### Set device to GPU

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


#### Initialize parameters

In [3]:
num_classes = 2
batch_size = 10
num_epochs = 100
learning_rate = 1e-5

#### Data transformation to match pretrained model training data dimentions

In [4]:
data_transform = {
    'train': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'test': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

#### Get data & load it to PyTorch DataLoader class

In [5]:
train_data_path = r"train"
test_data_path = r"test"

In [6]:
train_data = torchvision.datasets.ImageFolder(root = train_data_path, transform = data_transform['train'])
train_data_loader = data.DataLoader(train_data, batch_size = batch_size, shuffle = True,  num_workers = 4)

test_data = torchvision.datasets.ImageFolder(root = test_data_path, transform = data_transform['test'])
test_data_loader = data.DataLoader(test_data, batch_size = batch_size, shuffle = True,  num_workers = 4)

#### Get model

In [7]:
model = models.vgg16(weights = 'DEFAULT')

#### Define loss function and optimizer

In [8]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.classifier.parameters(), lr = learning_rate)

#### 1. __Fine Tune Fully Connected Layers Only__

In [9]:
for param in model.features.parameters():
    param.requires_grad = False 

In [10]:
model.classifier[6] = nn.Linear(model.classifier[6].in_features, num_classes) # modify output neurons
model.cuda() # send model to GPU

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [11]:
start = time.time()

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_data_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(train_data_loader.dataset)
    if (epoch + 1) % 10 == 0:
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}')

end = time.time()

print()
et = (end - start) / 60
print(f"Elapsed time: {et:.2f} minutes")

Epoch 10/100, Loss: 0.3060
Epoch 20/100, Loss: 0.1177
Epoch 30/100, Loss: 0.0485
Epoch 40/100, Loss: 0.0441
Epoch 50/100, Loss: 0.0154
Epoch 60/100, Loss: 0.0082
Epoch 70/100, Loss: 0.0077
Epoch 80/100, Loss: 0.0045
Epoch 90/100, Loss: 0.0028
Epoch 100/100, Loss: 0.0053

Elapsed time: 8.46 minutes


In [12]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_data_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

val_accuracy = correct / total
print(f'Validation Accuracy: {val_accuracy * 100:.2f}%')

Validation Accuracy: 71.79%


#### 2. __Fine Tune Entire Model__

In [9]:
for param in model.features.parameters():
    param.requires_grad = True

In [10]:
model.classifier[6] = nn.Linear(model.classifier[6].in_features, num_classes) # modify output neurons
model.cuda() # send model to GPU

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [11]:
start = time.time()

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_data_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(train_data_loader.dataset)
    if (epoch + 1) % 10 == 0:
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}')

end = time.time()

print()
et = (end - start) / 60
print(f"Elapsed time: {et:.2f} minutes")

Epoch 10/100, Loss: 0.3224
Epoch 20/100, Loss: 0.1184
Epoch 30/100, Loss: 0.0490
Epoch 40/100, Loss: 0.0229
Epoch 50/100, Loss: 0.0147
Epoch 60/100, Loss: 0.0079
Epoch 70/100, Loss: 0.0033
Epoch 80/100, Loss: 0.0023
Epoch 90/100, Loss: 0.0023
Epoch 100/100, Loss: 0.0022

Elapsed time: 11.42 minutes


In [12]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_data_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

val_accuracy = correct / total
print(f'Validation Accuracy: {val_accuracy * 100:.2f}%')

Validation Accuracy: 69.23%


#### 3. __Hybrid Fine Tuning__ - Freeze First 10 Layers & Unfreeze the Rest

In [9]:
for param in model.features.parameters():
    param.requires_grad = True

counter = 0
for child in model.children():
    counter += 1
    if counter == 1:
        continue 
    if counter < 11: 
        for param in child.parameters():
            param.requires_grad = False # freezing only the first 10 layers

In [10]:
model.classifier[6] = nn.Linear(model.classifier[6].in_features, num_classes) # modify output neurons
model.cuda() # send model to GPU

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [11]:
start = time.time()

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_data_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(train_data_loader.dataset)
    if (epoch + 1) % 10 == 0:
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}')

end = time.time()

print()
et = (end - start) / 60
print(f"Elapsed time: {et:.2f} minutes")

Epoch 10/100, Loss: 0.7189
Epoch 20/100, Loss: 0.7376
Epoch 30/100, Loss: 0.7352
Epoch 40/100, Loss: 0.7296
Epoch 50/100, Loss: 0.7129
Epoch 60/100, Loss: 0.7503
Epoch 70/100, Loss: 0.7691
Epoch 80/100, Loss: 0.7189
Epoch 90/100, Loss: 0.7229
Epoch 100/100, Loss: 0.7200

Elapsed time: 11.51 minutes


In [12]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_data_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

val_accuracy = correct / total
print(f'Validation Accuracy: {val_accuracy * 100:.2f}%')

Validation Accuracy: 48.72%


#### __Concluding points__
- Validation accuracy for fine tuning only the fully connected layers of VGG16 came out to be `71.79%`, trained in `8 minutes & 28 seconds`.<br><br>
- Validation accuracy for fine tuning all of VGG16 came out to be `69.23%`, trained in `11 minutes & 25 seconds`.<br><br>
- Validation accuracy for fine tuning all of VGG16 excet the first 10 layers came out to be `48.72%`, trained in `11 minutes & 30 seconds`.

The above results suggests that fine tuning the fully connected layers alone is sufficient to get a good accuracy. Moreover, the training time is also least in this case. The accuracy decline when training more layers could be attributed to overparameterization, where too many parameters are being fine-tuned relative to the limited amount of training data.