# Transfer learning - Fine-tuning VGG16



## Import Modules and Datasets

In [4]:
# Setting seeds to try and ensure we have the same results - this is not guaranteed across PyTorch releases.
import torch
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [5]:
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [7]:
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std)
    ])

trainset = datasets.CIFAR10(root='~/.pytorch/CIFAR10',train=True, download=True,transform=transform)
testset = datasets.CIFAR10(root='~/.pytorch/CIFAR10',train=False, transform=transform)

trainloader = DataLoader(trainset, batch_size=64, shuffle=True)
testloader = DataLoader(testset, batch_size=64, shuffle=False)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /root/.pytorch/CIFAR10/cifar-10-python.tar.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting /root/.pytorch/CIFAR10/cifar-10-python.tar.gz to /root/.pytorch/CIFAR10


In [8]:
for images, labels in trainloader:
  print(images.size(), labels.size())
  break

torch.Size([64, 3, 224, 224]) torch.Size([64])


In [35]:
model = models.vgg16(pretrained=True)

In [36]:
model.features

Sequential(
  (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU(inplace=True)
  (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): ReLU(inplace=True)
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (6): ReLU(inplace=True)
  (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (8): ReLU(inplace=True)
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (11): ReLU(inplace=True)
  (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (13): ReLU(inplace=True)
  (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (15): ReLU(inplace=True)
  (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (17): Conv2d(256, 512, kernel_si

In [37]:
model.classifier

Sequential(
  (0): Linear(in_features=25088, out_features=4096, bias=True)
  (1): ReLU(inplace=True)
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=4096, out_features=4096, bias=True)
  (4): ReLU(inplace=True)
  (5): Dropout(p=0.5, inplace=False)
  (6): Linear(in_features=4096, out_features=1000, bias=True)
)

In [38]:
for param in model.parameters():
  param.requires_grad = False

#Training the last output layer

In [39]:
model.classifier[-1] = nn.Sequential(
                       nn.Linear(in_features=4096, out_features=10),
                       nn.LogSoftmax(dim=1)
                        )

In [40]:
model.classifier

Sequential(
  (0): Linear(in_features=25088, out_features=4096, bias=True)
  (1): ReLU(inplace=True)
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=4096, out_features=4096, bias=True)
  (4): ReLU(inplace=True)
  (5): Dropout(p=0.5, inplace=False)
  (6): Sequential(
    (0): Linear(in_features=4096, out_features=10, bias=True)
    (1): LogSoftmax(dim=1)
  )
)

In [41]:
criterion = nn.NLLLoss()

In [42]:
from torch.optim import Adam

model = model.to(device)
optimizer = Adam(model.parameters())

In [43]:
# Train the model - Final Fully Connected LAyer
num_epochs = 1
batch_loss = 0
cum_epoch_loss = 0

for e in range(num_epochs):
  cum_epoch_loss = 0
  
  for batch, (images, labels) in enumerate(trainloader,1):
    images = images.to(device)
    labels = labels.to(device)

    optimizer.zero_grad()
    logps = model(images)
    loss = criterion(logps, labels)
    loss.backward()
    optimizer.step()
    
    batch_loss += loss.item()
    print(f'Epoch({e}/{num_epochs} : Batch number({batch}/{len(trainloader)})  Batch loss : {loss.item()}')
    
  print(f'Training loss : {batch_loss/len(trainloader)}')  
    

Epoch(0/1 : Batch number(1/782)  Batch loss : 2.446807384490967
Epoch(0/1 : Batch number(2/782)  Batch loss : 2.2174389362335205
Epoch(0/1 : Batch number(3/782)  Batch loss : 1.935154676437378
Epoch(0/1 : Batch number(4/782)  Batch loss : 1.8331561088562012
Epoch(0/1 : Batch number(5/782)  Batch loss : 1.6881494522094727
Epoch(0/1 : Batch number(6/782)  Batch loss : 1.6643388271331787
Epoch(0/1 : Batch number(7/782)  Batch loss : 1.637941837310791
Epoch(0/1 : Batch number(8/782)  Batch loss : 1.367242455482483
Epoch(0/1 : Batch number(9/782)  Batch loss : 1.472529649734497
Epoch(0/1 : Batch number(10/782)  Batch loss : 1.17693030834198
Epoch(0/1 : Batch number(11/782)  Batch loss : 1.2244377136230469
Epoch(0/1 : Batch number(12/782)  Batch loss : 1.1676913499832153
Epoch(0/1 : Batch number(13/782)  Batch loss : 1.366422414779663
Epoch(0/1 : Batch number(14/782)  Batch loss : 1.0710053443908691
Epoch(0/1 : Batch number(15/782)  Batch loss : 0.9390655159950256
Epoch(0/1 : Batch number(16

In [44]:
model.to('cpu')

model.eval()
with torch.no_grad():
    num_correct = 0
    total = 0

    #set_trace()
    for batch, (images, labels) in enumerate(testloader,1):
        
        logps = model(images)
        output = torch.exp(logps)
        
        pred = torch.argmax(output, 1)
        total += labels.size(0)
        num_correct += (pred == labels).sum().item()
        print(f'Batch ({batch}/{len(testloader)})')
        
        if batch == 5:
          break

    print(f'Accuracy of the model on {total} test images: {num_correct * 100 / total}% ')

Batch (1/157)
Batch (2/157)
Batch (3/157)
Batch (4/157)
Batch (5/157)
Accuracy of the model on 320 test images: 81.5625% 


# Transfer Learning - Fully Connected Layer

In [22]:
model.classifier

Sequential(
  (0): Linear(in_features=25088, out_features=4096, bias=True)
  (1): ReLU(inplace=True)
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=4096, out_features=4096, bias=True)
  (4): ReLU(inplace=True)
  (5): Dropout(p=0.5, inplace=False)
  (6): Sequential(
    (0): Linear(in_features=4096, out_features=10, bias=True)
    (1): LogSoftmax(dim=1)
  )
)

In [23]:
for i in range(0,7):
  model.classifier[i].requires_grad = True

In [24]:
model.classifier[6] = nn.Sequential(
                      nn.Linear(4096,512),
                      nn.ReLU(),
                      nn.Dropout(0.5),
                      nn.Linear(512,10),
                      nn.LogSoftmax(dim=1)
                      )


In [25]:
model.classifier

Sequential(
  (0): Linear(in_features=25088, out_features=4096, bias=True)
  (1): ReLU(inplace=True)
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=4096, out_features=4096, bias=True)
  (4): ReLU(inplace=True)
  (5): Dropout(p=0.5, inplace=False)
  (6): Sequential(
    (0): Linear(in_features=4096, out_features=512, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=512, out_features=10, bias=True)
    (4): LogSoftmax(dim=1)
  )
)

In [26]:
model.features

Sequential(
  (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU(inplace=True)
  (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): ReLU(inplace=True)
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (6): ReLU(inplace=True)
  (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (8): ReLU(inplace=True)
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (11): ReLU(inplace=True)
  (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (13): ReLU(inplace=True)
  (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (15): ReLU(inplace=True)
  (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (17): Conv2d(256, 512, kernel_si

In [27]:
criterion = nn.NLLLoss()

In [28]:
from torch.optim import Adam

model = model.to(device)
optimizer = Adam(model.parameters())


## Training from the Fully Connected Network onwards

### Re-training the model

In [29]:
model = model.to(device)
optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()))

num_epochs = 1
batch_loss = 0
cum_epoch_loss = 0

for e in range(num_epochs):
  cum_epoch_loss = 0
  
  for batch, (images, labels) in enumerate(trainloader,1):
    images = images.to(device)
    labels = labels.to(device)

    optimizer.zero_grad()
    logps = model(images)
    loss = criterion(logps, labels)
    loss.backward()
    optimizer.step()
    
    batch_loss += loss.item()
    print(f'Epoch({e}/{num_epochs} : Batch number({batch}/{len(trainloader)}) : Batch loss : {loss.item()}')
    
  print(f'Training loss : {batch_loss/len(trainloader)}')  
    

Epoch(0/1 : Batch number(1/782) : Batch loss : 2.3403685092926025
Epoch(0/1 : Batch number(2/782) : Batch loss : 2.0940003395080566
Epoch(0/1 : Batch number(3/782) : Batch loss : 1.8263192176818848
Epoch(0/1 : Batch number(4/782) : Batch loss : 1.6251918077468872
Epoch(0/1 : Batch number(5/782) : Batch loss : 1.3979531526565552
Epoch(0/1 : Batch number(6/782) : Batch loss : 1.3720227479934692
Epoch(0/1 : Batch number(7/782) : Batch loss : 1.1942205429077148
Epoch(0/1 : Batch number(8/782) : Batch loss : 1.0625096559524536
Epoch(0/1 : Batch number(9/782) : Batch loss : 1.107385277748108
Epoch(0/1 : Batch number(10/782) : Batch loss : 1.0846725702285767
Epoch(0/1 : Batch number(11/782) : Batch loss : 1.0032676458358765
Epoch(0/1 : Batch number(12/782) : Batch loss : 1.0276130437850952
Epoch(0/1 : Batch number(13/782) : Batch loss : 0.6948723196983337
Epoch(0/1 : Batch number(14/782) : Batch loss : 0.9485833644866943
Epoch(0/1 : Batch number(15/782) : Batch loss : 0.7482354640960693
Epoch

### The accuracy of the model

In [30]:
model.to('cpu')

model.eval()
with torch.no_grad():
    num_correct = 0
    total = 0

    #set_trace()
    for batch, (images, labels) in enumerate(testloader,1):
        
        logps = model(images)
        output = torch.exp(logps)
        
        pred = torch.argmax(output, 1)
        total += labels.size(0)
        num_correct += (pred == labels).sum().item()
        print(f'Batch ({batch}/{len(testloader)})')
        
        if batch == 5:
          break

    print(f'Accuracy of the model on {total} test images: {num_correct * 100 / total}% ')

Batch (1/157)
Batch (2/157)
Batch (3/157)
Batch (4/157)
Batch (5/157)
Accuracy of the model on 320 test images: 82.1875% 


## Un-freezing & training on the LAST CNN block onwards

### Re-training the model

In [None]:
for i in range(24,31):
  model.features[i].requires_grad = True


In [None]:
model = model.to(device)
optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()))

num_epochs = 1
batch_loss = 0
cum_epoch_loss = 0

for e in range(num_epochs):
  cum_epoch_loss = 0
  
  for batch, (images, labels) in enumerate(trainloader,1):
    images = images.to(device)
    labels = labels.to(device)

    optimizer.zero_grad()
    logps = model(images)
    loss = criterion(logps, labels)
    loss.backward()
    optimizer.step()
    
    batch_loss += loss.item()
    print(f'Epoch({e}/{num_epochs} : Batch number({batch}/{len(trainloader)}) : Batch loss : {loss.item()}')
    
  print(f'Training loss : {batch_loss/len(trainloader)}')  
    

Epoch(0/1 : Batch number(1/782) : Batch loss : 0.37638935446739197
Epoch(0/1 : Batch number(2/782) : Batch loss : 0.4667692184448242
Epoch(0/1 : Batch number(3/782) : Batch loss : 0.4867028594017029
Epoch(0/1 : Batch number(4/782) : Batch loss : 0.7602155208587646
Epoch(0/1 : Batch number(5/782) : Batch loss : 0.46336594223976135
Epoch(0/1 : Batch number(6/782) : Batch loss : 0.48384594917297363
Epoch(0/1 : Batch number(7/782) : Batch loss : 0.6412556171417236
Epoch(0/1 : Batch number(8/782) : Batch loss : 0.3660600483417511
Epoch(0/1 : Batch number(9/782) : Batch loss : 0.6906344294548035
Epoch(0/1 : Batch number(10/782) : Batch loss : 0.6811375021934509
Epoch(0/1 : Batch number(11/782) : Batch loss : 0.30804815888404846
Epoch(0/1 : Batch number(12/782) : Batch loss : 0.35521185398101807
Epoch(0/1 : Batch number(13/782) : Batch loss : 0.6848178505897522
Epoch(0/1 : Batch number(14/782) : Batch loss : 0.5894997715950012
Epoch(0/1 : Batch number(15/782) : Batch loss : 0.3391475677490234

### The accuracy of the model

In [None]:
model.to('cpu')

model.eval()
with torch.no_grad():
    num_correct = 0
    total = 0

    #set_trace()
    for batch, (images, labels) in enumerate(testloader,1):
        
        logps = model(images)
        output = torch.exp(logps)
        
        pred = torch.argmax(output, 1)
        total += labels.size(0)
        num_correct += (pred == labels).sum().item()
        print(f'Batch ({batch}/{len(testloader)})')
        
        if batch == 5:
          break

    print(f'Accuracy of the model on {total} test images: {num_correct * 100 / total}% ')

Batch (1/157)
Batch (2/157)
Batch (3/157)
Batch (4/157)
Batch (5/157)
Accuracy of the model on 320 test images: 83.4375% 


## Un-freezing & training on the LAST TWO CNN block onwards

### Re-training the model

In [31]:
for i in range(17,24):
  model.features[i].requires_grad = True

In [32]:
model = model.to(device)
optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()))

num_epochs = 1
batch_loss = 0
cum_epoch_loss = 0

for e in range(num_epochs):
  cum_epoch_loss = 0
  
  for batch, (images, labels) in enumerate(trainloader,1):
    images = images.to(device)
    labels = labels.to(device)

    optimizer.zero_grad()
    logps = model(images)
    loss = criterion(logps, labels)
    loss.backward()
    optimizer.step()
    
    batch_loss += loss.item()
    print(f'Epoch({e}/{num_epochs} : Batch number({batch}/{len(trainloader)}) : Batch loss : {loss.item()}')
    
  print(f'Training loss : {batch_loss/len(trainloader)}')  
    

Epoch(0/1 : Batch number(1/782) : Batch loss : 0.4241348206996918
Epoch(0/1 : Batch number(2/782) : Batch loss : 0.42242661118507385
Epoch(0/1 : Batch number(3/782) : Batch loss : 0.5244381427764893
Epoch(0/1 : Batch number(4/782) : Batch loss : 0.5073692202568054
Epoch(0/1 : Batch number(5/782) : Batch loss : 0.4361976981163025
Epoch(0/1 : Batch number(6/782) : Batch loss : 0.4047842025756836
Epoch(0/1 : Batch number(7/782) : Batch loss : 0.5826764106750488
Epoch(0/1 : Batch number(8/782) : Batch loss : 0.45725929737091064
Epoch(0/1 : Batch number(9/782) : Batch loss : 0.6708146929740906
Epoch(0/1 : Batch number(10/782) : Batch loss : 0.4495810866355896
Epoch(0/1 : Batch number(11/782) : Batch loss : 0.46492207050323486
Epoch(0/1 : Batch number(12/782) : Batch loss : 0.41973525285720825
Epoch(0/1 : Batch number(13/782) : Batch loss : 0.4632171392440796
Epoch(0/1 : Batch number(14/782) : Batch loss : 0.46138861775398254
Epoch(0/1 : Batch number(15/782) : Batch loss : 0.4958300292491913

### The accuracy of the model

In [33]:
model.to('cpu')

model.eval()
with torch.no_grad():
    num_correct = 0
    total = 0

    #set_trace()
    for batch, (images, labels) in enumerate(testloader,1):
        
        logps = model(images)
        output = torch.exp(logps)
        
        pred = torch.argmax(output, 1)
        total += labels.size(0)
        num_correct += (pred == labels).sum().item()
        print(f'Batch ({batch}/{len(testloader)})')
        
        if batch == 5:
          break

    print(f'Accuracy of the model on {total} test images: {num_correct * 100 / total}% ')

Batch (1/157)
Batch (2/157)
Batch (3/157)
Batch (4/157)
Batch (5/157)
Accuracy of the model on 320 test images: 80.0% 
