# Transfer learning - Differential Learning Rate


## Fine Tuning

In [0]:
# Setting seeds to try and ensure we have the same results - this is not guaranteed across PyTorch releases.
import torch
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [0]:
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std)
    ])

trainset = datasets.CIFAR10(root='~/.pytorch/CIFAR10',train=True, download=True,transform=transform)
testset = datasets.CIFAR10(root='~/.pytorch/CIFAR10',train=False, transform=transform)

trainloader = DataLoader(trainset, batch_size=64, shuffle=True)
testloader = DataLoader(testset, batch_size=64, shuffle=False)

Files already downloaded and verified


In [5]:
for images, labels in trainloader:
  print(images.size(), labels.size())
  break

torch.Size([64, 3, 224, 224]) torch.Size([64])


In [0]:
def print_model_info(model):
  for name,param in model.named_parameters():
          if param.requires_grad == True:
              print("\t",name)

  total_params = sum(p.numel() for p in model.parameters())
  print(f'{total_params:,} total parameters.')
  total_trainable_params = sum(
      p.numel() for p in model.parameters() if p.requires_grad)
  print(f'{total_trainable_params:,} training parameters.')

In [7]:
model = models.vgg16(pretrained=True)
model.classifier

Sequential(
  (0): Linear(in_features=25088, out_features=4096, bias=True)
  (1): ReLU(inplace=True)
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=4096, out_features=4096, bias=True)
  (4): ReLU(inplace=True)
  (5): Dropout(p=0.5, inplace=False)
  (6): Linear(in_features=4096, out_features=1000, bias=True)
)

In [8]:
print_model_info(model)

	 features.0.weight
	 features.0.bias
	 features.2.weight
	 features.2.bias
	 features.5.weight
	 features.5.bias
	 features.7.weight
	 features.7.bias
	 features.10.weight
	 features.10.bias
	 features.12.weight
	 features.12.bias
	 features.14.weight
	 features.14.bias
	 features.17.weight
	 features.17.bias
	 features.19.weight
	 features.19.bias
	 features.21.weight
	 features.21.bias
	 features.24.weight
	 features.24.bias
	 features.26.weight
	 features.26.bias
	 features.28.weight
	 features.28.bias
	 classifier.0.weight
	 classifier.0.bias
	 classifier.3.weight
	 classifier.3.bias
	 classifier.6.weight
	 classifier.6.bias
138,357,544 total parameters.
138,357,544 training parameters.


In [0]:
for param in model.parameters():
  param.requires_grad = False

In [10]:
print_model_info(model)

138,357,544 total parameters.
0 training parameters.


In [0]:
for i in range(0,7):
  model.classifier[i].requires_grad = True

In [12]:
print_model_info(model)

138,357,544 total parameters.
0 training parameters.


In [0]:
model.classifier[6] = nn.Sequential(
                      nn.Linear(4096,512),
                      nn.ReLU(),
                      nn.Dropout(0.5),
                      nn.Linear(512,10),
                      nn.LogSoftmax(dim=1)
                      )


In [14]:
print_model_info(model)

	 classifier.6.0.weight
	 classifier.6.0.bias
	 classifier.6.3.weight
	 classifier.6.3.bias
136,363,338 total parameters.
2,102,794 training parameters.


In [0]:
model

In [0]:
criterion = nn.NLLLoss()

In [0]:
from torch.optim import Adam

model = model.to(device)



## Training from the Fully Connected Network onwards

### Re-training the model

In [0]:
from torch.optim import Adam

lr = 3e-4
optimizer = Adam([
    { 'params': model.classifier[0].parameters(), 'lr': lr},
    { 'params': model.classifier[3].parameters(), 'lr': lr},
    { 'params': model.classifier[6].parameters(), 'lr': lr}
    ], lr=lr)

In [16]:
print_model_info(model)

	 classifier.6.0.weight
	 classifier.6.0.bias
	 classifier.6.3.weight
	 classifier.6.3.bias
136,363,338 total parameters.
2,102,794 training parameters.


In [0]:
model = model.to(device)
#optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()))

num_epochs = 1
batch_loss = 0
cum_epoch_loss = 0

for e in range(num_epochs):
  cum_epoch_loss = 0
  
  for batch, (images, labels) in enumerate(trainloader,1):
    images = images.to(device)
    labels = labels.to(device)

    optimizer.zero_grad()
    logps = model(images)
    loss = criterion(logps, labels)
    loss.backward()
    optimizer.step()
    
    batch_loss += loss.item()
    print(f'Epoch({e}/{num_epochs} : Batch number({batch}/{len(trainloader)}) : Batch loss : {loss.item()}')
    
  print(f'Training loss : {batch_loss/len(trainloader)}')  
    

### The accuracy of the model

In [0]:
model.to('cpu')

model.eval()
with torch.no_grad():
    num_correct = 0
    total = 0

    #set_trace()
    for batch, (images, labels) in enumerate(testloader,1):
        
        logps = model(images)
        output = torch.exp(logps)
        
        pred = torch.argmax(output, 1)
        total += labels.size(0)
        num_correct += (pred == labels).sum().item()
        print(f'Batch ({batch}/{len(testloader)})')
        
        if batch == 5:
          break

    print(f'Accuracy of the model on {total} test images: {num_correct * 100 / total}% ')

## Un-freezing & training on the LAST CNN block onwards

### Re-training the model

In [0]:
for i in range(24,31):
  model.features[i].requires_grad = True


In [18]:
print_model_info(model)

	 classifier.6.0.weight
	 classifier.6.0.bias
	 classifier.6.3.weight
	 classifier.6.3.bias
136,363,338 total parameters.
2,102,794 training parameters.


In [0]:
from torch.optim import Adam

lr = 3e-4
optimizer = Adam([
    { 'params': model.features[24].parameters(), 'lr': lr/3},
    { 'params': model.features[26].parameters(), 'lr': lr/3},
    { 'params': model.features[28].parameters(), 'lr': lr/3},
    { 'params': model.classifier[0].parameters(), 'lr': lr},
    { 'params': model.classifier[3].parameters(), 'lr': lr},
    { 'params': model.classifier[6].parameters(), 'lr': lr}
    ], lr=lr)

In [20]:
print_model_info(model)

	 classifier.6.0.weight
	 classifier.6.0.bias
	 classifier.6.3.weight
	 classifier.6.3.bias
136,363,338 total parameters.
2,102,794 training parameters.


In [0]:
model = model.to(device)
#optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()))

num_epochs = 1
batch_loss = 0
cum_epoch_loss = 0

for e in range(num_epochs):
  cum_epoch_loss = 0
  
  for batch, (images, labels) in enumerate(trainloader,1):
    images = images.to(device)
    labels = labels.to(device)

    optimizer.zero_grad()
    logps = model(images)
    loss = criterion(logps, labels)
    loss.backward()
    optimizer.step()
    
    batch_loss += loss.item()
    print(f'Epoch({e}/{num_epochs} : Batch number({batch}/{len(trainloader)}) : Batch loss : {loss.item()}')
    
  print(f'Training loss : {batch_loss/len(trainloader)}')  
    

### The accuracy of the model

In [0]:
model.to('cpu')

model.eval()
with torch.no_grad():
    num_correct = 0
    total = 0

    #set_trace()
    for batch, (images, labels) in enumerate(testloader,1):
        
        logps = model(images)
        output = torch.exp(logps)
        
        pred = torch.argmax(output, 1)
        total += labels.size(0)
        num_correct += (pred == labels).sum().item()
        print(f'Batch ({batch}/{len(testloader)})')
        
        if batch == 5:
          break

    print(f'Accuracy of the model on {total} test images: {num_correct * 100 / total}% ')

## Un-freezing & training on the LAST TWO CNN block onwards

### Re-training the model

In [0]:
for i in range(17,24):
  model.features[i].requires_grad = True

In [0]:
from torch.optim import Adam

lr = 3e-4
optimizer = Adam([
    { 'params': model.features[17].parameters(), 'lr': lr/9},
    { 'params': model.features[19].parameters(), 'lr': lr/9},
    { 'params': model.features[21].parameters(), 'lr': lr/9},
    { 'params': model.features[24].parameters(), 'lr': lr/3},
    { 'params': model.features[26].parameters(), 'lr': lr/3},
    { 'params': model.features[28].parameters(), 'lr': lr/3},
    { 'params': model.classifier[0].parameters(), 'lr': lr},
    { 'params': model.classifier[3].parameters(), 'lr': lr},
    { 'params': model.classifier[6].parameters(), 'lr': lr}
    ], lr=lr)

In [23]:
print_model_info(model)

	 classifier.6.0.weight
	 classifier.6.0.bias
	 classifier.6.3.weight
	 classifier.6.3.bias
136,363,338 total parameters.
2,102,794 training parameters.


In [0]:
model = model.to(device)
#optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()))

num_epochs = 1
batch_loss = 0
cum_epoch_loss = 0

for e in range(num_epochs):
  cum_epoch_loss = 0
  
  for batch, (images, labels) in enumerate(trainloader,1):
    images = images.to(device)
    labels = labels.to(device)

    optimizer.zero_grad()
    logps = model(images)
    loss = criterion(logps, labels)
    loss.backward()
    optimizer.step()
    
    batch_loss += loss.item()
    print(f'Epoch({e}/{num_epochs} : Batch number({batch}/{len(trainloader)}) : Batch loss : {loss.item()}')
    
  print(f'Training loss : {batch_loss/len(trainloader)}')  
    

### The accuracy of the model

In [0]:
model.to('cpu')

model.eval()
with torch.no_grad():
    num_correct = 0
    total = 0

    #set_trace()
    for batch, (images, labels) in enumerate(testloader,1):
        
        logps = model(images)
        output = torch.exp(logps)
        
        pred = torch.argmax(output, 1)
        total += labels.size(0)
        num_correct += (pred == labels).sum().item()
        print(f'Batch ({batch}/{len(testloader)})')
        
        if batch == 5:
          break

    print(f'Accuracy of the model on {total} test images: {num_correct * 100 / total}% ')