# Transfer Learning

with **torchvision.models** we can download pretrained networks and use them in our applications.

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import matplotlib.pyplot as plt

import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from torchvision import datasets, transforms, models

Most of the pretrained models require the input to be 224x224 images. Also, we'need to match the normalization used when the models were trained. Each color channel was normalized separately.

For imagenet, the means are [0.485, 0.456, 0.406], and the standard deviations are [0.229, 0.224, 0.225].

In [3]:
data_dir = 'Cat_Dog_data/'

# Define transforms for the trainin data 
# and testing data
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomRotation(30),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])

test_transforms = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.ToTensor()
])

# Pass transforms in here, then run the next cell to
# see how the transforms look
train_data = datasets.ImageFolder(
    data_dir + '/train',
    transform=train_transforms)

test_data = datasets.ImageFolder(
    data_dir + '/test',
    transform=test_transforms)

trainloader = torch.utils.data.DataLoader(
    train_data,
    batch_size=64,
    shuffle=True)

testloader = torch.utils.data.DataLoader(
    test_data,
    batch_size=64)

In [4]:
model = models.densenet121(pretrained=True)
model

Downloading: "https://download.pytorch.org/models/densenet121-a639ec97.pth" to /home/burak/.cache/torch/checkpoints/densenet121-a639ec97.pth
100.0%


DenseNet(
  (features): Sequential(
    (conv0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (norm0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu0): ReLU(inplace=True)
    (pool0): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (denseblock1): _DenseBlock(
      (denselayer1): _DenseLayer(
        (norm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace=True)
        (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu2): ReLU(inplace=True)
        (conv2): Conv2d(128, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      )
      (denselayer2): _DenseLayer(
        (norm1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu

This model is built out of two main parts, the features and the classifier. The features part is a stack of conv layers and overall works as a feature detector that can be fed into a classifier. The classifier part is a single full-connected layer.

We need to replace the classifier, but the features will work perfectly on their own.

In [5]:
#  Freeze parameters, we don't backprop through them

for param in model.parameters():
    param.requires_grad = False

from collections import OrderedDict
classifier = nn.Sequential(OrderedDict([
    ('fc1', nn.Linear(1024, 500)),
    ('relu', nn.ReLU()),
    ('fc2', nn.Linear(500, 2)),
    ('output', nn.LogSoftmax(dim=1))
]))

model.classifier = classifier

In PyTorch, we move our model parameters and other tensors to the GPU memory using *model.to('cuda')*. We can move them back from GPU with *model.to('cpu')* which we'll commonly do when we need to operate on the network ouput outside of PyTorch.

Let's see the difference.

In [6]:
import time

In [7]:
for device in ['cpu', 'cuda']:
    
    criterion = nn.NLLLoss()
    # Only train the classifier parameters,
    # feature parameters are frozen
    optimizer = optim.Adam(model.classifier.parameters(), 
                           lr=0.001)
    
    model.to(device)
    
    for ii, (inputs, labels) in enumerate(trainloader):
        # Move input and label tensors to the GPU
        inputs, labels = inputs.to(device), labels.to(device)
        
        start = time.time()
        
        outputs = model.forward(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        if ii==3:
            break
            
    print(f'Device = {device}; Time per batch: {(time.time() - start)/3:.3f} seconds')

Device = cpu; Time per batch: 2.167 seconds


AssertionError: 
Found no NVIDIA driver on your system. Please check that you
have an NVIDIA GPU and installed a driver from
http://www.nvidia.com/Download/index.aspx

We can write device agnostic code which will automatically use CUDA if it's enabled like so:

```
# at beginning of the script
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# then whenever you get a new Tensor or Module
# this won't copy if they are already on the desired device
input = data.to(device)
model = MyModule(...).to(device)
```

**Exercise**

We should get better than 95% accuracy easily.

Train a pretrained models to classify the cat and dog images. Make sure you are only training the classifier and the parameters for the features part are frozen.

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [10]:
model = models.resnet50(pretrained=True)

Downloading: "https://download.pytorch.org/models/resnet50-19c8e357.pth" to /home/burak/.cache/torch/checkpoints/resnet50-19c8e357.pth
100.0%


In [14]:
# Turn off gradients for our model
for param in model.parameters():
    param.requires_grad = False
    
# Define our new classifier
classifier = nn.Sequential(nn.Linear(2048, 512), 
                      nn.ReLU(),
                      nn.Dropout(p=0.2),
                      nn.Linear(512, 2),
                      nn.LogSoftmax(dim=1))
model.fc = classifier

criterion = nn.NLLLoss()

optimizer = optim.Adam(model.fc.parameters(), lr=0.003)

model.to(device)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [None]:
model

In [19]:
epochs = 1
steps = 0
running_loss = 0
print_every = 5

for epoch in range(epochs):
    for images, labels in trainloader:
        steps +=1
        
        images, labels = images.to(device), labels.to(device)
        
        optimizer.zero_grad()
        
        logps = model(images)
        loss = criterion(logps, labels)
        loss.backward()
        
        optimizer.step()
        
        running_loss += loss.item()
        
        if step % print_every == 0:
            model.eval()
            test_loss = 0
            accuracy = 0
            
            for images, labels in testloader:
                
                images, labels = images.to(device), labels.to(device)
                
                logps = model(images)
                loss = criterion(logps, labels)
                test_loss += loss.item()
                
                # calculate our accuracy
                ps = torch.exp(logps)
                top_ps, top_class = ps.topk(1, dim=1)
                equality = top_class == labels.view(*top_class.shape)
                accuracy += torch.mean(equality.type(torch.FloatTensor))
                
            print(f"Epoch {epoch+1}/{epochs}.."
                  f"Train loss: {running_loss/print_every:.3f}.."
                  f"Test loss: {test_loss/len(testloader):.3f}.."
                  f"Test accuracy: {accuracy/len(testloader):.3f}")
            
            running_loss = 0
            model.train()

Epoch 1/1..Train loss: 0.196..Test loss: 0.365..Test accuracy: 0.824
Epoch 1/1..Train loss: 0.169..Test loss: 1.136..Test accuracy: 0.505
Epoch 1/1..Train loss: 0.263..Test loss: 0.726..Test accuracy: 0.615
Epoch 1/1..Train loss: 0.158..Test loss: 0.247..Test accuracy: 0.902
Epoch 1/1..Train loss: 0.082..Test loss: 0.392..Test accuracy: 0.795
Epoch 1/1..Train loss: 0.136..Test loss: 0.331..Test accuracy: 0.823
Epoch 1/1..Train loss: 0.059..Test loss: 0.240..Test accuracy: 0.897
Epoch 1/1..Train loss: 0.036..Test loss: 0.276..Test accuracy: 0.870
Epoch 1/1..Train loss: 0.043..Test loss: 0.365..Test accuracy: 0.801
Epoch 1/1..Train loss: 0.075..Test loss: 0.372..Test accuracy: 0.800


KeyboardInterrupt: 