# CNN - image classifier

We will do the following steps in order:

- Load and normalize the CIFAR10 training and test datasets using torchvision
- Define a Convolutional Neural Network
- Define a loss function
- Train the network on the training data
- Test the network on the test data

http://cs231n.github.io/convolutional-networks/

### Setup notebook

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

import torch.utils.data
from torch.utils.data import Dataset, DataLoader

import torchvision
import torchvision.transforms as transforms
from torchvision import datasets, models, transforms, utils

from mymods.lauthom import *

print(torch.__version__)

## Data

### Transforms

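The torchvision datasets yield PIL images, which `ToTensor` converts to tensors with values in `[0, 1]`. 
We then standardise each channel with the `MEAN` and `STD` constants below, so the values end up roughly zero-centred rather than confined to `[-1, 1]`.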

In [None]:
# Image standardisation constants, passed to transforms.Normalize in the
# transform pipeline (one entry per colour channel).
# NOTE(review): these look like the commonly quoted ImageNet statistics rather
# than CIFAR10's own -- confirm this is intended.
MEAN = [0.485, 0.456, 0.406]
STD = [0.229, 0.224, 0.225]
# Side length (pixels) requested from RandomResizedCrop; the layer sizes of
# the networks defined further down are worked out for this value.
IMG_SIZE = 224

# Other constants
# Mini-batch size handed to both DataLoaders.
BATCH_SIZE = 4

In [None]:
# Preprocessing pipeline: random crop-and-resize plus horizontal flip for
# augmentation, then tensor conversion and per-channel standardisation.
_pipeline_steps = [
    transforms.RandomResizedCrop(IMG_SIZE),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=MEAN, std=STD),
]
transform = transforms.Compose(_pipeline_steps)

### Dataloaders

In [None]:
# Download CIFAR10 into ./data (if not already present) and build the
# mini-batch loaders for the training and test splits.
kwargs = {'batch_size': BATCH_SIZE, 'num_workers': 4}

# Evaluation has to be reproducible, so the test split gets a fixed resize
# instead of the random crop/flip augmentation used for training.
test_transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=MEAN, std=STD),
])

trainset = torchvision.datasets.CIFAR10(root='./data',
                                        train=True,
                                        download=True,
                                        transform=transform)

# Reshuffle every epoch so each mini-batch is a fresh random sample.
trainloader = torch.utils.data.DataLoader(trainset,
                                          shuffle=True,
                                          **kwargs)

testset = torchvision.datasets.CIFAR10(root='./data',
                                       train=False,
                                       download=True,
                                       transform=test_transform)

# Fixed order for the test split: shuffling adds nothing when only measuring.
testloader = torch.utils.data.DataLoader(testset,
                                         shuffle=False,
                                         **kwargs)

print('ready downloading')

# Human-readable names; the integer labels are used as indices into this tuple.
classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

In [None]:
# Print the training dataset; the bare expression on the last line makes the
# notebook display the dataset object wrapped by the loader.
print(trainset)
trainloader.dataset

#### Show image

In [None]:
def imshow(img, mean=None, std=None):
    """Display a standardised image tensor with matplotlib.

    Args:
        img: tensor laid out channels-first (C, H, W), e.g. the output of
            torchvision.utils.make_grid on a standardised batch.
        mean: per-channel means used for standardisation; defaults to the
            notebook-level MEAN.
        std: per-channel standard deviations used for standardisation;
            defaults to the notebook-level STD.
    """
    mean = MEAN if mean is None else mean
    std = STD if std is None else std

    # matplotlib expects channels-last (H, W, C).
    npimg = np.transpose(img.numpy(), (1, 2, 0))
    # Invert Normalize(mean, std): x = x_norm * std + mean, broadcast over
    # the trailing channel axis.
    npimg = npimg * np.asarray(std) + np.asarray(mean)
    # Rounding can push values marginally outside [0, 1], which imshow
    # would otherwise warn about.
    npimg = np.clip(npimg, 0.0, 1.0)

    plt.figure(figsize=(20, 5))
    plt.imshow(npimg)
    
# NOTE(review): `start` is not read anywhere else in the visible notebook --
# possibly a leftover; confirm before removing.
start = -BATCH_SIZE

Iterate and show some random training images with respective labels.

In [None]:
# Pull one mini-batch from the (shuffled) training loader and display it.
images, labels = next(iter(trainloader))

image_grid = torchvision.utils.make_grid(images)
imshow(image_grid)

# Class names of the displayed images, in batch order.
label_names = ['{}'.format(classes[labels[k]]) for k in range(BATCH_SIZE)]
print(' '.join(label_names))

### Build CNN

Convolutional layer

We can compute the spatial **size of the output** volume as a function of:
 - the input volume size (W)
 - the receptive field size of the Conv Layer neurons (F)
 - the stride with which they are applied (S)
 - the amount of zero padding used (P) on the border
 
The formula for calculating how many neurons 'fit' is given by:
(W−F+2P)/S+1

*E.g. for a 7x7 input and a 3x3 filter with stride 1 and pad 0 => (7 - 3 + 2*0)/1 +1 => 4/1 + 1 = 5 => output = 5x5
With stride 2 we would get a 3x3 output => (7 - 3 + 2*0)/2 +1 => 4/2 + 1 = 3 => output = 3x3*

Pooling layer


Conv1 5x5:
(224 - 5)/1 + 1 = 220x220

Pooling 2x2
220/2 = 110x110

Conv2 5x5:
(110 - 5)/1 + 1 = 106x106

Pooling 2x2
106/2 = 53x53 (=2809)

179776 = (4, 16, 53, 53)

In [None]:
# define NN for 3 channel images
# 2 5x5 filters
class Net(nn.Module):
    """CNN with two 5x5 convolutions followed by three fully connected layers.

    The shape comments assume an input batch of shape (N, 3, 224, 224); the
    first fully connected layer is sized for exactly that spatial resolution.
    The network returns raw class scores of shape (N, 10).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)          # (N, 3, 224, 224) -> (N, 6, 220, 220)
        self.pool = nn.MaxPool2d(2, 2)           # halves H and W; applied after each conv
        self.conv2 = nn.Conv2d(6, 16, 5)         # (N, 6, 110, 110) -> (N, 16, 106, 106)
        # second pool(2, 2)                      # (N, 16, 106, 106) -> (N, 16, 53, 53)
        self.fc1 = nn.Linear(16 * 53 * 53, 120)  # (N, 44944) -> (N, 120)
        self.fc2 = nn.Linear(120, 84)            # (N, 120) -> (N, 84)
        self.fc3 = nn.Linear(84, 10)             # (N, 84) -> (N, 10)

    def forward(self, x):
        """Return the unnormalised class scores for the image batch `x`."""
        x = self.pool(F.relu(self.conv1(x)))     # -> (N, 6, 110, 110)
        x = self.pool(F.relu(self.conv2(x)))     # -> (N, 16, 53, 53)
        # Flatten per sample. Pinning the batch dimension means a wrongly
        # sized input fails at fc1 instead of being silently re-batched.
        x = x.view(x.size(0), -1)                # -> (N, 44944)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [None]:
# Instantiate the 5x5-filter network and print its layer summary.
net = Net()
print(net)

Intuitively, stacking CONV layers with tiny filters as opposed to having one CONV layer with big filters allows us to express more powerful features of the input, and with fewer parameters. As a practical disadvantage, we might need more memory to hold all the intermediate CONV layer results if we plan to do backpropagation.

Conv1 3x3:
(224 - 3)/1 + 1 = 222x222

Conv2 3x3:
(222 - 3)/1 + 1 = 220x220

Conv3 1x1:
(220 - 1)/1 + 1 = 220x220

Pooling 2x2
220/2 = 110x110 (=12100)

Conv4 3x3:
(110 - 3)/1 + 1 = 108x108

Pooling 2x2
108/2 = 54x54 (=2916)

46656 = (4, 4, 54, 54) = (4, 11664)

In [None]:
# define NN for 3 channel images
# 3 3x3 filters
class Net2(nn.Module):
    """CNN built from stacked 3x3 (and one 1x1) convolutions.

    The shape comments assume an input batch of shape (N, 3, 224, 224); the
    first fully connected layer is sized for exactly that spatial resolution.
    The network returns raw class scores of shape (N, 10).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 3)          # (N, 3, 224, 224) -> (N, 6, 222, 222)
        self.conv2 = nn.Conv2d(6, 16, 3)         # (N, 6, 222, 222) -> (N, 16, 220, 220)
        self.conv3 = nn.Conv2d(16, 8, 1)         # (N, 16, 220, 220) -> (N, 8, 220, 220)
        self.pool = nn.MaxPool2d(2, 2)           # halves H and W; applied after conv3 and conv4
        self.conv4 = nn.Conv2d(8, 4, 3)          # (N, 8, 110, 110) -> (N, 4, 108, 108)
        # second pool(2, 2)                      # (N, 4, 108, 108) -> (N, 4, 54, 54)
        self.fc1 = nn.Linear(4 * 54 * 54, 120)   # (N, 11664) -> (N, 120)
        self.fc2 = nn.Linear(120, 60)            # (N, 120) -> (N, 60)
        self.fc3 = nn.Linear(60, 10)             # (N, 60) -> (N, 10)

    def forward(self, x):
        """Return the unnormalised class scores for the image batch `x`."""
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = self.pool(F.relu(self.conv3(x)))     # -> (N, 8, 110, 110)
        x = self.pool(F.relu(self.conv4(x)))     # -> (N, 4, 54, 54)
        # Flatten per sample. Pinning the batch dimension means a wrongly
        # sized input fails at fc1 instead of being silently re-batched.
        x = x.view(x.size(0), -1)                # -> (N, 11664)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [None]:
# Rebind `net` to the 3x3-filter network; the cells below operate on `net`.
net = Net2()
print(net)

In [None]:
# define loss and optimizer
import torch.optim as optim

# Cross-entropy computed on the raw class scores returned by the network.
criterion = nn.CrossEntropyLoss()
# SGD with momentum over the parameters of the network currently bound to
# `net`; re-run this cell after rebinding `net` to another model.
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [None]:
# Number of training samples; the training cell divides it by BATCH_SIZE to
# show the batch count in its progress output.
trainset_size = len(trainloader.dataset)

In [None]:
# Imported at module level: names imported inside a class body become class
# attributes and are not visible as globals from within the methods.
import datetime
import time


class Timer():
    """Stopwatch that starts on construction.

    Calling the instance returns the time elapsed since construction as an
    'H:MM:SS' string, truncated to whole seconds.
    """

    def __init__(self):
        # Wall-clock timestamp (seconds since the epoch) at creation.
        self.start = time.time()

    def __call__(self):
        """Return the elapsed time since creation, formatted by timedelta."""
        elapsed_seconds = int(time.time() - self.start)
        return str(datetime.timedelta(seconds=elapsed_seconds))

In [None]:
# train
N_EPOCHS = 5
PRINT_FREQ = 200

# Batches per epoch as reported by the loader itself (counts a final partial
# batch, if there is one).
n_batches = len(trainloader)

for epoch in range(1, N_EPOCHS + 1):  # loop over the dataset multiple times

    running_loss = 0.0
    stopwatch = Timer()  # restarted every epoch, so durations are per epoch
    for i, (inputs, labels) in enumerate(trainloader, 1):
        # Gradients accumulate across backward() calls, so clear them first.
        optimizer.zero_grad()

        # forward + backward + optimize
        # (tensors are passed directly; the Variable wrapper is not needed)
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics: mean loss over the last PRINT_FREQ mini-batches
        running_loss += loss.item()
        if i % PRINT_FREQ == 0:
            print('Epoch: {}/{}, Batch: {}/{}, loss: {:.3f}, duration: {}'.format(epoch, N_EPOCHS,
                                                                                  i, n_batches,
                                                                                  running_loss/PRINT_FREQ, stopwatch()))
            running_loss = 0.0

print('Finished Training')

In [None]:
# test: fetch the first mini-batch of the test loader
dataiter = iter(testloader)
# Use the built-in next(); the loader iterator has no public .next() method
# in current PyTorch releases.
images, labels = next(dataiter)

# print images
imshow(torchvision.utils.make_grid(images))
# Iterate over the labels actually returned rather than assuming 4 per batch.
print('GroundTruth: ', ' '.join('%5s' % classes[label] for label in labels))

In [None]:
# outputs are the energies for the 10 classes, highest energy wins
with torch.no_grad():  # inference only: skip building the autograd graph
    outputs = net(images)
_, predicted = torch.max(outputs, 1)

# Iterate over the predictions themselves so the cell works for any batch size.
print('Predicted: ', ' '.join('%5s' % classes[p] for p in predicted))

In [None]:
# performance full testset
correct = 0
total = 0

# One pass over the test loader in mini-batches of BATCH_SIZE.
with torch.no_grad():  # inference only: skip building the autograd graph
    for images, labels in testloader:
        outputs = net(images)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)  # number of samples in this batch
        # .item() turns the 0-dim tensor into a Python int, so the division
        # and formatting below are ordinary float arithmetic.
        correct += (predicted == labels).sum().item()

print('Accuracy of the network on the 10000 test images: {:.1f}%'.format(100*correct/total))

In [None]:
# performance per class
class_correct = [0.] * len(classes)
class_total = [0.] * len(classes)

with torch.no_grad():  # inference only: skip building the autograd graph
    for images, labels in testloader:
        outputs = net(images)
        _, predicted = torch.max(outputs, 1)
        hits = predicted == labels
        # tolist() yields plain Python ints/bools and follows the real batch
        # length, so a short final batch or another BATCH_SIZE also works.
        for label, hit in zip(labels.tolist(), hits.tolist()):
            class_correct[label] += hit
            class_total[label] += 1

for i, class_name in enumerate(classes):
    print('Accuracy of {} : {:.1f}%'.format(
        class_name, 100*class_correct[i]/class_total[i]))