After the first two notebooks, we have built classes with which we can create a very compact training loop. We kept using fully connected layers for simplicity. We will now add convolutions

In [1]:
%load_ext pycodestyle_magic
%flake8_on

In [2]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from torchvision.datasets import MNIST
import torch.nn.functional as F

# Get Data

In [3]:
# Load, reshape, put to float32
trainset = MNIST('../', download=True, train=True)
testset = MNIST('../', download=True, train=False)

# Get labels
y_trainset = trainset.targets
y_testset = testset.targets

# reshape and pass to float
trainset = trainset.data.reshape(60000, -1).to(torch.float32)
testset = testset.data.reshape(10000, -1).to(torch.float32)

# Normalize
m, std = trainset.mean(), trainset.std()
trainset = (trainset - m) / std
testset = (testset - m) / std

# Define useful classes

In [4]:
class Dataset():
    def __init__(self, x, y): self.x, self.y = x, y
    def __len__(self): return len(self.x)
    def __getitem__(self, i): return self.x[i], self.y[i] 

In [5]:
class DataLoader():
    def __init__(self, data, bs):
        self.data, self.bs = data, bs

    def __iter__(self):
        for i in range(0, len(self.data), self.bs):
            yield self.data[i:i+self.bs]

In [6]:
class Optimizer():
    def __init__(self, parameters, lr=0.4):
        self.parameters, self.lr = list(parameters), lr

    def step(self):
        with torch.no_grad():
            for p in self.parameters:
                p -= p.grad * self.lr

    def zero_grad(self):
        for p in self.parameters:
            p.grad.zero_()

In [7]:
def accuracy(output, target):
    return (torch.argmax(output, dim=1) == target).float().mean()

In [8]:
EPOCHS = 5
lr = 0.3
bs = 64
loss_func = F.cross_entropy

# here we will later download and assign pre-trained models
learner = nn.Sequential(
    nn.Linear(784, 250),
    nn.ReLU(),
    nn.Linear(250, 100),
    nn.ReLU(),
    nn.Linear(100, 10)
)

opt = Optimizer(learner.parameters(), lr=lr)

my_data = Dataset(trainset, y_trainset)
train_dataloader = DataLoader(my_data, bs)

In [9]:
for i in range(EPOCHS):
    for xb, yb in train_dataloader:
        out = learner(xb)
        loss = loss_func(out, yb)
        loss.backward()
        opt.step()
        opt.zero_grad()
    print(loss)

tensor(0.0109, grad_fn=<NllLossBackward>)
tensor(0.0071, grad_fn=<NllLossBackward>)
tensor(0.0025, grad_fn=<NllLossBackward>)
tensor(0.0004, grad_fn=<NllLossBackward>)
tensor(0.0005, grad_fn=<NllLossBackward>)


# Adding Validation

In [17]:
train, valid = trainset[0:50000, :], trainset[50000:, :]
y_train, y_valid = y_trainset[0:50000], y_trainset[50000:]

We'll now start using Pytorch's DataLoader because it also has a random sampler

In [20]:
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler

In [21]:
train_dl = DataLoader(Dataset(train, y_train), batch_size=64, shuffle=True)
valid_dl = DataLoader(Dataset(valid, y_valid), 64)

In [24]:
for epoch in range(EPOCHS):
    learner.train()
    for xb, yb in train_dl:
        loss = loss_func(learner(xb), yb)
        loss.backward()
        opt.step()
        opt.zero_grad()

    learner.eval()
    with torch.no_grad():
        total_loss, total_acc = 0., 0.
        for xb, yb in valid_dl:
            pred = learner(xb)
            total_loss += loss_func(pred, yb)
            total_acc += accuracy(pred, yb)
        n_entries = len(valid_dl)
        print('epoch', epoch,
              'loss:', (total_loss/n_entries).item(),
              'accuracy:', (total_acc/n_entries).item()
              )

epoch 0 loss: 0.04268951714038849 accuracy: 0.9902468323707581
epoch 1 loss: 0.03754099830985069 accuracy: 0.9912420511245728
epoch 2 loss: 0.038301967084407806 accuracy: 0.991042971611023
epoch 3 loss: 0.033308595418930054 accuracy: 0.9918391704559326
epoch 4 loss: 0.03391985222697258 accuracy: 0.9917396306991577


# Adding convolutions

Now adding convolutions is just a matter of changing the sequence of layers in nn.Sequential

In [25]:
# Use Jupyter Notebook shortcts to access the doc and see how to use Conv2d
# ??nn.Conv2d

In [27]:
# We can basically replace the nn.Linear with this:
my_layer = nn.Conv2d(in_channels=1, out_channels=9, kernel_size=3, padding=1)

# Except nn.Conv2d takes input of shape N * Channels * Height * Width 
# (as seen in the doc if you uncomment above)
# and we had no channels so far (MNIST isn't RVB), so we add an extra channel
my_layer(xb.reshape(16,1,28,28)).shape

torch.Size([16, 9, 28, 28])

In [32]:
# We also need to flatten out the output of the successive convolutions
# before we pass it to a nn.Linear()
# we add a Lambda layer (pretty much like a lambda function) to do that


class Lambda(nn.Module):
    def __init__(self, func):
        super().__init__()
        self.func = func

    def forward(self, x):
        return self.func(x)


def flatten(x):
    return x.view(x.shape[0], -1)

# we can also use that Lambda class to resize the data as in the above cell
def mnist_resize(x): return x.view(-1, 1, 28, 28) 

In [34]:
EPOCHS = 6
lr = 0.4
bs = 64
loss_func = F.cross_entropy

learner = nn.Sequential(
    Lambda(mnist_resize),  # we can do the reshape here
    nn.Conv2d(in_channels=1, out_channels=8,
              kernel_size=3, stride=1, padding=1),  # bs*8*28*28
    nn.ReLU(),
    nn.Conv2d(8, 16, 3, 2, 1),  # bs*16*14*14
    nn.ReLU(),
    nn.Conv2d(16, 32, 3, 2, 1),  # bs * 32 * 7 * 7
    nn.ReLU(),
    nn.Conv2d(32, 64, 3, 2, 1),  # bs * 64 * 4 * 4
    nn.ReLU(),
    nn.Conv2d(64, 64, 3, 2, 1),  # bs * 64 * 2 * 2
    nn.AdaptiveAvgPool2d(1),
    Lambda(flatten),
    nn.Linear(64, 10)
)

opt = Optimizer(learner.parameters(), lr=lr)

In [36]:
# A cool thing is that the train loop doesn't need to change
for epoch in range(EPOCHS):
    learner.train()
    for xb, yb in train_dl:
        loss = loss_func(learner(xb), yb)
        loss.backward()
        opt.step()
        opt.zero_grad()

    learner.eval()
    with torch.no_grad():
        total_loss, total_acc = 0., 0.
        for xb, yb in valid_dl:
            pred = learner(xb)
            total_loss += loss_func(pred, yb)
            total_acc += accuracy(pred, yb)
        n_entries = len(valid_dl)
        print('epoch', epoch,
              'loss:', (total_loss/n_entries).item(),
              'accuracy:', (total_acc/n_entries).item()
              )

epoch 0 loss: 0.06012481078505516 accuracy: 0.9852706789970398
epoch 1 loss: 0.09420818835496902 accuracy: 0.9772093892097473
epoch 2 loss: 0.33874279260635376 accuracy: 0.918789803981781
epoch 3 loss: 0.08275756239891052 accuracy: 0.9787022471427917
epoch 4 loss: 0.04851749911904335 accuracy: 0.987957775592804
epoch 5 loss: 0.06320108473300934 accuracy: 0.984375


So, we added convolutions, but two problems arose:

* the training isn't very smooth. We'll need to add some regularization
* the training is way slower than before. We'll need to start using the GPU

We'll start doing that in the next Notebook