In [1]:
%load_ext pycodestyle_magic
%flake8_on

In [2]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torchvision.datasets import MNIST
import math
from functools import partial

### Get data
Same as in the previous notebook, condensed and with fewer prints.

In [3]:
# Load both MNIST splits (downloaded into '../' when missing), flatten each
# 28x28 image into one row and convert to float32.
trainset = MNIST('../', download=True, train=True)
testset = MNIST('../', download=True, train=False)
# The row count comes from the data itself rather than a hard-coded
# 60000 / 10000, so the cell keeps working on a subset or another dataset.
train = trainset.data.reshape(len(trainset.data), -1).to(torch.float32)
test = testset.data.reshape(len(testset.data), -1).to(torch.float32)

# Normalize: both splits use the statistics of the training split
m, std = train.mean(), train.std()
train = (train - m) / std
test = (test - m) / std

# Get labels
y_train = trainset.targets
y_test = testset.targets

### Define useful classes

In [6]:
class Model(nn.Module):
    """Fully connected ReLU network whose layer widths shrink linearly.

    The widths are ``np.linspace(n_in, n_out, depth)`` truncated to ints,
    e.g. ``Model(784, 10, 6)`` uses 784-629-474-319-164-10.

    Args:
        n_in: number of input features.
        n_out: number of outputs.
        depth: number of widths, i.e. ``depth - 1`` linear layers. Must be
            at least 2 because the last layer reads ``IN[-2]``.

    Note:
        ``self.layers`` is a plain Python list, so the layers are NOT
        registered as submodules and ``self.parameters()`` is empty.
        Update the weights by iterating over ``self.layers`` instead.
    """

    def __init__(self, n_in, n_out, depth):
        super().__init__()
        # quick 'n dirty way to create wider models
        IN = np.linspace(n_in, n_out, depth).astype(int)
        self.layers = []
        for i in range(depth - 2):
            self.layers += [nn.Linear(IN[i], IN[i + 1]), nn.ReLU()]
        # last layer doesn't have relu
        self.layers += [nn.Linear(IN[-2], n_out)]

        # cross-entropy is nll(log_softmax), as we defined in previous nb
        self.loss = nn.functional.cross_entropy

    def forward(self, x):
        """Apply every layer in order and return the final activations.

        Defined as ``forward`` (not ``__call__``) so that calling the model
        still goes through ``nn.Module.__call__`` and its hooks.
        """
        for layer in self.layers:
            x = layer(x)
        return x

In [10]:
n_in = train.shape[1]  # 28 * 28 = 784 values per flattened image
n_layers = 6  # now we can just change the number of layers here
n_out = 10  # one output per class

learner = Model(n_in, n_out, n_layers)
# Display the layer widths that Model derives from these three settings
np.linspace(n_in, n_out, n_layers).astype(int)

array([784, 629, 474, 319, 164,  10])

In [11]:
# Sanity check before any training: loss over the whole training set.
# With 10 classes, chance-level cross-entropy is ln(10) ~ 2.30.
learner.loss(learner(train), y_train)

tensor(2.3047, grad_fn=<NllLossBackward>)

### Introducing batches
So far we've been training the model on the whole dataset at every epoch. Most of the time this won't be possible (most datasets are nowhere near as small as MNIST). Thus we split the data into small batches (usually of 32 or 64 samples) and update the weights after each batch.

In [13]:
%flake8_off
bs = 64
EPOCHS = 5
lr = 0.3
n_train = len(train)  # dataset size read from the data, not hard-coded

for epoch in range(EPOCHS):
    # ceil(n_train / bs) batches; the last one may hold fewer than bs rows
    for i in range((n_train-1)//bs + 1):
        # get the batch
        start_i = i * bs
        end_i = start_i + bs
        xbatch = train[start_i:end_i]
        ybatch = y_train[start_i:end_i]

        # predict and get the loss
        output = learner(xbatch)
        loss = learner.loss(output, ybatch)

        loss.backward()
        # the parameter update itself must not be tracked by autograd
        with torch.no_grad():
            for l in learner.layers:
                # only layers with a weight (the Linear ones) are updated
                if hasattr(l, 'weight'):
                    # step
                    l.weight -= l.weight.grad * lr
                    l.bias   -= l.bias.grad   * lr
                    # reset: backward() accumulates into .grad
                    l.weight.grad.zero_()
                    l.bias  .grad.zero_()
    # loss of the last mini-batch of this epoch
    print(loss)

tensor(0.0014, grad_fn=<NllLossBackward>)
tensor(0.0006, grad_fn=<NllLossBackward>)
tensor(0.0001, grad_fn=<NllLossBackward>)
tensor(0.0001, grad_fn=<NllLossBackward>)
tensor(0.0002, grad_fn=<NllLossBackward>)


In [17]:
%flake8_on
def accuracy(pred, targs):
    """Return the fraction of rows in `pred` whose arg-max equals `targs`.

    `pred` holds one row of class scores per sample; `targs` holds the
    expected class index of each sample. The result is a Python float.
    """
    predicted_class = pred.argmax(dim=-1)
    n_correct = (predicted_class == targs).sum()
    n_total = len(pred)
    return float(n_correct) / float(n_total)

In [18]:
# Evaluation only: no gradients are needed, so skip building the autograd
# graph for the whole test set.
with torch.no_grad():
    output = learner(test)

In [19]:
# Fraction of test samples whose highest-scoring class matches the label
accuracy(output, y_test)

0.9807

#### So we have a training loop that works
Moreover, in the last notebook we needed 35 passes through the whole dataset to reach 90% accuracy. With mini-batches, learning is smoother: the model learns more, and does so more quickly.

Now we'll refactor the code to make it more maintainable (Note that I could've done that from the start, I just wanted to show the potential recruiter that I know what happens behind the scenes)

# Refactoring the training loop

## updating the parameters

First, we can define our layers outside the model and pass them in as an argument:

In [21]:
depth = 5
IN = np.linspace(n_in, n_out, depth).astype(int)

# Hidden stack: pair each width with the next one, Linear followed by ReLU
layers = []
for width_in, width_out in zip(IN[:-2], IN[1:-1]):
    layers.extend([nn.Linear(width_in, width_out), nn.ReLU()])
# The output layer stays linear (no ReLU after it)
layers.append(nn.Linear(IN[-2], n_out))

In [22]:
# Inspect the layer widths computed in the previous cell
IN

array([784, 590, 397, 203,  10])

In [25]:
# Now we can refactor the Model() class and add each layer as a module
class Model(nn.Module):
    """Apply a list of layers in order, registering each as a submodule.

    Args:
        layers: list of modules, applied first to last. It is iterated
            again on every forward pass, so it must be re-iterable.

    Each layer is registered under the name ``layer_<index>``, which is
    what makes its weights reachable through ``self.parameters()``.
    """

    def __init__(self, layers):
        super().__init__()
        self.layers = layers
        for i, layer in enumerate(layers):
            self.add_module(f'layer_{i}', layer)

    def forward(self, x):
        """Feed ``x`` through every layer and return the final output.

        Defined as ``forward`` (not ``__call__``) so that calling the model
        still goes through ``nn.Module.__call__`` and its hooks.
        """
        for layer in self.layers:
            x = layer(x)
        return x

In [26]:
# Wrap the externally built layer list in the refactored Model
learner = Model(layers)

The `add_module` method of `nn.Module` registers each layer as a submodule, which gives us automatic access to the model's parameters:

In [28]:
# Calling parameters() returns a generator; the bare attribute is the bound
# method, whose repr lists the registered layers.
learner.parameters(), learner.parameters

(<generator object Module.parameters at 0x7ff0b0754a50>,
 <bound method Module.parameters of Model(
   (layer_0): Linear(in_features=784, out_features=590, bias=True)
   (layer_1): ReLU()
   (layer_2): Linear(in_features=590, out_features=397, bias=True)
   (layer_3): ReLU()
   (layer_4): Linear(in_features=397, out_features=203, bias=True)
   (layer_5): ReLU()
   (layer_6): Linear(in_features=203, out_features=10, bias=True)
 )>)

In [31]:
# So we can refactor our training loop like so :

bs = 64
EPOCHS = 4
lr = 0.3
n_train = len(train)  # dataset size read from the data, not hard-coded

for epoch in range(EPOCHS):
    # ceil(n_train / bs) batches; the last one may hold fewer than bs rows
    for i in range((n_train-1)//bs + 1):
        # Get the batch
        start_i = i*bs
        end_i = start_i+bs
        xbatch = train[start_i:end_i]
        ybatch = y_train[start_i:end_i]

        # Model
        output = learner(xbatch)
        loss = nn.functional.cross_entropy(output, ybatch)

        # Gradient descent
        loss.backward()
        # the parameter update itself must not be tracked by autograd
        with torch.no_grad():
            for p in learner.parameters():
                p -= p.grad * lr
                # reset: backward() accumulates into .grad
                p.grad.zero_()
    # loss of the last mini-batch of this epoch
    print(loss)

tensor(0.0001, grad_fn=<NllLossBackward>)
tensor(9.4050e-05, grad_fn=<NllLossBackward>)
tensor(6.2602e-05, grad_fn=<NllLossBackward>)
tensor(8.5853e-05, grad_fn=<NllLossBackward>)


In [33]:
# So with self.add_module() in the Model Class,
# we managed to refactor this part of our training loop:

# for l in learner.layers:
#     if hasattr(l, 'weight'):
#         l.weight -= l.weight.grad * lr
#         l.bias   -= l.bias.grad   * lr
#         l.weight.grad.zero_()
#         l.bias  .grad.zero_()

# into:

# for p in learner.parameters():
#     p -= p.grad * lr
#     p.grad.zero_()



We added code to Model() to call add_module, but in fact everything it does is already provided by nn.Sequential.
(We'll wrap the model creation in a function to avoid duplicating it every time we refactor the code):

In [37]:
def get_linear():
    """Return a new, freshly initialised 784-590-397-203-10 ReLU network.

    The model is an ``nn.Sequential`` of four Linear layers with a ReLU
    after each of the first three; the last layer has no activation.
    """
    return nn.Sequential(
        nn.Linear(784, 590), nn.ReLU(),
        nn.Linear(590, 397), nn.ReLU(),
        nn.Linear(397, 203), nn.ReLU(),
        nn.Linear(203, 10),
    )

In [39]:
learner = get_linear()

bs = 64
EPOCHS = 4
lr = 0.3
n_train = len(train)  # dataset size read from the data, not hard-coded

for epoch in range(EPOCHS):
    # ceil(n_train / bs) batches; the last one may hold fewer than bs rows
    for i in range((n_train-1)//bs + 1):
        # get batch
        start_i = i*bs
        end_i = start_i+bs
        xbatch = train[start_i:end_i]
        ybatch = y_train[start_i:end_i]

        # model
        output = learner(xbatch)
        loss = nn.functional.cross_entropy(output, ybatch)

        # step
        loss.backward()
        # the parameter update itself must not be tracked by autograd
        with torch.no_grad():
            for p in learner.parameters():
                p -= p.grad * lr
                # reset: backward() accumulates into .grad
                p.grad.zero_()
    # loss of the last mini-batch of this epoch
    print(loss)

tensor(0.0235, grad_fn=<NllLossBackward>)
tensor(0.0051, grad_fn=<NllLossBackward>)
tensor(0.0012, grad_fn=<NllLossBackward>)
tensor(0.0009, grad_fn=<NllLossBackward>)


## iterating through the data

This part is also a bit messy

In [40]:
# for i in range((60000-1)//bs + 1):
#     start_i = i*bs
#     end_i = start_i+bs
#     xbatch = train[start_i:end_i]
#     ybatch = y_train[start_i:end_i]

### Dataset and Dataloader classes

In [47]:
class Dataset:
    """Pair inputs `x` with targets `y` behind a single indexable object.

    Indexing (with an int or a slice) returns the tuple
    ``(x[i], y[i])``; the length is the length of `x`.
    """

    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, i):
        return self.x[i], self.y[i]

In [42]:
# Bundle the training inputs and their labels into one indexable object
my_data = Dataset(train, y_train)

In [43]:
# with this, we can get the xbatch and ybatch in one line:
xbatch, ybatch = my_data[0:64]

# But we would still have to calculate the indices each time:
# for i in range((n-1)//bs + 1):
#    start_i = i*bs
#    end_i = start_i+bs

# ideally we'd want a one-liner like this one:

# for xb, yb in data:

In [46]:
class DataLoader():
    """Iterate over a dataset in consecutive mini-batches.

    Args:
        data: object supporting ``len()`` and slice indexing.
        bs: batch size; the last batch is shorter when ``len(data)`` is
            not a multiple of ``bs``.
    """

    def __init__(self, data, bs):
        self.data, self.bs = data, bs

    def __iter__(self):
        for i in range(0, len(self.data), self.bs):
            # slice with the loader's own batch size, not a global `bs`
            yield self.data[i:i+self.bs]

In [48]:
# Iterate over my_data in batches of 64
train_dl = DataLoader(my_data, 64)

In [51]:
# Look at the first batch only, to check the shapes
for xb, yb in train_dl:
    print(xb.shape, yb.shape)
    break
# if you remove the break and scroll down,
# you'll see the last batch doesn't hold 64 samples but 32
# (60000 % 64 == 32).
# we'll take a closer look at this later

torch.Size([64, 784]) torch.Size([64])


In [53]:
# Simplified training loop:
# (EPOCHS and lr are reused from the earlier training cells)
learner = get_linear()
my_data = Dataset(train, y_train)
train_dl = DataLoader(data=my_data, bs=64)

for epoch in range(EPOCHS):
    # the DataLoader hands us (inputs, labels) one batch at a time
    for xb, yb in train_dl:
        # Model
        output = learner(xb)
        loss = nn.functional.cross_entropy(output, yb)

        # Gradient descent
        loss.backward()
        # the parameter update itself must not be tracked by autograd
        with torch.no_grad():
            for p in learner.parameters():
                p -= p.grad * lr
                # reset the gradient before the next backward()
                p.grad.zero_()
    # loss of the last mini-batch of this epoch
    print(loss)

tensor(0.0484, grad_fn=<NllLossBackward>)
tensor(0.0073, grad_fn=<NllLossBackward>)
tensor(0.0018, grad_fn=<NllLossBackward>)
tensor(0.0010, grad_fn=<NllLossBackward>)


## Update the parameters using a class

Now we want to simplify the training loop further by getting rid of this loop:

In [55]:
# with torch.no_grad():
#     for p in learner.parameters():
#         p -= p.grad * lr
#         p.grad.zero_()

In [62]:
class Optimizer():
    """Plain gradient descent: ``p <- p - lr * p.grad``.

    Args:
        parameters: iterable of tensors to optimise. It is materialised
            into a list, so a generator such as ``model.parameters()``
            can be passed directly.
        lr: learning rate.

    Parameters whose ``.grad`` is ``None`` (never reached by a backward
    pass, e.g. frozen or unused ones) are skipped rather than crashing.
    """

    def __init__(self, parameters, lr):
        self.parameters, self.lr = list(parameters), lr

    def step(self):
        """Update every parameter in place using its current gradient."""
        # the update itself must not be tracked by autograd
        with torch.no_grad():
            for p in self.parameters:
                if p.grad is None:
                    continue
                p -= p.grad * self.lr

    def zero_grad(self):
        """Reset all existing gradients to zero, in place."""
        for p in self.parameters:
            if p.grad is not None:
                p.grad.zero_()

In [None]:
EPOCHS = 4
learner = get_linear()
# The optimizer receives the parameters once, plus a learning rate of 0.4
opt = Optimizer(learner.parameters(), 0.4)
my_data = Dataset(train, y_train)
train_dl = DataLoader(data=my_data, bs=64)

for epoch in range(EPOCHS):
    for xb, yb in train_dl:
        output = learner(xb)
        loss = nn.functional.cross_entropy(output, yb)
        loss.backward()
        # update the weights, then clear the gradients for the next batch
        opt.step()
        opt.zero_grad()
    # loss of the last mini-batch of this epoch
    print(loss)

That's it! We have a nice and compact training loop. I said we'd add convolutions but this notebook is getting long so we'll do that in the next notebook, along with "in-training" validation to monitor overfitting