In [1]:
%load_ext pycodestyle_magic
%flake8_on

In [2]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from torchvision.datasets import MNIST
import torch.nn.functional as F

In [3]:
# Keep the torchvision dataset objects under their own names so that
# `trainset` / `testset` only ever refer to the image tensors.
mnist_train = MNIST('../', download=True, train=True)
mnist_test = MNIST('../', download=True, train=False)

y_trainset = mnist_train.targets
y_testset = mnist_test.targets

# this time, we resize the data to have directly one channel for convolutions
# (-1 lets PyTorch infer the number of images instead of hard-coding it)
trainset = mnist_train.data.reshape(-1, 1, 28, 28).to(torch.float32)
testset = mnist_test.data.reshape(-1, 1, 28, 28).to(torch.float32)

# normalize: both splits use the mean / std of the training images
m, s = trainset.mean(), trainset.std()
trainset = (trainset - m) / s
testset = (testset - m) / s

# Import (not define) useful classes
So far we've been defining Dataset, DataLoader and Optimizer ourselves. That was mainly an exercise in recreating them from scratch to get a deep understanding of what exactly they do. But we can now import them from PyTorch.

In [4]:
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split

from torch.optim import SGD

In [5]:
def accuracy(output, target):
    """Return the fraction of rows of `output` whose arg-max equals `target`.

    The arg-max is taken along dim 1 of `output`; the result is a 0-dim
    float tensor.
    """
    predicted = output.argmax(dim=1)
    hits = predicted == target
    return hits.float().mean()

In [6]:
# Still, we've got to create our own Dataset Class inheriting from Dataset
class MNIST_Dataset(Dataset):
    """Dataset that pairs `x_tensor[idx]` with `y_tensor[idx]`.

    The length of the dataset is the length of `x_tensor`.
    """

    def __init__(self, x_tensor, y_tensor):
        self.x, self.y = x_tensor, y_tensor

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        sample, label = self.x[idx], self.y[idx]
        return sample, label

In [7]:
# The first 50,000 samples are used for training, the rest for validation
x_train, y_train = trainset[:50000], y_trainset[:50000]
x_valid, y_valid = trainset[50000:], y_trainset[50000:]

train = MNIST_Dataset(x_train, y_train)
valid = MNIST_Dataset(x_valid, y_valid)

In [8]:
# Hyper-parameters
EPOCHS = 5  # number of passes over the training data
bs = 64  # mini-batch size
lr = 0.05  # learning rate handed to the optimizer
loss_func = F.cross_entropy

# Only the training batches are shuffled; validation keeps a fixed order
train_dl = DataLoader(train, batch_size=bs, shuffle=True)
valid_dl = DataLoader(valid, batch_size=bs, shuffle=False)

In [9]:
# Define model


class Lambda(nn.Module):
    """Wrap a plain callable in an `nn.Module`.

    `forward` simply applies the stored callable to its input, which lets
    an ordinary function be placed inside containers such as
    `nn.Sequential`.
    """

    def __init__(self, func):
        super(Lambda, self).__init__()
        self.func = func

    def forward(self, x):
        wrapped = self.func
        return wrapped(x)


def flatten(x):
    """Return a view of `x` that keeps dim 0 and merges all other dims."""
    batch_size = x.size(0)
    return x.view(batch_size, -1)

# Note: this architecture is absolutely not optimal
# Each stride-2 convolution roughly halves the feature map:
# 28 -> 14 -> 7 -> 4 -> 2 -> 1
model = nn.Sequential(
    nn.Conv2d(1, 8, kernel_size=3, stride=2, padding=1),  # bs * 8 * 14 * 14
    nn.ReLU(),
    nn.Conv2d(8, 16, kernel_size=3, stride=2, padding=1),  # bs * 16 * 7 * 7
    nn.ReLU(),
    nn.Conv2d(16, 32, kernel_size=3, stride=2, padding=1),  # bs * 32 * 4 * 4
    nn.ReLU(),
    nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1),  # bs * 64 * 2 * 2
    nn.ReLU(),
    nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1),  # bs * 64 * 1 * 1
    nn.AdaptiveAvgPool2d(output_size=1),
    Lambda(flatten),  # bs * 64
    nn.Linear(in_features=64, out_features=10),  # bs * 10
)

opt = SGD(model.parameters(), lr=lr)

In [10]:
%%time
# One pass over train_dl per epoch, one optimizer step per mini-batch.
for epoch in range(EPOCHS):
    for xb, yb in train_dl:
        out = model(xb)
        loss = loss_func(out, yb)
        loss.backward()
        opt.step()
        # Gradients accumulate across backward() calls, so reset them
        opt.zero_grad()
    # Report the loss of the epoch's last mini-batch as a plain float,
    # like the GPU loop does (printing the tensor also dumps its grad_fn).
    print(loss.item())

tensor(0.0443, grad_fn=<NllLossBackward>)
tensor(0.0432, grad_fn=<NllLossBackward>)
tensor(0.2128, grad_fn=<NllLossBackward>)
tensor(0.1608, grad_fn=<NllLossBackward>)
tensor(0.0003, grad_fn=<NllLossBackward>)
CPU times: user 4min 9s, sys: 2min 57s, total: 7min 7s
Wall time: 39 s


# Using the GPU

Using the GPU to speed up computations requires... well, a GPU. Let's make sure we have one at our disposal:

In [11]:
# True when PyTorch can see at least one usable CUDA device
torch.cuda.is_available()

True

In [12]:
# Index of the CUDA device that is currently selected
torch.cuda.current_device()

0

We've got one. Now, we need to put both the model AND the data on the GPU. Putting a model (or a tensor) on the GPU in PyTorch is as simple as calling `.cuda()` on it:

In [13]:
# Moves the model's parameters to the GPU; the cell displays the returned model
model.cuda()

Sequential(
  (0): Conv2d(1, 8, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (1): ReLU()
  (2): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (3): ReLU()
  (4): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (5): ReLU()
  (6): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (7): ReLU()
  (8): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (9): AdaptiveAvgPool2d(output_size=1)
  (10): Lambda()
  (11): Linear(in_features=64, out_features=10, bias=True)
)

Now, we've put the model on the GPU, but not the data. Let's see what happens if we try to call the model on a batch that is still on the CPU:

In [14]:
# xb is left over from the last iteration of the training loop above, so it
# is still a CPU tensor (here the final, smaller batch of the epoch).
print(xb.shape)
# model(xb)  # Uncomment this and you'll see the following error:

torch.Size([16, 1, 28, 28])


```python 
RuntimeError: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same
```

We have an input that is a regular CPU tensor (torch.FloatTensor), while the weights are of type torch.cuda.FloatTensor: both must live on the same device.

In [15]:
# Copy the batch to the GPU first so that input and weights are on the same device
model(xb.cuda())  # This will work

tensor([[  7.0674,  -6.0264,  -2.2024, -12.0934,   1.5402,   1.7699,  21.5525,
         -10.4638,   2.1530,  -6.8926],
        [ -5.4994,   1.5802,  -2.6634,  12.5096,  -0.1541,   2.5668,  -8.0999,
          -2.3327,   0.0724,   1.2808],
        [ -0.3200,  -0.1592,  -2.8402,  -3.1717,  -1.4508,   5.8560,  14.1711,
          -9.7040,   2.7078,  -7.6999],
        [ -7.4451,  -9.6124,  -6.1428,  10.0985,   6.5370,   0.6794, -14.1489,
           3.3726,   2.3676,  19.9739],
        [ -4.0804,  -1.2109,   3.3527,  -6.3227,  19.8378,  -6.0988,   0.3382,
           2.5081,  -7.7397,  -0.7231],
        [ -6.9855,  12.6681,   1.7988,   0.5260,   2.5971, -10.6724,  -4.8678,
           5.8919,   0.5415,  -1.1746],
        [ -5.8864,   1.2687,   0.3832,  10.3150,  -0.0888,   1.3408,  -2.2453,
           0.1404,  -3.1439,  -5.0681],
        [ -6.5852,   0.6337,  -4.2944,  -4.1017,  13.5828,  -3.3877,  -0.6945,
           2.7353,   0.1108,   1.9531],
        [ -2.6542,   6.7018,  13.7193,   6.2078,

In [66]:
%%time
model.cuda()  # We changed this: the model's parameters go to the GPU
for epoch in range(EPOCHS):
    for xb, yb in train_dl:
        xb_gpu = xb.cuda()  # and this: the input batch is copied to the GPU
        yb_gpu = yb.cuda()  # and this: so are the labels
        out = model(xb_gpu)
        loss = loss_func(out, yb_gpu)
        loss.backward()
        opt.step()
        opt.zero_grad()
    # Loss of the epoch's last mini-batch, as a plain Python float
    print(loss.item())

0.0025713443756103516
0.00010836124420166016
0.015048742294311523
0.0007213354110717773
0.003766000270843506
CPU times: user 9.97 s, sys: 2.3 s, total: 12.3 s
Wall time: 7.35 s


Down to about 7s of wall time, vs 39s on the CPU earlier. That's more than 5 times faster!
Of course, this improvement will depend on a number of factors: the complexity of the model, your GPU, etc. But a five-fold improvement with such a simple architecture shows that using a GPU when you have access to one is simply a no-brainer.

However, let's take a look at the losses:
* 0.0025
* 0.0001
* 0.0150  (ouch, 150 times the previous value!)
* 0.0007
* 0.0037

The loss landscape is very bumpy!
We could smooth that out with batch normalization. We'll do that in the next notebook.