In [1]:
%load_ext pycodestyle_magic
%flake8_on

In [2]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from torchvision.datasets import MNIST
import torch.nn.functional as F

In [3]:
# Fetch the MNIST train / test splits (downloaded into '../' when missing).
trainset = MNIST('../', download=True, train=True)
testset = MNIST('../', download=True, train=False)

# Keep the labels before the names below are rebound to image tensors.
y_trainset = trainset.targets
y_testset = testset.targets

# Insert a channel axis so the images are (N, 1, H, W), as Conv2d expects.
# unsqueeze(1) avoids hard-coding the number of samples in each split.
trainset = trainset.data.unsqueeze(1).to(torch.float32)
testset = testset.data.unsqueeze(1).to(torch.float32)

# Normalize both splits with the mean / std of the *training* images only,
# so that no statistic computed on the test set leaks into preprocessing.
m, s = trainset.mean(), trainset.std()
trainset = (trainset - m) / s
testset = (testset - m) / s

# Import (not define) useful classes
So far we've been defining `Dataset`, `DataLoader` and the optimizer ourselves. That was mainly an exercise in recreating them from scratch to get a deep understanding of what exactly they do. But we can now import them from PyTorch.

In [4]:
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split

from torch.optim import SGD

In [5]:
def accuracy(output, target):
    """Return the fraction of rows whose highest score matches the label.

    Args:
        output: tensor of per-class scores, one row per sample.
        target: tensor of class indices, one per sample.

    Returns:
        A 0-dim float tensor holding the mean of the per-sample hits.
    """
    predicted = torch.argmax(output, dim=1)
    hits = (predicted == target).float()
    return hits.mean()

In [6]:
# Still, we've got to create our own Dataset Class inheriting from Dataset
class MNIST_Dataset(Dataset):
    """Map-style dataset pairing a tensor of inputs with a tensor of labels.

    Args:
        x_tensor: inputs, indexed along their first dimension.
        y_tensor: labels, one per entry of ``x_tensor``.

    Raises:
        ValueError: if ``x_tensor`` and ``y_tensor`` differ in length.
    """

    def __init__(self, x_tensor, y_tensor):
        # Fail fast: __len__ is taken from x alone, so a length mismatch
        # would otherwise only show up as an IndexError (or silently
        # ignored labels) somewhere in the middle of an epoch.
        if len(x_tensor) != len(y_tensor):
            raise ValueError(
                "x_tensor and y_tensor must have the same length, "
                "got {} and {}".format(len(x_tensor), len(y_tensor))
            )
        self.x = x_tensor
        self.y = y_tensor

    def __getitem__(self, idx):
        """Return the ``(input, label)`` pair stored at position ``idx``."""
        return (self.x[idx], self.y[idx])

    def __len__(self):
        """Return the number of samples in the dataset."""
        return len(self.x)

In [7]:
# The first n_train samples are used for training; everything after them
# is held out for validation.
n_train = 50000

x_train, y_train = trainset[:n_train], y_trainset[:n_train]
x_valid, y_valid = trainset[n_train:], y_trainset[n_train:]

train = MNIST_Dataset(x_train, y_train)
valid = MNIST_Dataset(x_valid, y_valid)

In [8]:
# Hyper-parameters shared by the training cells below.
EPOCHS = 5
bs = 64  # mini-batch size
lr = 0.05  # SGD learning rate
loss_func = F.cross_entropy

# Training batches are reshuffled every epoch; validation order is fixed.
train_dl = DataLoader(dataset=train, batch_size=bs, shuffle=True)
valid_dl = DataLoader(dataset=valid, batch_size=bs, shuffle=False)

In [42]:
# Define model


class Lambda(nn.Module):
    """Adapter that lets a plain callable be used as a layer.

    Useful inside ``nn.Sequential``, which only accepts ``nn.Module``
    instances.

    Args:
        func: callable applied to the input in ``forward``.
    """

    def __init__(self, func):
        super().__init__()
        self.func = func

    def forward(self, x):
        """Apply the wrapped callable to ``x`` and return its result."""
        result = self.func(x)
        return result


def flatten(x):
    """Collapse every dimension after the first into a single one.

    Args:
        x: tensor whose first dimension is the batch dimension.

    Returns:
        A tensor of shape ``(x.shape[0], -1)``. It is a view of ``x`` when
        the memory layout allows it, otherwise a copy.
    """
    # reshape (not view) so that non-contiguous inputs, e.g. the result of
    # a permute or transpose, are handled instead of raising RuntimeError.
    return x.reshape(x.shape[0], -1)


# Every convolution is 3x3 with stride 2 and padding 1, which halves the
# spatial size (rounding up) at each layer: 28 -> 14 -> 7 -> 4 -> 2 -> 1.
conv_kw = dict(kernel_size=3, stride=2, padding=1)

model = nn.Sequential(
    nn.Conv2d(1, 8, **conv_kw),  # bs * 8 * 14 * 14
    nn.ReLU(),
    nn.Conv2d(8, 16, **conv_kw),  # bs * 16 * 7 * 7
    nn.ReLU(),
    nn.Conv2d(16, 32, **conv_kw),  # bs * 32 * 4 * 4
    nn.ReLU(),
    nn.Conv2d(32, 64, **conv_kw),  # bs * 64 * 2 * 2
    nn.ReLU(),
    nn.Conv2d(64, 64, **conv_kw),  # bs * 64 * 1 * 1
    nn.AdaptiveAvgPool2d(1),
    Lambda(flatten),  # bs * 64
    nn.Linear(64, 10),  # one score per digit class
)

# Plain SGD over all of the model's parameters.
opt = SGD(model.parameters(), lr=lr)

In [20]:
%%time
# Plain mini-batch SGD loop: EPOCHS full passes over train_dl.
# Neither the model nor the batches are moved to another device here.
for i in range(EPOCHS):
    for xb, yb in train_dl:
        out = model(xb)
        loss = loss_func(out, yb)
        loss.backward()
        opt.step()
        # Reset the gradients so they do not accumulate into the next batch.
        opt.zero_grad()
    # Only the loss of the epoch's last mini-batch is reported.
    print(loss)

tensor(0.5283, grad_fn=<NllLossBackward>)
tensor(0.3140, grad_fn=<NllLossBackward>)
tensor(0.0176, grad_fn=<NllLossBackward>)
tensor(0.0214, grad_fn=<NllLossBackward>)
tensor(0.0208, grad_fn=<NllLossBackward>)
CPU times: user 2min 21s, sys: 1min 44s, total: 4min 5s
Wall time: 22.4 s


# Using the GPU

Using the GPU to speed up computations requires... well, a GPU. Let's make sure we have one at our disposal:

In [22]:
# True when PyTorch can use a CUDA device.
torch.cuda.is_available()

True

In [23]:
# Index of the currently selected CUDA device.
torch.cuda.current_device()

0

We've got one. Now we need to put both the model AND the data on the GPU. Moving a model to the GPU in PyTorch is as simple as:

In [43]:
# Move the model's parameters to the GPU; as the last expression of the
# cell, the returned model is also displayed.
model.cuda()

Sequential(
  (0): Conv2d(1, 8, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (1): ReLU()
  (2): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (3): ReLU()
  (4): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (5): ReLU()
  (6): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (7): ReLU()
  (8): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (9): AdaptiveAvgPool2d(output_size=1)
  (10): Lambda()
  (11): Linear(in_features=64, out_features=10, bias=True)
)

Now we've put the model on the GPU, but not the data. Let's see what happens if we try to run the model on a batch that is still on the CPU:

In [48]:
# xb is whatever batch the most recently run training loop left behind.
print(xb.shape)
# model(xb)  # Uncomment this and you'll see the following error:

torch.Size([16, 1, 28, 28])


```python 
RuntimeError: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same
```

The input is a regular CPU tensor (`torch.FloatTensor`), while the model's weights are now on the GPU (`torch.cuda.FloatTensor`). Both must live on the same device.

In [62]:
# Copy the batch to the GPU first so input and weights share a device.
model(xb.cuda())  # This will work

tensor([[ 0.0309, -0.0093, -0.0053,  0.0657, -0.0169, -0.0834, -0.0505,  0.1114,
          0.0616, -0.0684],
        [ 0.0314, -0.0130, -0.0088,  0.0577, -0.0210, -0.0833, -0.0528,  0.1109,
          0.0612, -0.0696],
        [ 0.0267, -0.0205, -0.0078,  0.0621, -0.0265, -0.0828, -0.0480,  0.1115,
          0.0635, -0.0695],
        [ 0.0279, -0.0074, -0.0069,  0.0646, -0.0171, -0.0816, -0.0511,  0.1132,
          0.0606, -0.0682],
        [ 0.0358, -0.0189, -0.0051,  0.0584, -0.0230, -0.0842, -0.0529,  0.1153,
          0.0612, -0.0711],
        [ 0.0293, -0.0105, -0.0056,  0.0629, -0.0160, -0.0823, -0.0498,  0.1131,
          0.0609, -0.0691],
        [ 0.0329, -0.0117, -0.0089,  0.0590, -0.0173, -0.0825, -0.0504,  0.1100,
          0.0618, -0.0706],
        [ 0.0264, -0.0122, -0.0070,  0.0658, -0.0160, -0.0809, -0.0512,  0.1173,
          0.0580, -0.0669],
        [ 0.0300, -0.0105, -0.0053,  0.0616, -0.0240, -0.0829, -0.0535,  0.1086,
          0.0652, -0.0690],
        [ 0.0336, -

In [66]:
%%time
# Same training loop as before, with the model and every batch on the GPU.
model.cuda()  # We changed this
for i in range(EPOCHS):
    for xb, yb in train_dl:
        # xb and yb themselves are not rebound; only copies go to the GPU.
        out = model(xb.cuda())  # and this
        loss = loss_func(out, yb.cuda())  # and this
        loss.backward()
        opt.step()
        # Reset the gradients so they do not accumulate into the next batch.
        opt.zero_grad()
    # .item() extracts the last mini-batch's loss as a plain Python number.
    print(loss.item())

0.0025713443756103516
0.00010836124420166016
0.015048742294311523
0.0007213354110717773
0.003766000270843506
CPU times: user 9.97 s, sys: 2.3 s, total: 12.3 s
Wall time: 7.35 s


Down to 7s, vs 22s earlier. That's 3 times faster!
Of course, this improvement will depend on a number of factors: the complexity of the model, your GPU, etc. But a threefold improvement with such a simple architecture shows that using a GPU when you have access to one is simply a no-brainer.