# PyTorch

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data import sampler

import torchvision.datasets as dset
import torchvision.transforms as T

import numpy as np

from sklearn import datasets
import timeit
import matplotlib.pyplot as plt

from ipywidgets import interact, fixed
import ipywidgets 

from load_cifar import load_cifar

PyTorch is a spiritual successor to Torch and is developed by Facebook. 

It operates on various levels of abstraction:

* Tensor - something similar to `np.array` but can be stored on the GPU
* Variable - a node in a computational graph. Holds a tensor as its value, as well as the gradient with respect to that value.
* Module - a neural network layer

We'll start by creating a simple, shallow model which we'll use to classify the Iris dataset ( https://archive.ics.uci.edu/ml/datasets/iris )

## First, let's load data!

In [None]:
# Load the Iris dataset bundled with scikit-learn.
iris = datasets.load_iris()
# Features, then class labels (only the last expression of a cell is displayed).
iris.data
iris.target

In [None]:
# We train on the whole dataset - don't ever do that - but it's good enough
# for illustrating behaviour!
# Features become floats, class labels become integer (long) indices.
X, y = (
    Variable(torch.FloatTensor(iris['data']), requires_grad=False),
    Variable(torch.LongTensor(iris['target']), requires_grad=False),
)

Below we define a small accuracy helper, followed by an example of a custom autograd function - you can use autograd functions to define your own operations!

In [None]:
# a helper function to measure accuracy
def accuracy(logits, y):
    """Return the fraction of predictions that equal the true labels.

    Despite its name, ``logits`` is compared element-wise with ``y``, so it
    must hold predicted class indices rather than raw scores.

    Args:
        logits: array-like of predicted class indices.
        y: array-like of true class indices, same shape as ``logits``.

    Returns:
        A scalar between 0 and 1.
    """
    # Take the mean of the boolean matches. Dividing the match count by
    # ``y.shape`` (a tuple) would broadcast and yield a one-element array.
    matches = np.asarray(logits) == np.asarray(y)
    return matches.mean()

In [None]:
class MyReLU(torch.autograd.Function):
    """ReLU written as a custom autograd function with a hand-coded gradient."""

    @staticmethod
    def forward(ctx, input):
        # Stash the pre-activation values; backward needs their sign.
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        (pre_activation,) = ctx.saved_tensors
        # Pass the upstream gradient through unchanged, except where the input
        # was negative: there the gradient is zero.
        return grad_output.masked_fill(pre_activation < 0, 0)


# Functional handle used in the forward pass below.
relu = MyReLU.apply


Whether variables are stored on CPU or GPU is decided by their type

In [None]:
# Report whether a CUDA device is visible.
print(torch.cuda.is_available())
cpu_dtype = torch.FloatTensor
gpu_dtype = torch.cuda.FloatTensor
# Default tensor type for the notebook: GPU when one is available, CPU otherwise.
# This choice must not be overwritten afterwards, or CPU-only machines break.
dtype = gpu_dtype if torch.cuda.is_available() else cpu_dtype

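PyTorch works on dynamic computational graphs. It means that the graph is rebuilt from scratch on every forward pass, as the operations are executed. It can be slower than a static graph, but allows for nice things such as ordinary Python loops and conditionals inside the model.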

The downside is that models don't infer dimensionality that easily. It can be a pain, especially when building more complex models.

In [None]:
# Layer sizes: 4 input features, 10 hidden units, 3 output classes.
D_in, H, D_out = 4, 10, 3

X_t = X.type(dtype)
y_t = y.type(dtype)
# These do not change between iterations, so compute them once: the integer
# class indices the loss expects, and a NumPy copy of the labels for accuracy.
y_idx = y_t.long()
y_np = y_t.data.cpu().numpy()

# Randomly initialised weights of the two layers, tracked by autograd.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

loss_fn = nn.CrossEntropyLoss()

learning_rate = 1e-2

for t in range(500):

    # Forward pass: two matrix multiplications with the custom ReLU in between.
    y_pred = relu(X_t @ w1) @ w2

    loss = loss_fn(y_pred, y_idx)

    # Use autograd to compute the backward pass.
    loss.backward()

    # Plain gradient-descent step, applied to .data so autograd does not track it.
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # Manually zero the gradients after updating weights (backward() accumulates).
    w1.grad.data.zero_()
    w2.grad.data.zero_()

    # Report progress every 50 steps; predictions are only needed when printing.
    if t % 50 == 0:
        logits = torch.topk(y_pred, 1)[1].data.cpu().numpy().flatten()
        acc = accuracy(logits, y_np)
        # loss is a 0-dim tensor: read it with .item(), indexing it with [0] fails.
        print(t, loss.item(), acc)

## Let's train a network on a more serious dataset - CIFAR-10 !

But first, let's load (already normalized!) data. PyTorch has utilities for that as well. How cool is that?

In [None]:
X_train, y_train, X_test, y_test = load_cifar()

# Unflatten to [?, 32, 32, 3], then move the 'channels' dimension ahead of height
# and width: in PyTorch images are shaped [?, 3, 32, 32].
# (assumes load_cifar returns NumPy arrays - TODO confirm)
X_train = X_train.reshape(-1, 32, 32, 3).transpose(0, 3, 1, 2)
X_test = X_test.reshape(-1, 32, 32, 3).transpose(0, 3, 1, 2)

# Wrap the images as float Variables ...
X_train = Variable(torch.FloatTensor(X_train), requires_grad=False)
X_test = Variable(torch.FloatTensor(X_test), requires_grad=False)
# ... and the labels as long (integer) Variables.
y_train = Variable(torch.LongTensor(y_train), requires_grad=False)
y_test = Variable(torch.LongTensor(y_test), requires_grad=False)


In [None]:
def random_batch(n=128, X=X_train, y=y_train, dtype=dtype):
    """Draw ``n`` random examples (with replacement) from ``X`` and ``y``.

    Args:
        n: number of examples in the batch.
        X: Variable holding the inputs; examples are indexed along dimension 0.
        y: Variable holding the matching labels.
        dtype: tensor type the batch is cast to (this selects CPU or GPU).

    Returns:
        A ``(X_var, y_var)`` pair of Variables; ``y_var`` is cast to long.
    """
    num_examples = X.size()[0]
    picks = np.random.randint(low=0, high=num_examples, size=n)
    # Index through NumPy, then wrap the selected rows back into tensors.
    batch_inputs = torch.FloatTensor(X.data.numpy()[picks])
    batch_labels = torch.LongTensor(y.data.numpy()[picks])
    return (
        Variable(batch_inputs).type(dtype),
        Variable(batch_labels).type(dtype).long(),
    )
    


What's inside the dataset?

In [None]:
def show_cifar(i):
    """Print the index ``i`` and display the i-th training image."""
    print(i)
    image_chw = X_train.data[i].cpu().numpy()
    # imshow wants channels last, so go from [3, H, W] back to [H, W, 3].
    plt.imshow(image_chw.transpose(1, 2, 0))
    plt.show()


# Slider over the first 64 training images.
interact(
    show_cifar,
    i=ipywidgets.IntSlider(min=0, max=63),
)

In [None]:
def reset(m):
    """Re-initialise ``m`` if it exposes ``reset_parameters``; otherwise do nothing."""
    if not hasattr(m, 'reset_parameters'):
        return
    m.reset_parameters()

In PyTorch you can not only use pre-implemented modules - you can also implement your own. The only thing to do is implement the forward pass. 

In [None]:
class Flatten(nn.Module):
    """Collapse every dimension after the first (batch) dimension into one."""

    def forward(self, x):
        """Return ``x`` viewed as ``[N, -1]``, where ``N`` is ``x.size(0)``."""
        # Only the batch size is needed; view() infers the flattened length.
        # This handles [N, C, H, W] images as well as any other batched shape.
        return x.view(x.size(0), -1)

And this is an easy way to define a model:

Note that you could also create non-sequential connections (like Inception layers) for example by implementing your own modules.

In [None]:
def _conv_block(in_channels, out_channels, stride):
    """Return the layers of one stage: 3x3 Conv -> PReLU -> BatchNorm -> Dropout."""
    return [
        nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1),
        nn.PReLU(),
        nn.BatchNorm2d(out_channels),
        nn.Dropout(p=0.3),
    ]


# Three groups of three conv stages (32, 64 and 128 channels). The last stage of
# each group uses stride 2, so a 32x32 input ends up 4x4 and the flattened
# vector has 128 * 4 * 4 = 2048 entries.
model_base = nn.Sequential(
    *[
        layer
        for in_channels, out_channels, stride in [
            (3, 32, 1), (32, 32, 1), (32, 32, 2),
            (32, 64, 1), (64, 64, 1), (64, 64, 2),
            (64, 128, 1), (128, 128, 1), (128, 128, 2),
        ]
        for layer in _conv_block(in_channels, out_channels, stride)
    ],
    Flatten(),
    nn.Linear(2048, 10),
)


Before we train the model, let's see how fast it is!

In [None]:
import copy

# Benchmark on independent copies. Module.type() converts the module in place and
# returns the same object, so without a copy model_cpu would just be another name
# for model_base.
model_cpu = copy.deepcopy(model_base).type(cpu_dtype)
model_gpu = copy.deepcopy(model_base).type(gpu_dtype)

# One random batch of 64 images, fed to both models so the timings are comparable.
x = torch.randn(64, 3, 32, 32).type(cpu_dtype)
x_var = Variable(x)
x_gpu = x.type(gpu_dtype)
x_var_gpu = Variable(x_gpu)

In [None]:
%%timeit 
# Time one forward pass of the benchmark batch through the CPU model.
ans = model_cpu(x_var)

In [None]:
%%timeit 
# Time one forward pass through the GPU model, synchronizing before and after it.
torch.cuda.synchronize() # Make sure there are no pending GPU computations before the forward pass
ans = model_gpu(x_var_gpu)        # Feed it through the model!
torch.cuda.synchronize() # Wait until the forward pass has finished on the GPU

Now let's have fun with the model and train it!

In [None]:
# Seed the random generator matching the tensor type in use.
if dtype == gpu_dtype:
    torch.cuda.random.manual_seed(2137)
else:
    torch.random.manual_seed(2137)

# Cast the model to the working dtype and re-initialise every layer that
# exposes reset_parameters.
model = model_base.type(dtype)
model.apply(reset)

# Cross-entropy loss, and Adam over all model parameters.
loss_fn = nn.CrossEntropyLoss().type(dtype)
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [None]:
# Put the model in training mode once; nothing inside the loop changes it.
model.train()
for i in range(1):
    print('EPOCH', i)
    # 1000 optimisation steps, each on a freshly sampled random batch.
    for e in range(1000):
        x_var, y_var = random_batch()
        y_pred = model(x_var)
        loss = loss_fn(y_pred, y_var.long())
        # Report loss and batch accuracy every 100 steps; predictions are only
        # needed when printing.
        if (e % 100) == 0:
            logits = torch.topk(y_pred, 1)[1].data.cpu().numpy().flatten()
            # loss is a 0-dim tensor: read it with .item(), indexing it with [0] fails.
            print(e, loss.item(), accuracy(logits, y_var.data.cpu().numpy()))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

How accurate is our model?

In [None]:
# Switch to evaluation mode: otherwise Dropout keeps dropping activations and
# BatchNorm normalises with the statistics of the test batch itself.
model.eval()
# Use the dtype the model lives in instead of forcing the GPU type.
x_var, y_var = random_batch(X=X_test, y=y_test, dtype=dtype)
y_pred = model(x_var)
# Predicted class = index of the largest score in each row.
logits = torch.topk(y_pred, 1)[1].data.cpu().numpy().flatten()
acc = accuracy(logits, y_var.data.cpu().numpy())
# NOTE: this is an estimate from a single random batch, not the whole test set.
print('accuracy:', acc)