# PyTorch

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data import sampler

import torchvision.datasets as dset
import torchvision.transforms as T

import numpy as np

from sklearn import datasets
import timeit
import matplotlib.pyplot as plt

from ipywidgets import interact, fixed
import ipywidgets 

PyTorch operates on various levels of abstraction:

* Tensor - something similar to `np.array` but can be stored on the GPU
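* Variable - a node in a computational graph. It holds a tensor as its value, as well as the gradient computed for that value. (Since PyTorch 0.4, `Variable` has been merged into `Tensor`; wrapping a tensor in `Variable` is still accepted but no longer required.)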
* Module - a neural network layer

### First, let's load data!

In [None]:
# Load the Iris dataset through scikit-learn's bundled loader.
iris = datasets.load_iris()
# Peek at the feature matrix and the class labels. In a notebook only the value
# of the last expression in a cell is echoed, so just the targets are displayed.
iris['data']
iris['target']

In [None]:
# Wrap the Iris features (as floats) and class labels (as integers) for autograd.
# Neither needs a gradient: they are inputs, not parameters.
# We'll train on the whole dataset - don't ever do that - but for illustrating
# behaviour it's good enough!
X, y = (
    Variable(torch.FloatTensor(iris['data']), requires_grad=False),
    Variable(torch.LongTensor(iris['target']), requires_grad=False),
)

Below, right after a small accuracy helper, is an example of a custom autograd function - you can use them to define your own operations!

In [None]:
# a helper function to measure accuracy
def accuracy(logits, y):
    """Return the fraction of predictions that match the labels.

    Args:
        logits: array-like of predicted class indices. Despite the name these
            are already-decoded class ids, not raw scores - they are compared
            to ``y`` element-wise.
        y: array-like of true class indices, same shape as ``logits``.

    Returns:
        A Python float in [0, 1], or ``nan`` when ``y`` is empty.
    """
    predicted = np.asarray(logits)
    expected = np.asarray(y)
    if expected.size == 0:
        # No samples: the ratio is undefined; avoid a divide-by-zero warning.
        return float('nan')
    # Dividing the hit count by ``y.shape`` (a tuple) produced a 1-element
    # array; the mean of the boolean mask is the scalar that was intended.
    return float(np.mean(predicted == expected))

In [None]:
class MyReLU(torch.autograd.Function):
    """ReLU written as a custom autograd function.

    ``forward`` clamps negative entries to zero; ``backward`` lets the incoming
    gradient through wherever the forward input was not negative.
    """

    @staticmethod
    def forward(ctx, input):
        """Return ``input`` with every negative entry replaced by zero."""
        # The sign of the input is needed again in backward, so stash it.
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient of the loss with respect to the forward input."""
        (forward_input,) = ctx.saved_tensors
        # masked_fill returns a new tensor, leaving grad_output untouched:
        # the gradient is zeroed exactly where the forward input was negative.
        return grad_output.masked_fill(forward_input < 0, 0)


# Function objects are invoked through ``apply``; expose it under a short name.
relu = MyReLU.apply


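PyTorch works on dynamic computational graphs: the graph is rebuilt from scratch on every forward pass, as the operations are executed. This can be slower than compiling a static graph once, but it allows for nice things such as ordinary Python loops and conditionals inside the model.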

In [None]:
dtype = torch.FloatTensor

# Network sizes: input features, hidden units, output classes.
D_in, H, D_out = 4, 10, 3

X_t = X.type(dtype)
y_t = y.type(dtype)
# CrossEntropyLoss needs integer class indices; convert once, outside the loop.
y_idx = y_t.long()
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

loss_fn = nn.CrossEntropyLoss()

learning_rate = 1e-2

for t in range(500):

    # Forward pass: linear -> custom ReLU -> linear, one score per class.
    y_pred = relu(X_t @ w1) @ w2

    loss = loss_fn(y_pred, y_idx)

    # Use autograd to compute the gradients of the loss w.r.t. w1 and w2.
    loss.backward()

    # Plain gradient-descent step. no_grad keeps the in-place updates from
    # being recorded by autograd.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Gradients accumulate across backward() calls, so clear them by hand.
        w1.grad.zero_()
        w2.grad.zero_()

    # Report progress every 50 steps; the accuracy is only computed when it is
    # actually printed.
    if t % 50 == 0:
        # Index of the highest score per row = predicted class.
        predicted = torch.topk(y_pred, 1)[1].detach().cpu().numpy().flatten()
        acc = accuracy(predicted, y_idx.cpu().numpy())
        # loss is a 0-dim tensor: .item() extracts the Python number
        # (indexing it with [0] raises on current PyTorch).
        print(t, loss.item(), acc)


## Let's train a network on a more serious dataset - CIFAR-10 !

But first, let's load the data (`ToTensor` already rescales the pixel values to [0, 1]!). PyTorch has utilities for that as well. How cool is that?

In [None]:
class ChunkSampler(sampler.Sampler):
    """Sampler that walks through a contiguous run of indices, in order.

    Args:
        num_samples: how many consecutive indices to produce.
        start: the first index produced (defaults to 0).
    """

    def __init__(self, num_samples, start=0):
        self.start = start
        self.num_samples = num_samples

    def __len__(self):
        return self.num_samples

    def __iter__(self):
        first = self.start
        stop = first + self.num_samples
        return iter(range(first, stop))

# The CIFAR-10 training split is divided by position: the first NUM_TRAIN
# images are used for training and the NUM_VAL images after them for validation.
NUM_TRAIN = 49000
NUM_VAL = 1000

# Training data: indices [0, NUM_TRAIN) of the training split.
cifar10_train = dset.CIFAR10(
    '.', train=True, download=True, transform=T.ToTensor()
)
loader_train = DataLoader(
    cifar10_train,
    batch_size=64,
    sampler=ChunkSampler(num_samples=NUM_TRAIN, start=0),
)

# Validation data: indices [NUM_TRAIN, NUM_TRAIN + NUM_VAL) of the same split.
cifar10_val = dset.CIFAR10(
    '.', train=True, download=True, transform=T.ToTensor()
)
loader_val = DataLoader(
    cifar10_val,
    batch_size=64,
    sampler=ChunkSampler(num_samples=NUM_VAL, start=NUM_TRAIN),
)

# Test data: the whole held-out split, in its default order.
cifar10_test = dset.CIFAR10(
    '.', train=False, download=True, transform=T.ToTensor()
)
loader_test = DataLoader(cifar10_test, batch_size=64)

What's inside the dataset?

In [None]:
# Take just the first batch of images. Iterating the loader lazily avoids
# decoding the entire training set only to keep batch 0; the sampler is
# sequential, so this is the same batch as before.
cifar = next(iter(loader_train))[0].numpy()
cifar.shape

In [None]:
def show_cifar(i):
    """Print the index ``i`` and display the i-th image of the ``cifar`` batch."""
    print(i)
    # Move axis 0 of the image to the end before plotting
    # (presumably channels-first -> channels-last, which imshow expects).
    image = np.transpose(cifar[i], (1, 2, 0))
    plt.imshow(image)
    plt.show()

# Let the slider cover exactly the images in the loaded batch rather than
# assuming the batch holds 64 of them.
interact(show_cifar, i=ipywidgets.IntSlider(min=0, max=len(cifar) - 1))

In [None]:
def reset(m):
    """Call ``m.reset_parameters()`` if ``m`` has such an attribute; otherwise do nothing."""
    if not hasattr(m, 'reset_parameters'):
        return
    m.reset_parameters()

In PyTorch you can not only use pre-implemented modules - you can also implement your own. The only thing you need to do is implement the forward pass; autograd takes care of the backward pass.

In [None]:
class Flatten(nn.Module):
    """Collapse every dimension after the first into a single one.

    An input of shape (N, ...) becomes (N, D), where D is the product of the
    remaining dimensions - e.g. (N, C, H, W) -> (N, C * H * W). Any input with
    at least one dimension is accepted, not only 4-D image batches.
    """

    def forward(self, x):
        # Keep the leading (batch) dimension; -1 lets view() infer the rest.
        return x.view(x.size(0), -1)

And this is an easy way to define a model:

Note that you could also create non-sequential connections (like Inception layers), for example by implementing your own modules.

In [None]:
def _conv_block(in_channels, out_channels, stride):
    """Return the layers of one unit: 3x3 conv -> PReLU -> batch norm -> dropout."""
    return [
        nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1),
        nn.PReLU(),
        nn.BatchNorm2d(out_channels),
        nn.Dropout(p=0.3),
    ]


# (in_channels, out_channels, stride) for each conv unit, in network order.
# Three stages of three units; the last unit of each stage uses stride 2.
_CONV_SPECS = [
    (3, 32, 1), (32, 32, 1), (32, 32, 2),
    (32, 64, 1), (64, 64, 1), (64, 64, 2),
    (64, 128, 1), (128, 128, 1), (128, 128, 2),
]

# The classifier expects 2048 flattened features per image, i.e. 128 channels
# times a 4x4 map if the input is 32x32.
model_base = nn.Sequential(
    *(layer for spec in _CONV_SPECS for layer in _conv_block(*spec)),
    Flatten(),
    nn.Linear(2048, 10),
)


Before we train the model, let's see how fast it is!

In [None]:
# Report whether CUDA can be used. The tensor types below are chosen
# regardless of the answer.
print(torch.cuda.is_available())
cpu_dtype = torch.FloatTensor
gpu_dtype = torch.cuda.FloatTensor
# From here on the default ``dtype`` is the GPU one.
dtype = gpu_dtype

In [None]:
import copy

# Module.type() casts in place and returns the module itself, so model_cpu is
# the same object as model_base. The GPU model is built from a deep copy so the
# two models do not share parameters.
model_cpu = model_base.type(cpu_dtype)
model_gpu = copy.deepcopy(model_base).type(gpu_dtype)

# One random batch of 64 inputs of shape 3x32x32, fed to both models so the
# two timings are measured on identical data.
x = torch.randn(64, 3, 32, 32).type(cpu_dtype)
x_var = Variable(x)
# x_gpu is the GPU copy of that same batch (it used to be a second,
# never-used random tensor while x_var_gpu was built from x).
x_gpu = x.type(gpu_dtype)
x_var_gpu = Variable(x_gpu)

In [None]:
%%timeit 
# Time one forward pass of model_cpu on the x_var batch.
ans = model_cpu(x_var)

In [None]:
%%timeit 
torch.cuda.synchronize() # Make sure there are no pending GPU computations
ans = model_gpu(x_var_gpu)        # Feed it through the model! 
torch.cuda.synchronize() # Wait for the forward pass to finish so its full run time is measured

Now let's have fun with the model and train it!

In [None]:
# Seed the CUDA random number generator before the parameters are
# re-initialised below.
torch.cuda.random.manual_seed(2137)

dtype = gpu_dtype

# Module.type() casts in place and returns the same module, so ``model`` and
# ``model_base`` refer to one object.
model = model_base.type(dtype)
# Module.apply() calls ``reset`` on every submodule and on the model itself;
# each one that defines reset_parameters() gets it called.
model.apply(reset)

loss_fn = nn.CrossEntropyLoss().type(dtype)
# Created after the cast so the optimizer is given the parameters as they are
# after the cast.
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [None]:
# Train for 15 passes over the training loader.
for i in range(15):
    print('EPOCH', i)
    # Training mode (dropout active, batch norm using batch statistics).
    # Nothing inside the loop leaves it, so setting it once per epoch suffices.
    model.train()
    for e, (x, y) in enumerate(loader_train):
        x_var = Variable(x.type(gpu_dtype))
        y_var = Variable(y.type(gpu_dtype).long())
        y_pred = model(x_var)
        loss = loss_fn(y_pred, y_var)
        # Report every 100th batch. The predictions are only copied back to
        # the CPU when they are printed, which avoids a GPU->CPU transfer on
        # every single step.
        if (e % 100) == 0:
            logits = torch.topk(y_pred, 1)[1].data.cpu().numpy().flatten()
            # loss is a 0-dim tensor: .item() extracts the Python number
            # (indexing it with [0] raises on current PyTorch).
            print(e, loss.item(), accuracy(logits, y_var.data.cpu().numpy()))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

How accurate is our model?

In [None]:
# Evaluation mode (dropout off, batch norm using running statistics); set
# once, since nothing below switches it back.
model.eval()
for loader in [loader_train, loader_val, loader_test]:
    correct = 0
    total = 0
    # no_grad() replaces the removed ``volatile=True`` flag: no autograd graph
    # is built during inference.
    with torch.no_grad():
        for (x, y) in loader:
            x_var = Variable(x.type(gpu_dtype))
            y_pred = model(x_var)
            # Index of the highest score per row = predicted class.
            predicted = torch.topk(y_pred, 1)[1].cpu().numpy().flatten()
            labels = y.cpu().numpy()
            # Count hits and samples instead of averaging per-batch
            # accuracies: the last batch can be smaller, and an unweighted
            # mean of batch accuracies would over-weight it.
            correct += int((predicted == labels).sum())
            total += labels.shape[0]

    print('accuracy:', correct / total if total else float('nan'))