# PyTorch fundamentals


https://pytorch.org/docs/stable/tensors.html

### Setup notebook

In [None]:
from __future__ import print_function
import torch
from torch.autograd import Variable
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### Tensors and operations

#### Tensors

In [None]:
# Legacy constructor with sizes: allocates an *uninitialized* 5x3 tensor, so
# its values are whatever happened to be in memory (possibly huge or NaN).
torch.Tensor(5, 3)
# Given a list, the same constructor treats it as data: a 1-D tensor [5., 3.]
torch.Tensor([5, 3])
# torch.tensor() builds a tensor from the NumPy array's data (it copies)
torch.tensor(np.array([[1, 2, 3], [4, 5, 6]]))

In [None]:
x = torch.rand(5, 3)  # 5x3 tensor of uniform random samples from [0, 1)
z = torch.zeros([2, 4], dtype=torch.int32)  # 2x4 tensor of 32-bit integer zeros
x  # not displayed: a notebook only echoes the last expression of a cell
z

In [None]:
x.size()

#### Operations

In [None]:
y = torch.rand(5, 3)
x + y

In [None]:
torch.add(x, y) == (x + y)

In [None]:
# `out=` must be an already-allocated tensor: passing None makes torch.add
# return a fresh tensor and leaves `result` as None. Pre-allocate a buffer
# with the same shape and dtype as `x` instead.
result = torch.empty_like(x)
torch.add(x, y, out=result)  # write the sum into the existing buffer
result

#### In-place operations
Any operation that mutates a tensor in-place is suffixed with an underscore (`_`), e.g. `add_`, `t_`, `copy_`.

In [None]:
y.add_(x)   # y <- y + x, modifying y in place
x.t_()      # transpose x in place ...
x.t_()      # ... and transpose again, restoring the original shape
x.copy_(y)  # overwrite x's contents with y's values
x
y

#### Indexing - numpy syntax

In [None]:
x[:, 1]

#### Numpy-tensor conversion and mutability

In [None]:
# Converting a torch Tensor to a numpy array and vice versa
a = torch.ones(5)
b = a.numpy()    # NumPy array that shares memory with the tensor `a`
c = np.copy(a)   # independent NumPy copy of the current values
a.add_(1)        # in-place add: visible through `b`, but not through `c`
a
b
c

#### Mutable

In [None]:
a = np.ones(5)
b = torch.from_numpy(a)  # shares memory with `a`: a change to one shows up in the other
np.add(a, 1, out=a)      # in-place update of the array ...
a
b.numpy()                # ... is therefore visible through the tensor as well

#### Immutable

In [None]:
a = np.ones(5)
b = torch.tensor(a)  # copies the data, so `b` is independent of `a`
np.add(a, 1, out=a)  # in-place update touches the array only
a
b.numpy()            # unchanged by the update above

##### `torch.tensor()` always copies data. If you have a Tensor data and just want to change its `requires_grad` flag, use `requires_grad_()` or `detach()` to avoid a copy. If you have a numpy array and want to avoid a copy, use `torch.from_numpy()`.

### Variable
**`autograd.Variable` is the central class of the package**. It wraps a Tensor and supports nearly all of the operations defined on it. Once you finish your computation you can call `.backward()` and have all the gradients computed automatically.

You can access the raw tensor through the **`.data`** attribute, while the gradient w.r.t. this variable is accumulated into **`.grad`**.

There’s one more class which is very important for autograd implementation - a `Function`.

**`Variable` and `Function`** are interconnected and build up an acyclic graph, that encodes a complete history of computation.  
Each variable has a **`.grad_fn`** attribute that references a `Function` that has created the `Variable` (except for Variables created by the user - their `grad_fn is None`).

If you want to compute the **derivatives**, you can call **`.backward()`** on a `Variable`.   
 - if `Variable` is a scalar, you don’t need to specify any arguments to `backward()`
 - if it has more elements, you need to specify a **`gradient`** argument that is a tensor of matching shape.

In [None]:
x = Variable(torch.ones(2, 2), requires_grad=True)
x
print(x)

In [None]:
# y was created as a result of an operation, so it has a grad_fn.
y = x + 2
print('y:', y)
print('y.data:', y.data)
print('y.grad: ', y.grad)  # nothing has been backpropagated at this point
print('y.grad_fn:', y.grad_fn)

print('x.data:', x.data)
print('x.grad: ', x.grad)
print('x.grad_fn', x.grad_fn)  # we've created x ourselves, so there is no creating Function

In [None]:
z = 3 * y**2
out = z.mean()
z
out

In [None]:
# gradient of x is None without backprop
print(x.grad)

#### Backpropagate

**`backward()`** propagates back the *loss* and works like generator next().
Differentiate the whole graph w.r.t. `out` and print the gradients with respect to z (∂out/∂z), y (∂out/∂y) and x (∂out/∂x).

out = mean(z) = 1/n * sum(z), with n = 4  
∂out/∂z = 1/n = 1/4  

z = 3 * y^2  
∂z/∂y = 6 * y  

y = x + 2  
∂y/∂x = 1  

∂out/∂x = ∂out/∂z * ∂z/∂y * ∂y/∂x = 1/4 * 6 * y * 1 = 1.5 * (x + 2)  
For x = 1: 1.5 * 3 = 4.5

In [None]:
out.backward()  # out is a scalar, so no gradient argument is needed
# z and y are intermediate (non-leaf) results; autograd does not keep their
# .grad by default, so expect None here.
print(z.grad)
print(y.grad)
print(x.grad) # x is a leaf created with requires_grad=True, so its gradient is stored

##### ATTENTION: By default, gradient computation flushes all the internal buffers contained in the graph, so when you want to do multiple backward passes over the same graph, you need to pass in **`retain_graph=True`** on every pass except the last one.

In [None]:
x = Variable(torch.ones(2, 2), requires_grad=True)
y = x + 2

# y is not a scalar, so backward() needs an upstream gradient of matching shape
loss = torch.ones(2, 2)

# The retain_graph flag prevents the internal buffers from being freed. Every
# backward pass that is followed by another one must set it; only the final
# pass may let the graph be released.
y.backward(loss, retain_graph=True)
y.backward(loss, retain_graph=True)
y.backward(loss)

# Gradients accumulate across passes: three passes of ones give 3 everywhere
print(x.grad)

#### Multiple forward props and one backprop

In [None]:
# Random leaf tensor whose gradient we want to inspect
x = Variable(torch.randn(3), requires_grad=True)
print(x)

# Run multiple forward passes: keep doubling y (and the bookkeeping factor i)
# until the norm of y reaches 2000
i, y = 1, x * 2
while y.data.norm() < 2000:
    i, y = i * 2, y * 2
print(i, y, y.grad_fn)

# y is not a scalar, so an explicit upstream gradient is handed to backward()
gradients = torch.FloatTensor([0.1, 1.0, 0.0001])

# differentiate the whole graph w.r.t. y given gradients
y.backward(gradients, retain_graph=True)

print(i, x.grad)

#### Profile

In [None]:
# use profile to see computational metrics
x = Variable(torch.randn(1, 1), requires_grad=True)

# Operations executed inside the context manager are recorded in `prof`
with torch.autograd.profiler.profile() as prof:
    y = x**2
    y.backward(retain_graph=True)
    
# NOTE: some columns were removed for brevity
print(prof)

## CNN

In [None]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class Net(nn.Module):
    """LeNet-style CNN: two conv + max-pool stages followed by three linear layers.

    Takes a (nSamples x 1 x Height x Width) batch and returns 10 values per
    sample. The 16*5*5 input size of fc1 corresponds to 32x32 input images.
    """

    def __init__(self):
        super(Net, self).__init__()
        # Convolutions: (input channels, output channels / feature maps,
        # square kernel size), i.e. 5 stands for a 5x5 kernel
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)

        # Fully connected layers, each an affine operation y = Wx + b
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run the batch through the network and return the raw fc3 outputs."""
        # Stage 1: conv -> ReLU -> max pooling over a (2, 2) window
        activated = F.relu(self.conv1(x))
        pooled = F.max_pool2d(activated, (2, 2))

        # Stage 2: same pattern; a square window can be given as a single int
        activated = F.relu(self.conv2(pooled))
        pooled = F.max_pool2d(activated, 2)

        # Flatten everything but the batch dimension (-1 lets view infer it)
        flat = pooled.view(-1, self.num_flat_features(pooled))

        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

    def num_flat_features(self, x):
        """Return the product of all dimensions of x except the first (batch) one."""
        count = 1
        for extent in x.size()[1:]:
            count *= extent
        return count

In [None]:
net = Net()
print(net)

In [None]:
params = list(net.parameters())
for param in params:
    print(param.size())  # one shape per parameter tensor; the first one is conv1's .weight

In [None]:
# learnable params
params[:1]

#### Input

The input to the forward is an `autograd.Variable` and so is the output. 

Note: Expected input size to this net(LeNet) is 32x32.  
To use this net on MNIST dataset, please resize the images from the dataset to 32x32.

In [None]:
inputs = Variable(torch.rand(1, 1, 32, 32))
print(inputs)

out = net(inputs)
print(out)

Zero the gradient buffers of all parameters and backprop with random gradients:

In [None]:
net.zero_grad()
out.backward(torch.randn(1, 10))

### Compute loss (distance between input and target)

If you have a single sample or target, just use `.unsqueeze(0)` to add a batch dimension at index 0.

In [None]:
output = net(inputs)
# MSELoss compares against the floating-point network output, so the target
# must be floating point too; with integer arguments torch.arange would
# produce an integer tensor, hence the explicit dtype.
target = Variable(torch.arange(1, 11, dtype=torch.float32)) # dummy target
criterion = nn.MSELoss()

# Show the shapes involved: unsqueeze(0) adds the batch dimension to target.
# (A bare expression in the middle of a cell would display nothing.)
print(output.size(), target.size(), target.unsqueeze(0).size())

# Ensure the dimensions are the same
loss = criterion(output, target.unsqueeze(0))
print(loss)

When you follow loss in the backward direction, using its `.grad_fn` attribute, you will see a graph of computations that looks like this:  

input -> conv2d -> relu -> maxpool2d -> conv2d -> relu -> maxpool2d  
      -> view -> linear -> relu -> linear -> relu -> linear  
      -> MSELoss  
      -> loss  
      
When we call `loss.backward()`, the whole graph is differentiated w.r.t. the loss, and all Variables in the graph will have their `.grad` Variable accumulated with the gradient.  

A few steps backwards:  

In [None]:
print(loss.grad_fn)                                            # MSELoss
print(loss.grad_fn.next_functions[0][0])                       # Linear
print(loss.grad_fn.next_functions[0][0].next_functions[1][0])  # ReLU

To backpropagate the error all we have to do is to call `loss.backward()`. 

You need to clear the existing gradients though, else gradients will be accumulated to existing gradients.

Now we shall call loss.backward(), and have a look at conv1’s bias gradients before and after the backward.

In [None]:
net.zero_grad()
print('conv1.bias.grad before backward:\n{}'.format(net.conv1.bias.grad))

loss.backward()
print('conv1.bias.grad after backward:\n{}'.format(net.conv1.bias.grad))

The simplest update rule used in practice is the Stochastic Gradient Descent (SGD):  
weight = weight - learning_rate * gradient

In [None]:
learning_rate = 0.01
# Plain SGD step applied by hand: weight <- weight - learning_rate * gradient
for f in net.parameters():
    step = f.grad.data * learning_rate
    _ = f.data.sub_(step)  # in-place update of the parameter's raw tensor

However, as you use neural networks, you want to use various different update rules such as:
 - SGD
 - Nesterov-SGD
 - Adam
 - RMSProp
 - etc. 

To enable this, we built a small package: torch.optim that implements all these methods. Using it is very simple:

In [None]:
import torch.optim as optim

#### Create optimizer

In [None]:
optimizer = optim.SGD(net.parameters(), lr=0.01)

Use the optimizer in a training loop like so:

In [None]:
optimizer.zero_grad()
output = net(inputs)
loss = criterion(output, target.unsqueeze(0))
loss.backward()
optimizer.step() # does the update

### Format to run (TODO)

In [None]:
# These imports must live at module (cell) level: names bound in a class body
# become class attributes and are not visible from inside method bodies, so
# importing there would make `time` / `datetime` raise NameError in the methods.
import datetime
import time


class Timer:
    """Stopwatch that starts on construction and reports elapsed time when called."""

    def __init__(self):
        # Wall-clock timestamp (seconds since the epoch) taken at creation
        self.start = time.time()

    def __call__(self):
        """Return the time elapsed since construction as a string, e.g. '0:01:05'."""
        elapsed_seconds = int(time.time() - self.start)  # drop fractional seconds
        return str(datetime.timedelta(seconds=elapsed_seconds))

In [None]:
# Define trainloader
# TODO

trainset_size = len(trainloader.dataset)

In [None]:
# train
N_EPOCHS = 5
PRINT_FREQ = 200

# Number of mini-batches per epoch, asked from the loader itself rather than
# derived from a separate batch-size constant (assumes `trainloader` supports
# len(), as torch.utils.data.DataLoader does).
n_batches = len(trainloader)

for epoch in range(1, N_EPOCHS + 1):  # loop over the dataset multiple times

    running_loss = 0.0
    stopwatch = Timer()  # restarted every epoch, so durations are per epoch
    for i, data in enumerate(trainloader, 1):
        # get the inputs
        inputs, labels = data

        # wrap them in Variable
        inputs, labels = Variable(inputs), Variable(labels)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics: average loss over the last PRINT_FREQ mini-batches
        running_loss += loss.item()
        if i % PRINT_FREQ == 0:    # print every N mini-batches
            print('Epoch: {}/{}, Batch: {}/{}, loss: {:.3f}, duration: {}'.format(
                epoch, N_EPOCHS, i, n_batches,
                running_loss / PRINT_FREQ, stopwatch()))
            running_loss = 0.0

print('Finished Training')

In [None]:
# train
# NOTE(review): this cell repeats the preceding training cell; running it
# trains for another N_EPOCHS epochs on top of the current weights.
N_EPOCHS = 5
PRINT_FREQ = 200

# Number of mini-batches per epoch, asked from the loader itself rather than
# derived from a separate batch-size constant (assumes `trainloader` supports
# len(), as torch.utils.data.DataLoader does).
n_batches = len(trainloader)

for epoch in range(1, N_EPOCHS + 1):  # loop over the dataset multiple times

    running_loss = 0.0
    stopwatch = Timer()  # restarted every epoch, so durations are per epoch
    for i, data in enumerate(trainloader, 1):
        # get the inputs
        inputs, labels = data

        # wrap them in Variable
        inputs, labels = Variable(inputs), Variable(labels)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics: average loss over the last PRINT_FREQ mini-batches
        running_loss += loss.item()
        if i % PRINT_FREQ == 0:    # print every N mini-batches
            print('Epoch: {}/{}, Batch: {}/{}, loss: {:.3f}, duration: {}'.format(
                epoch, N_EPOCHS, i, n_batches,
                running_loss / PRINT_FREQ, stopwatch()))
            running_loss = 0.0

print('Finished Training')