# PyTorch
* Like NumPy, but allows us to use the GPU for calculations

In [None]:
import torch
import torchvision
import torchvision.datasets as dset
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
from torch.autograd import Variable

## Data Loading

* Torchvision is a companion library to PyTorch. It contains some dataset loaders, and a Dataset class you may extend if you wish.
* torch.utils.data works with the Dataset class to provide data loaders. It's a complex class, so if you use it, just use it as is.
* Alternatively you can write your own loader code.

## Train/Test Data
* You will want to train on some of the data, and test with the rest. We will provide you with the split.

## Tensors
* The default loader class will create instances of PyTorch Tensors. They're like Numpy arrays, but can also reside on your GPU.

In [None]:
# Per-sample transform pipeline. Only ToTensor() is applied here; a
# transforms.Normalize step could be appended to the list to rescale the
# data, but none is part of this pipeline.
trans = transforms.Compose([transforms.ToTensor()])

# MNIST training split; download=True is passed for this split only.
train_set = dset.MNIST(
    root='./mnist_data',
    train=True,
    transform=trans,
    download=True
)
# MNIST held-out split, read from the same root directory.
test_set = dset.MNIST(
    root='./mnist_data',
    train=False,
    transform=trans
)

# Mini-batch size shared by both loaders.
N = 8

# Only the training loader shuffles its samples.
train_loader = DataLoader(
    dataset=train_set, batch_size=N, shuffle=True, num_workers=2
)
test_loader = DataLoader(
    dataset=test_set, batch_size=N, shuffle=False, num_workers=2
)

## Looking at the data
* Notice the transpose! Though these are black/white images, the grid is stored channels-first (C, H, W) while matplotlib expects channels-last (H, W, C), so the axes must be reordered before display.
* If you normalize your data, you may have to recenter it before displaying.

In [None]:
def imshow(img):
    """Display an image tensor with matplotlib.

    The tensor is converted to a NumPy array and its axes are reordered
    with the permutation (1, 2, 0), i.e. the leading axis is moved to the
    end, before the result is plotted and shown.
    """
    channels_last = np.transpose(img.numpy(), (1, 2, 0))
    plt.imshow(channels_last)
    plt.show()
# Pull a single mini-batch from the training loader and display it as a grid.
dataiter = iter(train_loader)
# next(...) is the iterator-protocol call; unlike the .next() method it does
# not depend on the Python version or on the loader exposing that method.
images, labels = next(dataiter)
grid = torchvision.utils.make_grid(images)
imshow(grid)
# Print the labels of the displayed images, space separated, in batch order.
# int() keeps the output a plain number whether iteration yields Python ints
# or one-element tensors.
print(" ".join(["%s" % int(i) for i in labels]))

In [None]:
# Show the shape of one sample as produced by the loader, before flattening.
print(images[0].shape)

# Layer widths: flattened 28x28 input -> hidden layer -> one output per class.
D_in, H, D_out = 28 * 28, 400, 10

In [None]:
# Two-layer perceptron trained with hand-written back-propagation.

# Randomly initialize the weights of both layers
w1 = torch.randn(D_in, H)
w2 = torch.randn(H, D_out)

learning_rate = 1e-6  # step size for plain gradient descent
epochs = 3            # number of passes over the training set

# Rows of the identity matrix serve as one-hot target vectors; build the
# matrix once instead of on every batch.
one_hot = torch.eye(D_out)

for epoch in range(epochs):
    for imgs, labels in train_loader:
        ## Forward pass ##
        # flatten every image into one row; -1 infers the batch size
        imgs = imgs.view(-1, D_in)
        # multiply by hidden layer weights
        h = imgs.mm(w1)
        # non-linearity (RELU)
        h_relu = h.clamp(min=0)
        # multiply hidden layer by second set of weights
        y_pred = h_relu.mm(w2)

        ## Calculate Loss ##
        # Get categorical (one-hot) representation of the labels
        y = one_hot[labels]

        # Sum of squared differences
        y_diff = y_pred - y
        loss = y_diff.pow(2).sum()
        print("Epoch = %d, Loss = %f" % (epoch, loss))

        ## Back propagation ##
        # loss = sum((y_pred - y)^2)  =>  d loss / d y_pred = 2 * (y_pred - y)
        d_y_pred = 2.0 * y_diff
        # Multiplication: del x (x*y) = y, del y (x * y) = x
        d_w2 = h_relu.t().mm(d_y_pred)
        d_h_relu = d_y_pred.mm(w2.t())
        # ReLU passes no gradient where its input was negative
        d_h = d_h_relu.clone()
        d_h[h < 0] = 0
        d_w1 = imgs.t().mm(d_h)

        # Update weights using gradient descent
        w1 -= learning_rate * d_w1
        w2 -= learning_rate * d_w2

In [None]:
# Test: show one held-out batch together with its true labels
dataiter = iter(test_loader)
images, labels = next(dataiter)
grid = torchvision.utils.make_grid(images)
imshow(grid)
print("Ground Truth: " + " ".join(["%s" % int(i) for i in labels]))

## Predict
# Run the forward pass on the test batch shown above (`images`), flattened
# to one row per image, so the predictions line up with the printed labels.
y_pred = images.view(-1, D_in).mm(w1).clamp(min=0).mm(w2)
# The index of the largest output in each row is the predicted class.
y_pred = np.argmax(y_pred.numpy(), axis=1)
print(y_pred)

## Autograd
* Doing differentiation by ourselves for back-propagation is a lot of work. PyTorch can do it for us.
* We'll use the Variable class to wrap Tensors and maintain a computation graph.

In [None]:


# Three-layer perceptron trained with autograd-computed gradients.
D_in = 28 * 28  # size of input
H1 = 400        # size of 1st hidden layer
H2 = 100        # size of 2nd hidden layer
D_out = 10      # number of classes

# Randomly initialize weights; requires_grad=True asks autograd to track them
w1 = Variable(torch.randn(D_in, H1), requires_grad=True)
w2 = Variable(torch.randn(H1, H2), requires_grad=True)
w3 = Variable(torch.randn(H2, D_out), requires_grad=True)

learning_rate = 1e-6  # learning rate
epochs = 3

# Rows of the identity matrix serve as one-hot target vectors; build the
# matrix once instead of on every batch.
one_hot = torch.eye(D_out)

for epoch in range(epochs):
    for imgs, labels in train_loader:
        ## Forward pass ##
        # flatten every image into one row; -1 infers the batch size
        imgs = Variable(imgs.view(-1, D_in), requires_grad=False)

        # do all calculations (no need to stage, Autograd will take care of it)
        y_pred = imgs.mm(w1).clamp(min=0).mm(w2).clamp(min=0).mm(w3)

        ## Calculate Loss ##
        # One-hot targets are constants, so no gradient is requested for them
        y = Variable(one_hot[labels], requires_grad=False)
        # MSE
        loss = (y_pred - y).pow(2).mean()
        # NOTE(review): loss.data[0] assumes the loss is stored as a
        # one-element tensor; PyTorch releases that return a 0-dim loss need
        # loss.item() instead -- confirm the target version.
        print("Epoch = %d, Loss = %f" % (epoch, loss.data[0]))

        ## Back propagation ##
        loss.backward()

        # Step every weight matrix (w3 included) and then clear its gradient,
        # because backward() adds into .grad rather than overwriting it.
        for w in (w1, w2, w3):
            w.data -= learning_rate * w.grad.data
            w.grad.data.zero_()

In [None]:
# Test: show one held-out batch together with its true labels
dataiter = iter(test_loader)
images, labels = next(dataiter)
grid = torchvision.utils.make_grid(images)
imshow(grid)
print("Ground Truth: " + " ".join(["%s" % int(i) for i in labels]))

## Predict
# Run the forward pass on the test batch shown above (`images`), flattened
# to one row per image, so the predictions line up with the printed labels.
x = Variable(images.view(-1, 28 * 28), requires_grad=False)
y_pred = x.mm(w1).clamp(min=0).mm(w2).clamp(min=0).mm(w3)
print(y_pred)
# The index of the largest output in each row is the predicted class.
y_pred = np.argmax(y_pred.data.numpy(), axis=1)
print(y_pred)

## torch.nn Module
* We'd like to automate the low-level design
* Put together Layers that perform certain functionality, like Legos
* High level interface

In [None]:
import torch.nn as nn

# Three-layer perceptron built from torch.nn layers, trained for one pass
# over the training loader with a hand-written gradient-descent step.
D_in = 28 * 28  # size of input
H1 = 400        # size of 1st hidden layer
H2 = 100        # size of 2nd hidden layer
D_out = 10      # number of classes

model = nn.Sequential(
    nn.Linear(D_in, H1),
    nn.ReLU(),
    nn.Linear(H1, H2),
    nn.ReLU(),
    nn.Linear(H2, D_out)
)

# We have pre-built loss functions available to us. CrossEntropyLoss is fed
# the class-index labels directly; nn.MSELoss on one-hot targets
# (torch.eye(10)[labels]) is an alternative.
loss_fn = nn.CrossEntropyLoss()

learning_rate = 1e-6
for imgs, labels in train_loader:
    # flatten every image into one row; -1 infers the batch size
    x = Variable(imgs.view(-1, D_in), requires_grad=False)
    y = Variable(labels)

    # forward pass
    y_pred = model(x)

    loss = loss_fn(y_pred, y)
    # NOTE(review): loss.data[0] assumes the loss is stored as a one-element
    # tensor; PyTorch releases that return a 0-dim loss need loss.item()
    # instead -- confirm the target version.
    print(loss.data[0])

    # zero the gradients
    model.zero_grad()

    # backward pass; the graph is rebuilt by the next forward pass, so it
    # does not need to be retained after this call
    loss.backward()

    # update parameters using gradient
    for param in model.parameters():
        param.data -= learning_rate * param.grad.data

In [None]:
# Test: show one held-out batch together with its true labels
dataiter = iter(test_loader)
images, labels = next(dataiter)
grid = torchvision.utils.make_grid(images)
imshow(grid)
print("Ground Truth: " + " ".join(["%s" % int(i) for i in labels]))

## Predict
# Feed the test batch shown above (`images`), flattened to one row per
# image, so the predictions line up with the printed labels.
x = Variable(images.view(-1, 28 * 28), requires_grad=False)
y_pred = model(x)
print(y_pred)
# The index of the largest output in each row is the predicted class.
y_pred = np.argmax(y_pred.data.numpy(), axis=1)
print(y_pred)

## Optimizer
* We've been updating parameters directly with gradient descent, but there are better algorithms.
* e.g. Adam
* These algorithms simulate momentum and other effects to do more than just step in the direction of steepest descent (the negative gradient).

In [None]:
import torch.nn as nn

# Training hyperparameters for the optimizer-driven run.
epochs = 2
learning_rate = 1e-6

# Layer widths: flattened 28x28 input -> H1 -> H2 -> one output per class.
D_in, H1, H2, D_out = 28 * 28, 500, 200, 10

# Linear layers with a ReLU between each pair, assembled from a list.
layers = [
    nn.Linear(D_in, H1),
    nn.ReLU(),
    nn.Linear(H1, H2),
    nn.ReLU(),
    nn.Linear(H2, D_out),
]
model = nn.Sequential(*layers)

# Pre-built loss function.
loss_fn = nn.CrossEntropyLoss()

# Adam takes over the parameter updates for every parameter of the model.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Train the model with the optimizer doing the parameter updates.
for epoch in range(epochs):
    for batch_idx, (imgs, labels) in enumerate(train_loader):
        x = Variable(imgs).view(-1, 28*28)  # Notice how we flatten
        y = Variable(labels)

        # zero the gradients left over from the previous step
        optimizer.zero_grad()
        y_pred = model(x)

        loss = loss_fn(y_pred, y)

        # backward pass; the graph is rebuilt by the next forward pass, so
        # it does not need to be retained after this call
        loss.backward()

        optimizer.step()

        # Report progress every 100 batches.
        # NOTE(review): loss.data[0] assumes the loss is stored as a
        # one-element tensor; PyTorch releases that return a 0-dim loss need
        # loss.item() instead -- confirm the target version.
        if batch_idx % 100 == 0:
            print("Epoch %d, Batch %d Loss %f" % (epoch, batch_idx, loss.data[0]))

In [None]:
# Test: show one held-out batch together with its true labels
dataiter = iter(test_loader)
# Keep the labels that belong to this batch; printing a `labels` left over
# from training would not match the images being displayed.
images, labels = next(dataiter)
grid = torchvision.utils.make_grid(images)
imshow(grid)
print("Ground Truth: " + " ".join(["%s" % int(i) for i in labels]))

# Predict
x = Variable(images).view(-1, 28*28)
y_pred = model(x)
print(y_pred)
# torch.max along dim 1 returns (values, indices); the indices are the
# predicted classes.
_, classes = torch.max(y_pred, 1)
print(classes)

# Accuracy: fraction of test samples whose predicted class equals the label
count = 0
correct = 0
for images, labels in test_loader:
    # Count the samples actually present in this batch rather than assuming
    # every batch holds exactly N of them.
    count += labels.size(0)
    x = Variable(images).view(-1, 28*28)
    y_pred = model(x)
    _, classes = torch.max(y_pred, 1)
    correct += np.count_nonzero(labels.numpy() == classes.data.numpy())

# float() forces true division on interpreters where int / int truncates.
print("Accuracy is  %s" % (correct / float(count)))

## Custom modules
* We can define our own modules with our own behavior.
* Inherit from nn.Module and override forward()
* Can contain other modules and autograd Variables.
* Can be custom loss modules.

## GPU
* We wanted to use the GPU, right?
* .cuda() method will move a Tensor/Variable to the GPU
* If we want to examine a Variable, we now need to transfer to CPU using .cpu()

## Saving/Loading your Model
* Very important! Cloud instances can be lost easily.
* Use `torch.save(the_model.state_dict(), PATH)` to save, and `the_model.load_state_dict(torch.load(PATH))` to load.
* When submitting, we'll want your state and program.