# Forward and backward passes
Using gradients from backpropagation to update the weights stored in tensors

## Setup and getting the data

In [46]:
# Boilerplate code to load all the relevant libraries and put the data into tensors

import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl, numpy as np
from pathlib import Path
from torch import tensor
from fastcore.test import test_close
torch.manual_seed(42)

mpl.rcParams['image.cmap'] = 'gray'
torch.set_printoptions(precision=2, linewidth=125, sci_mode=False)
np.set_printoptions(precision=2, linewidth=125)

MNIST_URL='https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/data/mnist.pkl.gz?raw=true'
path_data = Path('data')
path_data.mkdir(exist_ok=True)
path_gz = path_data/'mnist.pkl.gz'

# Download the archive on first use so this cell also runs on a fresh kernel
# with an empty data directory (the load below needs the file to exist).
if not path_gz.exists():
    from urllib.request import urlretrieve
    urlretrieve(MNIST_URL, path_gz)

# NOTE: pickle.load can execute arbitrary code; only load archives from a source you trust.
with gzip.open(path_gz, 'rb') as f:
    ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')

# Convert the four loaded objects to torch tensors
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])

In [47]:
# Downloading the MNIST dataset

MNIST_URL='https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/data/mnist.pkl.gz?raw=true'
path_data = Path('data')
# Create ./data if it does not exist yet
path_data.mkdir(exist_ok=True)
path_gz = path_data/'mnist.pkl.gz'

from urllib.request import urlretrieve
# Skip the download when the archive is already on disk
if not path_gz.exists(): urlretrieve(MNIST_URL, path_gz)

## The basic architecture

The model will be trained with SGD, using mean squared error (MSE) as the loss rather than cross-entropy. MSE is a hacky choice for a classification problem, but it is good enough as a proof of concept.

In [48]:
# n = number of training rows, m = number of input values per row
n,m = x_train.shape
# c = largest label value + 1
c = y_train.max()+1
n, m, c

(50000, 784, tensor(10))

In [49]:
# number of hidden units (width of the hidden layer), arbitrarily chosen

nh = 50

In [50]:
# Initialise all the weights and biases on our new basic neural network

# First layer maps the m inputs to nh hidden units
w1 = torch.randn(m, nh)
b1 = torch.zeros(nh)
# Second layer maps the nh hidden units to a single output
w2 = torch.randn(nh, 1)
b2 = torch.zeros(1)

In [51]:
def lin(x, w, b):
    """Linear layer: matrix-multiply `x` by `w`, then add the bias `b`."""
    product = x @ w
    return product + b

In [52]:
# Apply the first linear layer to the validation inputs and check the result's shape
t = lin(x_valid, w1, b1)
t.shape

torch.Size([10000, 50])

In [53]:
# Relu function clamping, so changing all values less than zero to zero

def relu(x):
    """Rectified linear unit: every value below zero becomes zero."""
    clamped = x.clamp_min(0.)
    return clamped

In [54]:
# Apply the ReLU to the first layer's output; negative entries become 0
t = relu(t)
t

tensor([[ 0.00, 11.87,  0.00,  ...,  5.48,  2.14, 15.30],
        [ 5.38, 10.21,  0.00,  ...,  0.88,  0.08, 20.23],
        [ 3.31,  0.12,  3.10,  ..., 16.89,  0.00, 24.74],
        ...,
        [ 4.01, 10.35,  0.00,  ...,  0.23,  0.00, 18.28],
        [10.62,  0.00, 10.72,  ...,  0.00,  0.00, 18.23],
        [ 2.84,  0.00,  1.43,  ...,  0.00,  5.75,  2.12]])

In [55]:
def model(xb):
    """Run `xb` through linear -> ReLU -> linear using the global w1, b1, w2, b2."""
    hidden = relu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)


In [56]:
# Forward pass over the validation set; check the shape of the predictions
res = model(x_valid)
res.shape

torch.Size([10000, 1])

In [57]:
# Show the raw (untrained) model outputs
res

tensor([[  25.75],
        [ -13.06],
        [-114.79],
        ...,
        [ -67.44],
        [ -74.48],
        [ -60.19]])

# MSE, defining a loss function

In [58]:
# Compare the shape of the targets with the shape of the model output
y_valid.shape, res.shape

(torch.Size([10000]), torch.Size([10000, 1]))

In [59]:
# To subtract these from each other we either index the single column, or use the squeeze method to remove the trailing unit dimension
res.squeeze().shape

torch.Size([10000])

In [60]:
# Turn targets of training and validation sets into floats as we are using MSE
# Convert both target tensors to float so they can be used with the MSE loss
y_train,y_valid = y_train.float(),y_valid.float()

# Make our first set of predictions using our model
preds = model(x_train)
# Check the shape of the predictions
preds.shape

torch.Size([50000, 1])

In [61]:
def mse(output, targ):
    """Mean squared error between column 0 of `output` and `targ`."""
    err = output[:,0] - targ
    return err.pow(2).mean()

In [62]:
# Loss of the untrained model on the training set
mse(preds, y_train)

tensor(4308.76)

# Gradients and backward pass

To do here:
- figure out what all the individual lines of code do and why
- Experiment with the Python debugger:
`import pdb; pdb.set_trace()` places a breakpoint. Execution stops at that line, and you can then use the debugger's commands to inspect the state of the variables at that point.

In [63]:
def lin_grad(inp, out, w, b):
    """Backward pass of `out = inp @ w + b`, storing gradients on `.g` attributes.

    Reads `out.g` and writes `inp.g`, `w.g` and `b.g`.
    """
    upstream = out.g
    # t() transposes w so the upstream gradient maps back onto the input
    inp.g = upstream @ w.t()
    # Broadcasted product of each input row with its upstream-gradient row, summed over axis 0
    per_row = inp.unsqueeze(-1) * upstream.unsqueeze(1)
    w.g = per_row.sum(0)
    b.g = upstream.sum(0)
    
    

# def lin_grad(inp, out, w, b):
#     # grad of matmul with respect to input
#     inp.g = out.g @ w.t()
#     w.g = (inp.unsqueeze(-1) * out.g.unsqueeze(1)).sum(0)
#     b.g = out.g.sum(0)

In [64]:
def forward_and_backward(inp, target):
    """Run the two-layer model forward, then fill in `.g` gradients by hand.

    Uses the global w1, b1, w2, b2. Gradients are stored as `.g` on the weights,
    biases, intermediate activations and `inp`. Nothing is returned.
    """
    # Forward pass
    pre_act = lin(inp, w1, b1)
    act = relu(pre_act)
    out = lin(act, w2, b2)
    err = out[:,0] - target
    loss = err.pow(2).mean()  # computed for reference; not used by the backward pass

    # Backward pass: gradient of the mean squared error w.r.t. `out`
    out.g = 2.*err[:,None] / inp.shape[0]
    lin_grad(act, out, w2, b2)
    # ReLU passes the gradient through only where its input was positive
    pre_act.g = (pre_act>0).float() * act.g
    lin_grad(inp, pre_act, w1, b1)
    
    


In [65]:
# Populates the .g attribute on w1, b1, w2, b2 and x_train
forward_and_backward(x_train, y_train)

In [66]:
# lin_grad can also be written with a transpose and a matrix multiply instead of the broadcasted product

def lin_grad(inp, out, w, b):
    """Backward pass of `out = inp @ w + b`, storing gradients on `.g` attributes.

    Reads `out.g` and writes `inp.g`, `w.g` and `b.g`.
    """
    # t() transposes the tensor
    inp.g = out.g @ w.t()
    # The weight gradient uses the upstream gradient out.g, not the activations in `out`
    w.g = inp.t() @ out.g
    b.g = out.g.sum(0)

In [67]:
# Save for testing against later
def get_grad(x):
    """Return a copy of the hand-computed gradient stored in `x.g`."""
    stored = x.g
    return stored.clone()
# Tensors whose gradients we want to check
chks = w1,w2,b1,b2,x_train
# Snapshot each .g now, since later cells overwrite the .g attributes
grads = w1g,w2g,b1g,b2g,ig = tuple(map(get_grad, chks))

In [68]:
# Using PyTorch to get gradients and compare

def mkgrad(x):
    """Return a copy of `x` with autograd gradient tracking turned on."""
    copy = x.clone()
    return copy.requires_grad_(True)
# Gradient-tracking copies of the checked tensors, in the same order as chks
ptgrads = w12,w22,b12,b22,xt2 = tuple(map(mkgrad, chks))

In [69]:
def forward(inp, targ):
    """Forward pass and MSE loss using the gradient-tracking copies w12, b12, w22, b22."""
    hidden = relu(lin(inp, w12, b12))
    out = lin(hidden, w22, b22)
    return mse(out, targ)



In [70]:
# Let PyTorch autograd compute the gradients; they land in .grad on the tracked copies
loss = forward(xt2, y_train)
loss.backward()

In [71]:
# Compare each saved hand-computed gradient with the matching autograd .grad
for a,b in zip(grads, ptgrads): test_close(a, b.grad, eps=0.01)

# Refactor model

## Layers as classes

Here we create classes to perform all the above and refactor them for usability

In [72]:


class Relu():
    """ReLU layer that remembers its input and output so it can run its own backward pass."""

    def __call__(self, inp):
        self.inp, self.out = inp, inp.clamp_min(0.)
        return self.out

    def backward(self):
        # Gradient flows through only where the input was positive
        passthrough = (self.inp > 0).float()
        self.inp.g = passthrough * self.out.g
     


In [73]:


class Lin():
    """Linear layer holding its weight and bias, with a hand-written backward pass."""

    def __init__(self, w, b):
        self.w = w
        self.b = b

    def __call__(self, inp):
        # Keep input and output so backward() can read and write their .g attributes
        self.inp = inp
        self.out = lin(inp, self.w, self.b)
        return self.out

    def backward(self):
        upstream = self.out.g
        self.inp.g = upstream @ self.w.t()
        self.w.g = self.inp.t() @ upstream
        self.b.g = upstream.sum(0)
     


In [74]:


class Mse():
    """Mean-squared-error loss that remembers its inputs for the backward pass."""

    def __call__(self, inp, targ):
        self.inp = inp
        self.targ = targ
        self.out = mse(inp, targ)
        return self.out

    def backward(self):
        # Derivative of mean((inp - targ)^2) with respect to inp
        err = self.inp.squeeze() - self.targ
        self.inp.g = 2. * err.unsqueeze(-1) / self.targ.shape[0]
     


In [75]:
class Model():
    """Two-layer network (Lin -> Relu -> Lin) with an Mse loss and a manual backward pass."""

    def __init__(self, w1, b1, w2, b2):
        first, second = Lin(w1,b1), Lin(w2,b2)
        self.layers = [first, Relu(), second]
        self.loss = Mse()

    def __call__(self, x, targ):
        # Feed the activations through each layer in order, then compute the loss
        for layer in self.layers:
            x = layer(x)
        return self.loss(x, targ)

    def backward(self):
        # Start from the loss and walk the layers in reverse order
        self.loss.backward()
        for layer in reversed(self.layers):
            layer.backward()

In [76]:
# Build the class-based model around the existing weight and bias tensors
model = Model(w1, b1, w2, b2)

In [77]:
loss = model(x_train, y_train)

# Run the class-based backward pass. Without it, the gradient checks that follow
# would only re-read the .g values left behind by the earlier forward_and_backward call.
model.backward()


In [78]:
# Compare the gradients saved earlier with the .g attributes currently on each tensor
test_close(w2g, w2.g, eps=0.01)
test_close(b2g, b2.g, eps=0.01)
test_close(w1g, w1.g, eps=0.01)
test_close(b1g, b1.g, eps=0.01)
test_close(ig, x_train.g, eps=0.01)


## Module.forward()

Here we define a class called "Module" that takes care of saving the inputs and the output of each call.
Subsequent classes can then inherit from this class, so that they only contain the actual mathematics that is needed.

In [79]:
class Module():
    """Base class for layers: saves the call arguments and the output.

    Subclasses implement `forward(*args)` for the computation and
    `bwd(out, *args)` for the gradient computation.
    """

    def __call__(self, *args):
        # Remember inputs and output so backward() can hand them to bwd()
        self.args = args
        self.out = self.forward(*args)
        return self.out

    # The stubs accept any arguments so that a missing override raises
    # NotImplementedError rather than a TypeError about the argument count.
    def forward(self, *args): raise NotImplementedError('not implemented')
    def backward(self): self.bwd(self.out, *self.args)
    def bwd(self, *args): raise NotImplementedError('not implemented')

In [80]:
# Note how much cleaner this function now is

class Relu(Module):
    """ReLU layer; only the maths lives here, the base class handles the bookkeeping."""

    def forward(self, inp):
        return inp.clamp_min(0.)

    def bwd(self, out, inp):
        # Gradient flows through only where the input was positive
        passthrough = (inp > 0).float()
        inp.g = passthrough * out.g

In [81]:
class Lin(Module):
    """Linear layer `inp @ w + b`; gradients are stored on `.g` attributes."""

    def __init__(self, w, b): self.w,self.b = w,b

    def forward(self, inp): return inp@self.w + self.b

    def bwd(self, out, inp):
        # Use the `out` passed in by Module.backward instead of reaching for self.out,
        # in line with how the Relu and Mse subclasses use their `out` argument.
        inp.g = out.g @ self.w.t()
        self.w.g = inp.t() @ out.g
        self.b.g = out.g.sum(0)

In [82]:


class Mse(Module):
    """Mean-squared-error loss between the squeezed input and the target."""

    def forward(self, inp, targ):
        err = inp.squeeze() - targ
        return err.pow(2).mean()

    def bwd(self, out, inp, targ):
        # Derivative of mean((inp - targ)^2) with respect to inp
        err = inp.squeeze() - targ
        inp.g = 2*err.unsqueeze(-1) / targ.shape[0]
     


In [83]:
# Model looks up Lin, Relu and Mse when it is instantiated, so this instance uses the Module-based versions
model = Model(w1, b1, w2, b2)

In [84]:
# Forward pass; each layer stores its arguments and output for the backward pass
loss = model(x_train, y_train)

In [85]:
# Backward pass; writes the .g attributes on the weights, biases and x_train
model.backward()

In [86]:
# Compare the gradients saved earlier with the .g attributes written by model.backward()
test_close(w2g, w2.g, eps=0.01)
test_close(b2g, b2.g, eps=0.01)
test_close(w1g, w1.g, eps=0.01)
test_close(b1g, b1.g, eps=0.01)
test_close(ig, x_train.g, eps=0.01)


## Autograd

PyTorch already provides a `Module` base class, `nn.Module`, so now that we have implemented our own we can use PyTorch's version. We no longer need to define `backward` ourselves: PyTorch's autograd already knows the derivatives of the operations we use. By convention, `F` is the alias for `torch.nn.functional`.

In [88]:
from torch import nn
import torch.nn.functional as F    


In [89]:
class Linear(nn.Module):
    """Linear layer `inp @ w + b` whose weight and bias are tracked by autograd.

    Args:
        n_in: number of input features.
        n_out: number of output features.
    """

    def __init__(self, n_in, n_out):
        super().__init__()
        # nn.Parameter registers the tensors with the module, so they show up in
        # parameters() / state_dict(); parameters require grad by default.
        self.w = nn.Parameter(torch.randn(n_in,n_out))
        self.b = nn.Parameter(torch.zeros(n_out))

    def forward(self, inp): return inp@self.w + self.b


In [90]:
class Model(nn.Module):
    """Two-layer network (Linear -> ReLU -> Linear) that returns the MSE loss.

    Args:
        n_in: number of input features.
        nh: number of hidden units.
        n_out: number of outputs.
    """

    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # nn.ModuleList registers the sub-layers with this module; a plain Python
        # list would hide them from parameters(), state_dict() and .to().
        self.layers = nn.ModuleList([Linear(n_in,nh), nn.ReLU(), Linear(nh,n_out)])

    # Implement forward() rather than overriding __call__: nn.Module.__call__
    # invokes forward() and also runs any registered hooks.
    def forward(self, x, targ):
        for l in self.layers: x = l(x)
        # targ gets a trailing unit dimension so it matches the shape of x
        return F.mse_loss(x, targ[:,None])


In [91]:
# Build the autograd-based model with a single output, run it, and backpropagate
model = Model(m, nh, 1)
loss = model(x_train, y_train)
loss.backward()


In [92]:
# Inspect the gradient autograd stored on the first layer's bias
l0 = model.layers[0]
l0.b.grad

tensor([-19.60,  -2.40,  -0.12,   1.99,  12.78, -15.32, -18.45,   0.35,   3.75,  14.67,  10.81,  12.20,  -2.95, -28.33,
          0.76,  69.15, -21.86,  49.78,  -7.08,   1.45,  25.20,  11.27, -18.15, -13.13, -17.69, -10.42,  -0.13, -18.89,
        -34.81,  -0.84,  40.89,   4.45,  62.35,  31.70,  55.15,  45.13,   3.25,  12.75,  12.45,  -1.41,   4.55,  -6.02,
        -62.51,  -1.89,  -1.41,   7.00,   0.49,  18.72,  -4.84,  -6.52])