# The Forward and the Backward passes

In [188]:
#exports
#Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# export
import sys
sys.path.insert(0,"/content/drive/My Drive/Colab Notebooks/exp")
# sys.path.append("/content/drive/My Drive/Colab Notebooks/exp")
from nb_01 import *

In [0]:
# export
def get_data():
  path = datasets.download_data(MNIST_URL, ext= '.gz')
  with gzip.open(path,'rb') as f:
    ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f,encoding='latin-1')
  return map(tensor, (x_train, y_train, x_valid, y_valid))

def normalise(x, m, s): return (x-m)/s

In [0]:
x_train, y_train, x_valid, y_valid = get_data()

In [192]:
train_mean, train_std = x_train.mean(), x_train.std()
train_mean, train_std

(tensor(0.1304), tensor(0.3073))

In [0]:
x_train = normalise(x_train, train_mean, train_std) #Normalise train data
x_valid = normalise(x_valid, train_mean, train_std) #Normalise valid data using train mean and std

In [194]:
train_mean, train_std = x_train.mean(), x_train.std()
train_mean, train_std

(tensor(0.0001), tensor(1.))

Note that mean has approached 0 and std 1

In [0]:
# export
def test_near_zero(a,tol=1e-3):
  assert a.abs()<tol, f"Near zero: {a}"

In [0]:
test_near_zero(x_train.mean())
test_near_zero(1-x_train.std())

In [197]:
n, m = x_train.shape
c = y_train.max() + 1
n, m, c

(50000, 784, tensor(10))

# Foundation Version

## Basic Architecture

In [0]:

#num of hidden layer
nh = 50

Using 2 weight matricies and 2 bias

In [0]:
# Simplified kaiming init/ he init
# random numbers are generated which are divided by sqrt(m) to make the mean and
# std 0 and 1 respectively.

w1 = torch.randn(m, nh)/math.sqrt(m)
b1 = torch.zeros(nh)
w2 = torch.randn(nh, 1)/math.sqrt(nh)
b2 = torch.zeros(1)

# w1, b1, w2, b2

In [0]:
test_near_zero(w1.mean())
test_near_zero(w1.std()-1/math.sqrt(m))

In [201]:
#Should be (0,1)
x_valid.mean(), x_valid.std()

(tensor(-0.0057), tensor(0.9924))

In [0]:
# Linear layer
def lin(x, w, b):
  return x@w + b

In [0]:
t = lin(x_valid, w1, b1)

In [204]:
t.mean(), t.std()

(tensor(0.0961), tensor(0.9714))

In [0]:
def relu(x):
  return x.clamp_min(0)

In [0]:
t = relu(lin(x_valid, w1, b1))

In [207]:
t.mean(), t.std()

(tensor(0.4324), tensor(0.6109))


From pytorch docs: a: the negative slope of the rectifier used after this layer (0 for ReLU by default)

$$\text{std} = \sqrt{\frac{2}{(1 + a^2) \times \text{fan_in}}}$$
This was introduced in the paper that described the Imagenet-winning approach from He et al: Delving Deep into Rectifiers, which was also the first paper that claimed "super-human performance" on Imagenet (and, most importantly, it introduced resnets!)

In [0]:
#Kaiming/He init for Relu
w1 = torch.randn(m, nh)*math.sqrt(2/m)

In [209]:
w1.mean(), w1.std()

(tensor(-0.0003), tensor(0.0504))

In [210]:
t=relu(lin(x_valid, w1, b1))
t.mean(), t.std()

(tensor(0.5350), tensor(0.8048))

In [0]:
# export
from torch.nn import init

In [212]:
w1 = torch.zeros(m,nh)
init.kaiming_normal_(w1, mode='fan_out') # Read fan_in and fan_out doc4
t=relu(lin(x_valid, w1, b1))
t.mean(), t.std()

(tensor(0.5281), tensor(0.8204))

In [213]:
w1.mean(), w2.std()

(tensor(0.0001), tensor(0.1518))

In [0]:
import torch.nn

In [215]:
torch.nn.Linear(m, nh).weight.shape

torch.Size([50, 784])

In [0]:
#Let's try -0.5
def relu(x): return x.clamp(0)-0.5


In [217]:
w1 = torch.randn(m, nh)*math.sqrt(2/m)
t1 = relu(lin(x_valid, w1, b1))
t1.mean() , t1.std()

(tensor(-0.0153), tensor(0.7654))

In [0]:
# Forward pass model, contains linear, relu and another linear layer.
def model(xb):
  l1=lin(xb, w1, b1)
  l2=relu(l1)
  l3=lin(l2, w2, b2)
  return l3

In [219]:
%timeit -n 10 _=model(x_valid)

10 loops, best of 3: 20.9 ms per loop


In [0]:
assert model(x_valid).shape==torch.Size([x_valid.shape[0],1])

## Loss Function : MSE

In [221]:
model(x_valid).shape

torch.Size([10000, 1])

We need squeeze() to get rid of that trailing (,1), in order to use mse. (Of course, mse is not a suitable loss function for multi-class classification; we'll use a better loss function soon. We'll use mse for now to keep things simple.)

In [0]:
# export
# unsqueeze() adds a column while squeeze removes a column
def mse(output, target):
  return (output.squeeze(-1) - target).pow(2).mean()

In [0]:
y_train, y_valid = y_train.float(), y_valid.float()

In [0]:
pred = model(x_train)

In [225]:
pred.shape

torch.Size([50000, 1])

In [226]:
mse(pred, y_train)

tensor(32.9285)

## Gradient and Backward Pass

In [0]:
def mse_grad(inp, targ):
  #grad of loss with respect to output of previous layer.
  inp.g = 2. * (inp.squeeze() - targ).unsqueeze(-1)/inp.shape[0]
  

In [0]:
def relu_grad(inp, out):
  # grad of relu with respect to input activation
  inp.g = (inp>0).float()*out.g
  

In [0]:
def lin_grad(inp, out, w, b):
  # grad of matmul with respect to input
  inp.g = out.g@w.t()
  w.g = (inp.unsqueeze(-1)*out.g.unsqueeze(1)).sum(0)
  b.g = out.g.sum(0)

In [0]:
def forward_and_backward(inp, targ):
  #Forward pass:
  l1 = inp @ w1 + b1  #Linear layer 1
  l2 = relu(l1)       #Relu layer
  out = l2 @ w2 + b2  #Linear layer 2
  #Error calculation (we don't actually use it anywhere)
  loss = mse(out,targ)
  
  #Backward pass
  mse_grad(out,targ)
  lin_grad(l2, out, w2, b2)
  relu_grad(l1, l2)
  lin_grad(inp, l1, w1, b1)

In [0]:
forward_and_backward(x_train, y_train)

## Test and compare with pytorch version

In [0]:
w1g = w1.g.clone()
w2g = w2.g.clone()
b1g = b1.g.clone()
b2g = b2.g.clone()
ig  = x_train.g.clone()

In [0]:
# We cheat a little bit by using pytorch autograd
xt2 = x_train.clone().requires_grad_(True)
w12 = w1.clone().requires_grad_(True)
w22 = w2.clone().requires_grad_(True)
b12 = b1.clone().requires_grad_(True)
b22 = b2.clone().requires_grad_(True)

In [0]:
def forward(inp, targ):
  l1 = inp @ w12 + b12
  l2 = relu(l1)
  out = l2 @ w22 + b22
  return mse(out, targ)

In [0]:
loss = forward(xt2, y_train)

In [0]:
loss.backward()

In [0]:
# export
def test(a, b, cmp, cname=None):
  if cname is None: cname = cmp.__name__
  assert cmp(a, b), f'{cname}:\n{a}\n{b}'
  
def near(a,b):
  return torch.allclose(a, b, rtol=1e-3, atol=1e-5)

def test_near(a,b):
    test(a,b,near)
  

In [0]:
test_near(w22.grad, w2g)
test_near(b22.grad, b2g)
test_near(w12.grad, w1g)
test_near(b12.grad, b1g)
test_near(xt2.grad, ig)

Hense, it works just like the pytorch autograd

## Refractor model

### Layers as classes

In [0]:
class Relu():
  def __call__(self, inp):
    self.inp = inp
    self.out = inp.clamp_min(0.) - 0.5
    return self.out
  
  def backward(self):
    self.inp.g = (self.inp > 0).float() * self.out.g

In [0]:
class Lin():
  def __init__(self, w, b):
    self.w, self.b = w, b
    
  def __call__(self, inp, ):
    self.inp = inp
    self. out = self.inp @ self.w + self.b
    return self.out
  
  def backward(self):
    self.inp.g = self.out.g @ self.w.t()
#     creating a giant outer product just to sum it...it's inefficient
    self.w.g = (self.inp.unsqueeze(-1) * self.out.g.unsqueeze(1)).sum(0)
    self.b.g = (self.out.g.sum(0))

In [0]:
class Mse():
  def __call__(self, inp, targ):
    self.inp = inp
    self.targ = targ
    self.out = (self.inp.squeeze(1) - self.targ).unsqueeze(-1)/self.targ.shape[0]
    return self.out
  
  def backward(self):
    self.inp.g = 2. * (self.inp.squeeze(1) - self.targ).unsqueeze(-1)/self.targ.shape[0]

In [0]:
class Model():
  def __init__(self, w1, b1, w2, b2):
    self.layers = [Lin(w1, b1), Relu(), Lin(w2, b2)]
    self.loss = Mse()
    
  def __call__(self, x, targ):
    for l in self.layers:
      x = l(x)
    return self.loss(x, targ)
  
  def backward(self):
    self.loss.backward()
    
    #Iterate through layers in backward
    for l in reversed(self.layers):
       l.backward()

In [0]:
# Initialize weight and biases
w1.g, b1.g, w2.g, b2.g = [None]*4

In [0]:
model = Model(w1, b1, w2, b2)

In [245]:
%time loss = model(x_train, y_train)

CPU times: user 118 ms, sys: 683 µs, total: 119 ms
Wall time: 119 ms


In [246]:
%time model.backward()

CPU times: user 8.33 s, sys: 60.7 ms, total: 8.39 s
Wall time: 8.47 s


In [0]:
test_near(w2g, w2.g)
test_near(b2g, b2.g)
test_near(w1g, w1.g)
test_near(b1g, b1.g)
test_near(ig, x_train.g)

### Remove  duplication by extending Module class

In [0]:
class Module():
  def __call__(self, *args):
    self.args = args
    self.out = self.forward(*args)
    return self.out
  
  def forward(self):
#     Will be implemented when extended
    raise Exception('Not Implemented')
  
  def backward(self):
    self.bwd(self.out, *self.args)
    

In [0]:
class Lin(Module):
  def __init__(self, w, b):
    self.w, self.b = w, b
    
  def forward(self, inp):
    return inp@self.w + self.b
  
  def bwd(self, out, inp):
    inp.g  =  out.g @ self.w.t()
    self.w.g = torch.einsum('bi,bj -> ij', inp, out.g)
    self.b.g = out.g.sum(0)

In [0]:
class Relu(Module):
  def forward(self, inp):
    return inp.clamp_min(0.) - 0.5
  
  def bwd(self, out, inp):
    inp.g = (inp > 0).float() * out.g

In [0]:
class Mse(Module):
  def forward(self, inp, targ):
    return (inp.squeeze(1) - targ).pow(2).mean()
  
  def bwd(self, out, inp, targ):
    inp.g = 2 * (inp.squeeze(1) - targ).unsqueeze(-1)/targ.shape[0]

In [0]:
class Model():
  def __init__(self):
    self.layers = [Lin(w1, b1), Relu(), Lin(w2, b2)]
    self.loss = Mse()
    
  def __call__(self, x, targ):
    for l in self.layers:
      x = l(x)      
    return self.loss(x, targ)
  
  def backward(self):
    self.loss.backward()
    for l in reversed(self.layers):
      l.backward()

In [0]:
w1.g,b1.g,w2.g,b2.g = [None]*4
model = Model()

In [254]:
%time loss = model(x_train, y_train)

CPU times: user 118 ms, sys: 344 µs, total: 118 ms
Wall time: 121 ms


In [255]:
%time model.backward()

CPU times: user 246 ms, sys: 1.84 ms, total: 248 ms
Wall time: 248 ms


In [0]:
test_near(w2g, w2.g)
test_near(b2g, b2.g)
test_near(w1g, w1.g)
test_near(b1g, b1.g)
test_near(ig, x_train.g)

### Using Linear layer without Einsum

In [0]:
class Lin(Module):
  def __init__(self, w, b):
    self.w, self.b = w, b
    
  def forward(self, inp):
    return inp@self.w + self.b
  
  def bwd(self, out, inp):
    inp.g = out.g @ self.w.t()
    self.w.g = inp.t() @ out.g
    self.b.g = out.g.sum(0)

In [0]:
w1.g,b1.g,w2.g,b2.g = [None]*4
model = Model()

In [259]:
%time loss = model(x_train, y_train)

CPU times: user 127 ms, sys: 1.17 ms, total: 129 ms
Wall time: 131 ms


In [260]:
%time model.backward()

CPU times: user 224 ms, sys: 4.44 ms, total: 229 ms
Wall time: 232 ms


In [0]:
test_near(w2g, w2.g)
test_near(b2g, b2.g)
test_near(w1g, w1.g)
test_near(b1g, b1.g)
test_near(ig, x_train.g)

### Using nn.Linear and nn.Module

In [0]:
#export
from torch import nn

In [0]:
class Model(nn.Module):
  def __init__(self, n_in, nh, n_out):
    super().__init__()
    self.layers = [nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out)]
    self.loss = mse
    
  def __call__(self, x, targ):
    for l in self.layers:
      x = l(x)
    return self.loss(x.squeeze(), targ)

In [0]:
model = Model(m, nh, 1)

In [265]:
%time loss = model(x_train, y_train)

CPU times: user 102 ms, sys: 3.07 ms, total: 105 ms
Wall time: 108 ms


In [266]:
%time loss.backward()

CPU times: user 82.5 ms, sys: 2.14 ms, total: 84.7 ms
Wall time: 85.1 ms


# Export

In [267]:
!pip install fire



In [268]:
!python "/content/drive/My Drive/Colab Notebooks/notebook2script.py" "/content/drive/My Drive/Colab Notebooks/02_Fully_connected_NN.ipynb"

Converted /content/drive/My Drive/Colab Notebooks/02_Fully_connected_NN.ipynb to /content/drive/My Drive/Colab Notebooks/exp/nb_02.py
