# The forward and backward passes

In [None]:
import pickle, gzip, math, os, time, shutil, torch, matplotlib as mpl, numpy as np
import numpy as np
from torch import tensor
from pathlib import Path
from fastcore.test import test_close
# Seed torch's RNG so the random weight initialisation later in the notebook is repeatable.
torch.manual_seed(42)

# Display settings: grayscale colormap for images, compact printing for tensors and arrays.
mpl.rcParams['image.cmap'] = 'gray'
torch.set_printoptions(precision=2, linewidth=125, sci_mode=False)
np.set_printoptions(precision=2, linewidth=125)

# Directory that holds the dataset archives.
path_data = Path('data')

# File names of the image/label archives for the train and test splits.
train_images_file = 'kmnist-train-imgs.npz'
train_labels_file = 'kmnist-train-labels.npz'
test_images_file = 'kmnist-test-imgs.npz'
test_labels_file = 'kmnist-test-labels.npz'

# Each archive stores its array under 'arr_0'. Images are flattened to rows of
# 784 values and divided by 255.0, then cast to float32.
# NOTE(review): presumably the raw pixels are 0-255 so this scales to [0, 1] - confirm.
x_train = np.load(path_data/train_images_file)['arr_0'].reshape(-1, 784)/255.0
x_train = np.float32(x_train)
y_train = np.load(path_data/train_labels_file)['arr_0']
# The test split is used as the validation set throughout.
x_valid = np.load(path_data/test_images_file)['arr_0'].reshape(-1, 784)/255.0
x_valid = np.float32(x_valid)
y_valid = np.load(path_data/test_labels_file)['arr_0']

# Convert all four numpy arrays to torch tensors.
x_train, y_train, x_valid, y_valid = map(tensor, (x_train, y_train, x_valid, y_valid))

  from .autonotebook import tqdm as notebook_tqdm


## Foundations Version

### Basic Architecture

In [None]:
# n = number of training rows, m = number of features per row.
n,m = x_train.shape
# Largest label value plus one.
# NOTE(review): equals the number of classes only if labels are 0-based and contiguous - confirm.
c = y_train.max() + 1
n, m, c

(60000, 784, tensor(10, dtype=torch.uint8))

In [None]:
# num hidden
nh = 50

In [None]:
# First layer: m input features -> nh hidden units. Weights are sampled from a
# standard normal distribution, biases start at zero.
w1 = torch.randn(m, nh)
b1 = torch.zeros(nh)
# Second layer: nh hidden units -> a single output column.
w2 = torch.randn(nh, 1)
b2 = torch.zeros(1)

In [None]:
def lin(x, w, b):
    """Affine map: matrix-multiply ``x`` by ``w`` and add ``b``."""
    product = x @ w
    return product + b

In [None]:
t = lin(x_valid, w1, b1)
t.shape

torch.Size([10000, 50])

In [None]:
def relu(x):
    """Rectified linear unit: replace every value below zero with zero."""
    return x.clamp(min=0.)

In [None]:
t = relu(t)
t

tensor([[13.38,  8.13,  0.00,  ...,  0.00,  0.00, 31.00],
        [ 3.94, 19.55,  3.68,  ...,  0.00,  0.00, 13.87],
        [ 3.86,  0.00,  0.00,  ...,  1.26,  0.00, 14.98],
        ...,
        [ 0.00,  3.12, 10.38,  ...,  7.62,  2.62, 13.96],
        [ 8.35,  0.00,  0.00,  ...,  7.02,  0.00, 13.15],
        [15.43,  0.00,  0.00,  ...,  0.00,  2.43, 19.81]])

In [None]:
def model(xb):
    """Two-layer network: linear (w1, b1) -> ReLU -> linear (w2, b2).

    Reads the module-level parameters ``w1``, ``b1``, ``w2`` and ``b2``.
    """
    hidden = relu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)

In [None]:
res = model(x_valid)
res.shape

torch.Size([10000, 1])

### Loss function: MSE

(Of course, `mse` is not a suitable loss function for multi-class classification; we'll use a better loss function soon. We'll use `mse` for now to keep things simple.)

In [None]:
res.shape, y_valid.shape

(torch.Size([10000, 1]), torch.Size([10000]))

In [None]:
(res-y_valid).shape

torch.Size([10000, 10000])

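We need to get rid of the trailing unit dimension (the `, 1` in `[10000, 1]`) in order to use `mse`; otherwise broadcasting against the `[10000]` targets produces a `[10000, 10000]` result, as shown above.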

In [None]:
res[:, 0].shape

torch.Size([10000])

In [None]:
res[None, :, None].squeeze().shape

torch.Size([10000])

In [None]:
(res[:, 0] - y_valid).shape

torch.Size([10000])

In [None]:
# Convert the label tensors to float (they print as uint8 above) before using
# them as regression targets for the squared-error loss.
y_train, y_valid = y_train.float(), y_valid.float()

# Forward pass over the whole training set; the result keeps a trailing column of size 1.
preds = model(x_train)
preds.shape

torch.Size([60000, 1])

In [None]:
def mse(output, targ):
    """Mean squared error between column 0 of ``output`` and ``targ``."""
    err = output[:, 0] - targ
    return err.pow(2).mean()

In [None]:
mse(preds, y_train)

tensor(4160.43)

### Gradients and backward pass

In [None]:
from sympy import symbols, diff
x, y = symbols('x y')
diff(x**2, x)

2*x

In [None]:
diff(3*x**2 + 9, x)

6*x

In [None]:
def lin_grad(inp, out, w, b):
    """Backward step for a linear layer ``out = inp @ w + b``.

    Reads the upstream gradient from ``out.g`` and stores the resulting
    gradients as ``.g`` attributes on ``inp``, ``w`` and ``b``.
    """
    upstream = out.g
    # The bias is added to every row, so its gradient sums over the rows.
    b.g = upstream.sum(0)
    # Gradient with respect to the weights.
    w.g = inp.T @ upstream
    # Gradient with respect to the layer input.
    inp.g = upstream @ w.t()

In [None]:
def forward_and_backward(inp, targ):
    """Run the two-layer net on ``inp`` and backpropagate the MSE loss by hand.

    Uses the module-level ``w1``, ``b1``, ``w2`` and ``b2``. Nothing is
    returned; the gradients are left as ``.g`` attributes on ``inp`` and on
    the four parameter tensors (written by ``lin_grad``).
    """
    # ---- forward ----
    pre_act = lin(inp, w1, b1)
    hidden = relu(pre_act)
    out = lin(hidden, w2, b2)
    err = out[:, 0] - targ
    # The loss value itself is not needed by the backward pass below.
    loss = err.pow(2).mean()

    # ---- backward ----
    # Derivative of mean(err**2) with respect to each output: 2 * err / batch size.
    out.g = 2. * err[:, None] / inp.shape[0]
    lin_grad(hidden, out, w2, b2)
    # ReLU passes gradient only where its input was strictly positive.
    pre_act.g = (pre_act > 0).float() * hidden.g
    lin_grad(inp, pre_act, w1, b1)

In [None]:
forward_and_backward(x_train, y_train)

In [None]:
w1.g

tensor([[     0.07,     -0.03,     -0.06,  ...,      0.00,     -0.02,      0.15],
        [     0.18,     -0.08,     -0.17,  ...,      0.00,     -0.10,      0.40],
        [     0.32,     -0.15,     -0.28,  ...,      0.00,     -0.20,      0.71],
        ...,
        [     0.67,     -0.36,     -0.71,  ...,      0.00,     -0.28,      1.73],
        [     0.39,     -0.20,     -0.33,  ...,      0.00,     -0.19,      0.99],
        [     0.11,     -0.04,     -0.08,  ...,      0.00,     -0.05,      0.25]])

In [None]:
# Save for testing against later
def get_grad(x):
    """Return a copy of the gradient stored on ``x`` as ``x.g``."""
    stored = x.g
    return stored.clone()
# Tensors whose hand-computed .g gradients we want to keep.
chks = w1, w2, b1, b2, x_train
# Snapshot each gradient. The chained assignment binds the whole tuple to
# `grads` and its elements to individual names, in the same order as `chks`.
grads = w1g, w2g, b1g, b2g, ig = tuple(map(get_grad, chks))

We cheat a little bit and use PyTorch autograd to check our results.

In [None]:
def mkgrad(x):
    """Return a copy of ``x`` that has gradient tracking switched on."""
    duplicate = x.clone()
    duplicate.requires_grad_(True)
    return duplicate
# Autograd-enabled copies of the checked tensors, in the same order as `chks`.
ptgrads = w12, w22, b12, b22, xt2 = tuple(map(mkgrad, chks))

In [None]:
def forward(inp, targ):
    """Forward pass on the autograd-enabled parameter copies; returns the MSE loss.

    Reads the module-level tensors ``w12``, ``b12``, ``w22`` and ``b22``.
    """
    hidden = relu(lin(inp, w12, b12))
    return mse(lin(hidden, w22, b22), targ)

In [None]:
loss = forward(xt2, y_train)
loss.backward()

In [None]:
# Compare each hand-computed gradient with the .grad autograd produced for the matching copy.
for a, b in zip(grads, ptgrads): test_close(a, b.grad, eps=0.01)

### Refactor Model

#### Layers as classes

In [None]:
class A:
    """Tiny demo of ``__call__``: instances can be invoked like functions."""

    def __call__(self):
        print('hi')

In [None]:
A()()

hi


In [None]:
a = A(); a()

hi


In [None]:
class Relu():
    """ReLU layer that remembers its input and output for a manual backward pass."""

    def __call__(self, inp):
        activated = inp.clamp(min=0.)
        self.inp, self.out = inp, activated
        return activated

    def backward(self):
        # Gradient flows through only where the input was strictly positive.
        passthrough = (self.inp > 0).float()
        self.inp.g = passthrough * self.out.g

In [None]:
class Lin():
    """Linear layer holding ``w`` and ``b`` with a hand-written backward pass."""

    def __init__(self, w, b):
        self.w = w
        self.b = b

    def __call__(self, inp):
        # Keep input and output so backward() can use them later.
        self.inp = inp
        self.out = lin(inp, self.w, self.b)
        return self.out

    def backward(self):
        upstream = self.out.g
        # Bias gradient sums over rows; then weight and input gradients.
        self.b.g = upstream.sum(0)
        self.w.g = self.inp.t() @ upstream
        self.inp.g = upstream @ self.w.t()

In [None]:
class Mse():
    """Mean-squared-error loss with a hand-written backward pass."""

    def __call__(self, inp, targ):
        self.inp = inp
        self.targ = targ
        self.out = mse(inp, targ)
        return self.out

    def backward(self):
        n_rows = self.targ.shape[0]
        err = self.inp.squeeze() - self.targ
        # Derivative of the mean of squared errors; unsqueeze restores the trailing column.
        self.inp.g = 2. * err.unsqueeze(-1) / n_rows

In [None]:
class Model():
    """Linear -> ReLU -> Linear network with an MSE loss, built from layer objects."""

    def __init__(self, w1, b1, w2, b2):
        self.loss = Mse()
        self.layers = [Lin(w1, b1), Relu(), Lin(w2, b2)]

    def __call__(self, x, targ):
        # Feed the input through every layer in order, then score the result.
        acts = x
        for layer in self.layers:
            acts = layer(acts)
        loss_value = self.loss(acts, targ)
        return {'loss': loss_value, 'preds': acts}

    def backward(self):
        # Start at the loss, then walk the layers from last to first.
        self.loss.backward()
        for layer in self.layers[::-1]:
            layer.backward()

In [None]:
model = Model(w1, b1, w2, b2)

In [None]:
loss = model(x_train, y_train)

In [None]:
model.backward()

In [None]:
# The class-based backward pass must reproduce the gradients saved earlier
# (w2g, b2g, w1g, b1g, ig) to within eps.
test_close(w2g, w2.g, eps=0.01)
test_close(b2g, b2.g, eps=0.01)
test_close(w1g, w1.g, eps=0.01)
test_close(b1g, b1.g, eps=0.01)
test_close(ig, x_train.g, eps=0.01)

### Module.forward()

In [None]:
class Module():
    """Minimal base class for layers with a hand-written backward pass.

    Calling an instance runs ``forward`` and stores both the arguments and the
    result, so that ``backward`` can later pass them to ``bwd``. Subclasses
    override ``forward`` and ``bwd``.
    """

    def __call__(self, *args):
        # Keep inputs and output around: bwd() needs them to compute gradients.
        self.args = args
        self.out = self.forward(*args)
        return self.out

    def forward(self, *args):
        """Compute the layer output from ``args``; must be overridden.

        Accepts any arguments so that calling a subclass that forgot to
        override it raises the intended error rather than a ``TypeError``.
        """
        raise NotImplementedError('not implemented')

    def backward(self):
        """Call ``bwd`` with the output and inputs saved by the last call."""
        self.bwd(self.out, *self.args)

    def bwd(self, *args):
        """Store gradients given ``(out, *inputs)``; must be overridden."""
        raise NotImplementedError('not implemented')

In [None]:
class Relu(Module):
    """ReLU expressed through the ``Module`` forward/bwd interface."""

    def forward(self, inp):
        return inp.clamp(min=0.)

    def bwd(self, out, inp):
        # Gradient flows through only where the input was strictly positive.
        passthrough = (inp > 0).float()
        inp.g = passthrough * out.g

In [None]:
class Lin(Module):
    """Linear layer ``inp @ w + b`` expressed through the ``Module`` interface."""

    def __init__(self, w, b): self.w, self.b = w, b

    def forward(self, inp): return inp@self.w + self.b

    def bwd(self, out, inp):
        """Store gradients on ``inp``, ``self.w`` and ``self.b`` from ``out.g``.

        Uses the ``out`` argument it is handed (as the sibling layers do)
        rather than reaching back into ``self.out``.
        """
        inp.g = out.g @ self.w.t()
        self.w.g = inp.t() @ out.g
        # The bias is added to every row, so its gradient sums over the rows.
        self.b.g = out.g.sum(0)

In [None]:
class Mse(Module):
    """Mean-squared-error loss expressed through the ``Module`` interface."""

    def forward(self, inp, targ):
        err = inp.squeeze() - targ
        return err.pow(2).mean()

    def bwd(self, out, inp, targ):
        n_rows = targ.shape[0]
        err = inp.squeeze() - targ
        # Derivative of the mean of squared errors; unsqueeze restores the trailing column.
        inp.g = 2 * err.unsqueeze(-1) / n_rows

In [None]:
model = Model(w1, b1, w2, b2)

In [None]:
loss = model(x_train, y_train)

In [None]:
loss

{'loss': tensor(4160.43),
 'preds': tensor([[-133.06],
         [-192.51],
         [ -39.15],
         ...,
         [-108.78],
         [ -38.29],
         [  11.42]])}

In [None]:
model.backward()

In [None]:
# The Module-based layers must reproduce the gradients saved earlier
# (w2g, b2g, w1g, b1g, ig) to within eps.
test_close(w2g, w2.g, eps=0.01)
test_close(b2g, b2.g, eps=0.01)
test_close(w1g, w1.g, eps=0.01)
test_close(b1g, b1.g, eps=0.01)
test_close(ig, x_train.g, eps=0.01)

### Autograd

In [None]:
from torch import nn
import torch.nn.functional as F

In [None]:
class Linear(nn.Module):
    """Fully connected layer ``inp @ w + b`` whose tensors are tracked by autograd.

    Args:
        n_in: number of input features (rows of ``w``).
        n_out: number of output features (columns of ``w``, length of ``b``).
    """
    def __init__(self, n_in, n_out):
        super().__init__()
        # Wrapping in nn.Parameter registers the tensors with the module, so
        # they show up in parameters()/state_dict() and follow .to(); it also
        # turns on requires_grad, so .grad is populated exactly as before.
        self.w = nn.Parameter(torch.randn(n_in, n_out))
        self.b = nn.Parameter(torch.zeros(n_out))
        
    def forward(self, inp): return inp @ self.w + self.b

In [None]:
class Model(nn.Module):
    """Linear -> ReLU -> Linear network that returns the MSE loss against ``targ``.

    Args:
        n_in: number of input features.
        nh: number of hidden units.
        n_out: number of output units.
    """
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # nn.ModuleList (rather than a plain list) registers the sub-layers
        # with this module; indexing such as model.layers[0] still works.
        self.layers = nn.ModuleList([Linear(n_in, nh), nn.ReLU(), Linear(nh, n_out)])
        
    def forward(self, x, targ):
        """Run ``x`` through every layer and return the MSE loss against ``targ``.

        Defined as ``forward`` (not ``__call__``) so that ``model(x, targ)``
        goes through ``nn.Module.__call__`` and registered hooks still run.
        """
        for l in self.layers: x = l(x)
        # targ gets a trailing unit axis so it lines up with the column-shaped output.
        return F.mse_loss(x, targ[:, None])

In [None]:
# Build the autograd-based model with a single output unit, compute the loss on
# the training set, and let autograd populate .grad on the tracked tensors.
model = Model(m, nh, 1)
loss = model(x_train, y_train)
loss.backward()

In [None]:
l0 = model.layers[0]
l0.b.grad

tensor([ 72.82, -17.26, -10.03,   3.08,   0.24,  -5.18,  28.86,  34.96,  20.00,  -9.66,  30.65,   8.78,  -0.62,  -3.78,
         22.53,  16.52,   3.82,  -2.21,  33.31,   0.20,  -1.88,  -6.72, -20.30, -18.02,  -5.70,  39.50,  -5.39,  -2.07,
        -26.03, -20.24,  13.30,  -2.89, -15.64, -20.19,   1.42,  14.58, -11.81,   2.37,   4.86,  -2.59, -10.50,  60.86,
         -7.49,   9.51,   7.99, -26.85, -12.79,  -2.68,  -0.21,  53.31])