# The forward and backward passes

In [None]:
# !pip install deeplake

In [None]:
import pickle, gzip, math, os, time, shutil, torch, matplotlib as mpl, numpy as np
from pathlib import Path
from torch import tensor
from fastcore.test import test_close
import deeplake
from sklearn.model_selection import train_test_split
from collections import Counter
import os
# Seed PyTorch's RNG so the torch.randn draws in this notebook are repeatable.
torch.manual_seed(42)

# Display settings: grayscale colormap for images, compact printing for tensors/arrays.
mpl.rcParams['image.cmap'] = 'gray'
torch.set_printoptions(precision=2, linewidth=125, sci_mode=False)
np.set_printoptions(precision=2, linewidth=125)

# Tell deeplake where to keep the local copy, then load the notMNIST-small dataset from it.
os.environ['DEEPLAKE_DOWNLOAD_PATH'] = '../../data/'
ds = deeplake.load('hub://activeloop/not-mnist-small', access_method='local')

# Flatten every image into a 784-element float32 row; drop the trailing unit axis of the labels.
images = ds.tensors['images'].numpy().reshape(-1, 784).astype('float32')
labels = ds.tensors['labels'].numpy().squeeze(-1).astype('int')

# Hold out 20% of the samples for validation (random_state fixes the split).
x_train, x_valid, y_train, y_valid = train_test_split(images, labels, test_size=0.2, random_state=1)
# Convert the four NumPy arrays to torch tensors.
x_train, y_train, x_valid, y_valid = map(tensor, (x_train, y_train, x_valid, y_valid))
# Rescale pixel values by 1/255 (presumably raw pixels are in 0..255 -- confirm against the dataset).
x_train, x_valid = x_train/255., x_valid/255.

-

Opening dataset in read-only mode as you don't have write permissions.


 


** Loaded local copy of dataset from ../../data/hub_activeloop_not-mnist-small. Downloaded on: Wed Dec 13 20:52:19 2023


In [None]:
x_train.shape, y_train.shape, x_valid.shape, y_valid.shape

(torch.Size([14979, 784]),
 torch.Size([14979]),
 torch.Size([3745, 784]),
 torch.Size([3745]))

In [None]:
Counter(list(labels))

Counter({0: 1872,
         1: 1873,
         2: 1873,
         3: 1873,
         4: 1873,
         5: 1872,
         6: 1872,
         7: 1872,
         8: 1872,
         9: 1872})

## Foundations version

### Basic architecture

In [None]:
# n = number of training samples, m = number of features per sample
n, m = x_train.shape
# number of classes, taken as the largest label + 1 (note: this is a 0-d tensor, not a Python int)
c = y_train.max() + 1
n, m, c

(14979, 784, tensor(10))

In [None]:
# number of units in the hidden layer
nh = 50

In [None]:
# First layer: m inputs -> nh hidden units. Weights are standard-normal draws, biases start at zero.
w1 = torch.randn(m, nh)
b1 = torch.zeros(nh)
# Second layer: nh hidden units -> a single output value.
w2 = torch.randn(nh, 1)
b2 = torch.zeros(1)

In [None]:
def lin(x, w, b):
    """Affine layer: matrix-multiply `x` by `w`, then add the bias `b`."""
    projected = x @ w
    return projected + b

In [None]:
x_valid.shape

torch.Size([3745, 784])

In [None]:
t = lin(x_valid, w1, b1)
t.shape

torch.Size([3745, 50])

In [None]:
def relu(x):
    """Rectified linear unit: return a copy of `x` with values below zero raised to zero."""
    rectified = x.clamp_min(0.)
    return rectified

In [None]:
t = relu(t)
t

tensor([[ 0.00,  8.33,  0.00,  ...,  0.70,  0.00, 36.05],
        [ 6.36,  8.32,  2.91,  ...,  0.00,  0.00, 48.31],
        [ 0.00,  0.00, 16.72,  ...,  0.00,  4.32, 31.27],
        ...,
        [13.73,  4.81, 12.67,  ...,  0.00,  0.00, 16.93],
        [ 0.00,  3.24, 13.57,  ..., 12.05,  0.00, 16.29],
        [ 0.00, 25.35, 18.67,  ..., 20.08,  0.00, 22.78]])

In [None]:
def model(xb):
    """Forward pass of the two-layer network: linear -> relu -> linear.

    Uses the module-level weights `w1`, `b1`, `w2`, `b2`.
    """
    hidden = relu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)

In [None]:
res = model(x_valid)
res.shape

torch.Size([3745, 1])

## Loss Function: MSE

(Of course, `mse` is not a suitable loss function for multi-class classification; we'll use a better loss function soon. For now we use `mse` to keep things simple.)

In [None]:
res.shape, y_valid.shape

(torch.Size([3745, 1]), torch.Size([3745]))

In [None]:
(res-y_valid).shape

torch.Size([3745, 3745])

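We need to get rid of that trailing unit dimension `(, 1)` in order to use `mse`: otherwise subtracting a `(3745,)` tensor from a `(3745, 1)` tensor broadcasts to the `(3745, 3745)` result shown above.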

In [None]:
res[:,0].shape

torch.Size([3745])

In [None]:
res.squeeze().shape

torch.Size([3745])

In [None]:
res[None].shape, res[None].squeeze().shape  # squeeze() with no argument removes every unit dimension

(torch.Size([1, 3745, 1]), torch.Size([3745]))

In [None]:
(res[:, 0]- y_valid).shape

torch.Size([3745])

In [None]:
# Cast the labels to float tensors; they are used as the regression targets passed to mse.
y_train, y_valid = y_train.float(), y_valid.float()

# Predictions of the (still untrained) model on the training set.
preds = model(x_train)
preds.shape

torch.Size([14979, 1])

In [None]:
def mse(output, targ):
    """Mean squared error between column 0 of `output` and `targ`."""
    residual = output[:, 0] - targ
    squared = residual.pow(2)
    return squared.mean()

In [None]:
mse(preds, y_train)

tensor(9223.19)

## Gradients and backward pass

In [None]:
from sympy import symbols, diff
x, y = symbols('x y')
diff(x**2, x)

2*x

In [None]:
diff(3*x**2 + 9, x)

6*x

In [None]:
def lin_grad(inp, out, w, b):
    """Back-propagate through the linear layer `out = inp @ w + b`.

    Reads the upstream gradient from `out.g` and stores the resulting
    gradients as `.g` attributes on `inp`, `w` and `b`.

    Expects a 2-D `inp` of shape (batch, n_in) and `out.g` of shape
    (batch, n_out).
    """
    inp.g = out.g @ w.t()
    # A matrix product gives the same sum over the batch as broadcasting to a
    # (batch, n_in, n_out) tensor and summing, without the huge temporary.
    w.g = inp.t() @ out.g
    # The bias is added to every row, so its gradient is the sum over the batch.
    b.g = out.g.sum(0)

In [None]:
def forward_and_backward(inp, targ):
    """Run the two-layer network forward, then back-propagate the MSE loss by hand.

    Uses the module-level weights `w1`, `b1`, `w2`, `b2`. Gradients are stored
    as `.g` attributes on the weights, the biases and `inp` (via `lin_grad`).

    Returns the scalar MSE loss of the forward pass.
    """
    # forward pass
    l1 = lin(inp, w1, b1)
    l2 = relu(l1)
    out = lin(l2, w2, b2)
    err = out[:, 0] - targ
    loss = err.pow(2).mean()

    # backward pass
    # Gradient of mean(err**2) with respect to out: 2 * err / batch size.
    out.g = 2 * err[:, None] / inp.shape[0]
    lin_grad(l2, out, w2, b2)
    # relu passes the gradient through only where its input was positive.
    l1.g = (l1 > 0).float() * l2.g
    lin_grad(inp, l1, w1, b1)
    return loss

In [None]:
forward_and_backward(x_train, y_train)

In [None]:
# Save for testing against later
def get_grad(x):
    """Return a copy of the hand-computed gradient stored in `x.g`."""
    stored = x.g
    return stored.clone()
# Tensors whose hand-computed gradients we keep for later comparison.
chks = w1, w2, b1, b2, x_train
# Snapshot (clone) each tensor's .g, in the same order as chks.
grads = w1g, w2g, b1g, b2g, ig = list(map(get_grad, chks))

We cheat a little bit and use PyTorch autograd to check the gradients we computed by hand.

In [None]:
def mkgrad(x):
    """Return a clone of `x` with gradient tracking switched on."""
    tracked = x.clone()
    return tracked.requires_grad_(True)
# Autograd-tracked clones of the tensors in chks (same order), leaving the originals untouched.
ptgrads = w12, w22, b12, b22, xt2 = list(map(mkgrad, chks))

In [None]:
def forward(inp, targ):
    """Forward pass on the autograd-tracked weights; returns the MSE loss.

    Uses the module-level tensors `w12`, `b12`, `w22`, `b22`.
    """
    hidden = relu(lin(inp, w12, b12))
    prediction = lin(hidden, w22, b22)
    return mse(prediction, targ)

In [None]:
loss = forward(xt2, y_train)
loss.backward()

In [None]:
# Check each hand-computed gradient against the matching autograd .grad;
# eps=1e-1 is the tolerance handed to test_close.
for a, b in zip(grads, ptgrads): 
    test_close(a, b.grad, eps=1e-1)

## Refactor Model

### Layers as classes

In [None]:
class Relu():
    """ReLU layer that remembers its input and output for a manual backward pass."""

    def __call__(self, inp):
        # Keep both tensors: backward() reads out.g and writes inp.g.
        self.inp, self.out = inp, inp.clamp_min(0.)
        return self.out

    def backward(self):
        # The gradient flows through only where the input was positive.
        passed = (self.inp > 0).float()
        self.inp.g = passed * self.out.g

In [None]:
class Lin():
    """Linear layer over given weights `w` and bias `b`, with a manual backward pass."""

    def __init__(self, w, b):
        self.w = w
        self.b = b

    def __call__(self, inp):
        # Remember input and output: backward() reads out.g and writes inp.g.
        self.inp = inp
        self.out = lin(inp, self.w, self.b)
        return self.out

    def backward(self):
        upstream = self.out.g
        self.inp.g = upstream @ self.w.t()
        self.w.g = self.inp.t() @ upstream
        # The bias is added to every row, so sum its gradient over the batch.
        self.b.g = upstream.sum(0)

In [None]:
class Mse():
    """Mean-squared-error loss that remembers its arguments for a manual backward pass."""

    def __call__(self, inp, targ):
        self.inp = inp
        self.targ = targ
        self.out = mse(inp, targ)
        return self.out

    def backward(self):
        # Gradient of mean((inp - targ)**2) with respect to inp: 2 * (inp - targ) / batch size.
        residual = self.inp.squeeze() - self.targ
        self.inp.g = 2 * residual.unsqueeze(-1) / self.targ.shape[0]

In [None]:
class Model():
    """Two-layer network (Lin -> Relu -> Lin) with an Mse loss and a manual backward pass."""

    def __init__(self, w1, b1, w2, b2):
        self.layers = [Lin(w1, b1), Relu(), Lin(w2, b2)]
        self.loss = Mse()

    def __call__(self, x, targ):
        # Feed the activations through each layer in turn, then score them.
        for layer in self.layers:
            x = layer(x)
        return self.loss(x, targ)

    def backward(self):
        # Start at the loss, then walk the layers from last to first.
        self.loss.backward()
        for layer in reversed(self.layers):
            layer.backward()

In [None]:
model = Model(w1, b1, w2, b2)

In [None]:
loss = model(x_train, y_train)

In [None]:
model.backward()

In [None]:
test_close(w2g, w2.g, eps=0.1)
test_close(b2g, b2.g, eps=0.1)
test_close(w1g, w1.g, eps=0.1)
test_close(b1g, b1.g, eps=0.1)
test_close(ig, x_train.g, eps=0.1)

### Module.forward()

In [None]:
class Module():
    """Minimal base class for layers with a hand-written backward pass.

    Subclasses implement `forward(*args)` and `bwd(out, *args)`. Calling the
    module stores the arguments and the output so that `backward()` can hand
    them to `bwd`.
    """
    def __call__(self, *args):
        self.args = args
        self.out = self.forward(*args)
        return self.out

    # The defaults accept any arguments so that a missing override reports
    # "not implemented" rather than a TypeError about the argument count.
    def forward(self, *args): raise NotImplementedError('not implemented')
    def backward(self): self.bwd(self.out, *self.args)
    def bwd(self, out, *args): raise NotImplementedError('not implemented')

In [None]:
class Relu(Module):
    """ReLU layer built on Module."""

    def forward(self, inp):
        return inp.clamp_min(0.)

    def bwd(self, out, inp):
        # The gradient flows through only where the input was positive.
        passed = (inp > 0).float()
        inp.g = passed * out.g

In [None]:
class Lin(Module):
    """Linear layer built on Module, over given weights `w` and bias `b`."""

    def __init__(self, w, b): self.w, self.b = w, b

    def forward(self, inp): return inp@self.w + self.b

    def bwd(self, out, inp):
        # Use the `out` handed in by Module.backward (the same object as
        # self.out), as the other layers' bwd methods do.
        inp.g = out.g @ self.w.t()
        self.w.g = inp.t() @ out.g
        # The bias is added to every row, so sum its gradient over the batch.
        self.b.g = out.g.sum(0)

In [None]:
class Mse(Module):
    """Mean-squared-error loss built on Module."""

    def forward(self, inp, targ):
        # Keep the residual: bwd reuses it for the gradient.
        self.diff = inp.squeeze() - targ
        squared = self.diff.pow(2)
        return squared.mean()

    def bwd(self, out, inp, targ):
        # Gradient of mean(diff**2) with respect to inp: 2 * diff / batch size.
        batch = targ.shape[0]
        inp.g = 2 * self.diff.unsqueeze(-1) / batch

In [None]:
model = Model(w1, b1, w2, b2)

In [None]:
loss = model(x_train, y_train)

In [None]:
model.backward()

In [None]:
test_close(w2g, w2.g, eps=0.1)
test_close(b2g, b2.g, eps=0.1)
test_close(w1g, w1.g, eps=0.1)
test_close(b1g, b1.g, eps=0.1)
test_close(ig, x_train.g, eps=0.1)

### Autograd

In [None]:
from torch import nn
import torch.nn.functional as F

In [None]:
class Linear(nn.Module):
    """Linear layer whose gradients are computed by PyTorch autograd.

    `w` has shape (n_in, n_out) and is drawn from a standard normal; `b` has
    shape (n_out,) and starts at zero.
    """
    def __init__(self, n_in, n_out):
        super().__init__()
        # nn.Parameter registers the tensors with the module, so parameters(),
        # state_dict() and .to() see them; requires_grad is on by default.
        self.w = nn.Parameter(torch.randn(n_in, n_out))
        self.b = nn.Parameter(torch.zeros(n_out))

    def forward(self, inp): return inp@self.w + self.b

In [None]:
class Model(nn.Module):
    """Two-layer network (Linear -> ReLU -> Linear) that returns its MSE loss."""

    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # nn.ModuleList registers the layers as sub-modules (a plain list would
        # hide them from the module) and still supports indexing and iteration.
        self.layers = nn.ModuleList([Linear(n_in, nh), nn.ReLU(), Linear(nh, n_out)])

    def forward(self, x, targ):
        for l in self.layers: x = l(x)
        # Give targ a trailing axis so it lines up with the 2-D network output.
        return F.mse_loss(x, targ[:, None])

In [None]:
# Build the autograd model with m inputs, nh hidden units and a single output.
model = Model(m, nh, 1)
# Calling the model returns the MSE loss; backward() fills in .grad on the tracked tensors.
loss = model(x_train, y_train)
loss.backward()

In [None]:
l0 = model.layers[0]
l0.b.grad

tensor([    -2.27,     -4.59,     -0.10,      4.55,      1.04,    -10.40,    -41.68,      1.54,      5.36,     20.64,
            10.28,     36.80,    -11.65,    -24.00,      3.27,    101.15,    -18.80,     26.60,    -12.04,     14.69,
            40.24,     38.33,    -21.00,    -15.40,    -18.52,     -7.08,     -0.07,    -20.08,    -22.04,     -6.98,
             5.13,      4.92,     12.92,     57.12,     37.86,     59.58,      2.06,      8.23,     59.37,     -4.24,
             9.57,     -0.42,    -83.99,     -9.44,     -0.90,      0.53,     16.30,     21.97,     -2.01,    -20.81])