## The forward and backward passes

In [1]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl, numpy as np
from pathlib import Path
from torch import tensor
# from fastcore.test import test_close
# Seed torch's global RNG so the torch.randn weight initialisation further down is repeatable.
torch.manual_seed(42)

# Display-only settings: grayscale as matplotlib's default image colormap, and compact
# 2-decimal printing (no scientific notation for torch) in lines up to 125 characters wide.
mpl.rcParams['image.cmap'] = 'gray'
torch.set_printoptions(precision=2, linewidth=125, sci_mode=False)
np.set_printoptions(precision=2, linewidth=125)

# Location of the gzipped pickle, relative to the current working directory.
path_data = Path('data')
path_gz = path_data/'mnist.pkl.gz'
# The pickle unpacks into ((x_train, y_train), (x_valid, y_valid), <third element>);
# the third element is not used and is bound to `_`.
# NOTE(review): pickle.load can execute arbitrary code while deserialising — only run this
# on a file obtained from a trusted source.
with gzip.open(path_gz, 'rb') as f: ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
# Rebind the four loaded objects as torch tensors (presumably they load as numpy arrays — confirm).
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])

## Foundations version

### Basic architecture

In [2]:
# x_train is 2-D: n rows (one per training example) by m columns (values per example).
n,m = x_train.shape
# c = largest target value + 1; it stays a 0-dim tensor because .max() returns a tensor.
# Presumably this is the number of classes, assuming labels run 0..c-1 — confirm.
c = y_train.max()+1
n,m,c

(50000, 784, tensor(10))

In [3]:
# nh: number of hidden units, i.e. the output width of the first linear layer
# and the input width of the second one.
nh = 50

In [4]:
# Parameters of a two-layer network: layer 1 maps m inputs to nh hidden units,
# layer 2 maps nh hidden units to a single output.
# Weights are sampled with torch.randn (standard normal, no scaling applied); biases start at zero.
# The order of the randn calls matters: it determines which values each weight gets from the seeded RNG.
w1 = torch.randn((m,nh))
b1 = torch.zeros(nh)
w2 = torch.randn((nh,1)) # one output column instead of 10 to keep the demo simple; this is why the loss used below is MSE rather than a multi-class classification loss
b2 = torch.zeros(1)

In [5]:
def lin(x, w, b):
    """Affine (fully connected) layer: matrix-multiply `x` by `w`, then add `b`.

    Args:
        x: input whose last dimension matches the first dimension of `w`.
        w: weight matrix.
        b: bias, added to the product using the usual broadcasting rules.

    Returns:
        The value of `x @ w + b`.
    """
    projected = x @ w
    return projected + b

In [6]:
# Shape of the validation inputs, checked before pushing them through the first layer.
x_valid.shape

torch.Size([10000, 784])

In [7]:
# Apply the first linear layer to the whole validation set and check the result's shape:
# one row per example, nh columns.
t = lin(x_valid, w1, b1)
t.shape

torch.Size([10000, 50])

In [8]:
def relu(x):
    """Rectified linear unit: return `x` with every value below zero replaced by zero.

    The input is not modified; `clamp_min` produces a new tensor.
    """
    floor = 0.
    return x.clamp_min(floor)

In [9]:
# Rebind t to its rectified version: every negative activation becomes 0.
# Re-running this cell is harmless because applying relu twice gives the same result.
t = relu(t)
t

tensor([[ 0.00, 11.87,  0.00,  ...,  5.48,  2.14, 15.30],
        [ 5.38, 10.21,  0.00,  ...,  0.88,  0.08, 20.23],
        [ 3.31,  0.12,  3.10,  ..., 16.89,  0.00, 24.74],
        ...,
        [ 4.01, 10.35,  0.00,  ...,  0.23,  0.00, 18.28],
        [10.62,  0.00, 10.72,  ...,  0.00,  0.00, 18.23],
        [ 2.84,  0.00,  1.43,  ...,  0.00,  5.75,  2.12]])

In [10]:
def model(xb):
    """Forward pass of the two-layer network: linear -> ReLU -> linear.

    Reads the module-level parameters `w1`, `b1`, `w2` and `b2`.

    Args:
        xb: batch of inputs fed to the first linear layer.

    Returns:
        The output of the second linear layer.
    """
    hidden = relu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)

In [11]:
# Full forward pass on the validation set; the result has a single prediction column.
res = model(x_valid)
res.shape

torch.Size([10000, 1])

### Loss function: MSE

In [12]:
# Predictions are (N, 1) while targets are (N,): subtracting them directly would broadcast
# to an (N, N) result, so the trailing axis of the predictions has to be dropped first.
res.shape, y_valid.shape

(torch.Size([10000, 1]), torch.Size([10000]))

In [13]:
# Flatten the (N, 1) predictions to (N,) so the subtraction is element-wise.
(res.reshape(-1) - y_valid).shape
# equivalent ways to drop the trailing axis: res.squeeze(-1) or res[:,0]

torch.Size([10000])

In [14]:
# Rebind the targets as float tensors so they can be used as regression targets for MSE.
# Calling .float() on a tensor that is already float leaves it unchanged, so this cell can be re-run safely.
y_train,y_valid = y_train.float(), y_valid.float()

# Forward pass on the training set; preds keeps the trailing single-column axis.
preds = model(x_train)
preds.shape

torch.Size([50000, 1])

In [15]:
def mse(output, targ):
    """Mean squared error between the first column of `output` and `targ`.

    Args:
        output: 2-D predictions; only column 0 is used.
        targ: targets, subtracted from that column.

    Returns:
        The mean of the squared differences.
    """
    residual = output[:, 0] - targ
    return residual.pow(2).mean()

In [16]:
# Loss of the untrained, randomly initialised network on the training set.
mse(preds, y_train)

tensor(4308.76)

### Gradients and backward pass