In [2]:
import gzip, pickle, torch, matplotlib as mpl
from pathlib import Path
import numpy as np

# Seed PyTorch's global RNG so the random weight initialisation further down is repeatable.
torch.manual_seed(42)
# Display-only settings: 2 decimal places and 140-character lines for both libraries,
# plus no scientific notation for torch tensors.
torch.set_printoptions(precision=2, linewidth=140, sci_mode=False)
np.set_printoptions(precision=2, linewidth=140)

In [3]:
file = Path('data/mnist.pkl.gz')

In [4]:
file.exists()

True

In [5]:
# Read the gzip-compressed pickle at `file` into `data`.
# NOTE(review): unpickling can execute arbitrary code, so only run this cell on a
# copy of the file that comes from a trusted source.
with gzip.open(file, 'rb') as f:
    # encoding='latin' tells pickle how to decode 8-bit strings — presumably needed
    # because the file was pickled under Python 2; TODO confirm.
    data = pickle.load(f, encoding='latin')

In [6]:
# `data` is a 3-tuple: the first two elements are (inputs, labels) pairs that become the
# training and validation sets; the third element is discarded.
((x_train, y_train), (x_val, y_val), _) = data
# Bare last expression so the notebook displays the four shapes.
x_train.shape, y_train.shape, x_val.shape, y_val.shape

((50000, 784), (50000,), (10000, 784), (10000,))

In [7]:
# Convert each of the four data sets to a torch tensor, rebinding the same names.
x_train, y_train, x_val, y_val = (
    torch.tensor(arr) for arr in (x_train, y_train, x_val, y_val)
)
# Show the shapes again to confirm nothing changed in the conversion.
x_train.shape, y_train.shape, x_val.shape, y_val.shape

(torch.Size([50000, 784]),
 torch.Size([50000]),
 torch.Size([10000, 784]),
 torch.Size([10000]))

#### One-hidden-layer neural network with ReLU activation (feed-forward network, FFN)

In [8]:
# m = number of training rows, n = number of input features per row.
m, n = x_train.shape
# Largest label + 1, i.e. the class count if labels are 0-based integers (assumed — TODO confirm).
# NOTE(review): this is a 0-dim tensor, not a Python int (see the displayed value);
# wrap in int(...) or call .item() if a plain int is needed.
c = y_train.max() + 1
m, n, c

(50000, 784, tensor(10))

In [9]:
# Hidden-layer width: used as the second dimension of w1 and the first dimension of w2.
nh = 50

In [10]:
# Parameters of the two linear layers: n -> nh, then nh -> 1 (one scalar output per row).
# Weights are drawn from an unscaled standard normal; biases start at zero.
# Keep the two randn calls in this order: together with the seed set earlier it fixes
# which values each weight matrix receives.
w1 = torch.randn(n, nh)
b1 = torch.zeros(nh)
w2 = torch.randn(nh, 1)
b2 = torch.zeros(1)

In [11]:
def lin(x, w, b):
    """Affine map: matrix-multiply `x` by `w`, then add `b` (broadcast as usual)."""
    projected = x @ w
    return projected + b

In [12]:
t = lin(x_val, w1, b1)

In [13]:
t, t.shape

(tensor([[ -0.09,  11.87, -11.39,  ...,   5.48,   2.14,  15.30],
         [  5.38,  10.21, -14.49,  ...,   0.88,   0.08,  20.23],
         [  3.31,   0.12,   3.10,  ...,  16.89,  -6.05,  24.74],
         ...,
         [  4.01,  10.35, -11.25,  ...,   0.23,  -5.30,  18.28],
         [ 10.62,  -4.27,  10.72,  ...,  -2.87,  -2.87,  18.23],
         [  2.84,  -0.22,   1.43,  ...,  -3.91,   5.75,   2.12]]),
 torch.Size([10000, 50]))

In [14]:
def relu(x):
    """Rectified linear unit: negative entries become 0, all others pass through.

    Returns a new tensor; `x` itself is not modified.
    """
    return x.clamp(min=0)

In [15]:
def model(xb):
    """Forward pass of the one-hidden-layer network on the batch `xb`.

    Applies linear (w1, b1) -> relu -> linear (w2, b2) using the notebook-level
    parameters, and returns the output of the second linear layer.
    """
    hidden = relu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)

In [16]:
res = model(x_val)

In [17]:
res.shape

torch.Size([10000, 1])

#### loss function (MSE)

In [18]:
res.shape, y_val.shape

(torch.Size([10000, 1]), torch.Size([10000]))

In [19]:
# Pitfall: `res` is (10000, 1) but `y_val` is (10000,), so this subtraction broadcasts to a
# (10000, 10000) matrix of every prediction minus every target, not one difference per row.
(res - y_val)

tensor([[  22.75,   17.75,   19.75,  ...,   20.75,   19.75,   17.75],
        [ -16.06,  -21.06,  -19.06,  ...,  -18.06,  -19.06,  -21.06],
        [-117.79, -122.79, -120.79,  ..., -119.79, -120.79, -122.79],
        ...,
        [ -70.44,  -75.44,  -73.44,  ...,  -72.44,  -73.44,  -75.44],
        [ -77.48,  -82.48,  -80.48,  ...,  -79.48,  -80.48,  -82.48],
        [ -63.19,  -68.19,  -66.19,  ...,  -65.19,  -66.19,  -68.19]])

In [20]:
y_val.view(-1, 1).shape

torch.Size([10000, 1])

In [21]:
(res - y_val.view(-1, 1)).pow(2).mean()

tensor(4154.01)

In [22]:
(res - y_val.view(-1, 1)).pow(2).sum()/(res - y_val.view(-1, 1)).shape[0]

tensor(4154.01)

In [23]:
(res - y_val.view(-1, 1)).shape[0]

10000

In [24]:
# Only the diagonal of the broadcast (res - y_val) matrix pairs each prediction with its own
# target, so it should match the per-row difference obtained by reshaping y_val to a column.
torch.diag((res - y_val)).allclose((res - y_val.view(-1, 1)).view(-1))

True

In [25]:
# Cast the labels to float32, rebinding the same names — presumably so the targets share
# the predictions' dtype for the MSE loss below; TODO confirm.
y_train, y_val = y_train.float(), y_val.float()

In [26]:
def mse(out, targ):
    """Mean squared error between predictions `out` and targets `targ`.

    `targ` is viewed as a single column before subtracting, so a 1-D target
    lines up row-for-row with an (N, 1) prediction instead of broadcasting
    to an N x N matrix.
    """
    targ_col = targ.view(-1, 1)
    squared_err = (out - targ_col).pow(2)
    return squared_err.mean()

In [27]:
pred = model(x_train)

In [28]:
mse(pred, y_train)

tensor(4308.76)

#### gradients and backward pass

In [29]:
from sympy import symbols, diff

In [30]:
x, y = symbols('x y')

In [31]:
diff(x**2, x)

2*x

In [32]:
def lin_grad(out, x, w, b):
    """Backward pass of the linear layer `out = x @ w + b`.

    Reads the upstream gradient from `out.g` (same shape as `out`) and stores
    the resulting gradients as `.g` attributes on `b`, `w` and `x`.

    Args:
        out: output of the layer; `out.g` must already hold dLoss/d(out).
        x: input batch, shape (batch, n_in).
        w: weight matrix, shape (n_in, n_out).
        b: bias vector, shape (n_out,).
    """
    # The bias is added to every row, so its gradient is the upstream gradient summed over the batch.
    b.g = out.g.sum(0)
    # dLoss/dw = x^T @ dLoss/d(out): shape (n_in, n_out), matching `w`.
    # (The previous `out.g * x.t().sum(1)` summed x first and then multiplied
    # element-wise, which yields the wrong shape or a broadcast error.)
    w.g = x.t() @ out.g
    # dLoss/dx = dLoss/d(out) @ w^T: shape (batch, n_in), matching `x`.
    x.g = out.g @ w.t()

In [34]:
def forward_backward(xb):
    l1 = lin(xb, w1, b1)
    l2 = relu(l1)
    out = lin(l2, w2, b2)
    