### Our goal: create a neural network from scratch

In [1]:
import pandas as pd
import numpy as np
import torch
from torchvision.datasets import MNIST
import math
from functools import partial

### Get the data

In [2]:
# Bind the arguments shared by both splits once, then request each split.
load_mnist = partial(MNIST, '../', download=True)
trainset = load_mnist(train=True)
testset = load_mnist(train=False)

In [3]:
# Expect 60,000 training and 10,000 test images of 28x28 pixels: classic MNIST.
trainset.data.shape, testset.data.shape

(torch.Size([60000, 28, 28]), torch.Size([10000, 28, 28]))

In [4]:
# Flatten every image into one row vector (one row per image). The row count is
# taken from the data itself instead of hard-coding the size of each split.
train = trainset.data.flatten(start_dim=1)
test = testset.data.flatten(start_dim=1)

In [5]:
# Labels for the two splits.
y_train, y_test = trainset.targets, testset.targets

In [6]:
# Floating-point numbers work better with PyTorch, so check the current dtype first.
train.dtype

torch.uint8

In [7]:
# Cast both splits to float32 in one go.
train, test = (split.to(torch.float32) for split in (train, test))

In [8]:
# The data isn't normalized yet: measure the mean and standard deviation of the
# training split so they can be used for standardization.
m = train.mean()
std = train.std()
m, std

(tensor(33.3184), tensor(78.5675))

In [9]:
# Standardize both splits with the training statistics (m, std).
# Note: train/test are overwritten, so running this cell twice normalizes twice.
train, test = ((split - m) / std for split in (train, test))
print(train.mean(), train.std()) # 0 and 1, good

tensor(1.3679e-06) tensor(1.)


### Define a fully connected neural net

In [10]:
def linear(x, neurons=50):
    """Apply a freshly initialised fully connected layer to ``x``.

    New random parameters are drawn on every call, so this is a shape/scale
    demonstration rather than a trainable layer.

    Args:
        x: 2-D tensor of shape (n, m).
        neurons: number of output features.

    Returns:
        Tensor of shape (n, neurons) equal to ``x @ w + b``.
    """
    fan_in = x.shape[1]
    # Kaiming He scaling: standard deviation sqrt(2 / fan_in).
    he_scale = math.sqrt(2 / fan_in)
    weights = torch.randn(fan_in, neurons) * he_scale
    bias = torch.randn(neurons)
    return x @ weights + bias

In [11]:
def relu(x):
    """Rectified linear unit: replace every negative entry of ``x`` with 0."""
    return x.clamp(min=0.)

In [12]:
def model(x):
    """Two-layer network: linear (50 units) -> ReLU -> linear (10 outputs)."""
    hidden = relu(linear(x, 50))
    return linear(hidden, 10)

In [13]:
# Push the whole training set through the (randomly initialised) model.
output = model(train)

In [14]:
# Ten predictions (one per class) for every row, sounds good
output.shape

torch.Size([60000, 10])

### Define a loss function

In [15]:
def log_softmax(output):
    """Naive log-softmax over the last dimension: log(exp(x) / sum(exp(x))).

    This is the textbook formula written out directly. ``exp`` can overflow for
    large inputs; the log-sum-exp based definition further down in this
    notebook is the numerically safer variant.
    """
    exps = output.exp()
    probs = exps / exps.sum(-1, keepdim=True)
    return probs.log()

In [None]:
# log-sum-exp-trick:          https://www.xarg.org/2016/06/the-log-sum-exp-trick-in-machine-learning/
def logsumexp(x):
    """Numerically stable log(sum(exp(x))) over the last dimension.

    The per-row maximum is subtracted before exponentiating so that the largest
    exponent is exp(0) = 1, then added back after the log.

    Args:
        x: tensor with at least one dimension.

    Returns:
        Tensor with the last dimension of ``x`` reduced away.
    """
    # keepdim=True lets the maximum broadcast against x for any number of
    # dimensions, not only for 2-D input.
    m = x.max(-1, keepdim=True)[0]
    return m.squeeze(-1) + (x - m).exp().sum(-1).log()

def log_softmax(x): 
    return x - x.logsumexp(-1,keepdim=True)

In [17]:
# negative log likelihood
def nll(output, y_train):
    """Negative log likelihood.

    Args:
        output: 2-D tensor of log-probabilities, one row per sample.
        y_train: 1-D integer tensor with the target class of every row.

    Returns:
        Scalar tensor: minus the mean log-probability assigned to the targets.
    """
    row_idx = range(y_train.shape[0])
    # Pick, for every row, the entry in the column of its target class.
    target_log_probs = output[row_idx, y_train]
    return -target_log_probs.mean()

In [18]:
# Loss of the randomly initialised model on the training set.
nll(log_softmax(output), y_train)

tensor(3.9627)

### Forward pass

In [19]:
# Forward pass: run the training set through the model, then score the result.
output = model(train)
loss = nll(log_softmax(output), y_train)
print('output shape:', output.shape)
print('loss', loss)

output shape: torch.Size([60000, 10])
loss tensor(3.8240)


### Backward pass

In [20]:
# parameters we can change:
EPOCHS = 35       # number of full-batch gradient steps
INPUT = train
TARGETS = y_train
N_LAYERS = 2      # number of linear layers, must be >= 1
LR = 0.3          # gradient-descent step size

# Automated fully connected neural net creation based on above parameters.
# Layer widths go linearly from 50 down to 10 outputs. A single layer is pinned
# to 10 outputs, because np.linspace(50, 10, 1) would return [50].
if N_LAYERS < 1:
    raise ValueError("N_LAYERS must be at least 1")
if N_LAYERS == 1:
    activations = np.array([10])
else:
    activations = np.linspace(50, 10, N_LAYERS).astype(int)
# shapes[k] is the input width of layer k and shapes[k + 1] its output width.
# Plain Python ints are stored so torch never receives numpy scalars as sizes.
shapes = [INPUT.shape[1]]
shapes.extend(int(width) for width in activations)
w = []
b = []
for layer in range(N_LAYERS):
    fan_in, fan_out = shapes[layer], shapes[layer + 1]
    # Kaiming He scaling for the weights, zeros for the biases.
    w.append(torch.randn(fan_in, fan_out) * math.sqrt(2 / fan_in))
    b.append(torch.zeros(fan_out))
    # Turn on gradient tracking only after the scaling, so the stored tensors
    # are the ones that receive .grad.
    w[-1].requires_grad_()
    b[-1].requires_grad_()

# Training loop
for epoch in range(EPOCHS):
    # Forward pass: ReLU after every layer except the last one.
    output = INPUT
    for layer in range(N_LAYERS-1):
        output = relu(output @ w[layer] + b[layer])

    output = output @ w[N_LAYERS-1] + b[N_LAYERS-1]

    loss = nll(log_softmax(output), TARGETS)
    print(loss)
    loss.backward()
    # Gradient-descent step; no_grad keeps the in-place updates out of autograd.
    with torch.no_grad():
        for layer in range(N_LAYERS):
            w[layer] -= w[layer].grad * LR
            b[layer] -= b[layer].grad * LR
            # Gradients accumulate across backward() calls, so reset them.
            w[layer].grad.zero_()
            b[layer].grad.zero_()

tensor(3.6006, grad_fn=<NegBackward>)
tensor(2.5323, grad_fn=<NegBackward>)
tensor(2.1176, grad_fn=<NegBackward>)
tensor(2.2939, grad_fn=<NegBackward>)
tensor(1.8105, grad_fn=<NegBackward>)
tensor(1.6865, grad_fn=<NegBackward>)
tensor(1.5013, grad_fn=<NegBackward>)
tensor(1.3421, grad_fn=<NegBackward>)
tensor(1.1259, grad_fn=<NegBackward>)
tensor(0.9166, grad_fn=<NegBackward>)
tensor(0.8209, grad_fn=<NegBackward>)
tensor(0.8318, grad_fn=<NegBackward>)
tensor(0.9769, grad_fn=<NegBackward>)
tensor(0.9921, grad_fn=<NegBackward>)
tensor(0.9432, grad_fn=<NegBackward>)
tensor(0.6969, grad_fn=<NegBackward>)
tensor(0.5547, grad_fn=<NegBackward>)
tensor(0.4987, grad_fn=<NegBackward>)
tensor(0.4755, grad_fn=<NegBackward>)
tensor(0.4633, grad_fn=<NegBackward>)
tensor(0.4624, grad_fn=<NegBackward>)
tensor(0.4721, grad_fn=<NegBackward>)
tensor(0.5215, grad_fn=<NegBackward>)
tensor(0.5402, grad_fn=<NegBackward>)
tensor(0.6184, grad_fn=<NegBackward>)
tensor(0.5125, grad_fn=<NegBackward>)
tensor(0.457

In [21]:
# Even with this simple model, the loss decreases overall (with a few bumps along the way).

### Inference

In [22]:
def accuracy(pred, targs):
    """Fraction of rows whose highest-scoring class equals the target.

    Args:
        pred: tensor of per-class scores, classes along the last dimension.
        targs: tensor of target class indices.

    Returns:
        Python float between 0 and 1.
    """
    predicted_class = pred.argmax(dim=-1)
    n_correct = float((predicted_class == targs).sum())
    n_total = float(len(pred))
    return n_correct / n_total

In [23]:
# Inference on the test split with the trained parameters. No gradients are
# needed here, so the forward pass runs under no_grad instead of recording an
# autograd graph through w and b.
with torch.no_grad():
    output = test
    for layer in range(N_LAYERS-1):
        output = relu(output @ w[layer] + b[layer])

    output = output @ w[N_LAYERS-1] + b[N_LAYERS-1]
    output = log_softmax(output)

In [24]:
# Share of test images whose predicted class matches the label.
accuracy(output, y_test)

0.9059

90% accuracy (random guessing would give 10%, as there are 10 classes): our model is definitely learning something.
However, this code is messy and hard to maintain. In the next notebook we'll refactor it using Python classes. Moreover, a fully connected network is a good start, but the tool of the trade for computer vision is convolutions. We'll add those as well.