Big Picture
* Get the dataset (pairs of input and label)
* Forward Pass: input -> function (model) -> output (prediction) 
* Compute the loss 
* Backward Pass (i.e. backprop)
* Update the parameters (i.e. weights and biases)


In [None]:
from pathlib import Path
import requests

# Get the dataset: download the pickled MNIST archive into data/mnist/
# unless a copy is already cached there.
DATA_PATH = Path('data')
PATH = DATA_PATH / 'mnist'

PATH.mkdir(parents=True, exist_ok=True)
URL = 'https://github.com/pytorch/tutorials/raw/main/_static/'
FILENAME = 'mnist.pkl.gz'

if not (PATH / FILENAME).exists():
    response = requests.get(URL + FILENAME)
    # Fail loudly on an HTTP error instead of caching an error page as the
    # dataset (the exists() check above would then never re-download it).
    response.raise_for_status()
    content = response.content
    # write_bytes opens, writes and closes the file, so no handle is leaked.
    (PATH / FILENAME).write_bytes(content)

In [None]:
import pickle
import gzip

# NOTE: pickle can execute arbitrary code while loading; only open archives
# obtained from a source you trust.
with gzip.open(PATH / FILENAME, 'rb') as archive:
    # The pickle holds three (inputs, labels) pairs; the first two are the
    # training and validation data, the third is ignored here.
    train_split, valid_split, _ = pickle.load(archive, encoding="latin-1")

# x is the input, y is the label
x_train, y_train = train_split
x_valid, y_valid = valid_split

print("x training data", x_train.shape)
print("y training data", y_train.shape)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Examine the dataset: view the first training sample as a 28x28 grayscale
# image and print the label stored alongside it.
first_image = x_train[0].reshape(28, 28)
plt.imshow(first_image, cmap='gray')
print(y_train[0])


In [None]:
import torch

# Convert each numpy array into a torch tensor, keeping the same names.
x_train, y_train, x_valid, y_valid = (
    torch.tensor(array) for array in (x_train, y_train, x_valid, y_valid)
)


* numpy: scientific and numerical computing in Python
* numpy array - multidimensional table of data - 2d, 3d, 4d...
* When all the elements in an array are of a simple type (such as integer or float), numpy stores them as a compact C data structure in memory, so it can run computations on the data at the same speed as optimized C code.
* PyTorch tensors are almost the same as numpy arrays, but they come with some restrictions that make them more performant.
* Restrictions: a tensor can't hold elements of arbitrary types - all of its elements must share a single basic numeric type - and it has to be rectangular in shape.
* Tensors can live on GPUs and are optimized for computation on GPUs.
* PyTorch implements autograd: it can automatically compute the gradients (derivatives) of the operations we perform on tensors.

In [None]:
import math
# Create the parameters (weights and bias) of the model.
# A typical network: inputs -> hidden -> hidden -> output <--> label
# This notebook:     inputs -> output <--> label (similar to logistic regression)
# Input layer: 784 neurons (28 * 28 pixels); output layer: 10 neurons.
# Two equivalent layouts for the linear map:
#   (10, 784) * (784 rows, 50000 columns) -> w * x + b
#   (50000, 784) * (784, 10)              -> x * w + b   (the one used here)
n_in, n_out = 28 * 28, 10

# Seed the random number generator so the same initial weights are drawn
# every time the cell runs.
torch.manual_seed(0)

# Xavier initialization: standard-normal samples scaled by 1 / sqrt(n_in).
# requires_grad_() switches on gradient tracking in place.
weights = (torch.randn(n_in, n_out) / math.sqrt(n_in)).requires_grad_()
bias = torch.zeros(n_out).requires_grad_()


In [None]:
# The model maps a batch of inputs to a batch of outputs.
# The full training set is (50000, 784); a batch is e.g. (64, 784).
def model(xb):
    """Apply the linear layer to the batch xb.

    xb is matrix-multiplied with the notebook-level `weights` (what the `@`
    operator does) and `bias` is added to the product.
    """
    return torch.matmul(xb, weights) + bias

In [None]:
# Loss function: compares the 10 model outputs with a 10-element one-hot label,
# e.g. label "5" -> [0,0,0,0,0,1,0,0,0,0]
def softmax(x):
    """Return the softmax of x along its last dimension: e^x_i / sum_j(e^x_j).

    The maximum of each row is subtracted before exponentiating. This leaves
    the result mathematically unchanged (the factor e^-max cancels between
    numerator and denominator) but keeps exp() from overflowing to inf, which
    would otherwise turn the division into inf / inf = nan for large inputs.
    """
    shifted = x - x.max(dim=-1, keepdim=True).values
    exps = shifted.exp()  # computed once and reused for the normalizer
    return exps / exps.sum(dim=-1, keepdim=True)

# print(softmax(model(x_train)).shape)

def cross_entropy_loss(pred, targets):
    """Return the mean cross-entropy between logits and integer class labels.

    pred is a (batch_size, num_classes) tensor of raw model outputs and
    targets holds one class index per row of pred.

    Cross-entropy formula: -1/n * sum(p * log(q)), where p is the one-hot
    label, q is softmax(pred) and n is the batch size. Because p is one-hot,
    the inner sum reduces to log(q) at the target class.
    """
    bs = pred.shape[0]
    # log(softmax(pred)) computed as pred - logsumexp(pred): this never takes
    # exp() of a large logit or log() of an underflowed 0, so the loss stays
    # finite where softmax(pred).log() would produce inf or nan.
    log_probs = pred - pred.logsumexp(dim=-1, keepdim=True)
    # Select the log-probability of the target class in each row; equivalent
    # to multiplying by the one-hot targets and summing. The row index is
    # created on pred's device so the lookup works wherever pred lives.
    rows = torch.arange(bs, device=pred.device)
    return -log_probs[rows, targets].sum() / bs

loss_func = cross_entropy_loss

In [None]:
import torch.nn.functional as F

# Sanity check: on the same batch, our cross entropy loss should print the
# same value as PyTorch's built-in F.cross_entropy.
bs = 64  # batch size
xb, yb = x_train[:bs], y_train[:bs]
pred = model(xb)

print(F.cross_entropy(pred, yb))
print(f"loss={cross_entropy_loss(pred, yb)}")

In [None]:
def accuracy_func(pred, yb):
    """Return the fraction of rows whose highest-scoring class equals the label.

    pred has one row of class scores per sample, e.g. shape (64, 10), and yb
    holds the corresponding labels.
    """
    # argmax over dim=1 gives the index of the largest score in every row,
    # e.g. [0.1, 0, 0, 0.2, 0.7, 0, 0, ...] -> 4
    hits = pred.argmax(dim=1) == yb
    return hits.float().mean()

In [None]:
# Train the model (backward pass and parameter update) with mini-batch
# gradient descent.
# Epochs  -> 2 full passes over the training set
# Batches -> ceil(50000 / 64) per epoch

lr = 0.5
epochs = 2
n = x_train.shape[0]
# Ceiling division: the last batch may be smaller than bs, but it is never
# empty (n // bs + 1 would add an empty batch whenever bs divides n, and the
# mean loss of an empty batch is nan).
num_batches = (n + bs - 1) // bs

for epoch in range(epochs):
    for i in range(num_batches):
        # Batch i covers samples [i * bs, (i + 1) * bs). Slicing clips at n,
        # so the final batch just contains whatever samples are left.
        start_i = i * bs
        end_i = start_i + bs
        xb = x_train[start_i:end_i]
        yb = y_train[start_i:end_i]

        # Forward pass, loss and accuracy for this batch.
        pred = model(xb)
        loss = loss_func(pred, yb)
        accuracy = accuracy_func(pred, yb)

        # Backward pass: autograd fills weights.grad and bias.grad.
        loss.backward()

        # The parameter update itself must not be tracked by autograd.
        with torch.no_grad():
            weights -= weights.grad * lr
            bias -= bias.grad * lr
            weights.grad.zero_()  # otherwise, gradients will accumulate
            bias.grad.zero_()

        # Logging
        if i % 100 == 0:
            train_loss, train_accuracy = loss.item(), accuracy.item() * 100
            print(f"Loss: {train_loss:6f} Accuracy: {train_accuracy:0.1f}%")