This is a demonstration of the "raw" PyTorch API.  
We get to see manual parameter creation, manual graph building, and manual gradient updates.  
After seeing how PyTorch works, we can start throwing on helpers, wrappers, and managers for convenience and performance (like running operations on GPU).

In [None]:
!pip3 install torch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
print(torch.__version__)

# Load the data

In [None]:
# Fetch MNIST into ./data (downloading if needed) and build one Dataset
# object per split: train=True is the training split, train=False the test split.

trainset, testset = (
  torchvision.datasets.MNIST(root='./data', download=True, train=is_train)
  for is_train in (True, False)
)

In [None]:
# Report the shape and dtype of the images and of the labels,
# first for the training split and then for the test split.

print(trainset.data.shape, trainset.data.dtype, sep=' ,  ')
print(trainset.targets.shape, trainset.targets.dtype, sep=' ,  ')
print()
print(testset.data.shape, testset.data.dtype, sep=' ,  ')
print(testset.targets.shape, testset.targets.dtype, sep=' ,  ')

In [None]:
# Inspect the first raw training image.
# Images are monochrome with integer pixel values between 0 and 255 (inclusive).
# The wide line width keeps each image row on a single printed line.

torch.set_printoptions(linewidth=1000)
print(trainset.data[0, :, :])

In [None]:
# Show the first six training images, printing each label above its picture

import matplotlib.pyplot as plt
import numpy as np

for image, label in zip(trainset.data[:6], trainset.targets[:6]):
  print(label.numpy())
  plt.imshow(image.numpy())
  plt.show()
  print('')

In [None]:
# Preprocess both splits in the same way:
#   - flattening from dim 1 onward turns each (28, 28) image into a length-784 vector
#   - dividing by 255 casts the integer pixels to float and rescales them to [0, 1]

trainset.data = torch.div(trainset.data.flatten(start_dim=1), 255)
testset.data = torch.div(testset.data.flatten(start_dim=1), 255)

# Sanity check: shape, dtype, and largest pixel value of each split.
print(trainset.data.shape, trainset.data.dtype, trainset.data.max(), sep='\n')
print()
print(testset.data.shape, testset.data.dtype, testset.data.max(), sep='\n')

# Define the model

In [None]:
# Layer widths: input features -> hidden units -> output scores
INPUT_DIM = trainset.data.size(-1)
HIDDEN_DIM = 256
OUTPUT_DIM = 10

# Every parameter element is drawn i.i.d. from a standard normal and then
# multiplied by SCALE. Parameters are created in the order W1, W2, B1, B2.
SCALE = 1e-2
W1, W2, B1, B2 = (
  nn.Parameter(SCALE * torch.normal(mean=0, std=1, size=shape))
  for shape in (
    (INPUT_DIM, HIDDEN_DIM),   # W1: first-layer weights
    (HIDDEN_DIM, OUTPUT_DIM),  # W2: second-layer weights
    (1, HIDDEN_DIM),           # B1: first-layer bias (row vector, broadcast over the batch)
    (1, OUTPUT_DIM),           # B2: second-layer bias (row vector, broadcast over the batch)
  )
)

# Everything the training loop has to update
parameters = [W1, W2, B1, B2]

def model_fn(x):
  # Two-layer perceptron: affine map -> ReLU -> affine map.
  # x has shape (batch_size, INPUT_DIM); the result has one row of
  # OUTPUT_DIM unnormalized scores per input row.
  hidden = torch.relu(x @ W1 + B1)
  return hidden @ W2 + B2

In [None]:
# Smoke-test the model on the first 23 training examples:
# print the output shape, then the output values.

out = model_fn(trainset.data[:23])
print(out.shape, out, sep='\n')

In [None]:
# Print the first-layer weights and bias

print(W1, B1, sep='\n')

In [None]:
# Define the loss function

def select_indices(x, indices):
  # Pick one entry per row: row i contributes x[i, indices[i]].
  # The result keeps a trailing axis, i.e. it has shape (x.shape[0], 1).
  column_indices = indices.unsqueeze(1)
  return torch.gather(x, 1, column_indices)

# Maximum likelihood loss (negative log probability of the data)
def mle_loss(x, labels):
  # Same as cross entropy against one-hot targets that put probability 1 on the correct label, see
  # https://ml-cheatsheet.readthedocs.io/en/latest/loss_functions.html#cross-entropy
  log_probs = torch.log_softmax(x, dim=1)  # normalize each row of scores in log space
  label_log_probs = select_indices(log_probs, labels)
  return label_log_probs.mean().neg()

***Cross entropy loss***

See https://ml-cheatsheet.readthedocs.io/en/latest/loss_functions.html#cross-entropy.

Let $\vec{q} = (q_1, \dots, q_n)$ be a vector of predicted probabilities,  
and let $\vec{p} = (p_1, \dots, p_n)$ be a vector of target probabilities.

The [cross entropy](https://en.wikipedia.org/wiki/Cross_entropy) of the two distributions is

$$
H(\vec{p}, \vec{q}) = -\sum_{i=1}^n p_i \log q_i
$$

Holding $\vec{p}$ fixed, $H(\vec{p}, \vec{q})$ is minimized when $\vec{q} = \vec{p}$.

In our case, the labels provide a one-hot target distribution.  
Let $t$ be the target label for some input image.
A one-hot distribution puts all probability on $t$, i.e.

$$\vec{\mathbb{1}}[t] = (0,\dots,0,1,0,\dots,0)$$

where $\mathbb{1}[t]_t = 1$.
Let $\vec{Y}$ be the raw model outputs and $\vec{q}$ be the model probabilities, i.e. 

$$
\vec{q} = \text{softmax}(\vec{Y}) = \frac{1}{\sum_{i=1}^n \exp(Y_i)}\Big(\exp(Y_1),\dots,\exp(Y_n)\Big)
$$

and let $\vec{p} = \vec{\mathbb{1}}[t]$.


Then the cross entropy loss (the quantity we minimize during training) is


$$\begin{aligned}
L &= H(\vec{p}, \vec{q}) \\
&= H\left(\vec{\mathbb{1}}[t],\ \text{softmax}(\vec{Y})\right) \\
&= -\sum_{i=1}^n \mathbb{1}[t]_i \log\left( \text{softmax}(\vec{Y})_i \right)\\
&= -\log\left( \text{softmax}(\vec{Y})_t \right) \\
&= -Y_t + \log\left(\sum_{i=1}^n \exp(Y_i)\right)
\end{aligned}$$



In [None]:
def accuracy(logits, target):
  # Fraction of rows whose highest-scoring column equals the target label.
  # Returns a 0-dim float tensor.
  predictions = logits.argmax(dim=1)
  hits = (predictions == target).float()
  return hits.mean()

# Training loop

In [None]:
# Hyperparameters
batch_size = 100           # examples per gradient step
learning_rate = 1e-2       # step size of the manual gradient-descent update
num_epochs = 1000          # full passes over the training set
regularizer_weight = 1e-2  # coefficient of the parameter-norm penalty added to the loss

# Mini-batch gradient descent with hand-written parameter updates.
# The epoch count comes from `num_epochs` (it previously iterated over `batch_size` by mistake).
for epoch in range(num_epochs):
  idx = torch.randperm(trainset.data.shape[0])  # random ordering of the training set

  ## training step
  for i in range(0, trainset.data.shape[0], batch_size):
    batch_idx = idx[i:i+batch_size]
    x = trainset.data[batch_idx]
    y = trainset.targets[batch_idx]

    ## forward + loss + backprop
    logits = model_fn(x)
    loss = mle_loss(logits, y)
    # NOTE(review): for 2-D tensors, torch.linalg.norm(p, 1) is the induced matrix
    # 1-norm (largest absolute column sum), not the elementwise L1 sum.
    # Confirm that this is the intended penalty.
    loss += regularizer_weight * sum(torch.linalg.norm(p, 1) for p in parameters)  # regularization
    loss.backward()  # calculate gradients

    # Apply gradient updates to parameters. The in-place update must not be
    # recorded by autograd, hence the no_grad block.
    with torch.no_grad():
      for p in parameters:
        p.sub_(learning_rate * p.grad)
        p.grad.zero_()  # zero out gradient, otherwise they accumulate across multiple `backward` calls.

  # End-of-epoch evaluation on the full training and test sets. No gradients are
  # needed here, so skip graph construction to save memory and time.
  with torch.no_grad():
    train_logits = model_fn(trainset.data)  # used for training loss and accuracy
    test_logits = model_fn(testset.data)  # used for test accuracy
    loss = mle_loss(train_logits, trainset.targets)  # reported without the regularization term
  print('Epoch: %d | Train Loss: %.4f | Train Accuracy: %.2f | Test Accuracy: %.2f' % (epoch, loss.item(), accuracy(train_logits, trainset.targets), accuracy(test_logits, testset.targets)))
