In [0]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
#export
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
#export
import sys
sys.path.insert(0,"/content/drive/My Drive/Colab Notebooks/exp")
from nb_02 import *
import torch.nn.functional as F

# Initial Setup

## Data

In [0]:
mpl.rcParams['image.cmap'] = 'gray'

In [0]:
x_train, y_train, x_valid, y_valid = get_data()

In [0]:
# n rows and m columns of the training inputs
# (presumably samples x features — confirm against get_data)
n, m = x_train.shape
# Largest label + 1, i.e. the class count if labels run 0..c-1.
# Note: this is a 0-dim tensor, not a Python int.
c= y_train.max() + 1
# Width of the hidden layer handed to Model below
nh = 50

In [0]:
class Model(nn.Module):
  """Small fully connected network: Linear -> ReLU -> Linear.

  Args:
    n_in: number of input features per sample.
    nh: number of hidden units.
    n_out: number of output activations.
  """

  def __init__(self, n_in, nh, n_out):
    super().__init__()
    # nn.ModuleList (instead of a plain Python list) registers the layers as
    # sub-modules, so model.parameters(), state_dict() and .to(device) can see
    # their weights. It is still iterable/indexable as `model.layers`.
    self.layers = nn.ModuleList(
      [nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out)])

  def forward(self, x):
    """Feed `x` through every layer in order and return the final activations."""
    # Implementing forward() rather than overriding __call__ lets
    # nn.Module.__call__ keep running its hooks; model(x) works as before.
    for l in self.layers: x = l(x)
    return x

In [0]:
# Use the class count derived from the labels instead of a hard-coded 10;
# int() converts the 0-dim tensor `c` to the plain int nn.Linear expects.
model = Model(m, nh, int(c))

In [0]:
pred = model(x_train)

## Cross Entropy loss

First, we will need to compute the softmax of our activations. This is defined by:

$$\hbox{softmax(x)}_{i} = \frac{e^{x_{i}}}{e^{x_{0}} + e^{x_{1}} + \cdots + e^{x_{n-1}}}$$
or more concisely:

$$\hbox{softmax(x)}_{i} = \frac{e^{x_{i}}}{\sum_{0 \leq j \leq n-1} e^{x_{j}}}$$
In practice, we will need the log of the softmax when we calculate the loss.

In [0]:
def log_softmax(x):
  "Naive log-softmax over the last dimension: log(exp(x) / sum(exp(x)))."
  exps = x.exp()
  denom = exps.sum(-1, keepdim=True)
  return (exps / denom).log()

In [0]:
sm_pred = log_softmax(pred)

In [12]:
pred, sm_pred

(tensor([[-0.0174, -0.2572, -0.1699,  ..., -0.0638, -0.1437,  0.2164],
         [-0.0703, -0.1512, -0.0750,  ..., -0.0376, -0.1473,  0.1875],
         [-0.0455, -0.1313, -0.0450,  ..., -0.0487, -0.0664,  0.0941],
         ...,
         [ 0.0219, -0.0733, -0.0975,  ..., -0.0654, -0.2372,  0.1928],
         [ 0.0380, -0.0535, -0.0325,  ...,  0.0273, -0.1015,  0.1711],
         [-0.0094, -0.0714, -0.1000,  ...,  0.0516, -0.1547,  0.1978]],
        grad_fn=<AddmmBackward>),
 tensor([[-2.2760, -2.5157, -2.4285,  ..., -2.3223, -2.4022, -2.0421],
         [-2.3486, -2.4295, -2.3533,  ..., -2.3159, -2.4256, -2.0907],
         [-2.3189, -2.4047, -2.3184,  ..., -2.3221, -2.3397, -2.1793],
         ...,
         [-2.2584, -2.3536, -2.3778,  ..., -2.3457, -2.5176, -2.0875],
         [-2.2854, -2.3769, -2.3558,  ..., -2.2961, -2.4248, -2.1523],
         [-2.3127, -2.3747, -2.4033,  ..., -2.2517, -2.4580, -2.1054]],
        grad_fn=<LogBackward>))

The cross entropy loss for some target $x$ and some prediction $p(x)$ is given by:

$$ -\sum x\, \log p(x) $$
But since our $x$s are one-hot encoded, this can be rewritten as $-\log(p_{i})$, where $i$ is the index of the desired target.

This can be done using NumPy-style integer array indexing. Note that PyTorch supports all the tricks in the advanced indexing methods discussed in the NumPy indexing documentation.

In [13]:
y_train[:3]

tensor([5, 0, 4])

In [14]:
sm_pred[[0,1,2],[5,0,4]], pred[[0,1,2],[5,0,4]]

(tensor([-2.4006, -2.3486, -2.2264], grad_fn=<IndexBackward>),
 tensor([-0.1420, -0.0703,  0.0469], grad_fn=<IndexBackward>))

In [15]:
y_train.shape

torch.Size([50000])

In [0]:
def nll(inp, target):
  "Mean negative log-likelihood: the average of -inp[i, target[i]] over all rows i."
  rows = range(target.shape[0])
  # Integer-array indexing picks, for each row, the entry at that row's target index.
  picked = inp[rows, target]
  return -picked.mean()

In [0]:
loss = nll(sm_pred, y_train)

In [18]:
loss

tensor(2.3137, grad_fn=<NegBackward>)

Note that the formula

$$\log \left ( \frac{a}{b} \right ) = \log(a) - \log(b)$$
gives a simplification when we compute the log softmax, which was previously defined as `(x.exp()/(x.exp().sum(-1,keepdim=True))).log()`:

In [0]:
def log_softmax(x):
  "Log-softmax over the last dimension, using log(a/b) = log(a) - log(b)."
  log_sum_exp = x.exp().sum(-1, keepdim=True).log()
  return x - log_sum_exp

In [0]:
test_near(nll(log_softmax(pred), y_train), loss)

Then, there is a way to compute the log of the sum of exponentials in a more stable way, called the LogSumExp trick. The idea is to use the following formula:

$$\log \left ( \sum_{j=1}^{n} e^{x_{j}} \right ) = \log \left ( e^{a} \sum_{j=1}^{n} e^{x_{j}-a} \right ) = a + \log \left ( \sum_{j=1}^{n} e^{x_{j}-a} \right )$$
where a is the maximum of the $x_{j}$.

In [0]:
def logsumexp(x):
  """Numerically stable log(sum(exp(x))) over the last dimension.

  Subtracts the per-row maximum before exponentiating (the LogSumExp trick),
  so large activations do not overflow in exp().

  Args:
    x: tensor with at least one dimension; the reduction is over dim -1.

  Returns:
    Tensor with the last dimension of `x` removed.
  """
  # keepdim=True keeps a trailing size-1 axis so the subtraction broadcasts for
  # any rank, not just the 2-D case that `m[:,None]` was limited to.
  m = x.max(-1, keepdim=True)[0]
  return m.squeeze(-1) + (x - m).exp().sum(-1).log()

This way, we will avoid an overflow when taking the exponential of a big activation. In PyTorch, this is already implemented for us.

In [0]:
test_near(logsumexp(pred),pred.logsumexp(-1))

So we can use it for our log_softmax function.

In [0]:
def log_softmax(x):
  "Log-softmax over the last dimension, built on the tensor's own `logsumexp`."
  lse = x.logsumexp(-1, keepdim=True)
  return x - lse

In [0]:
test_near(nll(log_softmax(pred), y_train), loss)

Then use PyTorch's implementation.

In [0]:
test_near(F.nll_loss(F.log_softmax(pred,-1), y_train), loss)

In PyTorch, F.log_softmax and F.nll_loss are combined in one optimized function, F.cross_entropy.

In [0]:
test_near(F.cross_entropy(pred, y_train), loss)

## Basic training loop
Basically the training loop repeats over the following steps:



1.   get the output of the model on a batch of inputs
2.   compare the output to the labels we have and compute a loss
3.   calculate the gradients of the loss with respect to every parameter of the model
4.   update said parameters with those gradients to make them a little bit better

In [0]:
loss_func = F.cross_entropy

In [0]:
# export
def accuracy(out, yb):
  "Fraction of rows in `out` whose arg-max along dim 1 equals the matching entry of `yb`."
  predicted = out.argmax(dim=1)
  hits = (predicted == yb).float()
  return hits.mean()

In [47]:
bs = 64
xb = x_train[0:bs]
pred = model(xb)
pred[0], pred.shape

(tensor([-0.0174, -0.2572, -0.1699,  0.0133,  0.0634, -0.1420, -0.0233, -0.0638,
         -0.1437,  0.2164], grad_fn=<SelectBackward>), torch.Size([64, 10]))

In [49]:
yb = y_train[0:bs]
loss_func(pred, yb)

tensor(2.3037, grad_fn=<NllLossBackward>)

In [50]:
accuracy(pred, yb)

tensor(0.1250)

In [0]:
lr = 0.5
epoch = 1

In [0]:
# The loop variable must not be named `epoch`: `for epoch in range(epoch)`
# rebinds the epoch count to the last index (0 when epoch == 1), so re-running
# this cell would silently train for zero epochs.
for epoch_idx in range(epoch):
  # (n-1)//bs+1 == ceil(n/bs), so the final, possibly smaller, batch is included.
  for i in range((n-1)//bs+1):
    start_idx = i*bs
    end_idx = start_idx+bs
    xb = x_train[start_idx:end_idx]
    yb = y_train[start_idx:end_idx]
    loss = loss_func(model(xb), yb)

    loss.backward()
    # Manual gradient-descent step; no_grad keeps the in-place parameter
    # updates from being recorded by autograd.
    with torch.no_grad():
      for l in model.layers:
        # Only layers that carry parameters have a `weight` attribute.
        if hasattr(l, 'weight'):
          l.weight -= l.weight.grad * lr
          l.bias -= l.bias.grad * lr
          # Gradients accumulate across backward() calls; reset for the next batch.
          l.weight.grad.zero_()
          l.bias.grad.zero_()

In [66]:
loss_func(model(xb), yb), accuracy(model(xb), yb)

(tensor(0.1751, grad_fn=<NllLossBackward>), tensor(0.9375))

## Using Parameters and Optim
### Parameters
Use `nn.Module.__setattr__` and move relu to functional: