In [3]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
#export
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
#export
import sys
sys.path.insert(0,"/content/drive/My Drive/Colab Notebooks/exp")
from nb_02 import *
import torch.nn.functional as F

# Initial Setup

## Data

In [0]:
mpl.rcParams['image.cmap'] = 'gray'

In [0]:
x_train, y_train, x_valid, y_valid = get_data()

In [0]:
n, m = x_train.shape
c= y_train.max() + 1
nh = 50

In [0]:
class Model(nn.Module):
  def __init__(self, n_in, nh, n_out):
    super().__init__()
    self.layers = [nn.Linear(n_in,nh), nn.ReLU(), nn.Linear(nh, n_out)]
    
  def __call__(self, x):
    for l in self.layers: x = l(x)
    return x

In [0]:
model = Model(m, nh, 10)

In [0]:
pred = model(x_train)

## Cross Entropy loss

First, we will need to compute the softmax of our activations. This is defined by:

$$\hbox{softmax(x)}_{i} = \frac{e^{x_{i}}}{e^{x_{0}} + e^{x_{1}} + \cdots + e^{x_{n-1}}}$$
or more concisely:

$$\hbox{softmax(x)}_{i} = \frac{e^{x_{i}}}{\sum_{0 \leq j \leq n-1} e^{x_{j}}}$$
In practice, we will need the log of the softmax when we calculate the loss.

In [0]:
def log_softmax(x):
  return (x.exp()/(x.exp().sum(-1, keepdim=True))).log()

In [0]:
sm_pred = log_softmax(pred)

In [14]:
pred, sm_pred

(tensor([[-1.1053e-01, -7.2135e-02, -2.6264e-01,  ..., -5.9562e-02,
          -9.4913e-02, -1.2401e-01],
         [-1.6240e-01, -3.0950e-02, -2.4026e-01,  ...,  1.4663e-02,
          -1.0649e-01, -2.0211e-01],
         [-8.8474e-02, -1.2782e-01, -1.9942e-01,  ...,  1.0551e-01,
           1.1453e-02, -6.6103e-02],
         ...,
         [-1.4424e-01, -1.9656e-01, -2.2762e-01,  ...,  2.3967e-02,
          -5.8659e-02, -2.0482e-01],
         [-9.6590e-02, -1.7434e-01, -1.9059e-01,  ...,  6.9588e-05,
          -5.9569e-02, -1.4848e-01],
         [-1.2940e-01, -1.2655e-01, -2.4920e-01,  ...,  1.5363e-01,
           1.0952e-02, -2.2787e-01]], grad_fn=<AddmmBackward>),
 tensor([[-2.3351, -2.2967, -2.4872,  ..., -2.2841, -2.3195, -2.3486],
         [-2.3717, -2.2402, -2.4495,  ..., -2.1946, -2.3158, -2.4114],
         [-2.3378, -2.3772, -2.4488,  ..., -2.1438, -2.2379, -2.3155],
         ...,
         [-2.3566, -2.4089, -2.4400,  ..., -2.1884, -2.2710, -2.4172],
         [-2.3177, -2.3955, -2.

The cross entropy loss for some target $x$ and some prediction $p(x)$ is given by:

$$ -\sum x\, \log p(x) $$
But since our $x$s are 1-hot encoded, this can be rewritten as $-\log(p_{i})$ where i is the index of the desired target.

This can be done using numpy-style integer array indexing. Note that PyTorch supports all the tricks in the advanced indexing methods discussed in that link.

In [15]:
y_train[:3]

tensor([5, 0, 4])

In [16]:
sm_pred[[0,1,2],[5,0,4]], pred[[0,1,2],[5,0,4]]

(tensor([-2.3572, -2.3717, -2.3492], grad_fn=<IndexBackward>),
 tensor([-0.1327, -0.1624, -0.0999], grad_fn=<IndexBackward>))

In [17]:
y_train.shape

torch.Size([50000])

In [0]:
def nll(inp, target):
  return -inp[range(target.shape[0]), target].mean()

In [0]:
loss = nll(sm_pred, y_train)

In [20]:
loss

tensor(2.3035, grad_fn=<NegBackward>)

Note that the formula

$$\log \left ( \frac{a}{b} \right ) = \log(a) - \log(b)$$
gives a simplification when we compute the log softmax, which was previously defined as (x.exp()/(x.exp().sum(-1,keepdim=True))).log()

In [0]:
def log_softmax(x):
  return x - x.exp().sum(-1, keepdim=True).log()

In [0]:
test_near(nll(log_softmax(pred), y_train), loss)

Then, there is a way to compute the log of the sum of exponentials in a more stable way, called the LogSumExp trick. The idea is to use the following formula:

$$\log \left ( \sum_{j=1}^{n} e^{x_{j}} \right ) = \log \left ( e^{a} \sum_{j=1}^{n} e^{x_{j}-a} \right ) = a + \log \left ( \sum_{j=1}^{n} e^{x_{j}-a} \right )$$
where a is the maximum of the $x_{j}$.

In [0]:
def logsumexp(x):
  m = x.max(-1)[0]
  return m + (x-m[:,None]).exp().sum(-1).log()

This way, we will avoid an overflow when taking the exponential of a big activation. In PyTorch, this is already implemented for us.

In [0]:
test_near(logsumexp(pred),pred.logsumexp(-1))

So we can use it for our log_softmax function.

In [0]:
def log_softmax(x):
  return x - x.logsumexp(-1, keepdim=True)

In [0]:
test_near(nll(log_softmax(pred), y_train), loss)

Then use PyTorch's implementation.

In [0]:
test_near(F.nll_loss(F.log_softmax(pred,-1), y_train), loss)

In PyTorch, F.log_softmax and F.nll_loss are combined in one optimized function, F.cross_entropy.

In [0]:
test_near(F.cross_entropy(pred, y_train), loss)

## Basic training loop
Basically the training loop repeats over the following steps:



1.   get the output of the model on a batch of inputs
2.   compare the output to the labels we have and compute a loss
3.   calculate the gradients of the loss with respect to every parameter of the model
4.   update said parameters with those gradients to make them a little bit better

In [0]:
loss_func = F.cross_entropy

In [0]:
# export
def accuracy(out, yb):
  return (torch.argmax(out, dim=1) == yb).float().mean()

In [31]:
bs = 64
xb = x_train[0:bs]
pred = model(xb)
pred[0], pred.shape

(tensor([-0.1105, -0.0721, -0.2626, -0.1013,  0.0403, -0.1327,  0.0946, -0.0596,
         -0.0949, -0.1240], grad_fn=<SelectBackward>), torch.Size([64, 10]))

In [32]:
yb = y_train[0:bs]
loss_func(pred, yb)

tensor(2.3028, grad_fn=<NllLossBackward>)

In [33]:
accuracy(pred, yb)

tensor(0.1094)

In [0]:
lr = 0.5
epochs = 1

In [0]:
for epoch in range(epoch):
  for i in range((n-1)//bs+1):
    start_idx = i*bs
    end_idx = start_idx+bs
    xb = x_train[start_idx:end_idx]
    yb = y_train[start_idx:end_idx]
    loss = loss_func(model(xb), yb)
    
    loss.backward()
    with torch.no_grad():
      for l in model.layers:
        if hasattr(l, 'weight'):
          l.weight -= l.weight.grad * lr
          l.bias -= l.bias.grad * lr
          l.weight.grad.zero_()
          l.bias.grad.zero_()

In [36]:
loss_func(model(xb), yb), accuracy(model(xb), yb)

(tensor(0.2475, grad_fn=<NllLossBackward>), tensor(0.9375))

## Using Parameters and Optim
### Parameters
Use `nn.Module.__setattr__` and move relu to functional:

In [0]:
class Model(nn.Module):
  def __init__(self, n_in, nh, n_out):
    super().__init__()
    self.l1 = nn.Linear(n_in, nh)
    self.l2 = nn.Linear(nh, n_out)
    
  def __call__(self, x):
    return self.l2(F.relu(self.l1(x)))

In [0]:
model = Model(m, nh, 10)

In [42]:
for name, l in model.named_children():
  print(f'{name} : {l}')

l1 : Linear(in_features=784, out_features=50, bias=True)
l2 : Linear(in_features=50, out_features=10, bias=True)


In [43]:
model

Model(
  (l1): Linear(in_features=784, out_features=50, bias=True)
  (l2): Linear(in_features=50, out_features=10, bias=True)
)

In [44]:
model.l1

Linear(in_features=784, out_features=50, bias=True)

In [0]:
def fit():
  for epoch in range(epochs):
    for i in range((n-1)//bs+1):
      start_idx = i*bs
      end_idx = start_idx + bs
      xb = x_train[start_idx:end_idx]
      yb = y_train[start_idx:end_idx]
      loss = loss_func(model(xb), yb)
      
      loss.backward()
      with torch.no_grad():
        for p in model.parameters():
          p -= p.grad * lr
          model.zero_grad()

In [56]:
fit()
loss_func(model(xb), yb), accuracy(model(xb), yb)

(tensor(0.4532, grad_fn=<NllLossBackward>), tensor(0.8750))

Behind the scenes, PyTorch overrides the `__setattr__` function in  nn.Module so that the submodules you define are properly registered as parameters of the model.


In [59]:
for p in model.parameters():
  print(f'{p.shape}')

torch.Size([50, 784])
torch.Size([50])
torch.Size([10, 50])
torch.Size([10])


In [0]:
class DummyModule():
  def __init__(self, n_in, nh, n_out):
    self._modules = {}
    self.l1 = nn.Linear(n_in, nh)
    self.l2 = nn.Linear(nh, n_out)
    
  def __setattr__(self, k, v):
    if not k.startswith('_'):
      self._modules[k] = v
    super().__setattr__(k, v)
      
  def __repr__(self):
    return f'{self._modules}'
  
  def parameters(self):
    for l in self._modules.values():
      for p in l.parameters():
        yield p

In [63]:
mdl = DummyModule(m, nh, 10)
mdl

{'l1': Linear(in_features=784, out_features=50, bias=True), 'l2': Linear(in_features=50, out_features=10, bias=True)}

In [64]:
[o.shape for o in mdl.parameters()]

[torch.Size([50, 784]),
 torch.Size([50]),
 torch.Size([10, 50]),
 torch.Size([10])]

## Registering modules
We can use the original layers approach, but we have to register the modules.

In [66]:
layers = [nn.Linear(m, nh), nn.ReLU(), nn.Linear(nh, 10)]
layers

[Linear(in_features=784, out_features=50, bias=True),
 ReLU(),
 Linear(in_features=50, out_features=10, bias=True)]

In [0]:
class Model(nn.Module):
  def __init__(self, layers):
    super().__init__()
    self.layers = layers
    for i, l in enumerate(self.layers):
      self.add_module(f'My_model_layer_{i}', l)
      
  def __call__(self, x):
    for l in self.layers:
      x = l(x)
      return x

In [73]:
model = Model(layers)
model

Model(
  (My_model_layer_0): Linear(in_features=784, out_features=50, bias=True)
  (My_model_layer_1): ReLU()
  (My_model_layer_2): Linear(in_features=50, out_features=10, bias=True)
)

### `nn.ModuleList`


`nn.ModuleList` does this for us



In [0]:
class SequentialModel(nn.Module):
  def __init__(self, layers):
    super().__init__()
    self.layers = nn.ModuleList(layers)
    
  def __call__(self, x):
    for l in self.layers:
      x = l(x)
    return x
    

In [78]:
model = SequentialModel(layers)
model

SequentialModel(
  (layers): ModuleList(
    (0): Linear(in_features=784, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=10, bias=True)
  )
)

In [79]:
fit()
loss_func(model(xb), yb), accuracy(model(xb), yb)

(tensor(0.3748, grad_fn=<NllLossBackward>), tensor(0.8750))