In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from exports.e_01_testing import test_near_torch
from exports.e_02_MNISTLoader import loadMNIST
from exports.e_04_DataAPI import Dataset

from torch import nn

In [3]:
class Model(nn.Module):
    def __init__(self, n_inp, n_hid, n_out):
        super().__init__()
        
        self.inpL = nn.Linear(n_inp, n_hid)
        self.hidL = nn.ReLU(n_hid)
        self.outL = nn.Linear(n_hid, n_out)
        
        self.layers = [self.inpL, self.hidL, self.outL]
        
        
    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

In [4]:
x_train, y_train, x_valid, y_valid = loadMNIST()
train_ds, valid_ds = Dataset(x_train, y_train), Dataset(x_valid, y_valid)

n_sampl, n_inp = x_train.shape
n_out = 10
n_hid = 50

model = Model(n_inp, n_hid, n_out)
pred = model(x_train)

## Softmax, Negative Log Likelihood and Cross Entropy Loss
Since the labels will be one-hot encoded, we get the NLL by clever use of indexing.

### LogSumExp Trick

For numerical stability

$ \log \left ( \sum_{j=1}^{n} e^{x_{j}} \right ) = \log \left ( e^{a} \sum_{j=1}^{n} e^{x_{j}-a} \right ) = a + \log \left ( \sum_{j=1}^{n} e^{x_{j}-a} \right ) $

In [5]:
def logSumExp(x): m = x.max(-1)[0]; return m + (x - m[:, None]).exp().sum(-1).log()

def softmax(x): return x.exp() / x.exp().sum(-1, keepdim=True)
def log_softmax(x): return x - logSumExp(x)[:, None]

sm_pred = log_softmax(pred)
sm_pred

tensor([[-2.2961, -2.3437, -2.4982,  ..., -2.3744, -2.2866, -2.3065],
        [-2.3001, -2.3351, -2.4629,  ..., -2.3046, -2.2995, -2.3802],
        [-2.2967, -2.3184, -2.4669,  ..., -2.3209, -2.2645, -2.5148],
        ...,
        [-2.2160, -2.2770, -2.4233,  ..., -2.3545, -2.3835, -2.4213],
        [-2.1351, -2.3462, -2.3826,  ..., -2.4427, -2.3572, -2.4290],
        [-2.2223, -2.3551, -2.4704,  ..., -2.2734, -2.4102, -2.4339]],
       grad_fn=<SubBackward0>)

In [6]:
def NLL(x, lab): return -x[:, lab].mean()

loss = NLL(sm_pred, y_train)

### Optimized Pytorch Implementations

In [7]:
import torch.nn.functional as F

In [8]:
pred.logsumexp(-1, keepdim=True)

tensor([[2.3391],
        [2.3298],
        [2.3209],
        ...,
        [2.2965],
        [2.3271],
        [2.2889]], grad_fn=<LogsumexpBackward>)

In [9]:
F.nll_loss(F.log_softmax(pred, -1), y_train), F.cross_entropy(pred, y_train)

(tensor(2.3124, grad_fn=<NllLossBackward>),
 tensor(2.3124, grad_fn=<NllLossBackward>))

In [10]:
test_near_torch(F.nll_loss(F.log_softmax(pred, -1), y_train), F.cross_entropy(pred, y_train))

Arguments ARE near.


## Optimizers and Training

Basic structure:

1. Feed Forward: Compute outputs on a set of inputs.
2. Compute loss from outputs and labels.
3. Backpropagate: compute the gradients of the loss with respect to each model parameter.
4. Update: update parameters using the gradients.

In [11]:
import torch
from torch.utils.data import DataLoader

In [12]:
def acc_func(pred, lab): return (torch.argmax(pred, dim=1) == lab).float().mean()

In [14]:
loss_func = F.cross_entropy

bs = 64            # batch size
inp = x_train[:bs]
lab = y_train[:bs]
pred = model(inp)
print(f'Loss: {loss_func(pred, lab)}\nAccuracy: {acc_func(pred, lab)}')

Loss: 2.294121265411377
Accuracy: 0.15625


### Optimizer
We'll use gradient descent as our optimization step.

In [15]:
class Optimizer():
    def __init__(self, learning_rate, params): self.lr, self.params = learning_rate, list(params)
        
    def step(self):
        # Gradient Descent
        with torch.no_grad():
            for p in self.params: p -= p.grad * self.lr
    
    def zero_grad(self):
        for p in self.params: p.grad.data.zero_()

### Training

In [16]:
def fit(model, epochs, loss_func, acc_func, optimizer, train_dl, valid_dl):
    tot_loss, tot_acc = 0., 0.
    n_valid = len(valid_dl)
    for epoch in range(epochs):
        
        # training
        model.train()
        for xb, yb in train_dl:
            loss = loss_func(model(xb), yb)
            loss.backward()
            opt.step()         # SGD
            opt.zero_grad()
            
        # validation
        model.eval()
        with torch.no_grad():
            tot_loss, tot_acc = 0., 0.
            for xb, yb in valid_dl:
                pred = model(xb)
                tot_loss += loss_func(pred, yb)
                tot_acc  += acc_func(pred, yb)
            print(f'Epoch: {epoch+1}, Loss:{tot_loss/n_valid}\t Acc:{tot_acc/n_valid}')
            
    return tot_loss/n_valid, tot_acc/n_valid

In [17]:
train_dl = DataLoader(train_ds, batch_size=bs, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=bs)
opt = Optimizer(0.3, model.parameters())

loss, acc = fit(model, 5, loss_func, acc_func, opt, train_dl, valid_dl)

Epoch: 1, Loss:0.2933460772037506	 Acc:0.9052547812461853
Epoch: 2, Loss:0.1334770768880844	 Acc:0.9629777073860168
Epoch: 3, Loss:0.13553813099861145	 Acc:0.9596934914588928
Epoch: 4, Loss:0.11354310065507889	 Acc:0.9686504602432251
Epoch: 5, Loss:0.4481799304485321	 Acc:0.8923168778419495


## Pytorch Implementations and Cleanup

`nn.Sequential` allows us to define a model consisting of a sequence of layers as we did above.

In [18]:
model = nn.Sequential(nn.Linear(n_inp, n_hid), nn.ReLU(n_hid), nn.Linear(n_hid, n_out))
model

Sequential(
  (0): Linear(in_features=784, out_features=50, bias=True)
  (1): ReLU(inplace=True)
  (2): Linear(in_features=50, out_features=10, bias=True)
)

`nn.optim` gives a SGD optimizer similar to the one above

In [19]:
from torch import optim

In [20]:
opt = optim.SGD(model.parameters(), lr=0.3)
opt

SGD (
Parameter Group 0
    dampening: 0
    lr: 0.3
    momentum: 0
    nesterov: False
    weight_decay: 0
)

### Clear and concise:

In [21]:
#--export--#
from torch.utils.data import DataLoader

def make_dls(train_ds, valid_ds, batch_size, **kwargs):
    return (DataLoader(train_ds, batch_size, shuffle=True, **kwargs),
            DataLoader(valid_ds, batch_size*2, **kwargs))

In [22]:
# data
train_dl, valid_dl = make_dls(train_ds, valid_ds, 64)

# model
model = nn.Sequential(nn.Linear(n_inp, n_hid), nn.ReLU(n_hid), nn.Linear(n_hid, n_out))
opt = opt = optim.SGD(model.parameters(), lr=0.3)

# training and eval
loss, acc = fit(model, 5, loss_func, acc_func, opt, train_dl, valid_dl)

Epoch: 1, Loss:0.18260787427425385	 Acc:0.9467958807945251
Epoch: 2, Loss:0.16307620704174042	 Acc:0.9539161324501038
Epoch: 3, Loss:0.11055906862020493	 Acc:0.968156635761261
Epoch: 4, Loss:0.10845605283975601	 Acc:0.968156635761261
Epoch: 5, Loss:0.10642056167125702	 Acc:0.9708267450332642


In [1]:
!python utils/export_notebook.py 05_Losses_Optimizers_TrainEval.ipynb

Notebook 05_Losses_Optimizers_TrainEval.ipynb has been converted to module ./exports/e_05_Losses_Optimizers_TrainEval.py!
