# Minibatch Training KMNIST Practice

In [None]:
import numpy
import torch
from torch import nn
from torch.nn import functional as F
import torchvision
from fastcore.all import test_close
from sklearn.metrics import accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
torch.manual_seed(42)

<torch._C.Generator>

## Load Datasets

In [None]:
# Load the KMNIST training split from the local 'data' directory.
data = torchvision.datasets.KMNIST('data', train=True)
# `data.data` / `data.targets` replace the deprecated `train_data` /
# `train_labels` aliases, which only forward to them and emit a warning.
# Each image is flattened to a vector and its pixel values scaled by 1/255.
x_train, y_train = data.data.flatten(1).float()/255., data.targets



In [None]:
x_train.shape, y_train.shape

(torch.Size([60000, 784]), torch.Size([60000]))

## Create Model

In [None]:
class Model(nn.Module):
    """Small fully connected network: Linear -> ReLU -> Linear.

    Args:
        n_in: number of input features.
        nh: width of the hidden layer.
        n_out: number of outputs (one logit per class).
    """

    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # nn.ModuleList (not a plain list) registers the layers as submodules,
        # so parameters(), state_dict(), .to() and optimizers can see them.
        # Iterating or indexing `self.layers` works exactly as with a list.
        self.layers = nn.ModuleList(
            [nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out)]
        )

    def forward(self, x):
        """Apply each layer in order and return the final activations."""
        for layer in self.layers:
            x = layer(x)
        return x

In [None]:
m = Model(784, 50, 10)

In [None]:
y_pred = m(x_train)

In [None]:
y_pred.shape

torch.Size([60000, 10])

## Implement softmax

In [None]:
y_pred.exp().shape

torch.Size([60000, 10])

In [None]:
y_pred.exp().sum(1, keepdim=True).shape

torch.Size([60000, 1])

In [None]:
# Softmax along dim 1: exponentiate, then divide each row by its own sum.
exps = y_pred.exp()
y_softmax = exps / exps.sum(1, keepdim=True)
y_softmax.shape

torch.Size([60000, 10])

In [None]:
y_softmax

tensor([[0.0951, 0.0932, 0.1246,  ..., 0.0844, 0.0867, 0.1105],
        [0.0922, 0.0960, 0.1259,  ..., 0.0819, 0.0778, 0.1145],
        [0.0943, 0.0839, 0.1426,  ..., 0.0848, 0.0856, 0.1192],
        ...,
        [0.0915, 0.0916, 0.1299,  ..., 0.0888, 0.0856, 0.0903],
        [0.0860, 0.1066, 0.1084,  ..., 0.0892, 0.0855, 0.1209],
        [0.1006, 0.1091, 0.1240,  ..., 0.0880, 0.0868, 0.1014]],
       grad_fn=<DivBackward0>)

## Implement Log Softmax

In [None]:
def log_softmax(y_pred):
    """Naive log-softmax over the last dimension.

    Computes ``x - log(sum(exp(x)))`` directly, with no shift by the maximum,
    so ``exp`` overflows for large inputs and the result becomes infinite.
    """
    normaliser = y_pred.exp().sum(-1, keepdim=True)
    return y_pred - normaliser.log()

In [None]:
log_softmax(y_pred)

tensor([[-2.3524, -2.3727, -2.0824,  ..., -2.4723, -2.4457, -2.2029],
        [-2.3834, -2.3435, -2.0719,  ..., -2.5020, -2.5533, -2.1676],
        [-2.3608, -2.4776, -1.9477,  ..., -2.4673, -2.4581, -2.1268],
        ...,
        [-2.3919, -2.3899, -2.0412,  ..., -2.4209, -2.4583, -2.4046],
        [-2.4536, -2.2385, -2.2215,  ..., -2.4164, -2.4594, -2.1125],
        [-2.2971, -2.2155, -2.0872,  ..., -2.4302, -2.4436, -2.2886]],
       grad_fn=<SubBackward0>)

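No infinite values appear here because the untrained model's logits are small, but this naive version returns infinite values once the logits are large enough for `exp` to overflow.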

## Implement Log Softmax with the LogSumExp trick

In [None]:
y_pred.max(-1, keepdim=True).values.shape

torch.Size([60000, 1])

In [None]:
def logexp(y_pred):
    """Log-sum-exp over the last dimension, keeping that dimension.

    The per-row maximum is subtracted before exponentiating and added back
    afterwards, so the largest exponent evaluated is ``exp(0)``.
    """
    peak, _ = y_pred.max(dim=-1, keepdim=True)
    shifted = y_pred - peak
    return peak + shifted.exp().sum(-1, keepdim=True).log()

In [None]:
logexp(y_pred).shape

torch.Size([60000, 1])

In [None]:
y_pred.logsumexp(-1).shape

torch.Size([60000])

In [None]:
test_close(logexp(y_pred), y_pred.logsumexp(-1, keepdim=True))

In [None]:
def log_softmax(y_pred):
    """Log-softmax over the last dimension, normalised with `logexp`."""
    log_norm = logexp(y_pred)
    return y_pred - log_norm

In [None]:
log_softmax(y_pred)

tensor([[-2.3524, -2.3727, -2.0824,  ..., -2.4723, -2.4457, -2.2029],
        [-2.3834, -2.3435, -2.0719,  ..., -2.5020, -2.5533, -2.1676],
        [-2.3608, -2.4776, -1.9477,  ..., -2.4673, -2.4581, -2.1268],
        ...,
        [-2.3919, -2.3899, -2.0412,  ..., -2.4209, -2.4583, -2.4046],
        [-2.4536, -2.2385, -2.2215,  ..., -2.4164, -2.4594, -2.1125],
        [-2.2971, -2.2155, -2.0872,  ..., -2.4302, -2.4436, -2.2886]],
       grad_fn=<SubBackward0>)

No infinite values: the LogSumExp trick subtracts each row's maximum before exponentiating, so `exp` cannot overflow.

In [None]:
test_close(log_softmax(y_pred), y_pred.log_softmax(-1), eps=0.0001)

## Implement Cross Entropy

In [None]:
y_pred.log_softmax(-1).shape, y_train.shape

(torch.Size([60000, 10]), torch.Size([60000]))

In [None]:
def cross_entropy(y_pred, y_train):
    """Mean negative log-probability assigned to the target classes.

    `y_pred` holds one row of logits per sample; `y_train` holds the integer
    class index used to pick one log-probability out of each row.
    """
    log_probs = y_pred.log_softmax(-1)
    rows = range(y_train.shape[0])
    # Pair row i with column y_train[i] to select each sample's target entry.
    target_log_probs = log_probs[rows, y_train]
    return -target_log_probs.mean()

In [None]:
cross_entropy(y_pred, y_train)

tensor(2.3130, grad_fn=<NegBackward0>)

In [None]:
test_close(cross_entropy(y_pred, y_train), F.cross_entropy(y_pred, y_train))

## Implement Accuracy Metric

In [None]:
y_pred.argmax(-1).shape

torch.Size([60000])

In [None]:
def accuracy(y_pred, y_train):
    """Fraction of rows whose argmax over the last dimension equals the label."""
    hits = y_pred.argmax(-1).eq(y_train)
    n_rows = y_pred.shape[0]
    return hits.sum() / n_rows

In [None]:
accuracy(y_pred, y_train)

tensor(0.0905)

## Implement Minibatch Training

In [None]:
bs = 32
epochs = 10
lr = 0.01

In [None]:
m = Model(784, 50, 10)

In [None]:
# Plain SGD: walk the training set in consecutive slices of `bs` rows,
# repeating for `epochs` full passes.
for i in range(epochs):
    for start in range(0, len(x_train), bs):
        # Slicing clamps at the end of the tensor, so a final short batch is fine.
        stop = start + bs
        xb, yb = x_train[start:stop], y_train[start:stop]
        loss = F.cross_entropy(m(xb), yb)
        loss.backward()

        # Manual update step; no_grad keeps it out of the autograd graph.
        with torch.no_grad():
            for layer in m.layers:
                if not hasattr(layer, 'weight'):
                    continue  # e.g. the ReLU layer has nothing to update
                for param in (layer.weight, layer.bias):
                    param -= lr * param.grad
                    # Clear the gradient so the next batch starts from zero.
                    param.grad.zero_()
    # Reports the loss of the last batch processed in this epoch.
    print (f'Epoch {i} Loss: {loss}')

Epoch 0 Loss: 0.6007910370826721
Epoch 1 Loss: 0.42758849263191223
Epoch 2 Loss: 0.3689204752445221
Epoch 3 Loss: 0.3278256952762604
Epoch 4 Loss: 0.29490503668785095
Epoch 5 Loss: 0.27044224739074707
Epoch 6 Loss: 0.2532268166542053
Epoch 7 Loss: 0.23949064314365387
Epoch 8 Loss: 0.2294975370168686
Epoch 9 Loss: 0.2232760339975357


In [None]:
new_y_pred = m(x_train)

In [None]:
accuracy(new_y_pred, y_train)

tensor(0.9105)