In [1]:
#|default_exp training

In [2]:
#|export
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl,numpy as np,matplotlib.pyplot as plt
from pathlib import Path
from torch import tensor,nn
import torch.nn.functional as F

### Data

In [5]:
# Read the gzipped pickle and unpack two (inputs, labels) pairs; the third element is ignored.
# NOTE(review): pickle.load can execute arbitrary code -- only load this file from a trusted source.
with gzip.open('../data/mnist.pkl.gz', 'rb') as f:
    data = pickle.load(f, encoding='latin')
((x_train,y_train),(x_val,y_val), _) = data
# Convert all four objects to torch tensors.
(x_train,y_train,x_val,y_val) = map(tensor, (x_train,y_train,x_val,y_val))
x_train.shape,y_train.shape,x_val.shape,y_val.shape

(torch.Size([50000, 784]),
 torch.Size([50000]),
 torch.Size([10000, 784]),
 torch.Size([10000]))

### Prediction flow

In [213]:
# m = number of training rows, n = number of columns per row.
m,n = x_train.shape
# Largest label + 1 (presumably the class count -- confirm). Tensor.max() is a single
# vectorised call; Python's built-in max() would iterate over every element.
c = y_train.max() + 1
nh = 50      # hidden-layer width passed to the models below
n_out = 10   # number of model outputs

In [231]:
class Model(nn.Module):
    """Linear -> ReLU -> Linear network whose layers live in a plain Python list.

    Because the layers are held in an ordinary list and `__call__` is
    overridden directly, `parameters()` on this class yields nothing.
    """
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        hidden = nn.Linear(n_in, nh)
        head = nn.Linear(nh, n_out)
        self.layers = [hidden, nn.ReLU(), head]

    def __call__(self, x):
        # Feed the input through each layer in order.
        out = x
        for layer in self.layers:
            out = layer(out)
        return out

In [232]:
# Build the model and run the whole training set through it in one call.
model = Model(n,nh,n_out)
pred = model(x_train)
pred.shape

torch.Size([50000, 10])

In [223]:
# Reduce the predictions to a scalar and backpropagate from it.
pred.sum().backward()

In [224]:
model.layers[0].weight.grad.max()

tensor(9107.3145)

In [225]:
# Materialise whatever parameters the model reports.
list(model.parameters())

[]

### Cross entropy loss

In [36]:
def log_softmax(x):
    """Log-softmax over the last dimension, written literally as log(exp(x) / sum(exp(x)))."""
    exps = x.exp()
    totals = exps.sum(-1, keepdim=True)
    return (exps / totals).log()

In [38]:
# Result of the direct log(exp/sum) formulation, kept for comparison.
t1 = log_softmax(pred)
t1

tensor([[-2.3048, -2.2415, -2.2480,  ..., -2.1866, -2.2036, -2.3776],
        [-2.4131, -2.2731, -2.1568,  ..., -2.2583, -2.1803, -2.2698],
        [-2.4295, -2.1087, -2.2318,  ..., -2.1927, -2.2782, -2.4518],
        ...,
        [-2.3280, -2.2321, -2.2526,  ..., -2.2195, -2.3059, -2.3306],
        [-2.4395, -2.0856, -2.1918,  ..., -2.1651, -2.2479, -2.3557],
        [-2.3856, -2.0988, -2.1821,  ..., -2.2111, -2.3032, -2.3954]],
       grad_fn=<LogBackward0>)

In [39]:
def log_softmax(x):
    """Log-softmax over the last dimension using log(a/b) = log(a) - log(b)."""
    log_numer = x.exp().log()
    log_denom = x.exp().sum(-1, keepdim=True).log()
    return log_numer - log_denom

In [40]:
# Result of the log(a) - log(b) formulation, kept for comparison.
t2 = log_softmax(pred)
t2

tensor([[-2.3048, -2.2415, -2.2480,  ..., -2.1866, -2.2036, -2.3776],
        [-2.4131, -2.2731, -2.1568,  ..., -2.2583, -2.1803, -2.2698],
        [-2.4295, -2.1087, -2.2318,  ..., -2.1927, -2.2782, -2.4518],
        ...,
        [-2.3280, -2.2321, -2.2526,  ..., -2.2195, -2.3059, -2.3306],
        [-2.4395, -2.0856, -2.1918,  ..., -2.1651, -2.2479, -2.3557],
        [-2.3856, -2.0988, -2.1821,  ..., -2.2111, -2.3032, -2.3954]],
       grad_fn=<SubBackward0>)

In [41]:
torch.allclose(t1, t2)

True

In [96]:
def logsumexp(x):
    """Numerically stable log(sum(exp(x))) over the last dimension.

    The maximum along the last dimension is subtracted before exponentiating
    and added back outside the log. `keepdim=True` lets that subtraction
    broadcast for an input of any rank >= 1, whereas `m[:,None]` indexing
    would only work for 2-D input.

    Returns a tensor shaped like `x` with the last dimension removed.
    """
    m = x.max(-1, keepdim=True)[0]
    return m.squeeze(-1) + (x - m).exp().sum(-1).log()

In [87]:
def logsum(x):
    """Numerically stable log(sum(exp(x))) over the last dimension.

    Subtracts the maximum along the last dimension before `exp` and adds it
    back afterwards. `keepdim=True` makes the subtraction broadcast for any
    rank >= 1 instead of only for 2-D input.

    Returns a tensor shaped like `x` with the last dimension removed.
    """
    m = x.max(-1, keepdim=True)[0]
    return m.squeeze(-1) + (x - m).exp().sum(-1).log()

In [97]:
logsum(pred)

tensor([2.3366, 2.3449, 2.3489,  ..., 2.3423, 2.3752, 2.3320],
       grad_fn=<AddBackward0>)

In [98]:
def log_softmax(x):
    """Log-softmax with the normaliser taken from `logsum`; indexes it as `[:,None]`, so `x` must be 2-D."""
    log_norm = logsum(x)
    return x.exp().log() - log_norm[:, None]

In [102]:
# Result of the logsum-based formulation, kept for comparison.
t3 = log_softmax(pred)

In [103]:
t3.shape

torch.Size([50000, 10])

In [104]:
torch.allclose(t2, t3)

True

In [105]:
def log_softmax(x):
    return x - x.logsumexp(-1, keepdim=True)

In [106]:
# Result of the built-in logsumexp formulation.
t4 = log_softmax(pred)
t4.shape

torch.Size([50000, 10])

In [107]:
torch.allclose(t2, t4)

True

In [112]:
# Log-probabilities that the loss cells below consume.
sm_pred = log_softmax(pred)
torch.allclose(sm_pred, t4)

True

In [121]:
def nll(input, target):
    """Negative mean of `input[i, target[i]]` over all rows `i`."""
    rows = range(target.shape[0])
    picked = input[rows, target]
    return -picked.mean()

In [122]:
# Hand-written negative log-likelihood on the log-probabilities.
loss = nll(sm_pred, y_train)
loss

tensor(2.3164, grad_fn=<NegBackward0>)

In [125]:
# Same quantity via PyTorch's F.nll_loss.
loss2 = F.nll_loss(sm_pred, y_train)
loss2

tensor(2.3164, grad_fn=<NllLossBackward0>)

In [126]:
# F.cross_entropy takes the raw predictions rather than log-probabilities.
loss3 = F.cross_entropy(pred, y_train)
loss3

tensor(2.3164, grad_fn=<NllLossBackward0>)

### Training loop

In [159]:
# Loss used by every training loop below.
loss_func = F.cross_entropy

In [160]:
bs = 50  # mini-batch size
# First `bs` rows as a single batch, and the model's predictions for it.
xb,yb = x_train[0:bs], y_train[0:bs]
preds = model(xb)

In [161]:
# Loss on that single batch.
loss = loss_func(preds, yb)
loss

tensor(2.2975, grad_fn=<NllLossBackward0>)

In [162]:
(preds.argmax(-1) == yb).float().mean()

tensor(0.1400)

In [163]:
#|export
def accuracy(out, yb):
    """Fraction of rows whose argmax over the last dimension equals `yb`, as a float tensor."""
    hits = out.argmax(-1) == yb
    return hits.float().mean()

In [164]:
accuracy(preds, yb)

tensor(0.1400)

In [195]:
lr = 0.5    # step size used by the update loops below
epochs = 3  # number of passes over the training set

In [196]:
#|export
def report(loss, preds, yb):
    """Print `loss` and the accuracy of `preds` against `yb`, each to two decimals."""
    line = f'loss:{loss:.2f}, accuracy:{accuracy(preds,yb):.2f}'
    print(line)

In [197]:
report(loss, preds, yb)

loss:2.31, accuracy:0.06


In [234]:
# Hand-written SGD over contiguous mini-batches of size `bs`.
for epoch in range(epochs):
    for start in range(0, m, bs):
        batch = slice(start, min(m, start+bs))
        xb,yb = x_train[batch],y_train[batch]
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        with torch.no_grad():
            # Only layers that expose a `weight` attribute are updated.
            for layer in model.layers:
                if not hasattr(layer, 'weight'):
                    continue
                for p in (layer.weight, layer.bias):
                    p -= lr * p.grad
                    p.grad.zero_()
    # Reports the last batch of the epoch only.
    report(loss, preds, yb)

loss:0.04, accuracy:1.00
loss:0.02, accuracy:1.00
loss:0.02, accuracy:1.00


### Parameters and optim

In [237]:
# Bind `foo` first to a string and then to an nn.Linear; the repr lists the Linear as a child.
m1 = nn.Module()
m1.foo = 'hello'
m1.foo = nn.Linear(5, 10)
m1

Module(
  (foo): Linear(in_features=5, out_features=10, bias=True)
)

In [239]:
list(m1.named_children())

[('foo', Linear(in_features=5, out_features=10, bias=True))]

In [240]:
list(m1.parameters())

[Parameter containing:
 tensor([[-0.0135,  0.2427,  0.4071, -0.2641, -0.1063],
         [-0.0249, -0.0718,  0.2160,  0.1209, -0.4219],
         [ 0.4248,  0.2381, -0.1875, -0.0182,  0.2244],
         [ 0.2804, -0.3338, -0.1321,  0.1581,  0.4113],
         [-0.3045, -0.2625,  0.3716,  0.1020,  0.3805],
         [ 0.3382, -0.4217,  0.3375,  0.3837,  0.1381],
         [-0.0558,  0.0939,  0.4124,  0.3357,  0.2708],
         [ 0.4250, -0.2025,  0.1014, -0.1028,  0.1781],
         [ 0.1769,  0.2889, -0.0114, -0.0194,  0.3398],
         [-0.0588, -0.3934, -0.2778,  0.4254, -0.3880]], requires_grad=True),
 Parameter containing:
 tensor([-0.0305,  0.0801, -0.3037, -0.3867,  0.2716,  0.3476, -0.3826,  0.2228,
          0.1090, -0.2184], requires_grad=True)]

In [250]:
class MLP(nn.Module):
    """Linear -> ReLU -> Linear network with each layer stored as its own attribute."""
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.l1 = nn.Linear(n_in, nh)
        self.relu = nn.ReLU()
        self.l2 = nn.Linear(nh, n_out)

    def forward(self, x):
        hidden = self.relu(self.l1(x))
        return self.l2(hidden)

In [251]:
# Replace the earlier list-based model with the attribute-based MLP.
model = MLP(n, nh, n_out)

In [252]:
model

MLP(
  (l1): Linear(in_features=784, out_features=50, bias=True)
  (relu): ReLU()
  (l2): Linear(in_features=50, out_features=10, bias=True)
)

In [253]:
model(xb).shape

torch.Size([50, 10])

In [254]:
def fit():
    """Train the notebook-level `model` with manual SGD, reporting once per epoch.

    Reads `epochs`, `m`, `bs`, `x_train`, `y_train`, `loss_func` and `lr`
    from the enclosing notebook namespace.
    """
    for epoch in range(epochs):
        for start in range(0, m, bs):
            stop = min(m, start+bs)
            xb,yb = x_train[start:stop],y_train[start:stop]
            preds = model(xb)
            loss = loss_func(preds, yb)
            loss.backward()
            with torch.no_grad():
                for param in model.parameters():
                    param -= lr*param.grad
                model.zero_grad()
        # Reports the last batch of the epoch only.
        report(loss, preds, yb)

In [255]:
fit()

loss:0.15, accuracy:0.96
loss:0.14, accuracy:0.96
loss:0.13, accuracy:0.94


In [259]:
class MyModule:
    """Minimal re-creation of sub-module registration.

    Public attributes that expose a `parameters` attribute are recorded in
    `self._modules` (in assignment order) so `parameters()` can walk them.
    Other public attributes are stored normally but not registered.
    """
    def __init__(self,n_in,nh,n_out):
        # Must exist before any public attribute is set: __setattr__ writes into it.
        self._modules = {}
        self.l1 = nn.Linear(n_in, nh)
        self.relu = nn.ReLU()
        self.l2 = nn.Linear(nh, n_out)

    def __setattr__(self,k,v):
        if not k.startswith('_'):
            if hasattr(v, 'parameters'):
                self._modules[k] = v
            else:
                # A plain value (e.g. a float or None) must not be registered, and
                # replaces any module previously registered under the same name;
                # otherwise parameters() would fail on it.
                self._modules.pop(k, None)
        super().__setattr__(k, v)

    def __repr__(self):
        return f'{self._modules}'

    def parameters(self):
        "Yield the parameters of every registered sub-module, in registration order."
        for l in self._modules.values():
            yield from l.parameters()

In [260]:
md1 = MyModule(n,nh,n_out)

In [261]:
md1

{'l1': Linear(in_features=784, out_features=50, bias=True), 'relu': ReLU(), 'l2': Linear(in_features=50, out_features=10, bias=True)}

In [262]:
for p in md1.parameters():
    print(p.shape)

torch.Size([50, 784])
torch.Size([50])
torch.Size([10, 50])
torch.Size([10])


In [263]:
# One shared list of layer objects; the model cells below are all built from it.
layers = [nn.Linear(n,nh), nn.ReLU(), nn.Linear(nh,n_out)]

In [268]:
class Model(nn.Module):
    """Runs `layers` in order, registering each one by hand as `layer_<index>`."""
    def __init__(self, layers):
        super().__init__()
        self.layers = layers
        # The list itself is kept as given; add_module registers each entry under its own name.
        for i, layer in enumerate(layers):
            self.add_module(f'layer_{i}', layer)

    def forward(self, x):
        out = x
        for layer in self.layers:
            out = layer(out)
        return out

In [269]:
# Wrap the shared layer list in the add_module-based model.
model = Model(layers)

In [270]:
model

Model(
  (layer_0): Linear(in_features=784, out_features=50, bias=True)
  (layer_1): ReLU()
  (layer_2): Linear(in_features=50, out_features=10, bias=True)
)

In [271]:
model(xb).shape

torch.Size([50, 10])

In [272]:
class SequentialsModel(nn.Module):
    """Runs `layers` in order, holding them in an `nn.ModuleList`."""
    def __init__(self, layers):
        super().__init__()
        self.layers = nn.ModuleList(layers)

    def forward(self, x):
        out = x
        for layer in self.layers:
            out = layer(out)
        return out

In [274]:
# Same layers, now held in an nn.ModuleList.
model = SequentialsModel(layers)
model

SequentialsModel(
  (layers): ModuleList(
    (0): Linear(in_features=784, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=10, bias=True)
  )
)

In [275]:
model(xb).shape

torch.Size([50, 10])

In [276]:
# Same layers again, using PyTorch's built-in nn.Sequential container.
model = nn.Sequential(*layers)
model

Sequential(
  (0): Linear(in_features=784, out_features=50, bias=True)
  (1): ReLU()
  (2): Linear(in_features=50, out_features=10, bias=True)
)

In [277]:
model(xb).shape

torch.Size([50, 10])

### Optim

In [347]:
class Optimizer:
    """Plain SGD: `p <- p - lr * p.grad` for every parameter.

    params: iterable of tensors to update. It is materialised into a list so
            a generator such as `model.parameters()` can be walked repeatedly.
    lr:     step size.
    """
    def __init__(self, params, lr=0.5):
        self.lr = lr
        self.params = list(params)

    def step(self):
        "Apply one in-place update; parameters that have no gradient yet are skipped."
        with torch.no_grad():
            for p in self.params:
                if p.grad is None:
                    continue
                p -= self.lr*p.grad

    def zero_grad(self):
        "Zero existing gradients in place; parameters that have no gradient yet are skipped."
        with torch.no_grad():
            for p in self.params:
                if p.grad is not None:
                    p.grad.zero_()

In [348]:
# Fresh, untrained model for the hand-written optimizer.
model = nn.Sequential(*[nn.Linear(n,nh), nn.ReLU(), nn.Linear(nh,n_out)])

In [349]:
# Uses the Optimizer's default lr of 0.5.
opt = Optimizer(model.parameters())

In [351]:
# Same mini-batch loop, with the update and gradient reset delegated to `opt`.
for epoch in range(epochs):
    for start in range(0, m, bs):
        stop = min(m, start+bs)
        xb,yb = x_train[start:stop],y_train[start:stop]
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        opt.step()
        opt.zero_grad()
    # Reports the last batch of the epoch only.
    report(loss, preds, yb)

loss:0.09, accuracy:0.96
loss:0.03, accuracy:1.00
loss:0.02, accuracy:1.00


In [363]:
from torch import optim

In [364]:
def get_model():
    """Return a fresh (model, optimizer) pair.

    The model is Linear -> ReLU -> Linear sized by the notebook-level `n`,
    `nh` and `n_out`; the optimizer is `optim.SGD` with the notebook-level `lr`.
    """
    net = nn.Sequential(nn.Linear(n,nh), nn.ReLU(), nn.Linear(nh,n_out))
    sgd = optim.SGD(net.parameters(), lr=lr)
    return net,sgd

In [365]:
# Fresh model and torch.optim.SGD optimizer.
model,opt = get_model()

In [366]:
# Same loop as with the hand-written Optimizer, now driven by optim.SGD.
for epoch in range(epochs):
    for i in range(0, m, bs):
        rows = slice(i, min(m, i+bs))
        xb = x_train[rows]
        yb = y_train[rows]
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        opt.step()
        opt.zero_grad()
    # Reports the last batch of the epoch only.
    report(loss, preds, yb)

loss:0.16, accuracy:0.98
loss:0.08, accuracy:0.98
loss:0.03, accuracy:1.00


### Dataset

In [369]:
#|export

class Dataset:
    """Pairs inputs `x` with targets `y` so one index (or slice) fetches both."""
    def __init__(self,x,y):
        self.x = x
        self.y = y

    def __len__(self):
        # Length is taken from `x` alone; `y` is not checked against it.
        return len(self.x)

    def __getitem__(self,i):
        item_x = self.x[i]
        item_y = self.y[i]
        return item_x,item_y

In [370]:
# Wrap the training and validation tensors.
train_ds,val_ds = Dataset(x_train, y_train),Dataset(x_val, y_val)

In [379]:
# Slicing the dataset slices inputs and targets together.
xb,yb = train_ds[0:5]

In [380]:
xb.shape,yb.shape

(torch.Size([5, 784]), torch.Size([5]))

In [383]:
# Start again from an untrained model.
model,opt = get_model()

In [385]:
# Mini-batches now come from slicing the Dataset instead of the raw tensors.
for epoch in range(epochs):
    for i in range(0,m,bs):
        stop = min(m,i+bs)
        batch = train_ds[i:stop]
        xb,yb = batch
        preds = model(xb)
        loss = loss_func(preds,yb)
        loss.backward()
        opt.step()
        opt.zero_grad()
    # Reports the last batch of the epoch only.
    report(loss,preds,yb)

loss:0.21, accuracy:0.94
loss:0.18, accuracy:0.92
loss:0.14, accuracy:0.92


### DataLoader

In [386]:
class DataLoader:
    """Iterates over `ds` in consecutive slices of at most `bs` items."""
    def __init__(self,ds,bs):
        self.ds = ds
        self.bs = bs

    def __iter__(self):
        for start in range(0,len(self.ds),self.bs):
            stop = start+self.bs
            # Slicing past the end is left to `ds` to handle; the last slice may be short.
            yield self.ds[start:stop]

In [387]:
# Both loaders use the same batch size `bs`.
train_dl = DataLoader(train_ds, bs)
val_dl = DataLoader(val_ds, bs)

In [389]:
# Start again from an untrained model.
model,opt = get_model()

In [392]:
def fit():
    """Train the notebook-level `model` with `opt`, drawing batches from `train_dl`.

    Reads `epochs`, `train_dl`, `model`, `loss_func` and `opt` from the
    enclosing notebook namespace and reports once per epoch.
    """
    for _ in range(epochs):
        for batch in train_dl:
            xb,yb = batch
            preds = model(xb)
            loss = loss_func(preds, yb)
            loss.backward()
            opt.step()
            opt.zero_grad()
        # Reports the last batch of the epoch only.
        report(loss, preds, yb)

In [393]:
fit()

loss:0.10, accuracy:0.96
loss:0.05, accuracy:1.00
loss:0.03, accuracy:1.00


### Random sampling

In [394]:
import random

In [397]:
class Sampler:
    """Yields the indices 0..len(ds)-1, in order or reshuffled on every iteration."""
    def __init__(self, ds, shuffle=False):
        # Only the length of `ds` is kept, not the dataset itself.
        self.n = len(ds)
        self.shuffle = shuffle

    def __iter__(self):
        idxs = [*range(self.n)]
        if self.shuffle:
            random.shuffle(idxs)
        return iter(idxs)

In [402]:
# Shuffled index sampler over the training set.
ss = Sampler(train_ds, shuffle=True)

In [404]:
import fastcore.all as fc

In [412]:
class BatchSampler:
    """Groups the indices produced by `sampler` into batches via `fc.chunked`.

    `bs` and `drop_last` are forwarded to `fc.chunked` unchanged.
    """
    def __init__(self, sampler, bs, drop_last=False):
        # Presumably stores the arguments as same-named attributes; __iter__ reads them back.
        fc.store_attr()

    def __iter__(self):
        batches = fc.chunked(iter(self.sampler), self.bs, drop_last=self.drop_last)
        for batch in batches:
            yield batch

In [416]:
# Batches of 4 indices drawn from the shuffled sampler.
batchs = BatchSampler(ss, 4)

In [505]:
from itertools import islice

In [601]:
list(islice(batchs, 4))

[[26747, 32392, 21528, 44881],
 [24809, 44369, 43235, 34463],
 [18199, 38684, 26613, 41748],
 [23086, 1949, 15452, 22802]]

In [636]:
def collate(b):
    """Turn an iterable of (x, y) pairs into one stacked x tensor and one stacked y tensor."""
    feats,labels = zip(*b)
    stacked_x = torch.stack(feats)
    stacked_y = torch.stack(labels)
    return stacked_x,stacked_y

In [637]:
class DataLoader:
    """Yields one collated batch per index batch produced by `batchs`.

    For each batch of indices the matching items are looked up in `ds` and
    handed, as a generator, to `collate_fn`.
    """
    def __init__(self, ds, batchs, collate_fn=collate):
        # Presumably stores the arguments as same-named attributes; __iter__ reads them back.
        fc.store_attr()

    def __iter__(self):
        for idxs in self.batchs:
            samples = (self.ds[i] for i in idxs)
            yield self.collate_fn(samples)

In [638]:
# Shuffled indices over the training set, grouped into batches of `bs`.
train_samp = BatchSampler(Sampler(train_ds, shuffle=True ), bs)

In [639]:
# Sampler-driven loader using the default `collate`.
train_dl = DataLoader(train_ds, batchs=train_samp)

In [640]:
# Pull a single batch to inspect its shapes.
xb,yb = next(iter(train_dl))

In [641]:
xb.shape,yb.shape

(torch.Size([50, 784]), torch.Size([50]))

In [665]:
# Two (a, b) pairs used to illustrate zip(*t). The earlier iter(...) and map(tensor, ...)
# assignments were overwritten before ever being consumed, so only the final value is kept.
t = ((1, 2), (3, 4))

In [667]:
# zip(*t) pairs up the first elements and then the second elements of the tuples in t.
for a,b in zip(*t):
    print(a,b)

1 3
2 4


### Validation 

In [682]:
#|export
from torch.utils.data import default_collate,DataLoader

In [683]:
#|export
def fit(epochs, model, loss_func, opt, train_dl, valid_dl):
    """Train `model` for `epochs` epochs, validating after each one.

    epochs:    number of passes over `train_dl`.
    model:     module called as `model(xb)`; switched between train() and eval().
    loss_func: called as `loss_func(pred, yb)`.
    opt:       optimizer providing `step()` and `zero_grad()`.
    train_dl:  iterable of (xb, yb) training batches.
    valid_dl:  iterable of (xb, yb) validation batches.

    Prints `epoch, loss, accuracy` after every epoch and returns the
    (loss, accuracy) pair of the last one. Both values are NaN when
    `epochs` is 0 or `valid_dl` yields no samples, instead of raising
    UnboundLocalError / ZeroDivisionError.
    """
    val_loss = val_acc = float('nan')
    for epoch in range(epochs):
        model.train()
        for xb,yb in train_dl:
            loss = loss_func(model(xb), yb)
            loss.backward()
            opt.step()
            opt.zero_grad()

        model.eval()
        with torch.no_grad():
            tot_loss,tot_acc,count = 0.,0.,0.
            for xb,yb in valid_dl:
                pred = model(xb)
                n = len(xb)
                count += n
                # Weight each batch by its size so a short final batch does not skew the
                # average (assumes loss_func returns a per-batch mean -- confirm).
                tot_loss += loss_func(pred,yb).item()*n
                tot_acc += accuracy(pred,yb).item()*n
        if count:
            val_loss,val_acc = tot_loss/count, tot_acc/count
        else:
            val_loss = val_acc = float('nan')
        print(epoch, val_loss, val_acc)
    return val_loss, val_acc

In [684]:
#|export
def get_dls(train_ds, val_ds, bs, **kwargs):
    """Build a (training, validation) pair of DataLoaders.

    The training loader shuffles and uses batch size `bs`; the validation
    loader keeps order and uses `bs*2`. Extra keyword arguments are passed
    to both loaders.
    """
    train_loader = DataLoader(train_ds, batch_size=bs, shuffle=True, **kwargs)
    val_loader = DataLoader(val_ds, batch_size=bs*2, shuffle=False, **kwargs)
    return (train_loader, val_loader)

In [685]:
# PyTorch DataLoaders for both splits, plus a fresh model and optimizer.
train_dl,valid_dl = get_dls(train_ds, val_ds, bs)
model,opt = get_model()

In [686]:
%time loss,acc = fit(epochs, model, loss_func, opt, train_dl, val_dl)

0 0.15154032296035438 0.9564000001549721
1 0.14613142590736972 0.9564999982714653
2 0.1094715037359856 0.9671000015735626
CPU times: user 2.31 s, sys: 119 ms, total: 2.43 s
Wall time: 2.66 s


### Export

In [687]:
# Run nbdev's notebook export.
# NOTE(review): presumably writes the `#|export` cells to the module named by `#|default_exp` -- confirm.
import nbdev
nbdev.nbdev_export()