In [2]:
import torch, pickle, gzip, matplotlib as mp
import torch.nn as nn
import torch.nn.functional as F

mp.rcParams['image.cmap'] = 'gray'

In [3]:
data = None
with gzip.open('data/mnist.pkl.gz', 'rb') as f:
    data = pickle.load(f, encoding='latin')
    
((x_train, y_train), (x_val, y_val), _) = data
(x_train, y_train, x_val, y_val) = map(torch.tensor, (x_train, y_train, x_val, y_val))
x_train.shape, y_train.shape, x_val.shape, y_val.shape

(torch.Size([50000, 784]),
 torch.Size([50000]),
 torch.Size([10000, 784]),
 torch.Size([10000]))

In [4]:
m,n = x_train.shape
c = max(y_train) + 1
nh = 50
nout = 10

In [82]:
class Model(nn.Module):
    def __init__(self, nin, nh, nout):
        super().__init__()
        self.layers = [nn.Linear(nin, nh), nn.ReLU(nh), nn.Linear(nh, nout)]
    
    def __call__(self, x):
        for l in self.layers:
            x = l(x)
        return x

In [83]:
model = Model(n, nh, nout)

In [84]:
model

Model()

In [85]:
model.layers

[Linear(in_features=784, out_features=50, bias=True),
 ReLU(inplace=True),
 Linear(in_features=50, out_features=10, bias=True)]

In [12]:
pred = model(x_train)

In [13]:
pred.shape

torch.Size([50000, 10])

In [27]:
def logsoftmax(x):
    return x.exp()/(x.exp().sum(1, keepdim=True))

In [38]:
def logsoftmax(x):
    return x - (x.exp().sum(1, keepdim=True)).log()

In [39]:
logsoftmax(pred)[0].sum()

tensor(-23.0668, grad_fn=<SumBackward0>)

In [40]:
def logsumexp(x):
    m = x.max(1, keepdim=True)[0]
    return m + ((x-m).exp().sum(1, keepdim=True)).log()

In [41]:
def logsoftmax(x):
    return x - logsumexp(x)

In [44]:
logsoftmax(pred)[0].sum()

tensor(-23.0668, grad_fn=<SumBackward0>)

In [45]:
torch.allclose(logsumexp(pred), pred.logsumexp(1, keepdim=True))

True

In [48]:
def nll(pred, target):
    return -pred[range(pred.shape[0]), target].mean()

In [49]:
nll(pred, y_train)

tensor(0.0509, grad_fn=<NegBackward0>)

In [50]:
sm_pred = logsoftmax(pred)

In [52]:
nll(sm_pred, y_train)

tensor(2.2991, grad_fn=<NegBackward0>)

In [53]:
loss = nll(sm_pred, y_train)
loss

tensor(2.2991, grad_fn=<NegBackward0>)

In [54]:
torch.allclose(loss, F.nll_loss(sm_pred, y_train))

True

In [55]:
torch.allclose(loss, F.cross_entropy(sm_pred, y_train))

True

In [56]:
loss_func = F.cross_entropy

In [57]:
bs = 50
xb, yb = x_train[:bs], y_train[:bs]
xb.shape, yb.shape

(torch.Size([50, 784]), torch.Size([50]))

In [58]:
preds = model(xb)

In [59]:
preds.shape

torch.Size([50, 10])

In [65]:
loss = loss_func(preds, yb)
loss

tensor(2.3032, grad_fn=<NllLossBackward0>)

In [61]:
def accuracy(preds, target):
    return (preds.argmax(dim=1) == target).float().mean()

In [62]:
accuracy(preds, yb)

tensor(0.0800)

In [66]:
def report(loss, preds, target):
    print(f'loss:{loss:.2f}, accuracy:{accuracy(preds, target):.2f}')

In [67]:
report(loss, preds, yb)

loss:2.30, accuracy:0.08


In [68]:
lr = 0.5
epochs = 3

In [86]:
for epoch in range(epochs):
    for i in range(0, m, bs):
        s = slice(i, min(m, i+bs))
        xb,yb = x_train[s],y_train[s]
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        with torch.no_grad():
            for l in model.layers:
                if hasattr(l, 'weight'):
                    l.weight -= lr * l.weight.grad
                    l.bias -= lr * l.bias.grad
                    l.weight.grad.zero_()
                    l.bias.grad.zero_()
    report(loss, preds, yb)

loss:0.13, accuracy:0.96
loss:0.08, accuracy:0.98
loss:0.07, accuracy:0.98


In [91]:
class MLP(nn.Module):
    def __init__(self, nin, nh, nout):
        super().__init__()
        self.l1 = nn.Linear(nin, nh)
        self.relu = nn.ReLU(nh)
        self.l2 = nn.Linear(nh, nout)
    
    def forward(self, x):
        return self.l2(self.relu(self.l1(x)))

In [92]:
model = MLP(n, nh, nout)

In [93]:
model

MLP(
  (l1): Linear(in_features=784, out_features=50, bias=True)
  (relu): ReLU(inplace=True)
  (l2): Linear(in_features=50, out_features=10, bias=True)
)

In [97]:
model

MLP(
  (l1): Linear(in_features=784, out_features=50, bias=True)
  (relu): ReLU(inplace=True)
  (l2): Linear(in_features=50, out_features=10, bias=True)
)

In [100]:
def fit():
    for epoch in range(epochs):
        for i in range(0, m, bs):
            s = slice(i, min(m, i+bs))
            xb,yb = x_train[s],y_train[s]
            preds = model(xb)
            loss = loss_func(preds, yb)
            loss.backward()
            with torch.no_grad():
                for p in model.parameters():
                    p -= p.grad * lr
                model.zero_grad()
        report(loss, preds, yb)

In [101]:
fit()

loss:0.08, accuracy:0.96
loss:0.07, accuracy:0.96
loss:0.05, accuracy:0.98


In [153]:
class MyModule:
    def __init__(self):
        self._modules = {}
        
    def __repr__(self):
        return f'{self._modules}'
    
    def __setattr__(self, k, v):
        if not k.startswith('_'):
            self._modules[k] = v
        super().__setattr__(k, v)
        
    def parameters(self):
        for l in self._modules.values():
            yield from l.parameters()
            
    def zero_grad(self):
        for p in self.parameters():
            p.grad.zero_()
            
    def __call__(self, x):
        return self.forward(x)
            

In [154]:
md1 = MyModule()

In [155]:
md1.foo = nn.Linear(3, 4)

In [156]:
md1

{'foo': Linear(in_features=3, out_features=4, bias=True)}

In [157]:
for p in md1.parameters():
    print(p.shape)

torch.Size([4, 3])
torch.Size([4])


In [162]:
class MLP(MyModule):
    def __init__(self, nin, nh, nout):
        super().__init__()
        self.l1 = nn.Linear(nin, nh)
        self.relu = nn.ReLU(nh)
        self.l2 = nn.Linear(nh, nout)
    
    def forward(self, x):
        return self.l2(self.relu(self.l1(x)))

In [163]:
model = MLP(n, nh, nout)

In [164]:
model

{'l1': Linear(in_features=784, out_features=50, bias=True), 'relu': ReLU(inplace=True), 'l2': Linear(in_features=50, out_features=10, bias=True)}

In [165]:
fit()

loss:0.17, accuracy:0.92
loss:0.10, accuracy:0.98
loss:0.10, accuracy:0.96
