## walk through of the pytorch api
[source](https://pytorch.org/tutorials/beginner/nn_tutorial.html)

In [1]:
import torch
import numpy as np
from pathlib import Path

In [2]:
import requests

In [3]:
DATA_PATH = Path("data")
PATH = DATA_PATH / "mnist"

In [4]:
PATH

PosixPath('data/mnist')

In [5]:
PATH.mkdir(parents=True, exist_ok=True)

In [6]:
URL="http://deeplearning.net/data/mnist/"
FILENAME="mnist.pkl.gz"

In [7]:
if not (PATH / FILENAME).exists():
    content = requests.get(URL + FILENAME).content
    (PATH / FILENAME).open("wb").write(content)

In [8]:
import pickle
import gzip
with gzip.open((PATH / FILENAME).as_posix(), "rb") as f:
    ((x_train, y_train), (x_valid, y_valid), other) = pickle.load(f, encoding="latin-1")

In [9]:
import torch

In [10]:
x_train, y_train, x_valid, y_valid = map( torch.tensor, 
        (x_train, y_train, x_valid, y_valid))

In [11]:
print(x_train.max(), x_train.min())

tensor(0.9961) tensor(0.)


In [12]:
import math

In [13]:
weights = torch.randn(784, 10)/math.sqrt(10)

In [14]:
weights.requires_grad_()

tensor([[-0.0359,  0.0949,  0.2106,  ...,  0.0066, -0.1073,  0.2142],
        [ 0.0126,  0.3749,  0.1723,  ...,  0.0339,  0.1582, -0.2986],
        [-0.4197, -0.1001, -0.1211,  ...,  0.3161, -0.2685,  0.1548],
        ...,
        [ 0.1440, -0.4225, -0.2772,  ...,  0.3982,  0.1130,  0.2492],
        [ 0.3383, -0.2124, -0.1433,  ..., -0.3756, -0.0267,  0.1842],
        [ 0.1300,  0.4001, -0.2007,  ...,  0.2468, -0.4542, -0.2929]],
       requires_grad=True)

In [15]:
bias = torch.zeros(10, requires_grad=True)

In [16]:
def log_softmax(x):
    return x - x.exp().sum(-1).log().unsqueeze(-1)
def model(xb):
    return log_softmax(xb @ weights + bias)

In [17]:
batch_size = 64
xb=x_train[0:batch_size]
preds= model(xb)

In [18]:
print(preds[0])

tensor([-5.2468, -2.2046, -1.9279, -1.0049, -2.8145, -2.4985, -6.2645, -3.0805,
        -2.0400, -2.9395], grad_fn=<SelectBackward>)


Indexing 2d tensor with two 1d tensors of the same shape in each axis --> corresponding 1d tensor

In [19]:
def nll(input, target):
    return -input[range(target.shape[0]), target].mean()
loss_func = nll

In [20]:
yb = y_train[0:batch_size]

In [21]:
l = loss_func(preds, yb)

In [22]:
def accuracy(out, yb):
    preds = torch.argmax(out, dim=1)
    return (preds==yb).float().mean()

In [23]:
print(accuracy(preds, yb))

tensor(0.0312)


In [24]:
n = x_train.shape[0]

In [25]:
from IPython.core.debugger import set_trace

In [26]:
lr = .5
epochs=2

In [27]:
for epoch in range(epochs):
#     set_trace()
    for i in range((n-1)//batch_size + 1):
        start_i = i * batch_size
        end_i = (i+1) * batch_size
        xb = x_train[start_i:end_i]
        yb = y_train[start_i:end_i]
        pred = model(xb)
        loss = loss_func(pred, yb)
        
        #compute grads WRT loss and store in graph leaves
        loss.backward()
        #this part has nothing to do with gradient comp
        with torch.no_grad():
            weights -= weights.grad * lr
            bias -= bias.grad * lr
            weights.grad.zero_()
            bias.grad.zero_()

In [28]:
print(loss_func(model(xb), yb), accuracy(model(xb), yb))

tensor(0.0708, grad_fn=<NegBackward>) tensor(1.)


Refactor with `torch.nn.Functional`

In [30]:
import torch.nn.functional as F

In [31]:
loss_func = F.cross_entropy

In [32]:
def model(xb):
    return xb @ weights + bias

In [33]:
print(loss_func(model(xb), yb), accuracy(model(xb), yb))

tensor(0.0708, grad_fn=<NllLossBackward>) tensor(1.)


Refactor with `nn.Module`

In [34]:
from torch import nn

In [35]:
class MNIST_logistic(nn.Module):
    def __init__(self):
        super().__init__()
        self.weights = nn.Parameter(torch.randn(784, 10)/ math.sqrt(10))
        self.bias = nn.Parameter(torch.zeros(10))
    def forward(self, xb):
        return xb @ self.weights + self.bias

In [42]:
model = MNIST_logistic()

In [43]:
def fit():
    for epoch in range(epochs):

        for i in range((n-1)//batch_size + 1):
            start_i = i * batch_size
            end_i = (i+1) * batch_size
            xb = x_train[start_i:end_i]
            yb = y_train[start_i:end_i]
            pred = model(xb)
            loss = loss_func(pred, yb)


            loss.backward()

            with torch.no_grad():
                for p in model.parameters():
                    p -= p.grad * lr
                model.zero_grad()

In [44]:
fit()

In [46]:
print(loss_func(model(xb), yb))

tensor(0.0744, grad_fn=<NllLossBackward>)


Refactor with `nn.Linear`

In [47]:
class MNIST_logistic(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(784, 10)
    def forward(self, xb):
        return self.linear(xb)

In [48]:
model = MNIST_logistic()

In [54]:
for p in model.children():
    print(p)

Linear(in_features=784, out_features=10, bias=True)


In [55]:
print(loss_func(model(xb), yb))

tensor(2.3122, grad_fn=<NllLossBackward>)


In [56]:
fit()

In [57]:
print(loss_func(model(xb), yb))

tensor(0.0824, grad_fn=<NllLossBackward>)


Adding optimizers

In [58]:
from torch import optim

In [59]:
def get_model():
    model = MNIST_logistic()
    opt = optim.SGD(model.parameters(), lr=lr)
    return model, opt

In [60]:
model, opt = get_model()

In [61]:
print(loss_func(model(xb), yb))

tensor(2.2906, grad_fn=<NllLossBackward>)


In [62]:
for epoch in range(epochs):

    for i in range((n-1)//batch_size + 1):
        start_i = i * batch_size
        end_i = (i+1) * batch_size
        xb = x_train[start_i:end_i]
        yb = y_train[start_i:end_i]
        pred = model(xb)
        loss = loss_func(pred, yb)

        loss.backward()
        opt.step()
        opt.zero_grad()

In [63]:
print(loss_func(model(xb), yb))

tensor(0.0810, grad_fn=<NllLossBackward>)


Refactor with `Dataset`

In [65]:
from torch.utils.data import TensorDataset

In [66]:
train_ds = TensorDataset(x_train, y_train)

In [69]:
for epoch in range(epochs):

    for i in range((n-1)//batch_size + 1):
        start_i = i * batch_size
        end_i = (i+1) * batch_size
        xb, yb = train_ds[start_i:end_i]
        pred = model(xb)
        loss = loss_func(pred, yb)

        loss.backward()
        opt.step()
        opt.zero_grad()

In [71]:
from torch.utils.data import DataLoader

In [72]:
train_dl = DataLoader(train_ds, batch_size=batch_size)

In [74]:
def fit(model, opt):
    for xb, yb in train_dl:
        pred = model(xb)
        loss = loss_func(pred, yb)
        loss.backward()
        opt.step()
        opt.zero_grad()

In [75]:
model, opt = get_model()

In [76]:
print(loss_func(model(xb), yb))

tensor(2.3112, grad_fn=<NllLossBackward>)


In [77]:
fit(model, opt)

In [78]:
print(loss_func(model(xb), yb))

tensor(0.1108, grad_fn=<NllLossBackward>)


Adding val step

In [81]:
train_ds= TensorDataset(x_train, y_train)
val_ds = TensorDataset(x_valid, y_valid)
train_dl = DataLoader(train_ds, shuffle=True, batch_size=batch_size)
val_dl = DataLoader(val_ds, shuffle=False, batch_size = 2 * batch_size)

In [99]:
def fit(model, opt):
    for epoch in range(epochs):
        model.train()
        for xb, yb in train_dl:
            pred = model(xb)
            loss = loss_func(pred, yb)
            loss.backward()
            opt.step()
            opt.zero_grad()
        model.eval()
        with torch.no_grad():
            val_loss = sum(loss_func(model(xb), yb) for xb, yb in val_dl) / len(val_dl)
            print(val_loss)

In [100]:
model, opt = get_model()

In [101]:
print(loss_func(model(xb), yb))

tensor(2.3414, grad_fn=<NllLossBackward>)


In [102]:
fit(model, opt)

tensor(0.3183)
tensor(0.2858)


In [103]:
print(loss_func(model(xb), yb))

tensor(0.4091, grad_fn=<NllLossBackward>)


In [104]:
a = torch.ones(3)

In [105]:
a

tensor([1., 1., 1.])

In [108]:
a.shape

torch.Size([3])

Adding conv layers and Sequential

In [111]:
class Lambda(nn.Module):
    def __init__(self, func):
        super().__init__()
        self.func = func
    def forward(self, x):
        return self.func(x)

In [112]:
def preprocess(x):
    return x.view(-1, 1, 28,28)

In [113]:
model = nn.Sequential(
        Lambda(preprocess),
        nn.Conv2d(1,16,kernel_size=3,stride=2, padding=1),
        nn.ReLU(),
        nn.Conv2d(16,16,kernel_size=3,stride=2, padding=1),
        nn.ReLU(),
        nn.Conv2d(16,10,kernel_size=3,stride=2, padding=1),
        nn.ReLU(),
        nn.AvgPool2d(4), 
        Lambda(lambda x: x.view(x.size(0), -1)))

In [116]:
dir(model)

['__call__',
 '__class__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_backend',
 '_backward_hooks',
 '_buffers',
 '_forward_hooks',
 '_forward_pre_hooks',
 '_get_item_by_idx',
 '_get_name',
 '_load_from_state_dict',
 '_load_state_dict_pre_hooks',
 '_modules',
 '_named_members',
 '_parameters',
 '_register_load_state_dict_pre_hook',
 '_register_state_dict_hook',
 '_slow_forward',
 '_state_dict_hooks',
 '_tracing_name',
 '_version',
 'add_module',
 'apply',
 'buffers',
 'children',
 'cpu',
 'cuda',
 'double',
 'dump_patches',
 'eval',
 'extra_repr',
 'float',
 'forward',
 'ha