## walk through of the pytorch api
[source](https://pytorch.org/tutorials/beginner/nn_tutorial.html)

In [1]:
import torch
import numpy as np
from pathlib import Path

In [2]:
import requests

In [3]:
DATA_PATH = Path("data")
PATH = DATA_PATH / "mnist"

In [4]:
PATH

PosixPath('data/mnist')

In [5]:
PATH.mkdir(parents=True, exist_ok=True)

In [6]:
URL="http://deeplearning.net/data/mnist/"
FILENAME="mnist.pkl.gz"

In [7]:
if not (PATH / FILENAME).exists():
    content = requests.get(URL + FILENAME).content
    (PATH / FILENAME).open("wb").write(content)

In [8]:
import pickle
import gzip
with gzip.open((PATH / FILENAME).as_posix(), "rb") as f:
    ((x_train, y_train), (x_valid, y_valid), other) = pickle.load(f, encoding="latin-1")

In [9]:
import torch

In [10]:
x_train, y_train, x_valid, y_valid = map( torch.tensor, 
        (x_train, y_train, x_valid, y_valid))

In [11]:
print(x_train.max(), x_train.min())

tensor(0.9961) tensor(0.)


In [12]:
import math

In [13]:
weights = torch.randn(784, 10)/math.sqrt(10)

In [14]:
weights.requires_grad_()

tensor([[-0.1187, -0.0815,  0.2100,  ...,  0.1378,  0.1532, -0.2719],
        [-0.3424,  0.1109, -0.2282,  ...,  0.0563,  0.1488, -0.0516],
        [ 0.4393,  0.1185, -0.0096,  ...,  0.3677, -0.4033,  0.3449],
        ...,
        [ 0.1083,  0.1018, -0.6197,  ..., -0.3246,  0.1452, -0.1608],
        [-0.0230, -0.1586, -0.0061,  ..., -0.0670, -0.4275,  0.2260],
        [-0.2039,  0.4072,  0.6629,  ..., -0.3535,  0.2506, -0.1778]],
       requires_grad=True)

In [15]:
bias = torch.zeros(10, requires_grad=True)

In [16]:
def log_softmax(x):
    return x - x.exp().sum(-1).log().unsqueeze(-1)
def model(xb):
    return log_softmax(xb @ weights + bias)

In [17]:
batch_size = 64
xb=x_train[0:batch_size]
preds= model(xb)

In [18]:
print(preds[0])

tensor([ -1.5225,  -0.2951,  -7.5953,  -5.4842, -10.8539,  -6.7765,  -8.7971,
        -15.0511,  -3.4804,  -7.3984], grad_fn=<SelectBackward>)


Indexing 2d tensor with two 1d tensors of the same shape in each axis --> corresponding 1d tensor

In [19]:
def nll(input, target):
    return -input[range(target.shape[0]), target].mean()
loss_func = nll

In [20]:
yb = y_train[0:batch_size]

In [21]:
l = loss_func(preds, yb)

In [22]:
def accuracy(out, yb):
    preds = torch.argmax(out, dim=1)
    return (preds==yb).float().mean()

In [23]:
print(accuracy(preds, yb))

tensor(0.0938)


In [24]:
n = x_train.shape[0]

In [25]:
from IPython.core.debugger import set_trace

In [26]:
lr = .5
epochs=2

In [27]:
for epoch in range(epochs):
#     set_trace()
    for i in range((n-1)//batch_size + 1):
        start_i = i * batch_size
        end_i = (i+1) * batch_size
        xb = x_train[start_i:end_i]
        yb = y_train[start_i:end_i]
        pred = model(xb)
        loss = loss_func(pred, yb)
        
        #compute grads WRT loss and store in graph leaves
        loss.backward()
        #this part has nothing to do with gradient comp
        with torch.no_grad():
            weights -= weights.grad * lr
            bias -= bias.grad * lr
            weights.grad.zero_()
            bias.grad.zero_()

In [38]:
print(loss_func(model(xb), yb), accuracy(model(xb), yb))

tensor(0.0822, grad_fn=<NegBackward>) tensor(1.)
