In [2]:
#export
import torch
from torch import tensor
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

In [4]:
#export
import pickle,gzip,math,torch,matplotlib as mpl
import matplotlib.pyplot as plt

In [5]:
from fastai import datasets

In [6]:
from pathlib import Path
from torch.nn import init

In [8]:
# Fetch the MNIST pickle through fastai's download helper and display the local path it returns.
MNIST_URL = "http://deeplearning.net/data/mnist/mnist.pkl"
path = datasets.download_data(MNIST_URL, ext='.gz')
path

PosixPath('/Users/abhinavverma/.fastai/data/mnist.pkl.gz')

In [9]:
#export
def get_data():
    """Load the MNIST training and validation splits as tensors.

    The local archive path is obtained from `datasets.download_data(MNIST_URL, ext='.gz')`,
    the gzipped pickle is read, and the third element of the pickled tuple is ignored.

    Returns:
        A lazy `map` yielding `x_train, y_train, x_valid, y_valid`, each passed through `tensor`.
    """
    archive = datasets.download_data(MNIST_URL, ext='.gz')
    with gzip.open(archive, 'rb') as fh:
        train_split, valid_split, _ = pickle.load(fh, encoding='latin-1')
    (x_train, y_train), (x_valid, y_valid) = train_split, valid_split
    return map(tensor, (x_train, y_train, x_valid, y_valid))
def normalize(x,m,s):
    "Centre `x` on `m` and scale it by `s`, i.e. compute `(x-m)/s`."
    centered = x - m
    return centered / s

In [10]:
# get_data() returns a map over four tensors; unpacking consumes it in train-x, train-y, valid-x, valid-y order.
x_train,y_train,x_valid,y_valid = get_data()

In [12]:
# Statistics of the raw (not yet normalized) training inputs, taken over every element.
train_mean,train_std = x_train.mean(),x_train.std()
train_mean,train_std

(tensor(0.1304), tensor(0.3073))

In [13]:
# NOTE(review): x_train/x_valid are rebound to their normalized versions, so this cell is not
# idempotent; reload the data before re-running it.
x_train = normalize(x_train, train_mean, train_std)
# NB: Use training, not validation mean for validation set
x_valid = normalize(x_valid, train_mean, train_std)

In [14]:
# Re-measure after normalization: the mean should now be ~0 and the std ~1.
# NOTE(review): this rebinds train_mean/train_std, so the raw statistics computed earlier are lost.
train_mean,train_std = x_train.mean(),x_train.std()
train_mean,train_std

(tensor(0.0001), tensor(1.))

In [15]:
#export
def test_near_zero(a,tol=1e-3): assert a.abs()<tol, f"Near zero: {a}"

In [16]:
# The normalized training mean must lie within the default tolerance of zero.
test_near_zero(train_mean)

In [17]:
# The normalized training std must lie within the default tolerance of one.
test_near_zero(1-x_train.std())

In [18]:
??nn.Linear

We will use kaiming_uniform to initialize the weights. Since MNIST is a small example, we simply go with the defaults.

In [21]:
#Global variables initialized
# n: number of training rows, m: values per row (x_train is unpacked as a 2-D tensor here)
n,m = x_train.shape
# c: largest label + 1 -- a 0-dim tensor, not a Python int (presumably the class count,
# assuming labels start at 0 -- TODO confirm)
c = y_train.max()+1
# nh: width of the hidden layer used when the models are built
nh = 50
n,m,c

(50000, 784, tensor(10))

In [20]:
#model class is a callable class so it can be called like a function
class Model(nn.Module):
    """Small fully connected network: Linear -> ReLU -> Linear.

    Args:
        n_in: number of input features.
        nh: number of hidden units.
        n_out: number of outputs.
    """
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # nn.ModuleList (rather than a plain Python list) registers the layers as submodules,
        # so parameters(), state_dict(), .to(...) and optimizers can actually see them.
        self.layers = nn.ModuleList([nn.Linear(n_in,nh), nn.ReLU(), nn.Linear(nh,n_out)])

    def forward(self, x):
        "Pass `x` through every layer in order and return the final activations."
        # Implementing forward (not __call__) keeps nn.Module.__call__ and its hooks intact;
        # instances are still invoked as `model(x)`.
        for l in self.layers: x = l(x)
        return x

In [26]:
#Model class
# m inputs, nh hidden units, 10 outputs (the output size is hard-coded here rather than taken from c).
model = Model(m,nh,10)

In [24]:
# Show only the output shape; displaying the full (n, 10) prediction tensor is not useful.
model(x_train).shape

torch.Size([50000, 10])

In [27]:
#Global variables
# Raw outputs of the freshly built model for the whole training set; reused by the loss cells below.
pred = model(x_train)

##### We have already seen the importance of the softmax function. It is used to calculate the probabilities of each category in multi-class classification.

In [25]:
def log_softmax(x):
    "Naive log-softmax: log of `exp(x)` divided by its sum along the last dimension."
    exps = x.exp()
    totals = exps.sum(-1,keepdim=True)
    return (exps/totals).log()

In [28]:
# Despite the name, this holds log-probabilities (the output of log_softmax), not softmax probabilities.
softmax_pred = log_softmax(pred)

In [29]:
softmax_pred

tensor([[-2.8977, -2.2487, -2.8949,  ..., -2.2749, -1.8786, -2.6079],
        [-2.7164, -2.1471, -2.6864,  ..., -2.2885, -1.9710, -2.7302],
        [-2.5435, -2.4374, -2.8221,  ..., -2.5087, -2.1419, -2.0730],
        ...,
        [-2.3768, -2.5188, -2.7223,  ..., -2.3439, -2.0737, -2.3840],
        [-2.6748, -2.6027, -2.4636,  ..., -2.2222, -2.0227, -2.3758],
        [-2.5432, -2.6758, -2.2384,  ..., -2.2847, -2.0998, -2.5639]],
       grad_fn=<LogBackward>)

Now for cross entropy

In [30]:
def nll(pred,label):
    "Negative mean of the entries of `pred` selected, row by row, by the indices in `label`."
    rows = range(label.shape[0])
    picked = pred[rows, label]
    return -picked.mean()

In [32]:
# Cross-entropy of the initial predictions: nll applied to the log-softmax outputs.
nll(softmax_pred,y_train)

tensor(2.3521, grad_fn=<NegBackward>)

In [39]:
??torch.max

In [40]:
#this is calculated based on the log sum exp trick, a is the max value
def logsumexp(x):
    """Numerically stable `log(sum(exp(x)))` along the last dimension.

    The maximum along the last dimension is subtracted before exponentiating, so the largest
    exponent is 0, and it is added back afterwards. Works for inputs of any rank >= 1 (the
    reduced dimension is dropped from the result).

    Args:
        x: tensor to reduce over its last dimension.

    Returns:
        Tensor shaped like `x` without its last dimension.
    """
    # keepdim=True lets the maximum broadcast against x whatever the number of leading dimensions.
    a = x.max(-1, keepdim=True)[0]
    return a.squeeze(-1) + (x - a).exp().sum(-1).log()

In [41]:
#our log softmax
def log_softmax(x): return x - x.logsumexp(-1,keepdim=True)

In [42]:
# Same log-probabilities as before, now computed with the logsumexp-based log_softmax.
softmax_pred_exp = log_softmax(pred)

In [44]:
#test_near(nll(log_softmax(pred), y_train), loss)

In PyTorch, F.log_softmax and F.nll_loss are combined in one optimized function, F.cross_entropy.

In [45]:
# PyTorch's fused cross-entropy and the hand-written log_softmax + nll must agree to within the default tolerance.
test_near_zero(F.cross_entropy(pred, y_train)-nll(log_softmax(pred), y_train))

So the implementation is pretty accurate

In [46]:
#export
def accuracy(out, yb):
    "Fraction of rows of `out` whose arg-max along dim 1 equals the matching entry of `yb`."
    predicted = torch.argmax(out, dim=1)
    hits = (predicted == yb).float()
    return hits.mean()

We will use nn.Sequential to add layers to a Pytorch module.

In [47]:
#new Model
# nn.Sequential chains the same three layers as the hand-written Model and rebinds `model`.
model = nn.Sequential(nn.Linear(m,nh), nn.ReLU(), nn.Linear(nh,10))

In [48]:
#export
from torch import optim

In [55]:
# Hyper-parameters such as lr can be read from the optimizer's param_groups, which is useful
# for modifying them during training. A throwaway optimizer is built here just to read lr back.
optim.SGD(model.parameters(), lr=0.5).param_groups[0]['lr']

0.5

In [60]:
#returns model and optimizer
def get_model():
    "Build a fresh Linear(m,nh) -> ReLU -> Linear(nh,10) network and an SGD optimizer (lr=0.5) over its parameters."
    layers = [nn.Linear(m,nh), nn.ReLU(), nn.Linear(nh,10)]
    net = nn.Sequential(*layers)
    opt = optim.SGD(net.parameters(), lr=0.5)
    return net, opt

Since it is clunky to iterate through mini-batches and specify the batch size every time, we will define our own dataset class and learn more about PyTorch's Dataset, DataLoader and RandomSampler.

In [58]:
#export
class Dataset():
    "Pair inputs `x` with targets `y` so that both can be indexed together."
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        # The length is defined by the inputs alone.
        return len(self.x)

    def __getitem__(self, i):
        # The same index (or slice) is applied to inputs and targets.
        return self.x[i], self.y[i]

DataLoaders are added to make the training loop much cleaner

In [57]:
class DataLoader():
    "Iterate over `ds` in consecutive slices of `bs` items."
    def __init__(self, ds, bs):
        self.ds = ds
        self.bs = bs

    def __iter__(self):
        starts = range(0, len(self.ds), self.bs)
        for lo in starts:
            # The final slice is shorter when bs does not divide len(ds).
            yield self.ds[lo:lo+self.bs]

In [97]:
# Only the model is needed here; the optimizer returned alongside it is discarded.
model, _ = get_model()

In [87]:
# named_children() yields (name, module) pairs; peek at the first one.
next(model.named_children())

('0', Linear(in_features=784, out_features=50, bias=True))

In [96]:
#How to access the weights of every layer. Obviously there has to be an optimized way
#next(model[0].parameters())[0].data

In [100]:
#Change weights using apply method of Pytorch
def init_weights(m):
    "Print module `m`; if it is exactly an `nn.Linear`, fill its weight with 1.0 and print the weight."
    print(m)
    # Every other module type is only printed.
    if type(m) != nn.Linear: return
    m.weight.data.fill_(1.0)
    print(m.weight)
    
        
        
#model.apply(init_weights)

#### Random sampling
We want our training set to be in a random order, and that order should differ each iteration. But the validation set shouldn't be randomized.

In [101]:
class Sampler():
    "Yield batches of `bs` indices covering `len(ds)` items, in a fresh random order per pass when `shuffle` is set."
    def __init__(self, ds, bs, shuffle=False):
        self.n = len(ds)
        self.bs = bs
        self.shuffle = shuffle

    def __iter__(self):
        # The index order is (re)built every time iteration starts.
        if self.shuffle: self.idxs = torch.randperm(self.n)
        else:            self.idxs = torch.arange(self.n)
        for start in range(0, self.n, self.bs):
            yield self.idxs[start:start+self.bs]

In [102]:
# Wrap the training and validation tensors; the asserts confirm each Dataset reports the length of its inputs.
train_ds,valid_ds = Dataset(x_train, y_train),Dataset(x_valid, y_valid)
assert len(train_ds)==len(x_train)
assert len(valid_ds)==len(x_valid)

In [103]:
# train_ds[:10] returns an (x, y) tuple of slices, which is unpacked into a new 10-item Dataset.
small_ds = Dataset(*train_ds[:10])

In [104]:
small_ds

<__main__.Dataset at 0x1a23e8a198>

In [105]:
# Without shuffling, indices come out in order in batches of 3; the last batch holds the remainder.
s = Sampler(small_ds,3,False)
[o for o in s]

[tensor([0, 1, 2]), tensor([3, 4, 5]), tensor([6, 7, 8]), tensor([9])]

In [106]:
# With shuffle=True a new random permutation of the indices is drawn each time the sampler is iterated.
s = Sampler(small_ds,3,True)
[o for o in s]

[tensor([5, 6, 1]), tensor([3, 4, 7]), tensor([9, 2, 0]), tensor([8])]

In [107]:
#DataLoader with a custom random sampler
def collate(b):
    "Turn a batch of `(x, y)` pairs into a pair of stacked tensors `(xs, ys)`."
    inputs, targets = zip(*b)
    stacked_x = torch.stack(inputs)
    stacked_y = torch.stack(targets)
    return stacked_x, stacked_y

class DataLoader():
    "Iterate over `ds` using the index batches produced by `sampler`, merging each batch with `collate_fn`."
    def __init__(self, ds, sampler, collate_fn=collate):
        self.ds = ds
        self.sampler = sampler
        self.collate_fn = collate_fn

    def __iter__(self):
        for idxs in self.sampler:
            # Look the items up one by one, then let collate_fn assemble the batch.
            items = [self.ds[i] for i in idxs]
            yield self.collate_fn(items)

But we can also use PyTorch's built-in data loading and sampling classes. They work the same way as the DataLoader and Sampler we defined above, but they also have a `num_workers` option that allows parallel processing, which will be necessary in the future, hence we use them instead.

In [108]:
#export
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler

In [110]:
#Global variable
# Mini-batch size passed to both the training and the validation loader.
bs=64

In [111]:
# DataLoader is now the torch.utils.data class (imported above), whose second positional argument
# is the batch size. Training data uses a RandomSampler, validation data a SequentialSampler.
train_dl = DataLoader(train_ds, bs, sampler=RandomSampler(train_ds), collate_fn=collate)
valid_dl = DataLoader(valid_ds, bs, sampler=SequentialSampler(valid_ds), collate_fn=collate)