<a href="https://colab.research.google.com/github/alexrofail/Loss-Optimizers-Training-Loops/blob/main/training_mini_batches.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch.nn.functional as F
import gzip
import pickle
from torch import tensor
from torch import nn

In [None]:
def get_data(path='mnist.pkl.gz'):
    """Load the pickled MNIST splits and convert them to tensors.

    Args:
        path: Location of the gzip-compressed pickle. Defaults to
            ``'mnist.pkl.gz'`` in the current working directory, which is
            the file this function always read before the parameter existed.

    Returns:
        A lazy ``map`` that yields four tensors in the order
        ``x_train, y_train, x_valid, y_valid``. It can be consumed once,
        typically by unpacking.
    """
    # SECURITY: unpickling can execute arbitrary code, so only point this at
    # an archive from a trusted source.
    with gzip.open(path, 'rb') as f:
        # The third element of the archive is discarded.
        ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
    return map(tensor, (x_train, y_train, x_valid, y_valid))


In [None]:
x_train,y_train,x_valid,y_valid = get_data()

In [None]:
import pickle, gzip, math, torch, matplotlib as mpl

In [None]:
mpl.rcParams['image.cmap'] = 'gray'

In [None]:
# n = number of rows, m = number of columns of x_train (the unpacking requires x_train to be 2-D)
n, m = x_train.shape
# largest label + 1, i.e. the class count if labels start at 0 (TODO confirm); this is a tensor, not an int
c = y_train.max()+1
# hidden-layer width used when building the models
nh = 50

In [None]:
class Model(nn.Module):
  """Fully connected network: Linear -> ReLU -> Linear.

  Args:
    nin: number of input features.
    nh: number of hidden units.
    nout: number of outputs.
  """

  def __init__(self, nin, nh, nout):
    super().__init__()
    # nn.ModuleList (rather than a plain Python list) registers the layers as
    # submodules, so parameters(), state_dict(), .to() etc. can see them.
    # It is still iterable, so code that loops over `model.layers` keeps working.
    self.layers = nn.ModuleList([nn.Linear(nin, nh), nn.ReLU(), nn.Linear(nh, nout)])

  def forward(self, x):
    """Feed ``x`` through every layer in order and return the final activations."""
    # Implementing forward (instead of overriding __call__) keeps
    # nn.Module.__call__ intact, so registered hooks still run.
    for layer in self.layers:
      x = layer(x)
    return x

In [None]:
model = Model(m, nh, 10)

In [None]:
pred = model(x_train)

In [None]:
#Now we need a loss function

In [None]:
#For loss funcs we need to first compute the softmax of our activations
def log_softmax(x):
  """Return log(softmax(x)) over the last dimension, computed straight from the definition."""
  exps = x.exp()
  softmax = exps / exps.sum(-1, keepdim=True)
  # PyTorch's NLL loss works on log-probabilities, hence the final log
  return softmax.log()

In [None]:
sm_pred = log_softmax(pred)

In [None]:
#Cross Entropy Loss
#Negative sum of actual * log(predicted prob)
#But in this case our actuals are one-hot encoded, so it can be done as -log(p_sub_i), where sub_i is the index of the desired target

In [None]:
#First three elements of dependent var
y_train[:3]

tensor([5, 0, 4])

In [None]:
sm_pred[[0,1,2], [5,0,4]]

tensor([-2.2869, -2.4173, -2.4049], grad_fn=<IndexBackward>)

In [None]:
y_train.shape

torch.Size([50000])

In [None]:
def nll(input, target):
  """Negative log-likelihood: minus the mean of ``input[i, target[i]]`` over all rows ``i``.

  ``input`` holds log-probabilities (one row per sample) and ``target`` the
  integer class index for each row.
  """
  rows = range(target.shape[0])
  # pick, for every row, the log-probability assigned to its target class
  picked = input[rows, target]
  return -picked.mean()

In [None]:
loss = nll(sm_pred, y_train)
loss

tensor(2.3136, grad_fn=<NegBackward>)

In [None]:
#We can refactor log_softmax(x)
def log_softmax(x):
  """log(softmax(x)) over the last dimension, using log(a / b) = log(a) - log(b)."""
  # log of the softmax denominator, kept broadcastable against x
  log_normaliser = x.exp().sum(-1, keepdim=True).log()
  return x - log_normaliser

In [None]:
#TODO: import the test near function to test these refactorings

In [None]:
def logsumexp(x):
  """Numerically stable log(sum(exp(x))) over the last dimension.

  Uses the identity ``log(sum(exp(x))) = a + log(sum(exp(x - a)))`` with ``a``
  set to the maximum along the last dimension, so the largest exponent is 0
  and ``exp`` cannot overflow.

  Args:
    x: tensor with at least one dimension.

  Returns:
    Tensor shaped like ``x`` without its last dimension.
  """
  # x.max(-1) returns (values, indices); only the values are needed.
  # The result is bound to a local name: the earlier code stored it in `max`
  # and then read `m`, which picked up an unrelated module-level variable.
  row_max = x.max(-1)[0]
  # [..., None] restores the reduced dimension so the subtraction broadcasts
  return row_max + (x - row_max[..., None]).exp().sum(-1).log()

In [None]:
#Now refactor log_softmax to use logsumexp
def log_softmax(x):
  """log(softmax(x)) over the last dimension, delegating the normaliser to ``Tensor.logsumexp``."""
  lse = x.logsumexp(-1, keepdim=True)
  return x - lse


In [None]:
#In pytorch log_softmax and nll_loss are combined in F.cross_entropy

In [None]:
loss_func = F.cross_entropy

In [None]:
#Define a metric: accuracy 
def accuracy(out, yb):
  """Fraction of rows of ``out`` whose highest-scoring column index equals the label in ``yb``."""
  predicted = out.argmax(dim=1)
  hits = (predicted == yb).float()
  return hits.mean()

In [None]:
# mini-batch size
bs = 64

# first mini-batch of inputs
xb = x_train[0:bs]

# forward pass on that batch; the cell displays the first row of outputs and the output shape
preds = model(xb)
preds[0], preds.shape

(tensor([-0.0779,  0.0680,  0.0250, -0.0056, -0.0180,  0.0321,  0.1891, -0.0610,
         -0.1683,  0.1322], grad_fn=<SelectBackward>), torch.Size([64, 10]))

In [None]:
yb = y_train[0:bs]
loss_func(preds,yb)

tensor(2.3131, grad_fn=<NllLossBackward>)

In [None]:
accuracy(preds, yb)

tensor(0.1094)

In [None]:
# learning rate: multiplies the gradient in each parameter update
lr = 0.5

# number of full passes over the training data
epochs =1

In [None]:
#Training loop
#Part 1, Lesson 2 reference
# Plain mini-batch SGD: one parameter update per batch of `bs` rows.
for epoch in range(epochs):
  # (n-1)//bs + 1 equals ceil(n/bs) for n >= 1, so a final partial batch is still visited
  for i in range((n-1)// bs+1):
    start_i = i*bs
    end_i = start_i +bs
    # slicing past the end simply yields a shorter last batch
    xb = x_train[start_i:end_i]
    yb = y_train[start_i:end_i]
    loss = loss_func(model(xb), yb)

    loss.backward()
    # the updates themselves must not be tracked by autograd
    with torch.no_grad():
      for l in model.layers:
        # only layers that expose a `weight` attribute are updated
        if hasattr(l, 'weight'):
          l.weight -= l.weight.grad * lr
          l.bias -= l.bias.grad * lr
          # zero the gradients so the next backward() starts from scratch
          l.weight.grad.zero_()
          l.bias.grad.zero_()

In [None]:
loss_func(model(xb),yb), accuracy(model(xb), yb)

(tensor(0.0664, grad_fn=<NllLossBackward>), tensor(1.))

In [None]:
class Model(nn.Module):
  """Fully connected network with one hidden layer: Linear -> ReLU -> Linear.

  The layers are stored as attributes, so ``nn.Module`` registers them and
  they show up in ``named_children()`` / ``parameters()``.

  Args:
    nin: number of input features.
    nh: number of hidden units.
    nout: number of outputs.
  """

  def __init__(self, nin, nh, nout):
    super().__init__()
    self.l1 = nn.Linear(nin, nh)
    self.l2 = nn.Linear(nh, nout)

  def forward(self, x):
    """Return ``l2(relu(l1(x)))``."""
    # Implemented as forward (not __call__) so nn.Module.__call__ stays in
    # charge of invocation and registered hooks still run.
    return self.l2(F.relu(self.l1(x)))

In [None]:
model = Model(m, nh, 10)

In [None]:
for name,l in model.named_children(): print(f"{name}: {l}")

l1: Linear(in_features=784, out_features=50, bias=True)
l2: Linear(in_features=50, out_features=10, bias=True)


In [None]:
model.l1

Linear(in_features=784, out_features=50, bias=True)

In [None]:
#Refactored training loop: instead of looping through each layer in the update step, we can iterate over model.parameters() directly
def fit():
  """Train the global ``model`` with hand-written mini-batch SGD.

  Reads the notebook globals ``epochs``, ``n``, ``bs``, ``x_train``,
  ``y_train``, ``loss_func``, ``model`` and ``lr``; returns nothing.
  """
  for _ in range(epochs):
    for batch_idx in range((n - 1) // bs + 1):
      lo = batch_idx * bs
      hi = lo + bs
      inputs, targets = x_train[lo:hi], y_train[lo:hi]
      batch_loss = loss_func(model(inputs), targets)
      batch_loss.backward()

      # update every parameter in place without recording it in autograd,
      # then clear the gradients for the next batch
      with torch.no_grad():
        for param in model.parameters():
          param -= param.grad * lr
        model.zero_grad()

In [None]:
fit()
loss_func(model(xb), yb), accuracy(model(xb), yb)

(tensor(0.0427, grad_fn=<NllLossBackward>), tensor(1.))

In [None]:
#This is what PyTorch's nn.Sequential does
class SequentialModel(nn.Module):
  """Applies a list of layers one after another.

  Args:
    layers: iterable of modules; wrapped in ``nn.ModuleList`` so each one is
      registered as a submodule.
  """

  def __init__(self, layers):
    super().__init__()
    self.layers = nn.ModuleList(layers)

  def forward(self, x):
    """Feed ``x`` through every layer in order and return the result."""
    # Implemented as forward (not __call__) so nn.Module.__call__ stays in
    # charge of invocation and registered hooks still run.
    for layer in self.layers:
      x = layer(x)
    return x

In [None]:
layers = [nn.Linear(m,nh), nn.ReLU(), nn.Linear(nh, 10)]

In [None]:
model = SequentialModel(layers)

In [None]:
model

SequentialModel(
  (layers): ModuleList(
    (0): Linear(in_features=784, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=10, bias=True)
  )
)

In [51]:
#Optimizers
#to replace the `with torch.no_grad()` update code in fit()

#In Pytorch this is the optim.SGD stuff
class Optimizer():
  """Minimal SGD optimizer, a hand-written stand-in for ``torch.optim.SGD``.

  Args:
    params: iterable of tensors to optimise. It is materialised into a list
      so that a generator such as ``model.parameters()`` can be walked on
      every step.
    lr: learning rate that multiplies each gradient in ``step``.
  """

  def __init__(self, params, lr = 0.5):
    self.params, self.lr = list(params), lr

  def step(self):
    """Apply ``p -= p.grad * lr`` in place to every parameter that has a gradient."""
    with torch.no_grad():
      # Use the parameters and learning rate this optimizer was built with,
      # not whatever `model` / `lr` happen to exist at module level.
      for p in self.params:
        if p.grad is not None:
          p -= p.grad * self.lr

  def zero_grad(self):
    """Reset every existing gradient to zero; parameters without one are skipped."""
    with torch.no_grad():
      for p in self.params:
        if p.grad is not None:
          p.grad.zero_()

In [55]:
model = nn.Sequential(nn.Linear(m, nh), nn.ReLU(), nn.Linear(nh,10))
opt = Optimizer(model.parameters())

In [56]:
#Now we can refactor fit() to use our optimizer
def fit():
  """Train the global ``model``, delegating the update to the global ``opt``.

  Reads the notebook globals ``epochs``, ``n``, ``bs``, ``x_train``,
  ``y_train``, ``loss_func``, ``model`` and ``opt``; returns nothing.
  """
  for _ in range(epochs):
    for batch_idx in range((n - 1) // bs + 1):
      rows = slice(batch_idx * bs, batch_idx * bs + bs)
      inputs = x_train[rows]
      targets = y_train[rows]
      batch_loss = loss_func(model(inputs), targets)

      # backward pass, parameter update, then clear gradients for the next batch
      batch_loss.backward()
      opt.step()
      opt.zero_grad()

In [57]:
# NOTE(review): in the visible cells fit() is not called between rebuilding `model` and this line,
# so this appears to report the loss/accuracy of a model that has not been trained yet - confirm.
loss_func(model(xb), yb), accuracy(model(xb), yb)

(tensor(2.3425, grad_fn=<NllLossBackward>), tensor(0.0625))

In [58]:
from torch import optim

In [59]:
optim.SGD.step??

In [60]:
def get_model():
  """Build a fresh Linear -> ReLU -> Linear network and an SGD optimizer for it.

  Layer sizes come from the notebook globals ``m`` and ``nh`` (the output
  width is fixed at 10); the learning rate comes from the global ``lr``.

  Returns:
    ``(model, optimizer)``.
  """
  net = nn.Sequential(
    nn.Linear(m, nh),
    nn.ReLU(),
    nn.Linear(nh, 10),
  )
  sgd = optim.SGD(net.parameters(), lr=lr)
  return net, sgd

In [62]:
model, opt = get_model()
loss_func(model(xb), yb)

tensor(2.2755, grad_fn=<NllLossBackward>)

In [65]:
class Dataset:
  """Pairs inputs ``x`` with targets ``y`` so both can be indexed together."""

  def __init__(self, x, y):
    self.x = x
    self.y = y

  def __len__(self):
    """Number of items, taken from the length of ``x``."""
    return len(self.x)

  def __getitem__(self, i):
    """Return ``(x[i], y[i])``; ``i`` is whatever ``x`` and ``y`` accept as an index."""
    item_x, item_y = self.x[i], self.y[i]
    return item_x, item_y

In [66]:
# wrap each split in a Dataset and check it reports the same length as its inputs
train_ds, valid_ds = Dataset(x_train, y_train), Dataset(x_valid, y_valid)
assert len(train_ds) == len(x_train)
assert len(valid_ds) == len(x_valid)

In [70]:
xb, yb = train_ds[0:5]
assert xb.shape == (5, 28*28)
assert yb.shape == (5,)
xb,yb

(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]), tensor([5, 0, 4, 1, 9]))

In [71]:
xb

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [72]:
yb

tensor([5, 0, 4, 1, 9])

In [67]:
model, opt = get_model()

In [73]:
# Mini-batch training loop driven by `opt`.
# NOTE(review): batches are still sliced from x_train / y_train directly; `train_ds` is not used here.
for epoch in range(epochs):
    # (n-1)//bs + 1 equals ceil(n/bs) for n >= 1, so a final partial batch is still visited
    for i in range((n-1) // bs+1):
      start_i = i*bs
      end_i = start_i +bs
      xb = x_train[start_i:end_i]
      yb = y_train[start_i:end_i]
      loss = loss_func(model(xb), yb)

      # backward pass, optimizer update, then clear gradients for the next batch
      loss.backward()
      opt.step()
      opt.zero_grad()

In [74]:
loss_func(model(xb), yb), accuracy(model(xb), yb)

(tensor(0.2987, grad_fn=<NllLossBackward>), tensor(0.9375))