# Mastering BackProp

Read this: https://karpathy.medium.com/yes-you-should-understand-backprop-e2f06eab496b

Usual data loading

In [3]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import random
%matplotlib inline

In [2]:
words = open('names.txt', 'r').read().splitlines()

In [4]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s, i in stoi.items()}
vocab_size = len(itos)

block_size = 3

def build_dataset(words):
    X, Y = [], []
    for w in words:
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    return X, Y

random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])


# Getting into Backprop

Utility function to compare manual gradients to Torch gradients

In [None]:
# def cmp(s, dt, t):
#     ex = torch.all(dt == t.grad).item()
#     app = torch.allclose(dt, t.grad)
#     maxdiff = (dt - t.grad).abs().max().item()
#     print(f'{s:15s} | exact: {str(ex):5s} | approximate: {str(app):5s} | maxdiff: {maxdiff}')
# utility function we will use later when comparing manual gradients to PyTorch gradients
def cmp(s, dt, t):
  ex = torch.all(dt == t.grad).item()
  app = torch.allclose(dt, t.grad)
  maxdiff = (dt - t.grad).abs().max().item()
  print(f'{s:15s} | exact: {str(ex):5s} | approximate: {str(app):5s} | maxdiff: {maxdiff}')

Re-using our network design. The initialisations are changed to small random numbers. Just using random numbers might result in zeros being init, which will be some sort of mask. So multiply the init vars by a small number to prevent this

In [8]:
n_embd = 10 # dimensionality of the char embedding vectors
n_hidden = 200 # number of neurons in the hidden layer of the MLP

g = torch.Generator().manual_seed(2147483647)
C = torch.randn((vocab_size, n_embd),           generator=g)
W1 = torch.randn((n_embd*block_size, n_hidden), generator=g) * (5/3) / ((n_embd*block_size)**0.5)  # Kaiming init
b1 = torch.randn(n_hidden,                      generator=g) * 0.1 # this is generally not needed, since have batchnorm but it will be used for sanity check
W2 = torch.randn((n_hidden, vocab_size),        generator=g) * 0.1
b2 = torch.randn(vocab_size,                    generator=g) * 0.1

# Batch Norm params
bngain = torch.ones((1, n_hidden)) * 0.1 + 1.0
bnbias = torch.zeros((1, n_hidden)) * 0.1
bnmean_running = torch.zeros((1, n_hidden)) # init mean should be roughly 0
bnstd_running = torch.ones((1, n_hidden))  # and std should be roughly 1

parameters = [C, W1, b1, W2, b2, bngain, bnbias]
for p in parameters:
    p.requires_grad = True

In [10]:
batch_size = 32
n = batch_size

ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
Xb, Yb = Xtr[ix], Ytr[ix]

We bring back an explicit implementation of the loss function. We also break down into smaller chunks with intermediate tensors so we can do backward more appropriately

In [25]:
# forward pass
emb = C[Xb]
embcat = emb.view(emb.shape[0], -1)

# Linear layer
hprebn = embcat @ W1 + b1

# Batchnorm layer
bnmeani = 1 / n * hprebn.sum(0, keepdim=True)
bndiff = hprebn - bnmeani
bndiff2 = bndiff**2
bnvar = 1 / (n-1) * (bndiff2).sum(0, keepdim=True)
bnvar_inv = (bnvar + 1e-5) ** -0.5
bnraw = bndiff * bnvar_inv
hpreact = bngain * bnraw + bnbias

# Non linearity
h = torch.tanh(hpreact)

# Linear layer 2
logits = h @ W2 + b2

# Cross entropy loss 
logit_maxes = logits.max(1, keepdim=True).values
norm_logits = logits - logit_maxes
counts = norm_logits.exp()
counts_sum = counts.sum(1, keepdim=True)
counts_sum_inv = counts_sum**-1
probs = counts * counts_sum_inv
logprobs = probs.log()
loss = -logprobs[range(n), Yb].mean() # iterate down the rows and get the index specified by the element in the tensor Yb, get the mean and negate

In the backward pass, we want to retain the grads

In [16]:
for p in parameters:
    p.grad = None
for t in [logprobs, probs, counts, counts_sum, counts_sum_inv, norm_logits, logit_maxes,
          logits, h, hpreact, bnraw, bnvar_inv, bnvar, bndiff, bndiff2, hprebn, bnmeani, 
          embcat, emb]:
    t.retain_grad()
loss.backward()
loss

tensor(3.8279, grad_fn=<NegBackward0>)

Manually backprop through all the variables

Let's say our loss is loss = -1/3a + -1/3b + -1/3c, the derivatives dloss/da = -1/3, dloss/db=-1/3, dloss/dc=-1/3. Now, if our loss has n elements, we see that the derivative essentially becomes dloss/d(n) = -1/n

i.e. for every element i in loss, dloss/di = -1/n

But logprobs is an array of shape [32, 27], but only 32 of them participate in loss calculation. The gradient of all these others is 0 because they do not participate in the loss. 

In [23]:
dlogprobs =   torch.zeros_like(logprobs) # holds the derivative of the loss wrt all elements of logprobs

# we need to set the derivatives of each element in exactly its same location
dlogprobs[range(n), Yb] = -1.0/n
# print(dlogprobs)

# we check it using our function cmp
cmp('logprobs', dlogprobs, logprobs)

  ex = torch.all(dt == t.grad).item()


TypeError: all() received an invalid combination of arguments - got (bool), but expected one of:
 * (Tensor input, *, Tensor out = None)
 * (Tensor input, tuple of ints dim = None, bool keepdim = False, *, Tensor out = None)
 * (Tensor input, int dim, bool keepdim = False, *, Tensor out = None)
 * (Tensor input, name dim, bool keepdim = False, *, Tensor out = None)
