# Becoming backprop ninja

This lecture seems to be more of a workbook-style lecture. I am going to do the exercises myself first and then see how Andrej implements them. The first few cells are the same as before: the standard imports, followed by preparing the data.

In [1]:
import torch
import matplotlib.pyplot as plt
import torch.nn.functional as F
%matplotlib inline

In [2]:
# Load the dataset: one name per line, with the trailing newline removed.
with open('names.txt') as f:
    names = [line.strip('\n') for line in f]

In [3]:
# alternate implementation, used by Andrej
# (wrapped in a context manager so the file handle is closed deterministically
# instead of being left open until the garbage collector gets to it)
with open('names.txt', 'r') as f:
    names = f.read().splitlines()

In [4]:
# prepare the vocab
# '.' is listed first so that it is assigned index 0; the remaining characters
# are every distinct character found in the names, in sorted order.
tokens = ['.'] + sorted(set(''.join(names)))
stoi = {token: i for i, token in enumerate(tokens)}  # character -> integer index
itos = {i: token for i, token in enumerate(tokens)}  # integer index -> character

In [5]:
def buildDataset(names, context_length=3):
    """Turn a list of names into (context, next-character) training pairs.

    Every name is terminated with '.', and a window of the previous
    ``context_length`` character indices slides over it. The window starts
    out filled with index 0 (the index of '.' in ``stoi``), so the first
    characters of a name are predicted from padding.

    Args:
        names: iterable of strings whose characters are all keys of the
            module-level ``stoi`` mapping.
        context_length: number of preceding characters in each context
            window. Defaults to 3, the value that used to be hard-coded.

    Returns:
        A tuple ``(X, Y)`` of int64 tensors. ``X`` has shape
        ``(num_examples, context_length)`` and holds the context windows;
        ``Y`` has shape ``(num_examples,)`` and holds the index of the
        character that follows each window.

    Raises:
        ValueError: if ``context_length`` is smaller than 1.
        KeyError: if a name contains a character missing from ``stoi``.
    """
    if context_length < 1:
        raise ValueError(f'context_length must be >= 1, got {context_length}')

    contexts, targets = [], []
    for name in names:
        window = [0] * context_length
        for ch in name + '.':
            idx = stoi[ch]
            contexts.append(window)
            targets.append(idx)
            # slide the window: drop the oldest index, append the newest
            # (this builds a new list, so the appended window is never mutated)
            window = window[1:] + [idx]

    # Explicit dtype and shape keep the result well-formed even when `names`
    # is empty (torch.tensor([]) alone would be a float tensor of shape (0,)).
    X = torch.tensor(contexts, dtype=torch.long).view(len(contexts), context_length)
    Y = torch.tensor(targets, dtype=torch.long)
    return (X, Y)

In [6]:
import random

# Shuffle in place so the three splits come from a random permutation.
# NOTE(review): no seed is set in the visible cells, so the split presumably
# differs from run to run.
random.shuffle(names)

samples = len(names)

# Cumulative split fractions: the first 80% of the shuffled names are used for
# training, the next 10% for validation, and the remainder for testing.
train = 0.8
val = 0.9
test = 1.0

train_end = int(train * samples)
val_end = int(val * samples)

Xtr, Ytr = buildDataset(names[:train_end])
Xval, Yval = buildDataset(names[train_end:val_end])
Xtest, Ytest = buildDataset(names[val_end:])

In [7]:
class Linear:
    """Fully connected layer computing ``x @ weights`` plus an optional bias.

    Weights are drawn from a standard normal and divided by ``sqrt(fan_in)``;
    the bias, when enabled, is drawn from a standard normal and scaled by 0.1.
    The most recent output is cached on ``self.out``.
    """

    def __init__(self, fan_in, fan_out, bias=True):
        self.weights = torch.randn((fan_in, fan_out)) / (fan_in ** 0.5)
        self.bias = None
        if bias:
            self.bias = torch.randn((fan_out,)) * 0.1

    def __call__(self, x):
        """Apply the layer to ``x`` and return (and cache) the result."""
        out = x @ self.weights
        if self.bias is not None:
            out += self.bias
        self.out = out
        return out

    def parameters(self):
        """Return the layer's tensors: the weights, then the bias if present."""
        params = [self.weights]
        if self.bias is not None:
            params.append(self.bias)
        return params
    
class Tanh:
    """Elementwise tanh activation; the latest output is cached on ``self.out``."""

    def __call__(self, x):
        activated = torch.tanh(x)
        self.out = activated
        return activated

In [10]:
# ----- hyperparameters -----
max_iterations = 20000
vocab_size = len(stoi)
context_length = 3
embedding_dim = 10
hidden_dim = 200
minibatch_size = 32
lr = 0.1

# ----- model: embedding table followed by Linear -> Tanh -> Linear -----
C = torch.randn((vocab_size, embedding_dim))
layers = [
    Linear(embedding_dim * context_length, hidden_dim),
    Tanh(),
    Linear(hidden_dim, vocab_size),
]

# Gather every trainable tensor; only the Linear layers expose parameters().
parameters = [C]
for layer in layers:
    if isinstance(layer, Linear):
        parameters.extend(layer.parameters())
for p in parameters:
    p.requires_grad_()

# total number of trainable scalars
print(sum(p.nelement() for p in parameters))

11897


In [17]:
# Training loop for the layer-based model: minibatch SGD with a fixed learning rate.
# The range runs to max_iterations inclusive so the last step is also reported.
for step in range(max_iterations + 1):
    # minibatch: random row indices into the training set (sampled with replacement)
    minibatch = torch.randint(Xtr.shape[0], (minibatch_size,))

    # forward pass
    # look up the embedding of every context character, then flatten each
    # example's embeddings into a single row before feeding the layers
    emb = C[Xtr[minibatch]]
    embcat = emb.view(-1, embedding_dim * context_length)
    x = embcat
    for layer in layers:
        x = layer(x)
    # x holds the raw logits; F.cross_entropy applies the softmax itself
    loss = F.cross_entropy(x, Ytr[minibatch])
    
    # calculate gradient
    # reset the gradients first so they do not accumulate across steps
    for p in parameters:
        p.grad = None
    loss.backward()
    
    # update values
    # the step is applied to .data so the update itself is not tracked by autograd
    for p in parameters:
        p.data -= lr * p.grad
    
    # print progress
    # this is the loss of the current minibatch only, so it is noisy
    if step % 2000 == 0:
        print(f'{step}/{max_iterations} loss = {loss.item()}')

0/20000 loss = 2.6045684814453125
2000/20000 loss = 2.095168352127075
4000/20000 loss = 1.7923805713653564
6000/20000 loss = 2.142599582672119
8000/20000 loss = 1.9210169315338135
10000/20000 loss = 2.3190653324127197
12000/20000 loss = 1.9645098447799683
14000/20000 loss = 2.592478036880493
16000/20000 loss = 2.202781915664673
18000/20000 loss = 2.5616817474365234
20000/20000 loss = 2.426074266433716


So far we have just been recreating what we did in the previous lectures. Now we "chunkate" the code: we write the forward pass as a sequence of small, explicit steps instead of relying on PyTorch's built-in functions, so that we can implement backprop on our own.

In [51]:
# ----- hyperparameters -----
max_iterations = 20000
vocab_size = len(stoi)
context_length = 3
embedding_dim = 10
hidden_dim = 200
minibatch_size = 32
lr = 0.1
epsilon = 1e-5  # batch-norm stabiliser

# ----- parameters, held as raw tensors rather than layer objects -----
C = torch.randn((vocab_size, embedding_dim))
# Linear layer 1
W1 = 0.1 * torch.randn((context_length * embedding_dim, hidden_dim))
b1 = 0.01 * torch.randn(hidden_dim)
# Batch norm tensors: scale starts at one and shift at zero
gamma = torch.ones(hidden_dim)
beta = torch.zeros(hidden_dim)
# Linear layer 2
W2 = 0.1 * torch.randn((hidden_dim, vocab_size))
b2 = 0.01 * torch.randn(vocab_size)

parameters = [C, W1, b1, gamma, beta, W2, b2]
for p in parameters:
    p.requires_grad = True

# total number of trainable scalars
print(sum(p.nelement() for p in parameters))

12297


In [52]:
# Training loop for the hand-written model: the forward pass is spelled out in
# small steps, while the gradients are still computed by loss.backward().
# The range runs to max_iterations inclusive so the last step is also reported.
for step in range(max_iterations + 1):
    # minibatch: random row indices into the training set (sampled with replacement)
    minibatch = torch.randint(Xtr.shape[0], (minibatch_size,))

    # ----- forward pass -----
    # embedding and concatenating
    emb = C[Xtr[minibatch]]
    embcat = emb.view(-1, embedding_dim * context_length)
    # Linear layer 1
    h1preact = embcat @ W1 + b1
    # Batch Norm: standardise every hidden unit over the minibatch, then
    # rescale with gamma and shift with beta
    h1mean = h1preact.mean(0, keepdims=True)
    h1var = h1preact.var(0, keepdims=True)  # torch default: unbiased estimate
    h1shifted = h1preact - h1mean
    h1scalefactor = torch.sqrt(h1var + epsilon)  # epsilon guards against a zero variance
    xhat = h1shifted / h1scalefactor
    yhat = gamma * xhat + beta
    # Activation
    h1 = torch.tanh(yhat)
    # Linear layer 2
    h2 = h1 @ W2 + b2
    # Calculate the loss function (cross-entropy written out by hand).
    # Subtract each row's largest logit before exponentiating so exp() cannot
    # overflow to inf and turn the loss into NaN. Softmax is unchanged by a
    # per-row shift, so the probabilities and gradients stay the same; that is
    # also why the shift can be detached from the graph.
    h2max = h2.max(1, keepdim=True).values.detach()
    counts = (h2 - h2max).exp()
    probs = counts / counts.sum(1, keepdims=True)
    # NOTE: despite the names, logprobs holds *negative* log-probabilities and
    # predprob holds the negative log-probability of each correct character
    logprobs = -probs.log()
    predprob = logprobs[torch.arange(minibatch_size), Ytr[minibatch]]
    loss = predprob.mean()

    # calculate gradient
    # reset the gradients first so they do not accumulate across steps
    for p in parameters:
        p.grad = None
    loss.backward()

    # update values
    # the step is applied to .data so the update itself is not tracked by autograd
    for p in parameters:
        p.data -= lr * p.grad

    # print progress
    # this is the loss of the current minibatch only, so it is noisy
    if step % 2000 == 0:
        print(f'{step}/{max_iterations} loss = {loss.item()}')

0/20000 loss = 3.775161027908325
2000/20000 loss = 2.462073802947998
4000/20000 loss = 2.5500526428222656
6000/20000 loss = 2.0544486045837402
8000/20000 loss = 2.1829376220703125
10000/20000 loss = 2.486475706100464
12000/20000 loss = 2.3895978927612305
14000/20000 loss = 2.130673408508301
16000/20000 loss = 2.0279457569122314
18000/20000 loss = 2.4456534385681152
20000/20000 loss = 2.4373044967651367


We now have something that works as intended. This is good. Now we will just import the code that Andrej wrote so that we are in unison with the lecture.