In [1]:
# no change in first several cells from the previous lecture

In [2]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# read in all the words
words = open('makemore/names.txt', 'r').read().splitlines()
print(len(words))
print(max(len(w) for w in words))
print(words[:8])

32033
15
['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']


In [4]:
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s, i in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27


In [5]:
# build the dataset

block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words):
    X, Y = [], []
    for w in words:
        # print(w)
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix] # crop and append
    
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y

import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [6]:
# utility function we will use later when comparing manual gradients to PyTorch gradients
def cmp(s, dt, t):
    ex = torch.all(dt == t.grad).item()
    app = torch.allclose(dt, t.grad)
    maxdiff = (dt - t.grad).abs().max().item()
    print(f'{s:15s} | exact: {str(ex):5s} | approximate: {str(app):5s} | maxdiff: {maxdiff}')

In [8]:
n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 64 # the number of neurons in the hidden layer of the MLP

g = torch.Generator().manual_seed(2147483647) # for reproducibility
C = torch.randn((vocab_size, n_embd), generator=g)
# Layer 1
W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5/3)/((n_embd * block_size)**0.5)
b1 = torch.randn(n_hidden, generator=g) * 0.1 # using b1 just for fun, it's useless because of batch norm bias
# Layer 2
W2 = torch.randn((n_hidden, vocab_size), generator=g) * 0.1
b2 = torch.randn(vocab_size, generator=g) * 0.1
# BatchNorm parameters
bngain = torch.randn((1, n_hidden)) * 0.1 + 1.0
bnbias = torch.randn((1, n_hidden)) * 0.1

# Note: I am initializing many of these parameters in non-standard ways
# because sometimes initializing with e.g. all zeros could mask an incorrect
# implementation of the backward pass

parameters = [C, W1, b1, W2, b2, bngain, bnbias]
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
    p.requires_grad = True

4137


In [9]:
batch_size = 32
n = batch_size # a shorter variable also, for convenience
# construct a minibatch
ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
Xb, Yb = Xtr[ix], Ytr[ix] # batch X, Y

In [156]:
# forward pass, "chunkated" into smaller steps that are possible to backward one at a time

emb = C[Xb] # embed the characters into vectors
embcat = emb.view(emb.shape[0], -1) # concatenate the vectors
# Linear layer 1
hprebn = embcat @ W1 + b1 # hidden layer pre-activation
# BatchNorm layer
bnmeani = 1 / n * hprebn.sum(0, keepdim=True)
bndiff = hprebn - bnmeani
bndiff2 = bndiff**2
bnvar = 1/(n-1) * (bndiff2).sum(0, keepdim=True) # note: Bessel's correction (dividing by n-1, not n)
bnvar_inv = (bnvar + 1e-5)**-0.5
bnraw = bndiff * bnvar_inv
hpreact = bngain * bnraw + bnbias
# Non-linearity
h = torch.tanh(hpreact) # hidden layer
# Linear layer 2
logits = h @ W2 + b2 # output layer
# cross entropy loss (same as F.cross_entropy(logits, Yb))
logit_maxes = logits.max(1, keepdim=True).values
norm_logits = logits - logit_maxes # subtract max for numerical stability
counts = norm_logits.exp()
counts_sum = counts.sum(1, keepdims=True)
counts_sum_inv = counts_sum**-1 # if I use (1.0 / counts_sum) instead then I can't get backprop to be a bit exact...
probs = counts * counts_sum_inv
logprobs = probs.log()
loss = -logprobs[range(n), Yb].mean()

# PyTorch backward pass
for p in parameters:
    p.grad = None
for t in [logprobs, probs, counts, counts_sum, counts_sum_inv, 
          norm_logits, logit_maxes, logits, h, hpreact, bnraw,
          bnvar_inv, bnvar, bndiff, bndiff2, hprebn, bnmeani,
          embcat, emb]:
    t.retain_grad()
loss.backward()
loss

tensor(3.3491, grad_fn=<NegBackward0>)

In [13]:
logprobs

tensor([[-2.5453, -2.3887, -4.0004, -3.0342, -3.9698, -2.5351, -3.7752, -3.2896,
         -3.9947, -3.5177, -3.2802, -3.2441, -3.3296, -3.5535, -3.2969, -4.3075,
         -4.5727, -3.9810, -4.2267, -2.9566, -3.1243, -3.7375, -3.5576, -2.6945,
         -2.8365, -3.6556, -3.8391],
        [-2.9464, -2.8454, -2.3858, -2.9313, -3.3509, -3.4276, -4.0100, -3.0833,
         -3.9303, -3.7939, -3.0016, -3.1390, -3.0809, -3.5213, -3.0001, -3.3072,
         -3.5359, -4.0858, -3.9174, -3.2185, -3.9796, -3.7524, -4.1313, -2.7333,
         -3.7562, -3.3160, -3.6210],
        [-3.8381, -3.7525, -4.2759, -4.3642, -3.9075, -3.2033, -2.8808, -2.6832,
         -2.7831, -3.5710, -3.9224, -3.3924, -3.1071, -2.9944, -3.7976, -3.6384,
         -4.1997, -3.4217, -3.6338, -2.1961, -2.7819, -3.2312, -3.0991, -3.2780,
         -3.2534, -3.9371, -3.5529],
        [-3.3685, -3.7027, -3.1507, -2.9466, -2.9068, -3.6339, -3.0142, -3.0966,
         -3.0812, -4.0738, -3.3096, -3.5594, -3.4525, -3.1563, -2.7786, -2.7008

In [14]:
logprobs.shape

torch.Size([32, 27])

In [16]:
Yb

tensor([ 8, 14, 15, 22,  0, 19,  9, 14,  5,  1, 20,  3,  8, 14, 12,  0, 11,  0,
        26,  9, 25,  0,  1,  1,  7, 18,  9,  3,  5,  9,  0, 18])

In [17]:
logprobs[range(n), Yb] # for each row i in logprobs, get the corresponding element i in Yb

tensor([-3.9947, -3.0001, -3.6384, -3.1822, -4.0384, -3.5717, -3.3094, -3.9432,
        -3.2381, -4.2850, -3.1351, -1.6623, -2.8689, -2.8949, -3.0631, -3.1330,
        -3.8170, -2.9379, -3.5501, -3.4175, -2.8673, -3.0299, -4.2935, -4.0829,
        -3.4574, -2.9817, -3.1422, -3.9086, -2.7540, -3.5519, -3.2008, -3.2198],
       grad_fn=<IndexBackward0>)

In [18]:
# if loss = -(a + b + c) / 3
# loss = -1/3a + -1/3b + -1/3c
# dloss/da = -1/3, and dloss = [-1/3 -1/3 -1/3]

In [27]:
counts.shape, counts_sum_inv.shape

(torch.Size([32, 27]), torch.Size([32, 1]))

In [None]:
# because the shapes are not the same, pytorch broadcasts counts_sum_inv when it's multiplied by counts
# then it looks sort of like this:
# c = a * b, but with tensors:
# a[3x3] * b[3x1] --->
# a11*b1 a12*b1 a13*b1
# a21*b2 a22*b2 a23*b2
# a31*b3 a32*b3 a33*b3
# then we need to take local derivative i.e. dc/db = a, but then since b is replicated across the columns,
# we need to sum across dim 1 to retain the same shape in the gradient as in the original tensor

In [41]:
counts.shape, counts_sum.shape

(torch.Size([32, 27]), torch.Size([32, 1]))

In [None]:
# for dcounts_sum/dcounts, we need to take in counts_sum and return something with the shape of counts

# a11 a12 a13 ---> b1 (= a11 + a12 + a13)
# a21 a22 a23 ---> b2 (= a21 + a22 + a23)
# a31 a32 a33 ---> b3 (= a31 + a32 + a33)

# db1 / da11 = 1, but db1 / da21 = 0 since b1 is only dependent on a11, a12, a13 and a21 does not contribute to the input
# because derivative of each b_i element is only dependent on the row being summed, the gradient gets "routed" to all elements
# participating in the addition. in other words, dloss/da11 = dloss/db1 * db1/da11 = b1.grad * loss
# then dcounts (the part which affects counts_sum) should just be a tensor of ones with the shape of counts,
# multiplied by dcounts_sum to distribute the gradient of dcounts_sum to each element of dcounts.

In [188]:
# Exercise 1: backprop through the whole thing manually,
# / through exactly all of the variables
# as they are defined in the forward pass above, one by one

dlogprobs = torch.zeros_like(logprobs)
dlogprobs[range(n), Yb] = -1.0/n
cmp('logprobs', dlogprobs, logprobs)

dprobs = 1.0 / probs * dlogprobs
cmp('probs', dprobs, probs)

dcounts_sum_inv = (counts * dprobs).sum(1, keepdim=True)
cmp('counts_sum_inv', dcounts_sum_inv, counts_sum_inv)

dcounts_sum = -(1.0 / counts_sum**2) * dcounts_sum_inv
cmp('counts_sum', dcounts_sum, counts_sum)

# # dloss/dcounts = dloss/dprobs * dprobs/dcounts + dloss/dcounts_sum * dcounts_sum/dcounts; dcounts_sum/dcounts = 1
dcounts = counts_sum_inv * dprobs + dcounts_sum * torch.ones_like(counts) # can also just do dcounts, no need to add the multiply
cmp('counts', dcounts, counts)

# dloss/dnorm_logits = dloss/dcounts * dcounts/dnorm_logits
dnorm_logits = dcounts * counts # since counts = e^norm_logits, dcounts/dnorm_logits = e^norm_logits = counts
cmp('norm_logits', dnorm_logits, norm_logits)

# have to be careful with this one beacuse the shapes do not match
# norm_logits.shape, logits.shape, logit_maxes.shape # [32, 27], [32, 27], [32, 1]
# a11 a12 a13 - b1 -> a11-b1 a12-b1 a13-b1
# a21 a22 a23 - b2 -> a21-b2 a22-b2 a23-b2
# a31 a32 a33 - b3 -> a31-b3 a32-b3 a33-b3
#
# so dloss/db1 = -1 -1 -1
#                -1 -1 -1
#                -1 -1 -1
#
# but then to retain the shape of b (3x1), we need to sum along dim 1
#
# norm_logits = logits - logit_maxes; dloss/dlogit_maxes = dloss/dnorm_logits * dnorm_logits/dlogit_maxes
# = dnorm_logits * -1
dlogit_maxes = (-1.0 * dnorm_logits).sum(1, keepdim=True)
cmp('logit_maxes', dlogit_maxes, logit_maxes)

dlogits = dnorm_logits.clone()

# we do this because logits is also present to calculate logit_maxes
# only the max element of each row of logits affects logit_maxes, so we only take the derivative on that element
# and the remaining elements should be zero
dlogits_p = torch.zeros_like(logits) # all elements are zero
indices = logits.max(1, keepdim=True).indices # gets the indices of the max elements for each row in logits
dlogits_p[range(logits.shape[0]), indices.view(-1)] = 1 # sets all the max element indices to be 1
dlogits += dlogits_p * dlogit_maxes # multiply by dlogit_maxes to factor in the chain rule
# dlogits += F.one_hot(logits.max(1).indices, num_classes=logits.shape[1]) * dlogit_maxes # also works, fancier ;)
cmp('logits', dlogits, logits)

# dloss/dh = dloss/dlogits * dlogits/dh
# a11 a12 a13 a14     b11 b12 b13    = a11*b11 + a12*b21 + a13*b31 + a14*b41, a11*b12 + a12*b22 + ...
# a21 a22 a23 a24  *  b21 b22 b23      a21*b11 + a22*b21 + a23*b31 + a24*b41, 
# a31 a32 a33 a34     b31 b32 b33
#                     b41 b42 b43
h.shape, W2.shape, logits.shape # [32, 64], [64, 27], [32, 27]

dh = dlogits @ W2.T
cmp('h', dh, h)

dW2 = h.T @ dlogits
cmp('W2', dW2, W2)

db2 = dlogits.sum(0, keepdim=True)
cmp('b2', db2, b2)

dhpreact = (1 - h**2) * dh
cmp('hpreact', dhpreact, hpreact)

dbnraw = bngain * dhpreact
cmp('bnraw', dbnraw, bnraw)

dbngain = (bnraw * dhpreact).sum(0, keepdim=True)
cmp('bngain', dbngain, bngain)

dbnbias = dhpreact.sum(0, keepdim=True)
cmp('bnbias', dbnbias, bnbias)

dbnvar_inv = (bndiff * dbnraw).sum(0, keepdim=True)
cmp('bnvar_inv', dbnvar_inv, bnvar_inv)

dbnvar = -0.5 * (bnvar + 1e-5)**-1.5 * dbnvar_inv
cmp('bnvar', dbnvar, bnvar)

dbndiff2 = dbnvar.expand(bndiff2.shape) * (1.0/(n-1))
cmp('bndiff2', dbndiff2, bndiff2)

dbndiff = bnvar_inv * dbnraw + dbndiff2 * 2 * bndiff
cmp('bndiff', dbndiff, bndiff)

dbnmeani = -1.0 * dbndiff.sum(0, keepdim=True)
cmp('bnmeani', dbnmeani, bnmeani)

dhprebn = (1.0/n) * dbnmeani.expand(hprebn.shape) + dbndiff.clone()
cmp('hprebn', dhprebn, hprebn)

dembcat = dhprebn @ W1.T
cmp('embcat', dembcat, embcat)

dW1 = embcat.T @ dhprebn
cmp('W1', dW1, W1)

db1 = dhprebn.sum(0)
cmp('b1', db1, b1)

demb = dembcat.view(emb.shape)
cmp('emb', demb, emb)

# print(emb.shape, C.shape, Xb.shape)
# torch.Size([32, 3, 10]) torch.Size([27, 10]) torch.Size([32, 3])
# print(Xb[:5]) # each int inside each row of Xb tells us what rows of C to pluck out for each example. 
# tensor([[ 1,  1,  4], # i.e. pluck out C[1], C[1], C[4] and store it in emb[0]
#         [18, 14,  1],
#         [11,  5,  9],
#         [ 0,  0,  1],
#         [12, 15, 14]])
# now to find dC we have demb which is of shape 32x3x10 and contains gradients of emb, and emb contains the 
# list of rows plucked from C. so we just have to return those rows back to their original position to find dC.
# for any rows used multiple times, we have to add their gradients e.g. row 0 of Xb uses row 1 of C twice

dC = torch.zeros_like(C)
for k in range(Xb.shape[0]):
    for j in range(Xb.shape[1]):
        ix = Xb[k, j]
        dC[ix] += demb[k, j]
cmp('C', dC, C)

logprobs        | exact: True  | approximate: True  | maxdiff: 0.0
probs           | exact: True  | approximate: True  | maxdiff: 0.0
counts_sum_inv  | exact: True  | approximate: True  | maxdiff: 0.0
counts_sum      | exact: True  | approximate: True  | maxdiff: 0.0
counts          | exact: True  | approximate: True  | maxdiff: 0.0
norm_logits     | exact: True  | approximate: True  | maxdiff: 0.0
logit_maxes     | exact: True  | approximate: True  | maxdiff: 0.0
logits          | exact: True  | approximate: True  | maxdiff: 0.0
h               | exact: True  | approximate: True  | maxdiff: 0.0
W2              | exact: True  | approximate: True  | maxdiff: 0.0
b2              | exact: True  | approximate: True  | maxdiff: 0.0
hpreact         | exact: True  | approximate: True  | maxdiff: 0.0
bnraw           | exact: True  | approximate: True  | maxdiff: 0.0
bngain          | exact: True  | approximate: True  | maxdiff: 0.0
bnbias          | exact: True  | approximate: True  | maxdiff:

In [190]:
# Exercise 2: backprop through cross_entropy but all in one go
# to complete this challenge, look at the mathematical expression of the loss,
# take the derivative, simplify the expression, and just write it out

# forward pass

# before:
# logit_maxes = logits.max(1, keepdim=True).values
# norm_logits = logits - logit_maxes # subtract max for numerical stability
# counts = norm_logits.exp()
# counts_sum = counts.sum(1, keepdims=True)
# counts_sum_inv = counts_sum**-1 # if I use (1.0 / counts_sum) instead then I can't get backprop to be a bit exact
# probs = counts * counts_sum_inv
# logprobs = probs.log()
# loss = -logprobs[range(n), Yb].mean()

# now:
loss_fast = F.cross_entropy(logits, Yb)
print(loss_fast.item(), 'diff:', (loss_fast - loss).item())

3.349091053009033 diff: -2.384185791015625e-07


In [192]:
# backward pass

dlogits = F.softmax(logits, 1)
dlogits[range(n), Yb] -= 1
dlogits /= n

cmp('logits', dlogits, logits) # I can only get approximate to be true, my maxdiff is 6e-9

logits          | exact: False | approximate: True  | maxdiff: 5.122274160385132e-09


In [197]:
# Exercise 3: backprop through batchnorm but all in one go
# to complete this challenge look at the mathematical expression of the output of batchnorm,
# take the derivative w.r.t. its input, simplify the expression, and just write it out

# forward pass

# before:
# bnmeani = 1/n * hprebn.sum(0, keepdim=True)
# bndiff = hprebn - bnmeani
# bndiff2 = bndiff**2
# bnvar = 1/(n-1) * (bndiff2).sum(0, keepdim=True) # node: Bessel's correction (dividing by n-1, not n)
# bnvar_inv = (bnvar + 1e-5)**-0.5
# bnraw = bndiff * bnvar_inv
# hpreact = bngain * bnraw + bnbias

# now:
hpreact_fast = bngain * (hprebn - hprebn.mean(0, keepdim=True)) / torch.sqrt(hprebn.var(0, keepdim=True, unbiased=True) + 1e-5) + bnbias
print('max diff:', (hpreact_fast - hpreact).abs().max())

bnvar.shape, bnmeani.shape, hprebn.shape

max diff: tensor(7.1526e-07, grad_fn=<MaxBackward1>)


(torch.Size([1, 64]), torch.Size([1, 64]), torch.Size([32, 64]))

In [202]:
# backward pass

# before we had:
# dbnraw = bngain * dhpreact
# dbndiff = bnvar_inv * dbnraw
# dbnvar_inv = (bndiff * dbnraw).sum(0, keepdim=True)
# dbnvar = (-0.5*(bnvar + 1e-5)**-1.5) * dbnvar_inv
# dbndiff2 = (1.0/(n-1))*torch.ones_like(bndiff2) * dbnvar
# dbndiff += (2*bndiff) * dbndiff2
# dhprebn = dbndiff.clone()
# dbnmeani = (-dbndiff).sum(0)
# dhprebn += 1.0/n * (torch.ones_like(hprebn) * dbnmeani)

# calculate dhprebn given dhpreact (i.e. backprop through the batchnorm)
# (you'll also need to use some of the variables from the forward pass up above)

# see your notes for how you came up with this shit lmao
dhprebn = bngain * bnvar_inv * (dhpreact - 1.0 / (n - 1) * bnraw * (bnraw * dhpreact).sum(0, keepdim=True) - 1.0 / n * dhpreact.sum(0, keepdim=True))
cmp('hprebn', dhprebn, hprebn) # I can only get approximate to be true, my maxdiff is 9e-10

hprebn          | exact: False | approximate: True  | maxdiff: 9.313225746154785e-10


In [211]:
# Exercise 4: putting it all together!
# Train the MLP neural net with your own backward pass

# init
n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 200 # the number of neurons in the hidden layer of the MLP

g = torch.Generator().manual_seed(2147483647) # for reproducibility
C  = torch.randn((vocab_size, n_embd),            generator=g)
# Layer 1
W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5/3)/((n_embd * block_size)**0.5)
b1 = torch.randn(n_hidden,                        generator=g) * 0.1
# Layer 2
W2 = torch.randn((n_hidden, vocab_size),          generator=g) * 0.1
b2 = torch.randn(vocab_size,                      generator=g) * 0.1
# BatchNorm parameters
bngain = torch.randn((1, n_hidden))*0.1 + 1.0
bnbias = torch.randn((1, n_hidden))*0.1

parameters = [C, W1, b1, W2, b2, bngain, bnbias]
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
  p.requires_grad = True

# same optimization as last time
max_steps = 200000
batch_size = 32
n = batch_size # convenience
lossi = []

# use this context manager for efficiency once your backward pass is written (TODO)
with torch.no_grad():

  # kick off optimization
  for i in range(max_steps):

    # minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
    Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y

    # forward pass
    emb = C[Xb] # embed the characters into vectors
    embcat = emb.view(emb.shape[0], -1) # concatenate the vectors
    # Linear layer
    hprebn = embcat @ W1 + b1 # hidden layer pre-activation
    # BatchNorm layer
    # -------------------------------------------------------------
    bnmean = hprebn.mean(0, keepdim=True)
    bnvar = hprebn.var(0, keepdim=True, unbiased=True)
    bnvar_inv = (bnvar + 1e-5)**-0.5
    bnraw = (hprebn - bnmean) * bnvar_inv
    hpreact = bngain * bnraw + bnbias
    # -------------------------------------------------------------
    # Non-linearity
    h = torch.tanh(hpreact) # hidden layer
    logits = h @ W2 + b2 # output layer
    loss = F.cross_entropy(logits, Yb) # loss function

    # backward pass
    for p in parameters:
        p.grad = None
    #loss.backward() # use this for correctness comparisons, delete it later!

    # manual backprop!
    dlogits = F.softmax(logits, 1)
    dlogits[range(n), Yb] -= 1
    dlogits /= n
    dh = dlogits @ W2.T
    dW2 = h.T @ dlogits
    db2 = dlogits.sum(0)
    dhpreact = (1 - h**2) * dh
    dbngain = (bnraw * dhpreact).sum(0, keepdim=True)
    dhprebn = bngain * bnvar_inv * (dhpreact - 1.0 / (n - 1) * bnraw * (bnraw * dhpreact).sum(0, keepdim=True) - 1.0 / n * dhpreact.sum(0, keepdim=True))
    dbnbias = dhpreact.sum(0, keepdim=True)
    dembcat = dhprebn @ W1.T
    dW1 = embcat.T @ dhprebn
    db1 = dhprebn.sum(0)
    demb = dembcat.view(emb.shape)
    dC = torch.zeros_like(C)
    for k in range(Xb.shape[0]):
        for j in range(Xb.shape[1]):
            ix = Xb[k, j]
            dC[ix] += demb[k, j]

    grads = [dC, dW1, db1, dW2, db2, dbngain, dbnbias]

    # update
    lr = 0.1 if i < 100000 else 0.01 # step learning rate decay
    param = 0
    for p, grad in zip(parameters, grads):
        #p.data += -lr * p.grad # old way of cheems doge (using PyTorch grad from .backward())
        p.data += -lr * grad # new way of swole doge TODO: enable
        param += 1
      
    # track stats
    if i % 10000 == 0: # print every once in a while
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())

    # if i >= 100: # TODO: delete early breaking when you're ready to train the full net
    #     break

12297
      0/ 200000: 3.8278
  10000/ 200000: 2.1679
  20000/ 200000: 2.3673
  30000/ 200000: 2.4550
  40000/ 200000: 2.0025
  50000/ 200000: 2.3194
  60000/ 200000: 2.4001
  70000/ 200000: 2.0518
  80000/ 200000: 2.2517
  90000/ 200000: 2.1547
 100000/ 200000: 1.9719
 110000/ 200000: 2.3002
 120000/ 200000: 1.9477
 130000/ 200000: 2.3918
 140000/ 200000: 2.2514
 150000/ 200000: 2.1468
 160000/ 200000: 1.9326
 170000/ 200000: 1.8307
 180000/ 200000: 2.0655
 190000/ 200000: 1.9032


In [212]:
# useful for checking your gradients
for p,g in zip(parameters, grads):
  cmp(str(tuple(p.shape)), g, p)

TypeError: all() received an invalid combination of arguments - got (bool), but expected one of:
 * (Tensor input, *, Tensor out = None)
 * (Tensor input, tuple of ints dim = None, bool keepdim = False, *, Tensor out = None)
 * (Tensor input, int dim, bool keepdim = False, *, Tensor out = None)
 * (Tensor input, name dim, bool keepdim = False, *, Tensor out = None)


In [213]:
# calibrate the batch norm at the end of training

with torch.no_grad():
  # pass the training set through
  emb = C[Xtr]
  embcat = emb.view(emb.shape[0], -1)
  hpreact = embcat @ W1 + b1
  # measure the mean/std over the entire training set
  bnmean = hpreact.mean(0, keepdim=True)
  bnvar = hpreact.var(0, keepdim=True, unbiased=True)

In [214]:
# evaluate train and val loss

@torch.no_grad() # this decorator disables gradient tracking
def split_loss(split):
  x,y = {
    'train': (Xtr, Ytr),
    'val': (Xdev, Ydev),
    'test': (Xte, Yte),
  }[split]
  emb = C[x] # (N, block_size, n_embd)
  embcat = emb.view(emb.shape[0], -1) # concat into (N, block_size * n_embd)
  hpreact = embcat @ W1 + b1
  hpreact = bngain * (hpreact - bnmean) * (bnvar + 1e-5)**-0.5 + bnbias
  h = torch.tanh(hpreact) # (N, n_hidden)
  logits = h @ W2 + b2 # (N, vocab_size)
  loss = F.cross_entropy(logits, y)
  print(split, loss.item())

split_loss('train')
split_loss('val')

train 2.0727713108062744
val 2.110630750656128


In [None]:
# I achieved:
# train 2.0727713108062744
# val 2.110630750656128

In [215]:
# sample from the model
g = torch.Generator().manual_seed(2147483647 + 10)

for _ in range(20):
    
    out = []
    context = [0] * block_size # initialize with all ...
    while True:
      # ------------
      # forward pass:
      # Embedding
      emb = C[torch.tensor([context])] # (1,block_size,d)      
      embcat = emb.view(emb.shape[0], -1) # concat into (N, block_size * n_embd)
      hpreact = embcat @ W1 + b1
      hpreact = bngain * (hpreact - bnmean) * (bnvar + 1e-5)**-0.5 + bnbias
      h = torch.tanh(hpreact) # (N, n_hidden)
      logits = h @ W2 + b2 # (N, vocab_size)
      # ------------
      # Sample
      probs = F.softmax(logits, dim=1)
      ix = torch.multinomial(probs, num_samples=1, generator=g).item()
      context = context[1:] + [ix]
      out.append(ix)
      if ix == 0:
        break
    
    print(''.join(itos[i] for i in out))

carmahza.
jahleigh.
mri.
reety.
skanden.
jazhuel.
delynn.
jareei.
nellara.
chaiiv.
kaleigh.
ham.
joce.
quint.
shois.
alianni.
waterri.
jarisi.
jace.
pirra.
