## makemore: becoming a backprop ninja

In [None]:
# there is no change in the first several cells from the last lecture

In [None]:
import random

import matplotlib.pyplot as plt # for making figures
import torch
import torch.nn.functional as F
%matplotlib inline

In [None]:
# download the names.txt file from github
!wget https://raw.githubusercontent.com/karpathy/makemore/master/names.txt

--2024-01-28 23:40:23--  https://raw.githubusercontent.com/karpathy/makemore/master/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228145 (223K) [text/plain]
Saving to: ‘names.txt’


2024-01-28 23:40:23 (8.27 MB/s) - ‘names.txt’ saved [228145/228145]



In [None]:
# read in all the words (one name per line); the context manager closes the
# file handle as soon as the contents have been read
with open('names.txt', 'r', encoding='utf-8') as names_file:
  words = names_file.read().splitlines()
print(len(words))                 # number of names
print(max(len(w) for w in words)) # length of the longest name
print(words[:8])                  # a small sample

32033
15
['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']


In [None]:
# character vocabulary and the two lookup tables between characters and integers:
# the sorted characters of the dataset get indices 1..len(chars),
# index 0 is reserved for the '.' marker
chars = sorted(set(''.join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
itos = dict((idx, ch) for ch, idx in stoi.items()) # inverse mapping: integer -> character
vocab_size = len(itos)
print(itos)
print(vocab_size)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27


In [None]:
# build the dataset - train/dev/test split
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words):
  """Turn a list of words into (context, next character) training examples.

  Every word is terminated with '.', and for each character of the terminated
  word one example is emitted: the `block_size` preceding character indices
  (padded with 0, the index of '.') and the index of the character itself.

  Args:
    words: iterable of strings whose characters are all keys of `stoi`.

  Returns:
    (X, Y): X is a tensor of shape (N, block_size) holding the contexts,
    Y is a tensor of shape (N,) holding the target character indices.
    The two shapes are also printed.
  """
  X, Y = [], []

  for w in words:
    context = [0] * block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      context = context[1:] + [ix] # crop and append (creates a new list, X keeps the old one)

  X = torch.tensor(X)
  Y = torch.tensor(Y)
  print(X.shape, Y.shape)
  return X, Y

# Shuffle a *copy* of the word list: shuffling `words` in place would make this
# cell non-idempotent (re-running it would shuffle the already shuffled list and
# silently produce a different train/dev/test split).
random.seed(42)
words_shuffled = list(words)
random.shuffle(words_shuffled)
n1 = int(0.8*len(words_shuffled))
n2 = int(0.9*len(words_shuffled))

Xtr,  Ytr  = build_dataset(words_shuffled[:n1])     # 80%
Xdev, Ydev = build_dataset(words_shuffled[n1:n2])   # 10%
Xte,  Yte  = build_dataset(words_shuffled[n2:])     # 10%

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [None]:
# ok boilerplate done, now we get to the action:

![MLP_structure](https://drive.google.com/uc?export=view&id=1Qq0gnC8zHU4dGiYJ4RICcJ7ap5ifknKU)

In [None]:
n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 64 # the number of neurons in the hidden layer of the MLP

# every random draw below goes through this generator so the cell is reproducible
g = torch.Generator().manual_seed(2147483647)
C  = torch.randn((vocab_size, n_embd),            generator=g) # the embedding matrix
# Layer 1
W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5/3)/((n_embd * block_size)**0.5)
b1 = torch.randn(n_hidden,                        generator=g) * 0.1 # using b1 just for fun, it's useless because of BN
# Layer 2
W2 = torch.randn((n_hidden, vocab_size),          generator=g) * 0.1
b2 = torch.randn(vocab_size,                      generator=g) * 0.1
# BatchNorm parameters (also drawn from g; without it they would come from the
# unseeded global RNG and differ on every kernel restart)
bngain = torch.randn((1, n_hidden),               generator=g)*0.1 + 1.0 # gamma
bnbias = torch.randn((1, n_hidden),               generator=g)*0.1 # beta

# Note: many of these parameters are initialized in non-standard ways
# because initializing with e.g. all zeros could mask an incorrect
# implementation of the backward pass.

parameters = [C, W1, b1, W2, b2, bngain, bnbias]
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
  p.requires_grad = True

4137


In [None]:
batch_size = 32
n = batch_size # short alias used throughout the forward/backward formulas
# draw one random minibatch of training examples (row indices into Xtr/Ytr)
ix = torch.randint(low=0, high=Xtr.shape[0], size=(batch_size,), generator=g)
Xb = Xtr[ix] # batch contexts
Yb = Ytr[ix] # batch targets

### Batchnorm:

![batchnorm_formula](https://drive.google.com/uc?export=view&id=1xc_DAYYmSfS2dejfvpmESSYWXBlv5HZR)

### Softmax:

![softmax_formula](https://drive.google.com/uc?export=view&id=11prG2AKHDAfQ4nhHN66Q-WJoFj-EPBwC)

In [None]:
# utility function we will use later when comparing manual gradients to PyTorch gradients
def cmp(s, dt, t):
  """Print how a manually computed gradient compares with autograd's.

  Args:
    s:  label printed at the start of the report line.
    dt: our manually computed gradient.
    t:  the tensor whose `.grad` (filled in by PyTorch) is the reference.

  Prints one line with three checks: bit-exact equality, `torch.allclose`
  equality, and the largest absolute difference. If `t.grad` is None (no
  backward pass was run, or `t` did not retain its gradient), a short notice
  is printed instead of failing with an opaque TypeError.
  """
  if t.grad is None:
    print(f'{s:15s} | no autograd gradient: t.grad is None (run loss.backward() / t.retain_grad() first)')
    return
  ex = torch.all(dt == t.grad).item()
  app = torch.allclose(dt, t.grad) # check if they are approximately equal
  maxdiff = (dt - t.grad).abs().max().item() # the absolute max difference
  print(f'{s:15s} | exact: {str(ex):5s} | approximate: {str(app):5s} | maxdiff: {maxdiff}')

In [None]:
# forward pass, "chunkated" into smaller steps that are possible to backward one at a time
# (every intermediate gets its own name so that its gradient can be derived by hand
# and checked against PyTorch's below)

emb = C[Xb] # embed the characters into vectors: (n, block_size, n_embd)
embcat = emb.view(emb.shape[0], -1) # concatenate the vectors: (n, block_size * n_embd)
# Linear layer 1
hprebn = embcat @ W1 + b1 # hidden layer pre-activation: (n, n_hidden)
# BatchNorm layer (statistics are taken over the batch dimension 0)
bnmeani = 1/n*hprebn.sum(0, keepdim=True) # per-neuron batch mean: (1, n_hidden)
bndiff = hprebn - bnmeani # centered pre-activations
bndiff2 = bndiff**2
bnvar = 1/(n-1)*(bndiff2).sum(0, keepdim=True) # note: Bessel's correction (dividing by n-1, not n); the author notes that torch's batchnorm uses the biased (1/n) estimator instead, which matters most when the batch size n is small
bnvar_inv = (bnvar + 1e-5)**-0.5 # 1/sqrt(var + eps)
bnraw = bndiff * bnvar_inv # normalized pre-activations
hpreact = bngain * bnraw + bnbias #  scale and shift with the learnable gamma (bngain) and beta (bnbias), so the network can undo the normalization if that is optimal
# Non-linearity
h = torch.tanh(hpreact)
# Linear layer 2
logits = h @ W2 + b2 # output layer (matrix multiplication): (n, vocab_size)
# cross entropy loss (same as F.cross_entropy(logits, Yb))
logit_maxes = logits.max(1, keepdim=True).values # per-row maximum: (n, 1)
norm_logits = logits - logit_maxes # subtract max for numerical stability
counts = norm_logits.exp()
counts_sum = counts.sum(1, keepdims=True) # (n, 1)
counts_sum_inv = counts_sum**-1 # if I use (1.0/counts_sum) instead then I can't get backprop to be bit exact in pytorch
probs = counts * counts_sum_inv # (n, vocab_size) = (n, vocab_size) * (n, 1), broadcasting across columns
logprobs = probs.log() # (n, vocab_size)
loss = -logprobs[range(n), Yb].mean() # the mean negative log likelihood of predicting the correct token, n is batch size

# PyTorch backward pass
for p in parameters:
  p.grad = None # drop gradients left over from a previous run of this cell
for t in [logprobs, probs, counts, counts_sum, counts_sum_inv, # afaik there is no cleaner way
          norm_logits, logit_maxes, logits, h, hpreact, bnraw,
         bnvar_inv, bnvar, bndiff2, bndiff, hprebn, bnmeani,
         embcat, emb]:
  t.retain_grad() # keep .grad on these intermediate tensors so cmp() can read it after backward()
loss.backward()
loss

tensor(3.3346, grad_fn=<NegBackward0>)

In [None]:
# shapes of the tensors involved in the first backprop steps, one per line
print(*(x.shape for x in (logprobs, probs, counts_sum_inv, counts)), sep='\n')

torch.Size([32, 27])
torch.Size([32, 27])
torch.Size([32, 1])
torch.Size([32, 27])


In [None]:
# counts_sum and its inverse are both column vectors, one entry per batch row
print(*(x.shape for x in (counts_sum_inv, counts_sum)), sep='\n')

torch.Size([32, 1])
torch.Size([32, 1])


In [None]:
# Exercise 1: backprop through the whole thing manually,
# backpropagating through exactly all of the variables
# as they are defined in the forward pass above, one by one.
# Shapes in the comments assume n = 32, n_hidden = 64, vocab_size = 27.

dlogprobs = torch.zeros_like(logprobs) # same shape as logprobs, initialised with zeros
dlogprobs[range(n), Yb] = -1.0/n # if loss = -(a+b+c)/n, dloss/da = -1/n, [32, 27]
dprobs = (1/probs) * dlogprobs  # dlog(x)/dx = 1/x, chain rule, [32, 27]
dcounts_sum_inv = (counts * dprobs).sum(1, keepdim = True) # d(ab)/da = b, [32, 1], sum as counts_sum_inv is broadcasted across columns (i.e. when a node is used multiple times, the gradients of its uses sum up during backprop)
dcounts = counts_sum_inv * dprobs # no additional summation as no broadcasting, [32, 27], not the final dcounts
dcounts_sum = (-counts_sum**-2) * dcounts_sum_inv # d(1/x)/dx = -1/(x^2), [32, 1]
dcounts += torch.ones_like(counts) * dcounts_sum # b1 = a11+a12+a13, then db1/da11 = db1/da12 = db1/da13 = 1, [32, 27], use '+=' as this is the second part of the dcounts
dnorm_logits = counts * dcounts # d(exp(x)) = exp(x), [32, 27]
dlogits = 1.0 * dnorm_logits.clone() # c31 = a31-b3, [32, 27], not the final dlogits
dlogit_maxes = -1.0 * dnorm_logits.sum(1, keepdim = True) # b is broadcasted so sum, [32, 1]
dlogits += F.one_hot(logits.max(1).indices, num_classes = logits.shape[1]) * dlogit_maxes # put a one where the maximum logit occurs in each row, [32, 27], use '+=' as this is the second part of the dlogits
dh = dlogits @ W2.T # [32, 64] = [32, 27] @ [27, 64], add transpose where it would make the dimensions work out
dW2 = h.T @ dlogits # [64, 27]
db2 = dlogits.sum(0) # [27]
dhpreact = (1.0-h**2) * dh # d(tanh(x)) = 1-tanh(x)**2, [32, 64]
dbngain = (bnraw * dhpreact).sum(0, keepdim = True) # element-wise multiplication, [1, 64]
dbnraw = bngain * dhpreact # [32, 64]
dbnbias = 1.0 * dhpreact.sum(0, keepdim = True) # [1, 64]
dbndiff = bnvar_inv * dbnraw # not the final dbndiff, [32, 64]
dbnvar_inv = (bndiff * dbnraw).sum(0, keepdim = True) # [1, 64]
dbnvar = (-0.5*(bnvar + 1e-5)**-1.5)*dbnvar_inv # d(x^(-1/2)) = -1/2 * x^(-3/2), [1, 64]
dbndiff2 = (1.0/(n-1))*torch.ones_like(bndiff2) * dbnvar # b1 = 1/(n-1) * (a11+a21), b2 = 1/(n-1) * (a12+a22), so every entry of db/da is 1/(n-1); dbnvar is broadcast down the rows, [32, 64]
dbndiff += (2*bndiff) * dbndiff2 # d(a^2)/da = 2a, [32, 64], use '+=' as this is the second part of the dbndiff
dhprebn = dbndiff.clone() # [32, 64], not the final dhprebn
dbnmeani = -dbndiff.sum(0, keepdim = True) # keepdim so the gradient has the same [1, 64] shape as bnmeani
dhprebn += (1.0/n)*torch.ones_like(hprebn) * dbnmeani # bnmeani = 1/n * sum over the batch, so dbnmeani is broadcast to every row, [32, 64]
dembcat = dhprebn @ W1.T # linear gradient and match the sizes, [32, 30]
dW1 = embcat.T @ dhprebn # linear gradient and match the sizes, [30, 64]
db1 = dhprebn.sum(0) # [64], same shape as b1
demb = dembcat.view(emb.shape) # undo the concatenation, [32, 3, 10]
dC = torch.zeros_like(C)
# every use of an embedding row in the batch contributes its gradient to that row
# (the index gets its own name so the minibatch index tensor `ix` is not overwritten)
for k in range(Xb.shape[0]):
  for j in range(Xb.shape[1]):
    tok = Xb[k, j]
    dC[tok] += demb[k, j]



# Check the correctness of calculated gradients with pytorch:
# one (label, manual gradient, reference tensor) row per variable, in backprop order
_grad_checks = [
  ('logprobs', dlogprobs, logprobs),
  ('probs', dprobs, probs),
  ('counts_sum_inv', dcounts_sum_inv, counts_sum_inv),
  ('counts_sum', dcounts_sum, counts_sum),
  ('counts', dcounts, counts),
  ('norm_logits', dnorm_logits, norm_logits),
  ('logit_maxes', dlogit_maxes, logit_maxes),
  ('logits', dlogits, logits),
  ('h', dh, h),
  ('W2', dW2, W2),
  ('b2', db2, b2),
  ('hpreact', dhpreact, hpreact),
  ('bngain', dbngain, bngain),
  ('bnbias', dbnbias, bnbias),
  ('bnraw', dbnraw, bnraw),
  ('bnvar_inv', dbnvar_inv, bnvar_inv),
  ('bnvar', dbnvar, bnvar),
  ('bndiff2', dbndiff2, bndiff2),
  ('bndiff', dbndiff, bndiff),
  ('bnmeani', dbnmeani, bnmeani),
  ('hprebn', dhprebn, hprebn),
  ('embcat', dembcat, embcat),
  ('W1', dW1, W1),
  ('b1', db1, b1),
  ('emb', demb, emb),
  ('C', dC, C),
]
for _label, _manual, _reference in _grad_checks:
  cmp(_label, _manual, _reference)

logprobs        | exact: True  | approximate: True  | maxdiff: 0.0
probs           | exact: True  | approximate: True  | maxdiff: 0.0
counts_sum_inv  | exact: True  | approximate: True  | maxdiff: 0.0
counts_sum      | exact: True  | approximate: True  | maxdiff: 0.0
counts          | exact: True  | approximate: True  | maxdiff: 0.0
norm_logits     | exact: True  | approximate: True  | maxdiff: 0.0
logit_maxes     | exact: True  | approximate: True  | maxdiff: 0.0
logits          | exact: True  | approximate: True  | maxdiff: 0.0
h               | exact: True  | approximate: True  | maxdiff: 0.0
W2              | exact: True  | approximate: True  | maxdiff: 0.0
b2              | exact: True  | approximate: True  | maxdiff: 0.0
hpreact         | exact: False | approximate: True  | maxdiff: 9.313225746154785e-10
bngain          | exact: False | approximate: True  | maxdiff: 1.862645149230957e-09
bnbias          | exact: False | approximate: True  | maxdiff: 1.862645149230957e-09
bnraw   

In [None]:
# Exercise 2: backprop through cross_entropy but all in one go.
# To complete this challenge, look at the mathematical expression of the loss,
# take the derivative, simplify the expression, and just write it out.

# forward pass

# before (step by step):
# logit_maxes = logits.max(1, keepdim=True).values
# norm_logits = logits - logit_maxes # subtract max for numerical stability
# counts = norm_logits.exp()
# counts_sum = counts.sum(1, keepdims=True)
# counts_sum_inv = counts_sum**-1 # if I use (1.0 / counts_sum) instead then I can't get backprop to be bit exact...
# probs = counts * counts_sum_inv
# logprobs = probs.log()
# loss = -logprobs[range(n), Yb].mean()

# now: a single, much faster PyTorch call computes the same loss
loss_fast = F.cross_entropy(input=logits, target=Yb)
print(loss_fast.item(), 'diff:', (loss_fast - loss).item())

3.3199410438537598 diff: 0.0


In [None]:
# backward pass

# -----------------
# YOUR CODE HERE :)
# d(loss)/d(logits) for mean cross-entropy: softmax(logits) minus the one-hot
# targets, divided by the batch size
dlogits = F.softmax(logits, dim=1) # [32, 27], softmax over each row (one row per batch sample)
dlogits[torch.arange(n), Yb] -= 1.0 # subtract 1 at the position of the true label in every row
dlogits = dlogits / n # the batch loss is the mean of the per-sample losses
# -----------------

cmp('logits', dlogits, logits) # I can only get approximate to be true, my maxdiff is 6e-9

logits          | exact: False | approximate: True  | maxdiff: 6.05359673500061e-09


In [None]:
# Exercise 3: backprop through batchnorm but all in one go.
# To complete this challenge, look at the mathematical expression of the output of batchnorm,
# take the derivative w.r.t. its input, simplify the expression, and just write it out.
# BatchNorm paper: https://arxiv.org/abs/1502.03167

# forward pass

# before (step by step):
# bnmeani = 1/n*hprebn.sum(0, keepdim=True)
# bndiff = hprebn - bnmeani
# bndiff2 = bndiff**2
# bnvar = 1/(n-1)*(bndiff2).sum(0, keepdim=True) # note: Bessel's correction (dividing by n-1, not n)
# bnvar_inv = (bnvar + 1e-5)**-0.5
# bnraw = bndiff * bnvar_inv
# hpreact = bngain * bnraw + bnbias

# now: the same computation, simplified with the .mean and .var functions
hprebn_centered = hprebn - hprebn.mean(0, keepdim=True)
hprebn_std = torch.sqrt(hprebn.var(0, keepdim=True, unbiased=True) + 1e-5)
hpreact_fast = bngain * hprebn_centered / hprebn_std + bnbias
print('max diff:', (hpreact_fast - hpreact).abs().max())

max diff: tensor(4.7684e-07, grad_fn=<MaxBackward1>)


![batchnorm_formula](https://drive.google.com/uc?export=view&id=1xc_DAYYmSfS2dejfvpmESSYWXBlv5HZR)

![batchnorm_dependency](https://drive.google.com/uc?export=view&id=1IzgUuJs6ti85xwB98vXUMx5VvTnxiQvo)

![batchnorm_solution](https://drive.google.com/uc?export=view&id=1fubSgcUunxuHPgP0Nik8XS4tnQTAM2Eu)


In [None]:
# backward pass

# before we had (step by step):
# dbnraw = bngain * dhpreact
# dbndiff = bnvar_inv * dbnraw
# dbnvar_inv = (bndiff * dbnraw).sum(0, keepdim=True)
# dbnvar = (-0.5*(bnvar + 1e-5)**-1.5) * dbnvar_inv
# dbndiff2 = (1.0/(n-1))*torch.ones_like(bndiff2) * dbnvar
# dbndiff += (2*bndiff) * dbndiff2
# dhprebn = dbndiff.clone()
# dbnmeani = (-dbndiff).sum(0)
# dhprebn += 1.0/n * (torch.ones_like(hprebn) * dbnmeani)

# goal: dhprebn from dhpreact in a single expression (i.e. backprop through the batchnorm),
# reusing bngain, bnvar_inv and bnraw from the forward pass

# -----------------
# YOUR CODE HERE :)
bn_scale = bngain*bnvar_inv/n # common per-neuron factor, [1, 64]
bn_terms = n*dhpreact - dhpreact.sum(0) - n/(n-1)*bnraw*(dhpreact*bnraw).sum(0) # [32, 64]
dhprebn = bn_scale*bn_terms
# -----------------

cmp('hprebn', dhprebn, hprebn) # I can only get approximate to be true, my maxdiff is 9e-10

hprebn          | exact: False | approximate: True  | maxdiff: 1.3969838619232178e-09


In [None]:
# Exercise 4: putting it all together!
# Train the MLP neural net with your own backward pass

# init
n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 200 # the number of neurons in the hidden layer of the MLP

# every random draw below goes through this generator so the run is reproducible
g = torch.Generator().manual_seed(2147483647)
C  = torch.randn((vocab_size, n_embd),            generator=g)
# Layer 1
W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5/3)/((n_embd * block_size)**0.5)
b1 = torch.randn(n_hidden,                        generator=g) * 0.1
# Layer 2
W2 = torch.randn((n_hidden, vocab_size),          generator=g) * 0.1
b2 = torch.randn(vocab_size,                      generator=g) * 0.1
# BatchNorm parameters (also drawn from g; without it they would come from the
# unseeded global RNG and differ on every kernel restart)
bngain = torch.randn((1, n_hidden),               generator=g)*0.1 + 1.0
bnbias = torch.randn((1, n_hidden),               generator=g)*0.1

parameters = [C, W1, b1, W2, b2, bngain, bnbias]
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
  p.requires_grad = True

# same optimization as last time
max_steps = 200000
batch_size = 32
n = batch_size # convenience
lossi = [] # log10 of the minibatch loss at every step

# The whole loop runs under no_grad: all gradients are computed by hand below,
# so PyTorch does not need to record an autograd graph.
with torch.no_grad():

  # kick off optimization
  for i in range(max_steps):

    # minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
    Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y

    # forward pass
    emb = C[Xb] # embed the characters into vectors
    embcat = emb.view(emb.shape[0], -1) # concatenate the vectors
    # Linear layer
    hprebn = embcat @ W1 + b1 # hidden layer pre-activation
    # BatchNorm layer
    # -------------------------------------------------------------
    bnmean = hprebn.mean(0, keepdim=True)
    bnvar = hprebn.var(0, keepdim=True, unbiased=True)
    bnvar_inv = (bnvar + 1e-5)**-0.5
    bnraw = (hprebn - bnmean) * bnvar_inv
    hpreact = bngain * bnraw + bnbias
    # -------------------------------------------------------------
    # Non-linearity
    h = torch.tanh(hpreact) # hidden layer
    logits = h @ W2 + b2 # output layer
    loss = F.cross_entropy(logits, Yb) # loss function

    # backward pass
    for p in parameters:
      p.grad = None
    # loss.backward() # only for correctness comparisons; requires removing the no_grad context above

    # manual backprop! #swole_doge_meme
    # -----------------
    # cross-entropy: softmax minus one-hot targets, averaged over the batch
    dlogits = F.softmax(logits, 1)
    dlogits[range(n), Yb] -= 1
    dlogits /= n
    # 2nd layer backprop
    dh = dlogits @ W2.T
    dW2 = h.T @ dlogits
    db2 = dlogits.sum(0)
    # tanh
    dhpreact = (1.0-h**2) * dh
    # batchnorm backprop
    dbngain = (bnraw * dhpreact).sum(0, keepdim = True) # gamma
    dbnbias = 1.0 * dhpreact.sum(0, keepdim = True) # beta
    dhprebn = bngain*bnvar_inv/n*(n*dhpreact - dhpreact.sum(0) - n/(n-1)*bnraw*(dhpreact*bnraw).sum(0))
    # 1st layer
    dembcat = dhprebn @ W1.T
    dW1 = embcat.T @ dhprebn
    db1 = dhprebn.sum(0)
    # embedding: scatter-add the gradient of every looked-up row back into its
    # row of C. One index_add_ call replaces the Python double loop over
    # (batch, position), which ran batch_size * block_size tensor ops per step
    # and also overwrote the minibatch index `ix`.
    demb = dembcat.view(emb.shape)
    dC = torch.zeros_like(C)
    dC.index_add_(0, Xb.reshape(-1), demb.reshape(-1, n_embd))

    grads = [dC, dW1, db1, dW2, db2, dbngain, dbnbias] # same order as `parameters`
    # -----------------

    # update
    lr = 0.1 if i < 100000 else 0.01 # step learning rate decay
    for p, grad in zip(parameters, grads):
      # p.data += -lr * p.grad # old way of cheems doge (this is using PyTorch grad from .backward())
      p.data += -lr * grad # new way of swole doge: plain SGD with the manual gradients

    # track stats
    if i % 10000 == 0: # print every once in a while
      print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())

    # if i >= 100: # uncomment to stop early while debugging the backward pass
    #   break

12297
      0/ 200000: 3.8160
  10000/ 200000: 2.1700
  20000/ 200000: 2.3582
  30000/ 200000: 2.4859
  40000/ 200000: 1.9925
  50000/ 200000: 2.3520
  60000/ 200000: 2.3771
  70000/ 200000: 2.0929
  80000/ 200000: 2.4128
  90000/ 200000: 2.2115
 100000/ 200000: 2.0578
 110000/ 200000: 2.3289
 120000/ 200000: 2.0253
 130000/ 200000: 2.4969
 140000/ 200000: 2.1666
 150000/ 200000: 2.1837
 160000/ 200000: 1.9913
 170000/ 200000: 1.8061
 180000/ 200000: 2.0588
 190000/ 200000: 1.9484


In [None]:
# useful for checking your gradients against pytorch.
# Only meaningful when the training cell was run with loss.backward() enabled
# (i.e. outside torch.no_grad()); otherwise p.grad is None and the check is skipped.
# The loop variable is deliberately not called `g`: that name holds the torch.Generator.
for p, manual_grad in zip(parameters, grads):
  if p.grad is None:
    print(f'{str(tuple(p.shape)):15s} | skipped: no autograd gradient to compare against')
    continue
  cmp(str(tuple(p.shape)), manual_grad, p)

(27, 10)        | exact: False | approximate: True  | maxdiff: 1.862645149230957e-08
(30, 200)       | exact: False | approximate: True  | maxdiff: 7.450580596923828e-09
(200,)          | exact: False | approximate: True  | maxdiff: 5.587935447692871e-09
(200, 27)       | exact: False | approximate: True  | maxdiff: 1.1175870895385742e-08
(27,)           | exact: False | approximate: True  | maxdiff: 7.450580596923828e-09
(1, 200)        | exact: False | approximate: True  | maxdiff: 3.725290298461914e-09
(1, 200)        | exact: False | approximate: True  | maxdiff: 4.6566128730773926e-09


In [None]:
# calibrate the batch norm at the end of training:
# compute the mean and variance of the hidden pre-activations once,
# over the entire training set, for use at evaluation/sampling time

with torch.no_grad():
  # pass the training set through the embedding and the first linear layer
  emb = C[Xtr]
  embcat = emb.view(emb.shape[0], -1)
  hpreact = embcat @ W1 + b1
  # per-neuron statistics over all training examples
  bnmean = hpreact.mean(dim=0, keepdim=True)
  bnvar = hpreact.var(dim=0, keepdim=True, unbiased=True)


In [None]:
# evaluate train and val loss

@torch.no_grad() # this decorator disables gradient tracking
def split_loss(split):
  """Print the cross-entropy loss of the model on one dataset split.

  `split` is 'train', 'val' or 'test'. The batchnorm layer uses the calibrated
  `bnmean` / `bnvar` rather than statistics of the evaluated data.
  """
  datasets = {
    'train': (Xtr, Ytr),
    'val': (Xdev, Ydev),
    'test': (Xte, Yte),
  }
  inputs, targets = datasets[split]
  flat_emb = C[inputs].view(inputs.shape[0], -1) # (N, block_size, n_embd) -> (N, block_size * n_embd)
  pre_bn = flat_emb @ W1 + b1
  pre_act = bngain * (pre_bn - bnmean) * (bnvar + 1e-5)**-0.5 + bnbias
  hidden = torch.tanh(pre_act) # (N, n_hidden)
  scores = hidden @ W2 + b2 # (N, vocab_size)
  print(split, F.cross_entropy(scores, targets).item())

split_loss('train')
split_loss('val')

train 2.070659875869751
val 2.108168840408325


In [None]:
# I achieved:
# train 2.0718822479248047
# val 2.1162495613098145
# similar to last video results where we use loss.backward()

In [None]:
# sample from the model
g = torch.Generator().manual_seed(2147483647 + 10)

# sampling is pure inference: run it under no_grad (like split_loss) so no
# autograd graph is built for every generated character
with torch.no_grad():
  for _ in range(20):

    out = []
    context = [0] * block_size # initialize with all ...
    while True:
      # forward pass
      emb = C[torch.tensor([context])] # (1, block_size, n_embd)
      embcat = emb.view(emb.shape[0], -1) # concat into (1, block_size * n_embd)
      hpreact = embcat @ W1 + b1
      hpreact = bngain * (hpreact - bnmean) * (bnvar + 1e-5)**-0.5 + bnbias # calibrated batchnorm statistics
      h = torch.tanh(hpreact) # (1, n_hidden)
      logits = h @ W2 + b2 # (1, vocab_size)
      # sample the next character index from the predicted distribution
      probs = F.softmax(logits, dim=1)
      ix = torch.multinomial(probs, num_samples=1, generator=g).item()
      context = context[1:] + [ix] # slide the context window
      out.append(ix)
      if ix == 0: # index 0 is '.', the end-of-word marker
        break

    print(''.join(itos[i] for i in out))

mona.
mayanniee.
madhayla.
reisa.
jendraeg.
adelyn.
elin.
shi.
jen.
eden.
sana.
arleigh.
malkelin.
shubergihimie.
tricke.
jennex.
teron.
ubelled.
ryyah.
fael.
