This is a modification of part 4 of Andrej Karpathy's stellar Makemore tutorial (https://www.youtube.com/watch?v=q8SA3rM6ckI) in which we implement and backpropagate through layer normalization rather than batch normalization.


In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

%matplotlib inline

!git clone https://github.com/karpathy/makemore

Cloning into 'makemore'...
remote: Enumerating objects: 61, done.[K
remote: Counting objects: 100% (61/61), done.[K
remote: Compressing objects: 100% (41/41), done.[K
remote: Total 61 (delta 34), reused 43 (delta 19), pack-reused 0[K
Unpacking objects: 100% (61/61), done.


In [2]:
# Load the names dataset, one name per line. The encoding is passed explicitly so
# the read does not depend on the platform's default locale encoding.
with open('makemore/names.txt', 'r', encoding='utf-8') as file:
  words = file.read().splitlines()

In [3]:
# Length of every name, converted to float for the mean/std computations below.
word_lengths = torch.tensor(list(map(len, words))).float()

# The report is assembled line by line; the empty and single-space entries
# reproduce the spacing between the statements in the printed output.
print("\n".join([
    "",
    f" This dataset contains {word_lengths.nelement()} names",
    "",
    f" The minimum name length is {word_lengths.min()} characters.",
    " ",
    f" The maximum name length is {word_lengths.max()} characters.",
    "",
    f" The mean name length is  {word_lengths.mean():.2f} characters. ",
    "",
    f" The associated standard deviation is {word_lengths.std():.2f} characters.",
    " ",
]))


 This dataset contains 32033 names

 The minimum name length is 2.0 characters.
 
 The maximum name length is 15.0 characters.

 The mean name length is  6.12 characters. 

 The associated standard deviation is 1.44 characters.
 


In [4]:
# Character vocabulary plus the two lookup tables:
# character -> integer index (s_to_i) and integer index -> character (i_to_s).

# '.' is placed first; as before, it is the start/stop/padding special character.
chars = ['.'] + sorted(set(''.join(words)))
s_to_i = dict(zip(chars, range(len(chars))))
i_to_s = dict(zip(s_to_i.values(), s_to_i.keys()))

# Context length: the number of preceding characters that support each prediction,
# P(x_n | x_{n-1}, x_{n-2}, x_{n-3}).
block_size = 3
vocab_size = len(i_to_s)

print(i_to_s)
print(vocab_size)


{0: '.', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}
27


In [5]:
def build_dataset(words, block_size):
  """Turn a list of words into (context, next-character) training pairs.

  Every word is terminated with '.', and a window of `block_size` character
  indices (initially all 0) slides over it. Each position contributes one row
  to X (the window) and one entry to Y (the index of the character that follows).
  Characters are mapped to indices through the global `s_to_i` table.

  Prints the shapes of the two tensors and returns them as (X, Y).
  """
  contexts, targets = [], []

  for word in words:
    window = [0] * block_size  # context of character indices, starts as all padding
    for ch in word + '.':
      target = s_to_i[ch]
      contexts.append(window)
      targets.append(target)
      # slide: drop the oldest index, append the one just predicted
      # (builds a new list, so rows already stored in `contexts` are unaffected)
      window = window[1:] + [target]

  X = torch.tensor(contexts)
  Y = torch.tensor(targets)
  print(X.shape, Y.shape)
  return X, Y

# Train / dev / test split at 80% / 10% / 10% of the shuffled words:
#   train - used to fit the parameters
#   dev   - used to tune the hyperparameters
#   test  - used once at the end with the final model
# NOTE: the shuffle is in place, so re-running this cell without reloading
# `words` shuffles an already shuffled list and yields a different split.
import random
random.seed(42)
random.shuffle(words)
n1, n2 = (int(fraction * len(words)) for fraction in (0.8, 0.9))

# The datasets are built in train, dev, test order (one shape line is printed per call).
(Xtr, Ytr), (Xdev, Ydev), (Xte, Yte) = [
    build_dataset(subset, block_size)
    for subset in (words[:n1], words[n1:n2], words[n2:])
]

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [7]:
# Re-initialize all parameters, using Kaiming-style scaling for the tanh layer.
n_embd = 10    # embedding dimension
n_hidden = 200 # number of hidden units
g = torch.Generator().manual_seed(2147483647)

C = torch.randn((vocab_size, n_embd), generator=g) # embedding matrix

# Scaling can be important: large weights that occur by chance (high dimensional
# gaussian) can cause the tanh nonlinearity to saturate, even at initialization.
# Saturated nonlinearities are flat, meaning the gradient of the loss wrt those
# parameters is zero, so those parameters do not learn.
W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5/3) / ((n_embd * block_size)**0.5)

# Unlike with batch normalization, the bias b1 is NOT redundant here. Layer
# normalization subtracts the mean over the hidden units of each example, which
# removes only the average of b1 and keeps its per-unit offsets, so b1 generally
# receives a nonzero gradient.
b1 = torch.randn(n_hidden, generator=g) * 0.1

# Scaling W2/b2 down can help with unbalanced initial probabilities output by the
# softmax layer due to outliers in the random input layer.
W2 = torch.randn((n_hidden, vocab_size), generator=g) * 0.1
b2 = torch.randn(vocab_size, generator=g) * 0.1

# Layer normalization gain and bias. They are drawn from `g` like every other
# parameter, so the whole initialization is reproducible from the seed above.
lngain = torch.randn((1, n_hidden), generator=g) * 0.1 + 1.0 # layer normalization gain
lnbias = torch.randn((1, n_hidden), generator=g) * 0.1       # layer normalization bias

# As stated in the lecture, the above initializations are somewhat nonstandard.
# We are avoiding certain initializations, such as zero, so that improper
# implementations of backprop will be fully exposed (nothing is hidden).

parameters = [C, W1, b1, W2, b2, lngain, lnbias]
print(sum(p.nelement() for p in parameters)) # total number of parameters

for p in parameters:
  p.requires_grad = True

batch_size = 32
max_iters = 200000
lossi = [] # log10 of the minibatch loss, one entry per step

for i in range(max_iters):

  # minibatch: draw batch_size random row indices into the training set
  ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
  Xb, Yb = Xtr[ix], Ytr[ix]

  # forward pass
  emb = C[Xb] # embedding characters (batch_size, block_size, n_embd)
  embcat = emb.view(emb.shape[0], -1) # (batch_size, block_size * n_embd)

  # Linear layer 1
  hpreln = embcat @ W1 + b1 # h(idden) pre l(ayer) n(ormalization), size = (batch_size, n_hidden)
  # each row of hpreln is a vector of preactivations for the corresponding input example.

  # LayerNorm layer: the statistics are taken over the hidden units (dim=1) of
  # each example separately, so they do not depend on the rest of the batch.
  # Note: unbiased=True applies Bessel's correction, whereas torch.nn.LayerNorm
  # uses the biased variance estimator.
  lnmean = hpreln.mean(dim=1, keepdim=True)
  lnvar = hpreln.var(dim=1, keepdim=True, unbiased=True)
  lnstd_inv = (lnvar + 1e-5)**-0.5 # 1e-5 guards against division by zero
  lnraw = (hpreln - lnmean) * lnstd_inv
  hpreact = lngain * lnraw + lnbias # (batch_size, n_hidden)

  # Non-linearity
  h = torch.tanh(hpreact) # (batch_size, n_hidden)

  # Linear layer 2
  logits = h @ W2 + b2 # (batch_size, vocab_size)

  # cross entropy loss: negative log likelihood, averaged over the batch
  loss = F.cross_entropy(logits, Yb)

  # backward pass: clear the old gradients, then backpropagate
  for p in parameters:
    p.grad = None

  loss.backward()

  # update: plain SGD with a step decay of the learning rate after 100000 steps.
  # The in-place update runs under no_grad so autograd does not record it.
  lr = 0.1 if i < 100000 else 0.01
  with torch.no_grad():
    for p in parameters:
      p -= lr * p.grad

  # track stats
  if i % 10000 == 0:
    print(f'{i:7d}/{max_iters:7d}: {loss.item():.4f}')
  lossi.append(loss.log10().item())


12297
      0/ 200000: 3.7656
  10000/ 200000: 2.1812
  20000/ 200000: 2.3081
  30000/ 200000: 2.4482
  40000/ 200000: 1.9748
  50000/ 200000: 2.4055
  60000/ 200000: 2.4535
  70000/ 200000: 2.0327
  80000/ 200000: 2.2801
  90000/ 200000: 2.1518
 100000/ 200000: 1.8942
 110000/ 200000: 2.1645
 120000/ 200000: 1.9131
 130000/ 200000: 2.4799
 140000/ 200000: 2.3318
 150000/ 200000: 2.1241
 160000/ 200000: 1.8702
 170000/ 200000: 1.8250
 180000/ 200000: 1.9836
 190000/ 200000: 1.8101


In [8]:
@torch.no_grad()
def split_loss(split):
  """Print the mean cross-entropy loss of the current parameters on one split.

  `split` selects the data: 'train', 'val' or 'test' (any other key raises
  KeyError). The whole split goes through the same forward pass as in training,
  with gradient tracking disabled by the decorator. Nothing is returned.
  """
  datasets = {
      'train': (Xtr, Ytr),
      'val': (Xdev,Ydev),
      'test': (Xte,Yte),
  }
  inputs, targets = datasets[split]

  # embedding lookup, then concatenate the context embeddings of each example
  embedded = C[inputs] # (N, block_size, n_embd)
  flat = embedded.view(embedded.shape[0], -1)

  # linear layer 1
  pre_norm = flat @ W1 + b1

  # layer norm: per-example statistics over the hidden units
  mean = pre_norm.mean(dim=1, keepdim=True)
  var = pre_norm.var(dim=1, keepdim=True, unbiased=True)
  inv_std = (var + 1e-5)**-0.5
  normalized = (pre_norm - mean) * inv_std
  pre_act = lngain * normalized + lnbias

  # non-linearity and linear layer 2
  hidden = torch.tanh(pre_act)
  logits = hidden @ W2 + b2

  loss = F.cross_entropy(logits, targets)
  print(split, loss.item())
  
# Report the loss on the training split, then on the test split.
for split_name in ('train', 'test'):
  split_loss(split_name)

train 2.0661628246307373
test 2.112459182739258


The model with layer normalization achieves performance similar to that of the model with batch normalization.