In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the dataset: one name per line. The context manager closes the file
# handle as soon as the contents are read instead of leaving it open until
# the garbage collector gets to it.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
words[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [3]:
# Alphabet: every distinct character occurring in the names, in sorted order.
chars = sorted(set(''.join(words)))
# Character -> index lookup. Letters are numbered from 1 so that index 0 can
# be given to the special '.' token.
ctoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
ctoi['.'] = 0
# Inverse lookup (index -> character), used to decode sampled indices.
itoc = dict((idx, ch) for ch, idx in ctoi.items())

In [4]:
block_size = 5  # how many preceding character indices make up one context


def build_dataset(words):
    """Turn a list of words into (context, next-character) pairs.

    Each word is terminated with '.', and a window of ``block_size``
    character indices (initially all 0) slides over it. Every character
    contributes one example: the window as it was before the character,
    and the index of the character itself (looked up in ``ctoi``).

    Args:
        words: iterable of strings whose characters are all keys of ``ctoi``.

    Returns:
        (X, Y): ``X`` is the tensor of collected context windows, ``Y`` the
        tensor of the corresponding target indices.
    """
    contexts, targets = [], []

    for word in words:
        window = [0] * block_size
        for char in word + '.':
            target = ctoi[char]
            contexts.append(list(window))
            targets.append(target)
            # slide the window: drop the oldest index, append the newest
            window = window[1:] + [target]

    return torch.tensor(contexts), torch.tensor(targets)

import random

# Shuffle the names in place with a fixed seed so the partition below is
# reproducible from a fresh kernel.
random.seed(42442)
random.shuffle(words)

# Slice boundaries: first 80% -> train, next 10% -> validation, rest -> test.
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

train_words, val_words, test_words = words[:n1], words[n1:n2], words[n2:]

Xtr, Ytr = build_dataset(train_words)
Xval, Yval = build_dataset(val_words)
Xte, Yte = build_dataset(test_words)

In [5]:
# --- model sizes ---
emb_dim = 10      # width of one character embedding
hidden_n = 400    # number of units in the hidden layer
vocab_size = 27   # rows of the embedding table / width of the output layer

# Single seeded generator: every random draw below (and later batch/sample
# draws that pass generator=g) comes from this stream.
g = torch.Generator().manual_seed(42442)

# Embedding table, one emb_dim-wide row per token index.
C = torch.randn((vocab_size, emb_dim), generator=g)

# Hidden layer weights with Kaiming-style scaling: gain 5/3 over sqrt(fan_in).
# No hidden bias is created; the b1 line was commented out:
#b1 = torch.randn(hidden_n, generator=g) * 0
w1_scale = (5/3) / (block_size*emb_dim)**0.5
W1 = torch.randn((block_size*emb_dim, hidden_n), generator=g) * w1_scale

# Output layer: we want roughly even probabilities at init, so W2 is scaled
# down and b2 is zeroed, which keeps all initial logits close to 0.
W2 = torch.randn((hidden_n, vocab_size), generator=g) * 0.01
b2 = torch.randn(vocab_size, generator=g) * 0  # still drawn from g, then zeroed

# Batch-norm affine parameters (trained by gradient descent) ...
bn_gain = torch.ones((1, hidden_n))
bn_bias = torch.zeros((1, hidden_n))
# ... and running statistics (not in `parameters`, so never given gradients).
bn_mean_running = torch.zeros((1, hidden_n))
bn_std_running = torch.ones((1, hidden_n))

# Everything the optimisation loop updates.
parameters = [C, W1, W2, b2, bn_gain, bn_bias]
for p in parameters:
    p.requires_grad_(True)

In [6]:
steps = []
losses = []
batch_size = 32
max_steps = 200000
momentum = 0.01  # weight of the current batch in the running batch-norm statistics

for i in range(1, max_steps+1):

    # ---- forward pass on a random minibatch ----
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)

    # look up the embeddings of every context index in the batch
    emb = C[Xtr[ix]]

    # hidden pre-activations: flatten each example's embeddings into one row
    # and apply the hidden weights (there is no hidden bias)
    h_preact = emb.view(emb.shape[0], -1) @ W1

    # batch normalization: statistics over the batch dimension
    bnmeani = h_preact.mean(dim=0, keepdim=True)
    bnstdi = h_preact.std(dim=0, keepdim=True)

    # standardise, then apply the affine parameters bn_gain and bn_bias
    h_norm = (h_preact - bnmeani) / bnstdi
    h_preact = bn_gain * h_norm + bn_bias

    # Exponential moving averages of the batch statistics, kept out of the
    # autograd graph. The per-batch mean/std are tied to the sampled batch,
    # so evaluation and single-example sampling normalise with these instead.
    with torch.no_grad():
        bn_mean_running = (1-momentum) * bn_mean_running + momentum * bnmeani
        bn_std_running = (1-momentum) * bn_std_running + momentum * bnstdi

    # non-linearity, output layer (logits), and cross-entropy loss
    h = torch.tanh(h_preact)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Ytr[ix])

    # ---- backward pass ----
    for p in parameters:
        p.grad = None
    loss.backward()

    # ---- plain SGD step; the learning rate is halved from step 100000 on ----
    if i < 100000:
        lr = 0.01
    else:
        lr = 0.005
    for p in parameters:
        p.data += -lr * p.grad

    # ---- bookkeeping: log10 of the loss per step, progress every 10000 steps ----
    steps.append(i)
    losses.append(loss.log10().item())

    if i % 10000 == 0:
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')

  10000/ 200000: 2.2781
  20000/ 200000: 2.5970
  30000/ 200000: 1.5391
  40000/ 200000: 2.2231
  50000/ 200000: 2.0124
  60000/ 200000: 1.9687
  70000/ 200000: 2.0999
  80000/ 200000: 2.4764
  90000/ 200000: 2.2261
 100000/ 200000: 2.0969
 110000/ 200000: 2.0286
 120000/ 200000: 1.6450
 130000/ 200000: 1.9005
 140000/ 200000: 1.8320
 150000/ 200000: 2.2927
 160000/ 200000: 2.1711
 170000/ 200000: 2.5743
 180000/ 200000: 2.1165
 190000/ 200000: 1.7322
 200000/ 200000: 2.3175


In [7]:
@torch.no_grad()
def split_loss(split):
    """Print the cross-entropy loss of the model on one data split.

    Args:
        split: one of 'train', 'val' or 'test'; any other key raises
            ``KeyError``.

    The batch-norm step uses the running mean/std rather than statistics of
    the evaluated data. Nothing is returned; the split name and the loss are
    printed.
    """
    datasets = {
        'train': (Xtr, Ytr),
        'val': (Xval, Yval),
        'test': (Xte, Yte)
    }
    x, y = datasets[split]

    # No hidden bias: the normalisation's mean subtraction would cancel it.
    h_preact = C[x].view(-1, block_size*emb_dim) @ W1
    h_norm = (h_preact - bn_mean_running) / bn_std_running
    h = torch.tanh(bn_gain * h_norm + bn_bias)
    logits = h @ W2 + b2
    print(split, F.cross_entropy(logits, y).item())

# Report the loss on the training and validation splits.
for split_name in ('train', 'val'):
    split_loss(split_name)

train 2.043363332748413
val 2.1029765605926514


In [8]:
#sample from the mlp
@torch.no_grad()
def sample():
    """Generate one name from the MLP and print it.

    Starts from an all-zero context of ``block_size`` indices and repeatedly
    draws the next character index from the model's softmax distribution
    (using generator ``g``) until index 0 is drawn. The printed string
    includes the character for that final index.

    Runs under ``torch.no_grad()``, like ``split_loss``: the parameters
    require grad but sampling never backpropagates, so recording the forward
    pass for autograd would only cost memory and time. The sampled output is
    unaffected.
    """
    context = [0] * block_size
    output = []
    while True:
        emb = C[torch.tensor([context])]
        h_preact = emb.view(1, -1) @ W1
        # Single example, so normalise with the running statistics.
        h_preact = bn_gain * ((h_preact - bn_mean_running) / bn_std_running) + bn_bias
        h = torch.tanh(h_preact)
        logits = h @ W2 + b2
        dist = F.softmax(logits, dim=1)
        y = torch.multinomial(dist, num_samples=1, generator=g).item()
        # Slide the context window forward by the sampled index.
        context = context[1:] + [y]
        output.append(y)
        if y == 0:
            break
    print(''.join(itoc[i] for i in output))

# Print ten generated names.
num_samples = 10
for _ in range(num_samples):
    sample()

deandreya.
phanner.
zymis.
leyanaelwan.
ochar.
kimmerleigh.
kation.
calen.
carmindyy.
tayri.
