[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/baggiponte/makemore/blob/main/notebooks/mlp.ipynb)

# Setup

In [None]:
try:
    from makemore.datasets import fetch_names
except ModuleNotFoundError:
    !pip install --quiet -- makemore
    from makemore.datasets import fetch_names

import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [None]:
# Load the names dataset, shuffled with a fixed seed so that the positional
# train/validation/test split made later is the same on every run
# (assumes `fetch_names` shuffles deterministically for a given seed — TODO confirm).
names = fetch_names(shuffle=True, seed=42)

# Obtain Training Data

Here are some special parameters, called "hyperparameters", that you can tweak manually:

In [None]:
# n-gram context length passed to `names.get_ngrams`; together with
# EMBEDDING_DIMENSIONS it fixes the hidden layer's input width
# (CONTEXT_SIZE * EMBEDDING_DIMENSIONS).
CONTEXT_SIZE = 3

# Cumulative split boundaries, as fractions of the dataset length:
#   [0, TRAIN_SIZE)          -> training set
#   [TRAIN_SIZE, TEST_SIZE)  -> validation set
#   [TEST_SIZE, 1]           -> test set
# TEST_SIZE is therefore the position where the test set starts, not its share.
TRAIN_SIZE = 0.8
TEST_SIZE = 0.9

# Number of columns in the embedding lookup table C (one row per token).
EMBEDDING_DIMENSIONS = 10
# Number of units in the tanh hidden layer (columns of W1, rows of W2).
HIDDEN_LAYER_NEURONS = 200

# Number of gradient-descent updates, and examples sampled for each update.
TRAINING_STEPS = 200_000
MINIBATCH_SIZE = 32

In [None]:
# Turn the names into n-gram examples: `context` holds the model inputs and
# `labels` the matching targets.
context, labels = names.get_ngrams(CONTEXT_SIZE, as_tensor=True)

# Convert the fractional cut-offs into row indices.
n_examples = len(context)
training_index = int(TRAIN_SIZE * n_examples)
test_index = int(TEST_SIZE * n_examples)

# Slice consecutively, keeping inputs and labels aligned for every split.
X_train, y_train = context[:training_index], labels[:training_index]
X_validation, y_validation = (
    context[training_index:test_index],
    labels[training_index:test_index],
)
X_test, y_test = context[test_index:], labels[test_index:]

In [None]:
# Report how many inputs and labels ended up in each split; the two counts on
# each line should agree.
split_report = "\n".join(
    [
        f"Train set:\tX: {len(X_train)}\ty:{len(y_train)}",
        f"Validation set:\tX: {len(X_validation)}\ty:{len(y_validation)}",
        f"Test set:\tX: {len(X_test)}\ty:{len(y_test)}",
    ]
)
print(split_report)

# Initialise Parameters

In [None]:
g = torch.Generator().manual_seed(42)  # for reproducibility

# Number of distinct tokens: rows of the embedding table and width of the
# output layer. Named once instead of repeating the literal 27.
# NOTE(review): presumably 26 letters plus the index-0 boundary token —
# confirm against makemore's vocabulary.
VOCAB_SIZE = 27

# Width of the flattened context fed to the hidden layer.
HIDDEN_LAYER_INPUTS = CONTEXT_SIZE * EMBEDDING_DIMENSIONS

# The tensors are drawn from `g` in a fixed order (C, W1, b1, W2, b2);
# reordering these lines would change the initial values.

# Embedding lookup table: one EMBEDDING_DIMENSIONS-wide row per token.
C = torch.randn((VOCAB_SIZE, EMBEDDING_DIMENSIONS), generator=g, requires_grad=True)

# Hidden layer.
W1 = torch.randn((HIDDEN_LAYER_INPUTS, HIDDEN_LAYER_NEURONS), generator=g, requires_grad=True)
b1 = torch.randn(HIDDEN_LAYER_NEURONS, generator=g, requires_grad=True)

# Output layer: one logit per token.
W2 = torch.randn((HIDDEN_LAYER_NEURONS, VOCAB_SIZE), generator=g, requires_grad=True)
b2 = torch.randn(VOCAB_SIZE, generator=g, requires_grad=True)

# Everything the training loop updates.
parameters = [C, W1, b1, W2, b2]

In [None]:
# Total number of trainable scalars across all parameter tensors.
sum(map(torch.Tensor.nelement, parameters))

# Train

In [None]:
# Per-step training history, appended to by the training loop.
# Re-run this cell before re-running the training loop to start a clean history.
lri = []  # NOTE(review): never written or read in the visible cells — presumably meant for learning rates; confirm or remove
lossi = []  # log10 of the minibatch loss at each step
stepi = []  # step index matching each entry of `lossi`

In [None]:
%%time
for i in range(TRAINING_STEPS):

    # minibatch construct
    ix = torch.randint(0, X_train.shape[0], (MINIBATCH_SIZE,))
  
    # forward pass
    emb = C[X_train[ix]]
    h = torch.tanh(emb.view(-1, CONTEXT_SIZE * EMBEDDING_DIMENSIONS) @ W1 + b1)
    logits = h @ W2 + b2
  
    loss = F.cross_entropy(logits, y_train[ix])
  
    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    
    # update
    lr = 0.1 if i < 100000 else 0.01
    for p in parameters:
        p.data += -lr * p.grad

    if i % 10000 == 0: # print every once in a while
        print(f'{i:7d}/{TRAINING_STEPS:7d}: {loss.item():.4f}')
    
    stepi.append(i)
    lossi.append(loss.log10().item())

In [None]:
# Training curve: log10 of the minibatch loss (as stored in `lossi`) against
# the step index. Assigning to `_` keeps the notebook from echoing the
# value returned by `plt.plot`.
_ = plt.plot(stepi, lossi)

## Training Loss

In [None]:
# Cross-entropy over the whole training split (mean over examples, the
# default reduction). Only the forward pass is needed, so autograd tracking is
# switched off to avoid building a graph over the full split.
with torch.no_grad():
    emb = C[X_train]
    h = torch.tanh(emb.view(-1, CONTEXT_SIZE * EMBEDDING_DIMENSIONS) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, y_train)
loss.item()

## Validation Loss

In [None]:
# Cross-entropy over the whole validation split (mean over examples, the
# default reduction). Only the forward pass is needed, so autograd tracking is
# switched off to avoid building a graph over the full split.
with torch.no_grad():
    emb = C[X_validation]
    h = torch.tanh(emb.view(-1, CONTEXT_SIZE * EMBEDDING_DIMENSIONS) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, y_validation)
loss.item()

# Visualise Embeddings

In [None]:
from makemore.utils import INT_TO_STRING

# Scatter dimensions 0 and 1 of the embedding matrix C, one point per row
# (token), and write the token's string on top of its point. Only these two of
# the EMBEDDING_DIMENSIONS columns are shown.
coords = C.detach()  # plotting needs plain values, not autograd-tracked tensors
plt.figure(figsize=(8, 8))
plt.scatter(coords[:, 0], coords[:, 1], s=200)
for i in range(coords.shape[0]):
    plt.text(
        coords[i, 0].item(),
        coords[i, 1].item(),
        INT_TO_STRING[i],
        ha="center",
        va="center",
        color="white",
    )
# Turn the (major) grid on explicitly. The previous `plt.grid('minor')` passed
# the string as the truthy visibility flag, not as `which`, so it drew the same grid.
plt.grid(True)

# Test loss

In [None]:
# Cross-entropy over the whole test split (mean over examples, the default
# reduction). The flattened width comes from the hyperparameters rather than a
# literal, so this cell keeps working when CONTEXT_SIZE or
# EMBEDDING_DIMENSIONS change. Autograd is off: only the forward pass is needed.
with torch.no_grad():
    emb = C[X_test]  # (N, CONTEXT_SIZE, EMBEDDING_DIMENSIONS)
    h = torch.tanh(emb.view(-1, CONTEXT_SIZE * EMBEDDING_DIMENSIONS) @ W1 + b1)  # (N, HIDDEN_LAYER_NEURONS)
    logits = h @ W2 + b2  # (N, 27)
    loss = F.cross_entropy(logits, y_test)
loss.item()

# Generate names

In [None]:
# Sample 20 names from the model, one token at a time.
# A separate seed makes the samples repeatable; note that this rebinds `g`.
g = torch.Generator().manual_seed(42 + 10)

# Sampling only runs forward passes, so autograd tracking is switched off.
with torch.no_grad():
    for _ in range(20):
        out = []
        # Start from a context made entirely of token 0. A dedicated name is
        # used so the dataset tensor `context` is not overwritten.
        window = [0] * CONTEXT_SIZE
        while True:
            emb = C[torch.tensor([window])]  # (1, CONTEXT_SIZE, EMBEDDING_DIMENSIONS)
            h = torch.tanh(emb.view(1, -1) @ W1 + b1)
            logits = h @ W2 + b2
            probs = F.softmax(logits, dim=1)
            ix = torch.multinomial(probs, num_samples=1, generator=g).item()
            # Slide the window: drop the oldest token, append the new one.
            window = window[1:] + [ix]
            out.append(ix)
            if ix == 0:  # token 0 ends the name (it is still part of `out`)
                break

        print(''.join(INT_TO_STRING[i] for i in out))