# Building Makemore MLP Exercise

Training Loss: 2.12

Validation Loss: 2.17

## Imports

In [None]:
import numpy
import torch
import torch.nn.functional as F
from rich import print
from rich import pretty
import matplotlib.pyplot as plot
import random
import math

In [None]:
# Dedicated random generator with a fixed seed (42); train() passes it to
# torch.randn when it initializes the model parameters.
g = torch.Generator()
g.manual_seed(42)

## Setup

In [None]:
# Read one name per line; the context manager closes the file handle as soon
# as the contents have been read.
with open('../data/names.txt', 'r') as f:
    words = f.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [None]:
len(words)

32033

In [None]:
def generate_training_set(words, block_size, print_disabled=False):
    """Build (context, next-character) index pairs from a list of words.

    Every word is terminated with '.', and for each character (including the
    terminator) the indices of the ``block_size`` preceding characters are
    recorded as the input and the character's own index as the target. The
    context starts as all zeros (0 is the index of '.').

    Note: the vocabulary is derived from ``words`` itself (sorted unique
    characters mapped to 1..n, '.' mapped to 0), so the character-to-index
    mapping depends on which characters occur in the words passed in.

    Args:
        words: list of strings.
        block_size: number of context characters per example.
        print_disabled: despite its name, passing True *enables* printing of
            every word and every ``context ---> target`` pair; the default
            False prints nothing.

    Returns:
        Tuple ``(X, Y)`` of int64 tensors with shapes ``(N, block_size)`` and
        ``(N,)``. For an empty ``words`` list, N is 0.
    """
    chars = sorted(set(''.join(words)))
    stoi = {s: i + 1 for i, s in enumerate(chars)}
    stoi['.'] = 0
    itos = {i: s for s, i in stoi.items()}

    X, Y = [], []

    for w in words:
        if print_disabled:
            print(w)

        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            if print_disabled:
                print(''.join(itos[i] for i in context), '--->', itos[ix])
            context = context[1:] + [ix]  # crop and append

    # Pin dtype and shape explicitly: torch.tensor([]) would otherwise give a
    # float tensor of shape (0,), which cannot be used to index an embedding.
    X = torch.tensor(X, dtype=torch.long).reshape(len(X), block_size)
    Y = torch.tensor(Y, dtype=torch.long)
    return X, Y

In [None]:
# Context/target pairs for the full word list, using a 5-character context window.
X, Y = generate_training_set(words, 5)

In [None]:
X.shape, Y.shape

(torch.Size([228146, 5]), torch.Size([228146]))

In [None]:
def generate_train_valid_test_split(words, block_size=3):
    """Shuffle the words deterministically and build 80/10/10 train/dev/test sets.

    The input list is not modified: a copy is shuffled with a private
    ``random.Random(42)`` instance, so calling this function repeatedly on the
    same list always yields the same split and the global ``random`` state is
    left untouched.

    Args:
        words: list of strings.
        block_size: context length forwarded to ``generate_training_set``.

    Returns:
        ``(Xtr, Ytr, Xdev, Ydev, Xte, Yte)`` as produced by
        ``generate_training_set`` on the first 80%, the next 10% and the
        last 10% of the shuffled words.
    """
    # Shuffle a copy; shuffling the caller's list in place would make a second
    # call start from an already-shuffled order and produce a different split.
    shuffled = list(words)
    random.Random(42).shuffle(shuffled)

    n1 = int(0.8 * len(shuffled))
    n2 = int(0.9 * len(shuffled))

    Xtr, Ytr = generate_training_set(shuffled[:n1], block_size)
    Xdev, Ydev = generate_training_set(shuffled[n1:n2], block_size)
    Xte, Yte = generate_training_set(shuffled[n2:], block_size)

    return Xtr, Ytr, Xdev, Ydev, Xte, Yte

In [None]:
# 80/10/10 train/dev/test split. block_size=5 here must match the block_size
# passed to train() and evaluate_loss() in the cells below.
Xtr, Ytr, Xdev, Ydev, Xte, Yte = generate_train_valid_test_split(words, block_size=5)

In [None]:
Xtr.shape, Ytr.shape

(torch.Size([182625, 5]), torch.Size([182625]))

In [None]:
Xdev.shape, Ydev.shape

(torch.Size([22655, 5]), torch.Size([22655]))

In [None]:
Xte.shape, Yte.shape

(torch.Size([22866, 5]), torch.Size([22866]))

## E01

Tune the hyperparameters of the training to beat the validation loss of 2.2

   - number of neurons in the hidden layer
    
   - embedding size
    
   - number of context characters (block size)
    
   - epochs (here, the number of minibatch steps)
    
   - learning rate; change/decay it over the epochs
    
   - batch size

In [None]:
def evaluate_loss(parameters, X, Y, block_size=3, embedding_size=10):
    """Run the MLP forward pass and return the cross-entropy loss.

    Args:
        parameters: sequence ``(C, W1, b1, W2, b2)`` - embedding table,
            hidden-layer weights and bias, output-layer weights and bias.
        X: integer tensor of context indices used to index into ``C``.
        Y: tensor of target indices passed to ``F.cross_entropy``.
        block_size: context length; together with ``embedding_size`` it sets
            the width the embeddings are flattened to.
        embedding_size: size of each embedding vector.

    Returns:
        Scalar loss tensor (still attached to the autograd graph when the
        parameters require gradients).
    """
    C, W1, b1, W2, b2 = parameters

    # Look up the embeddings and concatenate the per-position vectors of each
    # example into one row of width block_size * embedding_size.
    flat_width = block_size * embedding_size
    flattened = C[X].view(-1, flat_width)

    hidden = torch.tanh(flattened @ W1 + b1)
    return F.cross_entropy(hidden @ W2 + b2, Y)

In [None]:
def _regularization_loss(parameters, lambdas):
    """Return the weighted sum of the mean squared entries of C, W1 and W2.

    Args:
        parameters: sequence laid out as ``(C, W1, b1, W2, b2)``; only
            positions 0, 1 and 3 are read, so the biases are not penalized.
        lambdas: three coefficients applied to C, W1 and W2 respectively.

    Returns:
        ``lambdas[0]*mean(C**2) + lambdas[1]*mean(W1**2) + lambdas[2]*mean(W2**2)``.
    """
    weights = (parameters[0], parameters[1], parameters[3])
    penalties = [(w ** 2).mean() for w in weights]

    return (
        lambdas[0] * penalties[0]
        + lambdas[1] * penalties[1]
        + lambdas[2] * penalties[2]
    )

In [None]:
def train(X,
          Y,
          epochs,
          block_size=3,
          embedding_size=10,
          hidden_neuron=300,
          bs=32,
          lr=0.1,
          parameters=None,
          lambdas=(0, 0, 0),
          enable_print=True,
          vocab_size=27):
    """Train the character-level MLP with minibatch gradient descent.

    Args:
        X: tensor of context indices; rows are sampled along dimension 0.
        Y: tensor of target indices aligned with ``X``.
        epochs: number of update steps. Each "epoch" here is a single
            minibatch step, not a full pass over the data.
        block_size: context length, forwarded to ``evaluate_loss``.
        embedding_size: embedding width, forwarded to ``evaluate_loss``.
        hidden_neuron: number of hidden units (used only when the parameters
            are freshly initialized).
        bs: minibatch size.
        lr: learning rate, constant for the whole call.
        parameters: existing ``[C, W1, b1, W2, b2]`` to continue training
            from. When None or empty, new parameters are drawn with
            ``torch.randn`` using the module-level generator ``g``.
        lambdas: three regularization coefficients for C, W1 and W2,
            forwarded to ``_regularization_loss``.
        enable_print: print the step index and loss after every step.
        vocab_size: number of distinct tokens (rows of C and number of output
            logits); used only when the parameters are freshly initialized.

    Returns:
        ``(parameters, loss)`` where ``loss`` is the float loss (including the
        regularization term) of the last minibatch, or None when
        ``epochs`` is 0 and no step was taken.
    """
    if not parameters:
        C = torch.randn((vocab_size, embedding_size), generator=g)
        W1 = torch.randn((block_size * embedding_size, hidden_neuron), generator=g)
        b1 = torch.randn(hidden_neuron, generator=g)
        W2 = torch.randn((hidden_neuron, vocab_size), generator=g)
        b2 = torch.randn(vocab_size, generator=g)
        parameters = [C, W1, b1, W2, b2]

    for p in parameters:
        p.requires_grad = True

    # Stays None if the loop body never runs (epochs == 0).
    last_loss = None

    for epoch in range(epochs):
        # Sample the minibatch from the seeded generator as well, so that the
        # whole run (initialization and batch order) is reproducible.
        ix = torch.randint(0, X.shape[0], (bs,), generator=g)

        loss = evaluate_loss(parameters, X[ix], Y[ix], block_size, embedding_size)
        loss = loss + _regularization_loss(parameters, lambdas)

        for p in parameters:
            p.grad = None
        loss.backward()

        # Update the weights outside of autograd tracking.
        with torch.no_grad():
            for p in parameters:
                p -= lr * p.grad

        last_loss = loss.item()
        if enable_print:
            print(epoch, last_loss)

    return parameters, last_loss

#### 1st try

In [None]:
# Fresh run (no parameters passed, so train() initializes them):
# 50,000 minibatch steps, batch size 1800, lr 0.1.
parameters, loss = train(Xtr, Ytr, 50_000, block_size=5, embedding_size=50, hidden_neuron=100, bs=1800, lr=0.1, enable_print=False)

In [None]:
loss

2.0942413806915283

In [None]:
evaluate_loss(parameters, Xdev, Ydev, block_size=5, embedding_size=50)

tensor(2.2013, grad_fn=<NllLossBackward0>)

#### 2nd try

In [None]:
# Continue from the current `parameters` with a 10x smaller learning rate (0.01).
parameters, loss = train(Xtr, Ytr, 50_000, block_size=5, embedding_size=50, hidden_neuron=100, bs=1800, lr=0.01, parameters=parameters, enable_print=False)

In [None]:
loss

2.1155266761779785

In [None]:
evaluate_loss(parameters, Xdev, Ydev, block_size=5, embedding_size=50)

tensor(2.1946, grad_fn=<NllLossBackward0>)

#### 3rd try

In [None]:
# Continue from the current `parameters` with lr 0.001 and a 10x larger batch (18000).
parameters, loss = train(Xtr, Ytr, 50_000, block_size=5, embedding_size=50, hidden_neuron=100, bs=18000, lr=0.001, parameters=parameters, enable_print=False)

In [None]:
loss

2.1173627376556396

In [None]:
evaluate_loss(parameters, Xdev, Ydev, block_size=5, embedding_size=50)

tensor(2.1939, grad_fn=<NllLossBackward0>)

#### 4th try

In [None]:
# Short continuation from the current `parameters`: 60 steps, batch size 512, lr 0.001.
parameters, loss = train(Xtr, Ytr, 60, block_size=5, embedding_size=50, hidden_neuron=100, bs=512, lr=0.001, parameters=parameters, enable_print=False)

In [None]:
loss

2.179168701171875

In [None]:
evaluate_loss(parameters, Xdev, Ydev, block_size=5, embedding_size=50)

tensor(2.2677, grad_fn=<NllLossBackward0>)

#### 5th try

In [None]:
# 100 more steps from the current `parameters`, batch size 512, lr halved to 0.0005.
parameters, loss = train(Xtr, Ytr, 100, block_size=5, embedding_size=50, hidden_neuron=100, bs=512, lr=0.0005, parameters=parameters, enable_print=False)

In [None]:
loss

2.254889965057373

In [None]:
evaluate_loss(parameters, Xdev, Ydev, block_size=5, embedding_size=50)

tensor(2.2668, grad_fn=<NllLossBackward0>)

#### 6th try

In [None]:
# Long low-learning-rate continuation: 300,000 steps, batch size 128, lr 0.0001.
parameters, loss = train(Xtr, Ytr, 300_000, block_size=5, embedding_size=50, hidden_neuron=100, bs=128,  lr=0.0001, parameters=parameters, enable_print=False)

In [None]:
loss

2.2203402519226074

In [None]:
evaluate_loss(parameters, Xdev, Ydev, block_size=5, embedding_size=50)

tensor(2.1860, grad_fn=<NllLossBackward0>)

#### Evaluate on the test set

In [None]:
evaluate_loss(parameters, Xte, Yte, block_size=5, embedding_size=50)

tensor(2.3040, grad_fn=<NllLossBackward0>)