In [46]:
import torch
import torch.nn.functional as F
import optuna
import matplotlib.pyplot as plt

In [47]:
# read in all the words
words = open('makemore/names.txt', 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [48]:
# build the dataset

def build_dataset(words, block_size):
    X, Y = [], []
    for w in words:
    
        # print(w)
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            # print(''.join(itos[i] for i in context), '--->', itos[ix])
            context = context[1:] + [ix] # crop and append
            
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y

# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s, i in stoi.items()}

In [49]:
def train_model(steps, batch_size, learning_rate, l1_num_inputs, x, y, stepi, lossi):
    for p in parameters:
        p.requires_grad = True

    for i in range(steps):
        ix = torch.randint(0, x.shape[0], (batch_size,))
        
        # Forward pass
        emb = C[x[ix]]  # Shape: (batch_size, block_size, embedding_dims)
        emb = emb.view(batch_size, -1)  # Flatten properly
        
        h = torch.tanh(emb @ W1 + b1)  # Now matches shape (batch_size, l2_num_neurons)
        logits = h @ W2 + b2  # Shape: (batch_size, 27)
        loss = F.cross_entropy(logits, y[ix])
        
        # Backward pass
        for p in parameters:
            p.grad = None
        loss.backward()

        for p in parameters:
            p.data += -learning_rate * p.grad

        stepi.append(i)
        lossi.append(loss.log10().item())


def total_loss(x, y, l1_num_inputs):
    emb = C[x] # (batch_size, block_size, l1_num_inputs)
    h = torch.tanh(emb.view(-1, l1_num_inputs) @ W1 + b1) # (l1_num_inputs, l2_num_neurons)
    logits = h @ W2 + b2 # (batch_size, 27)
    loss = F.cross_entropy(logits, y)
    return loss.item()

In [50]:
def get_samples(num_samples):
    # sample from the model
    g = torch.Generator().manual_seed(2147483647 + 10)
    
    for _ in range(20):
    
        out = []
        context = [0] * block_size # initialize with all ...
        while True:
            emb = C[torch.tensor([context])] # (1, block_size, d)
            h = torch.tanh(emb.view(1, -1) @ W1 + b1)
            logits = h @ W2 + b2
            probs = F.softmax(logits, dim=1)
            ix = torch.multinomial(probs, num_samples=1, generator=g).item()
            context = context[1:] + [ix]
            out.append(ix)
            if ix == 0:
                break
    
        print(''.join(itos[i] for i in out))

In [55]:
def objective(trial):
    # Suggest hyperparameters
    block_size = trial.suggest_int("block_size", 8, 12)  # Context length
    embedding_dims = trial.suggest_int("embedding_dims", 8, 12)  # Embedding size
    l2_num_neurons = trial.suggest_int("l2_num_neurons", 50, 150)  # Hidden layer size
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])  # Batch size
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 1e-1, log=True)  # Learning rate

    # Build datasets with new block_size
    Xtr, Ytr = build_dataset(words[:n1], block_size)
    Xdev, Ydev = build_dataset(words[n1:n2], block_size)

    # Define model parameters
    l1_num_inputs = embedding_dims * block_size
    g = torch.Generator().manual_seed(2147483647)
    
    global C, W1, b1, W2, b2  # Ensure variables are accessible
    C = torch.randn((27, embedding_dims), generator=g)
    W1 = torch.randn((l1_num_inputs, l2_num_neurons), generator=g)
    b1 = torch.randn(l2_num_neurons, generator=g)
    W2 = torch.randn((l2_num_neurons, 27), generator=g)
    b2 = torch.randn(27, generator=g)
    
    parameters = [C, W1, b1, W2, b2]
    for p in parameters:
        p.requires_grad = True

    # Training loop
    for i in range(50000):  # Reduce iterations for speed
        ix = torch.randint(0, Xtr.shape[0], (batch_size,))
        emb = C[Xtr[ix]]
        h = torch.tanh(emb.view(-1, l1_num_inputs) @ W1 + b1)
        logits = h @ W2 + b2
        loss = F.cross_entropy(logits, Ytr[ix])

        # Backprop
        for p in parameters:
            p.grad = None
        loss.backward()

        for p in parameters:
            p.data += -learning_rate * p.grad

    # Compute validation loss
    val_loss = total_loss(Xdev, Ydev, l1_num_inputs)
    
    return val_loss  # Optuna minimizes by default

In [56]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)  # Try 30 different sets of hyperparameters

print("Best hyperparameters:", study.best_params)

[I 2025-03-01 15:05:28,172] A new study created in memory with name: no-name-55c65799-95c8-4a8b-8db3-ac793ef8ab40


torch.Size([182778, 10]) torch.Size([182778])
torch.Size([22633, 10]) torch.Size([22633])


[I 2025-03-01 15:05:35,737] Trial 0 finished with value: 2.6170952320098877 and parameters: {'block_size': 10, 'embedding_dims': 10, 'l2_num_neurons': 60, 'batch_size': 32, 'learning_rate': 0.050675099941790085}. Best is trial 0 with value: 2.6170952320098877.


torch.Size([182778, 10]) torch.Size([182778])
torch.Size([22633, 10]) torch.Size([22633])


[I 2025-03-01 15:05:44,663] Trial 1 finished with value: 2.6354153156280518 and parameters: {'block_size': 10, 'embedding_dims': 9, 'l2_num_neurons': 148, 'batch_size': 64, 'learning_rate': 0.0668998711142591}. Best is trial 0 with value: 2.6170952320098877.


torch.Size([182778, 9]) torch.Size([182778])
torch.Size([22633, 9]) torch.Size([22633])


[I 2025-03-01 15:05:52,874] Trial 2 finished with value: 2.5692782402038574 and parameters: {'block_size': 9, 'embedding_dims': 9, 'l2_num_neurons': 111, 'batch_size': 64, 'learning_rate': 0.029995485554564827}. Best is trial 2 with value: 2.5692782402038574.


torch.Size([182778, 11]) torch.Size([182778])
torch.Size([22633, 11]) torch.Size([22633])


[I 2025-03-01 15:06:01,944] Trial 3 finished with value: 2.7104671001434326 and parameters: {'block_size': 11, 'embedding_dims': 12, 'l2_num_neurons': 64, 'batch_size': 64, 'learning_rate': 0.01179814900105327}. Best is trial 2 with value: 2.5692782402038574.


torch.Size([182778, 10]) torch.Size([182778])
torch.Size([22633, 10]) torch.Size([22633])


[I 2025-03-01 15:06:09,952] Trial 4 finished with value: 2.6507132053375244 and parameters: {'block_size': 10, 'embedding_dims': 11, 'l2_num_neurons': 138, 'batch_size': 32, 'learning_rate': 0.04380368997709687}. Best is trial 2 with value: 2.5692782402038574.


torch.Size([182778, 10]) torch.Size([182778])
torch.Size([22633, 10]) torch.Size([22633])


[I 2025-03-01 15:06:18,998] Trial 5 finished with value: 2.590683937072754 and parameters: {'block_size': 10, 'embedding_dims': 9, 'l2_num_neurons': 118, 'batch_size': 64, 'learning_rate': 0.022109186319559136}. Best is trial 2 with value: 2.5692782402038574.


torch.Size([182778, 11]) torch.Size([182778])
torch.Size([22633, 11]) torch.Size([22633])


[I 2025-03-01 15:06:27,437] Trial 6 finished with value: 2.6831870079040527 and parameters: {'block_size': 11, 'embedding_dims': 10, 'l2_num_neurons': 83, 'batch_size': 64, 'learning_rate': 0.014803571655093887}. Best is trial 2 with value: 2.5692782402038574.


torch.Size([182778, 12]) torch.Size([182778])
torch.Size([22633, 12]) torch.Size([22633])


[I 2025-03-01 15:06:35,290] Trial 7 finished with value: 2.815549373626709 and parameters: {'block_size': 12, 'embedding_dims': 8, 'l2_num_neurons': 75, 'batch_size': 32, 'learning_rate': 0.028701207485083802}. Best is trial 2 with value: 2.5692782402038574.


torch.Size([182778, 8]) torch.Size([182778])
torch.Size([22633, 8]) torch.Size([22633])


[I 2025-03-01 15:06:42,508] Trial 8 finished with value: 2.609130382537842 and parameters: {'block_size': 8, 'embedding_dims': 9, 'l2_num_neurons': 96, 'batch_size': 16, 'learning_rate': 0.024232063581398226}. Best is trial 2 with value: 2.5692782402038574.


torch.Size([182778, 12]) torch.Size([182778])
torch.Size([22633, 12]) torch.Size([22633])


[I 2025-03-01 15:06:51,445] Trial 9 finished with value: 2.68961501121521 and parameters: {'block_size': 12, 'embedding_dims': 8, 'l2_num_neurons': 90, 'batch_size': 64, 'learning_rate': 0.011725269925454284}. Best is trial 2 with value: 2.5692782402038574.


torch.Size([182778, 8]) torch.Size([182778])
torch.Size([22633, 8]) torch.Size([22633])


[I 2025-03-01 15:06:59,126] Trial 10 finished with value: 2.730764150619507 and parameters: {'block_size': 8, 'embedding_dims': 11, 'l2_num_neurons': 111, 'batch_size': 16, 'learning_rate': 0.08575808815031853}. Best is trial 2 with value: 2.5692782402038574.


torch.Size([182778, 9]) torch.Size([182778])
torch.Size([22633, 9]) torch.Size([22633])


[I 2025-03-01 15:07:08,580] Trial 11 finished with value: 2.6047770977020264 and parameters: {'block_size': 9, 'embedding_dims': 9, 'l2_num_neurons': 120, 'batch_size': 64, 'learning_rate': 0.020550291046911945}. Best is trial 2 with value: 2.5692782402038574.


torch.Size([182778, 9]) torch.Size([182778])
torch.Size([22633, 9]) torch.Size([22633])


[I 2025-03-01 15:07:17,016] Trial 12 finished with value: 2.5981767177581787 and parameters: {'block_size': 9, 'embedding_dims': 9, 'l2_num_neurons': 122, 'batch_size': 64, 'learning_rate': 0.03544218319969736}. Best is trial 2 with value: 2.5692782402038574.


torch.Size([182778, 9]) torch.Size([182778])
torch.Size([22633, 9]) torch.Size([22633])


[I 2025-03-01 15:07:25,523] Trial 13 finished with value: 2.6111576557159424 and parameters: {'block_size': 9, 'embedding_dims': 8, 'l2_num_neurons': 105, 'batch_size': 64, 'learning_rate': 0.018410111096850753}. Best is trial 2 with value: 2.5692782402038574.


torch.Size([182778, 9]) torch.Size([182778])
torch.Size([22633, 9]) torch.Size([22633])


[I 2025-03-01 15:07:35,799] Trial 14 finished with value: 2.602823495864868 and parameters: {'block_size': 9, 'embedding_dims': 10, 'l2_num_neurons': 133, 'batch_size': 64, 'learning_rate': 0.032234210095418776}. Best is trial 2 with value: 2.5692782402038574.


torch.Size([182778, 11]) torch.Size([182778])
torch.Size([22633, 11]) torch.Size([22633])


[I 2025-03-01 15:07:43,430] Trial 15 finished with value: 2.6976583003997803 and parameters: {'block_size': 11, 'embedding_dims': 9, 'l2_num_neurons': 122, 'batch_size': 16, 'learning_rate': 0.018099580715354967}. Best is trial 2 with value: 2.5692782402038574.


torch.Size([182778, 8]) torch.Size([182778])
torch.Size([22633, 8]) torch.Size([22633])


[I 2025-03-01 15:07:52,486] Trial 16 finished with value: 2.552072525024414 and parameters: {'block_size': 8, 'embedding_dims': 11, 'l2_num_neurons': 110, 'batch_size': 64, 'learning_rate': 0.04181795266581569}. Best is trial 16 with value: 2.552072525024414.


torch.Size([182778, 8]) torch.Size([182778])
torch.Size([22633, 8]) torch.Size([22633])


[I 2025-03-01 15:08:01,476] Trial 17 finished with value: 2.5411388874053955 and parameters: {'block_size': 8, 'embedding_dims': 11, 'l2_num_neurons': 102, 'batch_size': 64, 'learning_rate': 0.05572763019715074}. Best is trial 17 with value: 2.5411388874053955.


torch.Size([182778, 8]) torch.Size([182778])
torch.Size([22633, 8]) torch.Size([22633])


[I 2025-03-01 15:08:08,370] Trial 18 finished with value: 2.764185667037964 and parameters: {'block_size': 8, 'embedding_dims': 12, 'l2_num_neurons': 96, 'batch_size': 16, 'learning_rate': 0.05762368423480465}. Best is trial 17 with value: 2.5411388874053955.


torch.Size([182778, 8]) torch.Size([182778])
torch.Size([22633, 8]) torch.Size([22633])


[I 2025-03-01 15:08:15,737] Trial 19 finished with value: 2.583416700363159 and parameters: {'block_size': 8, 'embedding_dims': 11, 'l2_num_neurons': 76, 'batch_size': 32, 'learning_rate': 0.09471611724987403}. Best is trial 17 with value: 2.5411388874053955.


torch.Size([182778, 8]) torch.Size([182778])
torch.Size([22633, 8]) torch.Size([22633])


[I 2025-03-01 15:08:23,715] Trial 20 finished with value: 2.6038193702697754 and parameters: {'block_size': 8, 'embedding_dims': 11, 'l2_num_neurons': 52, 'batch_size': 64, 'learning_rate': 0.040844920489376074}. Best is trial 17 with value: 2.5411388874053955.


torch.Size([182778, 9]) torch.Size([182778])
torch.Size([22633, 9]) torch.Size([22633])


[I 2025-03-01 15:08:32,298] Trial 21 finished with value: 2.6122822761535645 and parameters: {'block_size': 9, 'embedding_dims': 10, 'l2_num_neurons': 107, 'batch_size': 64, 'learning_rate': 0.07140580354816159}. Best is trial 17 with value: 2.5411388874053955.


torch.Size([182778, 8]) torch.Size([182778])
torch.Size([22633, 8]) torch.Size([22633])


[I 2025-03-01 15:08:41,219] Trial 22 finished with value: 2.544375419616699 and parameters: {'block_size': 8, 'embedding_dims': 12, 'l2_num_neurons': 111, 'batch_size': 64, 'learning_rate': 0.046218065496292864}. Best is trial 17 with value: 2.5411388874053955.


torch.Size([182778, 8]) torch.Size([182778])
torch.Size([22633, 8]) torch.Size([22633])


[I 2025-03-01 15:08:50,691] Trial 23 finished with value: 2.6001038551330566 and parameters: {'block_size': 8, 'embedding_dims': 12, 'l2_num_neurons': 131, 'batch_size': 64, 'learning_rate': 0.04644252971688768}. Best is trial 17 with value: 2.5411388874053955.


torch.Size([182778, 8]) torch.Size([182778])
torch.Size([22633, 8]) torch.Size([22633])


[I 2025-03-01 15:08:59,297] Trial 24 finished with value: 2.5592217445373535 and parameters: {'block_size': 8, 'embedding_dims': 12, 'l2_num_neurons': 101, 'batch_size': 64, 'learning_rate': 0.06043038502595976}. Best is trial 17 with value: 2.5411388874053955.


torch.Size([182778, 8]) torch.Size([182778])
torch.Size([22633, 8]) torch.Size([22633])


[I 2025-03-01 15:09:08,521] Trial 25 finished with value: 2.559724807739258 and parameters: {'block_size': 8, 'embedding_dims': 11, 'l2_num_neurons': 91, 'batch_size': 64, 'learning_rate': 0.03825097294320879}. Best is trial 17 with value: 2.5411388874053955.


torch.Size([182778, 9]) torch.Size([182778])
torch.Size([22633, 9]) torch.Size([22633])


[I 2025-03-01 15:09:18,887] Trial 26 finished with value: 2.630140781402588 and parameters: {'block_size': 9, 'embedding_dims': 12, 'l2_num_neurons': 116, 'batch_size': 64, 'learning_rate': 0.051134736034429355}. Best is trial 17 with value: 2.5411388874053955.


torch.Size([182778, 8]) torch.Size([182778])
torch.Size([22633, 8]) torch.Size([22633])


[I 2025-03-01 15:09:27,320] Trial 27 finished with value: 2.5282766819000244 and parameters: {'block_size': 8, 'embedding_dims': 11, 'l2_num_neurons': 87, 'batch_size': 64, 'learning_rate': 0.07081949491424479}. Best is trial 27 with value: 2.5282766819000244.


torch.Size([182778, 9]) torch.Size([182778])
torch.Size([22633, 9]) torch.Size([22633])


[I 2025-03-01 15:09:35,024] Trial 28 finished with value: 2.5792806148529053 and parameters: {'block_size': 9, 'embedding_dims': 11, 'l2_num_neurons': 82, 'batch_size': 32, 'learning_rate': 0.07740959312899753}. Best is trial 27 with value: 2.5282766819000244.


torch.Size([182778, 8]) torch.Size([182778])
torch.Size([22633, 8]) torch.Size([22633])


[I 2025-03-01 15:09:42,165] Trial 29 finished with value: 2.6839208602905273 and parameters: {'block_size': 8, 'embedding_dims': 12, 'l2_num_neurons': 68, 'batch_size': 16, 'learning_rate': 0.05254943336374956}. Best is trial 27 with value: 2.5282766819000244.


torch.Size([182778, 9]) torch.Size([182778])
torch.Size([22633, 9]) torch.Size([22633])


[I 2025-03-01 15:09:49,546] Trial 30 finished with value: 2.6952197551727295 and parameters: {'block_size': 9, 'embedding_dims': 10, 'l2_num_neurons': 87, 'batch_size': 32, 'learning_rate': 0.0996781866615738}. Best is trial 27 with value: 2.5282766819000244.


torch.Size([182778, 8]) torch.Size([182778])
torch.Size([22633, 8]) torch.Size([22633])


[I 2025-03-01 15:09:57,788] Trial 31 finished with value: 2.514582872390747 and parameters: {'block_size': 8, 'embedding_dims': 11, 'l2_num_neurons': 100, 'batch_size': 64, 'learning_rate': 0.06515495056860778}. Best is trial 31 with value: 2.514582872390747.


torch.Size([182778, 8]) torch.Size([182778])
torch.Size([22633, 8]) torch.Size([22633])


[W 2025-03-01 15:10:05,404] Trial 32 failed with parameters: {'block_size': 8, 'embedding_dims': 11, 'l2_num_neurons': 100, 'batch_size': 64, 'learning_rate': 0.0639699936805835} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/jupyterlab/4.3.5/libexec/lib/python3.13/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/8_/kl72rt_d67z2yxqnyg9_n3mw0000gn/T/ipykernel_25469/175724720.py", line 39, in objective
    loss.backward()
    ~~~~~~~~~~~~~^^
  File "/opt/homebrew/Cellar/jupyterlab/4.3.5/libexec/lib/python3.13/site-packages/torch/_tensor.py", line 626, in backward
    torch.autograd.backward(
    ~~~~~~~~~~~~~~~~~~~~~~~^
        self, gradient, retain_graph, create_graph, inputs=inputs
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "/opt/homebrew/Cellar/jupyterlab/4.3.5/libexec/lib/python3.13/site-packages/torch

KeyboardInterrupt: 

In [None]:
best_params = study.best_params  # Get the best hyperparams from Optuna
print("Best Hyperparameters:", best_params)

block_size = best_params["block_size"]
embedding_dims = best_params["embedding_dims"]
l2_num_neurons = best_params["l2_num_neurons"]

l1_num_inputs = embedding_dims * block_size

# Reinitialize parameters using the best hyperparameters
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, embedding_dims), generator=g)
W1 = torch.randn((l1_num_inputs, l2_num_neurons), generator=g)
b1 = torch.randn(l2_num_neurons, generator=g)
W2 = torch.randn((l2_num_neurons, 27), generator=g)
b2 = torch.randn(27, generator=g)

parameters = [C, W1, b1, W2, b2]
for p in parameters:
    p.requires_grad = True

In [None]:
Xtr, Ytr = build_dataset(words[:n1], block_size)
Xdev, Ydev = build_dataset(words[n1:n2], block_size)
Xte, Yte = build_dataset(words[n2:], block_size)

In [None]:
stepi = []
lossi = []
train_model(
    steps=100000,
    batch_size=best_params["batch_size"],
    learning_rate=best_params["learning_rate"],
    l1_num_inputs=l1_num_inputs,
    x=Xtr,
    y=Ytr,
    stepi=stepi,
    lossi=lossi
)
plt.plot(stepi, lossi)

In [None]:
print("Training loss:", total_loss(Xtr, Ytr, l1_num_inputs))
print("Validation loss:", total_loss(Xdev, Ydev, l1_num_inputs))
print("Test loss:", total_loss(Xte, Yte, l1_num_inputs))

In [None]:
get_samples(20)