In [2]:
# Pin joblib to the version the recorded run installed (1.4.2) so that a fresh
# kernel reproduces the same environment.
%pip install joblib==1.4.2

Collecting joblib
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
Installing collected packages: joblib
Successfully installed joblib-1.4.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
import torch
import torch.nn.functional as F
import optuna
from joblib import Parallel, delayed

In [2]:
# Read in all the words, one per line. The context manager closes the file
# handle deterministically instead of leaving it to the garbage collector, and
# the explicit encoding avoids depending on the platform default.
with open('makemore/names.txt', 'r', encoding='utf-8') as names_file:
    words = names_file.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [11]:
# build the dataset

def build_dataset(words, block_size):
    """Turn words into (context, next-character) training pairs.

    Each word is terminated with '.', and every character of the terminated
    word yields one example: the input is the list of the ``block_size``
    character indices that precede it, and the target is the character's own
    index. Positions before the start of a word are filled with index 0.

    Args:
        words: iterable of strings; every character (and '.') must be a key of
            the module-level ``stoi`` mapping.
        block_size: number of preceding character indices in each context.

    Returns:
        Tuple ``(X, Y)`` of tensors created with ``torch.tensor``: ``X`` has
        one context per row, ``Y`` holds the matching target indices.
    """
    contexts, targets = [], []
    for word in words:
        window = [0] * block_size  # padding that stands in for "before the word"
        for target in (stoi[ch] for ch in word + '.'):
            contexts.append(window)
            targets.append(target)
            # slide the window: drop the oldest index, append the newest
            window = [*window[1:], target]
    return torch.tensor(contexts), torch.tensor(targets)

# Character vocabulary and the two lookup tables derived from it:
# the sorted characters found in `words` get indices 1, 2, ... and '.' gets index 0.
chars = sorted(set(''.join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# inverse mapping: index -> character
itos = dict((idx, ch) for ch, idx in stoi.items())

In [4]:
import random
# Fixed seed so the in-place shuffle of `words` gives the same order on every fresh run.
random.seed(42)
random.shuffle(words)
# Split boundaries into the shuffled list: words[:n1] is the first 80%,
# words[n1:n2] the next 10% and words[n2:] the remainder.
n1, n2 = (int(fraction * len(words)) for fraction in (0.8, 0.9))

# Xdev, Ydev = build_dataset(words[n1:n2], block_size)
# Xte, Yte = build_dataset(words[n2:], block_size)

In [13]:
# Define your model as before
def create_model(embedding_dims, block_size, l2_num_neurons):
    """Create randomly initialised parameters for the embedding + one-hidden-layer network.

    Args:
        embedding_dims: width of each row of the embedding table.
        block_size: number of context positions; the hidden layer takes
            ``embedding_dims * block_size`` inputs.
        l2_num_neurons: number of hidden units.

    Returns:
        List ``[C, W1, b1, W2, b2]`` of tensors drawn from ``torch.randn`` with
        ``requires_grad`` enabled. ``C`` has 27 rows and ``W2``/``b2`` produce
        27 outputs (the vocabulary size is hard-coded).
    """
    shapes = (
        (27, embedding_dims),                            # C: embedding table
        (embedding_dims * block_size, l2_num_neurons),   # W1: hidden weights
        (l2_num_neurons,),                               # b1: hidden bias
        (l2_num_neurons, 27),                            # W2: output weights
        (27,),                                           # b2: output bias
    )
    # The comprehension draws the tensors in the listed order, so the random
    # stream is consumed as C, W1, b1, W2, b2.
    return [torch.randn(shape).requires_grad_() for shape in shapes]

def train_model(device, Xtr, Ytr, steps, batch_size, learning_rate, block_size, embedding_dims, l2_num_neurons, clip_value):
    """Train a freshly created model with minibatch SGD and return the last batch loss.

    Args:
        device: torch device that the parameters and the data are moved to.
        Xtr: integer tensor of contexts; rows are sampled along dim 0 and used
            to index the embedding table (assumed to be (N, block_size) —
            confirm against ``build_dataset``).
        Ytr: integer tensor of target class indices, aligned with ``Xtr``.
        steps: number of SGD updates; must be at least 1, otherwise no loss is
            ever computed and the final ``loss.item()`` fails.
        batch_size: number of examples sampled (with replacement) per update.
        learning_rate: SGD learning rate (momentum is fixed at 0.9).
        block_size, embedding_dims, l2_num_neurons: model sizes passed on to
            ``create_model``.
        clip_value: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        The cross-entropy loss of the final sampled training minibatch, as a
        Python float.
    """
    # Re-create the parameters as leaf tensors living on the target device.
    weights = [
        p.to(device).detach().requires_grad_()
        for p in create_model(embedding_dims, block_size, l2_num_neurons)
    ]
    C, W1, b1, W2, b2 = weights
    Xtr = Xtr.to(device)
    Ytr = Ytr.to(device)

    sgd = torch.optim.SGD(weights, lr=learning_rate, momentum=0.9)

    for _ in range(steps):
        # draw a random minibatch
        batch = torch.randint(0, Xtr.shape[0], (batch_size,))
        inputs, targets = Xtr[batch], Ytr[batch]

        # forward pass: embed, flatten the context, hidden tanh layer, output logits
        embedded = C[inputs]
        hidden = torch.tanh(embedded.view(-1, embedding_dims * block_size) @ W1 + b1)
        loss = F.cross_entropy(hidden @ W2 + b2, targets)

        # backward pass with gradient-norm clipping before the update
        sgd.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(weights, clip_value)
        sgd.step()

    return loss.item()

def objective(trial):
    """Optuna objective: sample hyperparameters, train for 200 steps, return the loss.

    Sampled hyperparameters:
        block_size (int, 5-15), embedding_dims (int, 5-20),
        l2_num_neurons (int, 50-300), batch_size (16, 32 or 64),
        learning_rate (float, 1e-3 to 1e-1, log-uniform) and
        clip_value (float, 0.5-2.0, gradient-norm clipping threshold).

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The value returned by ``train_model`` (the loss of the last training
        minibatch), which the study minimises.
    """
    block_size = trial.suggest_int("block_size", 5, 15)
    embedding_dims = trial.suggest_int("embedding_dims", 5, 20)
    l2_num_neurons = trial.suggest_int("l2_num_neurons", 50, 300)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])
    learning_rate = trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True)
    clip_value = trial.suggest_float("clip_value", 0.5, 2.0)

    steps = 200

    # The dataset depends on block_size, so it is rebuilt for every trial
    # from the training portion of the word list.
    Xtr, Ytr = build_dataset(words[:n1], block_size)

    # Prefer Apple's MPS backend but fall back to the CPU, so the search also
    # runs on machines where MPS is not available instead of failing when the
    # tensors are moved to the device.
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

    return train_model(device, Xtr, Ytr, steps, batch_size, learning_rate, block_size, embedding_dims, l2_num_neurons, clip_value)
    
# Create an in-memory Optuna study that minimises the value returned by `objective`.
study = optuna.create_study(direction="minimize")
# n_jobs=-1 lets Optuna run trials concurrently, with the number of parallel
# jobs set to the CPU count; the device used for training is chosen inside `objective`.
study.optimize(
    objective,
    n_trials=30,
    n_jobs=-1,
)

[I 2025-03-02 13:20:26,251] A new study created in memory with name: no-name-d19cc43b-4c07-4c1f-b41f-b83d55171e3f
[I 2025-03-02 13:20:33,384] Trial 0 finished with value: 20.290958404541016 and parameters: {'block_size': 9, 'embedding_dims': 18, 'l2_num_neurons': 203, 'batch_size': 32, 'learning_rate': 0.002833496628053928, 'clip_value': 0.8221987537283277}. Best is trial 0 with value: 20.290958404541016.
[I 2025-03-02 13:20:33,758] Trial 2 finished with value: 19.160297393798828 and parameters: {'block_size': 8, 'embedding_dims': 8, 'l2_num_neurons': 228, 'batch_size': 16, 'learning_rate': 0.0028918005295102047, 'clip_value': 0.5351689103932158}. Best is trial 2 with value: 19.160297393798828.
[I 2025-03-02 13:20:34,230] Trial 6 finished with value: 9.767122268676758 and parameters: {'block_size': 9, 'embedding_dims': 17, 'l2_num_neurons': 124, 'batch_size': 16, 'learning_rate': 0.013766795940444084, 'clip_value': 1.2174683797146493}. Best is trial 6 with value: 9.767122268676758.
[I 

In [14]:
import time

def train_and_time(device, Xtr, Ytr, steps, batch_size, learning_rate, block_size, embedding_dims, l2_num_neurons, clip_value):
    """Run ``train_model`` once and measure how long the call takes.

    All arguments are forwarded unchanged to ``train_model``; the measured
    interval therefore covers everything that function does (parameter
    creation, moving data to ``device`` and the training loop).

    Returns:
        Tuple ``(loss, elapsed_time)``: the value returned by ``train_model``
        and the duration of the call in seconds.
    """
    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # intervals; unlike time.time() it cannot jump (or yield a negative
    # duration) when the system clock is adjusted.
    start_time = time.perf_counter()
    loss = train_model(device, Xtr, Ytr, steps, batch_size, learning_rate, block_size, embedding_dims, l2_num_neurons, clip_value)
    elapsed_time = time.perf_counter() - start_time
    return loss, elapsed_time

In [41]:
def objective_timed(trial):
    """Optuna objective that trains for 500 steps and prints the loss and wall time.

    Samples the same hyperparameters as ``objective`` (block_size,
    embedding_dims, l2_num_neurons, batch_size, learning_rate, clip_value),
    trains through ``train_and_time`` and prints one summary line per trial.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The loss returned by ``train_and_time``, which the study minimises.
    """
    block_size = trial.suggest_int("block_size", 5, 15)
    embedding_dims = trial.suggest_int("embedding_dims", 5, 20)
    l2_num_neurons = trial.suggest_int("l2_num_neurons", 50, 300)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])
    learning_rate = trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True)
    clip_value = trial.suggest_float("clip_value", 0.5, 2.0)

    steps = 500
    Xtr, Ytr = build_dataset(words[:n1], block_size)

    # Prefer Apple's MPS backend, falling back to the CPU where it is not available.
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

    loss, elapsed_time = train_and_time(device, Xtr, Ytr, steps, batch_size, learning_rate, block_size, embedding_dims, l2_num_neurons, clip_value)

    # Take the label from the device actually used: the device is no longer a
    # sampled parameter, so there is no separate `device_name` to print.
    print(f"Device: {device.type.upper()}, Loss: {loss:.4f}, Time: {elapsed_time:.2f} sec")

    return loss  # Optuna minimizes the loss

In [42]:
# Start a new in-memory study for the timed objective; n_jobs=-1 runs trials concurrently.
study = optuna.create_study(direction="minimize")
study.optimize(
    objective_timed,
    n_trials=30,
    n_jobs=-1,
)

[I 2025-03-02 13:40:05,352] A new study created in memory with name: no-name-4d2fb745-1e8e-42ac-bd70-82ed95468840
[I 2025-03-02 13:40:09,200] Trial 1 finished with value: 14.066448211669922 and parameters: {'block_size': 7, 'embedding_dims': 17, 'l2_num_neurons': 82, 'batch_size': 16, 'learning_rate': 0.0010568415166202277, 'clip_value': 1.991961094076027, 'device': 'cpu'}. Best is trial 1 with value: 14.066448211669922.


Device: CPU, Loss: 14.0664, Time: 1.89 sec


[I 2025-03-02 13:40:09,641] Trial 3 finished with value: 3.874737024307251 and parameters: {'block_size': 15, 'embedding_dims': 7, 'l2_num_neurons': 159, 'batch_size': 16, 'learning_rate': 0.08869445571703366, 'clip_value': 1.8608285703151646, 'device': 'cpu'}. Best is trial 3 with value: 3.874737024307251.


Device: CPU, Loss: 3.8747, Time: 3.26 sec


[I 2025-03-02 13:40:09,969] Trial 6 finished with value: 2.862640857696533 and parameters: {'block_size': 9, 'embedding_dims': 20, 'l2_num_neurons': 66, 'batch_size': 64, 'learning_rate': 0.029718789007869936, 'clip_value': 0.8526233190290622, 'device': 'cpu'}. Best is trial 6 with value: 2.862640857696533.


Device: CPU, Loss: 2.8626, Time: 2.65 sec


[I 2025-03-02 13:40:10,297] Trial 7 finished with value: 3.9208788871765137 and parameters: {'block_size': 14, 'embedding_dims': 9, 'l2_num_neurons': 236, 'batch_size': 32, 'learning_rate': 0.029497706744298087, 'clip_value': 1.9805455664594196, 'device': 'cpu'}. Best is trial 6 with value: 2.862640857696533.


Device: CPU, Loss: 3.9209, Time: 2.71 sec


[I 2025-03-02 13:40:10,657] Trial 2 finished with value: 4.034715175628662 and parameters: {'block_size': 9, 'embedding_dims': 14, 'l2_num_neurons': 285, 'batch_size': 32, 'learning_rate': 0.020650373164224807, 'clip_value': 1.964116232473333, 'device': 'cpu'}. Best is trial 6 with value: 2.862640857696533.


Device: CPU, Loss: 4.0347, Time: 4.61 sec


[I 2025-03-02 13:40:11,129] Trial 11 finished with value: 12.102813720703125 and parameters: {'block_size': 7, 'embedding_dims': 8, 'l2_num_neurons': 215, 'batch_size': 16, 'learning_rate': 0.0014787828023975498, 'clip_value': 1.5621177710215575, 'device': 'cpu'}. Best is trial 6 with value: 2.862640857696533.


Device: CPU, Loss: 12.1028, Time: 0.55 sec


[I 2025-03-02 13:40:11,575] Trial 9 finished with value: 3.512113332748413 and parameters: {'block_size': 13, 'embedding_dims': 18, 'l2_num_neurons': 189, 'batch_size': 32, 'learning_rate': 0.044755421564187665, 'clip_value': 1.6197159859258354, 'device': 'cpu'}. Best is trial 6 with value: 2.862640857696533.


Device: CPU, Loss: 3.5121, Time: 1.64 sec


[I 2025-03-02 13:40:12,020] Trial 13 finished with value: 4.04624080657959 and parameters: {'block_size': 12, 'embedding_dims': 10, 'l2_num_neurons': 231, 'batch_size': 16, 'learning_rate': 0.02179843710267954, 'clip_value': 1.9817364036494338, 'device': 'cpu'}. Best is trial 6 with value: 2.862640857696533.


Device: CPU, Loss: 4.0462, Time: 0.55 sec


[I 2025-03-02 13:40:12,556] Trial 15 finished with value: 3.0713889598846436 and parameters: {'block_size': 10, 'embedding_dims': 15, 'l2_num_neurons': 153, 'batch_size': 32, 'learning_rate': 0.03955153375222286, 'clip_value': 0.9336184984559306, 'device': 'cpu'}. Best is trial 6 with value: 2.862640857696533.


Device: CPU, Loss: 3.0714, Time: 0.22 sec


KeyboardInterrupt: 

In [30]:
# Keep the best hyperparameters found so far by `study` under a separate name
# so they can be re-queued as a starting point for a later search.
best_params_t1 = study.best_params
best_params_t1

{'block_size': 15,
 'embedding_dims': 9,
 'l2_num_neurons': 67,
 'batch_size': 32,
 'learning_rate': 0.04122017199784867,
 'clip_value': 0.9383499285871308,
 'device': 'mps'}

In [38]:
# Queue the stored best configuration so that it is evaluated by the study,
# then run a 30-trial search with the timed objective.
study.enqueue_trial(best_params_t1)
study.optimize(
    objective_timed,
    n_trials=30,
    n_jobs=-1,
)

KeyboardInterrupt: 

In [35]:
def objective_with_steps(trial):
    """Optuna objective that also tunes the number of training steps and the device.

    In addition to the hyperparameters sampled by ``objective`` (block_size,
    embedding_dims, l2_num_neurons, batch_size, learning_rate, clip_value),
    this samples ``steps`` and ``device``, trains through ``train_and_time``
    and prints one summary line per trial.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The loss returned by ``train_and_time``, which the study minimises.
    """
    block_size = trial.suggest_int("block_size", 5, 15)
    embedding_dims = trial.suggest_int("embedding_dims", 5, 20)
    l2_num_neurons = trial.suggest_int("l2_num_neurons", 50, 300)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])
    learning_rate = trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True)
    clip_value = trial.suggest_float("clip_value", 0.5, 2.0)

    # Tune the number of steps on a linear grid: 100, 200, ..., 2000.
    steps = trial.suggest_int("steps", 100, 2000, step=100)

    # Choose the device per trial, but only offer "mps" where the backend is
    # actually available; otherwise every trial that sampled it would fail when
    # tensors are moved to the device. The choice list is constant on a given
    # machine, so the categorical search space stays consistent within a study.
    device_choices = ["cpu", "mps"] if torch.backends.mps.is_available() else ["cpu"]
    device_name = trial.suggest_categorical("device", device_choices)
    device = torch.device(device_name)

    Xtr, Ytr = build_dataset(words[:n1], block_size)

    loss, elapsed_time = train_and_time(device, Xtr, Ytr, steps, batch_size, learning_rate, block_size, embedding_dims, l2_num_neurons, clip_value)

    print(f"Device: {device_name.upper()}, Steps: {steps}, Loss: {loss:.4f}, Time: {elapsed_time:.2f} sec")

    return loss  # Minimize loss

In [36]:
# Separate in-memory study for the objective that also tunes steps and device;
# n_jobs=-1 runs trials concurrently.
study2 = optuna.create_study(direction="minimize")
study2.optimize(
    objective_with_steps,
    n_trials=30,
    n_jobs=-1,
)

[I 2025-03-02 13:36:42,689] A new study created in memory with name: no-name-0d6525af-1322-4484-a3c9-807d412bae42
[I 2025-03-02 13:36:45,711] Trial 3 finished with value: 5.564040184020996 and parameters: {'block_size': 6, 'embedding_dims': 16, 'l2_num_neurons': 194, 'batch_size': 32, 'learning_rate': 0.045164662645455225, 'clip_value': 0.505255587022085, 'steps': 600, 'device': 'cpu'}. Best is trial 3 with value: 5.564040184020996.


Device: CPU, Steps: 600, Loss: 5.5640, Time: 2.53 sec


[I 2025-03-02 13:36:46,875] Trial 4 finished with value: 4.393177032470703 and parameters: {'block_size': 14, 'embedding_dims': 5, 'l2_num_neurons': 99, 'batch_size': 64, 'learning_rate': 0.0012884285986914281, 'clip_value': 1.146553372184385, 'steps': 1600, 'device': 'cpu'}. Best is trial 4 with value: 4.393177032470703.


Device: CPU, Steps: 1600, Loss: 4.3932, Time: 1.64 sec


[I 2025-03-02 13:36:47,507] Trial 8 finished with value: 2.557959794998169 and parameters: {'block_size': 11, 'embedding_dims': 6, 'l2_num_neurons': 54, 'batch_size': 32, 'learning_rate': 0.0017834580158283381, 'clip_value': 1.6304668264708766, 'steps': 1900, 'device': 'cpu'}. Best is trial 8 with value: 2.557959794998169.


Device: CPU, Steps: 1900, Loss: 2.5580, Time: 1.48 sec


[I 2025-03-02 13:36:48,831] Trial 10 finished with value: 4.2762556076049805 and parameters: {'block_size': 11, 'embedding_dims': 14, 'l2_num_neurons': 218, 'batch_size': 32, 'learning_rate': 0.034095100340301936, 'clip_value': 0.6063163922016082, 'steps': 1500, 'device': 'cpu'}. Best is trial 8 with value: 2.557959794998169.


Device: CPU, Steps: 1500, Loss: 4.2763, Time: 0.92 sec


[I 2025-03-02 13:36:49,483] Trial 11 finished with value: 4.449064254760742 and parameters: {'block_size': 13, 'embedding_dims': 9, 'l2_num_neurons': 296, 'batch_size': 32, 'learning_rate': 0.09965055204571183, 'clip_value': 1.9320756883571535, 'steps': 500, 'device': 'cpu'}. Best is trial 8 with value: 2.557959794998169.


Device: CPU, Steps: 500, Loss: 4.4491, Time: 0.29 sec


[I 2025-03-02 13:36:49,941] Trial 12 finished with value: 6.347474575042725 and parameters: {'block_size': 15, 'embedding_dims': 12, 'l2_num_neurons': 209, 'batch_size': 64, 'learning_rate': 0.022807946911321337, 'clip_value': 1.3773216224888623, 'steps': 100, 'device': 'cpu'}. Best is trial 8 with value: 2.557959794998169.


Device: CPU, Steps: 100, Loss: 6.3475, Time: 0.05 sec


[I 2025-03-02 13:36:50,462] Trial 13 finished with value: 8.366116523742676 and parameters: {'block_size': 9, 'embedding_dims': 7, 'l2_num_neurons': 208, 'batch_size': 64, 'learning_rate': 0.0071166446194733946, 'clip_value': 1.6253169841032609, 'steps': 300, 'device': 'cpu'}. Best is trial 8 with value: 2.557959794998169.


Device: CPU, Steps: 300, Loss: 8.3661, Time: 0.20 sec


[I 2025-03-02 13:36:50,794] Trial 14 finished with value: 4.2606658935546875 and parameters: {'block_size': 5, 'embedding_dims': 7, 'l2_num_neurons': 190, 'batch_size': 64, 'learning_rate': 0.05735685475747297, 'clip_value': 1.481003186775664, 'steps': 400, 'device': 'cpu'}. Best is trial 8 with value: 2.557959794998169.


Device: CPU, Steps: 400, Loss: 4.2607, Time: 0.17 sec


KeyboardInterrupt: 