In [1]:
import makemore as mm

In [2]:
import torch

In [3]:
import os
import sys
import time
import math
import argparse
from dataclasses import dataclass
from typing import List

In [4]:
# Sanity check: confirm PyTorch can see a CUDA device before the run is configured.
torch.cuda.is_available()

True

In [5]:
# Report the name of the CUDA device at index 0.
torch.cuda.get_device_name(0)

'NVIDIA RTX A6000'

In [6]:
# Run configuration: every value a reader might tune lives in this cell.
seed = 101010                      # RNG seed applied to torch (CPU and CUDA) in the system-inits cell
work_dir = '/home/ubuntu/models'   # output directory; created on demand by os.makedirs, no manual mkdir needed
input_file = 'names.txt'           # relative path; presumably resolved against the kernel's working directory

# Model args
n_layer=4
n_embd=64
n_embd2=64
# NOTE(review): n_head equals n_embd. If the model splits n_embd evenly across
# heads, each head gets a single dimension -- confirm this is intended
# (a smaller head count such as 4 may have been meant).
n_head=64
# Prefer the first GPU, but fall back to CPU so the notebook still runs on a
# machine without CUDA instead of failing when the model is moved to the device.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Trainer args
learning_rate=3e-4
weight_decay=1e-2
batch_size = 10
num_workers = 2
max_steps = 100   # the training loop stops after this many steps; a negative value disables the limit

In [7]:
# system inits
# Make sure the output directory exists up front; the writer below logs into it.
os.makedirs(work_dir, exist_ok=True)

# Seed the default torch generator and all CUDA generators with the configured seed.
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Summary writer (re-exported by makemore) that records scalars under work_dir.
writer = mm.SummaryWriter(log_dir=work_dir)

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [8]:
# Show the kernel's working directory; the relative `input_file` is presumably resolved against it.
os.getcwd()

'/home/ubuntu/makemore'

In [9]:
# init datasets
# create_datasets returns a (train, test) pair built from input_file.
train_dataset, test_dataset = mm.create_datasets(input_file)
# Sizes reported by the training split; both are fed into the model config
# as vocab_size / block_size.
vocab_size = train_dataset.get_vocab_size()
block_size = train_dataset.get_output_length()
print(f"dataset determined that: {vocab_size=}, {block_size=}")

number of examples in the dataset: 32033
max word length: 15
number of unique characters in the vocabulary: 26
vocabulary:
abcdefghijklmnopqrstuvwxyz
split up the dataset into 31033 training examples and 1000 test examples
dataset determined that: vocab_size=27, block_size=16


In [10]:
# init model
# Bundle the dataset-derived sizes and the architecture settings into one config object.
config = mm.ModelConfig(
    vocab_size=vocab_size,
    block_size=block_size,
    n_layer=n_layer,
    n_head=n_head,
    n_embd=n_embd,
    n_embd2=n_embd2,
)

In [11]:
# Build the mm.Transformer model from `config`.
model = mm.Transformer(config)

number of parameters: 0.20M


In [12]:
# Move the model onto the configured device, then report the total number of
# parameter elements.
model.to(device)
print(f"model #params: {sum(p.numel() for p in model.parameters())}")
# NOTE(review): the block below is disabled resume / sample-only logic. It refers
# to an `args` object that is never defined in this notebook (the notebook uses
# plain variables such as `work_dir`), so it cannot be re-enabled as written.
# if args.resume or args.sample_only: # note: if we sample-only then we also assume we are resuming
#     print("resuming from existing model in the workdir")
#     model.load_state_dict(torch.load(os.path.join(args.work_dir, 'model.pt')))
# if args.sample_only:
#     print_samples(num=50)
#     sys.exit()

model #params: 204544


In [13]:
# init optimizer
# AdamW over every model parameter; step size and decay come from the trainer
# settings, while the betas and eps are fixed here.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=learning_rate, betas=(0.9, 0.99),
                              eps=1e-8, weight_decay=weight_decay)

In [14]:
# init dataloader
# Loader over the training split; the training loop polls it with .next()
# once per step.
batch_loader = mm.InfiniteDataLoader(train_dataset, batch_size=batch_size,
                                     num_workers=num_workers, pin_memory=True)


In [15]:
# training loop
# Runs optimisation steps until `max_steps` is reached; a negative max_steps
# disables the limit, so the loop then runs until it is interrupted.
best_loss = None  # lowest test loss seen so far; None until the first evaluation
step = 0
while True:

    t0 = time.time()

    # get the next batch, ship to device, and unpack it to input and target
    batch = batch_loader.next()
    batch = [t.to(device) for t in batch]
    X, Y = batch

    # feed into the model
    logits, loss = model(X, Y)

    # calculate the gradient, update the weights
    model.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # wait for all CUDA work on the GPU to finish then calculate iteration time taken
    if device.startswith('cuda'):
        torch.cuda.synchronize()
    t1 = time.time()

    # logging
    if step % 10 == 0:
        print(f"step {step} | loss {loss.item():.4f} | step time {(t1-t0)*1000:.2f}ms")

    # evaluate the model
    # `evaluate` and `print_samples` are not defined anywhere in this notebook, so
    # the bare names raised NameError as soon as max_steps was raised far enough
    # to reach these branches. They are taken from the makemore module instead.
    # NOTE(review): makemore is written as a script; if these helpers read
    # script-level globals (e.g. an `args` namespace), those must be set on `mm`
    # before these branches can run -- confirm before raising max_steps.
    if step > 0 and step % 500 == 0:
        train_loss = mm.evaluate(model, train_dataset, batch_size=100, max_batches=10)
        test_loss  = mm.evaluate(model, test_dataset,  batch_size=100, max_batches=10)
        writer.add_scalar("Loss/train", train_loss, step)
        writer.add_scalar("Loss/test", test_loss, step)
        writer.flush()
        print(f"step {step} train loss: {train_loss} test loss: {test_loss}")
        # save the model to disk if it has improved
        if best_loss is None or test_loss < best_loss:
            out_path = os.path.join(work_dir, "model.pt")
            print(f"test loss {test_loss} is the best so far, saving model to {out_path}")
            torch.save(model.state_dict(), out_path)
            best_loss = test_loss

    # sample from the model
    if step > 0 and step % 200 == 0:
        mm.print_samples(num=10)

    step += 1
    # termination conditions
    if max_steps >= 0 and step >= max_steps:
        break

step 0 | loss 3.4336 | step time 803.14ms
step 10 | loss 3.0155 | step time 19.60ms
step 20 | loss 2.9193 | step time 18.95ms
step 30 | loss 2.6380 | step time 18.81ms
step 40 | loss 2.6422 | step time 18.07ms
step 50 | loss 2.6594 | step time 16.11ms
step 60 | loss 2.7239 | step time 22.88ms
step 70 | loss 2.6593 | step time 20.81ms
step 80 | loss 2.5451 | step time 20.72ms
step 90 | loss 2.4382 | step time 20.60ms
