In [None]:
import math
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

#!git clone https://github.com/ashegde/build-nanoGPT
!wget https://raw.githubusercontent.com/ashegde/build-nanoGPT/main/model.py
!wget https://raw.githubusercontent.com/ashegde/build-nanoGPT/main/loader.py
!wget https://raw.githubusercontent.com/ashegde/build-nanoGPT/main/utils.py
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
!pip install tiktoken

from model import GPT, GPTConfig
from loader import DataLoaderLite
from utils import LinearWarmupCosineAnnealingScheduler, say_hello
import tiktoken
import time

In [None]:
# Fix the random seed so that runs are repeatable.
SEED = 42
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
  seed_fn(SEED)

In [None]:
# Build the GPT model on the CPU, report its size, then compile it.
device = "cpu"

# vocab_size is overridden with 50304 (= 128 * 393). Note that this is a
# multiple of 128, not a power of 2.
cfg = GPTConfig(vocab_size=50304)
model = GPT(cfg)
model.to(device)
model.train()

# Count only the parameters that will receive gradients.
num_parameters = 0
for param in model.parameters():
  if param.requires_grad:
    num_parameters += param.numel()
print(f'The model has a total of {num_parameters} parameters.')

# Compilation is expected to pay off mostly on GPUs.
model = torch.compile(model)

In [None]:
# ln(vocab_size): the number of nats needed to pick one token out of the
# vocabulary. The loss of the untrained model should be roughly this value.
vocab_nats = np.log(cfg.vocab_size)
vocab_nats

In [None]:
# Batch size and gradient-accumulation setup.
#
# One optimizer step should see `total_batch_size` tokens, but only a
# micro-batch of B * T tokens is pushed through the model at a time, so
# gradients are accumulated over `grad_accum_steps` micro-batches.

# 2**19 tokens: roughly the 0.5M-token batch size the GPT-3 paper lists for
# its 125M-parameter model (the GPT-2 sized one).
total_batch_size = 524288
B = 16 # "micro"-batch size
T = 1024 # sequence length
assert total_batch_size % (B * T) == 0, "total_batch_size should be divisble by B * T"
grad_accum_steps = total_batch_size // (B * T)

# A single loader is enough: the training loop draws every micro-batch from
# `train_loader`. (A second, unused DataLoaderLite instance used to be built
# here as well, which only repeated the data loading.)
train_loader = DataLoaderLite(B,T)
print(f"total desired batch size: {total_batch_size}")
print(f" required gradient accumulation steps: {grad_accum_steps}")


In [None]:
# Revised training loop: gradient accumulation, gradient clipping and a
# linear-warmup / cosine-annealing learning-rate schedule.

# torch.set_float32_matmul_precision('high') # need cuda

# Learning-rate schedule. It is defined before the optimizer so that the
# optimizer can be created with the same peak learning rate.
max_lr = 3e-4
min_lr = 0.1 * max_lr
warmup_steps = 10
max_steps = 50
scheduler = LinearWarmupCosineAnnealingScheduler(max_lr=max_lr, min_lr=min_lr, warmup_steps=warmup_steps, max_steps=max_steps)

# The loop below overwrites the lr of every param group before each
# optimizer.step(), so the value passed here is never the one applied to an
# update. Passing max_lr keeps it consistent with the schedule.
# NOTE(review): assumes configure_optimizers uses learning_rate only as the
# optimizer's initial lr -- confirm in model.py.
#optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, betas=(0.9,0.95), eps=1e-8)
optimizer = model.configure_optimizers(weight_decay = 0.1, learning_rate=max_lr, device=device)

for step in range(max_steps):
  t0 = time.time()
  optimizer.zero_grad()

  # Accumulate gradients over grad_accum_steps micro-batches before a single
  # optimizer update.
  loss_accum = 0
  for micro_steps in range(grad_accum_steps):
    x, y = train_loader.next_batch()
    x = x.to(device)
    y = y.to(device)
    with torch.autocast(device_type=device, dtype=torch.bfloat16):
      logits, loss = model(x,y)
    loss = loss / grad_accum_steps # compensate for accumulation, the loss defaults to a mean reduction
    loss_accum += loss.detach() # detached: used for logging only
    loss.backward()

  # Clip the global gradient norm to 1.0; the returned pre-clip norm is logged.
  norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

  # Apply the scheduled learning rate for this step to every param group.
  lr = scheduler.get_lr(step)
  for param_group in optimizer.param_groups:
    param_group['lr'] = lr
  optimizer.step()

  # On a GPU, wait for the queued kernels to finish so the timing below
  # measures the work of this step. Skipped on the CPU.
  if device.startswith("cuda"):
    torch.cuda.synchronize()
  t1 = time.time()
  elapsed = t1-t0

  print(f'step {step} || loss: {loss_accum.item():} || norm: {norm:0.4f} || time elapsed: dt={elapsed*1000:0.4f}ms')



In [None]:
# Call the say_hello helper imported from utils.py on the (compiled) model.
# NOTE(review): presumably this samples some text from the model as a quick
# sanity check -- confirm against utils.py.
say_hello(model)