## Build a Large Language Model from Scratch

### Load All Imports

In [None]:
import os
import sys
import json
import time
import yaml
import numpy as np
import numpy.typing as npt
import torch
import torch.nn as nn
from data import get_batch, load, save
from tokenizer import Tokenizer, train_bpe, PAT_GPT2, PAT_SPECIAL_TOKEN
from cs336_basics.modules.layers import TransformerLM
from cs336_basics.modules.loss import CrossEntropyLoss
from cs336_basics.modules.activation import GLU, Softmax
from cs336_basics.modules.optimizer import SGD, AdamW, compute_lr, gradient_cliping
from tokenizers import Tokenizer as HFTokenizer
from init_weights import init_weights

#### 1. Load Config

In [87]:
# Read the run configuration from config.yaml.
# The parsed mapping is bound to `config` (not `load`) so it does not shadow
# the `load` checkpoint helper imported from `data`, which other cells call
# as a function.
print("--- Loading Configuration ---")
with open('config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Split the configuration into the three sections used by the notebook.
model_args = config['model_args']
training_args = config['training_args']
data_args = config['data_args']

# Make sure the checkpoint directory exists before anything is saved to it.
os.makedirs(data_args['checkpoint_dir'], exist_ok=True)

--- Loading Configuration ---


#### 2. Initialization

In [None]:
print("--- Initializing ---")

# Use the configured device only when CUDA is available; otherwise run on CPU.
if torch.cuda.is_available():
    device = torch.device(training_args['device'])
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# The vocabulary is read from the Hugging Face tokenizer file and handed to
# the project Tokenizer together with the configured merges path.
# (Alternative previously used here: Tokenizer.from_files(vocab_filepath=...,
# merges_filepath=...).)
hf_tokenizer = HFTokenizer.from_file(data_args['vocab_path'])
tokenizer = Tokenizer(
    vocab=dict(hf_tokenizer.get_vocab()),
    merges=data_args['merges_path'],
)

# The model's vocabulary size always follows the loaded tokenizer.
vocab_size = len(tokenizer.vocab)
model_args['vocab_size'] = vocab_size
print(f"Tokenizer loaded. Vocab size: {vocab_size}")

# Build and compile the model, move it to the target device, then initialize
# its weights.
model = torch.compile(TransformerLM(**model_args))
model = model.to(device)
init_weights(model)
print(f"Model created with {model.get_num_params():,} parameters.")

# The learning rate set here is only the starting value; the training loop
# overwrites it on every iteration.
optimizer = AdamW(model.parameters(), lr=training_args['learning_rate'])

loss_init = CrossEntropyLoss()

--- Initializing ---
Using device: cuda
Tokenizer loaded. Vocab size: 10000
Model created with 17,576,448 parameters.


#### 3. Load data

In [89]:
print("--- Loading Data with np.memmap ---")

# Memory-map both token files read-only, interpreting them as uint16 values.
train_data, valid_data = (
    np.memmap(data_args[path_key], dtype=np.uint16, mode='r')
    for path_key in ('train_data_path', 'valid_data_path')
)

n_train_tokens = len(train_data)
n_valid_tokens = len(valid_data)
print(f"Train data tokens: {n_train_tokens:,}, Val data tokens: {n_valid_tokens:,}")

--- Loading Data with np.memmap ---
Train data tokens: 3,038,773,016, Val data tokens: 74,004,384


#### 4. Resume training

In [90]:
# Optionally resume from an explicitly configured checkpoint.
# `.get` keeps a config without the `resume_from_checkpoint` key working
# (it is treated the same as an empty value: start from iteration 0).
start_iter = 0
resume_path = data_args.get('resume_from_checkpoint')
if resume_path:
    print(f"Resuming training from {resume_path}")
    # The training loop saves a checkpoint after completing the optimizer step
    # for `iter_num`, so training continues at the following iteration. This
    # matches the `load(...) + 1` used when auto-resuming from the checkpoint
    # directory. Assumes `load` returns the iteration stored in the checkpoint.
    start_iter = load(resume_path, model, optimizer) + 1

#### 5. Evaluation Function

In [91]:
@torch.no_grad()
def evaluate(eval_iters: int = 100) -> float:
    """Estimate the validation loss by averaging over sampled batches.

    Batches are drawn from ``valid_data`` with ``get_batch`` using the
    configured batch size and context length. Gradients are disabled, and the
    model is put in eval mode for the duration of the call.

    Args:
        eval_iters: Number of validation batches to average over.

    Returns:
        The mean loss over ``eval_iters`` batches.

    Raises:
        ValueError: If ``eval_iters`` is not positive.
    """
    if eval_iters <= 0:
        raise ValueError(f"eval_iters must be positive, got {eval_iters}")

    # Remember the current mode so it can be restored exactly, even if a
    # batch fails part-way through the evaluation.
    was_training = model.training
    model.eval()
    try:
        valid_loss = 0.0
        for _ in range(eval_iters):
            x, y = get_batch(
                valid_data,
                training_args['batch_size'],
                model_args['context_length'],
                device,
            )
            logits = model(x)
            # Flatten to (N, vocab_size) logits and (N,) targets for the loss.
            loss = loss_init(logits.view(-1, model_args['vocab_size']), y.view(-1))
            valid_loss += loss.item()
    finally:
        model.train(was_training)
    return valid_loss / eval_iters

#### 6. Begin Training Loop

In [92]:
import glob
import os
import re
from data import load

# Matches the iteration number in names such as "model_iter_40000.pt".
_CKPT_ITER_RE = re.compile(r"model_iter_(\d+)\.pt$")


def _checkpoint_iteration(path):
    """Return the iteration number encoded in a checkpoint filename, or -1."""
    match = _CKPT_ITER_RE.search(os.path.basename(path))
    return int(match.group(1)) if match else -1


# Auto-resume from the most advanced checkpoint in the checkpoint directory.
# The checkpoint is chosen by the iteration number in its filename rather than
# by file ctime: ctime reflects file metadata changes (e.g. copying or moving
# checkpoints), not training progress.
ckpts = glob.glob(os.path.join(data_args['checkpoint_dir'], "model_iter_*.pt"))
if ckpts:
    latest_ckpt = max(ckpts, key=_checkpoint_iteration)
    # The checkpoint was written after its iteration finished, so continue
    # with the next one.
    start_iter = load(latest_ckpt, model, optimizer) + 1
else:
    print("From Scratch Training")
    start_iter = 0

load checkpoints/model_iter_40000.pt iterations: 40000


In [93]:
from torch.utils.tensorboard import SummaryWriter


writer = SummaryWriter(log_dir="runs/llm_run")

# In-memory history: one entry per training iteration, and one entry per
# evaluation for the validation lists.
train_losses = []
val_losses = []
iterations = []
eval_iterations = []

print("--- Starting Training Loop ---")
t0 = time.time()

# try/finally guarantees the TensorBoard writer is closed (and its pending
# events written out) even when training is interrupted or raises.
try:
    for iter_num in range(start_iter, training_args['max_iters']):

        # Learning rate for this iteration, applied to every parameter group.
        lr = compute_lr(
            iter_num,
            training_args['learning_rate'],
            training_args['min_lr'],
            training_args['warmup_steps'],
            training_args['lr_decay_steps']
        )
        for pg in optimizer.param_groups:
            pg['lr'] = lr

        # Forward pass on a freshly sampled training batch.
        inputs, targets = get_batch(
            train_data,
            training_args['batch_size'],
            model_args['context_length'],
            device
        )
        logits = model(inputs)
        loss = loss_init(
            logits.view(-1, model_args['vocab_size']),
            targets.view(-1)
        )

        # Backward pass, gradient clipping, and parameter update.
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        total_norm = torch.nn.utils.clip_grad_norm_(
            model.parameters(),
            training_args['gradient_clip_val']
        )
        optimizer.step()

        # Convert to Python floats once per iteration instead of once per use;
        # each conversion of a device tensor forces a host/device sync.
        loss_value = loss.item()
        grad_norm = float(total_norm)

        train_losses.append(loss_value)
        iterations.append(iter_num)

        writer.add_scalar("Loss/train", loss_value, iter_num)
        writer.add_scalar("LR", lr, iter_num)
        writer.add_scalar("GradNorm/total", grad_norm, iter_num)

        if iter_num % 10 == 0:
            # The reported time is the wall-clock time since the previous log
            # line (or since the loop started), not the time of a single step.
            t1 = time.time()
            print(
                f"Iter {iter_num}/{training_args['max_iters']}, "
                f"Train Loss: {loss_value:.4f}, "
                f"LR: {lr:.6f}, "
                f"GradNorm: {grad_norm:.4f}, "
                f"Time: {(t1-t0)*1000:.1f}ms"
            )
            t0 = t1

        # Periodic evaluation and checkpointing (skipped at iteration 0).
        if iter_num > 0 and iter_num % training_args['eval_interval'] == 0:
            val_loss = evaluate()
            val_losses.append(val_loss)
            eval_iterations.append(iter_num)

            writer.add_scalar("Loss/val", val_loss, iter_num)

            print(f"--- Eval at iter {iter_num}: Val Loss: {val_loss:.4f} ---")
            checkpoint_path = os.path.join(
                data_args['checkpoint_dir'],
                f"model_iter_{iter_num}.pt"
            )
            save(model, optimizer, iter_num, checkpoint_path)
finally:
    writer.close()

print("Training complete! Logs written to runs/llm_run")

--- Starting Training Loop ---
Iter 40010/50000, Train Loss: 3.7798, LR: 0.000002, GradNorm: 0.4239, Time: 10767.9ms
Iter 40020/50000, Train Loss: 3.7747, LR: 0.000002, GradNorm: 0.4210, Time: 1707.3ms
Iter 40030/50000, Train Loss: 3.7277, LR: 0.000002, GradNorm: 0.4169, Time: 1705.4ms
Iter 40040/50000, Train Loss: 3.7441, LR: 0.000002, GradNorm: 0.4398, Time: 1706.4ms
Iter 40050/50000, Train Loss: 3.7717, LR: 0.000002, GradNorm: 0.4089, Time: 1706.0ms
Iter 40060/50000, Train Loss: 3.7539, LR: 0.000002, GradNorm: 0.4196, Time: 1708.1ms
Iter 40070/50000, Train Loss: 3.7542, LR: 0.000002, GradNorm: 0.4073, Time: 1702.3ms
Iter 40080/50000, Train Loss: 3.7773, LR: 0.000002, GradNorm: 0.4091, Time: 1697.9ms
Iter 40090/50000, Train Loss: 3.7176, LR: 0.000002, GradNorm: 0.4128, Time: 1699.9ms
Iter 40100/50000, Train Loss: 3.7563, LR: 0.000002, GradNorm: 0.4101, Time: 1699.9ms
Iter 40110/50000, Train Loss: 3.7274, LR: 0.000002, GradNorm: 0.4044, Time: 1702.5ms
Iter 40120/50000, Train Loss: 3.7

In [94]:
print("--- Training Finished! ---")

# Save the model and optimizer as the final checkpoint, tagged with the
# configured maximum iteration count.
final_checkpoint_path = os.path.join(
    data_args['checkpoint_dir'],
    "model_final.pt",
)
total_iters = training_args['max_iters']
save(model, optimizer, total_iters, final_checkpoint_path)

--- Training Finished! ---
save checkpoints/model_final.pt iterations: 50000
