## Build a Large Language Model from Scratch

### Load All Imports

In [1]:
import os
import sys
import json
import time
import yaml
import numpy as np
import numpy.typing as npt
import torch
import torch.nn as nn
from data import get_batch, load, save
from tokenizer import Tokenizer, train_bpe, PAT_GPT2, PAT_SPECIAL_TOKEN
from modules.layers import TransformerLM
from modules.loss import CrossEntropyLoss
from modules.activation import GLU, Softmax
from modules.optimizer import SGD, AdamW, compute_lr, gradient_cliping

#### 1. Load Config

In [2]:
# Read config.yaml and split it into the three sections used by the notebook.
print("--- Loading Configuration ---")
# The parsed YAML is bound to `config` rather than `load`: `load` is the
# checkpoint loader imported from `data`, and rebinding that name to a dict
# would make any later `load(path, model, optimizer)` call fail.
with open('config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

model_args = config['model_args']
training_args = config['training_args']
data_args = config['data_args']

# Create the checkpoint directory up front so later saves have a target.
os.makedirs(data_args['checkpoint_dir'], exist_ok=True)

--- Loading Configuration ---


#### 2. Initialization

In [None]:
# Build the device, tokenizer, model, optimizer and loss object that the
# remaining cells use as notebook globals.
print("--- Initializing ---")

# The configured device is honoured only when CUDA is available;
# otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = torch.device(training_args['device'])
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

tokenizer = Tokenizer.from_files(
    vocab_filepath=data_args['vocab_path'],
    merges_filepath=data_args['merges_path'],
)

# The vocabulary size passed to the model comes from the loaded tokenizer.
model_args.update(vocab_size=len(tokenizer.vocab))
print(f"Tokenizer loaded. Vocab size: {model_args['vocab_size']}")

# Construct the model, move it to the device, then wrap it with torch.compile.
model = TransformerLM(**model_args)
model = model.to(device)
model = torch.compile(model)
print(f"Model created with {model.get_num_params():,} parameters.")

optimizer = AdamW(model.parameters(), lr=training_args['learning_rate'])

# Loss object called as loss_init(logits, targets) by the later cells.
loss_init = CrossEntropyLoss()

--- Initializing ---
Using device: cpu
Tokenizer loaded. Vocab size: 10000
Model created with 17,576,448 parameters.


#### 3. Load data

In [4]:
# Open the training and validation token files as read-only uint16 memory maps.
print("--- Loading Data with np.memmap ---")
train_data, valid_data = (
    np.memmap(data_args[path_key], dtype=np.uint16, mode='r')
    for path_key in ('train_data_path', 'valid_data_path')
)
print(f"Train data tokens: {len(train_data):,}, Val data tokens: {len(valid_data):,}")

--- Loading Data with np.memmap ---
Train data tokens: 530,486,325, Val data tokens: 5,357,233


#### 4. Resume training

In [5]:
# Optionally resume from the checkpoint path given in the config.
#
# The checkpoint loader is imported under a dedicated alias so this cell does
# not depend on the notebook-global name `load`, which other cells rebind.
from data import load as load_checkpoint

start_iter = 0
# `.get` lets a config without the key behave like "no checkpoint" rather
# than raising KeyError.
resume_path = data_args.get('resume_from_checkpoint')
if resume_path:
    print(f"Resuming training from {resume_path}")
    # The training loop saves a checkpoint after finishing iteration `iter_num`,
    # so training continues from the following iteration. This matches the
    # auto-resume cell, which also adds 1 to the loaded iteration.
    start_iter = load_checkpoint(resume_path, model, optimizer) + 1

#### 5. Evaluation Function

In [6]:
@torch.no_grad()
def evaluate(eval_iters=100):
    """Estimate the validation loss by averaging over several batches.

    Batches are drawn with ``get_batch`` from ``valid_data``. The model is
    switched to eval mode for the duration of the call and is always put back
    into train mode afterwards, even if a batch raises.

    Args:
        eval_iters: Number of validation batches to average over. Defaults to
            100, the value that was previously hard-coded.

    Returns:
        The mean of the per-batch loss values over ``eval_iters`` batches.

    Raises:
        ValueError: If ``eval_iters`` is not positive.
    """
    if eval_iters <= 0:
        raise ValueError(f"eval_iters must be positive, got {eval_iters}")

    # Look the loop-invariant settings up once rather than on every batch.
    batch_size = training_args['batch_size']
    context_length = model_args['context_length']
    vocab_size = model_args['vocab_size']

    model.eval()
    try:
        valid_loss = 0.0
        for _ in range(eval_iters):
            x, y = get_batch(valid_data, batch_size, context_length, device)
            logits = model(x)
            # Flatten so the loss sees one row of logits per target token.
            loss = loss_init(logits.view(-1, vocab_size), y.view(-1))
            valid_loss += loss.item()
    finally:
        # Restore train mode even on failure so a caught error cannot leave
        # the training loop running with the model in eval mode.
        model.train()
    return valid_loss / eval_iters

#### 6. Begin Training Loop

In [10]:
import glob
import os
import re
from data import load

# Resume from the most advanced "model_iter_<N>.pt" checkpoint, if any exists.

_CKPT_ITER_RE = re.compile(r"model_iter_(\d+)\.pt$")


def _checkpoint_iteration(path):
    """Return the iteration number in a checkpoint filename, or -1 if absent."""
    match = _CKPT_ITER_RE.search(os.path.basename(path))
    return int(match.group(1)) if match else -1


ckpts = glob.glob(os.path.join(data_args['checkpoint_dir'], "model_iter_*.pt"))
if ckpts:
    # Choose by the iteration number in the filename rather than by
    # os.path.getctime: on Unix ctime is the inode change time, which is
    # altered by copying, moving or chmod-ing files, so it does not reliably
    # identify the checkpoint with the most training progress.
    latest_ckpt = max(ckpts, key=_checkpoint_iteration)
    # The loaded value is the iteration the checkpoint was saved at;
    # continue with the next one.
    start_iter = load(latest_ckpt, model, optimizer) + 1
else:
    print("From Scratch Training")
    start_iter = 0

load checkpoints/model_iter_1800.pt iterations: 1800


In [None]:
from torch.utils.tensorboard import SummaryWriter

# Main training loop. Each iteration sets the learning rate, runs one
# optimisation step on a training batch and logs to TensorBoard. Every
# `eval_interval` iterations it also evaluates and writes a checkpoint.

writer = SummaryWriter(log_dir="runs/llm_run")

train_losses = []
val_losses = []
iterations = []
eval_iterations = []

print("--- Starting Training Loop ---")
t0 = time.time()

# try/finally guarantees the TensorBoard writer is closed even when the loop
# is stopped early (e.g. by KeyboardInterrupt from interrupting the kernel).
try:
    for iter_num in range(start_iter, training_args['max_iters']):

        # Learning rate for this iteration, applied to every parameter group.
        lr = compute_lr(
            iter_num,
            training_args['learning_rate'],
            training_args['min_lr'],
            training_args['warmup_steps'],
            training_args['lr_decay_steps']
        )
        for pg in optimizer.param_groups:
            pg['lr'] = lr

        inputs, targets = get_batch(
            train_data,
            training_args['batch_size'],
            model_args['context_length'],
            device
        )
        logits = model(inputs)
        # Flatten so the loss sees one row of logits per target token.
        loss = loss_init(
            logits.view(-1, model_args['vocab_size']),
            targets.view(-1)
        )

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        # Clip gradients in place; the returned value is the total norm
        # measured before clipping, which is what gets logged.
        total_norm = torch.nn.utils.clip_grad_norm_(
            model.parameters(),
            training_args['gradient_clip_val']
        )
        optimizer.step()

        # Read the scalar loss once and reuse it, instead of calling
        # loss.item() for each consumer.
        loss_value = loss.item()
        train_losses.append(loss_value)
        iterations.append(iter_num)

        writer.add_scalar("Loss/train", loss_value, iter_num)
        writer.add_scalar("LR", lr, iter_num)
        writer.add_scalar("GradNorm/total", total_norm, iter_num)

        if iter_num % 10 == 0:
            t1 = time.time()
            # The reported time is the wall-clock time since the previous log
            # line, i.e. it spans all iterations in between.
            print(
                f"Iter {iter_num}/{training_args['max_iters']}, "
                f"Train Loss: {loss_value:.4f}, "
                f"LR: {lr:.6f}, "
                f"GradNorm: {total_norm:.4f}, "
                f"Time: {(t1-t0)*1000:.1f}ms"
            )
            t0 = t1

        if iter_num > 0 and iter_num % training_args['eval_interval'] == 0:
            val_loss = evaluate()
            val_losses.append(val_loss)
            eval_iterations.append(iter_num)

            writer.add_scalar("Loss/val", val_loss, iter_num)

            print(f"--- Eval at iter {iter_num}: Val Loss: {val_loss:.4f} ---")
            checkpoint_path = os.path.join(
                data_args['checkpoint_dir'],
                f"model_iter_{iter_num}.pt"
            )
            save(model, optimizer, iter_num, checkpoint_path)
finally:
    writer.close()

print("Training complete! Logs written to runs/llm_run")

--- Starting Training Loop ---
Iter 1810/5000, Train Loss: 2.0309, LR: 0.000227, GradNorm: 0.4491, Time: 43163.2ms
Iter 1820/5000, Train Loss: 1.9783, LR: 0.000226, GradNorm: 0.4672, Time: 44995.2ms
Iter 1830/5000, Train Loss: 1.9211, LR: 0.000225, GradNorm: 0.4525, Time: 43732.2ms
Iter 1840/5000, Train Loss: 1.9868, LR: 0.000224, GradNorm: 0.4544, Time: 43826.7ms
Iter 1850/5000, Train Loss: 2.0341, LR: 0.000224, GradNorm: 0.4633, Time: 41222.0ms
Iter 1860/5000, Train Loss: 1.9187, LR: 0.000223, GradNorm: 0.4503, Time: 41311.7ms
Iter 1870/5000, Train Loss: 1.8880, LR: 0.000222, GradNorm: 0.4511, Time: 49969.2ms
Iter 1880/5000, Train Loss: 1.9195, LR: 0.000221, GradNorm: 0.4569, Time: 51677.9ms
Iter 1890/5000, Train Loss: 1.9518, LR: 0.000220, GradNorm: 0.4443, Time: 41408.6ms
Iter 1900/5000, Train Loss: 2.0176, LR: 0.000220, GradNorm: 0.4444, Time: 41096.3ms
Iter 1910/5000, Train Loss: 1.9700, LR: 0.000219, GradNorm: 0.4469, Time: 38377.6ms
Iter 1920/5000, Train Loss: 1.9611, LR: 0.000

KeyboardInterrupt: 

In [8]:
# Write the final checkpoint ("model_final.pt") into the checkpoint directory.
print("--- Training Finished! ---")
final_checkpoint_path = os.path.join(
    data_args['checkpoint_dir'],
    "model_final.pt",
)
# NOTE(review): the iteration passed here is always `max_iters`, regardless of
# how many iterations actually ran — confirm that is intended when this cell
# is executed after the training loop was stopped early.
save(
    model,
    optimizer,
    training_args['max_iters'],
    final_checkpoint_path,
)

--- Training Finished! ---
save checkpoints/model_final.pt iterations: 5000
