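# Training Together

Train a `CustomModule` language model on the pre-tokenized TinyStories data, log the loss to Weights & Biases, save checkpoints, and then sample text from the trained model.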

In [2]:
%load_ext autoreload
%autoreload 2
from pathlib import Path
import pickle

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import wandb
import os

from ece496b_basics.adapters import *
from ece496b_basics.model import CustomModule
from ece496b_basics.generator import generate_text

# --- Filesystem layout -------------------------------------------------------
# All locations are resolved to absolute paths once, so later cells do not
# depend on the kernel's working directory staying the same.
DATA_PATH = Path("../data").resolve()
OUTPUT_PATH = Path("outputs").resolve()
CHECKPOINT_DIR = Path("checkpoints").resolve()
# Create the checkpoint directory up front; exist_ok makes re-running this cell safe.
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

# Tokenizer artifacts and the encoded corpus that later cells load from OUTPUT_PATH.
ts_tokenized_path = OUTPUT_PATH / "tinystories_encoded.npy"
ts_vocab_path = OUTPUT_PATH / "tinystories_vocab.pkl"
ts_merges_path = OUTPUT_PATH / "tinystories_merges.pkl"

# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Set Hyperparameters

In [3]:
# Hyperparameters for the run. This plain dict is what the training cell hands
# to wandb.init(config=...), so every value here is recorded with the run.
# NOTE(review): attn_pdrop / residual_pdrop are declared here, but the training
# cell in this notebook does not pass them to CustomModule - confirm whether
# the model is meant to receive them.
config = dict(
    # Model architecture
    vocab_size=10_000,
    context_length=128,
    d_model=512,
    num_layers=4,
    num_heads=16,
    d_ff=2048,
    # Dropout probabilities
    attn_pdrop=0.1,
    residual_pdrop=0.1,
    # Optimisation schedule
    batch_size=128,
    num_epochs=5,
    epochs_per_checkpoint=5,
    steps_per_epoch=2500,
    learning_rate=0.001,
)

### Run Training

In [None]:
# Training Loop
#
# Starts a Weights & Biases run, builds the model and optimizer from the
# hyperparameters, trains for `num_epochs` epochs of `steps_per_epoch` random
# batches each, and writes checkpoints into CHECKPOINT_DIR.

# Register the hyperparameters with W&B, then read them back through
# wandb.config so the rest of the cell can use attribute access
# (config.vocab_size, ...). Note that this rebinds the notebook-level `config`.
wandb.init(project="training_together", config=config)
config = wandb.config

# Memory-map the encoded corpus instead of reading the whole array into RAM.
dataset = np.load(ts_tokenized_path, mmap_mode="r")

model = CustomModule(
    vocab_size=config.vocab_size,
    context_length=config.context_length,
    d_model=config.d_model,
    num_layers=config.num_layers,
    num_heads=config.num_heads,
    d_ff=config.d_ff,
    device=device,
)
optimizer = get_adamw_cls()(model.parameters(), lr=config.learning_rate)
# Learning-rate schedule (currently disabled; the sketch below still refers to
# an undefined `learning_rate` and would need config.learning_rate):
# scheduler = torch.optim.lr_scheduler.LambdaLR(
# optimizer, lr_lambda=lambda it: run_get_lr_cosine_schedule(
#     it, learning_rate, learning_rate * 0.1, 1000, 10000)
# )

# Index of the most recent epoch whose checkpoint was written (None = none yet).
last_saved_epoch = None
for epoch in range(config.num_epochs):
    # Reset per epoch so the summary below describes this epoch only, rather
    # than a running sum over every epoch so far.
    epoch_loss = 0.0
    for _ in range(config.steps_per_epoch):
        inputs, targets = run_get_batch(dataset, config.batch_size, config.context_length, device)
        # Forward pass
        outputs = model(inputs)
        # Compute loss over the flattened (position, vocab) predictions
        loss = run_cross_entropy(outputs.view(-1, config.vocab_size), targets.view(-1))
        optimizer.zero_grad()
        loss.backward()
        # Clip the gradients (max norm argument 1.0) before the update
        run_gradient_clipping(model.parameters(), 1.0)
        optimizer.step()

        # Convert to a Python float once and reuse it for both the running
        # total and the W&B log entry.
        step_loss = loss.item()
        epoch_loss += step_loss

        # The "Loss:" key is kept exactly as-is so the metric name matches earlier runs.
        wandb.log({"Loss:": step_loss, "epoch": epoch})

    mean_loss = epoch_loss / max(config.steps_per_epoch, 1)
    print(f"Epoch {epoch+1}/{config.num_epochs}, Total Loss: {epoch_loss}, Mean Loss: {mean_loss}")
    # Save a checkpoint after every `epochs_per_checkpoint` completed epochs.
    # `epoch` is zero-based, hence the +1. All checkpoints go to CHECKPOINT_DIR.
    if (epoch + 1) % config.epochs_per_checkpoint == 0:
        run_save_checkpoint(model, optimizer, epoch, CHECKPOINT_DIR / f"checkpoint_epoch_{epoch}.pt")
        last_saved_epoch = epoch

# Save final checkpoint if not already saved. Guarded so that a run with zero
# epochs does not reference an epoch that never happened.
final_epoch = config.num_epochs - 1
if final_epoch >= 0 and last_saved_epoch != final_epoch:
    run_save_checkpoint(model, optimizer, final_epoch, CHECKPOINT_DIR / f"checkpoint_epoch_{final_epoch}.pt")
# Finish wandb run
wandb.finish()

### Generate Some Text

In [21]:
# Sample text from the model left in memory by the training cell above.
#
# To generate from a saved checkpoint instead, rebuild the model/optimizer and
# load the weights first. The lines below are an unverified sketch kept for
# reference - NOTE(review): `load_checkpoint` is not called anywhere else in
# this notebook; confirm the loader's real name and arguments before enabling.
# model = CustomModule(
#     vocab_size=config.vocab_size,
#     context_length=config.context_length,
#     d_model=config.d_model,
#     num_layers=config.num_layers,
#     num_heads=config.num_heads,
#     d_ff=config.d_ff,
#     device=device,
# )
# optimizer = get_adamw_cls()(model.parameters(), lr=config.learning_rate)
# load_checkpoint(f"checkpoint_epoch_{config.num_epochs-1}.pt")

# Build the tokenizer from the saved vocab/merges, registering the
# end-of-text marker as a special token.
tokenizer = from_files(ts_vocab_path, ts_merges_path, special_tokens=["<|endoftext|>"])
prompt = "Mitochondria"
# Use the notebook-level `device` (the one the model was created on) rather
# than a second hard-coded device string.
text = generate_text(model, tokenizer, prompt, max_tokens=256, temperature=0.1, top_p=0.9, device=device)
print(f"Given prompt: {prompt}")
print(f"Generated:\n{text}")

Given prompt: Mitochondria
Generated:
Mitochondria. She was very excited to go to the park and play.
When she arrived at the park, she saw a big, green tree. She wanted to climb it and see what was on the other side. She started to climb the tree, but she was scared. She didn't want to get stuck.
Suddenly, she heard a voice. It was her mom. She said, "Don't be scared, I'm here to help you." She took a deep breath and started to climb the tree.
When she was at the top, she saw a beautiful park with lots of trees and flowers. She was so happy! She ran around and explored the park, and she had a wonderful time.
<|endoftext|>
Once upon a time, there was a little girl named Lily. Lily loved to play with her toys and eat yummy food. One day, she found a big box in her room. She was very curious about what was inside.
Lily opened the box and found a small, shiny toy. It was a magic wand! The wand could make things shrink. Lily was very excited and showed the wand to her mom. Her mom said, "Wo