# Training Together 

In [1]:
%load_ext autoreload
%autoreload 2
from functools import partial
from pathlib import Path
import pickle

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import wandb
import os

from ece496b_basics.adapters import *
from ece496b_basics.model import CustomModule

# Project directories, resolved to absolute paths against the kernel's
# current working directory at the time this cell runs.
DATA_PATH = Path("../data").resolve()
OUTPUT_PATH = Path("outputs").resolve()
# Array that train() memory-maps with np.load; presumably the tokenized
# TinyStories corpus (inferred from the file name -- confirm).
ts_tokenized_path = OUTPUT_PATH / "tinystories_encoded.npy"

# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing at model construction.
device = "cuda" if torch.cuda.is_available() else "cpu"

### Parameter Sweep

In [2]:
# Hyperparameters shared by every run of the sweep. They are passed to
# wandb.init() as defaults; the sweep itself only varies the learning rate.
config = dict(
    # Model shape (forwarded to CustomModule by train()).
    vocab_size=10_000,
    context_length=128,
    d_model=512,
    num_layers=4,
    num_heads=16,
    d_ff=2048,
    # Dropout rates: recorded in the run config, but train() in this
    # notebook does not read them.
    attn_pdrop=0.1,
    residual_pdrop=0.1,
    # Batching and run length.
    batch_size=128,
    num_steps=2500,
    # Recorded in the run config only; train() in this notebook does not
    # read them.
    epochs=1,
    epochs_per_checkpoint=1,
)

# Grid search over the learning rate: one run per listed value.
sweep_config = dict(
    method="grid",
    parameters=dict(
        learning_rate=dict(
            values=[1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3],
        ),
    ),
)

# Register the sweep under the project; the returned id is what the agent
# cell uses to pull runs.
sweep_id = wandb.sweep(sweep_config, project="training_together")

Create sweep with ID: 56lr8wuy
Sweep URL: https://wandb.ai/alvinyang101-university-of-hawaii-at-manoa/training_together/sweeps/56lr8wuy


### Sweep Learning Rate

In [3]:
# Training Loop
def train(config):
    """Train one model for a single sweep run and log the per-step loss.

    Args:
        config: Default hyperparameters handed to ``wandb.init``. Inside the
            function all values are read back from ``wandb.config``, which
            is also where ``learning_rate`` is read from.

    Reads the module-level ``ts_tokenized_path`` and ``device``. Returns
    nothing; results are reported through ``wandb.log``.
    """
    # Using the run as a context manager finishes it on exit, including
    # when training raises, so a crashed run is not left open.
    with wandb.init(config=config) as run:
        cfg = wandb.config
        run.name = f"lr_{cfg.learning_rate}"

        # Memory-map the token array instead of reading it fully into RAM.
        dataset = np.load(ts_tokenized_path, mmap_mode="r")
        model = CustomModule(
            vocab_size=cfg.vocab_size,
            context_length=cfg.context_length,
            d_model=cfg.d_model,
            num_layers=cfg.num_layers,
            num_heads=cfg.num_heads,
            d_ff=cfg.d_ff,
            device=device,
        )
        # No LR scheduler is attached: the learning rate passed here is used
        # unchanged for the whole run.
        optimizer = get_adamw_cls()(model.parameters(), lr=cfg.learning_rate)

        for _ in range(cfg.num_steps):
            # Get batch
            inputs, targets = run_get_batch(
                dataset, cfg.batch_size, cfg.context_length, device
            )

            # Forward pass; flatten so the loss sees one row of logits per
            # target entry.
            outputs = model(inputs)
            loss = run_cross_entropy(
                outputs.view(-1, cfg.vocab_size), targets.view(-1)
            )

            # Backpropagation, with the clipping helper called at 1.0
            # before the optimizer step.
            optimizer.zero_grad()
            loss.backward()
            run_gradient_clipping(model.parameters(), 1.0)
            optimizer.step()

            # The metric key (including its trailing colon) is kept as-is so
            # new runs line up with the ones already logged in this project.
            wandb.log({"Loss:": loss.item(), "learning_rate": cfg.learning_rate})

In [4]:
# Size of the grid: the product of the number of candidate values listed
# for each swept parameter.
num_combinations = np.prod(
    list(map(lambda spec: len(spec["values"]), sweep_config["parameters"].values()))
)

# Run that many trials; partial() pre-binds the shared defaults as
# train()'s only argument.
wandb.agent(
    sweep_id,
    function=partial(train, config),
    count=int(num_combinations),
)

[34m[1mwandb[0m: Agent Starting Run: 0h3as4z0 with config:
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: Currently logged in as: [33malvinyang101[0m ([33malvinyang101-university-of-hawaii-at-manoa[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


0,1
Loss:,██▇▅▅▅▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Loss:,3.98904
learning_rate,1e-05


[34m[1mwandb[0m: Agent Starting Run: wjdmy8c0 with config:
[34m[1mwandb[0m: 	learning_rate: 0.0001


0,1
Loss:,█▆▆▄▄▄▄▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▂▁▁▂▁▁▁▁▁▁▁▁▁
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Loss:,2.7046
learning_rate,0.0001


[34m[1mwandb[0m: Agent Starting Run: 3s0j1823 with config:
[34m[1mwandb[0m: 	learning_rate: 0.001


0,1
Loss:,█▇▇▇▇▅▅▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▂▁▁▁▁▁▁▁
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Loss:,1.99614
learning_rate,0.001


[34m[1mwandb[0m: Agent Starting Run: qs8lytdy with config:
[34m[1mwandb[0m: 	learning_rate: 0.01


0,1
Loss:,██▄▂▁▁▂▂▂▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▄▃▃▄▃▄▃▂▃▃▂▂▂
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Loss:,4.39612
learning_rate,0.01


[34m[1mwandb[0m: Agent Starting Run: h1e29ozl with config:
[34m[1mwandb[0m: 	learning_rate: 0.1


0,1
Loss:,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Loss:,5.38687
learning_rate,0.1


[34m[1mwandb[0m: Agent Starting Run: dcbqiqgo with config:
[34m[1mwandb[0m: 	learning_rate: 1


0,1
Loss:,█▄▂▂▁▁▁▂▂▃▃▂▂▃▂▃▂▂▂▂▃▂▂▂▃▂▃▁▃▃▄▃▁▂▂▃▂▁▂▃
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Loss:,79.12352
learning_rate,1.0


[34m[1mwandb[0m: Agent Starting Run: zv98whss with config:
[34m[1mwandb[0m: 	learning_rate: 10


0,1
Loss:,▃▁▁▃▁▃▃▃▂▄▄▃▄▂▆▁▃▃▄▃▃▃▃▂▃▄▂▃▄▃▅▄▁█▃▄▃▆▃▃
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Loss:,22320.82227
learning_rate,10.0


[34m[1mwandb[0m: Agent Starting Run: xfqyn2qv with config:
[34m[1mwandb[0m: 	learning_rate: 100


0,1
Loss:,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Loss:,9.21034
learning_rate,100.0


[34m[1mwandb[0m: Agent Starting Run: trvmohmg with config:
[34m[1mwandb[0m: 	learning_rate: 1000


0,1
Loss:,▁
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Loss:,
learning_rate,1000.0
