In [1]:
# ====== 1. Environment prep ======
!git clone https://github.com/karpathy/nanoGPT.git
%cd nanoGPT

!pip install torch numpy transformers datasets tiktoken wandb tqdm psutil

import os
import itertools
import json
import psutil
import torch

# We'll make a directory to hold all experiment outputs/checkpoints
os.makedirs("all_experiments", exist_ok=True)

Cloning into 'nanoGPT'...
remote: Enumerating objects: 686, done.[K
remote: Total 686 (delta 0), reused 0 (delta 0), pack-reused 686 (from 1)[K
Receiving objects: 100% (686/686), 974.06 KiB | 32.47 MiB/s, done.
Resolving deltas: 100% (380/380), done.
/content/nanoGPT


In [2]:
# ====== 2. Define the fixed + sweep hyperparams ======
FIXED = {
    "block_size": 128,
    "n_layer": 6,
}

SWEEP = {
    "n_head":       [4, 8],
    "n_embd":       [128, 256],
    "batch_size":   [8, 16],
    "max_iters":    [1000, 2000],
    "dropout":      [0.1, 0.2],
}

# cartesian product over SWEEP
keys = list(SWEEP.keys())
all_settings = []
for values in itertools.product(*[SWEEP[k] for k in keys]):
    cfg = {k:v for k,v in zip(keys, values)}
    cfg.update(FIXED)
    all_settings.append(cfg)

print(f"Total experiments: {len(all_settings)}")  # should be 32

# preview a few configs
for i, cfg in enumerate(all_settings[:3]):
    print(f"[exp {i}] {cfg}")

Total experiments: 32
[exp 0] {'n_head': 4, 'n_embd': 128, 'batch_size': 8, 'max_iters': 1000, 'dropout': 0.1, 'block_size': 128, 'n_layer': 6}
[exp 1] {'n_head': 4, 'n_embd': 128, 'batch_size': 8, 'max_iters': 1000, 'dropout': 0.2, 'block_size': 128, 'n_layer': 6}
[exp 2] {'n_head': 4, 'n_embd': 128, 'batch_size': 8, 'max_iters': 2000, 'dropout': 0.1, 'block_size': 128, 'n_layer': 6}


In [3]:
# ====== 3. Helper: generate a unique run name + output dir per config ======
def make_run_id(cfg, idx):
    """
    Create a short, readable identifier for the run.
    We'll include only the swept params, since block_size and n_layer are fixed.
    """
    return (
        f"exp{idx:02d}"
        f"_head{cfg['n_head']}"
        f"_emb{cfg['n_embd']}"
        f"_bs{cfg['batch_size']}"
        f"_it{cfg['max_iters']}"
        f"_do{cfg['dropout']}"
    )

experiment_specs = []
for idx, cfg in enumerate(all_settings):
    run_id = make_run_id(cfg, idx)
    out_dir = os.path.join("all_experiments", run_id)
    os.makedirs(out_dir, exist_ok=True)
    spec = {
        "run_id": run_id,
        "out_dir": out_dir,
        **cfg
    }
    experiment_specs.append(spec)

print("Example spec:")
print(json.dumps(experiment_specs[0], indent=2))


Example spec:
{
  "run_id": "exp00_head4_emb128_bs8_it1000_do0.1",
  "out_dir": "all_experiments/exp00_head4_emb128_bs8_it1000_do0.1",
  "n_head": 4,
  "n_embd": 128,
  "batch_size": 8,
  "max_iters": 1000,
  "dropout": 0.1,
  "block_size": 128,
  "n_layer": 6
}


In [4]:
# ====== 4. Helper: parameter count + memory usage snapshot ======
def model_size_report(n_layer, n_head, n_embd, block_size, vocab_size=50304):
    """
    We'll reuse the GPTConfig/GPT later to get exact param counts,
    but this gives you a quick estimate placeholder right now.
    We'll integrate exact numbers in the training loop step.
    """
    dummy = {
        "n_layer": n_layer,
        "n_head": n_head,
        "n_embd": n_embd,
        "block_size": block_size,
        "vocab_size": vocab_size,
    }
    return dummy

print("Memory info (host machine right now):")
print(psutil.virtual_memory())
print("GPU available:", torch.cuda.is_available())

Memory info (host machine right now):
svmem(total=13605855232, available=12198588416, percent=10.3, used=1071562752, free=8133046272, active=525000704, inactive=4483432448, buffers=140955648, cached=4260290560, shared=2088960, slab=295710720)
GPU available: True


In [5]:
import time
import math
import pickle
import psutil
import torch
import torch.nn.functional as F
from torch import nn
from contextlib import nullcontext

# import GPT model definition from nanoGPT repo
from model import GPT, GPTConfig

# We'll create a lightweight dataset loader similar to get_batch() in train.py
def build_data_loader(data_dir, block_size, batch_size, device_type, device):
    """
    Returns a callable get_batch(split) -> (X, Y)
    Expects data_dir/train.bin and data_dir/val.bin (uint16 token streams).
    """
    import numpy as np
    train_bin = os.path.join(data_dir, "train.bin")
    val_bin   = os.path.join(data_dir, "val.bin")

    def get_batch(split):
        data_path = train_bin if split == "train" else val_bin
        data = np.memmap(data_path, dtype=np.uint16, mode='r')
        ix = torch.randint(len(data) - block_size, (batch_size,))
        x = torch.stack([
            torch.from_numpy((data[i:i+block_size]).astype(np.int64))
            for i in ix
        ])
        y = torch.stack([
            torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64))
            for i in ix
        ])
        if device_type == "cuda":
            x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
        else:
            x, y = x.to(device), y.to(device)
        return x, y

    return get_batch

@torch.no_grad()
def estimate_loss(model, get_batch, eval_iters, ctx):
    """
    Compute mean train/val loss over eval_iters batches each.
    """
    model.eval()
    losses = {}
    for split in ["train", "val"]:
        split_losses = []
        for _ in range(eval_iters):
            X, Y = get_batch(split)
            with ctx:
                logits, loss = model(X, Y)
            split_losses.append(loss.item())
        losses[split] = sum(split_losses) / len(split_losses)
    model.train()
    return losses


def count_parameters(model):
    """
    Return total number of parameters that require grad.
    """
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def gpu_memory_allocated(device):
    if "cuda" in device:
        return torch.cuda.memory_allocated(device)
    return 0


def train_one_experiment(
    spec,
    data_dir="data/shakespeare_char",   # <-- you can change dataset here
    vocab_size_fallback=50304,
    eval_interval=200,                  # how often to eval & checkpoint
    eval_iters=20,                      # how many batches for eval avg
    log_interval=10,                    # how often to print/record iter stats
    weight_decay=1e-1,
    beta1=0.9,
    beta2=0.95,
    learning_rate=6e-4,
    warmup_iters=100,
    grad_clip=1.0,
    wandb_project="nano-sweep",
):

    """
    spec is one dict from experiment_specs we built in Part 1.
    Required keys:
      run_id, out_dir, n_head, n_embd, batch_size, max_iters,
      dropout, block_size, n_layer
    """

    run_id    = spec["run_id"]
    out_dir   = spec["out_dir"]
    n_head    = spec["n_head"]
    n_embd    = spec["n_embd"]
    batch_size= spec["batch_size"]
    max_iters = spec["max_iters"]
    dropout   = spec["dropout"]
    block_size= spec["block_size"]
    n_layer   = spec["n_layer"]

    # device / dtype setup (single GPU or CPU in Colab)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    device_type = "cuda" if "cuda" in device else "cpu"
    dtype = "bfloat16" if (device_type=="cuda" and torch.cuda.is_bf16_supported()) else "float16"
    ptdtype = {
        "float32": torch.float32,
        "bfloat16": torch.bfloat16,
        "float16": torch.float16,
    }[dtype]
    ctx = nullcontext() if device_type == "cpu" else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

    torch.manual_seed(1337)
    if device_type == "cuda":
        torch.cuda.manual_seed(1337)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # === DATA ===
    get_batch = build_data_loader(
        data_dir=data_dir,
        block_size=block_size,
        batch_size=batch_size,
        device_type=device_type,
        device=device
    )

    # try to infer vocab_size from meta.pkl (like nanoGPT does)
    meta_path = os.path.join(data_dir, "meta.pkl")
    vocab_size = vocab_size_fallback
    if os.path.exists(meta_path):
        with open(meta_path, "rb") as f:
            meta = pickle.load(f)
        if "vocab_size" in meta:
            vocab_size = meta["vocab_size"]

    # === MODEL ===
    gptconf = GPTConfig(
        block_size=block_size,
        vocab_size=vocab_size,
        n_layer=n_layer,
        n_head=n_head,
        n_embd=n_embd,
        dropout=dropout,
        bias=False,
    )
    model = GPT(gptconf).to(device)
    model.train()

    # === OPTIMIZER ===
    # same grouping trick as nanoGPT.configure_optimizers()
    decay_params = [p for p in model.parameters() if p.requires_grad and p.dim() >= 2]
    nodecay_params = [p for p in model.parameters() if p.requires_grad and p.dim() < 2]
    optim_groups = [
        {"params": decay_params,   "weight_decay": weight_decay},
        {"params": nodecay_params, "weight_decay": 0.0},
    ]
    fused_available = "fused" in torch.optim.AdamW.__init__.__code__.co_varnames
    use_fused = fused_available and device_type == "cuda"
    optimizer = torch.optim.AdamW(
        optim_groups,
        lr=learning_rate,
        betas=(beta1, beta2),
        fused=use_fused
    )

    scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))

    # === LR scheduler ===
    def get_lr(it):
        # warmup
        if it < warmup_iters:
            return learning_rate * (it + 1) / (warmup_iters + 1)
        # cosine decay to ~0
        # (for simplicity we decay to ~0 instead of min_lr here;
        # you can add a min_lr param if you want 6e-5 floor like train.py)
        decay_ratio = (it - warmup_iters) / max(1, (max_iters - warmup_iters))
        decay_ratio = min(max(decay_ratio, 0.0), 1.0)
        coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
        return learning_rate * coeff

    # === W&B logging ===
    import wandb
    wandb.init(
        project=wandb_project,
        name=run_id,
        config={
            "run_id": run_id,
            "n_head": n_head,
            "n_embd": n_embd,
            "batch_size": batch_size,
            "max_iters": max_iters,
            "dropout": dropout,
            "block_size": block_size,
            "n_layer": n_layer,
            "device": device,
        },
    )

    # track best val
    best_val_loss = float("inf")
    best_ckpt_path = os.path.join(out_dir, "ckpt_best.pt")

    t0 = time.time()
    for it in range(max_iters):
        # set lr
        lr = get_lr(it)
        for pg in optimizer.param_groups:
            pg["lr"] = lr

        # fetch batch
        X, Y = get_batch("train")

        # forward
        with ctx:
            logits, loss = model(X, Y)
            loss_val = loss / 1.0  # no grad accumulation here (accum=1)

        # backward
        scaler.scale(loss_val).backward()

        # grad clip
        if grad_clip > 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

        # step
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad(set_to_none=True)

        # logging loop
        if (it % log_interval == 0) or (it == max_iters - 1):
            # training stats
            loss_float = loss.item()

            # eval stats
            losses = estimate_loss(model, get_batch, eval_iters, ctx)
            train_loss_eval = losses["train"]
            val_loss_eval   = losses["val"]
            loss_gap        = train_loss_eval - val_loss_eval

            # model size / memory
            num_params = count_parameters(model)
            host_mem   = psutil.virtual_memory().percent
            gpu_mem    = gpu_memory_allocated(device)

            dt = time.time() - t0
            t0 = time.time()

            print(f"[{run_id}] iter {it}/{max_iters} | "
                  f"train_now {loss_float:.4f} | "
                  f"train_eval {train_loss_eval:.4f} | "
                  f"val {val_loss_eval:.4f} | "
                  f"gap {loss_gap:.4f} | "
                  f"lr {lr:.2e} | "
                  f"params {num_params/1e6:.2f}M | "
                  f"host_mem {host_mem:.1f}% | "
                  f"gpu_mem {gpu_mem/1e6:.1f}MB | "
                  f"{dt*1000:.1f} ms/it")

            # log to wandb
            wandb.log({
                "iter": it,
                "loss/train_iter": loss_float,              # instant train loss
                "loss/train_eval": train_loss_eval,         # eval() train split
                "loss/val": val_loss_eval,                  # eval() val split
                "loss/gap(train-val)": loss_gap,
                "lr": lr,
                "num_params": num_params,
                "host_mem_percent": host_mem,
                "gpu_mem_bytes": gpu_mem,
            })

            # save best model by val loss
            if val_loss_eval < best_val_loss:
                best_val_loss = val_loss_eval
                torch.save(
                    {
                        "model_state_dict": model.state_dict(),
                        "iter": it,
                        "val_loss": val_loss_eval,
                        "config": {
                            "run_id": run_id,
                            "n_head": n_head,
                            "n_embd": n_embd,
                            "batch_size": batch_size,
                            "max_iters": max_iters,
                            "dropout": dropout,
                            "block_size": block_size,
                            "n_layer": n_layer,
                        },
                    },
                    best_ckpt_path,
                )

    wandb.finish()
    return {
        "run_id": run_id,
        "out_dir": out_dir,
        "best_val_loss": best_val_loss,
        "best_ckpt_path": best_ckpt_path,
        "num_params": count_parameters(model),
    }

print("train_one_experiment() is defined.")


train_one_experiment() is defined.


In [6]:
!python data/shakespeare_char/prepare.py


length of dataset in characters: 1,115,394
all the unique characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab size: 65
train has 1,003,854 tokens
val has 111,540 tokens


In [7]:
!ls -lh data/shakespeare_char


total 3.3M
-rw-r--r-- 1 root root 1.1M Oct 28 04:50 input.txt
-rw-r--r-- 1 root root  703 Oct 28 04:50 meta.pkl
-rw-r--r-- 1 root root 2.3K Oct 28 04:50 prepare.py
-rw-r--r-- 1 root root  209 Oct 28 04:50 readme.md
-rw-r--r-- 1 root root 2.0M Oct 28 04:50 train.bin
-rw-r--r-- 1 root root 218K Oct 28 04:50 val.bin


In [8]:
import os
import json
import time
import wandb

# --------- 1. AUTO-LOGIN TO WANDB (no prompt) ---------
os.environ["WANDB_API_KEY"] = "4169865ba6d683df54b62381e15e7896561ee8f1"  # <-- <- <- IMPORTANT
os.environ["WANDB_PROJECT"] = "nanoGPT_Project"           # or any project name you like
os.environ["WANDB_ENTITY"]  = "arunjung1991"                     # optional: your wandb username or team

wandb.login(key=os.environ["WANDB_API_KEY"])

print("wandb logged in as:", wandb.api.api_key[:4] + "...")


# --------- 2. Directory to store sweep summary ---------
os.makedirs("all_experiments", exist_ok=True)
summary_path = os.path.join("all_experiments", "sweep_summary.jsonl")

# we'll append 1 JSON line per finished run so you can come back even if Colab disconnects
def append_summary(result_dict):
    with open(summary_path, "a") as f:
        f.write(json.dumps(result_dict) + "\n")


# --------- 3. Safety helper: GPU memory reset each run ---------
def cleanup_gpu():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()


# --------- 4. Main sweep loop ---------
def run_full_sweep(
    experiment_specs,
    data_dir="data/shakespeare_char",  # <-- change if you're training on a different dataset
    wandb_project="nano-sweep",
    max_runs=None,  # set like 2 to just test; None = run all
):
    results = []
    total_runs = len(experiment_specs) if max_runs is None else min(max_runs, len(experiment_specs))

    print(f"Starting sweep of {total_runs} experiment(s).")

    for run_idx, spec in enumerate(experiment_specs):
        if max_runs is not None and run_idx >= max_runs:
            break

        print("\n" + "="*80)
        print(f" >>> RUN {run_idx+1}/{total_runs}: {spec['run_id']}")
        print("="*80)

        # small sleep just so wandb/rate limits / Colab GPU have a breath between runs
        time.sleep(2)

        # train one experiment using Part 2's function
        try:
            result = train_one_experiment(
                spec,
                data_dir=data_dir,
                wandb_project=wandb_project,
                # you can also override eval_interval, eval_iters, etc. here if needed
            )
        except RuntimeError as e:
            # OOM or other runtime error - record it and continue
            result = {
                "run_id": spec["run_id"],
                "out_dir": spec["out_dir"],
                "error": str(e),
            }
            print(f"[{spec['run_id']}] ERROR DURING TRAINING:", e)

        # persist result to disk immediately
        append_summary(result)
        results.append(result)

        # try to clean up VRAM before next model
        cleanup_gpu()

    print("\nSweep finished.")
    print(f"Sweep summary written line-by-line to {summary_path}")
    return results


# --------- 5. Run it on all 32 (or test with fewer first) ---------
# WARNING: running all 32 sequentially in Colab will take a long time and likely hit runtime limits.
# Start with max_runs=1 or 2 to confirm everything works.

results_preview = run_full_sweep(
    experiment_specs,
    data_dir="data/shakespeare_char",  # make sure this dataset is prepared (train.bin/val.bin/meta.pkl)
    wandb_project="nano-sweep",
    max_runs=None,                        # <-- set to None to do all 32 for real
)

print("Preview results of first few runs:")
print(json.dumps(results_preview, indent=2))


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33marunjung1991[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


wandb logged in as: 4169...
Starting sweep of 32 experiment(s).

 >>> RUN 1/32: exp00_head4_emb128_bs8_it1000_do0.1
number of parameters: 1.19M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp00_head4_emb128_bs8_it1000_do0.1] iter 0/1000 | train_now 4.1911 | train_eval 4.1830 | val 4.1794 | gap 0.0035 | lr 5.94e-06 | params 1.21M | host_mem 16.5% | gpu_mem 31.7MB | 1283.3 ms/it
[exp00_head4_emb128_bs8_it1000_do0.1] iter 10/1000 | train_now 3.8013 | train_eval 3.7765 | val 3.7849 | gap -0.0084 | lr 6.53e-05 | params 1.21M | host_mem 16.4% | gpu_mem 31.7MB | 521.2 ms/it
[exp00_head4_emb128_bs8_it1000_do0.1] iter 20/1000 | train_now 3.6570 | train_eval 3.6224 | val 3.6240 | gap -0.0016 | lr 1.25e-04 | params 1.21M | host_mem 16.4% | gpu_mem 31.7MB | 418.5 ms/it
[exp00_head4_emb128_bs8_it1000_do0.1] iter 30/1000 | train_now 3.4199 | train_eval 3.3848 | val 3.3993 | gap -0.0145 | lr 1.84e-04 | params 1.21M | host_mem 16.4% | gpu_mem 31.7MB | 1014.8 ms/it
[exp00_head4_emb128_bs8_it1000_do0.1] iter 40/1000 | train_now 3.2136 | train_eval 3.2053 | val 3.2270 | gap -0.0217 | lr 2.44e-04 | params 1.21M | host_mem 16.3% | gpu_mem 31.7MB | 1160.5 ms/it
[exp00_head4_emb128_bs8_it100

0,1
gpu_mem_bytes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,██▆▅▃▁▁▁▁▁▁▁▁▃▃▃▃▃▃▃▃▃▃▃▅▁▁▃▃▃▃▃▃▃▃▁▆▆▆▆
iter,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇███
loss/gap(train-val),▄▃▁▆▅▆▇▃▅▃▂▃▃▃▁▅▄▃▄▃▃▄▄▄▂▅▇█▅▅▃▇▂▅▃▆▂▂▅▁
loss/train_eval,█▆▆▅▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▇▆▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂▂▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁
loss/val,█▇▆▅▄▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▂▃▄▅▆▇████▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▄▄▄▄▃▂▂▂▂▂▂▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,31682048.0
host_mem_percent,16.3
iter,999.0
loss/gap(train-val),-0.01839
loss/train_eval,2.30672
loss/train_iter,2.33032
loss/val,2.32512
lr,0.0
num_params,1206016.0



 >>> RUN 2/32: exp01_head4_emb128_bs8_it1000_do0.2
number of parameters: 1.19M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp01_head4_emb128_bs8_it1000_do0.2] iter 0/1000 | train_now 4.1932 | train_eval 4.1835 | val 4.1798 | gap 0.0036 | lr 5.94e-06 | params 1.21M | host_mem 16.1% | gpu_mem 36.5MB | 279.5 ms/it
[exp01_head4_emb128_bs8_it1000_do0.2] iter 10/1000 | train_now 3.8223 | train_eval 3.7833 | val 3.7920 | gap -0.0087 | lr 6.53e-05 | params 1.21M | host_mem 16.1% | gpu_mem 36.5MB | 434.0 ms/it
[exp01_head4_emb128_bs8_it1000_do0.2] iter 20/1000 | train_now 3.6810 | train_eval 3.6473 | val 3.6525 | gap -0.0053 | lr 1.25e-04 | params 1.21M | host_mem 16.1% | gpu_mem 36.5MB | 418.9 ms/it
[exp01_head4_emb128_bs8_it1000_do0.2] iter 30/1000 | train_now 3.4656 | train_eval 3.4169 | val 3.4296 | gap -0.0126 | lr 1.84e-04 | params 1.21M | host_mem 16.2% | gpu_mem 36.5MB | 540.9 ms/it
[exp01_head4_emb128_bs8_it1000_do0.2] iter 40/1000 | train_now 3.2548 | train_eval 3.2245 | val 3.2463 | gap -0.0218 | lr 2.44e-04 | params 1.21M | host_mem 16.2% | gpu_mem 36.5MB | 558.7 ms/it
[exp01_head4_emb128_bs8_it1000_d

0,1
gpu_mem_bytes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▁▁▅▅▅██████▅▅▅▅▅█▁▁▁▁▁▁▁▁██████████▅▅▅▅▅
iter,▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
loss/gap(train-val),▄▄▁▂▅▃▅▅▄▄▃▂▇▅▄▅▄▄▃▄▄▅▅▄▁▅▄▄▆▅▄▆▆▅▄▅▃▅█▃
loss/train_eval,█▆▆▄▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▂▁▁▁▁▁▂▂▁▁▁▁▁▁▁▁▂▂▁
loss/val,█▆▄▄▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▁▂▅▅▆████▇▇▇▇▆▆▅▅▅▄▄▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,36506112.0
host_mem_percent,16.2
iter,999.0
loss/gap(train-val),-0.01249
loss/train_eval,2.36679
loss/train_iter,2.38063
loss/val,2.37928
lr,0.0
num_params,1206016.0



 >>> RUN 3/32: exp02_head4_emb128_bs8_it2000_do0.1
number of parameters: 1.19M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp02_head4_emb128_bs8_it2000_do0.1] iter 0/2000 | train_now 4.1911 | train_eval 4.1830 | val 4.1794 | gap 0.0035 | lr 5.94e-06 | params 1.21M | host_mem 16.2% | gpu_mem 36.5MB | 285.1 ms/it
[exp02_head4_emb128_bs8_it2000_do0.1] iter 10/2000 | train_now 3.8013 | train_eval 3.7765 | val 3.7849 | gap -0.0084 | lr 6.53e-05 | params 1.21M | host_mem 16.2% | gpu_mem 36.5MB | 415.1 ms/it
[exp02_head4_emb128_bs8_it2000_do0.1] iter 20/2000 | train_now 3.6570 | train_eval 3.6224 | val 3.6240 | gap -0.0016 | lr 1.25e-04 | params 1.21M | host_mem 16.2% | gpu_mem 36.5MB | 443.3 ms/it
[exp02_head4_emb128_bs8_it2000_do0.1] iter 30/2000 | train_now 3.4199 | train_eval 3.3848 | val 3.3993 | gap -0.0145 | lr 1.84e-04 | params 1.21M | host_mem 16.2% | gpu_mem 36.5MB | 422.4 ms/it
[exp02_head4_emb128_bs8_it2000_do0.1] iter 40/2000 | train_now 3.2136 | train_eval 3.2053 | val 3.2270 | gap -0.0217 | lr 2.44e-04 | params 1.21M | host_mem 16.2% | gpu_mem 36.5MB | 551.8 ms/it
[exp02_head4_emb128_bs8_it2000_d

0,1
gpu_mem_bytes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▅▅▅▅▅▅▅▅▅▅▅▅▁▁███████▅█▅▅▅▅▅▅▁▅▅▅▅▅▅▅▅▅█
iter,▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇█████
loss/gap(train-val),▇▆▆▅▇▆▆▅▄▇█▆█▇██▇▇██▃▄▅▄▄▅▅▆▃▅▇▆▂▃▅▅▃▄▁▄
loss/train_eval,█▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▇▇█▇▇▆▇▆▅▅▅▅▆▅▄▃▂▄▂▂▃▂▁▃▃▂▂▂▂▂▃▃▃▂▂▂▁▂▂
loss/val,█▇▆▅▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▂▄▅██████▇▇▇▆▆▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,36506112.0
host_mem_percent,16.3
iter,1999.0
loss/gap(train-val),-0.05716
loss/train_eval,2.11546
loss/train_iter,2.23213
loss/val,2.17263
lr,0.0
num_params,1206016.0



 >>> RUN 4/32: exp03_head4_emb128_bs8_it2000_do0.2
number of parameters: 1.19M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp03_head4_emb128_bs8_it2000_do0.2] iter 0/2000 | train_now 4.1932 | train_eval 4.1835 | val 4.1798 | gap 0.0036 | lr 5.94e-06 | params 1.21M | host_mem 16.2% | gpu_mem 36.5MB | 289.8 ms/it
[exp03_head4_emb128_bs8_it2000_do0.2] iter 10/2000 | train_now 3.8223 | train_eval 3.7833 | val 3.7920 | gap -0.0087 | lr 6.53e-05 | params 1.21M | host_mem 16.2% | gpu_mem 36.5MB | 427.4 ms/it
[exp03_head4_emb128_bs8_it2000_do0.2] iter 20/2000 | train_now 3.6810 | train_eval 3.6473 | val 3.6525 | gap -0.0053 | lr 1.25e-04 | params 1.21M | host_mem 16.2% | gpu_mem 36.5MB | 415.5 ms/it
[exp03_head4_emb128_bs8_it2000_do0.2] iter 30/2000 | train_now 3.4656 | train_eval 3.4169 | val 3.4296 | gap -0.0126 | lr 1.84e-04 | params 1.21M | host_mem 16.2% | gpu_mem 36.5MB | 427.5 ms/it
[exp03_head4_emb128_bs8_it2000_do0.2] iter 40/2000 | train_now 3.2548 | train_eval 3.2245 | val 3.2463 | gap -0.0218 | lr 2.44e-04 | params 1.21M | host_mem 16.2% | gpu_mem 36.5MB | 420.5 ms/it
[exp03_head4_emb128_bs8_it2000_d

0,1
gpu_mem_bytes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▁▁▁▁▁▁▁▅█████▆▆▆▆▆▆▆▇▇▇▃▃▃▃▃▃▅▅▅▅▅▃▅▅▅▅▃
iter,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇█████
loss/gap(train-val),▇▆▂█▄▅▄▃▅▃▄▆▇█▇▇█▆█▆▁▂▁▂▆▄▇▆▁▄▆▁▂▄▃▁▂▅▄▁
loss/train_eval,█▅▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▄▄▃▃▃▃▃▃▂▃▂▃▂▂▂▂▃▂▂▂▂▂▂▁▁▁▁▁▂▂▂▂▁▁▁▁▁▁▁
loss/val,█▇▇▆▇▅▅▅▅▅▅▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
lr,▇███████▇▇▇▇▇▇▆▅▅▅▅▅▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,36506112.0
host_mem_percent,16.4
iter,1999.0
loss/gap(train-val),-0.04572
loss/train_eval,2.19331
loss/train_iter,2.33519
loss/val,2.23903
lr,0.0
num_params,1206016.0



 >>> RUN 5/32: exp04_head4_emb128_bs16_it1000_do0.1
number of parameters: 1.19M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp04_head4_emb128_bs16_it1000_do0.1] iter 0/1000 | train_now 4.1883 | train_eval 4.1837 | val 4.1794 | gap 0.0043 | lr 5.94e-06 | params 1.21M | host_mem 16.4% | gpu_mem 36.7MB | 365.0 ms/it
[exp04_head4_emb128_bs16_it1000_do0.1] iter 10/1000 | train_now 3.8255 | train_eval 3.7672 | val 3.7700 | gap -0.0029 | lr 6.53e-05 | params 1.21M | host_mem 16.4% | gpu_mem 36.7MB | 429.3 ms/it
[exp04_head4_emb128_bs16_it1000_do0.1] iter 20/1000 | train_now 3.6144 | train_eval 3.5865 | val 3.5936 | gap -0.0071 | lr 1.25e-04 | params 1.21M | host_mem 16.5% | gpu_mem 36.7MB | 545.0 ms/it
[exp04_head4_emb128_bs16_it1000_do0.1] iter 30/1000 | train_now 3.3770 | train_eval 3.3592 | val 3.3824 | gap -0.0232 | lr 1.84e-04 | params 1.21M | host_mem 16.5% | gpu_mem 36.7MB | 587.2 ms/it
[exp04_head4_emb128_bs16_it1000_do0.1] iter 40/1000 | train_now 3.2124 | train_eval 3.1964 | val 3.2069 | gap -0.0105 | lr 2.44e-04 | params 1.21M | host_mem 16.5% | gpu_mem 36.7MB | 575.5 ms/it
[exp04_head4_emb128_bs16_it

0,1
gpu_mem_bytes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▁▃▃▃▃▃▃▃▁▁▁▃▃▃▃▆▆▆▆▆▆▆▆▆▆▁▃▃▆▆▆▆███▆▆▆▆▆
iter,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
loss/gap(train-val),▆▅▅▄▄▆▅▄▅▆▆▆▃▃▇▇█▅▄▃▃▅▂▄▃▃▄▅▃▄▂▁▂▃▄▄▃▃▁▂
loss/train_eval,█▇▆▅▅▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▅▅▃▃▃▂▃▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/val,█▅▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▂▂▅▆██████▇▇▇▇▇▇▆▆▆▆▅▅▅▅▅▄▃▃▃▂▂▂▁▁▁▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,36655616.0
host_mem_percent,16.6
iter,999.0
loss/gap(train-val),-0.0198
loss/train_eval,2.21266
loss/train_iter,2.28796
loss/val,2.23246
lr,0.0
num_params,1206016.0



 >>> RUN 6/32: exp05_head4_emb128_bs16_it1000_do0.2
number of parameters: 1.19M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp05_head4_emb128_bs16_it1000_do0.2] iter 0/1000 | train_now 4.1877 | train_eval 4.1840 | val 4.1798 | gap 0.0043 | lr 5.94e-06 | params 1.21M | host_mem 16.5% | gpu_mem 36.7MB | 371.3 ms/it
[exp05_head4_emb128_bs16_it1000_do0.2] iter 10/1000 | train_now 3.8489 | train_eval 3.7734 | val 3.7762 | gap -0.0029 | lr 6.53e-05 | params 1.21M | host_mem 16.5% | gpu_mem 36.7MB | 450.7 ms/it
[exp05_head4_emb128_bs16_it1000_do0.2] iter 20/1000 | train_now 3.6425 | train_eval 3.6161 | val 3.6275 | gap -0.0114 | lr 1.25e-04 | params 1.21M | host_mem 16.5% | gpu_mem 36.7MB | 461.5 ms/it
[exp05_head4_emb128_bs16_it1000_do0.2] iter 30/1000 | train_now 3.4226 | train_eval 3.3890 | val 3.4106 | gap -0.0216 | lr 1.84e-04 | params 1.21M | host_mem 16.5% | gpu_mem 36.7MB | 444.6 ms/it
[exp05_head4_emb128_bs16_it1000_do0.2] iter 40/1000 | train_now 3.2446 | train_eval 3.2144 | val 3.2251 | gap -0.0107 | lr 2.44e-04 | params 1.21M | host_mem 16.5% | gpu_mem 36.7MB | 448.8 ms/it
[exp05_head4_emb128_bs16_it

0,1
gpu_mem_bytes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▆▆▆▆▃▆▃▃▃▃▆▆█▁▁▁▁▃▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆
iter,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███
loss/gap(train-val),▅▂▃▅▃▆▃▄▅▅▂▆▃▆▃▃▅▅█▄▄▂▃▄▄▃▂▅▃▁▁▂▂▁▄▄▂▁▃▃
loss/train_eval,█▆▅▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▇▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▂
loss/val,█▆▅▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▄▆▇████████▇▇▇▇▇▆▆▆▅▅▅▄▄▄▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,36655616.0
host_mem_percent,16.5
iter,999.0
loss/gap(train-val),-0.0118
loss/train_eval,2.27117
loss/train_iter,2.34687
loss/val,2.28297
lr,0.0
num_params,1206016.0



 >>> RUN 7/32: exp06_head4_emb128_bs16_it2000_do0.1
number of parameters: 1.19M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp06_head4_emb128_bs16_it2000_do0.1] iter 0/2000 | train_now 4.1883 | train_eval 4.1837 | val 4.1794 | gap 0.0043 | lr 5.94e-06 | params 1.21M | host_mem 16.4% | gpu_mem 41.5MB | 360.8 ms/it
[exp06_head4_emb128_bs16_it2000_do0.1] iter 10/2000 | train_now 3.8255 | train_eval 3.7672 | val 3.7700 | gap -0.0029 | lr 6.53e-05 | params 1.21M | host_mem 16.4% | gpu_mem 36.7MB | 453.4 ms/it
[exp06_head4_emb128_bs16_it2000_do0.1] iter 20/2000 | train_now 3.6144 | train_eval 3.5865 | val 3.5936 | gap -0.0071 | lr 1.25e-04 | params 1.21M | host_mem 16.4% | gpu_mem 36.7MB | 436.4 ms/it
[exp06_head4_emb128_bs16_it2000_do0.1] iter 30/2000 | train_now 3.3770 | train_eval 3.3592 | val 3.3824 | gap -0.0232 | lr 1.84e-04 | params 1.21M | host_mem 16.4% | gpu_mem 36.7MB | 454.1 ms/it
[exp06_head4_emb128_bs16_it2000_do0.1] iter 40/2000 | train_now 3.2124 | train_eval 3.1964 | val 3.2069 | gap -0.0105 | lr 2.44e-04 | params 1.21M | host_mem 16.4% | gpu_mem 36.7MB | 439.2 ms/it
[exp06_head4_emb128_bs16_it

0,1
gpu_mem_bytes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▁▅▅▅▅▅▅▅▁▅███▅▅▅▁▅▅▅▅▅▅▁▅▅▅▁▁▅▅▅▅▅▁▁▅▅▅▅
iter,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇██
loss/gap(train-val),█▆▇█▆▇██▇▆▇▇█▆▅▅▄▅▄▅▅▅▄▄▄▄▃▃▃▂▄▂▂▃▁▂▃▂▂▃
loss/train_eval,█▄▄▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,██▆▅▆▄▅▄▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▁▂▂▂▂▂▁▂▁▂▂▂▂▁▂▁
loss/val,█▅▄▄▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▆███████▇▇▇▇▇▇▇▆▆▆▆▅▅▄▄▄▃▃▃▃▃▃▂▁▁▁▁▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,36655616.0
host_mem_percent,16.5
iter,1999.0
loss/gap(train-val),-0.07824
loss/train_eval,1.91639
loss/train_iter,2.00458
loss/val,1.99463
lr,0.0
num_params,1206016.0



 >>> RUN 8/32: exp07_head4_emb128_bs16_it2000_do0.2
number of parameters: 1.19M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp07_head4_emb128_bs16_it2000_do0.2] iter 0/2000 | train_now 4.1877 | train_eval 4.1840 | val 4.1798 | gap 0.0043 | lr 5.94e-06 | params 1.21M | host_mem 16.5% | gpu_mem 36.7MB | 375.7 ms/it
[exp07_head4_emb128_bs16_it2000_do0.2] iter 10/2000 | train_now 3.8489 | train_eval 3.7734 | val 3.7762 | gap -0.0029 | lr 6.53e-05 | params 1.21M | host_mem 16.5% | gpu_mem 36.7MB | 454.6 ms/it
[exp07_head4_emb128_bs16_it2000_do0.2] iter 20/2000 | train_now 3.6425 | train_eval 3.6161 | val 3.6275 | gap -0.0114 | lr 1.25e-04 | params 1.21M | host_mem 16.5% | gpu_mem 36.7MB | 440.6 ms/it
[exp07_head4_emb128_bs16_it2000_do0.2] iter 30/2000 | train_now 3.4226 | train_eval 3.3890 | val 3.4106 | gap -0.0216 | lr 1.84e-04 | params 1.21M | host_mem 16.5% | gpu_mem 36.7MB | 453.0 ms/it
[exp07_head4_emb128_bs16_it2000_do0.2] iter 40/2000 | train_now 3.2446 | train_eval 3.2144 | val 3.2251 | gap -0.0107 | lr 2.44e-04 | params 1.21M | host_mem 16.5% | gpu_mem 36.7MB | 441.9 ms/it
[exp07_head4_emb128_bs16_it

0,1
gpu_mem_bytes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▂▂▂▂▂▂▂▁▁▂▂▂▃▃▃▁▁▁▃▃▃▆▆▆▇▇▇▇▇▇████████▇▇
iter,▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇███
loss/gap(train-val),▇▇▇▇▇▇███▅▆▅▆▆▅▆▆█▅▆▅▅▆▄▄▅▄▅▄▄▃▂▂▄▁▃▁▃▂▂
loss/train_eval,█▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/val,█▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▁▃▆██████▇▇▇▇▇▇▆▆▅▅▅▅▅▅▄▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,36655616.0
host_mem_percent,16.9
iter,1999.0
loss/gap(train-val),-0.04698
loss/train_eval,2.03186
loss/train_iter,2.13976
loss/val,2.07884
lr,0.0
num_params,1206016.0



 >>> RUN 9/32: exp08_head4_emb256_bs8_it1000_do0.1
number of parameters: 4.74M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp08_head4_emb256_bs8_it1000_do0.1] iter 0/1000 | train_now 4.2288 | train_eval 4.1748 | val 4.1745 | gap 0.0002 | lr 5.94e-06 | params 4.77M | host_mem 16.9% | gpu_mem 79.3MB | 403.8 ms/it
[exp08_head4_emb256_bs8_it1000_do0.1] iter 10/1000 | train_now 3.4247 | train_eval 3.4099 | val 3.4484 | gap -0.0385 | lr 6.53e-05 | params 4.77M | host_mem 16.9% | gpu_mem 79.3MB | 468.9 ms/it
[exp08_head4_emb256_bs8_it1000_do0.1] iter 20/1000 | train_now 3.0994 | train_eval 3.0720 | val 3.0989 | gap -0.0269 | lr 1.25e-04 | params 4.77M | host_mem 17.0% | gpu_mem 79.3MB | 539.9 ms/it
[exp08_head4_emb256_bs8_it1000_do0.1] iter 30/1000 | train_now 2.9665 | train_eval 2.8885 | val 2.8806 | gap 0.0079 | lr 1.84e-04 | params 4.77M | host_mem 17.1% | gpu_mem 79.3MB | 603.7 ms/it
[exp08_head4_emb256_bs8_it1000_do0.1] iter 40/1000 | train_now 2.7838 | train_eval 2.7578 | val 2.7628 | gap -0.0050 | lr 2.44e-04 | params 4.77M | host_mem 17.1% | gpu_mem 79.3MB | 603.5 ms/it
[exp08_head4_emb256_bs8_it1000_do

0,1
gpu_mem_bytes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▁▆▆▁▃▃▃▆▃▃▁▁▁▃▃▁▁▃▃▃▃▁▁▁▁▃▃▆▆██▆▆▃▃▃▃▆▆▆
iter,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss/gap(train-val),▄▅█▆▆▅▅▇▆█▆█▇▇█▄▆▆▄▄▅▆▅▅▄▅▅▅▄▂▄▂▂▄▂▄▅▄▂▁
loss/train_eval,█▅▄▄▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▆▅▅▄▄▄▄▄▄▃▅▄▃▃▃▃▃▃▃▃▃▃▂▃▂▂▁▂▂▁▂▁▁▂▂▁▁▂▁
loss/val,█▆▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▂▂▅▇▇█████████▇▇▆▆▆▆▅▅▅▅▅▄▄▃▃▃▂▂▂▂▁▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,79289856.0
host_mem_percent,17.1
iter,999.0
loss/gap(train-val),-0.04852
loss/train_eval,2.15995
loss/train_iter,2.12468
loss/val,2.20847
lr,0.0
num_params,4771328.0



 >>> RUN 10/32: exp09_head4_emb256_bs8_it1000_do0.2
number of parameters: 4.74M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp09_head4_emb256_bs8_it1000_do0.2] iter 0/1000 | train_now 4.2217 | train_eval 4.1773 | val 4.1769 | gap 0.0003 | lr 5.94e-06 | params 4.77M | host_mem 17.0% | gpu_mem 79.3MB | 381.0 ms/it
[exp09_head4_emb256_bs8_it1000_do0.2] iter 10/1000 | train_now 3.4561 | train_eval 3.4293 | val 3.4690 | gap -0.0398 | lr 6.53e-05 | params 4.77M | host_mem 16.9% | gpu_mem 79.3MB | 486.7 ms/it
[exp09_head4_emb256_bs8_it1000_do0.2] iter 20/1000 | train_now 3.1877 | train_eval 3.1246 | val 3.1472 | gap -0.0227 | lr 1.25e-04 | params 4.77M | host_mem 17.0% | gpu_mem 79.3MB | 493.2 ms/it
[exp09_head4_emb256_bs8_it1000_do0.2] iter 30/1000 | train_now 3.0165 | train_eval 2.9262 | val 2.9239 | gap 0.0024 | lr 1.84e-04 | params 4.77M | host_mem 17.0% | gpu_mem 79.3MB | 525.8 ms/it
[exp09_head4_emb256_bs8_it1000_do0.2] iter 40/1000 | train_now 2.8284 | train_eval 2.7741 | val 2.7799 | gap -0.0058 | lr 2.44e-04 | params 4.77M | host_mem 17.0% | gpu_mem 79.3MB | 463.1 ms/it
[exp09_head4_emb256_bs8_it1000_do

0,1
gpu_mem_bytes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▃▃▃▃▃▃▃▆▆▆▆▆█▆▃▃▃▁▁▁▁▃▃▃▃▆▆▆▆▃▃▃▃▃▃█▆▆▃▃
iter,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███
loss/gap(train-val),▇▆▄▇▆▅▅▇▇▅▇█▇█▆▅▆▃▆▇▃▁▄▁▁▇▅▄█▄▄▄▅▆▆▂▃▅▁▃
loss/train_eval,█▅▅▅▄▄▄▄▄▄▄▄▄▄▄▃▃▃▃▃▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▄▂▃▃▃▂▂▂▂▂▂▂▂▂▁▁▂▂▂▂▂▁▁▂
loss/val,█▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▂▂▅▅▇█████████▇▇▇▇▆▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,79289856.0
host_mem_percent,17.0
iter,999.0
loss/gap(train-val),-0.03893
loss/train_eval,2.22194
loss/train_iter,2.20785
loss/val,2.26087
lr,0.0
num_params,4771328.0



 >>> RUN 11/32: exp10_head4_emb256_bs8_it2000_do0.1
number of parameters: 4.74M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp10_head4_emb256_bs8_it2000_do0.1] iter 0/2000 | train_now 4.2288 | train_eval 4.1748 | val 4.1745 | gap 0.0002 | lr 5.94e-06 | params 4.77M | host_mem 17.1% | gpu_mem 98.4MB | 375.7 ms/it
[exp10_head4_emb256_bs8_it2000_do0.1] iter 10/2000 | train_now 3.4247 | train_eval 3.4099 | val 3.4484 | gap -0.0385 | lr 6.53e-05 | params 4.77M | host_mem 17.0% | gpu_mem 98.4MB | 466.5 ms/it
[exp10_head4_emb256_bs8_it2000_do0.1] iter 20/2000 | train_now 3.0994 | train_eval 3.0720 | val 3.0989 | gap -0.0269 | lr 1.25e-04 | params 4.77M | host_mem 17.1% | gpu_mem 98.4MB | 617.7 ms/it
[exp10_head4_emb256_bs8_it2000_do0.1] iter 30/2000 | train_now 2.9665 | train_eval 2.8885 | val 2.8806 | gap 0.0079 | lr 1.84e-04 | params 4.77M | host_mem 17.2% | gpu_mem 98.4MB | 593.4 ms/it
[exp10_head4_emb256_bs8_it2000_do0.1] iter 40/2000 | train_now 2.7838 | train_eval 2.7578 | val 2.7628 | gap -0.0050 | lr 2.44e-04 | params 4.77M | host_mem 17.2% | gpu_mem 98.4MB | 600.4 ms/it
[exp10_head4_emb256_bs8_it2000_do

0,1
gpu_mem_bytes,█████████▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,█▃▃▃▃▃▃▆▃▆▃▃▃▁▆▃▃▃▆▁▃▆▆▆▆▆▆▆▆█▆▆▃▆▆▆▆▆▆▃
iter,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▇▇▇██
loss/gap(train-val),█▇▇▇▇▇▅█▇▇█▇▆▆▇▄▇▆▄▄▃▄▅▅▄▄▅▄▄▃▂▃▂▁▂▃▁▁▁▃
loss/train_eval,█▅▅▅▅▅▄▄▄▄▄▄▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
loss/train_iter,██▇▆▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▃▃▄▃▃▂▂▂▂▁▂▁▁▂▁▂▁▂▁▂▂
loss/val,█▅▄▄▄▄▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▄▇████████▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▃▃▃▃▂▂▂▂▁▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,79289856.0
host_mem_percent,17.0
iter,1999.0
loss/gap(train-val),-0.09994
loss/train_eval,1.807
loss/train_iter,1.89243
loss/val,1.90694
lr,0.0
num_params,4771328.0



 >>> RUN 12/32: exp11_head4_emb256_bs8_it2000_do0.2
number of parameters: 4.74M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp11_head4_emb256_bs8_it2000_do0.2] iter 0/2000 | train_now 4.2217 | train_eval 4.1773 | val 4.1769 | gap 0.0003 | lr 5.94e-06 | params 4.77M | host_mem 17.1% | gpu_mem 98.4MB | 457.1 ms/it
[exp11_head4_emb256_bs8_it2000_do0.2] iter 10/2000 | train_now 3.4561 | train_eval 3.4293 | val 3.4690 | gap -0.0398 | lr 6.53e-05 | params 4.77M | host_mem 17.1% | gpu_mem 79.3MB | 577.3 ms/it
[exp11_head4_emb256_bs8_it2000_do0.2] iter 20/2000 | train_now 3.1877 | train_eval 3.1246 | val 3.1472 | gap -0.0227 | lr 1.25e-04 | params 4.77M | host_mem 17.2% | gpu_mem 79.3MB | 622.6 ms/it
[exp11_head4_emb256_bs8_it2000_do0.2] iter 30/2000 | train_now 3.0165 | train_eval 2.9262 | val 2.9239 | gap 0.0024 | lr 1.84e-04 | params 4.77M | host_mem 17.2% | gpu_mem 79.3MB | 696.4 ms/it
[exp11_head4_emb256_bs8_it2000_do0.2] iter 40/2000 | train_now 2.8284 | train_eval 2.7741 | val 2.7799 | gap -0.0058 | lr 2.44e-04 | params 4.77M | host_mem 17.2% | gpu_mem 79.3MB | 492.8 ms/it
[exp11_head4_emb256_bs8_it2000_do

0,1
gpu_mem_bytes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▅▅▇▅▇▂▂▄▂▂▄▂▄▄▂▄▄▁▁▂▄▁▁▁▁▂▂▂▂▄▅▄▄▄▂█████
iter,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇██
loss/gap(train-val),▅▇▇▇▆▆█▅▇▇▇▆█▆▅▆▇▇▄▄▅▃▅▅▅▃▃▃▄▄▃▂▂▂▂▃▄▁▁▃
loss/train_eval,█▇▆▆▆▆▆▅▄▄▄▃▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
loss/train_iter,█▆▅▃▃▃▂▂▃▃▂▂▂▂▃▂▂▂▂▂▂▂▂▂▁▁▂▁▂▂▂▁▁▁▁▁▁▁▁▁
loss/val,█▃▂▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▁▂▇███████▇▇▇▇▆▅▅▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,79289856.0
host_mem_percent,17.3
iter,1999.0
loss/gap(train-val),-0.04781
loss/train_eval,1.99074
loss/train_iter,2.06514
loss/val,2.03855
lr,0.0
num_params,4771328.0



 >>> RUN 13/32: exp12_head4_emb256_bs16_it1000_do0.1
number of parameters: 4.74M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp12_head4_emb256_bs16_it1000_do0.1] iter 0/1000 | train_now 4.2344 | train_eval 4.1708 | val 4.1731 | gap -0.0023 | lr 5.94e-06 | params 4.77M | host_mem 17.5% | gpu_mem 79.4MB | 591.7 ms/it
[exp12_head4_emb256_bs16_it1000_do0.1] iter 10/1000 | train_now 3.3925 | train_eval 3.4032 | val 3.4245 | gap -0.0213 | lr 6.53e-05 | params 4.77M | host_mem 17.4% | gpu_mem 79.4MB | 817.1 ms/it
[exp12_head4_emb256_bs16_it1000_do0.1] iter 20/1000 | train_now 3.0836 | train_eval 3.0552 | val 3.0748 | gap -0.0196 | lr 1.25e-04 | params 4.77M | host_mem 17.4% | gpu_mem 79.4MB | 831.0 ms/it
[exp12_head4_emb256_bs16_it1000_do0.1] iter 30/1000 | train_now 2.8490 | train_eval 2.8407 | val 2.8625 | gap -0.0218 | lr 1.84e-04 | params 4.77M | host_mem 17.3% | gpu_mem 79.4MB | 827.7 ms/it
[exp12_head4_emb256_bs16_it1000_do0.1] iter 40/1000 | train_now 2.7754 | train_eval 2.7178 | val 2.7370 | gap -0.0192 | lr 2.44e-04 | params 4.77M | host_mem 17.3% | gpu_mem 79.4MB | 826.3 ms/it
[exp12_head4_emb256_bs16_i

0,1
gpu_mem_bytes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▆▆▅▇▆▆▆▆▆▇█████▆▆▆▆▆▅▅▅▅▅▆▅▅▅▅▅▅▅▅▅▄▄▄▁▂
iter,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇█████
loss/gap(train-val),▆██▇▇▇▆█▆▆▆▅▅▆▅▅▄▇▅▅▅▅▅▄▄▄▅▃▃▃▃▄▃▃▁▄▄▂▃▃
loss/train_eval,█▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▅▃▃▃▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/val,█▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▂▃▅▅██████▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,79439360.0
host_mem_percent,16.9
iter,999.0
loss/gap(train-val),-0.0659
loss/train_eval,1.95433
loss/train_iter,2.07381
loss/val,2.02023
lr,0.0
num_params,4771328.0



 >>> RUN 14/32: exp13_head4_emb256_bs16_it1000_do0.2
number of parameters: 4.74M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp13_head4_emb256_bs16_it1000_do0.2] iter 0/1000 | train_now 4.2263 | train_eval 4.1727 | val 4.1750 | gap -0.0023 | lr 5.94e-06 | params 4.77M | host_mem 17.0% | gpu_mem 98.5MB | 624.4 ms/it
[exp13_head4_emb256_bs16_it1000_do0.2] iter 10/1000 | train_now 3.4214 | train_eval 3.4252 | val 3.4484 | gap -0.0232 | lr 6.53e-05 | params 4.77M | host_mem 16.9% | gpu_mem 98.5MB | 810.4 ms/it
[exp13_head4_emb256_bs16_it1000_do0.2] iter 20/1000 | train_now 3.1462 | train_eval 3.1016 | val 3.1221 | gap -0.0206 | lr 1.25e-04 | params 4.77M | host_mem 16.8% | gpu_mem 98.5MB | 821.0 ms/it
[exp13_head4_emb256_bs16_it1000_do0.2] iter 30/1000 | train_now 2.8922 | train_eval 2.8649 | val 2.8941 | gap -0.0292 | lr 1.84e-04 | params 4.77M | host_mem 16.8% | gpu_mem 98.5MB | 822.8 ms/it
[exp13_head4_emb256_bs16_it1000_do0.2] iter 40/1000 | train_now 2.8020 | train_eval 2.7358 | val 2.7507 | gap -0.0149 | lr 2.44e-04 | params 4.77M | host_mem 16.9% | gpu_mem 98.5MB | 984.8 ms/it
[exp13_head4_emb256_bs16_i

0,1
gpu_mem_bytes,███████▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▁▃▄▄▄▇▇▇▅▅▅▅▅█▇▇▇▇▇▇▇▇▇▇▇▇██▇▇▇▇▇██▆▆▇▇█
iter,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███
loss/gap(train-val),▆▅▇▇▆▆▇▇▆▅▇▆▅▄▅█▄▅▆▃▆▄▄▄▄▅▅▄▅▄▆▄▁▂▄▃▄▂▁▄
loss/train_eval,█▅▄▄▄▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▆▅▅▄▄▄▄▄▃▃▄▄▃▃▃▃▃▂▃▃▃▂▂▂▂▂▂▂▁▁▂▂▁▁▁▁▁▁▁
loss/val,█▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▁▂▅▇██████▇▇▇▇▇▆▆▆▆▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,79439360.0
host_mem_percent,17.7
iter,999.0
loss/gap(train-val),-0.0397
loss/train_eval,2.06252
loss/train_iter,2.18668
loss/val,2.10222
lr,0.0
num_params,4771328.0



 >>> RUN 15/32: exp14_head4_emb256_bs16_it2000_do0.1
number of parameters: 4.74M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp14_head4_emb256_bs16_it2000_do0.1] iter 0/2000 | train_now 4.2344 | train_eval 4.1708 | val 4.1731 | gap -0.0023 | lr 5.94e-06 | params 4.77M | host_mem 17.5% | gpu_mem 98.5MB | 605.8 ms/it
[exp14_head4_emb256_bs16_it2000_do0.1] iter 10/2000 | train_now 3.3925 | train_eval 3.4032 | val 3.4245 | gap -0.0213 | lr 6.53e-05 | params 4.77M | host_mem 17.5% | gpu_mem 98.5MB | 807.8 ms/it
[exp14_head4_emb256_bs16_it2000_do0.1] iter 20/2000 | train_now 3.0836 | train_eval 3.0552 | val 3.0748 | gap -0.0196 | lr 1.25e-04 | params 4.77M | host_mem 17.5% | gpu_mem 98.5MB | 820.4 ms/it
[exp14_head4_emb256_bs16_it2000_do0.1] iter 30/2000 | train_now 2.8490 | train_eval 2.8407 | val 2.8625 | gap -0.0218 | lr 1.84e-04 | params 4.77M | host_mem 17.5% | gpu_mem 98.5MB | 827.8 ms/it
[exp14_head4_emb256_bs16_it2000_do0.1] iter 40/2000 | train_now 2.7754 | train_eval 2.7178 | val 2.7370 | gap -0.0192 | lr 2.44e-04 | params 4.77M | host_mem 17.5% | gpu_mem 98.5MB | 826.1 ms/it
[exp14_head4_emb256_bs16_i

0,1
gpu_mem_bytes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▅▅▅▆▆▅▆▆▆█▆█▆██▆▆▃▃▅▃▆▆▅▅▅▅▅▃▅▁▁▃▃▅▅▃▅▅▆
iter,▁▁▁▁▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇███
loss/gap(train-val),▇▇▇▇█▇▇▆▆▇▆▄▅▄▃▃▄▃▄▂▃▃▃▃▃▃▃▂▃▂▂▂▂▃▂▂▃▁▂▁
loss/train_eval,█▇▆▆▆▅▅▄▄▄▄▄▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▇▆▆▅▅▅▅▅▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂
loss/val,█▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▁▅█████▇▇▇▇▇▇▇▆▆▆▅▅▅▅▅▅▄▄▃▃▃▃▃▂▂▂▁▁▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,98524672.0
host_mem_percent,17.6
iter,1999.0
loss/gap(train-val),-0.17806
loss/train_eval,1.60281
loss/train_iter,1.67769
loss/val,1.78087
lr,0.0
num_params,4771328.0



 >>> RUN 16/32: exp15_head4_emb256_bs16_it2000_do0.2
number of parameters: 4.74M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp15_head4_emb256_bs16_it2000_do0.2] iter 0/2000 | train_now 4.2263 | train_eval 4.1727 | val 4.1750 | gap -0.0023 | lr 5.94e-06 | params 4.77M | host_mem 17.7% | gpu_mem 117.6MB | 612.1 ms/it
[exp15_head4_emb256_bs16_it2000_do0.2] iter 10/2000 | train_now 3.4214 | train_eval 3.4252 | val 3.4484 | gap -0.0232 | lr 6.53e-05 | params 4.77M | host_mem 17.6% | gpu_mem 117.6MB | 832.5 ms/it
[exp15_head4_emb256_bs16_it2000_do0.2] iter 20/2000 | train_now 3.1462 | train_eval 3.1016 | val 3.1221 | gap -0.0206 | lr 1.25e-04 | params 4.77M | host_mem 17.7% | gpu_mem 117.6MB | 862.0 ms/it
[exp15_head4_emb256_bs16_it2000_do0.2] iter 30/2000 | train_now 2.8922 | train_eval 2.8649 | val 2.8941 | gap -0.0292 | lr 1.84e-04 | params 4.77M | host_mem 17.5% | gpu_mem 117.6MB | 842.1 ms/it
[exp15_head4_emb256_bs16_it2000_do0.2] iter 40/2000 | train_now 2.8020 | train_eval 2.7358 | val 2.7507 | gap -0.0149 | lr 2.44e-04 | params 4.77M | host_mem 17.5% | gpu_mem 117.6MB | 825.5 ms/it
[exp15_head4_emb256_b

0,1
gpu_mem_bytes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,█▆█▆▆▆▆█▃▃▆▆▆▆▆██▆▆▆▃▃▆█▆▆▃▁▁▃▁▃▃▃▃▃▆▆▃▃
iter,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇██
loss/gap(train-val),▇██▇▇▇▇▇▇▇▆▅▆▆▆▆▆▆▆▆▆▅▃▄▃▃▃▃▃▂▂▃▂▃▃▃▃▁▂▁
loss/train_eval,█▆▅▅▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▅▅▅▄▄▄▄▄▃▃▃▃▃▃▃▃▃▂▂▃▂▂▂▃▂▂▂▂▂▂▂▂▂▁▁▂▁▂▁
loss/val,█▅▄▄▄▄▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,█████████▇▇▇▇▇▆▆▅▅▅▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,117609984.0
host_mem_percent,17.5
iter,1999.0
loss/gap(train-val),-0.15662
loss/train_eval,1.75487
loss/train_iter,1.86275
loss/val,1.91149
lr,0.0
num_params,4771328.0



 >>> RUN 17/32: exp16_head8_emb128_bs8_it1000_do0.1
number of parameters: 1.19M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp16_head8_emb128_bs8_it1000_do0.1] iter 0/1000 | train_now 4.1936 | train_eval 4.1828 | val 4.1793 | gap 0.0035 | lr 5.94e-06 | params 1.21M | host_mem 17.5% | gpu_mem 74.7MB | 299.7 ms/it
[exp16_head8_emb128_bs8_it1000_do0.1] iter 10/1000 | train_now 3.8003 | train_eval 3.7766 | val 3.7851 | gap -0.0085 | lr 6.53e-05 | params 1.21M | host_mem 17.5% | gpu_mem 74.7MB | 433.4 ms/it
[exp16_head8_emb128_bs8_it1000_do0.1] iter 20/1000 | train_now 3.6549 | train_eval 3.6227 | val 3.6245 | gap -0.0018 | lr 1.25e-04 | params 1.21M | host_mem 17.5% | gpu_mem 74.7MB | 422.1 ms/it
[exp16_head8_emb128_bs8_it1000_do0.1] iter 30/1000 | train_now 3.4193 | train_eval 3.3861 | val 3.4001 | gap -0.0140 | lr 1.84e-04 | params 1.21M | host_mem 17.5% | gpu_mem 74.7MB | 443.8 ms/it
[exp16_head8_emb128_bs8_it1000_do0.1] iter 40/1000 | train_now 3.2203 | train_eval 3.2082 | val 3.2297 | gap -0.0214 | lr 2.44e-04 | params 1.21M | host_mem 17.5% | gpu_mem 74.7MB | 420.5 ms/it
[exp16_head8_emb128_bs8_it1000_d

0,1
gpu_mem_bytes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▃▃▃▃▃▃▃▆▆▆▆▆▆▆▆█▁▁▁▃▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆
iter,▁▁▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇████
loss/gap(train-val),▆▄▃▂▁▆▂▇▃▅▃▁█▅▃▃▃▅▃▅▃▅▁▅▅▄▆▇▇▆▆▇▂▆▅▃▃▅▇▃
loss/train_eval,█▆▆▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▇▄▄▄▃▃▄▃▄▂▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▃▂▂▁▂▂▂▁▂▁▂▂▁
loss/val,█▆▄▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▄▅▇███████▇▇▇▆▆▆▆▆▆▅▅▅▄▄▄▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,74676736.0
host_mem_percent,17.6
iter,999.0
loss/gap(train-val),-0.0181
loss/train_eval,2.30572
loss/train_iter,2.31305
loss/val,2.32381
lr,0.0
num_params,1206016.0



 >>> RUN 18/32: exp17_head8_emb128_bs8_it1000_do0.2
number of parameters: 1.19M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp17_head8_emb128_bs8_it1000_do0.2] iter 0/1000 | train_now 4.1934 | train_eval 4.1834 | val 4.1799 | gap 0.0035 | lr 5.94e-06 | params 1.21M | host_mem 17.5% | gpu_mem 79.5MB | 316.4 ms/it
[exp17_head8_emb128_bs8_it1000_do0.2] iter 10/1000 | train_now 3.8258 | train_eval 3.7833 | val 3.7917 | gap -0.0084 | lr 6.53e-05 | params 1.21M | host_mem 17.4% | gpu_mem 79.5MB | 425.6 ms/it
[exp17_head8_emb128_bs8_it1000_do0.2] iter 20/1000 | train_now 3.6818 | train_eval 3.6473 | val 3.6526 | gap -0.0054 | lr 1.25e-04 | params 1.21M | host_mem 17.4% | gpu_mem 79.5MB | 436.7 ms/it
[exp17_head8_emb128_bs8_it1000_do0.2] iter 30/1000 | train_now 3.4652 | train_eval 3.4182 | val 3.4305 | gap -0.0123 | lr 1.84e-04 | params 1.21M | host_mem 17.4% | gpu_mem 79.5MB | 423.9 ms/it
[exp17_head8_emb128_bs8_it1000_do0.2] iter 40/1000 | train_now 3.2601 | train_eval 3.2277 | val 3.2493 | gap -0.0216 | lr 2.44e-04 | params 1.21M | host_mem 17.4% | gpu_mem 79.5MB | 431.1 ms/it
[exp17_head8_emb128_bs8_it1000_d

0,1
gpu_mem_bytes,███████████████▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▅▁▁▁▅█████████▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅█████
iter,▁▁▁▁▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇██
loss/gap(train-val),▄▄▃▂▅▆▅▅▃▆▄▂▅▆▅▄▄▅▄▁▃▄▇▇▅▄█▆▆█▃▇▆▆▇▃▆▅▂▃
loss/train_eval,█▅▄▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▆▅▅▄▂▂▂▂▂▂▂▂▂▁▁▁▁▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▁
loss/val,█▇▅▄▄▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▂▄▅▇██████▇▇▇▇▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▂▂▁▁▁▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,74676736.0
host_mem_percent,17.6
iter,999.0
loss/gap(train-val),-0.01382
loss/train_eval,2.35831
loss/train_iter,2.37231
loss/val,2.37213
lr,0.0
num_params,1206016.0



 >>> RUN 19/32: exp18_head8_emb128_bs8_it2000_do0.1
number of parameters: 1.19M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp18_head8_emb128_bs8_it2000_do0.1] iter 0/2000 | train_now 4.1936 | train_eval 4.1828 | val 4.1793 | gap 0.0035 | lr 5.94e-06 | params 1.21M | host_mem 17.5% | gpu_mem 79.5MB | 290.9 ms/it
[exp18_head8_emb128_bs8_it2000_do0.1] iter 10/2000 | train_now 3.8003 | train_eval 3.7766 | val 3.7851 | gap -0.0085 | lr 6.53e-05 | params 1.21M | host_mem 17.5% | gpu_mem 79.5MB | 419.7 ms/it
[exp18_head8_emb128_bs8_it2000_do0.1] iter 20/2000 | train_now 3.6549 | train_eval 3.6227 | val 3.6245 | gap -0.0018 | lr 1.25e-04 | params 1.21M | host_mem 17.5% | gpu_mem 79.5MB | 448.1 ms/it
[exp18_head8_emb128_bs8_it2000_do0.1] iter 30/2000 | train_now 3.4193 | train_eval 3.3861 | val 3.4001 | gap -0.0140 | lr 1.84e-04 | params 1.21M | host_mem 17.5% | gpu_mem 79.5MB | 417.9 ms/it
[exp18_head8_emb128_bs8_it2000_do0.1] iter 40/2000 | train_now 3.2203 | train_eval 3.2082 | val 3.2297 | gap -0.0214 | lr 2.44e-04 | params 1.21M | host_mem 17.5% | gpu_mem 79.5MB | 442.0 ms/it
[exp18_head8_emb128_bs8_it2000_d

0,1
gpu_mem_bytes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▅▅▅▅████████████████████▁▁▁▅████▅███████
iter,▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇▇████
loss/gap(train-val),▄▃▄█▅▃▆▅▅▄▅▃▆▅▆▅▄▅▅▆▅▂▅▄▅▄▄▄▁▃▁▃▂▂▄▂▃▅▃▃
loss/train_eval,█▅▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▇█▇▇▅▅▆▅▅▅▅▆▅▄▄▃▄▄▄▃▄▃▂▄▃▂▂▂▃▂▂▂▂▂▂▂▁▁▂
loss/val,█▅▃▃▃▃▃▃▂▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▂██████████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▄▄▃▃▃▂▂▁▁▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,79500800.0
host_mem_percent,17.6
iter,1999.0
loss/gap(train-val),-0.06156
loss/train_eval,2.11425
loss/train_iter,2.23884
loss/val,2.17581
lr,0.0
num_params,1206016.0



 >>> RUN 20/32: exp19_head8_emb128_bs8_it2000_do0.2
number of parameters: 1.19M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp19_head8_emb128_bs8_it2000_do0.2] iter 0/2000 | train_now 4.1934 | train_eval 4.1834 | val 4.1799 | gap 0.0035 | lr 5.94e-06 | params 1.21M | host_mem 17.6% | gpu_mem 84.3MB | 306.7 ms/it
[exp19_head8_emb128_bs8_it2000_do0.2] iter 10/2000 | train_now 3.8258 | train_eval 3.7833 | val 3.7917 | gap -0.0084 | lr 6.53e-05 | params 1.21M | host_mem 17.6% | gpu_mem 84.3MB | 423.5 ms/it
[exp19_head8_emb128_bs8_it2000_do0.2] iter 20/2000 | train_now 3.6818 | train_eval 3.6473 | val 3.6526 | gap -0.0054 | lr 1.25e-04 | params 1.21M | host_mem 17.6% | gpu_mem 84.3MB | 446.8 ms/it
[exp19_head8_emb128_bs8_it2000_do0.2] iter 30/2000 | train_now 3.4652 | train_eval 3.4182 | val 3.4305 | gap -0.0123 | lr 1.84e-04 | params 1.21M | host_mem 17.6% | gpu_mem 84.3MB | 427.5 ms/it
[exp19_head8_emb128_bs8_it2000_do0.2] iter 40/2000 | train_now 3.2601 | train_eval 3.2277 | val 3.2493 | gap -0.0216 | lr 2.44e-04 | params 1.21M | host_mem 17.6% | gpu_mem 84.3MB | 440.0 ms/it
[exp19_head8_emb128_bs8_it2000_d

0,1
gpu_mem_bytes,█████████▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▆▁▁▃▆▆▆▆▆▆███████▆▆▆██▃▃▃▃▆▆▆▆▆▆▆▆▆█████
iter,▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
loss/gap(train-val),▇▆▅▃▄▄▇▇▆▆▆█▆▆▇█▆█▇██▅▃▆▇▄▅▄▃▇▇▂▆▂▅▁▆▂▁▆
loss/train_eval,█▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▄▄▄▄▄▃▃▃▃▂▃▃▂▃▂▁▃▂▂▂▂▂▂▂▁▂▂▂▂▂▂▂▂▁▂▁▂▁▁
loss/val,█▇▅▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▁▃▅███████▇▇▇▇▇▇▆▆▆▆▅▅▅▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,79500800.0
host_mem_percent,17.7
iter,1999.0
loss/gap(train-val),-0.04858
loss/train_eval,2.18117
loss/train_iter,2.31787
loss/val,2.22974
lr,0.0
num_params,1206016.0



 >>> RUN 21/32: exp20_head8_emb128_bs16_it1000_do0.1
number of parameters: 1.19M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp20_head8_emb128_bs16_it1000_do0.1] iter 0/1000 | train_now 4.1939 | train_eval 4.1835 | val 4.1792 | gap 0.0043 | lr 5.94e-06 | params 1.21M | host_mem 17.7% | gpu_mem 84.5MB | 410.3 ms/it
[exp20_head8_emb128_bs16_it1000_do0.1] iter 10/1000 | train_now 3.8264 | train_eval 3.7673 | val 3.7702 | gap -0.0029 | lr 6.53e-05 | params 1.21M | host_mem 17.6% | gpu_mem 84.5MB | 498.1 ms/it
[exp20_head8_emb128_bs16_it1000_do0.1] iter 20/1000 | train_now 3.6148 | train_eval 3.5864 | val 3.5934 | gap -0.0070 | lr 1.25e-04 | params 1.21M | host_mem 17.6% | gpu_mem 84.5MB | 502.9 ms/it
[exp20_head8_emb128_bs16_it1000_do0.1] iter 30/1000 | train_now 3.3791 | train_eval 3.3595 | val 3.3827 | gap -0.0232 | lr 1.84e-04 | params 1.21M | host_mem 17.5% | gpu_mem 84.5MB | 563.4 ms/it
[exp20_head8_emb128_bs16_it1000_do0.1] iter 40/1000 | train_now 3.2140 | train_eval 3.1954 | val 3.2062 | gap -0.0107 | lr 2.44e-04 | params 1.21M | host_mem 17.5% | gpu_mem 84.5MB | 594.2 ms/it
[exp20_head8_emb128_bs16_it

0,1
gpu_mem_bytes,█████████▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▆▃▃▃▆███▆▁▃▃▃▃▃▃▃▆▆▁▃▃▃▃▃▃▃▃▃▃▆▆▃▃▃▃▃▃▃▃
iter,▁▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇███
loss/gap(train-val),▃▄▄▄▄▄▅▆▆▅█▇▄▄▆▄▃▅▁▄▂▃▃▅▂▅▃▄▁▂▂▃▁▄▂▃▃▄▃▃
loss/train_eval,█▇▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▆▅▅▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/val,█▅▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▂▅▆███████▇▇▆▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,79650304.0
host_mem_percent,17.5
iter,999.0
loss/gap(train-val),-0.01869
loss/train_eval,2.20458
loss/train_iter,2.26992
loss/val,2.22328
lr,0.0
num_params,1206016.0



 >>> RUN 22/32: exp21_head8_emb128_bs16_it1000_do0.2
number of parameters: 1.19M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp21_head8_emb128_bs16_it1000_do0.2] iter 0/1000 | train_now 4.1957 | train_eval 4.1838 | val 4.1796 | gap 0.0042 | lr 5.94e-06 | params 1.21M | host_mem 17.4% | gpu_mem 84.5MB | 419.8 ms/it
[exp21_head8_emb128_bs16_it1000_do0.2] iter 10/1000 | train_now 3.8493 | train_eval 3.7734 | val 3.7763 | gap -0.0029 | lr 6.53e-05 | params 1.21M | host_mem 17.4% | gpu_mem 84.5MB | 501.7 ms/it
[exp21_head8_emb128_bs16_it1000_do0.2] iter 20/1000 | train_now 3.6426 | train_eval 3.6159 | val 3.6270 | gap -0.0111 | lr 1.25e-04 | params 1.21M | host_mem 17.4% | gpu_mem 84.5MB | 490.4 ms/it
[exp21_head8_emb128_bs16_it1000_do0.2] iter 30/1000 | train_now 3.4244 | train_eval 3.3878 | val 3.4097 | gap -0.0219 | lr 1.84e-04 | params 1.21M | host_mem 17.4% | gpu_mem 84.5MB | 503.3 ms/it
[exp21_head8_emb128_bs16_it1000_do0.2] iter 40/1000 | train_now 3.2440 | train_eval 3.2130 | val 3.2236 | gap -0.0106 | lr 2.44e-04 | params 1.21M | host_mem 17.4% | gpu_mem 84.5MB | 491.8 ms/it
[exp21_head8_emb128_bs16_it

0,1
gpu_mem_bytes,█████████████████████████████▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▃▃▃▃▃▃▃▃█████████▃▃▃▆▆▆▁▃███████████████
iter,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
loss/gap(train-val),▆▄▄▄▄▅▅▆▆█▄█▅▇█▄█▆▇█▄▂▃▆▁▂▂▃▃▄▄▂▃▂▂▄▁▅▄▄
loss/train_eval,█▄▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▆▅▅▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/val,█▇▅▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▂▄▆▇█████▇▇▇▇▆▆▆▆▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▁▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,79650304.0
host_mem_percent,17.6
iter,999.0
loss/gap(train-val),-0.01306
loss/train_eval,2.26439
loss/train_iter,2.35945
loss/val,2.27745
lr,0.0
num_params,1206016.0



 >>> RUN 23/32: exp22_head8_emb128_bs16_it2000_do0.1
number of parameters: 1.19M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp22_head8_emb128_bs16_it2000_do0.1] iter 0/2000 | train_now 4.1939 | train_eval 4.1835 | val 4.1792 | gap 0.0043 | lr 5.94e-06 | params 1.21M | host_mem 17.4% | gpu_mem 79.7MB | 460.1 ms/it
[exp22_head8_emb128_bs16_it2000_do0.1] iter 10/2000 | train_now 3.8264 | train_eval 3.7673 | val 3.7702 | gap -0.0029 | lr 6.53e-05 | params 1.21M | host_mem 17.5% | gpu_mem 79.7MB | 681.0 ms/it
[exp22_head8_emb128_bs16_it2000_do0.1] iter 20/2000 | train_now 3.6148 | train_eval 3.5864 | val 3.5934 | gap -0.0070 | lr 1.25e-04 | params 1.21M | host_mem 17.5% | gpu_mem 79.7MB | 506.9 ms/it
[exp22_head8_emb128_bs16_it2000_do0.1] iter 30/2000 | train_now 3.3791 | train_eval 3.3595 | val 3.3827 | gap -0.0232 | lr 1.84e-04 | params 1.21M | host_mem 17.5% | gpu_mem 79.7MB | 503.1 ms/it
[exp22_head8_emb128_bs16_it2000_do0.1] iter 40/2000 | train_now 3.2140 | train_eval 3.1954 | val 3.2062 | gap -0.0107 | lr 2.44e-04 | params 1.21M | host_mem 17.5% | gpu_mem 79.7MB | 494.4 ms/it
[exp22_head8_emb128_bs16_it

0,1
gpu_mem_bytes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▆▆▆███▃▃▃▃▃▃▃▃▆▆▆▃▃▃▃▃▃▃▃▃▆▆▆▆▆▁▁▁▃▆▆▆▆▃
iter,▁▁▁▁▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇███
loss/gap(train-val),▆▇▆▇███▇▆▅▆▆▆▆▅▅▅▅▄▄▅▄▅▅▄▃▄▃▃▂▃▂▃▁▂▂▂▄▁▄
loss/train_eval,█▆▃▃▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▆▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/val,█▅▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▄▅█████▇▇▇▆▆▆▆▆▅▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,79650304.0
host_mem_percent,17.4
iter,1999.0
loss/gap(train-val),-0.08447
loss/train_eval,1.90275
loss/train_iter,1.96533
loss/val,1.98721
lr,0.0
num_params,1206016.0



 >>> RUN 24/32: exp23_head8_emb128_bs16_it2000_do0.2
number of parameters: 1.19M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp23_head8_emb128_bs16_it2000_do0.2] iter 0/2000 | train_now 4.1957 | train_eval 4.1838 | val 4.1796 | gap 0.0042 | lr 5.94e-06 | params 1.21M | host_mem 17.4% | gpu_mem 84.5MB | 414.3 ms/it
[exp23_head8_emb128_bs16_it2000_do0.2] iter 10/2000 | train_now 3.8493 | train_eval 3.7734 | val 3.7763 | gap -0.0029 | lr 6.53e-05 | params 1.21M | host_mem 17.4% | gpu_mem 84.5MB | 493.8 ms/it
[exp23_head8_emb128_bs16_it2000_do0.2] iter 20/2000 | train_now 3.6426 | train_eval 3.6159 | val 3.6270 | gap -0.0111 | lr 1.25e-04 | params 1.21M | host_mem 17.4% | gpu_mem 84.5MB | 503.7 ms/it
[exp23_head8_emb128_bs16_it2000_do0.2] iter 30/2000 | train_now 3.4244 | train_eval 3.3878 | val 3.4097 | gap -0.0219 | lr 1.84e-04 | params 1.21M | host_mem 17.5% | gpu_mem 84.5MB | 504.8 ms/it
[exp23_head8_emb128_bs16_it2000_do0.2] iter 40/2000 | train_now 3.2440 | train_eval 3.2130 | val 3.2236 | gap -0.0106 | lr 2.44e-04 | params 1.21M | host_mem 17.5% | gpu_mem 84.5MB | 501.6 ms/it
[exp23_head8_emb128_bs16_it

0,1
gpu_mem_bytes,████████████████▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▄▆▆▆▄▄▆▆▆▆█▄▄▄▆▆▆▆▆▆████▆▆▆▆▆▆▆▄▄▄▃▆▆▆▆▁
iter,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███
loss/gap(train-val),▅▆▆█▇▇█▆█▆▅▆▆▇▆▄▅▄▅▅▅▅▅▄▂▃▄▂▃▄▃▃▂▃▃▂▃▁▃▄
loss/train_eval,█▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▄▃▃▃▃▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/val,█▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▁▆████████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,79650304.0
host_mem_percent,17.2
iter,1999.0
loss/gap(train-val),-0.04827
loss/train_eval,2.01852
loss/train_iter,2.12047
loss/val,2.06679
lr,0.0
num_params,1206016.0



 >>> RUN 25/32: exp24_head8_emb256_bs8_it1000_do0.1
number of parameters: 4.74M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp24_head8_emb256_bs8_it1000_do0.1] iter 0/1000 | train_now 4.2279 | train_eval 4.1742 | val 4.1739 | gap 0.0003 | lr 5.94e-06 | params 4.77M | host_mem 17.5% | gpu_mem 127.1MB | 410.4 ms/it
[exp24_head8_emb256_bs8_it1000_do0.1] iter 10/1000 | train_now 3.4268 | train_eval 3.4084 | val 3.4469 | gap -0.0385 | lr 6.53e-05 | params 4.77M | host_mem 17.5% | gpu_mem 127.1MB | 494.8 ms/it
[exp24_head8_emb256_bs8_it1000_do0.1] iter 20/1000 | train_now 3.1023 | train_eval 3.0728 | val 3.1001 | gap -0.0273 | lr 1.25e-04 | params 4.77M | host_mem 17.5% | gpu_mem 127.1MB | 510.2 ms/it
[exp24_head8_emb256_bs8_it1000_do0.1] iter 30/1000 | train_now 2.9585 | train_eval 2.8876 | val 2.8807 | gap 0.0068 | lr 1.84e-04 | params 4.77M | host_mem 17.5% | gpu_mem 127.1MB | 503.6 ms/it
[exp24_head8_emb256_bs8_it1000_do0.1] iter 40/1000 | train_now 2.7835 | train_eval 2.7586 | val 2.7634 | gap -0.0048 | lr 2.44e-04 | params 4.77M | host_mem 17.5% | gpu_mem 127.1MB | 506.9 ms/it
[exp24_head8_emb256_bs8_it10

0,1
gpu_mem_bytes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▁▁▁▁▁▁▁▁▁▆▃▃▁▁▁▃▃▁▃▆▆▁▁▁▆▆▆▃▃▃▃▃▃▃▃▆███▆
iter,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇██
loss/gap(train-val),▃▇▆▅▅▆▆▇▅▇▇█▆▄▂▅▄▄▄▆▂▂▄▆▅▅▄▄▁▅▃▆▂▂▄▅▃▄▄▃
loss/train_eval,█▆▄▃▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▅▄▄▄▃▃▃▃▄▃▃▃▃▂▃▃▂▃▂▂▂▂▂▁▁▂▂▂▃▂▂▁▂▂▁▁▁▁▁
loss/val,█▅▄▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▁▂▄▄▅▇█████████▇▇▆▆▆▅▅▅▅▄▄▄▃▃▃▂▂▂▂▂▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,127108608.0
host_mem_percent,17.7
iter,999.0
loss/gap(train-val),-0.04766
loss/train_eval,2.16185
loss/train_iter,2.14173
loss/val,2.20951
lr,0.0
num_params,4771328.0



 >>> RUN 26/32: exp25_head8_emb256_bs8_it1000_do0.2
number of parameters: 4.74M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp25_head8_emb256_bs8_it1000_do0.2] iter 0/1000 | train_now 4.2206 | train_eval 4.1765 | val 4.1763 | gap 0.0002 | lr 5.94e-06 | params 4.77M | host_mem 17.5% | gpu_mem 146.2MB | 467.2 ms/it
[exp25_head8_emb256_bs8_it1000_do0.2] iter 10/1000 | train_now 3.4562 | train_eval 3.4284 | val 3.4680 | gap -0.0397 | lr 6.53e-05 | params 4.77M | host_mem 17.5% | gpu_mem 146.2MB | 599.2 ms/it
[exp25_head8_emb256_bs8_it1000_do0.2] iter 20/1000 | train_now 3.1808 | train_eval 3.1209 | val 3.1450 | gap -0.0240 | lr 1.25e-04 | params 4.77M | host_mem 17.6% | gpu_mem 146.2MB | 656.0 ms/it
[exp25_head8_emb256_bs8_it1000_do0.2] iter 30/1000 | train_now 2.9992 | train_eval 2.9149 | val 2.9110 | gap 0.0038 | lr 1.84e-04 | params 4.77M | host_mem 17.6% | gpu_mem 146.2MB | 499.3 ms/it
[exp25_head8_emb256_bs8_it1000_do0.2] iter 40/1000 | train_now 2.8260 | train_eval 2.7712 | val 2.7738 | gap -0.0026 | lr 2.44e-04 | params 4.77M | host_mem 17.6% | gpu_mem 146.2MB | 506.8 ms/it
[exp25_head8_emb256_bs8_it10

0,1
gpu_mem_bytes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,█▅▅▁▁▁▅▅▅▅▅▅▅▁▁▁▁█▅▅▅▅▅▅▅▁▁▅▅██▅▅▅▅▅▅▅██
iter,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇▇█
loss/gap(train-val),▃▇▅▇▇▄▅▅▆█▇█▆▇▃▅▅▅▆▄▄▆▅▆▆▅▄▂▂▄▄▄▄▁▇▃▂▅▅▃
loss/train_eval,█▅▄▄▃▃▃▃▃▃▃▃▂▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▅▄▄▄▃▃▃▃▃▄▃▂▂▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▃▂▂▂▁▁▁▁▁
loss/val,█▄▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▄▅▇███████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▃▃▃▂▂▂▂▂▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,146193920.0
host_mem_percent,17.6
iter,999.0
loss/gap(train-val),-0.03874
loss/train_eval,2.21671
loss/train_iter,2.20774
loss/val,2.25545
lr,0.0
num_params,4771328.0



 >>> RUN 27/32: exp26_head8_emb256_bs8_it2000_do0.1
number of parameters: 4.74M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp26_head8_emb256_bs8_it2000_do0.1] iter 0/2000 | train_now 4.2279 | train_eval 4.1742 | val 4.1739 | gap 0.0003 | lr 5.94e-06 | params 4.77M | host_mem 17.7% | gpu_mem 146.2MB | 434.8 ms/it
[exp26_head8_emb256_bs8_it2000_do0.1] iter 10/2000 | train_now 3.4268 | train_eval 3.4084 | val 3.4469 | gap -0.0385 | lr 6.53e-05 | params 4.77M | host_mem 17.7% | gpu_mem 146.2MB | 503.7 ms/it
[exp26_head8_emb256_bs8_it2000_do0.1] iter 20/2000 | train_now 3.1023 | train_eval 3.0728 | val 3.1001 | gap -0.0273 | lr 1.25e-04 | params 4.77M | host_mem 17.6% | gpu_mem 146.2MB | 514.4 ms/it
[exp26_head8_emb256_bs8_it2000_do0.1] iter 30/2000 | train_now 2.9585 | train_eval 2.8876 | val 2.8807 | gap 0.0068 | lr 1.84e-04 | params 4.77M | host_mem 17.6% | gpu_mem 146.2MB | 501.3 ms/it
[exp26_head8_emb256_bs8_it2000_do0.1] iter 40/2000 | train_now 2.7835 | train_eval 2.7586 | val 2.7634 | gap -0.0048 | lr 2.44e-04 | params 4.77M | host_mem 17.6% | gpu_mem 146.2MB | 511.1 ms/it
[exp26_head8_emb256_bs8_it20

0,1
gpu_mem_bytes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▅▅▆▆▆▆▆▆▅█▆▆█▅█▆▆▆██▆▆█▅▅▁▁▁▁▁▅▅▅▆▆▅▅▅▅▃
iter,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███
loss/gap(train-val),█▆▇▇██▆▆▇▆█▅▇█▇▆▄▅▅▆▅▃▅▄▃▂▄▃▂▂▂▃▄▂▁▄▁▃▂▄
loss/train_eval,██▇▇▇▇▇▆▆▆▆▆▆▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁
loss/train_iter,████▇▆▇▇▆▆▄▅▅▄▄▅▄▄▄▄▄▃▃▄▄▂▂▃▂▂▂▂▁▂▂▂▁▂▁▁
loss/val,█▃▃▃▃▃▃▃▃▃▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▂▅▆▇██████▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▄▄▄▃▂▂▂▂▂▁▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,146193920.0
host_mem_percent,17.4
iter,1999.0
loss/gap(train-val),-0.08357
loss/train_eval,1.85275
loss/train_iter,1.92125
loss/val,1.93632
lr,0.0
num_params,4771328.0



 >>> RUN 28/32: exp27_head8_emb256_bs8_it2000_do0.2
number of parameters: 4.74M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp27_head8_emb256_bs8_it2000_do0.2] iter 0/2000 | train_now 4.2206 | train_eval 4.1765 | val 4.1763 | gap 0.0002 | lr 5.94e-06 | params 4.77M | host_mem 17.7% | gpu_mem 146.2MB | 404.7 ms/it
[exp27_head8_emb256_bs8_it2000_do0.2] iter 10/2000 | train_now 3.4562 | train_eval 3.4284 | val 3.4680 | gap -0.0397 | lr 6.53e-05 | params 4.77M | host_mem 17.6% | gpu_mem 146.2MB | 496.4 ms/it
[exp27_head8_emb256_bs8_it2000_do0.2] iter 20/2000 | train_now 3.1808 | train_eval 3.1209 | val 3.1450 | gap -0.0240 | lr 1.25e-04 | params 4.77M | host_mem 17.6% | gpu_mem 146.2MB | 507.1 ms/it
[exp27_head8_emb256_bs8_it2000_do0.2] iter 30/2000 | train_now 2.9992 | train_eval 2.9149 | val 2.9110 | gap 0.0038 | lr 1.84e-04 | params 4.77M | host_mem 17.5% | gpu_mem 146.2MB | 501.3 ms/it
[exp27_head8_emb256_bs8_it2000_do0.2] iter 40/2000 | train_now 2.8260 | train_eval 2.7712 | val 2.7738 | gap -0.0026 | lr 2.44e-04 | params 4.77M | host_mem 17.5% | gpu_mem 146.2MB | 519.7 ms/it
[exp27_head8_emb256_bs8_it20

0,1
gpu_mem_bytes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▁▁▁▃▃▆▆▃▁▁▃▃▃▃▃▃█▃▃▃▃▃▁▆▃▆▆▆▆▆▆▃▃▆▆▆▆▆▆▆
iter,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
loss/gap(train-val),▆▇▅▇▆██▆▅▅▅▆▅▆▅▄▄▆▄▂▂▅▃▄▂▁▃▄▃▂▂▂▂▁▂▄▃▁▃▃
loss/train_eval,█▇▅▅▅▄▄▄▄▄▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▆▅▅▅▅▄▄▄▄▄▃▃▃▄▃▃▂▂▂▂▂▃▂▂▂▃▂▂▂▂▁▂▁▁▁▂▁▁▁
loss/val,█▅▅▄▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▇████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,146193920.0
host_mem_percent,17.4
iter,1999.0
loss/gap(train-val),-0.05122
loss/train_eval,1.9764
loss/train_iter,2.04734
loss/val,2.02762
lr,0.0
num_params,4771328.0



 >>> RUN 29/32: exp28_head8_emb256_bs16_it1000_do0.1
number of parameters: 4.74M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp28_head8_emb256_bs16_it1000_do0.1] iter 0/1000 | train_now 4.2315 | train_eval 4.1702 | val 4.1726 | gap -0.0024 | lr 5.94e-06 | params 4.77M | host_mem 17.6% | gpu_mem 146.3MB | 677.1 ms/it
[exp28_head8_emb256_bs16_it1000_do0.1] iter 10/1000 | train_now 3.3975 | train_eval 3.4032 | val 3.4244 | gap -0.0212 | lr 6.53e-05 | params 4.77M | host_mem 17.6% | gpu_mem 146.3MB | 932.8 ms/it
[exp28_head8_emb256_bs16_it1000_do0.1] iter 20/1000 | train_now 3.0853 | train_eval 3.0548 | val 3.0743 | gap -0.0195 | lr 1.25e-04 | params 4.77M | host_mem 17.7% | gpu_mem 146.3MB | 937.5 ms/it
[exp28_head8_emb256_bs16_it1000_do0.1] iter 30/1000 | train_now 2.8618 | train_eval 2.8420 | val 2.8625 | gap -0.0205 | lr 1.84e-04 | params 4.77M | host_mem 17.7% | gpu_mem 146.3MB | 941.2 ms/it
[exp28_head8_emb256_bs16_it1000_do0.1] iter 40/1000 | train_now 2.7767 | train_eval 2.7163 | val 2.7348 | gap -0.0184 | lr 2.44e-04 | params 4.77M | host_mem 17.7% | gpu_mem 146.3MB | 911.8 ms/it
[exp28_head8_emb256_b

0,1
gpu_mem_bytes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▁▃▃▁▁▁▁▃▃▃▆▃▃▆▆▆▆▆▆▆▆▆▆▆▆█▆█▆▆▆▆█▆▆▆▆▆▆█
iter,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
loss/gap(train-val),▇▆▆█▇█▆▆▆▆▅▆▆▆▄▄▅▄▅▅▄▄▄▅▃▂▃▅▃▂▃▄▃▄▂▃▃▃▂▁
loss/train_eval,█▇▇▆▆▆▆▆▆▆▅▅▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▄▄▄▄▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁
loss/val,██▇▇▆▆▆▆▅▅▅▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
lr,▂▃▅▆█████▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,146343424.0
host_mem_percent,17.8
iter,999.0
loss/gap(train-val),-0.06124
loss/train_eval,1.95189
loss/train_iter,2.05913
loss/val,2.01313
lr,0.0
num_params,4771328.0



 >>> RUN 30/32: exp29_head8_emb256_bs16_it1000_do0.2
number of parameters: 4.74M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp29_head8_emb256_bs16_it1000_do0.2] iter 0/1000 | train_now 4.2213 | train_eval 4.1721 | val 4.1744 | gap -0.0024 | lr 5.94e-06 | params 4.77M | host_mem 17.8% | gpu_mem 146.3MB | 682.7 ms/it
[exp29_head8_emb256_bs16_it1000_do0.2] iter 10/1000 | train_now 3.4278 | train_eval 3.4255 | val 3.4486 | gap -0.0231 | lr 6.53e-05 | params 4.77M | host_mem 17.8% | gpu_mem 146.3MB | 922.6 ms/it
[exp29_head8_emb256_bs16_it1000_do0.2] iter 20/1000 | train_now 3.1410 | train_eval 3.0979 | val 3.1180 | gap -0.0201 | lr 1.25e-04 | params 4.77M | host_mem 17.8% | gpu_mem 146.3MB | 940.9 ms/it
[exp29_head8_emb256_bs16_it1000_do0.2] iter 30/1000 | train_now 2.8989 | train_eval 2.8664 | val 2.8968 | gap -0.0305 | lr 1.84e-04 | params 4.77M | host_mem 17.8% | gpu_mem 146.3MB | 928.0 ms/it
[exp29_head8_emb256_bs16_it1000_do0.2] iter 40/1000 | train_now 2.8116 | train_eval 2.7358 | val 2.7510 | gap -0.0152 | lr 2.44e-04 | params 4.77M | host_mem 17.8% | gpu_mem 146.3MB | 903.2 ms/it
[exp29_head8_emb256_b

0,1
gpu_mem_bytes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▄▄▄█▄█▄▄▄▄▄▄▄▄▄▄▄▄▄█▄▄████████████████▁▁
iter,▁▁▁▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▇▇▇▇▇▇▇███
loss/gap(train-val),▅█▇▇▆█▅▆▅▆▅▄▅▄▃▄▃▃▄▅▄▄▅▃▃▄▅▃▄▄▅▂▂▂▂▃▃▃▁▂
loss/train_eval,█▆▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▄▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/val,█▆▅▄▄▄▄▄▄▄▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
lr,▂▂▄▅▆██████▇▇▇▇▆▆▆▅▅▅▅▄▄▄▃▃▃▂▂▂▂▁▁▁▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,146343424.0
host_mem_percent,17.7
iter,999.0
loss/gap(train-val),-0.04049
loss/train_eval,2.05968
loss/train_iter,2.16665
loss/val,2.10017
lr,0.0
num_params,4771328.0



 >>> RUN 31/32: exp30_head8_emb256_bs16_it2000_do0.1
number of parameters: 4.74M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp30_head8_emb256_bs16_it2000_do0.1] iter 0/2000 | train_now 4.2315 | train_eval 4.1702 | val 4.1726 | gap -0.0024 | lr 5.94e-06 | params 4.77M | host_mem 17.8% | gpu_mem 146.3MB | 651.7 ms/it
[exp30_head8_emb256_bs16_it2000_do0.1] iter 10/2000 | train_now 3.3975 | train_eval 3.4032 | val 3.4244 | gap -0.0212 | lr 6.53e-05 | params 4.77M | host_mem 17.7% | gpu_mem 146.3MB | 902.0 ms/it
[exp30_head8_emb256_bs16_it2000_do0.1] iter 20/2000 | train_now 3.0853 | train_eval 3.0548 | val 3.0743 | gap -0.0195 | lr 1.25e-04 | params 4.77M | host_mem 17.6% | gpu_mem 146.3MB | 912.3 ms/it
[exp30_head8_emb256_bs16_it2000_do0.1] iter 30/2000 | train_now 2.8618 | train_eval 2.8420 | val 2.8625 | gap -0.0205 | lr 1.84e-04 | params 4.77M | host_mem 17.9% | gpu_mem 146.3MB | 922.7 ms/it
[exp30_head8_emb256_bs16_it2000_do0.1] iter 40/2000 | train_now 2.7767 | train_eval 2.7163 | val 2.7348 | gap -0.0184 | lr 2.44e-04 | params 4.77M | host_mem 17.9% | gpu_mem 146.3MB | 946.6 ms/it
[exp30_head8_emb256_b

0,1
gpu_mem_bytes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▁▃▃▃▃▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▃▆▆▆▆▆▆█▆▆▆▆▆▆▆▆▆▆█
iter,▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇███
loss/gap(train-val),▇█▆▇▇▇▇▆▆▆▆▆▅▆▅▅▄▄▄▄▃▄▃▃▃▃▂▁▂▂▂▂▁▂▂▂▁▂▂▂
loss/train_eval,█▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,█▇▆▇▆▆▆▆▆▅▅▆▅▅▄▄▄▃▃▃▂▂▃▂▂▂▂▁▂▁▁▂▁▂▁▁▂▂▁▂
loss/val,█▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr,▇██████████▇▇▇▇▆▆▅▅▅▄▄▄▄▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,146343424.0
host_mem_percent,17.9
iter,1999.0
loss/gap(train-val),-0.18678
loss/train_eval,1.62474
loss/train_iter,1.71858
loss/val,1.81152
lr,0.0
num_params,4771328.0



 >>> RUN 32/32: exp31_head8_emb256_bs16_it2000_do0.2
number of parameters: 4.74M


  scaler = torch.cuda.amp.GradScaler(enabled=(dtype=="float16"))


[exp31_head8_emb256_bs16_it2000_do0.2] iter 0/2000 | train_now 4.2213 | train_eval 4.1721 | val 4.1744 | gap -0.0024 | lr 5.94e-06 | params 4.77M | host_mem 17.9% | gpu_mem 146.3MB | 670.5 ms/it
[exp31_head8_emb256_bs16_it2000_do0.2] iter 10/2000 | train_now 3.4278 | train_eval 3.4255 | val 3.4486 | gap -0.0231 | lr 6.53e-05 | params 4.77M | host_mem 17.8% | gpu_mem 146.3MB | 904.4 ms/it
[exp31_head8_emb256_bs16_it2000_do0.2] iter 20/2000 | train_now 3.1410 | train_eval 3.0979 | val 3.1180 | gap -0.0201 | lr 1.25e-04 | params 4.77M | host_mem 17.8% | gpu_mem 146.3MB | 908.3 ms/it
[exp31_head8_emb256_bs16_it2000_do0.2] iter 30/2000 | train_now 2.8989 | train_eval 2.8664 | val 2.8968 | gap -0.0305 | lr 1.84e-04 | params 4.77M | host_mem 17.8% | gpu_mem 146.3MB | 907.6 ms/it
[exp31_head8_emb256_bs16_it2000_do0.2] iter 40/2000 | train_now 2.8116 | train_eval 2.7358 | val 2.7510 | gap -0.0152 | lr 2.44e-04 | params 4.77M | host_mem 17.7% | gpu_mem 146.3MB | 924.2 ms/it
[exp31_head8_emb256_b

0,1
gpu_mem_bytes,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
host_mem_percent,▆█▆▁▆▆▆▆▆▆▆▆▆█▆▆▆▆█▆▆▆█▆▆▆▆▆█▆▆█▆▆▆▆▆▆▆█
iter,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇███
loss/gap(train-val),██▇▇█▆▆▇▆▆▆▇▆▆▅▆▅▆▄▄▄▄▃▄▂▃▃▃▂▃▃▂▂▁▂▂▂▃▂▁
loss/train_eval,█▇▆▆▆▆▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
loss/train_iter,██▇▇▆▆▆▆▅▅▅▅▄▄▄▃▃▃▃▃▃▃▃▃▃▂▂▁▂▂▂▂▂▂▂▂▂▂▁▂
loss/val,███▇▇▇▇▆▆▆▆▆▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
lr,▅▆▇███████▇▇▇▆▆▆▆▅▅▅▄▄▄▄▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁
num_params,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
gpu_mem_bytes,146343424.0
host_mem_percent,17.9
iter,1999.0
loss/gap(train-val),-0.16172
loss/train_eval,1.7464
loss/train_iter,1.85203
loss/val,1.90811
lr,0.0
num_params,4771328.0



Sweep finished.
Sweep summary written line-by-line to all_experiments/sweep_summary.jsonl
Preview results of first few runs:
[
  {
    "run_id": "exp00_head4_emb128_bs8_it1000_do0.1",
    "out_dir": "all_experiments/exp00_head4_emb128_bs8_it1000_do0.1",
    "best_val_loss": 2.2987210631370543,
    "best_ckpt_path": "all_experiments/exp00_head4_emb128_bs8_it1000_do0.1/ckpt_best.pt",
    "num_params": 1206016
  },
  {
    "run_id": "exp01_head4_emb128_bs8_it1000_do0.2",
    "out_dir": "all_experiments/exp01_head4_emb128_bs8_it1000_do0.2",
    "best_val_loss": 2.353180992603302,
    "best_ckpt_path": "all_experiments/exp01_head4_emb128_bs8_it1000_do0.2/ckpt_best.pt",
    "num_params": 1206016
  },
  {
    "run_id": "exp02_head4_emb128_bs8_it2000_do0.1",
    "out_dir": "all_experiments/exp02_head4_emb128_bs8_it2000_do0.1",
    "best_val_loss": 2.1426128268241884,
    "best_ckpt_path": "all_experiments/exp02_head4_emb128_bs8_it2000_do0.1/ckpt_best.pt",
    "num_params": 1206016
  },
  {
  

In [19]:
import os
import torch
import pickle
import tiktoken
from contextlib import nullcontext
from model import GPT, GPTConfig

@torch.no_grad()
def generate_tokens(model, idx, max_new_tokens, temperature=1.0, top_k=None):
    # Same logic as model.generate(), inlined for clarity
    for _ in range(max_new_tokens):
        # crop context if it gets longer than block_size
        idx_cond = idx if idx.size(1) <= model.config.block_size else idx[:, -model.config.block_size:]

        # forward pass
        logits, _ = model(idx_cond)

        # get logits for last position
        logits = logits[:, -1, :] / temperature

        # optionally apply top-k filtering
        if top_k is not None:
            v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
            logits[logits < v[:, [-1]]] = -float('inf')

        # convert logits to probabilities
        probs = torch.softmax(logits, dim=-1)

        # sample next token id
        idx_next = torch.multinomial(probs, num_samples=1)

        # append sampled token id to sequence
        idx = torch.cat((idx, idx_next), dim=1)

    return idx

def load_tokenizer_from_meta_or_gpt2(data_dir):
    """Return encode(str)->list[int], decode(list[int])->str."""
    meta_path = os.path.join(data_dir, "meta.pkl")
    if os.path.exists(meta_path):
        # character-level tokenizer from shakespeare_char
        with open(meta_path, "rb") as f:
            meta = pickle.load(f)
        stoi, itos = meta["stoi"], meta["itos"]
        encode = lambda s: [stoi[c] for c in s]
        decode = lambda l: "".join([itos[i] for i in l])
        return encode, decode
    else:
        # fallback to GPT-2 tokenizer
        enc = tiktoken.get_encoding("gpt2")
        encode = lambda s: enc.encode(s, allowed_special={"<|endoftext|>"})
        decode = lambda l: enc.decode(l)
        return encode, decode

def generate_from_checkpoint(
    exp_dir,
    data_dir,
    prompt,
    max_new_tokens=400,
    temperature=1.0,
    top_k=50,
    output_filename="sample_scene.txt",
    device=None,
):
    """
    Load ckpt_best.pt from exp_dir, rebuild the model, generate continuation
    from `prompt` (e.g. "\n"), and save it to {exp_dir}/{output_filename}.
    Returns the generated full text (prompt + completion).
    """

    ckpt_path = os.path.join(exp_dir, "ckpt_best.pt")
    if not os.path.exists(ckpt_path):
        raise FileNotFoundError(f"Could not find checkpoint at {ckpt_path}")

    # pick device
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    device_type = "cuda" if "cuda" in device else "cpu"

    # load checkpoint
    ckpt = torch.load(ckpt_path, map_location=device)
    cfg = ckpt["config"]  # we saved model hyperparams here

    # infer vocab_size (char-level dataset or fallback)
    meta_path = os.path.join(data_dir, "meta.pkl")
    if os.path.exists(meta_path):
        with open(meta_path, "rb") as f:
            meta = pickle.load(f)
        vocab_size = meta["vocab_size"]
    else:
        vocab_size = 50304  # GPT-2 padded vocab_size used in nanoGPT

    # rebuild the model
    gptconf = GPTConfig(
        block_size=cfg["block_size"],
        vocab_size=vocab_size,
        n_layer=cfg["n_layer"],
        n_head=cfg["n_head"],
        n_embd=cfg["n_embd"],
        dropout=cfg["dropout"],
        bias=False,
    )
    model = GPT(gptconf)
    model.load_state_dict(ckpt["model_state_dict"])
    model.to(device)
    model.eval()

    # tokenizer
    encode, decode = load_tokenizer_from_meta_or_gpt2(data_dir)

    # turn prompt string -> tensor of token ids
    start_ids = encode(prompt)
    x = torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...]

    # autocast for speed on GPU
    ctx = nullcontext() if device_type == "cpu" else torch.amp.autocast(
        device_type=device_type,
        dtype=(torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16)
    )

    with ctx:
        y = generate_tokens(
            model,
            x,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_k=top_k,
        )

    # decode back to text
    full_text = decode(y[0].tolist())

    # save nice structured output for this model
    out_path = os.path.join(exp_dir, output_filename)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("PROMPT:\n")
        f.write(prompt)
        f.write("\n\nGENERATED:\n")
        f.write(full_text)
        f.write("\n")

    print(f"[OK] saved generation to {out_path}")
    return full_text


In [20]:
base_dir = "all_experiments"
data_dir = "data/shakespeare_char"  # same dataset you trained with

# This prompt style encourages "ANGELO:", "DUKE VINCENTIO:", etc.
scene_style_prompt = "\n\n"

for folder in sorted(os.listdir(base_dir)):
    exp_dir = os.path.join(base_dir, folder)
    if not os.path.isdir(exp_dir):
        continue

    print(f">>> Generating Shakespeare-style scene for {folder}")
    try:
        _ = generate_from_checkpoint(
            exp_dir=exp_dir,
            data_dir=data_dir,
            prompt=scene_style_prompt,
            max_new_tokens=400,   # give it room to create multiple speakers
            temperature=1.0,      # more creative / diverse speakers
            top_k=50,             # keeps language closer to 'Shakespeare play'
            output_filename="sample_scene.txt",
        )
    except Exception as e:
        print(f"[WARN] Skipping {folder} due to error: {e}")


>>> Generating Shakespeare-style scene for exp00_head4_emb128_bs8_it1000_do0.1
number of parameters: 1.19M
[OK] saved generation to all_experiments/exp00_head4_emb128_bs8_it1000_do0.1/sample_scene.txt
>>> Generating Shakespeare-style scene for exp01_head4_emb128_bs8_it1000_do0.2
number of parameters: 1.19M
[OK] saved generation to all_experiments/exp01_head4_emb128_bs8_it1000_do0.2/sample_scene.txt
>>> Generating Shakespeare-style scene for exp02_head4_emb128_bs8_it2000_do0.1
number of parameters: 1.19M
[OK] saved generation to all_experiments/exp02_head4_emb128_bs8_it2000_do0.1/sample_scene.txt
>>> Generating Shakespeare-style scene for exp03_head4_emb128_bs8_it2000_do0.2
number of parameters: 1.19M
[OK] saved generation to all_experiments/exp03_head4_emb128_bs8_it2000_do0.2/sample_scene.txt
>>> Generating Shakespeare-style scene for exp04_head4_emb128_bs16_it1000_do0.1
number of parameters: 1.19M
[OK] saved generation to all_experiments/exp04_head4_emb128_bs16_it1000_do0.1/sample_sce

In [21]:
import os
import wandb
import matplotlib.pyplot as plt

ENTITY_NAME = "arunjung1991"         # <-- your wandb username or team
PROJECT_NAME = "nano-sweep"          # <-- the project you set in wandb_project

api = wandb.Api()

runs = api.runs(f"{ENTITY_NAME}/{PROJECT_NAME}")
print(f"Found {len(runs)} runs in {PROJECT_NAME}")




Found 32 runs in nano-sweep


In [22]:
base_dir = "all_experiments"

for run in runs:
    run_id = run.name  # we set name=run_id in wandb.init(...)

    # Find matching local experiment folder
    exp_dir = os.path.join(base_dir, run_id)
    if not os.path.isdir(exp_dir):
        # if for some reason folder names don't match exactly, skip
        print(f"Skipping {run_id} (no dir {exp_dir})")
        continue

    print(f"Processing run {run_id} -> {exp_dir}")

    # download the full history (all logged steps)
    history = run.history(keys=[
        "iter",
        "loss/train_iter",
        "loss/train_eval",
        "loss/val",
        "loss/gap(train-val)",
        "lr",
        "gpu_mem_bytes",
        "num_params",
    ])

    # Convert to plain Python lists (Colab-friendly)
    iters         = history["iter"].tolist()
    train_now     = history["loss/train_iter"].tolist()
    train_eval    = history["loss/train_eval"].tolist()
    val_eval      = history["loss/val"].tolist()
    gap_eval      = history["loss/gap(train-val)"].tolist()
    lr_curve      = history["lr"].tolist()
    gpu_mem_curve = history["gpu_mem_bytes"].tolist()
    params_curve  = history["num_params"].tolist()

    # --- 1. Train vs Val loss plot ---
    plt.figure()
    plt.plot(iters, train_eval, label="train_eval_loss")
    plt.plot(iters, val_eval,   label="val_loss")
    plt.xlabel("iter")
    plt.ylabel("loss")
    plt.title(f"{run_id} - train vs val loss")
    plt.legend()
    plt.grid(True, alpha=0.3)
    loss_plot_path = os.path.join(exp_dir, "loss_curve.png")
    plt.savefig(loss_plot_path, dpi=200, bbox_inches="tight")
    plt.close()

    # --- 2. Gap plot (train_eval - val) ---
    plt.figure()
    plt.plot(iters, gap_eval, label="gap(train-val)")
    plt.xlabel("iter")
    plt.ylabel("loss gap")
    plt.title(f"{run_id} - train/val gap")
    plt.legend()
    plt.grid(True, alpha=0.3)
    gap_plot_path = os.path.join(exp_dir, "gap_curve.png")
    plt.savefig(gap_plot_path, dpi=200, bbox_inches="tight")
    plt.close()

    # --- 3. Learning rate schedule plot ---
    plt.figure()
    plt.plot(iters, lr_curve, label="lr")
    plt.xlabel("iter")
    plt.ylabel("learning rate")
    plt.title(f"{run_id} - LR schedule")
    plt.legend()
    plt.grid(True, alpha=0.3)
    lr_plot_path = os.path.join(exp_dir, "lr_curve.png")
    plt.savefig(lr_plot_path, dpi=200, bbox_inches="tight")
    plt.close()

    # --- 4. GPU memory usage plot (optional) ---
    if any(x is not None for x in gpu_mem_curve):
        plt.figure()
        plt.plot(iters, [m/(1024**2) if m is not None else None for m in gpu_mem_curve],
                 label="gpu_mem (MB)")
        plt.xlabel("iter")
        plt.ylabel("MB allocated")
        plt.title(f"{run_id} - GPU mem usage")
        plt.legend()
        plt.grid(True, alpha=0.3)
        mem_plot_path = os.path.join(exp_dir, "gpu_mem_curve.png")
        plt.savefig(mem_plot_path, dpi=200, bbox_inches="tight")
        plt.close()

    # --- 5. Save summary info as a metadata json (super helpful for comparison) ---
    summary_info = {
        "run_id": run_id,
        "num_params_final": params_curve[-1] if len(params_curve) else None,
        "final_train_eval_loss": train_eval[-1] if len(train_eval) else None,
        "final_val_loss": val_eval[-1] if len(val_eval) else None,
        "final_gap": gap_eval[-1] if len(gap_eval) else None,
        "min_val_loss": float(min(v for v in val_eval if v is not None)) if len(val_eval) else None,
        "sweep_config": run.config,  # contains n_head, n_embd, etc.
    }

    import json
    with open(os.path.join(exp_dir, "summary.json"), "w") as f:
        json.dump(summary_info, f, indent=2)

print("Done. Plots and summaries saved alongside each experiment.")


Processing run exp00_head4_emb128_bs8_it1000_do0.1 -> all_experiments/exp00_head4_emb128_bs8_it1000_do0.1
Processing run exp01_head4_emb128_bs8_it1000_do0.2 -> all_experiments/exp01_head4_emb128_bs8_it1000_do0.2
Processing run exp02_head4_emb128_bs8_it2000_do0.1 -> all_experiments/exp02_head4_emb128_bs8_it2000_do0.1
Processing run exp03_head4_emb128_bs8_it2000_do0.2 -> all_experiments/exp03_head4_emb128_bs8_it2000_do0.2
Processing run exp04_head4_emb128_bs16_it1000_do0.1 -> all_experiments/exp04_head4_emb128_bs16_it1000_do0.1
Processing run exp05_head4_emb128_bs16_it1000_do0.2 -> all_experiments/exp05_head4_emb128_bs16_it1000_do0.2
Processing run exp06_head4_emb128_bs16_it2000_do0.1 -> all_experiments/exp06_head4_emb128_bs16_it2000_do0.1
Processing run exp07_head4_emb128_bs16_it2000_do0.2 -> all_experiments/exp07_head4_emb128_bs16_it2000_do0.2
Processing run exp08_head4_emb256_bs8_it1000_do0.1 -> all_experiments/exp08_head4_emb256_bs8_it1000_do0.1
Processing run exp09_head4_emb256_bs8_

In [39]:
import os

os.environ['GITHUB_TOKEN'] = '***************************'
os.environ['GITHUB_USER'] = '********************'

# now build the remote URL with the token embedded
repo_url = f"https://{os.environ['GITHUB_USER']}:{os.environ['GITHUB_TOKEN']}@github.com/{os.environ['GITHUB_USER']}/nanoGPT-Experiments.git"

# remove any old remotes and add the new one
!git remote remove origin || true
!git remote add origin $repo_url
!git branch -M main


In [40]:
!git push -u origin main


Enumerating objects: 1272, done.
Counting objects:   0% (1/1272)Counting objects:   1% (13/1272)Counting objects:   2% (26/1272)Counting objects:   3% (39/1272)Counting objects:   4% (51/1272)Counting objects:   5% (64/1272)Counting objects:   6% (77/1272)Counting objects:   7% (90/1272)Counting objects:   8% (102/1272)Counting objects:   9% (115/1272)Counting objects:  10% (128/1272)Counting objects:  11% (140/1272)Counting objects:  12% (153/1272)Counting objects:  13% (166/1272)Counting objects:  14% (179/1272)Counting objects:  15% (191/1272)Counting objects:  16% (204/1272)Counting objects:  17% (217/1272)Counting objects:  18% (229/1272)Counting objects:  19% (242/1272)Counting objects:  20% (255/1272)Counting objects:  21% (268/1272)Counting objects:  22% (280/1272)Counting objects:  23% (293/1272)Counting objects:  24% (306/1272)Counting objects:  25% (318/1272)Counting objects:  26% (331/1272)Counting objects:  27% (344/1272)Counting objects:  28% (

In [41]:
!pwd
!ls -lh


/content/nanoGPT
total 372K
drwxr-xr-x 34 root root 4.0K Oct 28 04:51 all_experiments
drwxr-xr-x  2 root root 4.0K Oct 28 04:50 assets
-rw-r--r--  1 root root 4.8K Oct 28 04:50 bench.py
drwxr-xr-x  2 root root 4.0K Oct 28 04:50 config
-rw-r--r--  1 root root 1.8K Oct 28 04:50 configurator.py
drwxr-xr-x  5 root root 4.0K Oct 28 04:50 data
-rw-r--r--  1 root root 1.1K Oct 28 04:50 LICENSE
-rw-r--r--  1 root root  16K Oct 28 04:50 model.py
drwxr-xr-x  2 root root 4.0K Oct 28 04:50 __pycache__
-rw-r--r--  1 root root  14K Oct 28 04:50 README.md
-rw-r--r--  1 root root 3.9K Oct 28 04:50 sample.py
-rw-r--r--  1 root root 263K Oct 28 04:50 scaling_laws.ipynb
-rw-r--r--  1 root root  15K Oct 28 04:50 train.py
-rw-r--r--  1 root root  15K Oct 28 04:50 transformer_sizing.ipynb
drwxr-xr-x 34 root root 4.0K Oct 28 05:37 wandb


In [42]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [45]:
!cp "/content/drive/MyDrive/Colab Notebooks/nanoGPT_assignment.ipynb" /content/nanoGPT-Experiments/


cp: cannot create regular file '/content/nanoGPT-Experiments/': Not a directory
