In [1]:
import cs336_basics

In [1]:
# Original benchmarking script -> ran as far as the "large" forward+backward pass on a 32 GB GPU

import argparse
import torch
import timeit
import numpy as np
from cs336_basics.model import BasicsTransformerLM

# Transformer hyperparameters for each named model size.
MODEL_SIZES = {
    "small": dict(d_model=768, d_ff=3072, num_layers=12, num_heads=12),
    "medium": dict(d_model=1024, d_ff=4096, num_layers=24, num_heads=16),
    "large": dict(d_model=1280, d_ff=5120, num_layers=36, num_heads=20),
    "xl": dict(d_model=1600, d_ff=6400, num_layers=48, num_heads=25),
    "2.7B": dict(d_model=2560, d_ff=10240, num_layers=32, num_heads=32),
}

def benchmark(model_size="small", batch_size=8, seq_len=512, steps=5, warmup=1, backward=False):
    """Time repeated passes of a BasicsTransformerLM over a fixed random token batch.

    Args:
        model_size: Key into MODEL_SIZES selecting d_model, d_ff, num_layers and num_heads.
        batch_size: Number of sequences in the random batch.
        seq_len: Tokens per sequence; also passed to the model as context_length.
        steps: Number of timed iterations. Must be at least 1.
        warmup: Number of untimed iterations run before timing starts.
        backward: If True, every iteration also computes a cross-entropy loss
            against random targets, backpropagates, and applies an AdamW step
            (so the reported time includes the optimizer update).

    Returns:
        None. The mean and standard deviation of the per-iteration wall-clock
        time are printed.

    Raises:
        KeyError: If model_size is not a key of MODEL_SIZES.
        ValueError: If steps is smaller than 1.
    """
    # Fail before allocating the model: with no timed steps the statistics
    # below would silently come out as NaN.
    if steps < 1:
        raise ValueError(f"steps must be at least 1, got {steps}")

    config = MODEL_SIZES[model_size]
    device = "cuda" if torch.cuda.is_available() else "cpu"
    vocab_size = 50257

    model = BasicsTransformerLM(
        vocab_size=vocab_size,
        context_length=seq_len,
        d_model=config["d_model"],
        d_ff=config["d_ff"],
        num_layers=config["num_layers"],
        num_heads=config["num_heads"],
        attn_pdrop=0.1,
        residual_pdrop=0.1
    ).to(device)

    dummy_input = torch.randint(0, vocab_size, (batch_size, seq_len), device=device)
    dummy_target = torch.randint(0, vocab_size, (batch_size, seq_len), device=device)

    optimizer = torch.optim.AdamW(model.parameters())

    def wait_for_device():
        # CUDA work is queued asynchronously; block until it has finished so
        # the host-side timer covers completed work.
        if device == "cuda":
            torch.cuda.synchronize()

    def run_step():
        # One iteration, shared by the warm-up and the timed loop. Keeping
        # `output` local to this helper drops the forward graph when the step
        # returns, instead of holding the previous iteration's graph alive
        # while the next forward pass is being built.
        output = model(dummy_input)
        if backward:
            loss = torch.nn.functional.cross_entropy(output.view(-1, output.size(-1)), dummy_target.view(-1))
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        wait_for_device()

    # Warm-up steps
    for _ in range(warmup):
        run_step()

    # With warmup == 0 nothing has synchronized yet; drain the queued setup
    # work (parameter copies, random batch generation) so the first timed
    # step is not charged for it.
    wait_for_device()

    # Timed steps
    times = []
    for _ in range(steps):
        start = timeit.default_timer()
        run_step()
        times.append(timeit.default_timer() - start)

    avg_time = np.mean(times)
    std_dev = np.std(times)
    kind = "forward+backward" if backward else "forward"
    print(f"[{model_size}] {kind} pass — Avg: {avg_time:.4f}s, Std Dev: {std_dev:.4f}s")

if __name__ == "__main__":
    # Entry point: read the benchmark settings from the command line and run
    # a single benchmark with them.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_size", type=str, choices=list(MODEL_SIZES), default="small")
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--seq_len", type=int, default=512)
    parser.add_argument("--steps", type=int, default=5)
    parser.add_argument("--warmup", type=int, default=1)
    parser.add_argument("--backward", action="store_true", help="Include backward pass")

    # parse_known_args() instead of parse_args(): when this cell runs inside a
    # Jupyter kernel, sys.argv holds the kernel's own "-f <connection file>"
    # arguments, which parse_args() rejects by exiting the process. Arguments
    # this parser does not define are ignored.
    args, _unrecognized = parser.parse_known_args()

    benchmark(
        model_size=args.model_size,
        batch_size=args.batch_size,
        seq_len=args.seq_len,
        steps=args.steps,
        warmup=args.warmup,
        backward=args.backward
    )



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.4 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/home/cadekane/.conda/envs/cs336_systems/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/cadekane/.conda/envs/cs336_systems/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/cadekane/.conda/envs/cs336_systems/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/cadekane/.conda/envs/cs336_systems/lib/python3.10/site-packages/traitlets/config/

In [None]:
# Example usage
# benchmark("small", backward=False)
# benchmark("small", backward=True)

In [10]:
# Ask PyTorch's caching allocator to release GPU memory it holds but is not
# currently using; memory still referenced by live tensors is not freed.
torch.cuda.empty_cache()

In [2]:
# Time the "medium" model twice: forward only first, then forward + backward.
# The CUDA cache is cleared after each run.
for include_backward in (False, True):
    benchmark("medium", backward=include_backward)
    torch.cuda.empty_cache()

OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU 0 has a total capacity of 15.56 GiB of which 5.44 MiB is free. Including non-PyTorch memory, this process has 15.55 GiB memory in use. Of the allocated memory 15.40 GiB is allocated by PyTorch, and 6.41 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Time the "large" model: forward only first, then forward + backward.
for include_backward in (False, True):
    benchmark("large", backward=include_backward)

In [None]:
# Time the "xl" model: forward only first, then forward + backward.
for include_backward in (False, True):
    benchmark("xl", backward=include_backward)

In [None]:
# Time the "2.7B" model: forward only first, then forward + backward.
for include_backward in (False, True):
    benchmark("2.7B", backward=include_backward)

In [12]:
# Shell out to nvidia-smi to print the GPU's current memory usage and utilization.
!nvidia-smi

Wed Apr 30 09:36:23 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.124.06             Driver Version: 570.124.06     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Quadro RTX 5000                On  |   00000000:89:00.0 Off |                  Off |
| 33%   33C    P8             10W /  230W |   15920MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## PyTorch profiler

# 3 Distributed data parallel training

## 3.1 Single-node distributed communication in PyTorch
Best practices for benchmarking distributed applications.

In [None]:
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import time
import os
import argparse

def run(rank, world_size, backend, device_type, tensor_size_MB, return_dict):
    """Worker entry point: time a single all-reduce and record the elapsed seconds.

    Joins a process group on 127.0.0.1:29500, all-reduces a float32 tensor of
    ones five times as warm-up, then times one more all-reduce.

    Args:
        rank: Index of this process within the group.
        world_size: Total number of processes in the group.
        backend: Backend name handed to dist.init_process_group.
        device_type: "cuda" to place the tensor on a GPU (chosen as
            rank modulo the visible GPU count); any other value uses the CPU.
        tensor_size_MB: Size of the tensor in MiB (may be fractional).
        return_dict: Mapping that receives the elapsed time under key `rank`.
    """
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"

    use_cuda = device_type == "cuda"
    if use_cuda:
        device = torch.device(f"{device_type}:{rank % torch.cuda.device_count()}")
        # Make this rank's GPU the current device. Without this, CUDA calls
        # that default to the current device act on cuda:0 in every process,
        # so ranks > 0 would not wait for their own all-reduce to finish.
        torch.cuda.set_device(device)
    else:
        device = torch.device("cpu")

    dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
    try:
        dtype = torch.float32
        # 4 bytes per float32 element.
        numel = int((tensor_size_MB * 1024 * 1024) / 4)
        tensor = torch.ones(numel, dtype=dtype, device=device)

        # Warm-up
        for _ in range(5):
            dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
            if use_cuda:
                torch.cuda.synchronize(device)

        # Timed
        if use_cuda:
            torch.cuda.synchronize(device)
        # perf_counter is monotonic and high resolution; time.time() can jump
        # if the system clock is adjusted.
        start = time.perf_counter()
        dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
        if use_cuda:
            torch.cuda.synchronize(device)
        elapsed = time.perf_counter() - start
    finally:
        # Tear the group down even if a collective raised.
        dist.destroy_process_group()

    return_dict[rank] = elapsed

def benchmark(backend, device_type, processes, sizes_MB):
    """Run the all-reduce timing worker once per tensor size and average over ranks.

    NOTE: this definition rebinds the name `benchmark`, which an earlier cell
    of this notebook uses for the model-timing function; calls meant for that
    function must run before this definition does.

    Args:
        backend: Backend name forwarded to each worker.
        device_type: Device type forwarded to each worker ("cuda" or "cpu").
        processes: Number of worker processes to spawn per measurement.
        sizes_MB: Iterable of tensor sizes in MiB to measure.

    Returns:
        A list of (backend, device_type, processes, size_MB, avg_time) tuples,
        one per entry of sizes_MB, where avg_time is the mean of the per-rank
        elapsed times in seconds.
    """
    results = []
    # One manager serves the whole sweep, and the `with` block shuts its
    # server process down on exit. Creating a fresh Manager per size without
    # shutting it down left one helper process behind for every measurement.
    with mp.Manager() as manager:
        for size_MB in sizes_MB:
            return_dict = manager.dict()
            mp.spawn(run,
                     args=(processes, backend, device_type, size_MB, return_dict),
                     nprocs=processes,
                     join=True)
            # Copy the values out while the manager is still alive.
            times = list(return_dict.values())
            avg_time = sum(times) / len(times)
            results.append((backend, device_type, processes, size_MB, avg_time))
    return results

if __name__ == "__main__":
    # Sweep every (backend, device) pairing over several process counts and
    # tensor sizes, then tabulate, save and plot the timings.
    sizes_MB = [0.5, 1, 10, 50, 100, 500, 1024]
    configs = [
        ("gloo", "cpu"),
        ("gloo", "cuda"),
        ("nccl", "cuda"),
    ]
    all_results = []

    for backend, device in configs:
        for proc in [2, 4, 6]:
            # GPU configurations are skipped when CUDA is unavailable or when
            # there are fewer GPUs than requested processes.
            if device == "cuda" and (
                not torch.cuda.is_available() or proc > torch.cuda.device_count()
            ):
                continue
            print(f"Running: {backend} on {device} with {proc} processes")
            all_results.extend(benchmark(backend, device, proc, sizes_MB))

    # Save or print results
    import pandas as pd
    df = pd.DataFrame(all_results, columns=["Backend", "Device", "Processes", "Size_MB", "Time_s"])
    print(df)
    df.to_csv("allreduce_benchmark_results.csv", index=False)

    # Plot: one line per (backend, device, process count), log-log axes.
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(figsize=(12, 6))
    for (backend, device, nproc), sub in df.groupby(["Backend", "Device", "Processes"]):
        ax.plot(sub["Size_MB"], sub["Time_s"], label=f"{backend}+{device} ({nproc} proc)", marker="o")
    ax.set_xscale("log")
    ax.set_yscale("log")
    ax.set_xlabel("Tensor Size (MB)")
    ax.set_ylabel("AllReduce Time (s)")
    ax.set_title("AllReduce Performance")
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    fig.savefig("allreduce_benchmark_plot.png")


## 3.3 Naive implementation of distributed data parallel (DDP) training
Each device first constructs a randomly initialized model. We then use a broadcast collective so that every device holds an identical copy of the model parameters and optimizer states.