# CS336 Assignment 2: Systems - Google Colab Setup

This notebook sets up and runs CS336 Assignment 2 in Google Colab with GPU support.

**Important**: Make sure to enable GPU in Colab:
- Go to Runtime → Change runtime type → Hardware accelerator → GPU (T4 or better)

## Step 1: Check GPU and Clone Repository

In [None]:
# Check GPU availability
!nvidia-smi

# Clone the repository (replace with your repo URL)
# Option 1: Public repo
!git clone https://github.com/YOUR_USERNAME/assignment2-systems.git

# Option 2: Private repo (you'll need to authenticate)
# !git clone https://YOUR_TOKEN@github.com/YOUR_USERNAME/assignment2-systems.git

# Option 3: Upload a zip file
# from google.colab import files
# uploaded = files.upload()  # Upload cs336_assignment2_cloud.tar.gz
# !tar -xzf cs336_assignment2_cloud.tar.gz

%cd assignment2-systems

## Step 2: Install Dependencies

In [None]:
# Install PyTorch (Colab usually has it pre-installed, but let's ensure CUDA version)
import torch
print(f"Existing PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

# Install additional dependencies
!pip install -q numpy tqdm matplotlib pandas pytest regex humanfriendly wandb triton

# Install local packages
!pip install -e ./cs336-basics
!pip install -e .

# Verify installation
import cs336_basics
import cs336_systems
print("\nModules imported successfully!")
print(f"CUDA device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

## Step 3: Run Benchmarking Script

In [None]:
# Run with default parameters
# No command-line flags are passed, so the script falls back to its own defaults.
# The relative script path requires the working directory to be the repository
# root (set by the %cd in Step 1).
!python cs336_systems/benchmarking_script.py

In [None]:
# Run with custom parameters for larger model
# All overrides are passed as command-line flags in one backslash-continued
# shell command. The flag semantics are defined by the script's argument
# parser, which is not shown in this notebook -- presumably batch size, context
# length, model width, layer count, head count, feed-forward width, and the
# number of timed / warm-up runs; verify against benchmarking_script.py.
# The relative script path requires the working directory to be the repository
# root (set by the %cd in Step 1).
!python cs336_systems/benchmarking_script.py \
    --batchsize 64 \
    --context 256 \
    --dmodel 1024 \
    --nlayers 24 \
    --nheads 16 \
    --dff 4096 \
    --num-runs 5 \
    --num-warmup 2

## Step 4: Interactive Benchmarking

In [None]:
import sys

# Make the checkout importable from this kernel. The membership guard keeps
# sys.path from growing by one duplicate entry every time the cell is re-run.
REPO_DIR = '/content/assignment2-systems'
if REPO_DIR not in sys.path:
    sys.path.append(REPO_DIR)

from cs336_systems.benchmarking_script import Config, benchmark_model, generate_random_data
from cs336_basics.model import BasicsTransformerLM
import torch
import matplotlib.pyplot as plt

# Every model and input below is moved to the GPU with .cuda(); fail up front
# with an actionable message instead of a low-level CUDA error mid-loop.
if not torch.cuda.is_available():
    raise RuntimeError(
        "No CUDA device available. Enable a GPU via Runtime > Change runtime type."
    )

# Test different model sizes
results = []
model_sizes = [
    {"name": "Small", "d_model": 256, "num_layers": 6, "num_heads": 8},
    {"name": "Medium", "d_model": 512, "num_layers": 12, "num_heads": 8},
    {"name": "Large", "d_model": 768, "num_layers": 12, "num_heads": 12},
]

for size in model_sizes:
    # Batch size, vocabulary and context length are held fixed so that only the
    # model dimensions vary between runs; d_ff is always 4 * d_model.
    config = Config(
        batch_size=32,
        vocab_size=10000,
        context_length=128,
        d_model=size["d_model"],
        num_layers=size["num_layers"],
        num_heads=size["num_heads"],
        d_ff=size["d_model"] * 4
    )

    # rope_theta is not set above, so it comes from Config's own default.
    model = BasicsTransformerLM(
        vocab_size=config.vocab_size,
        context_length=config.context_length,
        d_model=config.d_model,
        num_layers=config.num_layers,
        num_heads=config.num_heads,
        d_ff=config.d_ff,
        rope_theta=config.rope_theta
    ).cuda()

    x = generate_random_data(config).cuda()

    print(f"\nBenchmarking {size['name']} model...")
    # The returned mapping is read through its "avg" and "min" entries below.
    timing = benchmark_model(model, x, num_runs=5, num_warmup=2)

    results.append({
        "name": size["name"],
        "params": sum(p.numel() for p in model.parameters()),
        "avg_time": timing["avg"],
        "min_time": timing["min"]
    })

    print(f"Parameters: {results[-1]['params']:,}")
    print(f"Avg time: {timing['avg']:.4f}s")

# Plot results
names = [r["name"] for r in results]
times = [r["avg_time"] for r in results]

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(names, times)
ax.set_xlabel("Model Size")
ax.set_ylabel("Average Forward Pass Time (s)")
ax.set_title("Model Size vs Inference Time")
# bar_label pads in points, so the labels sit just above the bars whatever the
# magnitude of the timings (a fixed data-unit offset such as +0.001 does not
# scale with the axis).
ax.bar_label(bars, labels=[f"{t:.4f}s" for t in times], padding=3)
plt.show()

## Step 5: Run Tests

In [None]:
# Run all tests
# -v makes pytest list every test with its individual result. The relative
# ./tests/ path requires the working directory to be the repository root
# (set by the %cd in Step 1).
!pytest -v ./tests/

# Or run specific test files
# !pytest -v ./tests/test_attention.py

## Step 6: Memory Profiling (Optional)

In [None]:
# Profile GPU memory usage
import torch
import gc

# Imported here as well so this optional step also runs when Step 4 was skipped.
from cs336_systems.benchmarking_script import Config, generate_random_data
from cs336_basics.model import BasicsTransformerLM


def get_gpu_memory():
    """Return the memory currently allocated to tensors on the current CUDA device, in GiB."""
    return torch.cuda.memory_allocated() / 1024**3  # bytes -> GiB


# Clear GPU memory
# Notebook globals left over from Step 4 or from a previous run of this cell
# still reference CUDA tensors; gc.collect()/empty_cache() cannot release that
# memory until the references are gone, which would inflate the baseline below.
# All three names are re-assigned later in this cell.
for _stale_name in ("model", "x", "output"):
    globals().pop(_stale_name, None)
del _stale_name
gc.collect()
torch.cuda.empty_cache()

print(f"Initial GPU memory: {get_gpu_memory():.2f} GB")

# Create a large model
config = Config(
    batch_size=32,
    vocab_size=50000,
    context_length=512,
    d_model=1024,
    num_layers=24,
    num_heads=16,
    d_ff=4096
)

# rope_theta is not set above, so it comes from Config's own default.
model = BasicsTransformerLM(
    vocab_size=config.vocab_size,
    context_length=config.context_length,
    d_model=config.d_model,
    num_layers=config.num_layers,
    num_heads=config.num_heads,
    d_ff=config.d_ff,
    rope_theta=config.rope_theta
).cuda()

print(f"After model creation: {get_gpu_memory():.2f} GB")

# Run forward pass
x = generate_random_data(config).cuda()
# Restart peak tracking so the peak reported below covers only the forward pass.
torch.cuda.reset_peak_memory_stats()
with torch.no_grad():
    output = model(x)

# memory_allocated() only counts tensors that are still alive (weights, input,
# output); the peak additionally captures temporaries freed during the pass.
print(f"After forward pass: {get_gpu_memory():.2f} GB")
print(f"Peak during forward pass: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
print(f"\nModel parameters: {sum(p.numel() for p in model.parameters()):,}")