In [48]:
import gc
from time import sleep

import pandas as pd
import torch

from p2_efficiency.utils import benchmark_llm, profile_llm_memory

In [2]:
# One row per model size: (size, num_layers, d_model, d_ff, num_heads).
_MODEL_ROWS = (
    ("small",  12,  768,  3072, 12),
    ("medium", 24, 1024,  4096, 16),
    ("large",  36, 1280,  5120, 20),
    ("xl",     48, 1600,  6400, 25),
    ("2.7B",   32, 2560, 10240, 32),
)
_MODEL_FIELDS = ("size", "num_layers", "d_model", "d_ff", "num_heads")

# Column-oriented view of the table above: field name -> list with one entry per model size.
inputs = {field: list(column) for field, column in zip(_MODEL_FIELDS, zip(*_MODEL_ROWS))}
# Result columns start out empty; they only name the columns of the benchmark table.
inputs.update({
    result_field: []
    for result_field in ("cntx_len", "mode", "mean (milliseconds)", "std (milliseconds)")
})

# Sweep axes combined with every model size.
cntx_lens = [128, 256, 512, 1024]
modes = ["forward", "forward+backward"]

# Benchmark speed

In [3]:
def run_benchmark(inputs, warmup_iters = 5):
    """Time every (model size, context length, mode) combination with ``benchmark_llm``.

    Reads the notebook-level ``cntx_lens`` and ``modes`` lists for the sweep axes.

    Args:
        inputs: Column-oriented dict. Keys with non-empty lists hold one entry per
            model size and must include ``size``, ``d_model``, ``d_ff``,
            ``num_layers`` and ``num_heads``. Keys with empty lists only name result
            columns and must include ``cntx_len``, ``mode``,
            ``mean (milliseconds)`` and ``std (milliseconds)``.
        warmup_iters: Forwarded to ``benchmark_llm`` as ``warmup_iters``.

    Returns:
        dict with the same keys as ``inputs`` and one list entry per combination
        that finished; combinations that ran out of GPU memory are skipped.

    Raises:
        RuntimeError: Re-raised unchanged when the failure is not an out-of-memory error.
    """
    data_benchmark = {k: [] for k in inputs}
    # Columns that describe the model itself (as opposed to empty result placeholders).
    model_columns = [k for k, values in inputs.items() if len(values) > 0]

    for i, size in enumerate(inputs["size"]):
        for cntx_len in cntx_lens:
            for mode in modes:
                hit_oom = False
                try:
                    m, std = benchmark_llm(
                        d_model = inputs["d_model"][i], d_ff = inputs["d_ff"][i],
                        num_layers = inputs["num_layers"][i], num_heads = inputs["num_heads"][i],
                        context_length = cntx_len, warmup_iters = warmup_iters,
                        benchmark_iters = 10, mode = mode
                    )
                except RuntimeError as e:
                    if "out of memory" not in str(e):
                        raise
                    # Only record the OOM here. While the handler is active, the
                    # exception's traceback keeps the failed run's frames (and their
                    # tensors) alive, so cleanup must wait until the clause has exited.
                    hit_oom = True
                else:
                    # Append a full row only after a successful run so all columns
                    # stay the same length.
                    for k in model_columns:
                        data_benchmark[k].append(inputs[k][i])
                    data_benchmark["cntx_len"].append(cntx_len)
                    data_benchmark["mode"].append(mode)
                    data_benchmark["mean (milliseconds)"].append(m)
                    data_benchmark["std (milliseconds)"].append(std)

                if hit_oom:
                    print(f"⚠️ OOM for {size} model with context length = {cntx_len}, freeing memory...")
                    gc.collect()               # drop unreachable tensors first ...
                    torch.cuda.empty_cache()   # ... then return their cached blocks to the driver
                    torch.cuda.synchronize()   # wait for outstanding GPU work to finish
                # Pause between runs (presumably to let the GPU settle; value kept from the original).
                sleep(3)

    gc.collect()
    torch.cuda.empty_cache()
    return data_benchmark


In [4]:
%%time
data_benchmark0 = run_benchmark(inputs, 0)

⚠️ OOM for medium model with context length = 1024, freeing memory...
⚠️ OOM for medium model with context length = 1024, freeing memory...
⚠️ OOM for large model with context length = 512, freeing memory...
⚠️ OOM for large model with context length = 1024, freeing memory...
⚠️ OOM for large model with context length = 1024, freeing memory...
⚠️ OOM for xl model with context length = 512, freeing memory...
⚠️ OOM for xl model with context length = 512, freeing memory...
⚠️ OOM for xl model with context length = 1024, freeing memory...
⚠️ OOM for xl model with context length = 1024, freeing memory...
⚠️ OOM for 2.7B model with context length = 128, freeing memory...
⚠️ OOM for 2.7B model with context length = 256, freeing memory...
⚠️ OOM for 2.7B model with context length = 512, freeing memory...
⚠️ OOM for 2.7B model with context length = 512, freeing memory...
⚠️ OOM for 2.7B model with context length = 1024, freeing memory...
⚠️ OOM for 2.7B model with context length = 1024, freein

In [5]:
# Timings collected with warmup_iters=0. In the recorded output the first two rows
# show a much larger mean/std than the same rows of the runs that use warmup.
pd.DataFrame(data_benchmark0)

Unnamed: 0,size,num_layers,d_model,d_ff,num_heads,cntx_len,mode,mean (milliseconds),std (milliseconds)
0,small,12,768,3072,12,128,forward,27.210902,49.40992
1,small,12,768,3072,12,128,forward+backward,31.933658,26.38063
2,small,12,768,3072,12,256,forward,11.677848,1.127935
3,small,12,768,3072,12,256,forward+backward,27.421579,0.057029
4,small,12,768,3072,12,512,forward,17.287351,1.050202
5,small,12,768,3072,12,512,forward+backward,49.40088,0.045378
6,small,12,768,3072,12,1024,forward,54.00419,0.434013
7,small,12,768,3072,12,1024,forward+backward,158.102872,0.055844
8,medium,24,1024,4096,16,128,forward,20.932854,0.370619
9,medium,24,1024,4096,16,128,forward+backward,49.80297,0.117829


In [4]:
%%time
data_benchmark1 = run_benchmark(inputs, 1)

⚠️ OOM for medium model with context length = 1024, freeing memory...
⚠️ OOM for medium model with context length = 1024, freeing memory...
⚠️ OOM for large model with context length = 512, freeing memory...
⚠️ OOM for large model with context length = 1024, freeing memory...
⚠️ OOM for large model with context length = 1024, freeing memory...
⚠️ OOM for xl model with context length = 512, freeing memory...
⚠️ OOM for xl model with context length = 512, freeing memory...
⚠️ OOM for xl model with context length = 1024, freeing memory...
⚠️ OOM for xl model with context length = 1024, freeing memory...
⚠️ OOM for 2.7B model with context length = 128, freeing memory...
⚠️ OOM for 2.7B model with context length = 256, freeing memory...
⚠️ OOM for 2.7B model with context length = 512, freeing memory...
⚠️ OOM for 2.7B model with context length = 512, freeing memory...
⚠️ OOM for 2.7B model with context length = 1024, freeing memory...
⚠️ OOM for 2.7B model with context length = 1024, freein

In [6]:
# Timings collected with warmup_iters=1.
pd.DataFrame(data_benchmark1)

Unnamed: 0,size,num_layers,d_model,d_ff,num_heads,cntx_len,mode,mean (milliseconds),std (milliseconds)
0,small,12,768,3072,12,128,forward,10.715511,1.201898
1,small,12,768,3072,12,128,forward+backward,22.352556,0.697683
2,small,12,768,3072,12,256,forward,11.609577,1.111441
3,small,12,768,3072,12,256,forward+backward,27.318731,0.055945
4,small,12,768,3072,12,512,forward,16.981092,0.771432
5,small,12,768,3072,12,512,forward+backward,49.322208,0.014699
6,small,12,768,3072,12,1024,forward,53.888881,0.408823
7,small,12,768,3072,12,1024,forward+backward,157.983309,0.023634
8,medium,24,1024,4096,16,128,forward,20.897351,0.38432
9,medium,24,1024,4096,16,128,forward+backward,49.84307,0.06283


In [4]:
%%time
data_benchmark5 = run_benchmark(inputs, 5)

⚠️ OOM for medium model with context length = 1024, freeing memory...
⚠️ OOM for medium model with context length = 1024, freeing memory...
⚠️ OOM for large model with context length = 512, freeing memory...
⚠️ OOM for large model with context length = 1024, freeing memory...
⚠️ OOM for large model with context length = 1024, freeing memory...
⚠️ OOM for xl model with context length = 512, freeing memory...
⚠️ OOM for xl model with context length = 512, freeing memory...
⚠️ OOM for xl model with context length = 1024, freeing memory...
⚠️ OOM for xl model with context length = 1024, freeing memory...
⚠️ OOM for 2.7B model with context length = 128, freeing memory...
⚠️ OOM for 2.7B model with context length = 256, freeing memory...
⚠️ OOM for 2.7B model with context length = 512, freeing memory...
⚠️ OOM for 2.7B model with context length = 512, freeing memory...
⚠️ OOM for 2.7B model with context length = 1024, freeing memory...
⚠️ OOM for 2.7B model with context length = 1024, freein

In [5]:
# Timings collected with warmup_iters=5; in the recorded output every row's std is
# below 0.1 ms.
pd.DataFrame(data_benchmark5)

Unnamed: 0,size,num_layers,d_model,d_ff,num_heads,cntx_len,mode,mean (milliseconds),std (milliseconds)
0,small,12,768,3072,12,128,forward,10.248798,0.031703
1,small,12,768,3072,12,128,forward+backward,22.19391,0.035818
2,small,12,768,3072,12,256,forward,11.253574,0.027241
3,small,12,768,3072,12,256,forward+backward,27.365297,0.056862
4,small,12,768,3072,12,512,forward,16.7624,0.024275
5,small,12,768,3072,12,512,forward+backward,49.3912,0.01292
6,small,12,768,3072,12,1024,forward,53.803942,0.074543
7,small,12,768,3072,12,1024,forward+backward,158.165682,0.060323
8,medium,24,1024,4096,16,128,forward,20.782726,0.055863
9,medium,24,1024,4096,16,128,forward+backward,50.035197,0.072863


# Profile memory

In [5]:
%%time
for i in range(5):
    d_model, d_ff, num_layers, num_heads = info["d_model"][i], info["d_ff"][i], info["num_layers"][i], info["num_heads"][i]
    profile_llm_memory(d_model = d_model, d_ff = d_ff, num_layers = num_layers , num_heads = num_heads)



CPU times: user 38.5 s, sys: 4.54 s, total: 43 s
Wall time: 43.7 s


OutOfMemoryError: CUDA out of memory. Tried to allocate 40.00 MiB. GPU 0 has a total capacity of 31.37 GiB of which 8.81 MiB is free. Including non-PyTorch memory, this process has 31.34 GiB memory in use. Of the allocated memory 27.45 GiB is allocated by PyTorch, and 3.28 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [4]:
# Environment sanity check: print the interpreter the kernel is running and the
# `python` the shell resolves to, so the two can be compared.
import sys
print(sys.executable)
!which python

/home/sasha/.cache/uv/builds-v0/.tmp0pyFvl/bin/python
/home/sasha/.cache/uv/builds-v0/.tmp0pyFvl/bin/python


# Mixed Precision (AMP)

In [1]:
import torch
from torch import nn
from torch.amp import autocast, GradScaler
from torch.optim import AdamW

## Intuition

In [3]:
# float32 increments summed into a float32 accumulator (reference case):
# add 0.01 one thousand times, ideal result 10.
increment = torch.tensor(0.01, dtype=torch.float32)
s = torch.zeros((), dtype=torch.float32)
for i in range(1000):
    s += increment
print(s)

tensor(10.0001)


In [4]:
# float16 increments summed into a float16 accumulator:
# every addition is rounded to float16, so the error builds up over the 1000 steps.
increment = torch.tensor(0.01, dtype=torch.float16)
s = torch.zeros((), dtype=torch.float16)
for i in range(1000):
    s += increment
print(s)

tensor(9.9531, dtype=torch.float16)


In [5]:
# float16 increments summed into a float32 accumulator:
# the in-place add keeps the accumulator's float32 dtype, so the remaining error
# comes mostly from 0.01 not being exactly representable in float16.
increment = torch.tensor(0.01, dtype=torch.float16)
s = torch.zeros((), dtype=torch.float32)
for i in range(1000):
    s += increment
print(s)

tensor(10.0021)


In [6]:
# float16 increments explicitly upcast to float32 before being summed in float32:
# the recorded result matches the implicit-promotion cell above.
x = torch.tensor(0.01, dtype=torch.float16)
increment = x.to(torch.float32)
s = torch.zeros((), dtype=torch.float32)
for i in range(1000):
    s += increment
print(s)

tensor(10.0021)


In [7]:
# bfloat16 increments summed into a bfloat16 accumulator:
# in the recorded run the sum stalls at 4 -- from there on, adding 0.01 is rounded
# away because it is smaller than half the spacing between bfloat16 values.
increment = torch.tensor(0.01, dtype=torch.bfloat16)
s = torch.zeros((), dtype=torch.bfloat16)
for i in range(1000):
    s += increment
print(s)

tensor(4., dtype=torch.bfloat16)


In [8]:
# bfloat16 increments summed into a float32 accumulator:
# the accumulation stays in float32, so the remaining error comes mostly from
# 0.01 not being exactly representable in bfloat16.
increment = torch.tensor(0.01, dtype=torch.bfloat16)
s = torch.zeros((), dtype=torch.float32)
for i in range(1000):
    s += increment
print(s)

tensor(10.0098)


## Understanding Toy Model dtypes

In [2]:
class ToyModel(nn.Module):
    """Linear -> ReLU -> LayerNorm -> Linear network that prints the dtype after every stage."""

    def __init__(self, in_features: int, out_features: int):
        """Create the layers; the hidden width is fixed at 10 and the linear layers have no bias."""
        super().__init__()
        hidden_features = 10
        # Registration order is kept (fc1, ln, fc2, relu) so parameter order is unchanged.
        self.fc1 = nn.Linear(in_features, hidden_features, bias=False)
        self.ln = nn.LayerNorm(hidden_features)
        self.fc2 = nn.Linear(hidden_features, out_features, bias=False)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run the network on ``x`` and print the tensor dtype at the input and after each layer."""
        print("       input dtype:", x.dtype)
        # Layers in application order, each paired with the label printed after it runs.
        stages = (
            (" fc1 activat dtype:", self.fc1),
            ("relu activat dtype:", self.relu),
            ("  ln activat dtype:", self.ln),
            (" fc2 activat dtype:", self.fc2),
        )
        for label, layer in stages:
            x = layer(x)
            print(label, x.dtype)
        return x

In [3]:
# Sizes of the synthetic batch fed to ToyModel.
batch_size = 16
in_features = 160
out_features = 32

# dtype and device used when creating the model parameters and the input batch.
device = torch.device("cuda:0")
dtype = torch.float32

In [4]:
# Model under inspection, moved to the configured device/dtype and put in training mode.
model = ToyModel(in_features, out_features).to(device = device, dtype = dtype)
model.train()

# Objects shared by the training-step cells below.
loss_fn = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters())
scaler = GradScaler()

In [5]:
# List each parameter's dtype and shape, with the name padded so the columns line up.
for param_name, param in model.named_parameters():
    print(f"{param_name:<16}  {param.dtype} {param.shape}")

fc1.weight        torch.float32 torch.Size([10, 160])
ln.weight         torch.float32 torch.Size([10])
ln.bias           torch.float32 torch.Size([10])
fc2.weight        torch.float32 torch.Size([32, 10])


### float16

In [7]:
# One training step under float16 autocast, printing the dtype of parameters,
# activations (via ToyModel.forward), logits, loss and gradients.
x = torch.rand(batch_size, in_features, dtype = dtype, device = device)
y = torch.empty(batch_size, dtype=torch.long, device = device).random_(out_features)
for i in range(1):
    # Start from clean gradients so re-running this cell does not accumulate into
    # gradients left over from an earlier run.
    optimizer.zero_grad()
    with autocast('cuda', enabled = True, dtype=torch.float16):
        print(" fc1 weights dtype:", model.fc1.weight.dtype)
        print(" fc2 weights dtype:", model.fc2.weight.dtype)
        print("  ln weights dtype:", model.ln.weight.dtype)
        print("  ln    bias dtype:", model.ln.bias.dtype, "\n")
        logits = model(x)
        print("\n      logits dtype:", logits.dtype)

        # loss is computed inside the autocast region as well
        loss = loss_fn(logits, y)
        print("\n        loss dtype:", loss.dtype)

    # Backward runs outside the autocast context, on the scaled loss.
    scaler.scale(loss).backward()
    # dtype of the gradients stored on the parameters
    print(" fc1   grads dtype:", model.fc1.weight.grad.dtype)
    print(" fc2   grads dtype:", model.fc2.weight.grad.dtype)
    print("  ln  wgrads dtype:", model.ln.weight.grad.dtype)
    print("  ln  bgrads dtype:", model.ln.bias.grad.dtype,"\n")

    # Optimizer step through the scaler, then update the scale factor.
    scaler.step(optimizer)
    scaler.update()

 fc1 weights dtype: torch.float32
 fc2 weights dtype: torch.float32
  ln weights dtype: torch.float32
  ln    bias dtype: torch.float32 

       input dtype: torch.float32
 fc1 activat dtype: torch.float16
relu activat dtype: torch.float16
  ln activat dtype: torch.float32
 fc2 activat dtype: torch.float16

      logits dtype: torch.float16

        loss dtype: torch.float32
 fc1   grads dtype: torch.float32
 fc2   grads dtype: torch.float32
  ln  wgrads dtype: torch.float32
  ln  bgrads dtype: torch.float32 



### bfloat16

In [9]:
# One training step under bfloat16 autocast, printing the dtype of parameters,
# activations (via ToyModel.forward), logits, loss and gradients.
x = torch.rand(batch_size, in_features, dtype = dtype, device = device)
y = torch.empty(batch_size, dtype=torch.long, device = device).random_(out_features)

# (label, parameter, extra print arguments) for the parameter-dtype report.
weight_rows = (
    (" fc1 weights dtype:", model.fc1.weight, ()),
    (" fc2 weights dtype:", model.fc2.weight, ()),
    ("  ln weights dtype:", model.ln.weight, ()),
    ("  ln    bias dtype:", model.ln.bias, ("\n",)),
)
# Same layout for the gradient-dtype report printed after backward.
grad_rows = (
    (" fc1   grads dtype:", model.fc1.weight, ()),
    (" fc2   grads dtype:", model.fc2.weight, ()),
    ("  ln  wgrads dtype:", model.ln.weight, ()),
    ("  ln  bgrads dtype:", model.ln.bias, ("\n",)),
)

for i in range(1):
    optimizer.zero_grad()
    with autocast('cuda', enabled = True, dtype=torch.bfloat16):
        for label, param, extra in weight_rows:
            print(label, param.dtype, *extra)
        logits = model(x)
        print("\n      logits dtype:", logits.dtype)

        # loss is computed inside the autocast region as well
        loss = loss_fn(logits, y)
        print("\n        loss dtype:", loss.dtype)

    # Backward runs outside the autocast context, on the scaled loss.
    scaler.scale(loss).backward()
    for label, param, extra in grad_rows:
        print(label, param.grad.dtype, *extra)

    # Optimizer step through the scaler, then update the scale factor.
    scaler.step(optimizer)
    scaler.update()

 fc1 weights dtype: torch.float32
 fc2 weights dtype: torch.float32
  ln weights dtype: torch.float32
  ln    bias dtype: torch.float32 

       input dtype: torch.float32
 fc1 activat dtype: torch.bfloat16
relu activat dtype: torch.bfloat16
  ln activat dtype: torch.float32
 fc2 activat dtype: torch.bfloat16

      logits dtype: torch.bfloat16

        loss dtype: torch.float32
 fc1   grads dtype: torch.float32
 fc2   grads dtype: torch.float32
  ln  wgrads dtype: torch.float32
  ln  bgrads dtype: torch.float32 

