In [None]:
!nvidia-smi

Sun Oct 26 16:38:30 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 575.57.08              Driver Version: 575.57.08      CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A6000               On  |   00000000:01:00.0 Off |                  Off |
| 30%   35C    P8             23W /  300W |       1MiB /  49140MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
import os, time, json, torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [4]:
MODEL_ID = os.environ.get("MODEL_ID", "mistralai/Mistral-7B-Instruct-v0.3")
DTYPE = torch.bfloat16
DEVICE_MAP = "auto"

In [5]:
PROMPT = """You are a helpful assistant. Briefly explain attention and the KV cache in LLMs."""

torch.cuda.reset_peak_memory_stats()
tok = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=DTYPE,
    device_map=DEVICE_MAP,
    use_cache=True,                 # full KV-cache (baseline)
    attn_implementation="eager",    # standard attention
)

inputs = tok(PROMPT, return_tensors="pt").to(model.device)

# Simple latency measurement
torch.cuda.synchronize()
t0 = time.perf_counter()
with torch.inference_mode():
    out = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=False,
    )
torch.cuda.synchronize()
t1 = time.perf_counter()

text = tok.decode(out[0], skip_special_tokens=True)
latency_s = t1 - t0
toks_out = out.shape[-1] - inputs["input_ids"].shape[-1]
tps = toks_out / latency_s if latency_s > 0 else float("nan")
peak_mem = torch.cuda.max_memory_allocated() / (1024**3)

result = {
    "model": MODEL_ID,
    "latency_s": round(latency_s, 4),
    "tokens_generated": int(toks_out),
    "throughput_tok_per_s": round(tps, 2),
    "peak_gpu_mem_gb": round(peak_mem, 2),
}

print(json.dumps(result, indent=2))
logdir = os.environ.get("LOGDIR", f"{os.environ.get('SCRATCH','.')}/logs")
os.makedirs(logdir, exist_ok=True)
with open(os.path.join(logdir, "baseline_generate.json"), "w") as f:
    json.dump(result, f, indent=2)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


{
  "model": "mistralai/Mistral-7B-Instruct-v0.3",
  "latency_s": 9.9482,
  "tokens_generated": 256,
  "throughput_tok_per_s": 25.73,
  "peak_gpu_mem_gb": 13.55
}
