# SYSTEMDS-BENCH-GPT: vLLM Benchmarking

This notebook runs all vLLM benchmarks on Google Colab's GPU.

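**Steps (the numbered sections below go into more detail):**
1. Check GPU and install dependencies (Steps 1–2)
2. Clone/update repository (Step 3)
3. Start the vLLM server and verify it responds (Steps 4–5)
4. Run all 4 workloads and view the results (Steps 6–7)
5. Download results, then optionally stop the server (Steps 8–9)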

**Requirements:** Enable GPU runtime (Runtime → Change runtime type → T4 GPU)

## Step 1: Check GPU

In [None]:
# Print the NVIDIA driver/GPU status to confirm that a GPU runtime is attached.
!nvidia-smi


## Step 2: Install Dependencies

In [None]:
!pip install vllm torch transformers accelerate -q
!pip install pyyaml numpy tqdm datasets requests psutil rouge-score -q
print("\n✓ Dependencies installed")

## Step 3: Clone Repository

In [None]:
import os

if os.path.exists('/content/systemds-bench-gpt'):
    print("Repository exists, pulling latest...")
    %cd /content/systemds-bench-gpt
    !git pull origin main
else:
    print("Cloning repository...")
    !git clone https://github.com/kubraaksux/systemds-bench-gpt.git
    %cd /content/systemds-bench-gpt

print("\n✓ Repository ready")
!pwd

## Step 4: Start vLLM Server

In [None]:
# Start vLLM server
import subprocess
import time
import requests

# ========== MODEL SELECTION ==========

# Option 1: phi-2 (2.7B) - Fast, good for testing
MODEL = "microsoft/phi-2"

# Option 2: Llama-2-7B - Better accuracy, fits in T4 (requires HF login)
# MODEL = "meta-llama/Llama-2-7b-chat-hf"

# Option 3: TinyLlama (1.1B) - Fastest, lowest accuracy
# MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# =====================================

# kill any existing server
!pkill -f "vllm.entrypoints" 2>/dev/null || True
time.sleep(2)

print(f"Starting vLLM server with model: {MODEL}")
print("This takes 4-6 minutes (download + load + compile CUDA graphs)...")
print()

# start server in background
!nohup python -m vllm.entrypoints.openai.api_server \
    --model {MODEL} \
    --host 0.0.0.0 \
    --port 8000 \
    --dtype float16 > vllm_server.log 2>&1 &

# wait for server to start
print("Waiting for model to load...")
for i in range(72):  
    time.sleep(5)
    elapsed = (i+1)*5
    mins = elapsed // 60
    secs = elapsed % 60
    print(f"  {mins}m {secs}s...", end="")
    try:
        resp = requests.get("http://localhost:8000/v1/models", timeout=5)
        if resp.status_code == 200:
            print("\n\n" + "="*50)
            print("✓ vLLM SERVER IS READY!")
            print("="*50)
            print(resp.json())
            break
    except:
        print(" loading...")
else:
    print("\n\nServer still loading. Check if process is running:")
    !ps aux | grep -E "vllm|python" | grep -v grep | head -5
    print("\nLatest logs:")
    !tail -30 vllm_server.log

## Step 5: Verify Server

In [None]:
# quick test to verify server works
import requests

try:
    resp = requests.get("http://localhost:8000/v1/models", timeout=10)
    print("✓ Server is running!")
    print(f"  Models: {resp.json()}")
except Exception as e:
    print(f"✗ Server not ready: {e}")
    print("\nRun the previous cell again or check logs:")
    !tail -30 vllm_server.log

## Step 6: Run ALL Benchmarks

In [None]:
# run all 4 workloads
import os
os.chdir('/content/systemds-bench-gpt')

workloads = [
    ("math", "results/vllm_math"),
    ("reasoning", "results/vllm_reasoning"),
    ("summarization", "results/vllm_summarization"),
    ("json_extraction", "results/vllm_json"),
]

for workload, output in workloads:
    print("\n" + "="*60)
    print(f"Running: {workload}")
    print("="*60)
    !python runner.py \
        --backend vllm \
        --model {MODEL} \
        --workload workloads/{workload}/config.yaml \
        --out {output}

print("\n" + "="*60)
print("ALL BENCHMARKS COMPLETE!")
print("="*60)

## Step 7: View Results

In [None]:
# Print a per-workload summary read from results/vllm_*/metrics.json.
import json
import os

# Label the report with the model that was actually served instead of a
# hard-coded name; fall back gracefully if MODEL is not defined in this kernel.
model_name = globals().get("MODEL", "unknown model")

print("="*60)
print(f"vLLM BENCHMARK RESULTS ({model_name})")
print("="*60)

results_dir = "/content/systemds-bench-gpt/results"
# A missing results directory means no benchmark has written output yet.
run_dirs = sorted(os.listdir(results_dir)) if os.path.isdir(results_dir) else []

shown = 0
for run_dir in run_dirs:
    if not run_dir.startswith("vllm_"):
        continue
    metrics_path = f"{results_dir}/{run_dir}/metrics.json"
    if not os.path.exists(metrics_path):
        continue
    with open(metrics_path) as f:
        metrics = json.load(f)

    workload = run_dir.replace("vllm_", "")
    # `or 0` also covers metrics stored as JSON null, not just missing keys.
    acc = (metrics.get('accuracy_mean') or 0) * 100
    acc_count = metrics.get('accuracy_count', 'N/A')
    lat = metrics.get('latency_ms_p50') or 0
    thr = metrics.get('throughput_req_per_s') or 0

    print(f"\n{workload.upper()}:")
    print(f"  Accuracy:   {acc:.0f}% ({acc_count})")
    print(f"  Latency:    {lat:.0f}ms (p50)")
    print(f"  Throughput: {thr:.3f} req/s")
    shown += 1

if shown == 0:
    print(f"\nNo vLLM metrics found under {results_dir}. Run the benchmark cell first.")

## Step 8: Download Results

In [None]:
# zip and download all vLLM results
import os
os.chdir('/content/systemds-bench-gpt')

!zip -r vllm_results_final.zip results/vllm_*

from google.colab import files
files.download('vllm_results_final.zip')

print("\n" + "="*60)
print("DOWNLOAD COMPLETE!")
print("="*60)
print("\nNext steps in your local IDE:")
print("1. unzip ~/Downloads/vllm_results_final.zip -d results/")
print("2. python scripts/report.py --out benchmark_report.html")
print("3. open benchmark_report.html")

## Step 9: Cleanup (Optional)

In [None]:
# stop the vLLM server to free GPU memory
!pkill -f "vllm.entrypoints" || True
print("✓ vLLM server stopped")