# 🎯 Model Performance Benchmarking

**The ROI of Quantization** - Run and analyze GuideLLM benchmarks to measure vLLM inference performance.

## What This Notebook Does

1. **Run Benchmarks** - Execute GuideLLM tests against deployed models
2. **Analyze Results** - Parse benchmark JSON files from PVC
3. **Visualize Metrics** - Plot TTFT, TPOT, throughput curves
4. **Compare Models** - INT4 vs BF16 efficiency analysis

---

## Volume Mounts

| Path | Source | Contents |
|------|--------|----------|
| `/results` | `guidellm-results` PVC | CronJob benchmark results |
| `/pipeline-results` | `guidellm-pipeline-results` PVC | Tekton Pipeline results |
| `/opt/app-root/src` | Workbench storage | Your notebooks and analysis |


## 1. Setup Environment


In [None]:
# Install the third-party packages this notebook relies on into the current
# kernel environment (`-q` keeps pip's output quiet).
# NOTE(review): plotly and tabulate are installed here but are not imported by
# any cell visible in this notebook — confirm they are still needed.
!pip install -q pandas matplotlib plotly requests tabulate


In [None]:
import os
import json
import glob
import subprocess
from datetime import datetime
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt

# Directories where benchmark result files are read from.
CRONJOB_RESULTS, PIPELINE_RESULTS = Path("/results"), Path("/pipeline-results")

# Base URL of each model's predictor service, keyed by model name.
MODELS = {}
MODELS["mistral-3-int4"] = "http://mistral-3-int4-predictor.private-ai.svc.cluster.local:8080/v1"
MODELS["mistral-3-bf16"] = "http://mistral-3-bf16-predictor.private-ai.svc.cluster.local:8080/v1"

# Confirm the setup and echo the two result locations.
print(
    "‚úÖ Environment ready",
    f"\nüìÅ CronJob results: {CRONJOB_RESULTS}",
    f"üìÅ Pipeline results: {PIPELINE_RESULTS}",
    sep="\n",
)


## 2. Check Available Models


In [None]:
# List the InferenceServices in the `private-ai` namespace as a three-column
# table: name, a readiness status, and URL.
# NOTE(review): READY is taken from `.status.conditions[0]`, i.e. whichever
# condition is first in the list — confirm that this is the `Ready` condition.
!oc get inferenceservice -n private-ai -o custom-columns=NAME:.metadata.name,READY:.status.conditions[0].status,URL:.status.url


In [None]:
# Quick health check
import requests

def check_model_health(name, url):
    """Probe ``{url}/models`` and describe the outcome as a one-line message.

    Args:
        name: Label used for the model in the returned message.
        url: Base URL of the endpoint; ``/models`` is appended to it.

    Returns:
        A status string: the number of entries under ``"data"`` on HTTP 200,
        the HTTP status code for any other response, or the first 50
        characters of the error text if anything raised.
    """
    try:
        response = requests.get(f"{url}/models", timeout=5)
        # Guard clause: anything other than 200 is reported by status code.
        if response.status_code != 200:
            return f"‚ö†Ô∏è {name}: HTTP {response.status_code}"
        loaded = response.json().get("data", [])
        return f"‚úÖ {name}: {len(loaded)} model(s) loaded"
    except Exception as error:
        # Best-effort probe: every failure is folded into the returned text
        # rather than propagated to the caller.
        return f"‚ùå {name}: {str(error)[:50]}"

# Print a banner, then one status line per configured model endpoint.
banner = "\nüîç Model Health Check\n" + "=" * 50
print(banner)
for model_name, endpoint in MODELS.items():
    status_line = check_model_health(model_name, endpoint)
    print(status_line)


## 3. Run Benchmarks

In [None]:
# Benchmark parameters, collected in one place and echoed below.
BENCHMARK_CONFIG = {
    # Model to benchmark
    "model": "mistral-3-int4",
    # constant, poisson, sweep
    "profile": "constant",
    # concurrent, constant
    "rate_type": "concurrent",
    # Concurrency levels to test
    "rates": [1, 3, 5, 8, 10],
    # Duration per rate level
    "max_seconds": 30,
    # Max requests per rate level
    "max_requests": 50,
    # Synthetic input tokens
    "prompt_tokens": 256,
    # Synthetic output tokens
    "output_tokens": 256,
}

print("üìã Benchmark Configuration:")
for option, setting in BENCHMARK_CONFIG.items():
    print(f"   {option}: {setting}")


In [None]:
# Trigger the daily CronJob manually (simplest approach): create a one-off Job
# from `cronjob/guidellm-daily` in the `private-ai` namespace. The Job name is
# suffixed with the current time (HHMMSS) so repeated runs get distinct names.
# NOTE(review): the command does not reference BENCHMARK_CONFIG, so the Job
# presumably runs with the CronJob's own settings — confirm.
!oc create job --from=cronjob/guidellm-daily benchmark-notebook-$(date +%H%M%S) -n private-ai

# Remind the user how to follow the Job's logs; the "..." stands in for the
# time suffix generated above.
print("\nüìä Monitor with: oc logs -f job/benchmark-notebook-... -n private-ai")


## 4. Analyze Benchmark Results


In [None]:
# List available result files
def list_results(results_dir: Path, limit: int = 20) -> list:
    """List the most recently modified JSON result files under a directory.

    The directory is searched recursively for ``*.json`` entries. Only regular
    files are kept, so a sub-directory whose name happens to end in ``.json``
    is not reported as a result.

    Args:
        results_dir: Root directory to search.
        limit: Maximum number of files to return (default 20, the value that
            was previously hard-coded).

    Returns:
        Up to ``limit`` ``Path`` objects ordered newest first by modification
        time; an empty list when ``results_dir`` is not an existing directory.
    """
    if not results_dir.is_dir():
        return []
    json_files = [p for p in results_dir.glob("**/*.json") if p.is_file()]
    json_files.sort(key=os.path.getmtime, reverse=True)
    return json_files[:limit]

# For each results location, print a heading followed by up to five of the
# newest files with their size in KB and last-modified time.
for heading, directory in (
    ("üìÅ CronJob Results (most recent):", CRONJOB_RESULTS),
    ("\nüìÅ Pipeline Results (most recent):", PIPELINE_RESULTS),
):
    print(heading)
    for path in list_results(directory)[:5]:
        size_kb = path.stat().st_size / 1024
        modified = datetime.fromtimestamp(path.stat().st_mtime).strftime("%Y-%m-%d %H:%M")
        print(f"   {path.name:<40} {size_kb:>8.1f} KB  {modified}")


In [None]:
def _percentile_ms(metrics: dict, metric: str, percentile: str):
    """Return ``metrics[metric][percentile]`` multiplied by 1000, or 0 if absent or null."""
    value = (metrics.get(metric) or {}).get(percentile) or 0
    return value * 1000


def parse_guidellm_results(filepath: Path) -> dict:
    """Parse a GuideLLM benchmark results JSON file into flat per-rate rows.

    Missing or JSON-``null`` entries (the ``benchmarks`` list, a benchmark's
    ``metrics``, a metric group, or a percentile/mean value) are treated as
    absent and reported as 0 instead of raising.

    Args:
        filepath: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        A dict with ``model`` and ``target`` (``"unknown"`` when missing) and
        ``benchmarks``, a list with one dict per benchmark holding ``rate``,
        ``rate_type``, ``completed_requests``, ``throughput_tok_s`` and the
        ``ttft_*_ms`` / ``tpot_*_ms`` p50/p95 values.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON interchange text is UTF-8; do not depend on the locale's default.
    with open(filepath, encoding="utf-8") as f:
        data = json.load(f)

    results = []
    for bench in data.get("benchmarks") or []:
        metrics = bench.get("metrics") or {}
        throughput = (metrics.get("output_token_throughput") or {}).get("mean") or 0
        results.append({
            "rate": bench.get("rate", 0),
            "rate_type": bench.get("rate_type", "unknown"),
            "completed_requests": metrics.get("request_count", 0),
            "throughput_tok_s": throughput,
            # The x1000 scaling assumes the file reports latencies in seconds
            # — TODO confirm against the GuideLLM output schema.
            "ttft_p50_ms": _percentile_ms(metrics, "ttft", "p50"),
            "ttft_p95_ms": _percentile_ms(metrics, "ttft", "p95"),
            # The TPOT columns are filled from the file's "itl" metric group.
            "tpot_p50_ms": _percentile_ms(metrics, "itl", "p50"),
            "tpot_p95_ms": _percentile_ms(metrics, "itl", "p95"),
        })

    return {
        "model": data.get("model", "unknown"),
        "target": data.get("target", "unknown"),
        "benchmarks": results,
    }

# Load the newest CronJob result (if any) and print a summary plus a table.
recent_files = list_results(CRONJOB_RESULTS)
if not recent_files:
    print("No results found. Run a benchmark first!")
else:
    latest_file = recent_files[0]
    result = parse_guidellm_results(latest_file)
    print(f"\nüìä Latest Result: {latest_file.name}")
    print(f"   Model: {result['model']}")
    print(f"   Benchmarks: {len(result['benchmarks'])} rate levels")

    # One row per benchmark, rendered without the index column.
    df = pd.DataFrame(result["benchmarks"])
    print("\n" + df.to_string(index=False))


## 5. Visualize Performance Metrics


In [None]:
def plot_latency_vs_concurrency(results: list, title: str = "Latency vs Concurrency"):
    """Draw side-by-side TTFT and TPOT charts with P50/P95 lines per rate.

    Args:
        results: Benchmark rows providing ``rate``, ``ttft_p50_ms``,
            ``ttft_p95_ms``, ``tpot_p50_ms`` and ``tpot_p95_ms``.
        title: Title placed above both panels.
    """
    frame = pd.DataFrame(results)
    _, (ttft_axis, tpot_axis) = plt.subplots(1, 2, figsize=(14, 5))

    # Each panel: axis, P50 column, P95 column, y-label, panel title, and the
    # dashed horizontal reference lines as (y value, colour, legend label).
    panels = (
        (
            ttft_axis, "ttft_p50_ms", "ttft_p95_ms", "TTFT (ms)", "Time to First Token",
            ((1000, "orange", "SLA (1s)"), (2000, "red", "Breaking (2s)")),
        ),
        (
            tpot_axis, "tpot_p50_ms", "tpot_p95_ms", "TPOT (ms)", "Time per Output Token",
            ((100, "orange", "SLA (100ms)"),),
        ),
    )
    for axis, p50_column, p95_column, y_label, panel_title, thresholds in panels:
        axis.plot(frame["rate"], frame[p50_column], "b-o", label="P50")
        axis.plot(frame["rate"], frame[p95_column], "r-s", label="P95")
        for level, colour, caption in thresholds:
            axis.axhline(y=level, color=colour, linestyle="--", label=caption)
        axis.set_xlabel("Concurrent Users")
        axis.set_ylabel(y_label)
        axis.set_title(panel_title)
        axis.legend()
        axis.grid(True, alpha=0.3)

    plt.suptitle(title, fontsize=14, fontweight="bold")
    plt.tight_layout()
    plt.show()

# Chart latency for the newest CronJob result, provided it holds benchmarks.
if recent_files:
    result = parse_guidellm_results(recent_files[0])
    latency_rows = result["benchmarks"]
    if latency_rows:
        plot_latency_vs_concurrency(latency_rows, title=f"Model: {result['model']}")


In [None]:
def plot_throughput_curve(results: list, title: str = "Throughput vs Concurrency"):
    """Draw a filled throughput curve per rate and label its highest point.

    Args:
        results: Benchmark rows providing ``rate`` and ``throughput_tok_s``.
        title: Title of the chart.
    """
    frame = pd.DataFrame(results)

    _, axis = plt.subplots(figsize=(10, 5))

    rates = frame["rate"]
    throughput = frame["throughput_tok_s"]
    axis.plot(rates, throughput, "g-o", linewidth=2, markersize=8)
    axis.fill_between(rates, throughput, alpha=0.3, color="green")

    axis.set_xlabel("Concurrent Users", fontsize=12)
    axis.set_ylabel("Throughput (tokens/sec)", fontsize=12)
    axis.set_title(title, fontsize=14, fontweight="bold")
    axis.grid(True, alpha=0.3)

    # Locate the row with the highest throughput and point an arrow at it;
    # the label sits slightly right of and below the peak.
    peak_row = throughput.idxmax()
    peak_rate = rates.loc[peak_row]
    peak_value = throughput.loc[peak_row]
    axis.annotate(
        f"Peak: {peak_value:.0f} tok/s @ {peak_rate} users",
        xy=(peak_rate, peak_value),
        xytext=(peak_rate + 2, peak_value * 0.9),
        fontsize=10,
        arrowprops=dict(arrowstyle="->", color="black"),
    )

    plt.tight_layout()
    plt.show()

# Chart throughput for the newest CronJob result, provided it holds benchmarks.
if recent_files:
    result = parse_guidellm_results(recent_files[0])
    throughput_rows = result["benchmarks"]
    if throughput_rows:
        plot_throughput_curve(throughput_rows, title=f"Throughput: {result['model']}")


## 6. ROI Analysis: INT4 vs BF16

Calculate the economics of quantization.


In [None]:
# Hardware costs (AWS on-demand pricing), keyed by precision.
# Each entry records the GPU count, the instance type and the hourly cost.
COSTS = {
    "int4": {
        "gpus": 1,
        "instance": "g6.4xlarge",
        "cost_hr": 0.85,
    },
    "bf16": {
        "gpus": 4,
        "instance": "g6.12xlarge",
        "cost_hr": 3.40,
    },
}

# Find INT4 and BF16 results
def find_model_results(model_name: str, results_dir: Path) -> "Path | None":
    """Return the newest JSON result file whose name contains ``model_name``.

    The search is recursive, matching how ``list_results`` walks the results
    directory, so files stored in sub-directories are found as well.

    Args:
        model_name: Substring to look for in the file name (e.g. ``"int4"``).
        results_dir: Root directory to search.

    Returns:
        The matching file with the latest modification time, or ``None`` when
        the directory does not exist or nothing matches.
    """
    if not results_dir.is_dir():
        return None
    candidates = (
        p for p in results_dir.rglob(f"*{model_name}*.json") if p.is_file()
    )
    # Only the newest file is needed, so take the max instead of sorting.
    return max(candidates, key=os.path.getmtime, default=None)

# Look up the newest CronJob result for each precision and report what was found.
int4_file, bf16_file = (
    find_model_results(precision, CRONJOB_RESULTS) for precision in ("int4", "bf16")
)

int4_label = int4_file.name if int4_file else 'Not found'
bf16_label = bf16_file.name if bf16_file else 'Not found'

print("üîç Found result files:")
print(f"   INT4: {int4_label}")
print(f"   BF16: {bf16_label}")


In [None]:
def calculate_roi(int4_results: dict, bf16_results: dict):
    """Print a cost/throughput comparison of the INT4 and BF16 deployments.

    Peak throughput is the maximum ``throughput_tok_s`` across each result's
    benchmarks; efficiency is that peak divided by the hourly cost in
    ``COSTS``.

    Args:
        int4_results: Parsed result whose ``"benchmarks"`` list holds rows
            carrying ``throughput_tok_s``.
        bf16_results: Same structure for the BF16 deployment.

    Returns:
        None. The report is written to stdout. If either result has no
        benchmark rows, a notice is printed and nothing else is computed.
    """
    int4_df = pd.DataFrame(int4_results["benchmarks"])
    bf16_df = pd.DataFrame(bf16_results["benchmarks"])

    # An empty benchmark list produces a frame without columns, so the
    # throughput lookup below would raise KeyError; stop with a message.
    if int4_df.empty or bf16_df.empty:
        print("ROI analysis skipped: one of the result files contains no benchmarks.")
        return

    int4_peak = int4_df["throughput_tok_s"].max()
    bf16_peak = bf16_df["throughput_tok_s"].max()

    int4_efficiency = int4_peak / COSTS["int4"]["cost_hr"]
    bf16_efficiency = bf16_peak / COSTS["bf16"]["cost_hr"]

    print("\nüí∞ ROI Analysis: The Economics of Precision\n" + "="*55)
    print(f"\n{'Metric':<30} {'INT4 (1-GPU)':<15} {'BF16 (4-GPU)':<15}")
    print("-" * 60)
    print(f"{'Hardware Cost':<30} ${COSTS['int4']['cost_hr']:.2f}/hr{'':<9} ${COSTS['bf16']['cost_hr']:.2f}/hr")
    print(f"{'Peak Throughput':<30} {int4_peak:.0f} tok/s{'':<7} {bf16_peak:.0f} tok/s")
    print(f"{'Efficiency (tok/s per $)':<30} {int4_efficiency:.0f}{'':<14} {bf16_efficiency:.0f}")
    print(f"{'Cost Ratio':<30} 1x{'':<15} {COSTS['bf16']['cost_hr']/COSTS['int4']['cost_hr']:.1f}x")

    # 4x INT4 vs 1x BF16 comparison. The factor 4 is a fixed constant, not
    # derived from COSTS; it equals the BF16/INT4 hourly cost ratio only for
    # the currently configured prices (3.40 / 0.85).
    four_int4_throughput = int4_peak * 4

    print("\n" + "="*55)
    print("\nüéØ Key Insight: 4x INT4 vs 1x BF16 (Same Cost)")
    print(f"   4x INT4 Throughput: {four_int4_throughput:.0f} tok/s")
    print(f"   1x BF16 Throughput: {bf16_peak:.0f} tok/s")
    if four_int4_throughput <= 0 or bf16_peak <= 0:
        # A zero peak would make the percentage below nan or inf.
        print("   Note: relative advantage not computed (zero throughput recorded)")
    elif four_int4_throughput > bf16_peak:
        print(f"   Advantage: INT4 delivers {(four_int4_throughput/bf16_peak - 1)*100:.0f}% more throughput!")
    else:
        print(f"   Note: BF16 delivers {(bf16_peak/four_int4_throughput - 1)*100:.0f}% more throughput")

# The ROI comparison needs a result file for both precisions.
if not (int4_file and bf16_file):
    print("‚ö†Ô∏è Need both INT4 and BF16 results for ROI analysis.")
else:
    int4_results = parse_guidellm_results(int4_file)
    bf16_results = parse_guidellm_results(bf16_file)
    calculate_roi(int4_results, bf16_results)


---

## 📚 References

- [GuideLLM Documentation](https://github.com/neuralmagic/guidellm)
- [vLLM Production Metrics](https://docs.vllm.ai/en/latest/serving/metrics.html)
- [Red Hat AI Benchmarking Guide](https://developers.redhat.com/articles/2025/06/20/guidellm-evaluate-llm-deployments-real-world-inference)
