# In [None]:
# CPU LLM Benchmarking: latency vs tokens vs quantization
# Author: Sheng + GPT-5

import time
import pandas as pd
from llama_cpp import Llama

# === Config ===
# Quantization label -> path of the GGUF model file benchmarked under that label.
model_paths = dict(
    Q4="/models/llama-2-7b.Q4_K_M.gguf",
    Q5="/models/llama-2-7b.Q5_K_M.gguf",
    Q8="/models/llama-2-7b.Q8_0.gguf",
)

# Prompts sent to every model, one completion each.
prompts = list((
    "What is artificial intelligence?",
    "Explain quantum computing in simple terms.",
    "Summarize the benefits and risks of AI in healthcare in 100 words.",
))

# n_threads: adjust for your CPU cores; n_tokens: tokens to generate.
n_threads, n_tokens = 8, 64

# Collected measurements, one dict per (quantization, prompt) run.
results = list()

# === Benchmark loop ===
# For each quantization level: load the model once, run one completion per
# prompt, and append a row of measurements to `results`.
for quant, path in model_paths.items():
    print(f"\nLoading model ({quant})...")
    # Drop the reference to the previous model before constructing the next
    # one, so two sets of weights are not held in memory at the same time.
    llm = None
    llm = Llama(model_path=path, n_threads=n_threads, n_ctx=2048, verbose=False)

    for prompt in prompts:
        # Prompt length is reported in whitespace-separated words (the chart
        # axis is labelled in words), computed once and reused below.
        prompt_words = len(prompt.split())

        # perf_counter() is monotonic and high-resolution; time.time() can
        # jump if the system clock is adjusted mid-run.
        t0 = time.perf_counter()
        output = llm(prompt, max_tokens=n_tokens, echo=False)
        t1 = time.perf_counter()

        total_time = t1 - t0
        # Prefer the backend's own generation timing when the response
        # carries one; otherwise fall back to wall-clock time, which also
        # includes prompt evaluation.
        # NOTE(review): the completion dict documented by llama-cpp-python
        # does not appear to include "timings", so the fallback is likely
        # the path normally taken -- confirm for your version.
        gen_time = output["timings"]["predicted_ms"] / 1000 if "timings" in output else total_time

        # Count actual generated tokens. Splitting the text on whitespace
        # counts words, not tokens, and skews the s/token figure.
        usage = output.get("usage") or {}
        tokens_generated = usage.get("completion_tokens")
        if tokens_generated is None:
            # No usage block in the response: tokenize the generated text
            # with the model's own tokenizer instead.
            generated_text = output["choices"][0]["text"]
            tokens_generated = len(
                llm.tokenize(generated_text.encode("utf-8"), add_bos=False)
            )
        # max(1, ...) guards against division by zero on an empty completion.
        latency = gen_time / max(1, tokens_generated)

        results.append({
            "quant": quant,
            "prompt_len": prompt_words,
            "tokens_generated": tokens_generated,
            "total_time_s": total_time,
            "latency_s/token": latency
        })
        print(f"{quant} | {prompt_words}w | {tokens_generated}t | {latency:.3f}s/token")

# === Report ===
# Tabulate the collected measurements (one row per run) and print them
# under a summary banner.
df = pd.DataFrame(data=results)
print("\n=== Benchmark Summary ===", df, sep="\n")

# Optional: visualize
# Draws one line per quantization level: per-token latency against prompt
# length in words. If matplotlib is not installed, an install hint is printed
# instead.
try:
    import matplotlib.pyplot as plt
    plt.figure(figsize=(8,5))
    for quant in df['quant'].unique():
        # Sort each series by its x value so the line runs left to right
        # even when the prompts are not listed in order of increasing length.
        subset = df[df['quant']==quant].sort_values('prompt_len')
        plt.plot(subset['prompt_len'], subset['latency_s/token'], marker='o', label=quant)
    plt.xlabel("Prompt length (words)")
    plt.ylabel("Latency (s/token)")
    plt.title("CPU LLM Benchmark: Latency vs Prompt vs Quant Level")
    plt.legend()
    plt.grid(True)
    plt.show()
except ImportError:
    print("Install matplotlib for chart visualization (pip install matplotlib).")
