# Benchmarking LLM Performance with GuideLLM

This notebook demonstrates how to benchmark the serving performance of LLMs using vLLM and GuideLLM. We'll compare a full-precision model with its quantized version to analyze the trade-off between accuracy and inference speed. Note that the benchmark runs in this notebook measure inference speed only; accuracy has to be evaluated separately.



In [None]:
!pip install vllm guidellm openai

In [None]:
# Full-precision baseline model.
# NOTE(review): looks like a Hugging Face Hub repo id ("org/name") — confirm.
full_precision_model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Quantized variant; handed to start_vllm_server() as the model to serve.
# NOTE(review): no org prefix, so presumably a local directory produced by a
# separate quantization step — verify it exists before running the benchmark.
quantized_model_name = "TinyLlama-1.1B-Chat-v1.0-INT8"


In [None]:
import subprocess
import time
import signal
import os

from openai import OpenAI


def start_vllm_server(model_path, log_file="vllm_server.log", timeout=None, poll_interval=5):
    """Launch `vllm serve` in the background and block until it reports readiness.

    The server's stdout/stderr are redirected to `log_file`. The log is polled
    until it contains "Application startup complete"; newly written log output
    is echoed to the notebook as it appears.

    Args:
        model_path: Model argument passed to `vllm serve`.
        log_file: Path of the file that receives the server output (overwritten).
        timeout: Maximum number of seconds to wait for startup, or None to wait
            for as long as the server process is alive.
        poll_interval: Seconds to sleep between two looks at the log file.

    Returns:
        The PID of the server process (pass it to stop_vllm_server()).

    Raises:
        RuntimeError: The server process exited before finishing startup.
        TimeoutError: Startup did not complete within `timeout` seconds; the
            server process is terminated before raising.
    """
    # The child inherits the log file descriptor, so our handle can be closed
    # as soon as the process has been spawned.
    with open(log_file, "w") as f:
        process = subprocess.Popen(
            ["vllm", "serve", model_path],
            stdout=f,
            stderr=subprocess.STDOUT,
            text=True
        )

    # Store the process ID for later use
    pid = process.pid
    print(f"Starting vllm server with PID: {pid}")

    deadline = None if timeout is None else time.monotonic() + timeout
    printed = 0  # number of log characters already echoed
    while True:
        # errors="replace": the log may end in a partially written multi-byte character.
        with open(log_file, "r", errors="replace") as f:
            content = f.read()

        # Echo only what is new since the previous poll instead of the whole log.
        if len(content) > printed:
            print(content[printed:], end="")
            printed = len(content)

        if "Application startup complete" in content:
            print("Server startup complete!")
            return pid

        # Without this check a crashed server would make the loop spin forever.
        exit_code = process.poll()
        if exit_code is not None:
            raise RuntimeError(
                f"vllm server (PID: {pid}) exited with code {exit_code} "
                f"before startup completed; see {log_file}"
            )

        if deadline is not None and time.monotonic() >= deadline:
            process.terminate()
            raise TimeoutError(
                f"vllm server (PID: {pid}) did not start within {timeout} seconds; "
                f"see {log_file}"
            )

        time.sleep(poll_interval)
    

def _pid_has_exited(pid):
    """Return True once the process `pid` no longer exists (reaping it if it is our child)."""
    try:
        reaped_pid, _status = os.waitpid(pid, os.WNOHANG)
        return reaped_pid == pid
    except ChildProcessError:
        # Not a child of this process, or already reaped elsewhere:
        # signal 0 only probes whether the PID still exists.
        try:
            os.kill(pid, 0)
        except ProcessLookupError:
            return True
        return False


def stop_vllm_server(pid, timeout=60):
    """Terminate the vllm server and wait until the process is really gone.

    Sends SIGTERM, then waits up to `timeout` seconds for the process to exit
    so that its port and memory are released before another server is started.
    If the process is still alive after that, SIGKILL is sent. POSIX only.

    Args:
        pid: PID returned by start_vllm_server().
        timeout: Seconds to wait for a graceful exit before sending SIGKILL.
    """
    try:
        os.kill(pid, signal.SIGTERM)
    except ProcessLookupError:
        # Nothing to stop (server crashed or was already stopped).
        print(f"vllm server (PID: {pid}) was not running")
        return

    deadline = time.monotonic() + timeout
    while not _pid_has_exited(pid):
        if time.monotonic() >= deadline:
            print(f"vllm server (PID: {pid}) did not exit after {timeout}s; sending SIGKILL")
            try:
                os.kill(pid, signal.SIGKILL)
                os.waitpid(pid, 0)
            except (ProcessLookupError, ChildProcessError):
                # Exited in the meantime, or not our child to reap.
                pass
            break
        time.sleep(0.1)

    print(f"Stopped vllm server (PID: {pid})")


def test_served_model(
    infer_endpoint="http://localhost:8000",
    api_key="EMPTY",
    model_name=None,
    prompt="Hi. Who are you?"):  
    
    client = OpenAI(
        api_key=api_key,
        base_url=f"{infer_endpoint}/v1"
    )

    # Get model name of the first available model
    # Since we're using OpenShift AI serving with vLLM, there is only one model available
    if not model_name:
        model_list = client.models.list()
        model_name = model_list.data[0].id
        
    print(f"Using model: {model_name}")

    completion = client.chat.completions.create(
        model=model_name,
        messages=[
            # User message
            {
                "role": "user",
                "content": prompt
            }
        ],
        stream=True
    )

    for chunk in completion:
        if chunk.choices[0].delta.content is not None:
            print(chunk.choices[0].delta.content, end='', flush=True)
    print()


def run_benchmark(
    target_url="http://localhost:8000",
    log_file="guidellm.log",
    output_path="benchmarks.json",
    rate_type="sweep",
    max_seconds=30,
    data="prompt_tokens=256,output_tokens=128"):
    """Run `guidellm benchmark` against a server and wait for it to finish.

    The benchmark output is redirected to `log_file`, which is printed once the
    process has exited. A non-zero exit code is reported but not raised, so the
    caller can still shut the server down.

    Args:
        target_url: Value for `--target`.
        log_file: File that receives guidellm's stdout/stderr (overwritten).
        output_path: Value for `--output-path`.
        rate_type: Value for `--rate-type`.
        max_seconds: Value for `--max-seconds`.
        data: Value for `--data`.

    Returns:
        The PID the (now finished) guidellm process had.
    """
    command = [
        "guidellm", "benchmark",
        "--target", target_url,
        "--rate-type", rate_type,
        "--max-seconds", str(max_seconds),
        "--data", data,
        "--disable-progress",
        "--output-path", output_path,
    ]

    with open(log_file, "w") as f:
        process = subprocess.Popen(
            command,
            stdout=f,
            stderr=subprocess.STDOUT,
            text=True
        )

        # Store the process ID for later use
        pid = process.pid
        print(f"Starting guidellm benchmark with PID: {pid}")

        # Block until the benchmark has finished
        exit_code = process.wait()

    # Do not claim success when guidellm failed.
    if exit_code == 0:
        print("Benchmark complete!")
    else:
        print(f"Benchmark FAILED with exit code {exit_code}; see {log_file}")

    # Read and print the log file
    with open(log_file, "r", errors="replace") as f:
        print(f.read())

    return pid

In [None]:
# Benchmark the full-precision baseline model.
vllm_pid = start_vllm_server(full_precision_model_name)
print(f"vllm started on PID {vllm_pid}")

try:
    # Smoke-test the endpoint before spending time on the benchmark.
    test_served_model(prompt="Hi. Who are you?")

    # run_benchmark() blocks until guidellm has exited.
    guidellm_pid = run_benchmark(
        target_url="http://localhost:8000",
        log_file="full-precision-guidellm.log",
        output_path="full-precision-benchmarks.json",
    )
    print(f"guidellm benchmark finished (PID {guidellm_pid})")
finally:
    # Always shut the server down so it does not keep holding its port and
    # memory when the smoke test or the benchmark raises.
    stop_vllm_server(vllm_pid)
    print("vllm stopped")

In [None]:
# Benchmark the quantized model.
vllm_pid = start_vllm_server(quantized_model_name)
print(f"vllm started on PID {vllm_pid}")

try:
    # Smoke-test the endpoint before spending time on the benchmark.
    test_served_model(prompt="Hi. Who are you?")

    # run_benchmark() blocks until guidellm has exited.
    guidellm_pid = run_benchmark(
        target_url="http://localhost:8000",
        log_file="guidellm.log",
        output_path="quantized-benchmarks.json",
    )
    print(f"guidellm benchmark finished (PID {guidellm_pid})")
finally:
    # Always shut the server down so it does not keep holding its port and
    # memory when the smoke test or the benchmark raises.
    stop_vllm_server(vllm_pid)
    print("vllm stopped")