## Init

In [None]:
%load_ext autoreload
%autoreload 2

import os
import random
import warnings
from datetime import datetime

import plotly.io as pio
import urllib3
from prometheus_api_client import PrometheusConnect

from data_source.prometheus import *
from transform.sampling import *
from transform.tables import *
from plotting.load_signal_static import *
from plotting.violin_plots import *
from plotting.candlestick import *
from plotting.combine import *
from load.storage import *

# Fixed seed for Python's global RNG so repeated runs of the notebook are comparable.
random.seed(42)

# The port-forwarded endpoint is queried without certificate verification, so
# silence the resulting urllib3 warning; FutureWarnings are hidden as noise.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
warnings.simplefilter("ignore", FutureWarning)
pio.templates.default = "plotly_dark"

# Configuration: $ oc port-forward -n openshift-monitoring svc/thanos-querier 9091:9091
PROMETHEUS_URL = "https://localhost:9091"
# <<<< SET ME: export BEARER_TOKEN in the environment that launches Jupyter
# (e.g. `export BEARER_TOKEN=$(oc whoami -t)`) so the token is never saved in the
# notebook. Falls back to "" when the variable is not set.
BEARER_TOKEN = os.environ.get("BEARER_TOKEN", "")
MODEL_NAME = "meta-llama/Llama-3.1-8B"
NAMESPACE = "experiment-01"

# One (start, end, label) tuple per experiment run; the label names the scenario.
# NOTE(review): these datetimes are naive while OUTPUT_FOLDER carries "+0000" —
# confirm which timezone the query helpers assume.
TIME_RANGES = [
    (datetime(2025, 12, 17, 11, 48), datetime(2025, 12, 17, 12, 48), "1 Replica"),
    (datetime(2025, 12, 17, 10, 12), datetime(2025, 12, 17, 11, 12), "WVA"),
    (datetime(2025, 12, 16, 20, 34), datetime(2025, 12, 16, 21, 34), "12 Replicas"),
]
# Sub-directory of _out/ that receives the plots and tables written in the Store section.
OUTPUT_FOLDER = "2025-12-17T13-00-00+0000"


In [None]:
# Fail fast on a missing token: otherwise the Authorization header would be just
# "Bearer " and the problem would presumably only surface as an HTTP error on the
# first query. Stripping also removes a stray newline from a pasted token.
_bearer_token = BEARER_TOKEN.strip()
if not _bearer_token:
    raise ValueError(
        "BEARER_TOKEN is empty; set it in the Init cell before connecting to Prometheus."
    )

# Client shared by every query cell below.
prom = PrometheusConnect(
    url=PROMETHEUS_URL,
    disable_ssl=True,  # no certificate verification against the localhost port-forward
    headers={"Authorization": f"Bearer {_bearer_token}"},
)

## Plotting

In [None]:
# Label matcher shared by every vLLM query that is scoped to this experiment.
_vllm_selector = f'{{model_name="{MODEL_NAME}",namespace="{NAMESPACE}"}}'

# Latency histograms: each query is the per-bucket (`le`) 1m rate of a vLLM histogram.
# NOTE(review): "yscale" looks like a display multiplier (1000 -> ms here) — confirm
# against violin_plots_by_config.
metrics_for_violin_plots = {
    "ITL": {
        "title": "Inter-Token Latency (ITL)",
        "ytitle": "Inter-Token Latency (ms)",
        "xtitle": "Scenario",
        "query": f"sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{_vllm_selector}[1m]))",
        "samples_generator": histogram_to_samples_global,
        "yticksuffix": "ms",
        "yscale": 1000,
        "ytickformat": ".0f",
    },
    "TTFT": {
        "title": "Time-to-First-Token (TTFT)",
        "ytitle": "Time to First Token (s)",
        "xtitle": "Scenario",
        "query": f"sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{_vllm_selector}[1m]))",
        "samples_generator": histogram_to_samples_global,
        "yticksuffix": "s",
        "ytickformat": ".0f",
    },
    "E2E": {
        "title": "End-to-End Latency (E2E)",
        "ytitle": "E2E Request Latency (s)",
        "xtitle": "Scenario",
        # NOTE(review): only this entry carries "labels" — confirm whether it is still read.
        "labels": {"value": "E2E Request Latency (seconds)", "run": "Scenario"},
        "query": f"sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{_vllm_selector}[1m]))",
        "samples_generator": histogram_to_samples_global,
        "yticksuffix": "s",
        "ytickformat": ".0f",
    },
}

# Gauge-style metrics; these entries define no "samples_generator" of their own.
metrics_for_violin_plots2 = {
    "KV Cache Usage": {
        "title": "KV Cache Utilization",
        "ytitle": "KV Cache Utilization (%)",
        "xtitle": "Scenario",
        # NOTE(review): unlike the other queries this one has no model/namespace
        # matcher — confirm whether violin_plots_by_config scopes it itself.
        "query": "vllm:kv_cache_usage_perc",
        "yticksuffix": "%",
        "ytickformat": ".0f",
        "yscale": 100,
    },
    "Scheduling queue": {
        "title": "Queued Requests",
        "ytitle": "Number of Queued Requests",
        "xtitle": "Scenario",
        "query": f"sum(vllm:num_requests_waiting{_vllm_selector})",
    },
}

# One set of violin plots per config dict, covering every run in TIME_RANGES.
violin_plots_latency = violin_plots_by_config(
    metrics_for_violin_plots,
    prom,
    TIME_RANGES,
    MODEL_NAME,
    NAMESPACE,
    samples_generator_flat,
)
violin_plots_other = violin_plots_by_config(
    metrics_for_violin_plots2,
    prom,
    TIME_RANGES,
    MODEL_NAME,
    NAMESPACE,
    samples_generator_flat,
)


In [None]:

# Short display name -> vLLM histogram bucket series used for the latency tables.
metrics = dict(
    ITL="vllm:inter_token_latency_seconds_bucket",
    E2E="vllm:e2e_request_latency_seconds_bucket",
    TTFT="vllm:time_to_first_token_seconds_bucket",
)
# Tables for the latency histograms, computed for each run in TIME_RANGES.
latency_df = get_histograms_p_tables_by_run(
    prom,
    TIME_RANGES,
    metrics,
    MODEL_NAME,
    NAMESPACE,
)


In [None]:
# Label matcher that restricts vLLM series to the model and namespace under test.
_vllm_selector = f'{{model_name="{MODEL_NAME}",namespace="{NAMESPACE}"}}'

# Display name -> PromQL expression for each gauge summarised in the table.
gauge_metrics = {
    # Scoped like "Queued Requests": get_gauge_p_tables_by_run is not given the
    # model or namespace, so an unscoped avg() would also include KV-cache usage
    # from any other vLLM deployment visible to the querier.
    "KV Cache Util.": f"avg(vllm:kv_cache_usage_perc{_vllm_selector})",
    "Queued Requests": f"sum(vllm:num_requests_waiting{_vllm_selector})",
    # The DCGM series is matched on `exported_namespace` (regex match) rather than `namespace`.
    "Power": f'sum(DCGM_FI_DEV_POWER_USAGE{{exported_namespace=~"{NAMESPACE}"}})',
}
other_df = get_gauge_p_tables_by_run(prom, TIME_RANGES, gauge_metrics)


In [None]:

# Arguments that are identical for all three candlestick figures.
_candlestick_shared = dict(
    _prom=prom,
    model_name=MODEL_NAME,
    namespace=NAMESPACE,
    time_ranges=TIME_RANGES,
    candle_step="5m",
    candle_rate_interval="5m",
    step="10s",
    rate_interval="1m",
    variant_name="ms-inference-scheduling-llm-d-modelservice-decode",
)

# ITL is the only figure with a value scale: the metric is in seconds, shown in ms.
itl_candlestick = candlestick_over_time_with_scaling(
    metric_name="vllm:inter_token_latency_seconds",
    title="Inter-Token Latency (ITL)",
    yaxis_title="ITL (ms)",
    values_scale_func=lambda seconds: seconds * 1e3,
    y_unit="ms",
    **_candlestick_shared,
)

e2e_candlestick = candlestick_over_time_with_scaling(
    metric_name="vllm:e2e_request_latency_seconds",
    title="E2E Request Latency (VLLM)",
    yaxis_title="E2E (s)",
    y_unit="s",
    **_candlestick_shared,
)

ttft_candlestick = candlestick_over_time_with_scaling(
    metric_name="vllm:time_to_first_token_seconds",
    title="Time-To-First-Token (TTFT)",
    yaxis_title="TTFT (s)",
    y_unit="s",
    **_candlestick_shared,
)



## Store

In [None]:
# Plots go to _out/<OUTPUT_FOLDER>/plots; the numeric prefixes order the saved files.
out = Path(f"_out/{OUTPUT_FOLDER}/plots")
out.mkdir(parents=True, exist_ok=True)

# Static load-signal figure, paired with an empty DataFrame.
# NOTE(review): presumably save_plot_dict expects (DataFrame, figure) pairs — confirm.
load_signal_fig = plot_load_signal_static(
    time_step=10,
    instances_over_time=[0, 1, 2, 3, 4, 4, 4, 4, 4],
    rps_per_instance=0.5,
)
save_plot_dict(out, {"load_signal": (pd.DataFrame(), load_signal_fig)}, "01-load_signal")

save_plot_dict(out, figures_to_single_row(ttft_candlestick), "10-ttft_candlestick_single_row")
save_plot_dict(out, figures_to_single_row(e2e_candlestick), "20-e2e_candlestick_single_row")
# Was saved as "30-e2e_candlestick_single_row", which mislabelled the ITL figure as E2E.
save_plot_dict(out, figures_to_single_row(itl_candlestick), "30-itl_candlestick_single_row")
save_plot_dict(out, violin_plots_latency, "40-violin_plots_latency")
save_plot_dict(out, violin_plots_other, "50-violin_plots_other")

# Tables go to _out/<OUTPUT_FOLDER>/tables as parquet files.
out_tables = Path(f"_out/{OUTPUT_FOLDER}/tables")
out_tables.mkdir(exist_ok=True, parents=True)
latency_df.to_parquet(out_tables / "latency_df.parquet")
other_df.to_parquet(out_tables / "other_df.parquet")
