## Init

In [None]:
%load_ext autoreload
%autoreload 2

from datetime import datetime
import plotly.io as pio
import urllib3
import warnings
import random
from pathlib import Path
import pickle

from prometheus_api_client import PrometheusConnect

from data_source.prometheus import *
from transform.sampling import *
from plotting.load_signal_static import *
from plotting.violin_plots import *
from plotting.candlestick import *
from plotting.combine import *

# Seed the stdlib RNG with a fixed value.
random.seed(42)

# Notebook-wide noise reduction and plot theme: FutureWarnings and urllib3's
# InsecureRequestWarning are silenced, and plotly defaults to the dark template.
warnings.simplefilter("ignore", FutureWarning)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
pio.templates.default = "plotly_dark"

# Prometheus endpoint. Expose it locally first with:
#   $ oc port-forward -n openshift-monitoring svc/thanos-querier 9091:9091
PROMETHEUS_URL = "https://localhost:9091"
BEARER_TOKEN = ""

# Label values interpolated into the PromQL selectors of the query cells.
NAMESPACE = "experiment-01"
MODEL_NAME = "meta-llama/Llama-3.1-8B"

# One tuple per run. The third element is the run label (it defines the table
# ordering and is matched by the "WVA" baseline key in the plotting cells).
# NOTE(review): the two datetimes are presumably the start and end of the
# run's query window - confirm against the data_source helpers.
TIME_RANGES = [
    (
        datetime(2025, 12, 19, hour=12, minute=20),
        datetime(2025, 12, 19, hour=13, minute=20),
        "WVA",
    ),
    (
        datetime(2025, 12, 19, hour=12, minute=50),
        datetime(2025, 12, 19, hour=13, minute=20),
        "1 Replica",
    ),
]

# Folder that receives the stored parquet/pickle files and from which they are
# read back; created up front so later writes do not fail on a missing folder.
OUTPUT_FOLDER = Path("_out") / "refactoring"
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

# Data-source switches (at least one of QUERY / LOAD must be True):
#   QUERY - fetch the data from Prometheus.
#   STORE - write the queried data to OUTPUT_FOLDER to avoid re-querying later.
#           Only has an effect when QUERY is True, so it is ignored for
#           LOAD=True, QUERY=False.
#   LOAD  - read the data from OUTPUT_FOLDER (requires an earlier run with
#           QUERY=True and STORE=True).
QUERY = False
STORE = True
LOAD = True


In [None]:
# Fail fast: with both switches off, no cell below would assign the data
# variables that the plotting cells read.
if not QUERY and not LOAD:
    raise ValueError("Either QUERY or LOAD must be True.")

In [None]:
# A Prometheus client is only created when live querying is enabled.
if QUERY:
    prom = PrometheusConnect(
        # Bearer token authentication against the configured endpoint.
        headers={"Authorization": f"Bearer {BEARER_TOKEN}"},
        url=PROMETHEUS_URL,
        # Passed through as-is; InsecureRequestWarning is silenced in the init cell.
        disable_ssl=True,
    )

In [None]:
if QUERY:
    # Query one series per metric for every run in TIME_RANGES, always scoped
    # to MODEL_NAME / NAMESPACE and requested with "1m" (presumably the query
    # step - confirm in custom_query_range_by_run). Histogram buckets go
    # through histogram_to_samples_global, plain gauges through
    # samples_generator_flat.
    e2e_aggregated = custom_query_range_by_run(
        prom,
        TIME_RANGES,
        f'sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{{model_name="{MODEL_NAME}",namespace="{NAMESPACE}"}}[1m]))',
        "1m",
        histogram_to_samples_global,
    )
    itl_aggregated = custom_query_range_by_run(
        prom,
        TIME_RANGES,
        f'sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{{model_name="{MODEL_NAME}",namespace="{NAMESPACE}"}}[1m]))',
        "1m",
        histogram_to_samples_global,
    )
    ttft_aggregated = custom_query_range_by_run(
        prom,
        TIME_RANGES,
        f'sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{{model_name="{MODEL_NAME}",namespace="{NAMESPACE}"}}[1m]))',
        "1m",
        histogram_to_samples_global,
    )
    kvcache_aggregated = custom_query_range_by_run(
        prom,
        TIME_RANGES,
        f'avg(vllm:kv_cache_usage_perc{{model_name="{MODEL_NAME}",namespace="{NAMESPACE}"}})',
        "1m",
        samples_generator_flat,
    )
    queue_size_aggregated = custom_query_range_by_run(
        prom,
        TIME_RANGES,
        f'sum(vllm:num_requests_waiting{{model_name="{MODEL_NAME}",namespace="{NAMESPACE}"}})',
        "1m",
        samples_generator_flat,
    )

    if STORE:
        # Keep a copy on disk so later sessions can run with LOAD only.
        e2e_aggregated.to_parquet(OUTPUT_FOLDER / "e2e_aggregated.parquet")
        itl_aggregated.to_parquet(OUTPUT_FOLDER / "itl_aggregated.parquet")
        ttft_aggregated.to_parquet(OUTPUT_FOLDER / "ttft_aggregated.parquet")
        kvcache_aggregated.to_parquet(OUTPUT_FOLDER / "kvcache_aggregated.parquet")
        queue_size_aggregated.to_parquet(OUTPUT_FOLDER / "queue_size_aggregated.parquet")

if LOAD:
    # Runs even when QUERY is True, in which case the stored files replace the
    # freshly queried frames.
    e2e_aggregated = pd.read_parquet(OUTPUT_FOLDER / "e2e_aggregated.parquet")
    itl_aggregated = pd.read_parquet(OUTPUT_FOLDER / "itl_aggregated.parquet")
    ttft_aggregated = pd.read_parquet(OUTPUT_FOLDER / "ttft_aggregated.parquet")
    kvcache_aggregated = pd.read_parquet(OUTPUT_FOLDER / "kvcache_aggregated.parquet")
    queue_size_aggregated = pd.read_parquet(OUTPUT_FOLDER / "queue_size_aggregated.parquet")


In [None]:
if QUERY:
    # Display name -> histogram bucket metric handed to the table helper.
    metrics = dict(
        ITL="vllm:inter_token_latency_seconds_bucket",
        E2E="vllm:e2e_request_latency_seconds_bucket",
        TTFT="vllm:time_to_first_token_seconds_bucket",
    )
    latency_df = get_histograms_p_tables_by_run(
        prom,
        TIME_RANGES,
        metrics,
        MODEL_NAME,
        NAMESPACE,
    )

    if STORE:
        # Keep a copy on disk so later sessions can run with LOAD only.
        latency_df.to_parquet(OUTPUT_FOLDER / "latency_df.parquet")

if LOAD:
    # Runs even when QUERY is True; the stored table then replaces the queried one.
    latency_df = pd.read_parquet(OUTPUT_FOLDER / "latency_df.parquet")

In [None]:
if QUERY:
    # Display name -> PromQL expression for the gauge-style metrics table.
    gauge_metrics = {
        # Scoped to the experiment's model and namespace, like the KV-cache
        # query used for the violin plot; without the label matchers the
        # average would include every vLLM series the querier can see.
        "KV Cache Util.": (
            'avg(vllm:kv_cache_usage_perc{{model_name="{m}",namespace="{ns}"}})'
            .format(m=MODEL_NAME, ns=NAMESPACE)
        ),
        "Queued Requests": (
            'sum(vllm:num_requests_waiting{{model_name="{m}",namespace="{ns}"}})'
            .format(m=MODEL_NAME, ns=NAMESPACE)
        ),
        # DCGM series are selected via `exported_namespace` (regex matcher)
        # and carry no model_name matcher here.
        "Power": 'sum(DCGM_FI_DEV_POWER_USAGE{{exported_namespace=~"{ns}"}})'.format(ns=NAMESPACE),
    }
    other_df = get_gauge_p_tables_by_run(prom, TIME_RANGES, gauge_metrics)

    if STORE:
        # Keep a copy on disk so later sessions can run with LOAD only.
        other_df.to_parquet(OUTPUT_FOLDER / "other_df.parquet")

if LOAD:
    # Runs even when QUERY is True; the stored table then replaces the queried one.
    other_df = pd.read_parquet(OUTPUT_FOLDER / "other_df.parquet")

In [None]:
if QUERY:
    # Arguments shared by the three quantile queries below; only the metric
    # name (and, for ITL, a value scaling function) differs per call.
    _shared_quantile_args = dict(
        _prom=prom,
        time_ranges=TIME_RANGES,
        model_name=MODEL_NAME,
        namespace=NAMESPACE,
        variant_name="ms-inference-scheduling-llm-d-modelservice-decode",
        iqr_step="5m",
        iqr_rate_interval="5m",
        p50_step="10s",
        p50_rate_interval="1m",
    )

    ttft_candlestick_results = compare_runs_quantiles_for_metric(
        metric_name="vllm:time_to_first_token_seconds",
        **_shared_quantile_args,
    )

    e2e_candlestick_results = compare_runs_quantiles_for_metric(
        metric_name="vllm:e2e_request_latency_seconds",
        **_shared_quantile_args,
    )

    itl_candlestick_results = compare_runs_quantiles_for_metric(
        metric_name="vllm:inter_token_latency_seconds",
        # ITL values are multiplied by 1e3; the ITL plots label the axis in "ms".
        values_scale_func=lambda x: x * 1e3,
        **_shared_quantile_args,
    )

    if STORE:
        # First pickle each result mapping as a whole; these pickles are what
        # the LOAD branch below reads back.
        for pickle_name, results in (
            ("ttft_candlestick_results.pkl", ttft_candlestick_results),
            ("e2e_candlestick_results.pkl", e2e_candlestick_results),
            ("itl_candlestick_results.pkl", itl_candlestick_results),
        ):
            with open(OUTPUT_FOLDER / pickle_name, "wb") as f:
                pickle.dump(results, f)

        # Also store the dataframes: every run maps to a
        # (data, scaling_actions) pair, each written to its own parquet file.
        for name, results in (
            ("ttft_candlestick_df", ttft_candlestick_results),
            ("e2e_candlestick_df", e2e_candlestick_results),
            ("itl_candlestick_df", itl_candlestick_results),
        ):
            for run, (df, scaling_actions) in results.items():
                df.to_parquet(OUTPUT_FOLDER / f"{name}_{run}_data.parquet")
                scaling_actions.to_parquet(OUTPUT_FOLDER / f"{name}_{run}_scaling_actions.parquet")


if LOAD:
    # NOTE: pickle.load can execute arbitrary code. Only load files that were
    # written by the STORE branch above, never files from an untrusted source.
    # Runs even when QUERY is True; the stored results then replace the queried ones.
    with open(OUTPUT_FOLDER / "ttft_candlestick_results.pkl", "rb") as f:
        ttft_candlestick_results = pickle.load(f)

    with open(OUTPUT_FOLDER / "e2e_candlestick_results.pkl", "rb") as f:
        e2e_candlestick_results = pickle.load(f)

    with open(OUTPUT_FOLDER / "itl_candlestick_results.pkl", "rb") as f:
        itl_candlestick_results = pickle.load(f)


## Plotting

In [None]:

from plotting.tables import *

# Per-metric multipliers handed to format_with_units_per_run_metric.
# NOTE(review): presumably converts raw values into the unit listed in
# metric_unit (e.g. s -> ms for ITL, fraction -> % for KV cache) - confirm.
metric_scale = dict([
    ("ITL", 1e3),
    ("KV Cache Util.", 1e2),
    ("Energy", 1e-3),
    ("Power", 1e-3),
])

# Per-metric unit suffixes handed to format_with_units_per_run_metric.
metric_unit = dict([
    ("ITL", "ms"),
    ("KV Cache Util.", "%"),
    ("Queued Requests", ""),
    ("Power", "kW"),
    ("Energy", "kWh"),
])

# Run labels in the order the runs are declared in TIME_RANGES.
order = [time_range[2] for time_range in TIME_RANGES]


In [None]:
# Latency table: order the runs, add the change relative to the "WVA" run,
# apply scale/unit formatting, add separators, and show it without the index.
latency_table = sort(latency_df, order)
latency_table = with_relative_change(latency_table, baseline_key="WVA")
latency_table = format_with_units_per_run_metric(
    latency_table,
    metric_scale=metric_scale,
    metric_unit=metric_unit,
    baseline_key="WVA",
)
latency_table = add_metric_separators(latency_table)
display(latency_table.hide(axis="index"))

In [None]:

# Gauge-metrics table: order the runs, add the change relative to the "WVA"
# run, apply scale/unit formatting, add separators, and show it without the index.
other_table = sort(other_df, order)
other_table = with_relative_change(other_table, baseline_key="WVA")
other_table = format_with_units_per_run_metric(
    other_table,
    metric_scale=metric_scale,
    metric_unit=metric_unit,
    baseline_key="WVA",
)
other_table = add_metric_separators(other_table)
display(other_table.hide(axis="index"))


In [None]:
# Violin plot of the aggregated end-to-end latency samples, one violin per run.
e2e_violin = violin_plot_by_run(
    df=e2e_aggregated,
    title="End-to-End Latency (E2E)",
    xtitle="Scenario",
    yscale=1,  # no rescaling; the axis is labelled in seconds
    yaxes_config={
        "title": "E2E Request Latency (s)",
        "ticksuffix": "s",
        "tickformat": ".0f",
    },
)
e2e_violin.show()

In [None]:

# Violin plot of the aggregated inter-token latency samples, one violin per run.
itl_violin = violin_plot_by_run(
    df=itl_aggregated,
    title="Inter-Token Latency (ITL)",
    xtitle="Scenario",
    yscale=1e3,  # scale factor matching the "ms" axis label
    yaxes_config={
        "title": "Inter-Token Latency (ms)",
        "ticksuffix": "ms",
        "tickformat": ".0f",
    },
)
itl_violin.show()

In [None]:

# Violin plot of the aggregated time-to-first-token samples, one violin per run.
ttft_violin = violin_plot_by_run(
    df=ttft_aggregated,
    title="Time-to-First-Token (TTFT)",
    xtitle="Scenario",
    yscale=1,  # no rescaling; the axis is labelled in seconds
    yaxes_config={
        "title": "Time to First Token (s)",
        "ticksuffix": "s",
        "tickformat": ".0f",
    },
)
ttft_violin.show()

In [None]:

# Violin plot of the aggregated KV-cache utilization samples, one violin per run.
kvcache_violin = violin_plot_by_run(
    df=kvcache_aggregated,
    title="KV Cache Utilization",
    xtitle="Scenario",
    yscale=100,  # scale factor matching the "%" axis label
    yaxes_config={
        "title": "KV Cache Utilization (%)",
        "ticksuffix": "%",
        "tickformat": ".0f",
    },
)
kvcache_violin.show()

In [None]:

# Violin plot of the aggregated queued-request counts, one violin per run.
queue_size_violin = violin_plot_by_run(
    df=queue_size_aggregated,
    title="Queued Requests",
    xtitle="Scenario",
    yscale=1,  # counts are plotted unscaled
    yaxes_config={"title": "Number of Queued Requests"},
)
queue_size_violin.show()

In [None]:
# Axis labelling shared by both TTFT candlestick views.
_ttft_axis_args = dict(yaxis_title="TTFT (s)", y_unit="s")

# Individual figures (the helper returns a mapping of figures), shown one by one.
ttft_candlesticks = candlesticks_over_time_with_scaling(
    data=ttft_candlestick_results,
    title="Time-To-First-Token (TTFT)",
    **_ttft_axis_args,
)
for fig in ttft_candlesticks.values():
    fig.show()

# Combined subplot figure, shown after the individual ones.
ttft_candlestick_subplots = candlesticks_over_time_with_scaling_subplots(
    data=ttft_candlestick_results,
    title="Time-To-First-Token (TTFT) - Subplots",
    **_ttft_axis_args,
)
ttft_candlestick_subplots.show()


In [None]:

# Axis labelling shared by both E2E candlestick views.
_e2e_axis_args = dict(yaxis_title="E2E (s)", y_unit="s")

# Individual figures (the helper returns a mapping of figures); built first,
# shown after the combined subplot figure.
e2e_candlesticks = candlesticks_over_time_with_scaling(
    data=e2e_candlestick_results,
    title="E2E Request Latency (E2E)",
    **_e2e_axis_args,
)

# Combined subplot figure.
e2e_candlestick_subplots = candlesticks_over_time_with_scaling_subplots(
    data=e2e_candlestick_results,
    title="E2E Request Latency (E2E) - Subplots",
    **_e2e_axis_args,
)
e2e_candlestick_subplots.show()

for fig in e2e_candlesticks.values():
    fig.show()

In [None]:

# Axis labelling shared by both ITL candlestick views.
_itl_axis_args = dict(yaxis_title="ITL (ms)", y_unit="ms")

# Individual figures (the helper returns a mapping of figures); built first,
# shown after the combined subplot figure.
itl_candlesticks = candlesticks_over_time_with_scaling(
    data=itl_candlestick_results,
    title="Inter-Token Latency (ITL)",
    **_itl_axis_args,
)

# Combined subplot figure.
itl_candlestick_subplots = candlesticks_over_time_with_scaling_subplots(
    data=itl_candlestick_results,
    title="Inter-Token Latency (ITL) - Subplots",
    **_itl_axis_args,
)
itl_candlestick_subplots.show()

for fig in itl_candlesticks.values():
    fig.show()
