# Performance Benchmarks Visualization

In [1]:
import glob
import re
import json
import pandas as pd
from ipywidgets import interact, fixed, IntSlider, IntText
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [2]:
def flatten_dict(d, prefix="", new_d=None, separator="-"):
    """Collapse a nested dictionary into a single-level dictionary.

    Keys of nested dictionaries are appended to their parents' keys with
    ``separator`` in between, e.g. ``{"a": {"b": 1}}`` -> ``{"a-b": 1}``.

    Args:
        d: Dictionary to flatten; values that are dicts are descended into.
        prefix: Key path accumulated so far (empty at the top level).
        new_d: Dictionary receiving the flattened entries. A fresh one is
            created when omitted; a supplied one is updated in place.
        separator: String placed between key segments.

    Returns:
        The dictionary holding the flattened entries (``new_d`` when given).
    """
    flat = {} if new_d is None else new_d
    for key, value in d.items():
        path = key if not prefix else prefix + separator + key
        if not isinstance(value, dict):
            flat[path] = value
            continue
        # Nested mapping: recurse and keep writing into the same accumulator.
        flatten_dict(value, path, flat, separator)
    return flat

def format_value(v):
    """Render a value as a short human-readable string.

    Formatting rules:
      * integers (builtin or NumPy) get thousands separators;
      * NaN becomes ``"N/A"``;
      * floats (builtin or NumPy) with ``1 <= |v| <= 1e6`` get one decimal
        and thousands separators;
      * all other floats are shown with two significant digits;
      * anything else is formatted with its default string conversion.

    Args:
        v: Value to format.

    Returns:
        The formatted string.
    """
    # NumPy integer scalars are not instances of ``int``; accept them
    # explicitly so they are formatted like builtin integers.
    if isinstance(v, (int, np.integer)):
        return f"{int(v):,}"
    # Likewise for NumPy floating scalars other than float64.
    if isinstance(v, (float, np.floating)):
        if np.isnan(v):
            return "N/A"
        if 1 <= np.abs(v) <= 1e6:
            return f"{float(v):,.1f}"
        return f"{float(v):,.2g}"
    return f"{v}"



def load_latencies_nim_pbr():
    """Load every GenAI-Perf result file found below the working directory.

    Recursively collects ``*genai_perf.json`` files located inside directories
    whose name contains ``genai-perf``. Run metadata (model, profile,
    datetime, concurrency, optional measurement label, input/output length) is
    parsed from each file's path, and the profile name is further split into
    engine, device, precision, TP/PP/DP, tuning and LoRA fields. Both sets of
    fields are stored under the ``task_inputs`` key of the loaded JSON, which
    is then flattened with ``flatten_dict``.

    Files whose path or profile name cannot be parsed are skipped with a
    warning instead of aborting the whole load.

    Returns:
        pandas.DataFrame with one row per successfully parsed file; nested
        JSON keys are joined with ``-``.
    """
    # Sorted so that the row order does not depend on filesystem enumeration order.
    files = sorted(glob.glob("*genai-perf*/**/*genai_perf.json", recursive=True))
    data = []

    # Example path I have
    # genai-perf-artifacts/model-meta-llama-3.1-8b-instruct/profile-tensorrt_llm_buildable-h200-bf16-tp4-dp1-throughput/2025-07-29_16.33.09/concurrency-50/meta-llama-3.1-8b-instruct-openai-chat-concurrency50/200_200_genai_perf.json
    # TODO: Modify regex to match your path structure

    fn_regex = re.compile(r'[./\\]*?genai-perf-.*[/\\]'
                        +r'model-(?P<model>.*?)[/\\]'
                        +r'profile-(?P<profile>.*?)[/\\]'
                        +r'(?P<datetime>.*?)[/\\]'
                        +r'concurrency-(?P<concurrency>\d+)[/\\]'
                        +r'(?P<measurement_label>.*?[/\\])?' # introduced by the 25.06 GenAI-Perf
                        +r'(?P<input_len>\d+)_(?P<output_len>\d+)_genai_perf\.json'
                        )

    profile_regex = re.compile(r'(?P<engine>vllm|tensorrt_llm|tensorrt_llm_buildable)'
                        +r'(-(?P<device>.*?))?'
                        +r'-(?P<precision>[^-]*)'
                        +r'(-tp(?P<TP>\d+))?'
                        +r'(-pp(?P<PP>\d+))?'
                        +r'(-dp(?P<DP>\d+))?'
                        +r'(-(?P<tuning>throughput|latency))?'
                        +r'(-(?P<lora>lora))?'
                        )

    for f in files:
        # Validate the path before touching the file: the glob is broader than
        # the directory layout the regex expects.
        match = fn_regex.search(f)
        if match is None:
            print(f"WARNING: Skipping {f}: path does not match the expected layout")
            continue
        task_inputs = match.groupdict()

        profile_match = profile_regex.search(task_inputs["profile"])
        if profile_match is None:
            print(f"WARNING: Failed to parse profile from {task_inputs['profile']}")
            continue

        with open(f, "r", encoding="utf-8") as jf:
            record = json.load(jf)

        # Profile fields are merged last, so they win on any key collision.
        record["task_inputs"] = {**task_inputs, **profile_match.groupdict()}
        data.append(flatten_dict(record))

    df_raw = pd.DataFrame(data)
    return df_raw


In [3]:
# Parse all benchmark result files and report how many runs were loaded.
df_raw = load_latencies_nim_pbr()
print(df_raw.shape[0])

10


In [4]:
def _int_column(frame, column, default):
    """Return ``frame[column]`` as an int Series; ``default`` replaces a missing column or unparsable values."""
    if column not in frame.columns:
        # pd.to_numeric() on a scalar returns a scalar, which has no .fillna();
        # build the default column explicitly instead.
        return pd.Series(default, index=frame.index, dtype=int)
    return pd.to_numeric(frame[column], errors="coerce").fillna(default).astype(int)


df = pd.DataFrame()
# Map the flattened GenAI-Perf JSON columns to short analysis names
df["model"] = df_raw['task_inputs-model']
df["device"] = df_raw.get('task_inputs-device', "some-GPU")
df["engine"] = df_raw.get('task_inputs-engine', "some-engine")
df["profile"] = df_raw.get('task_inputs-profile', "some-profile")
df["datetime"] = df_raw['task_inputs-datetime']
# NOTE(review): -1 marks an unknown TP and makes n_gpus (and every per-GPU
# metric derived from it) negative for such rows.
df["TP"] = _int_column(df_raw, 'task_inputs-TP', -1)
df["n_nims"] = _int_column(df_raw, 'task_inputs-DP', 1)
df["PP"] = _int_column(df_raw, 'task_inputs-PP', 1)
df["out_tokens_per_s"] = df_raw['output_token_throughput-avg']
df["latency_first_token"] = df_raw['time_to_first_token-avg'] / 1 # ms → ms
df["latency_per_token_decoding"] = df_raw['inter_token_latency-avg'] / 1 # ms → ms
df["input_len"] = df_raw['input_config-input-synthetic_tokens-mean']
df["output_len"] = df_raw['input_config-input-output_tokens-mean']

df["batch_size"] = np.nan
df["concurrency"] = df_raw['input_config-perf_analyzer-stimulus-concurrency']
df["precision"] = df_raw.get("task_inputs-precision", "some-precision")
df["prompts_per_s"] = df_raw['request_throughput-avg']
df["latency"] = df_raw['request_latency-avg'] / 1
df["out_tokens_per_s_per_user"] = df_raw['output_token_throughput_per_user-avg']

# GPUs used by a run: tensor-parallel x pipeline-parallel degree per replica,
# times the number of replicas (PP defaults to 1 when the profile has no -pp field).
df["n_gpus"] = df["TP"] * df["PP"] * df["n_nims"]

df["input_output_len"] = df["input_len"].astype(str) + " in → " + df["output_len"].astype(str) + " out"
df["prompts_per_s_per_gpu"] = df["prompts_per_s"] / df["n_gpus"]
df["prompts_per_s_per_8_gpus"] = df["prompts_per_s_per_gpu"] * 8
df["out_tokens_per_s_per_gpu"] = df["out_tokens_per_s"] / df["n_gpus"]
df["out_tokens_per_s_per_8_gpus"] = df["out_tokens_per_s_per_gpu"] * 8
df["batch_size_per_8"] = df["batch_size"] * 8 / df["n_gpus"]
df["concurrency_per_8"] = df["concurrency"] * 8 / df["n_gpus"]


In [5]:
def scatter_fixed_lengths(df, model, device, input_output_len, x_metric="latency", y_metric="prompts_per_s_per_gpu", include_costs=False):
    """Plot one metric against another for a fixed model, device and token-length setting.

    Rows of ``df`` are restricted to the given ``model``, ``device`` and
    ``input_output_len``. Every distinct (profile, n_nims, datetime)
    combination among the remaining rows becomes one lines+markers trace whose
    points are ordered by concurrency. The x axis is logarithmic.

    Args:
        df: DataFrame holding the benchmark rows.
        model: Value of the ``model`` column to keep.
        device: Value of the ``device`` column to keep; also shown in the title.
        input_output_len: Value of the ``input_output_len`` column to keep.
        x_metric: Column plotted on the x axis.
        y_metric: Column plotted on the y axis.
        include_costs: When true, cost columns are added to the hover text and
            the planned prompts per second (taken from the first row of ``df``)
            is appended to the title.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    def compare_with_nan(series, value):
        """Element-wise equality in which a missing ``value`` matches missing entries."""
        if pd.isna(value):
            return series.isna()
        return series == value

    def get_hover(df_iterator):
        """Build the per-point customdata: (hover text, full row as dict)."""
        return [
            (
                "</br>" + " </br>".join(f"{f}: {format_value(row[f])}" for f in hover_fields),
                dict(row),
            )
            for _, row in df_iterator
        ]

    def get_name(filters):
        """Build the legend label of a trace from its index values."""
        name = []
        for k, v in filters.items():
            if k in ["execution_mode", "precision", "datetime", "device", "profile"]:
                name += [f"{v}"]
            elif v != v: # v is nan
                continue
            else:
                name += [f"{k} {v}"]
        return ", ".join(name)

    # Restrict to the requested model / device / length combination. The device
    # is part of the selection so the plot matches its title.
    selection = {
        "model": model,
        "device": device,
        "input_output_len": input_output_len,
        }
    df_measured = df.iloc[::-1]
    selected_rows = np.logical_and.reduce(
        [compare_with_nan(df_measured[col], val) for col, val in selection.items()]
    )
    df_measured = df_measured[selected_rows]

    index_columns = ["profile", "n_nims", "datetime"]
    hover_fields = ["TP", "latency", "concurrency", "concurrency_per_8", "input_len", "output_len"]
    if include_costs:
        hover_fields = ["planned_prompts_per_second", "on_prem_cost_per_1M_input_tokens", "on_prem_cost_per_1M_output_tokens", "min_servers_required", "min_gpus_required"] + hover_fields

    # One trace per distinct index combination; dropna=False keeps groups with missing values.
    index_set = list(df_measured[index_columns].groupby(index_columns, dropna=False).first().index)
    index_set.reverse()

    fig = make_subplots()
    for i, index_value in enumerate(index_set):
        filters = { k: v for k, v in zip(index_columns, index_value) }
        boolean_series_list = [compare_with_nan(df_measured[col], val) for col, val in filters.items()]
        df_filtered = df_measured[np.logical_and.reduce(boolean_series_list)].sort_values(by="concurrency")
        hover = get_hover(df_filtered.iterrows())
        trace = go.Scatter(
            x = df_filtered[x_metric],
            y = df_filtered[y_metric],
            name = get_name(filters),
            customdata = hover,
            mode = 'lines+markers',
            hovertemplate = "%{customdata[0]}",
            marker={
                "size": 12,
                # The G10 palette has ten colours; wrap around for more traces.
                "color": px.colors.qualitative.G10[i % 10],
                "opacity": 0.7
            },
        )
        fig.add_trace(trace)
    fig.update_xaxes(title_text=x_metric, type="log")
    fig.update_yaxes(title_text=y_metric)
    title=f"{model}, {device}, tokens: {input_output_len}"
    if include_costs:
        planned_prompts_per_second = df.iloc[0]["planned_prompts_per_second"]
        title += f", {planned_prompts_per_second=:.2f}"
    fig.update_layout(title=title)

    fig.show()

In [6]:
# List the distinct benchmark run timestamps in ascending order.
sorted(df["datetime"].drop_duplicates())

['2025-09-09_18.27.06']

In [7]:
# Interactive explorer for the measured runs: choose the subset and the two metrics to plot.
measured_plot_controls = {
    "df": fixed(df),
    "model": df['model'].unique(),
    "device": df['device'].unique(),
    "input_output_len": df['input_output_len'].unique(),
    "x_metric": ["latency_per_token_decoding", "latency_first_token", "latency"],
    "y_metric": ["out_tokens_per_s", "prompts_per_s", "prompts_per_s_per_gpu", "out_tokens_per_s_per_user", "out_tokens_per_s_per_gpu"],
    "include_costs": fixed(False),
}
interact(scatter_fixed_lengths, **measured_plot_controls)

interactive(children=(Dropdown(description='model', options=('meta-llama-3.1-8b-instruct',), value='meta-llama…

<function __main__.scatter_fixed_lengths(df, model, device, input_output_len, x_metric='latency', y_metric='prompts_per_s_per_gpu', include_costs=False)>

# Now let's estimate the total cost of ownership (TCO). Please fill in your costs below.

In [8]:
# Cost assumptions, written as constant columns on every benchmark row.
# Edit the values to match your environment.
cost_assumptions = {
    "on_prem_server_cost": 300_000,  # the CapEx per server in USD
    "server_depreciation_years": 4,
    "hosting_cost_per_server_per_year": 0,  # per year in USD
    "nvaie_license_per_gpu_per_year": 4500,  # per year in USD
    "on_prem_gpus_per_server": 8,
    "cloud_api_cost_per_1M_input_tokens": 1,  # USD
    "cloud_api_cost_per_1M_output_tokens": 3,  # USD
}
for cost_column, cost_value in cost_assumptions.items():
    df[cost_column] = cost_value

In [9]:
def calculate_tco_metrics(df, planned_prompts_per_second):
    """Add on-prem and cloud-API cost estimates to a benchmark table.

    New columns are written into ``df`` itself, which is then returned; pass a
    copy to keep the original untouched.

    Args:
        df: DataFrame with the measured columns ``prompts_per_s``,
            ``prompts_per_s_per_gpu``, ``n_gpus``, ``input_len``,
            ``output_len`` and the cost columns ``on_prem_server_cost``,
            ``server_depreciation_years``, ``hosting_cost_per_server_per_year``,
            ``nvaie_license_per_gpu_per_year``, ``on_prem_gpus_per_server``,
            ``cloud_api_cost_per_1M_input_tokens`` and
            ``cloud_api_cost_per_1M_output_tokens``.
        planned_prompts_per_second: Request rate the deployment is sized for.

    Returns:
        The same ``df`` object, extended with the derived columns.
    """
    token_cost_ratio = df["cloud_api_cost_per_1M_output_tokens"] / df["cloud_api_cost_per_1M_input_tokens"]
    df["output_to_input_token_cost_ratio"] = token_cost_ratio
    df["planned_prompts_per_second"] = planned_prompts_per_second

    # Number of whole benchmarked deployments (n_gpus each) that fit into one server.
    deployments_per_server = np.floor_divide(df["on_prem_gpus_per_server"], df["n_gpus"])

    # Sizing for the planned load; the values are fractional, not rounded up.
    instances_needed = planned_prompts_per_second / df["prompts_per_s"]
    df["min_instances_required"] = instances_needed
    servers_needed = instances_needed / deployments_per_server
    df["min_servers_required"] = servers_needed
    df["min_gpus_required"] = instances_needed * df["n_gpus"]

    # Yearly cost of one server: licences + hosting + depreciation.
    depreciation_per_server_per_year = df["on_prem_server_cost"] / df["server_depreciation_years"]
    df["on_prem_costs_per_year_per_server"] = (
        df["nvaie_license_per_gpu_per_year"] * df["on_prem_gpus_per_server"]
        + df["hosting_cost_per_server_per_year"]
        + depreciation_per_server_per_year
    )
    # Yearly cost of just the capacity needed for the planned load.
    df["on_prem_costs_per_year_for_min_servers"] = (
        df["nvaie_license_per_gpu_per_year"] * df["planned_prompts_per_second"] / df["prompts_per_s_per_gpu"]
        + df["hosting_cost_per_server_per_year"] * servers_needed
        + df["on_prem_server_cost"] * servers_needed / df["server_depreciation_years"]
    )

    # Cost per prompt of a server running continuously at the measured throughput.
    daily_cost_per_server = df["on_prem_costs_per_year_per_server"] / 365.
    df["on_prem_costs_per_day_per_server"] = daily_cost_per_server
    df["prompts_per_s_per_server"] = df["prompts_per_s"] * deployments_per_server
    df["prompts_per_day_per_server"] = df["prompts_per_s_per_server"] * 60 * 60 * 24
    df["on_prem_cost_per_prompt"] = daily_cost_per_server / df["prompts_per_day_per_server"]
    df["on_prem_cost_per_1k_prompts"] = df["on_prem_cost_per_prompt"] * 1000

    # Split the per-prompt cost into input/output token prices using the cloud
    # price ratio r = output price / input price:
    #   (c_in * input_len + c_out * output_len) * 1000 = cost_per_1k_prompts, with c_out = r * c_in
    #   => c_in = cost_per_1k_prompts / (input_len + r * output_len) / 1000
    weighted_tokens_per_prompt = df["input_len"] + token_cost_ratio * df["output_len"]
    df["on_prem_cost_per_input_token"] = df["on_prem_cost_per_1k_prompts"] / weighted_tokens_per_prompt / 1000
    df["on_prem_cost_per_1M_input_tokens"] = df["on_prem_cost_per_input_token"] * 1_000_000
    df["on_prem_cost_per_1M_output_tokens"] = df["on_prem_cost_per_1M_input_tokens"] * token_cost_ratio

    # Cloud API cost of the same prompts, for comparison.
    cloud_input_cost = df["cloud_api_cost_per_1M_input_tokens"] * df['input_len']
    df["cloud_api_cost_per_1M_input_prompts"] = cloud_input_cost
    cloud_output_cost = df["cloud_api_cost_per_1M_output_tokens"] * df['output_len']
    df["cloud_api_cost_per_1M_output_prompts"] = cloud_output_cost
    df["cloud_api_cost_per_1M_prompts"] = cloud_input_cost + cloud_output_cost
    df["cloud_api_cost_per_1k_prompts"] = df["cloud_api_cost_per_1M_prompts"] / 1000
    df["on_prem_to_cloud_api_cost_ratio"] = df["on_prem_cost_per_1k_prompts"] / df["cloud_api_cost_per_1k_prompts"]
    return df


In [10]:
def simulate_different_output_length(df, new_output_length):
    """Rescale measured metrics as if every request produced ``new_output_length`` tokens.

    With ``ratio = new_output_length / output_len`` (per row), the latency
    columns are multiplied by ``ratio`` and the throughput columns are divided
    by it. ``output_len`` and the ``input_output_len`` label are then updated.
    The frame is modified in place and returned.

    NOTE(review): time-to-first-token and output-token throughput are scaled
    with the output length as well; confirm that this simplification is intended.

    Args:
        df: DataFrame with the latency/throughput columns listed below plus
            ``input_len`` and ``output_len``.
        new_output_length: Output length to simulate.

    Returns:
        The same ``df`` object with rescaled metrics.
    """
    # Computed before 'output_len' is overwritten below.
    scale = new_output_length / df['output_len']

    for column in ('latency_first_token', 'latency'):
        df[column] *= scale

    scaled_down = (
        'out_tokens_per_s',
        'prompts_per_s',
        'out_tokens_per_s_per_user',
        'prompts_per_s_per_gpu',
        'prompts_per_s_per_8_gpus',
        'out_tokens_per_s_per_gpu',
        'out_tokens_per_s_per_8_gpus',
    )
    for column in scaled_down:
        df[column] /= scale

    df['output_len'] = new_output_length
    input_part = df["input_len"].astype(str)
    output_part = df["output_len"].astype(str)
    df["input_output_len"] = input_part + " in → " + output_part + " out"
    return df

def add_simulated_output_lengths(df, new_output_lengths):
    """Return the measured rows followed by simulated rows for each requested output length.

    For every value in ``new_output_lengths`` a copy of ``df`` is rescaled
    with ``simulate_different_output_length`` and appended. The input frame
    is not modified, and the original row index is kept (so index labels
    repeat across the appended blocks).

    Args:
        df: DataFrame with the measured benchmark rows.
        new_output_lengths: Iterable of output lengths to simulate.

    Returns:
        A new DataFrame: a copy of ``df`` followed by one block per
        simulated length, in the order given.
    """
    # Collect all pieces first and concatenate once instead of growing the
    # frame inside the loop.
    frames = [df.copy()]
    frames.extend(
        simulate_different_output_length(df.copy(), output_length)
        for output_length in new_output_lengths
    )
    return pd.concat(frames)

Let's take an example

Expected usage is 1 million requests per month to the LLM application per use case. <br>
We need to convert that number into the number of requests per second at peak load on the system.<br>
We will calculate the average requests per second, and then multiply by the ratio of **peak to average** load (current default is 3x). 

In [11]:
# Expected monthly request volume and how much higher the peak load is than the average load.
requests_per_month = 1_000_000
ratio_peak_to_avg = 3 # change this to your expected ratio

# A month is approximated as 30 days.
seconds_per_month = 30 * 24 * 60 * 60
requests_per_second_avg = requests_per_month / seconds_per_month
requests_per_minute_avg = requests_per_second_avg * 60
requests_per_second_peak = ratio_peak_to_avg * requests_per_second_avg
print(f"""
{requests_per_minute_avg=:.2f}
{requests_per_second_avg=:.2f}
{requests_per_second_peak=:.2f}
""")

# Size the deployment for the peak rate; passing a copy leaves the measured frame `df` unchanged.
df_costs = calculate_tco_metrics(df.copy(), requests_per_second_peak)


requests_per_minute_avg=23.15
requests_per_second_avg=0.39
requests_per_second_peak=1.16



In [14]:
# Interactive explorer for the cost estimates: same plot, with cost metrics on the y axis.
cost_plot_controls = {
    "df": fixed(df_costs),
    "model": df_costs['model'].unique(),
    "device": df_costs['device'].unique(),
    "input_output_len": df_costs['input_output_len'].unique(),
    "x_metric": ["latency_per_token_decoding", "latency_first_token", "latency"],
    "y_metric": ["min_servers_required", "on_prem_cost_per_1M_output_tokens", "prompts_per_s_per_server", "prompts_per_s_per_gpu", "min_gpus_required"],
    "include_costs": fixed(True),
}
interact(scatter_fixed_lengths, **cost_plot_controls)

interactive(children=(Dropdown(description='model', options=('meta-llama-3.1-8b-instruct',), value='meta-llama…

<function __main__.scatter_fixed_lengths(df, model, device, input_output_len, x_metric='latency', y_metric='prompts_per_s_per_gpu', include_costs=False)>