In [None]:
from datetime import timedelta
import json
import prometheus_api_client
from prometheus_api_client import PrometheusConnect
from prometheus_api_client.metric_range_df import MetricRangeDataFrame
from prometheus_api_client.metric_snapshot_df import MetricSnapshotDataFrame
from prometheus_api_client.metrics_list import MetricsList
from prometheus_api_client.utils import parse_datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Client used by every query in this notebook.
prom = PrometheusConnect(
    url="http://lsdataitb.fnal.gov:9009/prometheus",
    disable_ssl=True,
)

In [None]:
# Ask the server for its metric names.
xxx = prom.all_metrics()
len(xxx)  # not the last expression of the cell, so this value is not displayed
# Keep only the names that mention "GPU" or "DCGM".
[xx for xx in xxx if any(tag in xx for tag in ("GPU", "DCGM"))]

In [None]:
def _run_range_query(query, timestamp_tuples, step):
    """Run one PromQL range query over every time window and stack the results.

    Windows for which the server returns no series are skipped.

    Returns:
        The rows of all non-empty windows concatenated with
        ``pd.concat(..., axis=0)``, or ``None`` if every window was empty.
    """
    frames = []
    for st, et in timestamp_tuples:
        raw = prom.custom_query_range(
            query=query,
            start_time=parse_datetime(st),
            end_time=parse_datetime(et),
            step=step,
        )
        if len(raw) > 0:
            frames.append(MetricRangeDataFrame(raw))
    return pd.concat(frames, axis=0) if frames else None


def _model_selector(model_version):
    """Build the ``{model='..',version='..'}`` label matcher for a "model/version" string."""
    # Split on the last slash so the version is always the final component.
    model, version = model_version.rsplit("/", 1)
    return "{model='" + model + "',version='" + version + "'}"


def _gpu_selector(gpu_inst):
    """Build the DCGM label matcher for a "device/GPU_I_ID/instance" string."""
    device, gpu_i_id, instance = gpu_inst.split("/", 2)
    return ("{exported_container='triton',exported_namespace='triton',prometheus_replica='prometheus-k8s-0',"
            "device='" + device + "',GPU_I_ID='" + gpu_i_id + "',instance='" + instance + "'}")


def get_all_queries(timestamp_tuples, step):
    """Run the aggregate, per-model and per-GPU-instance range queries.

    Uses the notebook-level ``prom`` client.

    Args:
        timestamp_tuples: iterable of ``(start, end)`` strings accepted by
            ``parse_datetime``; each query is run once per window and the
            windows are concatenated.
        step: duration string such as ``"60s"``; used both as the PromQL range
            selector inside the queries and as the query resolution step.

    Returns:
        ``(results, queries, unique_model_versions, unique_gpu_instances)``:

        * ``results`` maps a metric key to its DataFrame. Keys whose query
          returned no data in any window are absent.
        * ``queries`` is a list of ``(key, query)`` pairs for the queries that
          were issued under a key.
        * ``unique_model_versions`` is a set of ``"model/version"`` strings for
          which both per-model queries returned data.
        * ``unique_gpu_instances`` is a set of ``"device/GPU_I_ID/instance"``
          strings for which both per-GPU queries returned data. The per-GPU
          keys are ``gpu_tensor_util_<i>`` / ``gpu_dram_util_<i>`` with ``i``
          running contiguously from 0 to ``len(unique_gpu_instances) - 1``.
    """
    results = {}
    queries = []
    unique_model_versions = None
    unique_gpu_instances = None
    for key, query in {
        "num_instances": "count((sum by(pod) (delta(nv_inference_request_success["+step+"]))) > 0)",
        "inf_rate_net":"sum (rate(nv_inference_count["+step+"]))",
        "inf_rate_bypod":"sum by(pod) (rate(nv_inference_count["+step+"]))",
        "inf_rate":"sum by(model, version, pod) (rate(nv_inference_count["+step+"]))",
        "inf_cache_hit_rate":"sum by(model, version, pod) (rate(nv_cache_num_hits_per_model["+step+"]))",
        "inf_reqs_net":"sum(rate(nv_inference_request_success["+step+"]))",
        "inf_reqs_bypod":"sum by(pod) (rate(nv_inference_request_success["+step+"]))",
        "inf_reqs":"sum by(model, version, pod) (rate(nv_inference_request_success["+step+"]))",
        "inf_req_dur_net": "avg (delta(nv_inference_request_duration_us["+step+"])/(1+1000000*delta(nv_inference_request_success["+step+"])))",
        "inf_que_dur_net": "avg (delta(nv_inference_queue_duration_us["+step+"])/(1+1000000*delta(nv_inference_request_success["+step+"])))",
        "inf_inp_dur_net": "avg (delta(nv_inference_compute_input_duration_us["+step+"])/(1+1000000*delta(nv_inference_request_success["+step+"])))",
        "inf_inf_dur_net": "avg (delta(nv_inference_compute_infer_duration_us["+step+"])/(1+1000000*delta(nv_inference_request_success["+step+"])))",
        "inf_out_dur_net": "avg (delta(nv_inference_compute_output_duration_us["+step+"])/(1+1000000*delta(nv_inference_request_success["+step+"])))",
        "inf_req_dur": "avg by(model, version, pod) (delta(nv_inference_request_duration_us["+step+"])/(1+1000000*delta(nv_inference_request_success["+step+"])))",
        "inf_que_dur": "avg by(model, version, pod) (delta(nv_inference_queue_duration_us["+step+"])/(1+1000000*delta(nv_inference_request_success["+step+"])))",
        "inf_inp_dur": "avg by(model, version, pod) (delta(nv_inference_compute_input_duration_us["+step+"])/(1+1000000*delta(nv_inference_request_success["+step+"])))",
        "inf_inf_dur": "avg by(model, version, pod) (delta(nv_inference_compute_infer_duration_us["+step+"])/(1+1000000*delta(nv_inference_request_success["+step+"])))",
        "inf_out_dur": "avg by(model, version, pod) (delta(nv_inference_compute_output_duration_us["+step+"])/(1+1000000*delta(nv_inference_request_success["+step+"])))",
        "gpu_tensor_util": "sum by(device,GPU_I_ID,instance) (avg_over_time (DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{exported_container='triton',exported_namespace='triton',prometheus_replica='prometheus-k8s-0'}["+step+"]))",
        "gpu_dram_util": "sum by(device,GPU_I_ID,instance) (avg_over_time (DCGM_FI_PROF_DRAM_ACTIVE{exported_container='triton',exported_namespace='triton',prometheus_replica='prometheus-k8s-0'}["+step+"]))",
        #"inf_cache_hits": "avg by(model, version, pod) (delta(nv_cache_num_hits_per_model["+step+"])/(1+1000000*delta(nv_inference_request_success["+step+"])))",
        }.items():
        queries.append((key, query))
        frame = _run_range_query(query, timestamp_tuples, step)
        if frame is None:
            # No data in any window: leave the key out, as the other query groups do.
            continue
        results[key] = frame
        # The first frame carrying the relevant label columns defines the label sets.
        if unique_model_versions is None and "model" in frame.columns and "version" in frame.columns:
            unique_model_versions = set((frame["model"]+"/"+frame["version"]).dropna().unique())
        if unique_gpu_instances is None and "GPU_I_ID" in frame.columns:
            unique_gpu_instances = set(
                (frame["device"]+"/"+frame["GPU_I_ID"]+"/"+frame["instance"]).dropna().unique())
    # Nothing labelled came back: fall through with empty sets instead of iterating None.
    if unique_model_versions is None:
        unique_model_versions = set()
    if unique_gpu_instances is None:
        unique_gpu_instances = set()

    # Per-model queries: all instance counts first, then all rates.
    # Each entry remembers which model it belongs to so that model can be dropped if empty.
    model_queries = [
        ("num_instances_"+model_version, model_version,
         "count((sum by(pod) (delta(nv_inference_request_success"+_model_selector(model_version)+"["+step+"]))) > 0)")
        for model_version in unique_model_versions]
    model_queries += [
        ("inf_rate_"+model_version, model_version,
         "sum (rate(nv_inference_count"+_model_selector(model_version)+"["+step+"]))")
        for model_version in unique_model_versions]
    for key, model_version, query in model_queries:
        queries.append((key, query))
        frame = _run_range_query(query, timestamp_tuples, step)
        if frame is None:
            # discard() tolerates the model having been dropped by its other query already.
            unique_model_versions.discard(model_version)
        else:
            results[key] = frame

    # Per-GPU queries. An index is handed out only after both queries returned data,
    # so the numbered keys stay contiguous for code that enumerates the returned set.
    tensor_queries = []
    dram_queries = []
    next_index = 0
    for gpu_inst in list(unique_gpu_instances):
        selector = _gpu_selector(gpu_inst)
        tensor_query = "sum (avg_over_time(DCGM_FI_PROF_PIPE_TENSOR_ACTIVE"+selector+"["+step+"]))"
        dram_query = "avg (avg_over_time(DCGM_FI_PROF_DRAM_ACTIVE"+selector+"["+step+"]))"
        tensor_frame = _run_range_query(tensor_query, timestamp_tuples, step)
        dram_frame = None
        if tensor_frame is not None:
            dram_frame = _run_range_query(dram_query, timestamp_tuples, step)
        if tensor_frame is None or dram_frame is None:
            unique_gpu_instances.discard(gpu_inst)
            continue
        tensor_key = "gpu_tensor_util_"+str(next_index)
        dram_key = "gpu_dram_util_"+str(next_index)
        results[tensor_key] = tensor_frame
        results[dram_key] = dram_frame
        tensor_queries.append((tensor_key, tensor_query))
        dram_queries.append((dram_key, dram_query))
        next_index += 1
    # Keep the recorded order: all tensor queries, then all DRAM queries.
    queries.extend(tensor_queries)
    queries.extend(dram_queries)
    return results, queries, unique_model_versions, unique_gpu_instances

In [None]:
# Pull every metric for the single three-hour window at 60-second resolution.
results, queries, unique_model_versions, unique_gpu_instances = get_all_queries(
    [("2023-03-30 at 16:00:00 MDT", "2023-03-30 at 19:00:00 MDT")],
    step="60s",
)


In [None]:
# Display the model/version and GPU-instance label sets returned by get_all_queries.
(unique_model_versions, unique_gpu_instances)

In [None]:
# Inspect the tensor-utilisation frame stored for GPU instance index 0.
results["gpu_tensor_util_0"]

In [None]:
def convert_results_to_df(results, unique_model_versions=None, unique_gpu_instances=None):
    """Join the per-query frames in ``results`` into one wide frame.

    The frame starts from ``results["inf_rate_net"]`` and left-joins, on the
    index, the other aggregate metrics, the per-model rate / instance-count
    frames and the per-GPU-instance utilisation frames. Missing values are
    filled with 0. The input frames are not modified.

    Args:
        results: dict of DataFrames keyed as produced by ``get_all_queries``.
        unique_model_versions: iterable of ``"model/version"`` strings. When
            ``None``, the notebook-level variable of the same name is used.
        unique_gpu_instances: sized collection of GPU instances; only its
            length is used, to address ``gpu_*_util_<i>`` for each index.
            When ``None``, the notebook-level variable of the same name is used.

    Returns:
        DataFrame with a ``rate`` column (net inference rate), one column per
        joined metric (named by stripping the ``value_`` prefix from its join
        suffix) and the aggregates ``summed_rate``, ``summed_instances``,
        ``summed_gpu_tensor_util`` and ``summed_gpu_dram_util``.

    Raises:
        KeyError: if a required key is missing from ``results``, or if a
            label collection is omitted and no notebook-level variable exists.
    """
    # Fall back to the notebook-level variables so one-argument calls keep working.
    if unique_model_versions is None:
        unique_model_versions = globals()["unique_model_versions"]
    if unique_gpu_instances is None:
        unique_gpu_instances = globals()["unique_gpu_instances"]

    # Aggregate metrics: every frame has a "value" column, so the right-hand
    # one is disambiguated with a suffix naming its metric.
    i0 = results["inf_rate_net"]
    for name in (
        "num_instances",
        "inf_reqs_net",
        "inf_req_dur_net",
        "inf_que_dur_net",
        "inf_inp_dur_net",
        "inf_inf_dur_net",
        "inf_out_dur_net",
    ):
        i0 = i0.join(results[name], how="left", rsuffix="_" + name)

    # Add the model metrics
    for model in unique_model_versions:
        model_name = model.split("/")[0]
        itemp = results["inf_rate_" + model].join(
            results["num_instances_" + model],
            how="left",
            lsuffix="_rate_" + model_name,
            rsuffix="_num_instances_" + model_name,
        )
        i0 = i0.join(itemp, how="left")

    # Add the GPU instance metrics
    for mg in range(len(unique_gpu_instances)):
        # fillna returns copies, so the caller's frames stay untouched.
        tensor = results["gpu_tensor_util_" + str(mg)].fillna(0)
        dram = results["gpu_dram_util_" + str(mg)].fillna(0)
        itemp = tensor.join(
            dram,
            how="left",
            lsuffix="_gpu_tensor_util_" + str(mg),
            rsuffix="_gpu_dram_util_" + str(mg),
        )
        i0 = i0.join(itemp, how="left")

    # Get rid of the "value" in names, and fill NaN values with 0 everywhere
    i0 = i0.rename(columns={"value": "rate"})
    i0 = i0.rename(columns={col: col[len("value_"):] for col in i0.columns if col.startswith("value_")})
    i0 = i0.fillna(0)

    # Aggregate some stats for models; only the "rate_" prefix is swapped,
    # so a model name that itself contains "rate_" is left intact.
    rate_prefix = "rate_"
    valid_model_keys = [col for col in i0.columns
                        if col.startswith(rate_prefix)
                        and "num_instances_" + col[len(rate_prefix):] in i0.columns]
    i0["summed_rate"] = sum(i0[col] for col in valid_model_keys)
    i0["summed_instances"] = sum(i0["num_instances_" + col[len(rate_prefix):]] for col in valid_model_keys)

    # Aggregate some stats for GPU instances
    valid_gpu_keys = [col for col in i0.columns
                      if col.startswith("gpu_tensor_util") and col.replace("tensor", "dram") in i0.columns]
    i0["summed_gpu_tensor_util"] = sum(i0[col] for col in valid_gpu_keys)
    # Sum the DRAM columns paired with the tensor columns (not the model rate columns).
    i0["summed_gpu_dram_util"] = sum(i0[col.replace("tensor", "dram")] for col in valid_gpu_keys)
    return i0

In [None]:
# Merge all query results into a single frame and display it.
i0 = convert_results_to_df(results)
i0

In [None]:
# Rescale the summed tensor utilisation so that its peak matches the peak inference rate.
scale_value = max(i0["rate"].values) / max(i0["summed_gpu_tensor_util"])
# Net inference rate over time (default colour).
plt.plot(i0.index.values, i0["rate"].values)
# Rescaled summed tensor utilisation on the same axes, in red.
plt.plot(
    i0.index.values,
    scale_value * i0["summed_gpu_tensor_util"].values,
    color="tab:red",
)

In [None]:
import pickle

# Save the combined metrics frame to disk.
with open("triton_metrics_test.pickle", "wb") as output_file:
    pickle.dump(i0, output_file)

In [None]:
# Only rows with at least one active instance are considered below.
active = i0[i0.num_instances > 0]

# Concurrency question: if models tend to gravitate to their own instances, summed instances ~ num_instances.
# If concurrency is as high as possible, summed instances ~ avg_num_models * num_instances.
ii = active.summed_instances / active.num_instances
print(np.mean(ii), np.max(ii), np.min(ii))
print(np.sqrt(np.var(ii)))

# Consistency check: the per-model rates should always add up to the net rate!
kk = active.summed_rate / active.rate
print(np.mean(kk), np.max(kk), np.min(kk))
print(np.sqrt(np.var(kk)))

In [None]:
# Net rate against instance count, restricted to rows where num_instances > 0.
plt.scatter(
    x="num_instances",
    y="rate",
    color="tab:red",
    data=i0.loc[i0["num_instances"] > 0],
)

In [None]:
# Same scatter for the pn_demo columns, restricted to rows where num_instances > 0.
plt.scatter(
    x="num_instances_pn_demo",
    y="rate_pn_demo",
    color="tab:blue",
    data=i0.loc[i0["num_instances"] > 0],
)

In [None]:
# Same scatter for the svj_tch_gnn columns, restricted to rows where num_instances > 0.
plt.scatter(
    x="num_instances_svj_tch_gnn",
    y="rate_svj_tch_gnn",
    color="tab:green",
    data=i0.loc[i0["num_instances"] > 0],
)