In [6]:
import argparse
import csv
import json
import os

import numpy as np
import pandas as pd

#from benchmark import Benchmark

workload_names = [
    "archeology.json",
    "astronomy.json",
    "biomedical.json" "environment.json",
    "legal.json",
    "wildfire.json",
]


sys_names = {
    'BaselineLLMSystemGPTo3Naive': 'GPT-o3',
    'BaselineLLMSystemGPT4oNaive': 'GPT-4o',
    'BaselineLLMSystemClaude35Naive': 'Claude-3-5',
    'BaselineLLMSystemLlama3_3InstructNaive': 'Llama3_3Instruct',
    'BaselineLLMSystemDeepSeekR1Naive': 'DeepSeek-R1',
    'BaselineLLMSystemQwen2_5CoderNaive': 'Qwen2_5Coder',
    
    'BaselineLLMSystemGPTo3OneShot': 'GPT-o3',
    'BaselineLLMSystemGPT4oOneShot': 'GPT-4o',
    'BaselineLLMSystemClaude35OneShot': 'Claude-3-5',
    'BaselineLLMSystemLlama3_3InstructOneShot': 'Llama3_3Instruct',
    'BaselineLLMSystemDeepSeekR1OneShot': 'DeepSeek-R1',
    'BaselineLLMSystemQwen2_5CoderOneShot': 'Qwen2_5Coder',
    
    'BaselineLLMSystemGPTo3FewShot': 'GPT-o3',
    'BaselineLLMSystemGPT4oFewShot': 'GPT-4o',
    'BaselineLLMSystemClaude35FewShot': 'Claude-3-5',
    'BaselineLLMSystemLlama3_3InstructFewShot': 'Llama3_3Instruct',
    'BaselineLLMSystemDeepSeekR1FewShot': 'DeepSeek-R1',
    'BaselineLLMSystemQwen2_5CoderFewShot': 'Qwen2_5Coder',
}

In [7]:
sut_metrics = {}
for sut_name in sys_names:

    aggregated_result_filepath = "./results/aggregated_results.csv"

    df = pd.read_csv(aggregated_result_filepath)
    metric_aggregation_dict = {}
    for (sut, metric), group in df.groupby(["sut", "metric"]):
        if sut != sut_name:
            continue
        group_dropped_na = group.dropna()
        metric_aggregation_dict[metric] = group["value_mean"].mean()
    # print(f"Aggregated results for {sut_name}:")
    # print(metric_aggregation_dict)
    sut_metrics[sut_name] = metric_aggregation_dict

metrics_df = pd.DataFrame.from_dict(sut_metrics, orient="index")
metrics = ['bleu', 'llm_code_eval', 'f1', 'mean_absolute_error', 'precision', 'recall', 'rouge', 'success', 'runtime']
for m in metrics:
    if m != 'runtime':
        metrics_df[m] = metrics_df[m]*100

display(metrics_df)
ltx_table = metrics_df.to_latex(
    index=True,
    label="tab:metrics",
    caption="Metrics for different systems.",
    float_format="%.2f",
    column_format="l" + "c" * len(metrics_df.columns),
)

for sut_name in sys_names:
    ltx_table = ltx_table.replace(sut_name, "& "+sys_names[sut_name])

print(ltx_table)

Unnamed: 0,bleu,f1,f1_approximate,llm_code_eval,llm_paraphrase,mean_absolute_error,mean_relative_absolute_error,mean_squared_error,precision,rae_score,recall,rouge,runtime,string_bootstrap,success,total_token_usage_answers,total_token_usage_pipeline
BaselineLLMSystemGPTo3Naive,28.429437,15.887608,0.0,57.666304,0.163889,75676.795,9046.777,1145387.0,20.662577,4523.417257,25.775432,30.463175,7.509467,0.293004,16.897737,15.233077,1761.477116
BaselineLLMSystemGPT4oNaive,19.650903,14.813657,0.0,46.470033,0.055556,1316.28,8332.667,173.2593,7.492504,4166.333375,9.343542,19.432624,4.791857,0.194326,7.501252,15.502316,1232.749868
BaselineLLMSystemClaude35Naive,16.716076,16.405441,0.0,49.507407,0.074074,,9999.0,,12.84937,4999.50005,17.205879,17.239587,3.224136,0.150624,7.717128,11.626596,1273.222222
BaselineLLMSystemGPTo3OneShot,36.072754,19.75971,0.0,60.232017,0.188889,918.1775,8094.498,267.8764,26.941128,4047.364212,36.268587,41.738426,10.162886,0.329543,27.216814,15.407115,1455.797222
BaselineLLMSystemGPT4oOneShot,13.02473,16.889501,0.0,44.951074,0.111111,11.5,7856.497,0.02645,13.63349,3928.30969,17.56798,15.05168,5.34847,0.138889,12.184503,15.532867,1122.065741
BaselineLLMSystemClaude35OneShot,19.509739,14.78998,0.0,57.140101,0.163889,0.02,65828450000.0,1.2e-07,15.022658,3095.138492,22.083579,23.114951,4.75562,0.204293,15.402336,15.27909,1200.506481
BaselineLLMSystemGPTo3FewShot,38.395166,24.370241,0.0,59.654967,0.219444,70.527942,6427.966,3.665125,37.983075,3214.291177,45.304568,49.890953,14.699095,0.402606,36.122221,15.267112,1419.10291
BaselineLLMSystemGPT4oFewShot,33.283642,20.977183,0.0,37.846939,0.136111,437.822255,6910.738,112.5732,21.785575,3333.253289,26.176795,35.814979,8.714452,0.346131,23.852744,15.484927,961.833598
BaselineLLMSystemClaude35FewShot,29.671255,20.427169,0.0,54.499021,0.163889,141.555883,8094.467,7.332651,30.889769,4047.373737,39.874849,39.837067,14.601052,0.294523,24.091471,15.401815,1173.692857


\begin{table}
\centering
\caption{Metrics for different systems.}
\label{tab:metrics}
\begin{tabular}{lccccccccccccccccc}
\toprule
{} &  bleu &    f1 &  f1\_approximate &  llm\_code\_eval &  llm\_paraphrase &  mean\_absolute\_error &  mean\_relative\_absolute\_error &  mean\_squared\_error &  precision &  rae\_score &  recall &  rouge &  runtime &  string\_bootstrap &  success &  total\_token\_usage\_answers &  total\_token\_usage\_pipeline \\
\midrule
& GPT-o3      & 28.43 & 15.89 &            0.00 &          57.67 &            0.16 &             75676.79 &                       9046.78 &          1145386.53 &      20.66 &    4523.42 &   25.78 &  30.46 &     7.51 &              0.29 &    16.90 &                      15.23 &                     1761.48 \\
& GPT-4o      & 19.65 & 14.81 &            0.00 &          46.47 &            0.06 &              1316.28 &                       8332.67 &              173.26 &       7.49 &    4166.33 &    9.34 &  19.43 &     4.79 &              0.1

  ltx_table = metrics_df.to_latex(


In [4]:
print("Per-domain aggregation:")
# Calculate the weighted mean of the following metrics per domain
domains = [
    "archeology",
    "astronomy",
    "biomedical", "environment",
    "legal",
    "wildfire",
]
metrics = ['success', 'llm_paraphrase', 'rae_score', 'f1']
suts = list(sys_names.keys())
# measures = {}
# for sut in df['sut'].unique():
#     for workload_name in workload_names:
#         df['weighted_metric'] = df['value_support'] + df['value_mean']
#         x = df.groupby(['sut', 'workload'])['value_support'].sum()
#         supports = dict(x)
#         measures = []
#         for key in supports.keys():
#             sut, workload = key
#             sys_domain_measure = df[df['sut'] == sut][df['workload'] == workload]['value_mean'] * df[df['sut'] == sut][df['workload'] == workload]['value_support'] / supports[key]
df['meansupp'] = df['value_mean'] * df['value_support']
results = {}
for domain in domains+['overall', 'runtime']:
    if domain not in ['overall', 'runtime']:
        sut_df = df.query(f'sut in {suts} and workload == "{domain}.json" and metric in {metrics}')
        x = sut_df.groupby(['sut']).sum()['meansupp']
        y = sut_df.groupby(['sut']).sum()['value_support']
        results[domain] = x/y

    elif domain == 'overall':
        sut_df = df.query(f'sut in {suts} and metric in {metrics}')
        x = sut_df.groupby(['sut']).sum()['meansupp']
        y = sut_df.groupby(['sut']).sum()['value_support']
        results[domain] = x/y

    elif domain == 'runtime':
        sut_df = df.query(f'sut in {suts} and metric == "runtime"')
        x = sut_df.groupby(['sut']).sum()['meansupp']
        y = sut_df.groupby(['sut']).sum()['value_support']
        results[domain] = x/y


domain_df = pd.DataFrame(results)*100
domain_df = domain_df.reindex(suts)

display(domain_df)
#ltx_table = domain_df.to_latex(
#    index=True,
#    label="tab:metrics",
#    caption="Metrics for different domains.",
#    float_format="%.2f",
#    column_format="l" + "c" * len(metrics_df.columns),
#)
#for sys_name in sys_names:
#    ltx_table = ltx_table.replace(sys_name, "& "+sys_names[sys_name])

#print(ltx_table)
domain_df.to_markdown(index=True, tablefmt="pipe", floatfmt=".4f")

Per-domain aggregation:


  x = sut_df.groupby(['sut']).sum()['meansupp']
  y = sut_df.groupby(['sut']).sum()['value_support']
  x = sut_df.groupby(['sut']).sum()['meansupp']
  y = sut_df.groupby(['sut']).sum()['value_support']
  x = sut_df.groupby(['sut']).sum()['meansupp']
  y = sut_df.groupby(['sut']).sum()['value_support']
  x = sut_df.groupby(['sut']).sum()['meansupp']
  y = sut_df.groupby(['sut']).sum()['value_support']
  x = sut_df.groupby(['sut']).sum()['meansupp']
  y = sut_df.groupby(['sut']).sum()['value_support']
  x = sut_df.groupby(['sut']).sum()['meansupp']
  y = sut_df.groupby(['sut']).sum()['value_support']
  x = sut_df.groupby(['sut']).sum()['meansupp']
  y = sut_df.groupby(['sut']).sum()['value_support']
  x = sut_df.groupby(['sut']).sum()['meansupp']
  y = sut_df.groupby(['sut']).sum()['value_support']


Unnamed: 0_level_0,archeology,astronomy,biomedical,environment,legal,wildfire,overall,runtime
sut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BaselineLLMSystemGPTo3Naive,17.839651,27407.455658,19.477334,18.675133,9578.367775,37327.716189,12889.54414,660.371373
BaselineLLMSystemGPT4oNaive,15.091342,27403.664806,12.162825,11.258023,4793.572122,52240.728279,14314.913551,401.188829
BaselineLLMSystemClaude35Naive,16.521325,27405.14967,9.866901,12.511544,9578.223662,,7090.566269,288.831116
BaselineLLMSystemLlama3_3InstructNaive,,,,,,,,
BaselineLLMSystemDeepSeekR1Naive,,,,,,,,
BaselineLLMSystemQwen2_5CoderNaive,,,,,,,,
BaselineLLMSystemGPTo3OneShot,23.903794,27415.659472,18.28845,28.477743,9586.911059,22415.596928,10037.056912,906.637232
BaselineLLMSystemGPT4oOneShot,14.255286,13709.413429,9.383845,20.374575,9579.38362,44791.949765,12889.481093,528.040158
BaselineLLMSystemClaude35OneShot,17.065971,13707.49721,9.440771,22.26516,4796.633234,44790.920727,11459.798255,393.675362
BaselineLLMSystemLlama3_3InstructOneShot,,,,,,,,


'| sut                                      |   archeology |   astronomy |   biomedical |   environment |     legal |   wildfire |    overall |   runtime |\n|:-----------------------------------------|-------------:|------------:|-------------:|--------------:|----------:|-----------:|-----------:|----------:|\n| BaselineLLMSystemGPTo3Naive              |      17.8397 |  27407.4557 |      19.4773 |       18.6751 | 9578.3678 | 37327.7162 | 12889.5441 |  660.3714 |\n| BaselineLLMSystemGPT4oNaive              |      15.0913 |  27403.6648 |      12.1628 |       11.2580 | 4793.5721 | 52240.7283 | 14314.9136 |  401.1888 |\n| BaselineLLMSystemClaude35Naive           |      16.5213 |  27405.1497 |       9.8669 |       12.5115 | 9578.2237 |   nan      |  7090.5663 |  288.8311 |\n| BaselineLLMSystemLlama3_3InstructNaive   |     nan      |    nan      |     nan      |      nan      |  nan      |   nan      |   nan      |  nan      |\n| BaselineLLMSystemDeepSeekR1Naive         |     nan      |    

In [8]:
print("Per-domain aggregation:")
# Calculate the weighted mean of the following metrics per domain
domains = [
    "archeology",
    "astronomy",
    "biomedical", "environment",
    "legal",
    "wildfire",
]
metrics = ["success", "llm_paraphrase", "rae_score", "f1"]
suts = list(sys_names.keys())
# measures = {}
# for sut in df['sut'].unique():
#     for workload_name in workload_names:
#         df['weighted_metric'] = df['value_support'] + df['value_mean']
#         x = df.groupby(['sut', 'workload'])['value_support'].sum()
#         supports = dict(x)
#         measures = []
#         for key in supports.keys():
#             sut, workload = key
#             sys_domain_measure = df[df['sut'] == sut][df['workload'] == workload]['value_mean'] * df[df['sut'] == sut][df['workload'] == workload]['value_support'] / supports[key]

df.loc[df["metric"] == "rae_score", "value_mean"] = 1 / (1 + df["value_mean"])
df["meansupp"] = df["value_mean"] * df["value_support"]
# change value_mean of rae_score to 1/(1+ value_mean)

results = {}
for domain in domains + ["overall", "runtime"]:
    if domain not in ["overall", "runtime", "rae_score"]:
        sut_df = df.query(
            f'sut in {suts} and workload == "{domain}.json" and metric in {metrics}'
        )
        x = sut_df.groupby(["sut"]).sum()["meansupp"]
        y = sut_df.groupby(["sut"]).sum()["value_support"]
        results[domain] = x / y

    elif domain == "overall":
        sut_df = df.query(f"sut in {suts} and metric in {metrics}")
        x = sut_df.groupby(["sut"]).sum()["meansupp"]
        y = sut_df.groupby(["sut"]).sum()["value_support"]
        results[domain] = x / y

    elif domain == "runtime":
        sut_df = df.query(f'sut in {suts} and metric == "runtime"')
        x = sut_df.groupby(["sut"]).sum()["meansupp"]
        y = sut_df.groupby(["sut"]).sum()["value_support"]
        results[domain] = x / y


domain_df = pd.DataFrame(results) * 100
domain_df = domain_df.reindex(suts)

display(domain_df)
ltx_table = domain_df.to_latex(
    index=True,
    label="tab:metrics",
    caption="Metrics for different domains.",
    float_format="%.2f",
    column_format="l" + "c" * len(metrics_df.columns),
)
for sys_name in sys_names:
    ltx_table = ltx_table.replace(sys_name, "& " + sys_names[sys_name])

print(ltx_table)

Per-domain aggregation:


  x = sut_df.groupby(["sut"]).sum()["meansupp"]
  y = sut_df.groupby(["sut"]).sum()["value_support"]
  x = sut_df.groupby(["sut"]).sum()["meansupp"]
  y = sut_df.groupby(["sut"]).sum()["value_support"]
  x = sut_df.groupby(["sut"]).sum()["meansupp"]
  y = sut_df.groupby(["sut"]).sum()["value_support"]
  x = sut_df.groupby(["sut"]).sum()["meansupp"]
  y = sut_df.groupby(["sut"]).sum()["value_support"]
  x = sut_df.groupby(["sut"]).sum()["meansupp"]
  y = sut_df.groupby(["sut"]).sum()["value_support"]
  x = sut_df.groupby(["sut"]).sum()["meansupp"]
  y = sut_df.groupby(["sut"]).sum()["value_support"]
  x = sut_df.groupby(["sut"]).sum()["meansupp"]
  y = sut_df.groupby(["sut"]).sum()["value_support"]
  x = sut_df.groupby(["sut"]).sum()["meansupp"]
  y = sut_df.groupby(["sut"]).sum()["value_support"]


Unnamed: 0_level_0,archeology,astronomy,biomedical,environment,legal,wildfire,overall,runtime
sut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BaselineLLMSystemGPTo3Naive,17.839651,12.935932,19.477334,18.675133,9.947009,16.134503,14.934947,660.371373
BaselineLLMSystemGPT4oNaive,15.091342,9.145079,12.162825,11.258023,8.883844,7.147757,10.05006,401.188829
BaselineLLMSystemClaude35Naive,16.521325,10.629944,9.866901,12.511544,9.802897,,11.628428,288.831116
BaselineLLMSystemLlama3_3InstructNaive,,,,,,,,
BaselineLLMSystemDeepSeekR1Naive,,,,,,,,
BaselineLLMSystemQwen2_5CoderNaive,,,,,,,,
BaselineLLMSystemGPTo3OneShot,23.903794,21.139746,18.28845,28.477743,18.490293,25.082778,22.852655,906.637232
BaselineLLMSystemGPT4oOneShot,14.255286,10.578677,9.383845,20.374575,10.962854,19.207521,14.856862,528.040158
BaselineLLMSystemClaude35OneShot,17.065971,10.23899,9.440771,22.26516,11.466487,17.926161,15.476406,393.675362
BaselineLLMSystemLlama3_3InstructOneShot,,,,,,,,


\begin{table}
\centering
\caption{Metrics for different domains.}
\label{tab:metrics}
\begin{tabular}{lccccccccccccccccc}
\toprule
{} &  archeology &  astronomy &  biomedical &  environment &  legal &  wildfire &  overall &  runtime \\
sut                                      &             &            &             &              &        &           &          &          \\
\midrule
& GPT-o3              &       17.84 &      12.94 &       19.48 &        18.68 &   9.95 &     16.13 &    14.93 &   660.37 \\
& GPT-4o              &       15.09 &       9.15 &       12.16 &        11.26 &   8.88 &      7.15 &    10.05 &   401.19 \\
& Claude-3-5           &       16.52 &      10.63 &        9.87 &        12.51 &   9.80 &       NaN &    11.63 &   288.83 \\
BaselineLLMSystemLlama3\_3InstructNaive   &         NaN &        NaN &         NaN &          NaN &    NaN &       NaN &      NaN &      NaN \\
& DeepSeek-R1         &         NaN &        NaN &         NaN &          NaN &    NaN &       

  ltx_table = domain_df.to_latex(
