In [7]:
import argparse
import csv
import json
import os

import numpy as np
import pandas as pd

from benchmark import Benchmark

# File names of the benchmark workloads, one per domain.
# Every entry needs its own trailing comma: two adjacent string literals are
# silently concatenated by Python into a single (non-existent) file name.
workload_names = [
    "archeology.json",
    "astronomy.json",
    "biomedical.json",
    "environment.json",
    "legal.json",
    "wildfire.json",
]


# Maps each system-under-test identifier (as it appears in the `sut` column of
# the results) to the short model name printed in the LaTeX tables.
# The three prompting variants of a model (Naive / OneShot / FewShot) share
# the same display name. Insertion order is the row order of the tables.
sys_names = {
    'BaselineLLMSystemLlama3_3InstructNaive': 'Llama3-3Instruct',
    'BaselineLLMSystemDeepseekR1Naive': 'DeepSeek-R1',
    'BaselineLLMSystemQwen2_5CoderNaive': 'Qwen2-5Coder',
    'BaselineLLMSystemLlama3_3InstructOneShot': 'Llama3-3Instruct',
    'BaselineLLMSystemDeepseekR1OneShot': 'DeepSeek-R1',
    'BaselineLLMSystemQwen2_5CoderOneShot': 'Qwen2-5Coder',
    'BaselineLLMSystemLlama3_3InstructFewShot': 'Llama3-3Instruct',
    'BaselineLLMSystemDeepseekR1FewShot': 'DeepSeek-R1',
    'BaselineLLMSystemQwen2_5CoderFewShot': 'Qwen2-5Coder',
}

In [5]:
# Aggregate the results into one mean score per (system, metric) and render
# the outcome both as a rich DataFrame and as a LaTeX table.
#
# The CSV is read once, before the loop: its content does not depend on the
# system being summarised. `df` is kept as a top-level name because the
# per-domain cell reuses it.
aggregated_result_filepath = "./results/aggregated_results.csv"
df = pd.read_csv(aggregated_result_filepath)

sut_metrics = {}
for sut_name in sys_names:
    sut_rows = df[df["sut"] == sut_name]
    # Unweighted mean of `value_mean` over all rows of each metric; NaN
    # entries are skipped (pandas' default for `mean`).
    sut_metrics[sut_name] = sut_rows.groupby("metric")["value_mean"].mean().to_dict()

# One row per system (in `sys_names` order), one column per metric.
metrics_df = pd.DataFrame.from_dict(sut_metrics, orient="index")
metrics = ['bleu', 'llm_code_eval', 'f1', 'mean_absolute_error', 'precision', 'recall', 'rouge', 'success']
# Every selected column is scaled by 100.
# NOTE(review): this also scales `mean_absolute_error`, which does not look
# like a 0-1 ratio in the displayed output — confirm this is intended.
metrics_df = metrics_df[metrics] * 100

display(metrics_df)
ltx_table = metrics_df.to_latex(
    index=True,
    label="tab:metrics",
    caption="Metrics for different systems.",
    float_format="%.2f",
    # The replacement below prefixes every system name with "& ", which adds
    # one leading cell to each body row. Declare that column as well ("l"),
    # followed by one "c" for the system name and one per metric, so the rows
    # do not overflow the tabular.
    # NOTE(review): the header row produced by `to_latex` is not shifted by
    # this extra cell, so it still needs a leading "&" to line up.
    column_format="l" + "c" * (len(metrics_df.columns) + 1),
)

# Swap the long system identifiers for their short display names.
for sut_name in sys_names:
    ltx_table = ltx_table.replace(sut_name, "& " + sys_names[sut_name])

print(ltx_table)

Unnamed: 0,bleu,llm_code_eval,f1,mean_absolute_error,precision,recall,rouge,success
BaselineLLMSystemDeepseekR1Naive,2.626221,6.740079,20.803454,14050.0,2.01249,2.609796,3.842177,0.0
BaselineLLMSystemLlama3_3InstructNaive,8.751508,13.878307,16.664995,,5.467286,6.085015,7.275907,0.0
BaselineLLMSystemQwen2_5CoderNaive,2.107671,21.78836,12.16902,,2.038493,3.571717,2.685407,0.0
BaselineLLMSystemDeepseekR1OneShot,4.017527,4.777778,12.857579,,3.01179,4.063017,8.097608,0.287356
BaselineLLMSystemLlama3_3InstructOneShot,2.947388,3.0,14.563555,,4.029784,5.415947,3.554972,0.37037
BaselineLLMSystemQwen2_5CoderOneShot,1.824298,21.488095,13.462146,,2.227447,3.801885,6.38964,0.37037
BaselineLLMSystemDeepseekR1FewShot,2.591152,5.555556,14.7555,,0.333333,1.111111,8.156356,4.578877
BaselineLLMSystemLlama3_3InstructFewShot,1.619408,12.050265,17.313495,1270756.76,2.5,5.0,7.386364,2.840909
BaselineLLMSystemQwen2_5CoderFewShot,3.708245,9.722222,30.303998,,7.222222,4.444444,2.083333,5.681818


\begin{table}
\caption{Metrics for different systems.}
\label{tab:metrics}
\begin{tabular}{lcccccccc}
\toprule
 & bleu & llm_code_eval & f1 & mean_absolute_error & precision & recall & rouge & success \\
\midrule
& DeepSeek-R1 & 2.63 & 6.74 & 20.80 & 14050.00 & 2.01 & 2.61 & 3.84 & 0.00 \\
& Llama3-3Intruct & 8.75 & 13.88 & 16.66 & NaN & 5.47 & 6.09 & 7.28 & 0.00 \\
& Qwen2-5Coder & 2.11 & 21.79 & 12.17 & NaN & 2.04 & 3.57 & 2.69 & 0.00 \\
& DeepSeek-R1 & 4.02 & 4.78 & 12.86 & NaN & 3.01 & 4.06 & 8.10 & 0.29 \\
& Llama3-3Intruct & 2.95 & 3.00 & 14.56 & NaN & 4.03 & 5.42 & 3.55 & 0.37 \\
& Qwen2-5Coder & 1.82 & 21.49 & 13.46 & NaN & 2.23 & 3.80 & 6.39 & 0.37 \\
& DeepSeek-R1 & 2.59 & 5.56 & 14.76 & NaN & 0.33 & 1.11 & 8.16 & 4.58 \\
& Llama3-3Intruct & 1.62 & 12.05 & 17.31 & 1270756.76 & 2.50 & 5.00 & 7.39 & 2.84 \\
& Qwen2-5Coder & 3.71 & 9.72 & 30.30 & NaN & 7.22 & 4.44 & 2.08 & 5.68 \\
\bottomrule
\end{tabular}
\end{table}



In [None]:
print("Per-domain aggregation:")
# Support-weighted mean of the selected metrics, computed per system for each
# domain and once more over all domains ("overall"):
#     sum(value_mean * value_support) / sum(value_support)
domains = ['archeology', 'astronomy', 'biomedical', 'environment', 'legal', 'wildfire']
metrics = ['success', 'llm_paraphrase', 'mean_relative_absolute_error', 'f1']
suts = list(sys_names.keys())

# Idempotent: re-running the cell recomputes the same column on `df`.
df['meansupp'] = df['value_mean'] * df['value_support']

# Restrict once to the systems and metrics of interest; the per-domain filter
# is applied inside the loop.
selected = df[df['sut'].isin(suts) & df['metric'].isin(metrics)]

results = {}
for domain in domains + ['overall']:
    if domain != 'overall':
        sut_df = selected[selected['workload'] == f"{domain}.json"]
    else:
        sut_df = selected
    # Sum only the two numeric columns that are needed, in a single pass;
    # summing the whole frame would also try to "sum" the string columns.
    # NOTE(review): rows whose `value_mean` is NaN are skipped in the
    # numerator but their `value_support` still counts in the denominator —
    # confirm that is the intended weighting.
    totals = sut_df.groupby('sut')[['meansupp', 'value_support']].sum()
    results[domain] = totals['meansupp'] / totals['value_support']

# Scale by 100 and order the rows like `sys_names`.
domain_df = pd.DataFrame(results) * 100
domain_df = domain_df.reindex(suts)

display(domain_df)
ltx_table = domain_df.to_latex(
    index=True,
    # Distinct from the per-system table's "tab:metrics" so both tables can
    # live in the same LaTeX document without a duplicate-label warning.
    label="tab:domain_metrics",
    caption="Metrics for different domains.",
    float_format="%.2f",
    # Sized from this table's own columns: one "l" for the leading cell opened
    # by the "& " prefix added below, then one "c" for the system name and one
    # per domain column.
    column_format="l" + "c" * (len(domain_df.columns) + 1),
)
# Swap the long system identifiers for their short display names.
for sys_name in sys_names:
    ltx_table = ltx_table.replace(sys_name, "& " + sys_names[sys_name])

print(ltx_table)


Per-domain aggregation:


Unnamed: 0_level_0,archeology,astronomy,biomedical,environment,legal,wildfire,overall
sut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BaselineLLMSystemLlama3_3InstructNaive,3.124224,2.748194,6.953459,2.543302,12.336846,34.625024,11.470289
BaselineLLMSystemDeepseekR1Naive,1.13284,3.438176,5.080677,4.472612,13.204679,33.084668,11.990435
BaselineLLMSystemQwen2_5CoderNaive,1.274417,3.46791,5.194229,2.45564,9.797337,32.967272,10.305872
BaselineLLMSystemLlama3_3InstructOneShot,1.863691,2.579835,5.94617,3.021397,11.75223,34.668798,11.216819
BaselineLLMSystemDeepseekR1OneShot,1.427483,2.868674,4.479036,3.677805,13.238037,34.153308,11.537398
BaselineLLMSystemQwen2_5CoderOneShot,1.265998,2.574294,4.834084,3.075782,11.322222,36.206989,11.275547
BaselineLLMSystemLlama3_3InstructFewShot,0.0,1.807595,5.76199,1.152386,11.739448,39.789076,12.556231
BaselineLLMSystemDeepseekR1FewShot,0.0,1.328904,4.744877,6.024185,15.003979,39.836411,14.344712
BaselineLLMSystemQwen2_5CoderFewShot,0.0,5.549133,5.424347,2.350058,16.853774,39.940112,14.663404


\begin{table}
\caption{Metrics for different domains.}
\label{tab:metrics}
\begin{tabular}{lcccccccc}
\toprule
 & archeology & astronomy & biomedical & environment & legal & wildfire & overall \\
sut &  &  &  &  &  &  &  \\
\midrule
& Llama3-3Intruct & 3.12 & 2.75 & 6.95 & 2.54 & 12.34 & 34.63 & 11.47 \\
& DeepSeek-R1 & 1.13 & 3.44 & 5.08 & 4.47 & 13.20 & 33.08 & 11.99 \\
& Qwen2-5Coder & 1.27 & 3.47 & 5.19 & 2.46 & 9.80 & 32.97 & 10.31 \\
& Llama3-3Intruct & 1.86 & 2.58 & 5.95 & 3.02 & 11.75 & 34.67 & 11.22 \\
& DeepSeek-R1 & 1.43 & 2.87 & 4.48 & 3.68 & 13.24 & 34.15 & 11.54 \\
& Qwen2-5Coder & 1.27 & 2.57 & 4.83 & 3.08 & 11.32 & 36.21 & 11.28 \\
& Llama3-3Intruct & 0.00 & 1.81 & 5.76 & 1.15 & 11.74 & 39.79 & 12.56 \\
& DeepSeek-R1 & 0.00 & 1.33 & 4.74 & 6.02 & 15.00 & 39.84 & 14.34 \\
& Qwen2-5Coder & 0.00 & 5.55 & 5.42 & 2.35 & 16.85 & 39.94 & 14.66 \\
\bottomrule
\end{tabular}
\end{table}

