In [None]:
import argparse
import csv
import json
import os

import numpy as np
import pandas as pd

from benchmark import Benchmark

# Workload definition files; the per-domain cell matches the `workload`
# column against names of the form "<domain>.json".
workload_names = [
    "archeology.json",
    "astronomy.json",
    # These must be two separate entries: without the comma Python's implicit
    # string-literal concatenation yields "biomedical.jsonenvironment.json".
    "biomedical.json",
    "environment.json",
    "legal.json",
    "wildfire.json",
]


# Maps the system-under-test identifier (compared against the `sut` column of
# the aggregated results) to the short display name substituted into the
# generated LaTeX tables.
sys_names = {
    'BaselineLLMSystemDeepseekR1FewShot': 'DeepSeek-R1',
    # Display label spelling fixed ("Intruct" -> "Instruct"); the key is left
    # as-is because it has to match the identifiers stored in the CSV.
    'BaselineLLMSystemLlama3_3InstructFewShot': 'Llama3-3Instruct',
    'BaselineLLMSystemQwen2_5CoderFewShot': 'Qwen2-5Coder'
}

In [None]:
# Aggregated benchmark results; the cells below read the columns `sut`,
# `metric`, `workload`, `value_mean` and `value_support` from this frame.
aggregated_result_filepath = "./results/aggregated_results.csv"

# Load once: the file is the same for every system, so re-reading it inside
# the loop only repeated the same I/O.
df = pd.read_csv(aggregated_result_filepath)

# sut_metrics[<system id>][<metric name>] = unweighted mean of `value_mean`
# over all rows recorded for that system and metric.
sut_metrics = {}
for sut_name in sys_names:
    # Restrict to the current system first instead of grouping over every
    # system and discarding all but one.
    sut_rows = df[df["sut"] == sut_name]
    # Series.mean() skips NaN entries by default, so no explicit dropna() is
    # needed; a system with no rows yields an empty dict.
    metric_aggregation_dict = (
        sut_rows.groupby("metric")["value_mean"].mean().to_dict()
    )
    sut_metrics[sut_name] = metric_aggregation_dict

# Columns to report, in the order they should appear in the table.
metrics = ['bleu', 'llm_code_eval', 'f1', 'mean_absolute_error', 'precision', 'recall', 'rouge', 'success']

# One row per system, one column per metric; every value is scaled by 100.
metrics_df = pd.DataFrame.from_dict(sut_metrics, orient="index")
metrics_df = metrics_df[metrics] * 100

display(metrics_df)

# Left-aligned index column followed by one centred column per metric.
column_spec = "l" + "c" * len(metrics_df.columns)
ltx_table = metrics_df.to_latex(
    index=True,
    label="tab:metrics",
    caption="Metrics for different systems.",
    float_format="%.2f",
    column_format=column_spec,
)

# Swap the raw system identifiers for their display names; each name is
# prefixed with an extra "& " cell separator.
for sut_name, display_name in sys_names.items():
    ltx_table = ltx_table.replace(sut_name, "& " + display_name)

print(ltx_table)

Unnamed: 0,bleu,llm_code_eval,f1,mean_absolute_error,precision,recall,rouge,success
BaselineLLMSystemDeepseekR1,2.591152,5.555556,14.7555,,0.333333,1.111111,8.156356,4.578877
BaselineLLMSystemLlama3_3Intruct,1.619408,12.050265,17.313495,1270756.76,2.5,5.0,7.386364,2.840909
BaselineLLMSystemQwen2_5Coder,3.708245,9.722222,30.303998,,7.222222,4.444444,2.083333,5.681818


\begin{table}
\caption{Metrics for different systems.}
\label{tab:metrics}
\begin{tabular}{lcccccccc}
\toprule
 & bleu & llm_code_eval & f1 & mean_absolute_error & precision & recall & rouge & success \\
\midrule
& DeepSeek-R1 & 2.59 & 5.56 & 14.76 & NaN & 0.33 & 1.11 & 8.16 & 4.58 \\
& Llama3-3Intruct & 1.62 & 12.05 & 17.31 & 1270756.76 & 2.50 & 5.00 & 7.39 & 2.84 \\
& Qwen2-5Coder & 3.71 & 9.72 & 30.30 & NaN & 7.22 & 4.44 & 2.08 & 5.68 \\
\bottomrule
\end{tabular}
\end{table}



In [None]:
print("Per-domain aggregation:")
# Support-weighted mean of the selected metrics, per domain and overall:
#   sum(value_mean * value_support) / sum(value_support), grouped by system.
domains = ['archeology', 'astronomy', 'biomedical', 'environment', 'legal', 'wildfire']
metrics = ['success', 'llm_paraphrase', 'mean_relative_absolute_error', 'f1']
# NOTE(review): these identifiers differ from the keys of `sys_names` (no
# "FewShot" suffix, "Intruct" spelling) — confirm they match the `sut` values
# in the CSV; if they do not match the `sys_names` keys, the display-name
# substitution at the end of this cell will not fire.
suts = ['BaselineLLMSystemDeepseekR1', 'BaselineLLMSystemLlama3_3Intruct', 'BaselineLLMSystemQwen2_5Coder']

# Numerator term of the weighted mean. Kept as a column on `df` so any later
# cell relying on it keeps working; re-running this cell is idempotent.
# NOTE(review): rows whose value_mean is NaN contribute nothing to the
# numerator but their value_support still counts in the denominator — confirm
# this is the intended treatment of missing measurements.
df['meansupp'] = df['value_mean'] * df['value_support']

# Filter by system and metric once; only the workload filter varies per domain.
selected = df[df['sut'].isin(suts) & df['metric'].isin(metrics)]

results = {}
for domain in domains+['overall']:
    if domain != 'overall':
        sut_df = selected[selected['workload'] == f"{domain}.json"]
    else:
        sut_df = selected
    # Sum only the two numeric columns that are needed, in a single pass,
    # rather than summing every column of the frame twice.
    totals = sut_df.groupby('sut')[['meansupp', 'value_support']].sum()
    results[domain] = totals['meansupp'] / totals['value_support']

domain_df = pd.DataFrame(results)*100
display(domain_df)
ltx_table = domain_df.to_latex(
    index=True,
    # Distinct label: reusing "tab:metrics" from the per-system table makes
    # LaTeX report a multiply-defined label when both tables are included.
    label="tab:domain_metrics",
    caption="Metrics for different domains.",
    float_format="%.2f",
    # Size the column spec from this table, not from `metrics_df`, which has
    # a different number of columns.
    column_format="l" + "c" * len(domain_df.columns),
)
for sys_name in sys_names:
    ltx_table = ltx_table.replace(sys_name, "& "+sys_names[sys_name])

print(ltx_table)


Per-domain aggregation:


Unnamed: 0_level_0,archeology,astronomy,biomedical,environment,legal,wildfire,overall
sut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BaselineLLMSystemDeepseekR1,0.0,1.328904,4.744877,6.024185,15.003979,39.836411,14.344712
BaselineLLMSystemLlama3_3Intruct,0.0,1.807595,5.76199,1.152386,11.739448,39.789076,12.556231
BaselineLLMSystemQwen2_5Coder,0.0,5.549133,5.424347,2.350058,16.853774,39.940112,14.663404


\begin{table}
\caption{Metrics for different domains.}
\label{tab:metrics}
\begin{tabular}{lcccccccc}
\toprule
 & archeology & astronomy & biomedical & environment & legal & wildfire & overall \\
sut &  &  &  &  &  &  &  \\
\midrule
& DeepSeek-R1 & 0.00 & 1.33 & 4.74 & 6.02 & 15.00 & 39.84 & 14.34 \\
& Llama3-3Intruct & 0.00 & 1.81 & 5.76 & 1.15 & 11.74 & 39.79 & 12.56 \\
& Qwen2-5Coder & 0.00 & 5.55 & 5.42 & 2.35 & 16.85 & 39.94 & 14.66 \\
\bottomrule
\end{tabular}
\end{table}

