# Imports

In [None]:
# standard library imports
# (none)

# related third party imports
import structlog

# local application/library specific imports
from tools.configurator import (
    get_configs_out,
    get_config_ids,
)
from tools.analyzer import (
    print_table_from_dict,
    get_results_dict,
    merge_all_results,
    create_config_id_print,
)


# Module-level structlog logger, created with this module's __name__.
# NOTE(review): not referenced in the visible cells — confirm it is used
# further down before removing it.
logger = structlog.get_logger(__name__)

In [None]:
##### INPUTS #####
# Experiment name handed to the configurator/analyzer helpers as `exp_name`.
# Alternative value kept from the original note: "experiment_20250318"
EXP_NAME: str = "experiment"

# Identifier of a single configuration.
# NOTE(review): not referenced in the visible cells — presumably consumed
# further down; verify.
CONFIG_ID: str = "llama3~T0.0~Srandom~F3"

# Forwarded as `exclude_metrics` when the results table is printed.
EXCLUDE_METRICS: list = []

# Forwarded as `legend_exact` inside the legend keyword arguments.
LEGEND_EXACT: bool = True

In [None]:
# Maps each metric key to the human-readable label shown for it; the mapping
# is passed on as `metric2legend` in the legend keyword arguments.
METRIC2LEGEND_DICT: dict = {
    "acc_student_pred": "Accuracy LLM -> student",
    "acc_true_student": "Accuracy student -> true",
    "acc_true_pred": "Accuracy LLM -> true",
}

In [None]:
# Fetch the experiment's configurations and the identifiers derived from them.
configs = get_configs_out(EXP_NAME)
config_ids = get_config_ids(configs)

# Look-up table: configuration id -> configuration, paired by position.
config_dict = dict(zip(config_ids, configs))

# Printable label for every configuration id.
CONFIG2LEGEND_DICT = {
    cfg_id: create_config_id_print(cfg_id) for cfg_id in config_ids
}

# Legend-related keyword arguments, bundled so they can be splatted into the
# table-printing call with `**legend_kwargs`.
legend_kwargs = {
    "config2legend": CONFIG2LEGEND_DICT,
    "legend_exact": LEGEND_EXACT,
    "metric2legend": METRIC2LEGEND_DICT,
}

In [None]:
# Merge the results of every configuration of the experiment.
# The return value is kept in `run_id_dict`.
# NOTE(review): `run_id_dict` is not read in the visible cells — presumably a
# mapping keyed by config id or run id; confirm against `merge_all_results`.
run_id_dict = merge_all_results(EXP_NAME, config_ids)

# Test set performance

In [None]:
# Gather the results of all configurations of the experiment.
# NOTE(review): `run_id=None` presumably selects the merged/aggregate results
# rather than one specific run — confirm against `get_results_dict`.
results_dict = get_results_dict(exp_name=EXP_NAME, config_ids=config_ids, run_id=None)

# Print the results as a table: values shown with 3 decimals, metrics listed
# in EXCLUDE_METRICS passed as exclusions, legend options taken from
# `legend_kwargs`.
print_table_from_dict(
    eval_dict=results_dict, exp_name=EXP_NAME,
    exclude_metrics=EXCLUDE_METRICS, decimals=3,
    **legend_kwargs,
)