# Imports

In [4]:
# standard library imports
# (none)

# related third party imports
import structlog

# local application/library specific imports
from tools.configurator import (
    get_configs_out,
    get_config_ids,
)
from tools.analyzer import (
    print_table_from_dict,
    get_results_dict,
    merge_all_results,
    create_config_id_print,
)


# Logger for this notebook, created with the module's `__name__`.
logger = structlog.get_logger(__name__)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# ---------------------------------------------------------------------------
# User inputs: edit these values, then re-run the notebook from the top.
# ---------------------------------------------------------------------------

# Experiment name; handed to the configurator/analyzer helpers in later cells.
EXP_NAME = "experiment_20250318"

# Config id of interest (not referenced by the cells visible below).
CONFIG_ID = "llama3_TEMP0.5"

# Table/legend options: forwarded as `legend_exact` and `exclude_metrics`.
LEGEND_EXACT = True
EXCLUDE_METRICS = []

In [6]:
# Hand-written legend labels keyed by config id.
# NOTE: a later cell rebinds CONFIG2LEGEND_DICT to labels generated with
# create_config_id_print, so these entries are not the ones passed on.
CONFIG2LEGEND_DICT = dict(
    [
        ("llama3_TEMP0.5_FEW3", "LLama3 (3-shot, temp 0.5)"),
        ("olmo2:7b_TEMP0.5_FEW3", "Olmo2 (3-shot, temp 0.5)"),
    ]
)

# Display labels for metric keys; forwarded as "metric2legend" in legend_kwargs.
METRIC2LEGEND_DICT = dict(
    acc_student_pred="Accuracy LLM -> student",
    acc_true_student="Accuracy student -> true",
    acc_true_pred="Accuracy LLM -> true",
)

In [7]:
# Configs stored for EXP_NAME and their ids; config_dict pairs each id with
# its config by position.
configs = get_configs_out(EXP_NAME)
config_ids = get_config_ids(configs)
config_dict = dict(zip(config_ids, configs))

# Rebind the legend mapping: one generated label per config id.
CONFIG2LEGEND_DICT = {cid: create_config_id_print(cid) for cid in config_ids}

# Legend options bundled once so they can be unpacked with ** into the
# reporting helpers.
legend_kwargs = dict(
    config2legend=CONFIG2LEGEND_DICT,
    legend_exact=LEGEND_EXACT,
    metric2legend=METRIC2LEGEND_DICT,
)

In [8]:
# Merge the results for all configs of this experiment.
# run_id_dict keeps whatever merge_all_results returns
# (presumably the run ids per config -- TODO confirm against tools.analyzer).
run_id_dict = merge_all_results(EXP_NAME, config_ids)

[2m2025-03-19 11:24:25[0m [[32m[1minfo     [0m] [1mMerging runs [1] in: output/experiment_20250318/llama3~T0.0~Srandom~F3.pickle[0m
[2m2025-03-19 11:24:25[0m [[32m[1minfo     [0m] [1mMerging runs [1] in: output/experiment_20250318/olmo2:7b~T0.0~Sstudentid_random~F3.pickle[0m
[2m2025-03-19 11:24:25[0m [[32m[1minfo     [0m] [1mMerging runs [1] in: output/experiment_20250318/llama3~T0.0~Sstudentid_random~F3.pickle[0m
[2m2025-03-19 11:24:25[0m [[32m[1minfo     [0m] [1mMerging runs [1] in: output/experiment_20250318/olmo2:7b~T0.0~Srandom~F3.pickle[0m


# Test set performance

In [9]:
# Gather the results for every config id.
# NOTE(review): run_id=None presumably selects the merged results -- confirm.
results_dict = get_results_dict(exp_name=EXP_NAME, config_ids=config_ids, run_id=None)

# Print the results table: values shown with 3 decimals, the metrics listed in
# EXCLUDE_METRICS left out, and the shared legend options applied.
print_table_from_dict(
    exp_name=EXP_NAME,
    eval_dict=results_dict,
    decimals=3,
    exclude_metrics=EXCLUDE_METRICS,
    **legend_kwargs,
)

[2m2025-03-19 11:24:25[0m [[32m[1minfo     [0m] [1mLoading checkpoint            [0m [36moutput_path[0m=[35moutput/experiment_20250318/llama3~T0.0~Srandom~F3.pickle[0m
[2m2025-03-19 11:24:25[0m [[32m[1minfo     [0m] [1mLoading checkpoint            [0m [36moutput_path[0m=[35moutput/experiment_20250318/olmo2:7b~T0.0~Sstudentid_random~F3.pickle[0m
[2m2025-03-19 11:24:25[0m [[32m[1minfo     [0m] [1mLoading checkpoint            [0m [36moutput_path[0m=[35moutput/experiment_20250318/llama3~T0.0~Sstudentid_random~F3.pickle[0m
[2m2025-03-19 11:24:25[0m [[32m[1minfo     [0m] [1mLoading checkpoint            [0m [36moutput_path[0m=[35moutput/experiment_20250318/olmo2:7b~T0.0~Srandom~F3.pickle[0m
+-----------------------------------------------+---------------------------+----------------------------+------------------------+----------------+
| Config                                        | Accuracy LLM -> student   | Accuracy student -> true   | Accu

  stderror = scipy.stats.sem(ary, ddof=1, axis=axis)
