# Imports

In [31]:
# standard library imports
# /

# related third party imports
import os
import pickle
import structlog
import pandas as pd

# local application/library specific imports
from tools.configurator import (
    get_configs_out,
    get_config_ids,
)
from tools.analyzer import (
    print_table_from_dict,
    get_results_dict,
    merge_all_results,
    create_config_id_print,
)


# Structured logger for this notebook, named after the current module.
logger = structlog.get_logger(__name__)

In [36]:
##### INPUTS #####
# Metrics hidden from the per-experiment table printed below.
EXCLUDE_METRICS = ["val_acc_true_student", "val_acc_true_pred"]
LEGEND_EXACT = True
# Human-readable column titles for each metric key.
METRIC2LEGEND_DICT = {
    "val_acc_student_pred": "val acc LLM -> student",
    "val_acc_true_student": "val acc student -> true",
    "val_acc_true_pred": "val acc LLM -> true",
    "val_prop_invalid": "val proportion invalid"
}
CONFIG_DIR = "C:/Users/marav/OneDrive/Документы/LLM-virtual-pretesting/src/config"
OUTPUT_DIR = "C:/Users/marav/OneDrive/Документы/LLM-virtual-pretesting/src/output"

##### COLLECT RESULTS #####
# Every entry of OUTPUT_DIR is treated as one experiment.
experiment_names = os.listdir(OUTPUT_DIR)
# Raw run payloads, one dict per loaded run: {'experiment_name', 'run'}.
runs = []
# Row accumulator for the summary table, keyed by a running integer index.
comparison_rows = {}
for EXP_NAME in experiment_names:
    print(EXP_NAME)
    configs = get_configs_out(EXP_NAME)
    config_ids = get_config_ids(configs)
    if configs:
        config_dict = dict(zip(config_ids, configs))

        CONFIG2LEGEND_DICT = {
            config_id: create_config_id_print(config_id) for config_id in config_ids
        }
        legend_kwargs = {
            "config2legend": CONFIG2LEGEND_DICT,
            "legend_exact": LEGEND_EXACT,
            "metric2legend": METRIC2LEGEND_DICT,
        }
        # Kept for its effect of merging the per-run results; the returned
        # mapping is not used further in this cell.
        run_id_dict = merge_all_results(EXP_NAME, config_ids)
        results_dict = get_results_dict(
            exp_name=EXP_NAME,
            config_ids=config_ids,
            run_id=None,
        )
        print_table_from_dict(
            eval_dict=results_dict,
            exp_name=EXP_NAME,
            exclude_metrics=EXCLUDE_METRICS,
            decimals=3,
            **legend_kwargs,
        )

        exp_dir = os.path.join(OUTPUT_DIR, EXP_NAME)
        for entry in os.listdir(exp_dir):
            # Run directories contain '~' in their name; the merged
            # '<name>.pickle' files sitting next to them are skipped.
            if '~' in entry and not entry.endswith('.pickle'):
                run_path = os.path.join(exp_dir, entry, "run_1.pickle")
                # NOTE: pickle can execute arbitrary code while loading; only
                # read files produced by this project.
                # The context manager closes the handle, which the previous
                # bare open() call never did.
                with open(run_path, 'rb') as run_file:
                    run1 = pickle.load(run_file)
                print(run1)
                runs.append({'experiment_name': EXP_NAME, 'run': run1})

                # Presumably each run directory is named after its config id
                # (TODO confirm); use that config for the metadata and fall
                # back to the first config, as before, when no id matches.
                run_config = config_dict.get(entry, configs[0])
                comparison_rows[len(comparison_rows)] = {
                    'experiment_name': EXP_NAME,
                    'model': run_config['MODEL']['NAME'],
                    'prompt': run_config['PROMPT']['NAME'],
                    'temperature': run_config['MODEL']['TEMPERATURE'],
                    'num_preds': len(run1['preds_raw']['val_preds_raw']),
                    'accuracy': run1['metrics']['val_acc_student_pred'],
                }
    print('\n=============================================\n')

# One row per loaded run, indexed 0..n-1 in loading order.
comparison_df = pd.DataFrame.from_dict(comparison_rows, orient='index')
    

experiment_kate_20250425-152445
2025-04-28 09:32:48 [info     ] 

Merging runs [1] in: output\experiment_kate_20250425-152445\gpt-4o~SO_B~SP_teacher_A~EF_quotes~ES_random3.pickle
2025-04-28 09:32:48 [info     ] Saving data                    path=output
2025-04-28 09:32:48 [info     ] Loading checkpoint             output_path=output\experiment_kate_20250425-152445\gpt-4o~SO_B~SP_teacher_A~EF_quotes~ES_random3.pickle


  stderror = scipy.stats.sem(ary, ddof=1, axis=axis)


+-----------------------------------------------+--------------------------+--------------------------+
| Config                                        | val acc LLM -> student   | val proportion invalid   |
|-----------------------------------------------+--------------------------+--------------------------|
| gpt-4o~SO_B~SP_teacher_A~EF_quotes~ES_random3 | 0.000 ± nan              | 0.000 ± nan              |
+-----------------------------------------------+--------------------------+--------------------------+
{'metrics': {'val_acc_student_pred': 0.0, 'val_acc_true_student': 0.6, 'val_acc_true_pred': 0.2, 'val_prop_invalid': np.float64(0.0)}, 'preds_raw': {'val_preds_raw': [AIMessage(content='{"misconception":"The student might have a misconception about the relationship between functional dependencies and minimal covers, as seen in their correct response to the question about minimal covers. They might not fully understand how functional dependencies affect the number of rows in a

In [37]:
# Display the per-run summary table (one row per loaded run).
comparison_df

Unnamed: 0,experiment_name,model,prompt,temperature,num_preds,accuracy
0,experiment_kate_20250425-152445,gpt-4o,teacher_A,0.0,10,0.0
1,experiment_kate_20250425-155635,o3-mini,teacher_A,0.0,10,0.6
2,experiment_kate_20250427-075601,o3-mini,teacher_A,0.0,100,0.29
3,experiment_kate_gpt4_studentaprompt_highttemp_...,gpt-4o,student_A,0.6,100,0.73
4,experiment_kate_gpt4_studentaprompt_zerotemp_2...,gpt-4o,student_A,0.0,100,0.74
5,experiment_kate_gpt4_teacherbprompt_hightemp_2...,gpt-4o,teacher_B,0.6,100,0.52
6,experiment_kate_gpt4_teacherbprompt_zerotemp_2...,gpt-4o,teacher_B,0.0,100,0.53
7,experiment_kate_o3mini_hightemp_20250428-082021,o3-mini,teacher_A,0.6,100,0.28
8,experiment_kate_o3mini_studentaprompt_20250428...,o3-mini,student_A,0.6,100,0.78
9,experiment_kate_o3mini_studentaprompt_zerotemp...,o3-mini,student_A,0.0,100,0.78


In [27]:
# For each loaded run, print its experiment name followed by every entry
# stored under the run's 'preds' mapping (key, then value).
for run_entry in runs:
    print(run_entry['experiment_name'])
    stored_preds = run_entry['run']['preds']
    for pred_key in stored_preds:
        print(pred_key, stored_preds[pred_key])

experiment_kate_20250425-152445
val_y_pred [3 4 3 3 3 3 3 1 2 2]
val_y_true [2 4 1 2 1 1 3 4 1 1]
val_y_student [2 1 1 1 1 1 1 2 1 1]
experiment_kate_20250425-155635
val_y_pred [1 1 2 1 1 2 1 1 1 1]
val_y_true [2 4 1 2 1 1 3 4 1 1]
val_y_student [2 1 1 1 1 1 1 2 1 1]
experiment_kate_20250427-075601
val_y_pred [2 4 2 3 3 2 3 1 1 2 3 4 1 2 4 1 4 1 2 1 1 3 2 2 1 3 1 1 3 4 1 1 1 2 1 1 1
 1 2 1 1 3 2 1 2 1 4 2 2 1 2 3 2 1 1 3 2 4 1 1 4 1 1 3 3 1 2 1 1 2 2 1 2 1
 2 1 1 2 1 1 1 1 1 4 2 2 2 2 2 1 1 1 1 2 1 1 1 1 1 4]
val_y_true [2 4 1 2 1 1 3 4 1 1 3 4 3 2 4 2 4 4 3 2 4 3 1 1 2 1 1 1 1 4 2 2 1 1 2 4 1
 3 1 3 2 1 3 4 2 4 1 1 2 3 1 4 3 3 3 2 3 3 1 2 4 1 3 4 3 1 1 2 4 1 2 1 1 4
 1 3 1 1 3 4 1 3 1 4 2 2 1 4 1 2 1 4 4 3 1 2 1 2 4 3]
val_y_student [2 1 1 1 1 1 1 2 1 1 3 4 3 2 4 2 1 1 1 2 4 3 1 1 2 1 1 2 2 4 2 2 1 1 1 4 1
 3 1 3 2 1 3 1 2 2 1 1 2 3 4 4 3 3 3 2 3 3 2 2 2 3 3 4 3 1 1 2 4 1 2 1 1 2
 1 3 1 1 3 2 1 1 1 4 2 1 1 4 1 2 1 2 4 3 1 2 1 2 4 3]
experiment_kate_gpt4_studentaprompt_highttemp_202504