# Imports

In [None]:
# standard library imports
# /

# related third party imports
import structlog

# local application/library specific imports
from tools.configurator import (
    get_configs_out,
    get_config_ids,
)
from tools.analyzer import (
    print_table_from_dict,
    print_df_from_dict,
    get_results_dict,
    merge_all_results,
    create_config_id_print,
    get_config_df,
    get_llm_student_preds,
)
from tools.plotter import plot_level_correctness


logger = structlog.get_logger(__name__)

In [None]:
##### INPUTS #####
# Name of the experiment whose outputs are analysed in this notebook.
# Alternative experiment name kept for quick switching:
# EXP_NAME = "roleplay_misconceptions_tryout_20250812-171855"
EXP_NAME = "roleplay_misconceptions_20250812-173641"
# Passed as exclude_metrics to the results table/dataframe helpers.
EXCLUDE_METRICS = ["val_acc_true_pred", "val_f1_true_pred"]
# Passed as "legend_exact" through legend_kwargs.
LEGEND_EXACT = True
# Passed as problem_type to the config-id, prediction and plotting helpers.
PROBLEM_TYPE = "roleplay"

In [None]:
# Display label for each raw metric name (passed on as "metric2legend").
METRIC2LEGEND_DICT = {
    # accuracy metrics
    "val_acc_student_pred": "val acc LLM -> student",
    "val_acc_true_student": "val acc student -> true",
    "val_acc_true_pred": "val acc LLM -> true",
    # proportion metric
    "val_prop_invalid": "val prop invalid",
    # F1 metrics
    "val_f1_student_pred": "val f1 LLM -> student",
    "val_f1_true_student": "val f1 student -> true",
    "val_f1_true_pred": "val f1 LLM -> true",
    # remaining metrics
    "val_rmse": "val RMSE",
    "val_distractor_alignment": "val distr alignment",
}

In [None]:
# Load the configs of the experiment and derive one identifier per config.
configs = get_configs_out(EXP_NAME)
config_ids = get_config_ids(configs, problem_type=PROBLEM_TYPE)
# Pair every identifier with its config; both sequences are zipped in order.
config_dict = dict(zip(config_ids, configs))

# Display label per config id, as produced by create_config_id_print.
CONFIG2LEGEND_DICT = {cid: create_config_id_print(cid) for cid in config_ids}
# Legend options that are unpacked into the table/dataframe helpers below.
legend_kwargs = dict(
    config2legend=CONFIG2LEGEND_DICT,
    legend_exact=LEGEND_EXACT,
    metric2legend=METRIC2LEGEND_DICT,
)

In [None]:
# merge results for all configs of the experiment
# NOTE(review): run_id_dict is not read again in the cells visible here.
run_id_dict = merge_all_results(EXP_NAME, config_ids) # TODO: remove roleplay

# Val/Test set performance
## Complete table

In [None]:
# Collect the evaluation results of every config of the experiment.
# NOTE(review): run_id=None presumably selects all runs rather than a single
# one — confirm against get_results_dict.
results_dict = get_results_dict(
    exp_name=EXP_NAME,
    config_ids=config_ids,
    run_id=None,
)
# # NOTE: print paper-like table with this code
# print_table_from_dict(
#     eval_dict=results_dict,
#     exp_name=EXP_NAME,
#     exclude_metrics=EXCLUDE_METRICS,
#     decimals=3,
#     **legend_kwargs,
# )

In [None]:
# NOTE: print dataframe
df = print_df_from_dict(
    eval_dict=results_dict,
    exp_name=EXP_NAME,
    exclude_metrics=EXCLUDE_METRICS,
    **legend_kwargs,
    # save=True,
    # save_kwargs={"fname": os.path.join("output", EXP_NAME, "results.csv")},
)

# one row of config settings per config
df_config = get_config_df(config_dict)

# mean: select the "mean" entries from the second column level
df_mean = df.xs("mean", axis=1, level=1, drop_level=True)
# attach the config settings and order columns as: config columns first,
# then every metric column that is not already a config column
df_results = df_mean.merge(df_config, how="left", on="config_id").reindex(
    columns=[
        *df_config.columns,
        *(col for col in df_mean.columns if col not in df_config.columns),
    ]
)
df_results

In [None]:
# standard error: select the "stderr" entries from the second column level
df_stderr = df.xs("stderr", axis=1, level=1, drop_level=True)
# Kept in its own variable: reusing the name df_results here would replace the
# mean table, and the per-feature aggregation cells further down would then
# average standard errors instead of metric means.
df_results_stderr = df_stderr.merge(df_config, how="left", on="config_id")
# config columns first, then the remaining metric columns
df_results_stderr = df_results_stderr.reindex(
    columns=(
        list(df_config.columns)
        + [col for col in df_stderr.columns if col not in df_config.columns]
    )
)
df_results_stderr

## Aggregated per config value

In [None]:
# mean of the selected validation metrics over all rows sharing a FEATURE value
FEATURE = "num_examples"
df_results.groupby(FEATURE).agg(
    dict.fromkeys(("val RMSE", "val_monotonicity"), "mean")
).round(3)

In [None]:
# mean of the selected validation metrics over all rows sharing a FEATURE value
FEATURE = "model"
df_results.groupby(FEATURE).agg(
    dict.fromkeys(("val RMSE", "val_monotonicity"), "mean")
).round(3)

In [None]:
# mean of the selected validation metrics over all rows sharing a FEATURE value
FEATURE = "temp"
df_results.groupby(FEATURE).agg(
    dict.fromkeys(("val RMSE", "val_monotonicity"), "mean")
).round(3)

In [None]:
# mean of the selected validation metrics over all rows sharing a FEATURE value
FEATURE = "prompt"
df_results.groupby(FEATURE).agg(
    dict.fromkeys(("val RMSE", "val_monotonicity"), "mean")
).round(3)

In [None]:
# mean of the selected validation metrics over all rows sharing a FEATURE value
# NOTE(review): this cell repeats the earlier "num_examples" aggregation.
FEATURE = "num_examples"
df_results.groupby(FEATURE).agg(
    dict.fromkeys(("val RMSE", "val_monotonicity"), "mean")
).round(3)

In [None]:
# mean of the selected validation metrics over all rows sharing a FEATURE value
FEATURE = "example_selec"
df_results.groupby(FEATURE).agg(
    dict.fromkeys(("val RMSE", "val_monotonicity"), "mean")
).round(3)

## Additional: LLM question answering

In [None]:
# for config_id in config_ids:
#     logger.info(f"Plotting student level performance", config_id=config_id)
#     plot_student_level_performance(
#         exp_name=EXP_NAME,
#         config_id=config_id,
#         metric="val_accuracy",
#         **legend_kwargs,
#         save=False,
#     )

# Student levels

In [None]:
# all configs: fetch the LLM student predictions of each config and plot them
for config_id in config_ids:
    # only the run with id 1 on the "val" split is loaded
    preds_dict = get_llm_student_preds(
        exp_name=EXP_NAME,
        config_id=config_id,
        run_id=1,
        split="val",
        problem_type=PROBLEM_TYPE,
    )
    # one plot per config, produced by plot_level_correctness
    plot_level_correctness(
        preds_dict,
        problem_type=PROBLEM_TYPE,
        config_id=config_id,
    )

## Distractor alignment

In [None]:
# TODO: obtain answer proportion for all questions in valsmall set


In [None]:
import pandas as pd

# Load the two CSV inputs; the relative paths are resolved against the
# current working directory.
# NOTE(review): judging by the file names these are the student interactions
# and the validation questions — confirm.
df_interactions = pd.read_csv("../data/silver/dbe_kt22_interactions.csv")
df_q_val = pd.read_csv("../data/gold/dbe_kt22_questions_validation.csv")

In [None]:
# keep only the interactions whose question_id occurs in df_q_val
df_i_val = df_interactions.loc[
    df_interactions["question_id"].isin(df_q_val["question_id"])
]

In [None]:
# display the filtered interactions
df_i_val

In [None]:
# Get proportions as a DataFrame with options as columns: for every
# (question_id, student_level_group) pair, the share of rows per chosen option
# (0.0 for options nobody in that group chose).
prop_df = (
    df_i_val.groupby(["question_id", "student_level_group"])["student_option_id"]
    .value_counts(normalize=True)
    .unstack(fill_value=0.0)
    .reset_index()
)
# Derive the option columns from the data instead of hard-coding [1, 2, 3, 4]:
# a fixed list raises KeyError when an option id never occurs and leaves stray
# columns behind when additional option ids exist.
option_cols = [
    col
    for col in prop_df.columns
    if col not in ("question_id", "student_level_group")
]
# collapse the option columns into one {option_id: proportion} dict per row
prop_df["dict"] = prop_df[option_cols].to_dict("records")
prop_df = prop_df.drop(columns=option_cols)
prop_df

In [None]:
# LLM student predictions of one hard-coded config (run id 1, "val" split)
preds_dict = get_llm_student_preds(
    exp_name=EXP_NAME,
    config_id="qwen3:8b~T_0.0~SO_student_bool~L_5~SP_student_chocolate_level_nocontext~SS_proficiency_5_str~EFQ_quotes~EFI_quotes~ES_miscon_studentlevel_random0",
    run_id=1,
    split="val",
    problem_type=PROBLEM_TYPE,
)

In [None]:
# display the loaded predictions
preds_dict

In [None]:
import numpy as np


def alignment_score_single(
    y_true: int, y_llm: int, dict_props: dict
) -> float:
    """Calculate the distractor alignment score for a single question.

    For an incorrect LLM answer, the score is the proportion of students who
    chose the LLM's answer divided by the proportion who chose the most
    popular distractor (any option other than the true answer). Because the
    LLM's answer is itself a distractor in that case, the score lies in
    [0, 1] for non-negative proportions. A correct LLM answer yields NaN so
    that it can be ignored when averaging.

    Parameters
    ----------
    y_true : int
        The true answer.
    y_llm : int
        The LLM predicted answer.
    dict_props : dict
        A dictionary mapping answer options to student proportions. It is
        not modified.

    Returns
    -------
    float
        NaN if the LLM answer is correct; 0.0 if no student chose any
        distractor; otherwise the ratio described above.

    Raises
    ------
    KeyError
        If the LLM answer is incorrect and ``y_llm`` is not a key of
        ``dict_props``.
    ValueError
        If the LLM answer is incorrect and ``dict_props`` contains no option
        other than ``y_true``.
    """
    # a correct answer carries no information about distractor alignment
    if y_true == y_llm:
        return np.nan

    prop_answer_llm = dict_props[y_llm]
    # every option except the correct one is a distractor
    distractor_props = {
        option: prop for option, prop in dict_props.items() if option != y_true
    }
    prop_most_popular_distractor = max(distractor_props.values())
    # Explicit zero check instead of catching ZeroDivisionError: NumPy floats
    # do not raise on division by zero, they return nan/inf with a warning.
    if prop_most_popular_distractor == 0:
        return 0.0
    return prop_answer_llm / prop_most_popular_distractor

# Example call: the LLM picks option 4 although option 3 is correct, and the
# proportion for option 4 is 0.0.
alignment_score_single(
    y_true=3,
    y_llm=4,
    dict_props={
        1: 0.07692307692307693,
        2: 0.2692307692307692,
        3: 0.6538461538461539,
        4: 0.0,
    },
)

In [None]:
from operator import itemgetter
import numpy as np
from numpy.typing import NDArray
import pandas as pd


def eval_distractor_alignment(
    y_true_array: NDArray,
    y_llm_array: NDArray,
    student_level_group_array: NDArray,
    question_id_array: NDArray,
    student_scale_map: dict,
    prop_df: pd.DataFrame,
) -> float:
    """Evaluate the alignment of distractor answers.

    For every prediction, the matching row of ``prop_df`` is looked up by
    question ID and student level group, and the per-question score is
    computed with ``alignment_score_single``. NaN scores are ignored when
    averaging.

    Parameters
    ----------
    y_true_array : NDArray
        The true answers.
    y_llm_array : NDArray
        The LLM predicted answers.
    student_level_group_array : NDArray
        The student level groups, given as the values of
        ``student_scale_map``.
    question_id_array : NDArray
        The question IDs.
    student_scale_map : dict
        A mapping whose keys are convertible to ``int`` and whose values are
        the labels used in ``student_level_group_array``. It is inverted to
        translate each label into the integer compared against
        ``prop_df["student_level_group"]``.
    prop_df : pd.DataFrame
        A DataFrame with columns ``question_id``, ``student_level_group`` and
        ``dict``, the latter holding the student proportions of each answer
        option.

    Returns
    -------
    float
        The mean alignment score, or 0.0 if there are no predictions. If
        every score is NaN, ``np.nanmean`` returns NaN and emits a
        RuntimeWarning.

    Raises
    ------
    KeyError
        If a student level group is not a value of ``student_scale_map``.
    ValueError
        If ``prop_df`` does not contain exactly one row for a
        (question ID, student level group) pair.
    """
    # invert the scale map: label -> integer student level group
    label_to_group = {
        label: int(group) for group, label in student_scale_map.items()
    }
    # Per-element lookup instead of itemgetter(*labels): itemgetter raises on
    # empty input and returns a bare scalar (not a tuple) for a single label.
    group_ids = [label_to_group[label] for label in student_level_group_array]

    scores = []
    for y_true, y_llm, group_id, question_id in zip(
        y_true_array, y_llm_array, group_ids, question_id_array
    ):
        row_mask = (prop_df["question_id"] == question_id) & (
            prop_df["student_level_group"] == group_id
        )
        # .item() enforces that exactly one row matches; copy so that the
        # dict stored in prop_df is never handed out directly
        dict_props = prop_df.loc[row_mask, "dict"].item().copy()
        scores.append(
            alignment_score_single(
                y_true=y_true, y_llm=y_llm, dict_props=dict_props
            )
        )

    if not scores:
        return 0.0
    # compute mean and ignore NaNs
    return np.nanmean(scores)


# Score the single config loaded into preds_dict against the student answer
# proportions in prop_df; all inputs are read from preds_dict by key.
eval_distractor_alignment(
    y_true_array=preds_dict["y_true"],
    y_llm_array=preds_dict["y_pred"],
    student_level_group_array=preds_dict["student_level_group"],
    question_id_array=preds_dict["question_ids"],
    student_scale_map=preds_dict["student_scale_map"],
    prop_df=prop_df,
)

In [None]:
import ast

# Load the proportions table from disk (presumably the saved counterpart of
# prop_df for the validation split — confirm).
df_prop = pd.read_csv("../data/platinum/dbe_kt22_proportions_val.csv")
# The "dict" column is stored as text. Parse it with ast.literal_eval, which
# only accepts Python literals, instead of eval, which would execute any
# expression found in the file.
df_prop["dict"] = df_prop["dict"].apply(ast.literal_eval)

In [None]:
# print the column summary of the loaded proportions table
df_prop.info()

In [None]:
# check the Python type of the first row's cell in the third column
# NOTE(review): position 2 is presumably the parsed "dict" column — confirm.
type(df_prop.iloc[0,2])