# Imports

In [None]:
# standard library imports
# /

# related third party imports
import structlog

# local application/library specific imports
from tools.configurator import (
    get_configs_out,
    get_config_ids,
)
from tools.analyzer import (
    print_table_from_dict,
    print_df_from_dict,
    get_results_dict,
    merge_all_results,
    create_config_id_print,
    get_config_df,
    check_overlap,
)
from tools.plotter import (
    activate_latex,
    deactivate_latex,
)


logger = structlog.get_logger(__name__)

In [None]:
##### INPUTS #####
# Experiment to analyse: exactly ONE string inside the parentheses may be
# uncommented. WARNING: adjacent string literals are concatenated implicitly,
# so uncommenting a second name silently produces one long, invalid name.
EXP_NAME = (
    # context
    # "roleplay_miscon_test_kt_20250820-204306"

    # no context
    # "roleplay_miscon_test_kt_nocontext_20250821-075331"

    # merged
    # "roleplay_miscon_test_kt_merged_20250821"

    # DBE-KT22
    # "roleplay_dbekt22_val_20250922-194750"
    "roleplay_dbekt22_test_20250924-090854"

    # CFE
    # "roleplay_cupacfe_val_20250919-112843"
    # "roleplay_cupacfe_val_20250925-080936"    
    # "roleplay_cupacfe_test_20250925-142305"
)
# Split is derived from the experiment name: any name containing the
# substring "val" is treated as a validation experiment, everything else as test.
SPLIT = "val" if "val" in EXP_NAME else "test"
# Metrics passed as `exclude_metrics` when the results dataframe is built.
EXCLUDE_METRICS = [
    "val_acc_true_pred",
    "val_f1_true_pred",
]
# Forwarded as `legend_exact` through `legend_kwargs`.
LEGEND_EXACT = True
# Forwarded as `problem_type` to `get_config_ids`.
PROBLEM_TYPE = "roleplay"
# Forwarded as `sans_serif` to `activate_latex`.
SANS_SERIF = True
# Gates the cell that writes the aggregated-IRT figures to disk.
# NOTE: a later cell reassigns this to True right before that export cell.
PRINT_PAPER = False

In [None]:
# Raw metric key -> display name; forwarded as `metric2legend` via `legend_kwargs`.
# The display names are the ones referenced later in this notebook
# (e.g. f"{SPLIT} RMSE", "llm correctness"), so renaming a value here requires
# updating those references too.
# NOTE: "llm correctness" is the only display name without the split prefix.
METRIC2LEGEND_DICT = {
    f"{SPLIT}_rmse": f"{SPLIT} RMSE",
    f"{SPLIT}_mae": f"{SPLIT} MAE",
    f"{SPLIT}_llm_correctness": "llm correctness",
    f"{SPLIT}_monotonicity": f"{SPLIT} monotonicity",
    f"{SPLIT}_prop_invalid": f"{SPLIT} prop invalid",
    f"{SPLIT}_distractor_alignment": f"{SPLIT} distr alignment"
}

In [None]:
# Load every configuration of the experiment together with its ID.
configs = get_configs_out(EXP_NAME)
config_ids = get_config_ids(configs, problem_type=PROBLEM_TYPE)
# config_id -> config; IDs and configs are paired positionally
config_dict = dict(zip(config_ids, configs))

# config_id -> printable name used in tables and legends
CONFIG2LEGEND_DICT = {cid: create_config_id_print(cid) for cid in config_ids}
# keyword arguments shared by the table/dataframe printing helpers
legend_kwargs = dict(
    config2legend=CONFIG2LEGEND_DICT,
    legend_exact=LEGEND_EXACT,
    metric2legend=METRIC2LEGEND_DICT,
)

In [None]:
# merge results for all configs
run_id_dict = merge_all_results(EXP_NAME, config_ids)

# Val/Test set performance
## Complete table

In [None]:
results_dict = get_results_dict(
    exp_name=EXP_NAME,
    config_ids=config_ids,
    run_id=None,
)
# # NOTE: print paper-like table with this code
# print_table_from_dict(
#     eval_dict=results_dict,
#     exp_name=EXP_NAME,
#     exclude_metrics=EXCLUDE_METRICS,
#     decimals=3,
#     **legend_kwargs,
# )

In [None]:
# NOTE: print dataframe (columns are indexed by metric on level 0 and by
# statistic, e.g. "mean" / "stderr", on level 1)
df = print_df_from_dict(
    eval_dict=results_dict,
    exp_name=EXP_NAME,
    exclude_metrics=EXCLUDE_METRICS,
    **legend_kwargs,
    # save=True,
    # save_kwargs={"fname": os.path.join("output", EXP_NAME, "results.csv")},
)

df_config = get_config_df(config_dict)

# mean: one row per config, configuration columns first, metric columns after
df_mean = df.xs("mean", axis=1, level=1, drop_level=True)
config_cols = list(df_config.columns)
metric_cols = [col for col in df_mean.columns if col not in df_config.columns]
df_results = df_mean.merge(df_config, how="left", on="config_id").reindex(
    columns=config_cols + metric_cols
)
df_results

In [None]:
# standard error: same layout as the mean table (configuration columns first)
df_stderr = df.xs("stderr", axis=1, level=1, drop_level=True).merge(
    df_config, how="left", on="config_id"
)
leading_cols = list(df_config.columns)
trailing_cols = [col for col in df_stderr.columns if col not in df_config.columns]
df_stderr = df_stderr.reindex(columns=leading_cols + trailing_cols)
# df_stderr

## Aggregating results over characteristics

In [None]:
# Column -> aggregation function used by the `groupby(FEATURE).agg(agg_dict)`
# cells below.
# NOTE: both branches currently build the same dictionary; the only difference
# is the commented-out distractor-alignment entry in the `else` branch.
if "kt" in EXP_NAME:
    agg_dict = {
        f"{SPLIT} RMSE": "mean",
        f"{SPLIT} MAE": "mean",
        f"{SPLIT} monotonicity": "mean",
        "llm correctness": "mean",
    }
else:
    agg_dict = {
        f"{SPLIT} RMSE": "mean",
        f"{SPLIT} MAE": "mean",
        f"{SPLIT} monotonicity": "mean",
        # f"{SPLIT} distr alignment": "mean",
        "llm correctness": "mean",
    }

In [None]:
# inspect average performance per config value
FEATURE = "num_examples"
df_results.groupby(FEATURE).agg(agg_dict).round(3)

### Contextual models

In [None]:
df_results_context = df_results[df_results["num_examples"] > 0]

In [None]:
# inspect average performance per config value
FEATURE = "context"
df_results_context.groupby(FEATURE).agg(agg_dict).round(3)

In [None]:
# inspect average performance per config value
FEATURE = "num_examples"
df_results_context.groupby(FEATURE).agg(agg_dict).round(3)

In [None]:
# inspect average performance per config value
FEATURE = "model"
df_results_context.groupby(FEATURE).agg(agg_dict).round(3)

In [None]:
# inspect average performance per config value
FEATURE = "temp"
df_results_context.groupby(FEATURE).agg(agg_dict).round(3)

In [None]:
# inspect average performance per config value
FEATURE = "prompt"
df_results_context.groupby(FEATURE).agg(agg_dict).round(3)

In [None]:
# inspect average performance per config value
FEATURE = "example_selec"
df_results_context.groupby(FEATURE).agg(agg_dict).round(3)

In [None]:
# inspect average performance per config value
FEATURE = "struc_output"
df_results_context.groupby(FEATURE).agg(agg_dict).round(3)

In [None]:
# Get the full rows for these best-performing configs
metric = f"{SPLIT} RMSE"

best_indices = df_results_context.groupby(["model"])[metric].idxmin()  # NOTE: min because RMSE
best_configs = df_results_context.loc[best_indices]
best_configs

### Non-contextual models

In [None]:
df_results_nocontext = df_results[df_results["num_examples"] == 0]

In [None]:
df_results_nocontext

In [None]:
if not df_results_nocontext.empty:
    # inspect average performance per config value
    FEATURE = "context"
    display(df_results_nocontext.groupby(FEATURE).agg(agg_dict).round(3))

In [None]:
if not df_results_nocontext.empty:
    # inspect average performance per config value
    FEATURE = "num_examples"
    display(df_results_nocontext.groupby(FEATURE).agg(agg_dict).round(3))

In [None]:
if not df_results_nocontext.empty:
    # inspect average performance per config value
    FEATURE = "model"
    display(df_results_nocontext.groupby(FEATURE).agg(agg_dict).round(3))

In [None]:
if not df_results_nocontext.empty:
    # inspect average performance per config value
    FEATURE = "temp"
    display(df_results_nocontext.groupby(FEATURE).agg(agg_dict).round(3))

In [None]:
if not df_results_nocontext.empty:
    # inspect average performance per config value
    FEATURE = "prompt"
    display(df_results_nocontext.groupby(FEATURE).agg(agg_dict).round(3))

In [None]:
if not df_results_nocontext.empty:
    # inspect average performance per config value
    FEATURE = "example_selec"
    display(df_results_nocontext.groupby(FEATURE).agg(agg_dict).round(3))

In [None]:
if not df_results_nocontext.empty:
    # inspect average performance per config value
    FEATURE = "struc_output"
    display(df_results_nocontext.groupby(FEATURE).agg(agg_dict).round(3))

In [None]:
# Get the full rows for these best-performing configs
metric = f"{SPLIT} RMSE"

best_indices = df_results_nocontext.groupby(["model"])[metric].idxmin()  # NOTE: min because RMSE
best_configs = df_results_nocontext.loc[best_indices]
best_configs

## Contextual & Non-contextual models

In [None]:
# Keep only monotonicity and RMSE and flatten the (metric, statistic) columns.
df_metric = df[[f"{SPLIT} monotonicity", f"{SPLIT} RMSE"]].droplevel(0, axis=1)
# NOTE(review): the positional rename assumes the selection yields
# mean/stderr for monotonicity followed by mean/stderr for RMSE — confirm.
df_metric.columns = ['mean_monotonicity', 'stderr_monotonicity', "mean_rmse", "stderr_rmse"]
df_metric = df_metric.merge(df_config, how="left", on="config_id")
df_metric = df_metric.reindex(
    columns=(
        list(df_config.columns)
        + [a for a in df_metric.columns if a not in df_config.columns]
    )
)
# LaTeX cells "mean \gray{$\pm$ stderr}". Raw f-strings keep the backslashes
# literal; the non-raw form relied on the invalid escape sequences "\g" and
# "\p", which newer Python versions warn about.
df_metric["performance monotonicity"] = df_metric.apply(
    lambda x: rf"{x['mean_monotonicity']:.3f} \gray{{$\pm$ {x['stderr_monotonicity']:.3f}}}",
    axis=1,
)
df_metric["performance rmse"] = df_metric.apply(
    lambda x: rf"{x['mean_rmse']:.3f} \gray{{$\pm$ {x['stderr_rmse']:.3f}}}",
    axis=1,
)
# extract model family and size (model names look like "<family>:<size>b")
df_metric["family"] = df_metric["model"].str.extract(r"^(.*?):")[0]
df_metric["size"] = (
    df_metric["model"].str.extract(r":(\d+\.?\d*)b$")[0].astype(float).round(1)
)
# create new column to group on having context or not
# (this overwrites the "context" column with a label derived from the prompt name)
context_map = {True: "Context", False: "No context"}
df_metric["context"] = df_metric["prompt"].str.contains("_context").map(context_map)
# clean prompt persona (the persona is read from the structured-output name)
prompt_map = {"teacher_kt": "Teacher", "student_kt": "Student", "student_bool_nocontext": "Student", "teacher_bool_nocontext": "Teacher"}
df_metric["persona"] = df_metric["struc_output"].map(prompt_map)
# clean example selector (values missing from the map become NaN)
example_selec_map = {
    "miscon_studentid_kc_exact": "Knowledge Concept",
    "miscon_studentid_random": "Random",
    "both_dbe_studentid_random": "Random",
    "both_dbe_studentid_kc_exact": "Knowledge Concept",
    "both_cfe_studentid_random": "Random",
    "both_cfe_studentlevel_random": "Random",
    "both_dbe_studentlevel_random": "Random",
    "both_dbe_studentlevel_kc_exact": "Knowledge Concept",
}
df_metric["example_selec"] = df_metric["example_selec"].map(example_selec_map)
# round temp
df_metric["temp"] = df_metric["temp"].round(1).astype(str)
# insert "\NA" for example selector if no context
df_metric.loc[df_metric["context"] == "No context", "example_selec"] = "\\NA"

df_metric


In [None]:
col_names = ["family", "size", "context", "performance rmse", "performance monotonicity", "persona", "example_selec", "num_examples", "temp"]

df_clean = df_metric.sort_values(by=["family", "size", "context"], ascending=True)[col_names]
df_clean["size"] = df_clean["size"].astype(str) + " B"

df_clean

In [None]:
print(df_clean.to_latex(index=False))

Check overlap in results (to know what should be boldface in the table)

In [None]:
# Apply to your DataFrame for monotonicity
# df_sorted = df_metric.sort_values(by=["family", "size", "context"], ascending=True)
df_sorted = df_metric.sort_values(by=["mean_monotonicity"], ascending=False)
df_with_overlap = check_overlap(df_sorted, 'mean_monotonicity', 'stderr_monotonicity')

# Show relevant columns
df_with_overlap[['family', 'size', 'context', 'mean_monotonicity', 'stderr_monotonicity', 
                'lower_bound', 'upper_bound', 'overlap_with_prev', 'overlap_with_next']]

In [None]:
# Check overlap for RMSE (mean_rmse / stderr_rmse).
# NOTE(review): the monotonicity cell sorts by the metric itself, whereas this
# cell sorts by family/size/context, so the overlap_with_prev / overlap_with_next
# columns presumably compare table neighbours rather than RMSE-rank
# neighbours — confirm this is intended.
df_sorted = df_metric.sort_values(by=["family", "size", "context"], ascending=True)
# df_sorted = df_metric.sort_values(by=["mean_rmse"], ascending=True)
df_with_overlap = check_overlap(df_sorted, 'mean_rmse', 'stderr_rmse')

# Show relevant columns
df_with_overlap[['family', 'size', 'context', 'mean_rmse', 'stderr_rmse', 
                'lower_bound', 'upper_bound', 'overlap_with_prev', 'overlap_with_next']]

## CTT

In [None]:
import pandas as pd

In [None]:
df_q_test = pd.read_csv("../data/gold/dbe_kt22_questions_test.csv")

df_i_silver = pd.read_csv("../data/silver/dbe_kt22_interactions.csv")
df_i_test = df_i_silver[df_i_silver["question_id"].isin(df_q_test["question_id"].unique())]
# df_ctt = df_i_test.groupby("question_id").agg({"student_option_correct": ["mean"]})
df_ctt = df_i_test.groupby("question_id")[["student_option_correct"]].mean()
df_ctt["ctt_difficulty"] = 1 - df_ctt["student_option_correct"]
df_ctt = df_ctt.drop(columns=["student_option_correct"]).reset_index()
df_ctt

In [None]:
from tools.analyzer import read_pickle
from typing import Any
import os

def get_irt_df(experiment: str, config_id: str, run_id: int) -> tuple:
    """Load the IRT input frame and the true targets of one stored run.

    Parameters
    ----------
    experiment : str
        Experiment name; runs are read from ``output/<experiment>``.
    config_id : str
        Configuration ID, i.e. the sub-directory of the experiment.
    run_id : int
        Run number; selects the file ``run_<run_id>.pickle``.

    Returns
    -------
    tuple
        ``(irt_df, y_true)``: the ``"test_df_input"`` and ``"test_y_true"``
        entries of the checkpoint's ``"preds_qdiff"`` dictionary.
    """
    output_path = os.path.join(
        "output", experiment, config_id, f"run_{run_id}.pickle"
    )

    logger.info("Loading checkpoint", output_path=output_path)
    preds_qdiff = read_pickle(output_path)["preds_qdiff"]
    return preds_qdiff["test_df_input"], preds_qdiff["test_y_true"]



In [None]:

irt_df, y_true = get_irt_df(
    experiment=EXP_NAME,
    config_id="llama3.2:1b~T_0.0~SO_student_bool_nocontext~L_5~SP_student_dbe_miscons_level_context~SS_proficiency_5_str~EFQ_quotes~EFI_quotes~ES_both_dbe_studentlevel_random1",
    run_id=1,
)

In [None]:
# CTT difficulty from the loaded interactions: 1 - mean correctness per question.
# (The former `irt_df.copy()` was dead code: it was overwritten immediately,
# and groupby does not modify `irt_df`.)
df_pred_ctt = irt_df.groupby("question_id")[["student_option_correct"]].mean()
df_pred_ctt["ctt_difficulty"] = 1 - df_pred_ctt["student_option_correct"]
df_pred_ctt = df_pred_ctt.drop(columns=["student_option_correct"]).reset_index()
df_pred_ctt

In [None]:
import matplotlib.pyplot as plt
from scipy import stats

# Pair true and predicted CTT difficulty per question. Relying on row order
# would mis-pair points whenever the two frames do not contain exactly the
# same questions in the same order.
df_ctt_pairs = df_ctt.merge(
    df_pred_ctt, on="question_id", how="inner", suffixes=("_true", "_pred")
)
# Fit the regression on the values that are actually plotted
# (df_q_test has no "q_difficulty_pred" column at this point of the notebook).
res = stats.linregress(
    x=df_ctt_pairs["ctt_difficulty_true"], y=df_ctt_pairs["ctt_difficulty_pred"]
)

fig, ax = plt.subplots(figsize=(6, 6))
# ax.plot([-5, -5], [5, 5], linestyle='--', color='gray')
ax.axline((1, 1), slope=1)
ax.set_xlabel('True Difficulty', fontsize=14)
ax.set_ylabel('Predicted Difficulty', fontsize=14)
# ax.set_xlim(-5, 5)
# ax.set_title('Predicted vs True Difficulty', fontsize=16)
ax.scatter(x=df_ctt_pairs["ctt_difficulty_true"], y=df_ctt_pairs["ctt_difficulty_pred"])
ax.plot(
    df_ctt_pairs["ctt_difficulty_true"],
    res.intercept + res.slope * df_ctt_pairs["ctt_difficulty_true"],
    'r',
    label='fitted line',
)

## Combine LLMs for IRT

In [None]:
family_models = df_metric[(df_metric["family"] == "qwen3") & (df_metric["context"] == "Context")]["config_id"].values.tolist()
family_models

In [None]:
config_dict[family_models[0]]["LOADER"]["NAME"]

In [None]:
import os
import pandas as pd
from sklearn.metrics import root_mean_squared_error
from tools.irt_estimator import irt_estimation
from tools.analyzer import read_pickle


def get_irt_df(experiment: str, config_id: str, run_id: int) -> tuple:
    """Load the IRT input frame and the true targets of one stored run.

    Parameters
    ----------
    experiment : str
        Experiment name; runs are read from ``output/<experiment>``.
    config_id : str
        Configuration ID, i.e. the sub-directory of the experiment.
    run_id : int
        Run number; selects the file ``run_<run_id>.pickle``.

    Returns
    -------
    tuple
        ``(irt_df, y_true)``: the ``"test_df_input"`` and ``"test_y_true"``
        entries of the checkpoint's ``"preds_qdiff"`` dictionary.
    """
    output_path = os.path.join(
        "output", experiment, config_id, f"run_{run_id}.pickle"
    )

    logger.info("Loading checkpoint", output_path=output_path)
    preds_qdiff = read_pickle(output_path)["preds_qdiff"]
    return preds_qdiff["test_df_input"], preds_qdiff["test_y_true"]


def hybrid_multi_roleplaying_llm(experiment: str, dataset_name: str, config_ids: list[str], run_id: int) -> float:
    """Pool the interactions of several configs, fit IRT and score the difficulties.

    Parameters
    ----------
    experiment : str
        Experiment name passed to ``get_irt_df``.
    dataset_name : str
        Key into the difficulty-scale table (``"dbe_kt22"`` or ``"cupacfe"``);
        also selects ``../data/gold/<dataset_name>_questions_test.csv``.
    config_ids : list[str]
        Configurations whose interactions are pooled.
    run_id : int
        Run number loaded for every configuration.

    Returns
    -------
    float
        RMSE between ``q_difficulty`` in the gold question file and the
        rescaled IRT difficulty estimates.

    Raises
    ------
    KeyError
        If ``dataset_name`` has no entry in the difficulty-scale table.
    """
    # obtain one large Dataframe for IRT; frames are collected first and
    # concatenated once instead of growing a frame inside the loop
    frames = []
    for i, config_id in enumerate(config_ids):
        irt_df, _ = get_irt_df(
            experiment=experiment,
            config_id=config_id,
            run_id=run_id,
        )
        # keep the original ID as the level, then suffix the ID with the model
        # index so simulated students of different configs stay distinct
        irt_df["student_level"] = irt_df["student_id"]
        irt_df["student_id"] = irt_df["student_id"] + f"_model{i}"
        frames.append(irt_df)
    irt_df_all = pd.concat(frames, axis=0) if frames else pd.DataFrame()

    # Compute IRT parameters
    _, difficulty_dict, _ = irt_estimation(interactions_df=irt_df_all)

    irt_scale = {
        "dbe_kt22": (-5, 5),
        "cupacfe": (30, 110),
    }
    # NOTE(review): presumably the range of the IRT estimates — confirm
    old_range = (-5, 5)

    logger.info(
        "Rescaling difficulty range",
        old_range=old_range,
        new_range=irt_scale[dataset_name],
    )
    new_min_diff, new_max_diff = irt_scale[dataset_name]
    # linear map old_range -> dataset range; a new dict is built so the
    # mapping returned by irt_estimation is not modified in place
    rescaled_difficulty = {
        key: ((value - old_range[0]) / (old_range[1] - old_range[0]))
        * (new_max_diff - new_min_diff)
        + new_min_diff
        for key, value in difficulty_dict.items()
    }

    df_q_test = pd.read_csv(f"../data/gold/{dataset_name}_questions_test.csv")
    # explicit copy: a column is added to the selection right below
    df_q_test = df_q_test[["question_id", "q_difficulty"]].copy()

    df_q_test["q_difficulty_pred"] = df_q_test["question_id"].map(rescaled_difficulty)

    rmse = root_mean_squared_error(y_true=df_q_test["q_difficulty"], y_pred=df_q_test["q_difficulty_pred"])
    return rmse

# dataset_name = config_dict[family_models[0]]["LOADER"]["NAME"]
# hybrid_multi_roleplaying_llm(
#     experiment=EXP_NAME,
#     dataset_name=dataset_name,
#     config_ids=family_models,
#     run_id=1,
# )

In [None]:
from tools.analyzer import mean_stderror

def hybrid_multi_roleplaying_llm_all_runs(
    experiment: str,
    dataset_name: str,
    config_ids: list[str],
    run_ids: tuple[int, ...] = (1, 2, 3),
) -> list:
    """Evaluate the pooled-IRT RMSE for several runs and print mean and standard error.

    Parameters
    ----------
    experiment : str
        Experiment name.
    dataset_name : str
        Dataset key forwarded to ``hybrid_multi_roleplaying_llm``.
    config_ids : list[str]
        Configurations whose interactions are pooled.
    run_ids : tuple[int, ...], optional
        Run numbers to evaluate, by default ``(1, 2, 3)``.

    Returns
    -------
    list
        One RMSE per entry of ``run_ids``, in the same order.
    """
    rmse_scores = [
        hybrid_multi_roleplaying_llm(
            experiment=experiment,
            dataset_name=dataset_name,
            config_ids=config_ids,
            run_id=run_id,
        )
        for run_id in run_ids
    ]

    mean_rmse, stderr_rmse = mean_stderror(rmse_scores)
    print()
    print(f"Hybrid multi-roleplaying LLM RMSE: {mean_rmse:.3f} ± {stderr_rmse:.3f}  ({rmse_scores})")
    return rmse_scores

In [None]:
family_models = df_metric[(df_metric["family"] == "qwen3") & (df_metric["context"] == "Context")]["config_id"].values.tolist()
dataset_name = config_dict[family_models[0]]["LOADER"]["NAME"]
rmse_scores = hybrid_multi_roleplaying_llm_all_runs(
    experiment=EXP_NAME,
    dataset_name=dataset_name,
    config_ids=family_models,
)

In [None]:
family_models = df_metric[(df_metric["family"] == "qwen3") & (df_metric["context"] == "No context")]["config_id"].values.tolist()
dataset_name = config_dict[family_models[0]]["LOADER"]["NAME"]
rmse_scores = hybrid_multi_roleplaying_llm_all_runs(
    experiment=EXP_NAME,
    dataset_name=dataset_name,
    config_ids=family_models,
)

In [None]:
family_models = df_metric[(df_metric["family"].isin(["llama3.1", "llama3.2"])) & (df_metric["context"] == "Context")]["config_id"].values.tolist()
dataset_name = config_dict[family_models[0]]["LOADER"]["NAME"]
rmse_scores = hybrid_multi_roleplaying_llm_all_runs(
    experiment=EXP_NAME,
    dataset_name=dataset_name,
    config_ids=family_models,
)

In [None]:
family_models = df_metric[(df_metric["family"].isin(["llama3.1", "llama3.2"])) & (df_metric["context"] == "No context")]["config_id"].values.tolist()
dataset_name = config_dict[family_models[0]]["LOADER"]["NAME"]
rmse_scores = hybrid_multi_roleplaying_llm_all_runs(
    experiment=EXP_NAME,
    dataset_name=dataset_name,
    config_ids=family_models,
)

Code for plots

In [None]:
# obtain one large Dataframe for IRT
family_models = df_metric[(df_metric["family"] == "qwen3") & (df_metric["context"] == "Context")]["config_id"].values.tolist()

# collect the per-config frames and concatenate once (instead of growing a
# frame inside the loop)
irt_frames = []
for i, config_id in enumerate(family_models):
    irt_df, y_true = get_irt_df(
        experiment=EXP_NAME,
        config_id=config_id,
        run_id=1,
    )
    # append i to student_id in irt_df to make unique
    irt_df["student_id"] = irt_df["student_id"] + f"_model{i}"
    irt_frames.append(irt_df)
irt_df_all = pd.concat(irt_frames, axis=0) if irt_frames else pd.DataFrame()

irt_df_all

In [None]:
from tools.irt_estimator import irt_estimation

# Compute IRT parameters
student_dict, difficulty_dict, _ = irt_estimation(interactions_df=irt_df_all)

In [None]:
# Target difficulty range per dataset.
irt_scale = {
    "dbe_kt22": (-5, 5),
    "cupacfe": (30, 110),
}
# NOTE(review): presumably the range of the IRT estimates — confirm
old_range = (-5, 5)

dataset_name = config_dict[family_models[0]]["LOADER"]["NAME"]

logger.info(
    "Rescaling difficulty range",
    old_range=old_range,
    new_range=irt_scale[dataset_name],
)
new_min_diff, new_max_diff = irt_scale[dataset_name]
# NOTE: the values are rescaled in place, so re-running this cell without
# re-running the IRT estimation rescales the already rescaled values again
# (harmless only when the target range equals old_range).
for key, value in difficulty_dict.items():
    difficulty_dict[key] = (
        (value - old_range[0]) / (old_range[1] - old_range[0])
    ) * (new_max_diff - new_min_diff) + new_min_diff

df_q_test = pd.read_csv(f"../data/gold/{dataset_name}_questions_test.csv")
# explicit copy: a prediction column is added to this selection in the next cell
df_q_test = df_q_test[["question_id", "q_difficulty"]].copy()

In [None]:
df_q_test["q_difficulty_pred"] = df_q_test["question_id"].map(difficulty_dict)
df_q_test

In [None]:
from sklearn.metrics import root_mean_squared_error

root_mean_squared_error(y_true=df_q_test["q_difficulty"], y_pred=df_q_test["q_difficulty_pred"])

In [None]:
from scipy import stats
res = stats.linregress(x=df_q_test["q_difficulty"], y=df_q_test["q_difficulty_pred"])

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(6, 6))
# ax.plot([-5, -5], [5, 5], linestyle='--', color='gray')
ax.axline((1, 1), slope=1)
ax.set_xlabel('True Difficulty', fontsize=14)
ax.set_ylabel('Predicted Difficulty', fontsize=14)
# ax.set_xlim(-5, 5)
# ax.set_title('Predicted vs True Difficulty', fontsize=16)
ax.scatter(x=df_q_test["q_difficulty"], y=df_q_test["q_difficulty_pred"])
ax.plot(df_q_test["q_difficulty"], res.intercept + res.slope*df_q_test["q_difficulty"], 'r', label='fitted line')

In [None]:
import pandas as pd
from tools.irt_estimator import irt_estimation
from scipy import stats
import matplotlib.pyplot as plt
import numpy as np


def get_diff_aggregated_irt(
    df_metric: pd.DataFrame,
    family: "str | list[str]",
    context_type: str,
    exp_name: str,
    config_dict: dict,
    run_id: int = 1,
) -> pd.DataFrame:
    """Estimate question difficulty from the pooled interactions of a model family.

    Parameters
    ----------
    df_metric : pd.DataFrame
        Frame with the columns ``"family"``, ``"context"`` and ``"config_id"``.
    family : str or list[str]
        Model family, or several families that are pooled together.
    context_type : str
        Value of the ``"context"`` column to select.
    exp_name : str
        Experiment name passed to ``get_irt_df``.
    config_dict : dict
        Mapping config_id -> config; the dataset name is read from
        ``["LOADER"]["NAME"]`` of the first selected config.
    run_id : int, optional
        Run number loaded for every configuration, by default 1.

    Returns
    -------
    pd.DataFrame
        Columns ``question_id``, ``q_difficulty`` (from the gold question file)
        and ``q_difficulty_pred`` (rescaled IRT estimate).

    Raises
    ------
    ValueError
        If no configuration matches ``family`` and ``context_type``.
    KeyError
        If the dataset name has no entry in the difficulty-scale table.
    """
    families = [family] if isinstance(family, str) else family
    selected = (df_metric["family"].isin(families)) & (
        df_metric["context"] == context_type
    )
    family_models = df_metric[selected]["config_id"].values.tolist()
    if not family_models:
        raise ValueError(
            f"No configs found for family={family!r} and context={context_type!r}"
        )
    dataset_name = config_dict[family_models[0]]["LOADER"]["NAME"]

    # obtain one large Dataframe for IRT; concatenate once after the loop
    frames = []
    for i, config_id in enumerate(family_models):
        irt_df, _ = get_irt_df(
            experiment=exp_name,
            config_id=config_id,
            run_id=run_id,
        )
        # append i to student_id in irt_df to make unique
        irt_df["student_id"] = irt_df["student_id"] + f"_model{i}"
        frames.append(irt_df)
    irt_df_all = pd.concat(frames, axis=0)

    # Compute IRT parameters
    _, difficulty_dict, _ = irt_estimation(interactions_df=irt_df_all)

    irt_scale = {
        "dbe_kt22": (-5, 5),
        "cupacfe": (30, 110),
    }
    # NOTE(review): presumably the range of the IRT estimates — confirm
    old_range = (-5, 5)

    logger.info(
        "Rescaling difficulty range",
        old_range=old_range,
        new_range=irt_scale[dataset_name],
    )
    new_min_diff, new_max_diff = irt_scale[dataset_name]
    # linear map old_range -> dataset range, without mutating the returned dict
    rescaled_difficulty = {
        key: ((value - old_range[0]) / (old_range[1] - old_range[0]))
        * (new_max_diff - new_min_diff)
        + new_min_diff
        for key, value in difficulty_dict.items()
    }

    df_q_test = pd.read_csv(f"../data/gold/{dataset_name}_questions_test.csv")
    # explicit copy: a column is added to the selection right below
    df_q_test = df_q_test[["question_id", "q_difficulty"]].copy()
    df_q_test["q_difficulty_pred"] = df_q_test["question_id"].map(rescaled_difficulty)

    return df_q_test


def get_diff_aggregated_irt_context_nocontext(
    df_metric: pd.DataFrame, family: str, exp_name: str, config_dict: dict
):
    """Run ``get_diff_aggregated_irt`` for the contextual and non-contextual models.

    All arguments are forwarded unchanged; only the context type differs
    between the two calls.

    Returns
    -------
    tuple
        ``(q_diff_context, q_diff_nocontext)``: the results for the context
        types ``"Context"`` and ``"No context"``, in that order.
    """
    per_context = [
        get_diff_aggregated_irt(df_metric, family, context_type, exp_name, config_dict)
        for context_type in ("Context", "No context")
    ]
    return per_context[0], per_context[1]

In [None]:
q_diff_context, q_diff_nocontext = get_diff_aggregated_irt_context_nocontext(
    df_metric=df_metric,
    family="qwen3",
    exp_name=EXP_NAME,
    config_dict=config_dict,
)

In [None]:
from typing import Optional
from tools.utils import ensure_dir

def plot_aggregated_interactions(
    q_diff_context: pd.DataFrame, q_diff_nocontext: pd.DataFrame, savename: Optional[str] = None,
):
    """Scatter predicted vs. true difficulty with one regression line per setting.

    Parameters
    ----------
    q_diff_context : pd.DataFrame
        Frame with the columns ``q_difficulty`` and ``q_difficulty_pred`` for
        the contextual models.
    q_diff_nocontext : pd.DataFrame
        Same columns for the non-contextual models.
    savename : Optional[str], optional
        Path the figure is saved to; nothing is saved when None (default).
    """
    series = (
        (q_diff_context, "o", "Context"),
        (q_diff_nocontext, "^", "No context"),
    )
    # fit both regressions before any figure is created
    fits = [
        stats.linregress(x=frame["q_difficulty"], y=frame["q_difficulty_pred"])
        for frame, _, _ in series
    ]

    _, ax = plt.subplots(figsize=(16 / 3, 9 / 3))
    ax.set_xlabel("True Difficulty", fontsize=12)
    ax.set_ylabel("Predicted Difficulty", fontsize=12)

    # contextual first, then non-contextual: points plus fitted line
    for (frame, marker, label), fit in zip(series, fits):
        true_diff = frame["q_difficulty"]
        ax.scatter(x=true_diff, y=frame["q_difficulty_pred"], marker=marker)
        ax.plot(true_diff, fit.intercept + fit.slope * true_diff, label=label)

    # diagonal line; the limits are captured first and restored afterwards so
    # that drawing it does not rescale the axes
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    x = np.linspace(xlim[0], xlim[1], 100)
    ax.plot(x, x, linestyle="--", color="gray")
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    ax.legend()
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

    if savename is not None:
        plt.tight_layout()
        # a bare file name has no directory part; only create one if present
        out_dir = os.path.dirname(savename)
        if out_dir:
            ensure_dir(out_dir)
        plt.savefig(savename)
    plt.show()

In [None]:
activate_latex(sans_serif=SANS_SERIF)
try:
    plot_aggregated_interactions(
        q_diff_context=q_diff_context,
        q_diff_nocontext=q_diff_nocontext,
    )
finally:
    # always undo activate_latex, even when plotting raises
    deactivate_latex()

___

In [None]:
PRINT_PAPER = True

In [None]:
if PRINT_PAPER:
    # file-name label -> model family (or families pooled into one figure)
    paper_families = {
        "qwen3": "qwen3",
        "llama3": ["llama3.1", "llama3.2"],
    }
    activate_latex(sans_serif=SANS_SERIF)
    try:
        # one figure per model family
        for family_label, llm_family in paper_families.items():
            q_diff_context, q_diff_nocontext = get_diff_aggregated_irt_context_nocontext(
                df_metric=df_metric,
                family=llm_family,
                exp_name=EXP_NAME,
                config_dict=config_dict,
            )
            fname = os.path.join(
                "output", EXP_NAME, "figures", f"aggregated_irt_{family_label}.pdf"
            )
            plot_aggregated_interactions(
                q_diff_context=q_diff_context,
                q_diff_nocontext=q_diff_nocontext,
                savename=fname,
            )
    finally:
        # always undo activate_latex, even when a figure fails
        deactivate_latex()

In [None]:
# student_dict

In [None]:
# student_skills = pd.DataFrame({"student": list(student_dict.keys()), "ability": list(student_dict.values())})
# # remove students "p_bad" and "p_good"
# student_skills = student_skills[~student_skills["student"].isin(["p_bad", "p_good"])]
# # get int before underscore in student column
# student_skills["student_level"] = student_skills["student"].str.extract("(\\d+)").astype(int)
# student_skills

In [None]:
# student_skills.groupby("student_level").agg({"ability": ["mean", "std", "count"]})

In [None]:
# list(student_dict.values())

## Scatter plot: predicted vs true difficulty

In [None]:
from tools.analyzer import read_pickle
from typing import Any
import os


def get_preds(experiment: str, config_id: str, run_id: int) -> dict:
    """Load predicted and true question difficulties of one stored run.

    Parameters
    ----------
    experiment : str
        Experiment name; runs are read from ``output/<experiment>``.
    config_id : str
        Configuration ID, i.e. the sub-directory of the experiment.
    run_id : int
        Run number; selects the file ``run_<run_id>.pickle``.

    Returns
    -------
    dict
        ``{"y_pred": ..., "y_true": ...}`` taken from the ``"test_y_pred"`` and
        ``"test_y_true"`` entries of the checkpoint's ``"preds_qdiff"`` dictionary.
    """
    output_path = os.path.join(
        os.path.join("output", experiment), config_id, f"run_{run_id}.pickle"
    )

    logger.info("Loading checkpoint", output_path=output_path)
    checkpoint = read_pickle(output_path)
    return {
        "y_pred": checkpoint["preds_qdiff"]["test_y_pred"],
        "y_true": checkpoint["preds_qdiff"]["test_y_true"],
    }


# preds = get_preds(
#     experiment=EXP_NAME,
#     config_id="llama3.2:1b~T_0.0~SO_student_bool_nocontext~L_5~SP_student_dbe_miscons_level_context~SS_proficiency_5_str~EFQ_quotes~EFI_quotes~ES_both_dbe_studentlevel_random1",
#     run_id=1,
# )

# preds = get_preds(
#     experiment=EXP_NAME,
#     config_id="qwen3:1.7b~T_0.0~SO_student_bool_nocontext~L_5~SP_student_cfe_miscons_level_nocontext~SS_proficiency_5_str~EFQ_mcq_reading_quotes~EFI_open_reading~ES_both_cfe_studentlevel_random0",
#     run_id=1,
# )

In [None]:
irt_df

In [None]:
logs = read_pickle("./output/roleplay_dbekt22_test_20250924-090854/qwen3:14b~T_1.0~SO_teacher_bool_nocontext~L_5~SP_teacher_dbe_miscons_level_context~SS_proficiency_5_str~EFQ_quotes~EFI_quotes~ES_both_dbe_studentlevel_kc_exact3/run_1.pickle")

In [None]:
logs

In [None]:
import matplotlib.pyplot as plt

# Scatter of predicted vs. true difficulty for a single run.
# NOTE(review): `preds` is only defined once one of the commented-out
# `get_preds(...)` calls has been executed; otherwise this cell raises a
# NameError.
fig, ax = plt.subplots(figsize=(6, 6))
# ax.plot([-5, -5], [5, 5], linestyle='--', color='gray')
# reference line with slope 1 through (1, 1)
ax.axline((1, 1), slope=1)
ax.set_xlabel('True Difficulty', fontsize=14)
ax.set_ylabel('Predicted Difficulty', fontsize=14)
# ax.set_xlim(-5, 5)
# ax.set_title('Predicted vs True Difficulty', fontsize=16)
ax.scatter(x=preds["y_true"], y=preds["y_pred"])

In [None]:
plt.hist(preds["y_true"], bins=10, alpha=0.5, label='True Difficulty')

In [None]:
# seaborn is imported here because this cell runs before the later cell that imports it
import seaborn as sns

# use the split-specific column name so validation experiments work as well
sns.scatterplot(data=df_results_nocontext, x=f"{SPLIT} RMSE", y="llm correctness", hue="model")

## Scatter plots

In [None]:
# use the split-specific column name so validation experiments work as well
df_results_nocontext[["model", f"{SPLIT} monotonicity", "llm correctness"]]

In [None]:
import seaborn as sns

# use the split-specific column name so validation experiments work as well
sns.scatterplot(data=df_results_nocontext, x=f"{SPLIT} RMSE", y="llm correctness", hue="model")

## Additional: LLM question answering

In [None]:
# for config_id in config_ids:
#     logger.info(f"Plotting student level performance", config_id=config_id)
#     plot_student_level_performance(
#         exp_name=EXP_NAME,
#         config_id=config_id,
#         metric="val_accuracy",
#         **legend_kwargs,
#         save=False,
#     )

# Student levels

In [None]:
from typing import Optional
import os
import pandas as pd
import matplotlib.pyplot as plt
from tools.utils import ensure_dir


def _plot_level_correctness_roleplay(
    df_results: pd.DataFrame,
    config_id: str = None,
    save: bool = False,
    savefig_kwargs: Optional[dict] = None,
):
    """Plot LLM correctness per level.

    Parameters
    ----------
    df_results : pd.DataFrame
        DataFrame containing the results to plot. Must have a ``config_id``
        column and a column whose name ends in ``_llm_group_correctness``.
    config_id : str, optional
        Configuration ID to plot, by default None. When None, the first
        configuration in ``df_results`` is plotted.
    save : bool, optional
        Whether to save the plot, by default False
    savefig_kwargs : Optional[dict], optional
        Dictionary with save arguments (must contain ``"fname"`` when
        ``save`` is True), by default None

    Raises
    ------
    ValueError
        If ``save`` is True but ``savefig_kwargs`` has no ``"fname"`` entry.
    """
    if save and not (savefig_kwargs and "fname" in savefig_kwargs):
        raise ValueError("savefig_kwargs with an 'fname' entry is required when save=True")

    # Select the requested configuration. The previous version always used the
    # first entry of the notebook-level `config_ids`, so every call plotted the
    # same configuration regardless of `config_id`.
    selected_id = config_id if config_id is not None else df_results["config_id"].iloc[0]
    llm_group_correctness = (
        df_results[df_results["config_id"] == selected_id]
        .filter(regex=".*_llm_group_correctness")
        .iloc[0, 0]
    )
    df_llm = pd.DataFrame({
        "student_level_group": range(1, len(llm_group_correctness) + 1),
        "llm_correct": llm_group_correctness
    }).set_index("student_level_group")
    print(df_llm)

    _, ax = plt.subplots()
    # plot the Series rather than the DataFrame: DataFrame.plot labels lines
    # by column name and ignores `label`, so "LLM" never reached the legend
    df_llm["llm_correct"].plot(kind="line", ax=ax, label="LLM")
    ax.set(
        xlabel="Student levels",
        ylabel="MCQ correctness",
    )
    ax.set_ylim(-0.05, 1.05)
    ax.set_title((None if save else config_id), fontsize=9)
    ax.legend(loc="upper left", fontsize=9)
    ax.grid(True, linestyle="--", alpha=0.7)
    if save:
        plt.tight_layout()
        # a bare file name has no directory part; only create one if present
        out_dir = os.path.dirname(savefig_kwargs["fname"])
        if out_dir:
            ensure_dir(out_dir)
        plt.savefig(**savefig_kwargs)
    plt.show()

In [None]:
# # all configs
# for config_id in config_ids:
#     _plot_level_correctness_roleplay(
#         df_results=df_results,
#         config_id=config_id,
#     )