In [26]:
import pandas as pd
from analysis.diversity import gini_index, compute_normalized_entropy
import ast

In [27]:
# Load the olmo-7b success-ablation results and collect the distinct query ids
# they contain; get_es_df uses this list as its default query filter.
df_olmo = pd.read_csv(
    "../data/collab-rec-2026/analysis/ablation/olmo-7b_aggressive_5_rounds_ablated_success.csv"
)
olmo_ids = pd.unique(df_olmo["query_id"]).tolist()

In [32]:
def get_es_df(df, olmo_ids = olmo_ids, max_rounds=5):
    """Return the moderator rows of each query's best ("early stopping") round.

    Args:
        df: DataFrame with at least the columns ``query_id``, ``agent_name``,
            ``round_nr`` and ``success_score``.
        olmo_ids: Query ids to keep; rows with any other ``query_id`` are dropped.
        max_rounds: Highest round number (inclusive) considered when picking the
            best round. Candidate rounds are 2..max_rounds.

    Returns:
        DataFrame holding, per query, the moderator row(s) whose ``round_nr``
        equals the round in which the moderator reached its highest
        ``success_score``. An extra ``best_round`` column carries that round.
    """
    subset = df.loc[df["query_id"].isin(olmo_ids)]

    # Candidate rows: moderator entries from round 2 up to max_rounds.
    is_moderator = subset["agent_name"] == "moderator"
    in_round_window = subset["round_nr"].between(2, max_rounds)
    candidates = subset[is_moderator & in_round_window]

    # idxmax yields, per query, the index label of the first row with the
    # highest success_score; look up the round number stored on that row.
    top_labels = candidates.groupby("query_id")["success_score"].idxmax()
    best_round = (
        candidates.loc[top_labels]
        .set_index("query_id")["round_nr"]
        .rename("best_round")
    )

    # Attach the best round to every row of the filtered frame, then keep only
    # the moderator rows that belong to that round.
    joined = subset.merge(best_round, left_on="query_id", right_index=True, how="inner")
    at_best_round = joined["round_nr"] == joined["best_round"]
    return joined[at_best_round & (joined["agent_name"] == "moderator")]

In [33]:
def compute_sig_test(model, round, ablation_component):
    """Test whether an ablated run differs significantly from the original run.

    Loads the ablated and the original score CSVs for ``model``, reduces each
    to its best-round moderator rows with ``get_es_df``, pairs the two runs on
    ``query_id`` and calls ``sig_test`` once for ``success_score`` and once for
    ``reliability_score``.

    Args:
        model: Model name as it appears in both CSV file names.
        round: Number of rounds; part of the ablated file name and passed to
            ``get_es_df`` as ``max_rounds``.
        ablation_component: Ablated component as it appears in the ablated
            file name.

    Returns:
        None. The verdicts are printed by ``sig_test``.
    """
    file_name = f"../data/collab-rec-2026/analysis/ablation/{model}_aggressive_{round}_rounds_ablated_{ablation_component}.csv"
    df_es = get_es_df(pd.read_csv(file_name), max_rounds=round)

    file_name_og = f"../data/collab-rec-2026/analysis/{model}_mami_aggressive_scores.csv"
    df_es_og = get_es_df(pd.read_csv(file_name_og), max_rounds=round)

    col_names = ["success_score", "reliability_score"]
    key_and_scores = ["query_id", *col_names]

    # A paired test compares the i-th value of one sample with the i-th value
    # of the other, so both runs must be aligned on the query they belong to.
    # Relying on the row order of two independently loaded files is not safe.
    # Duplicate query ids are reduced to their first row so that the merge
    # cannot multiply rows.
    paired = (
        df_es[key_and_scores]
        .drop_duplicates(subset="query_id")
        .merge(
            df_es_og[key_and_scores].drop_duplicates(subset="query_id"),
            on="query_id",
            how="inner",
            suffixes=("_ablated", "_og"),
        )
    )

    # Make it visible when the two runs do not cover the same queries.
    unmatched = set(df_es["query_id"]) ^ set(df_es_og["query_id"])
    if unmatched:
        print(f"\tWarning: {len(unmatched)} query_id(s) present in only one run were skipped")

    for col in col_names:
        print(f"\tSignificance test for {col} column:")
        sig_test(paired[f"{col}_ablated"].tolist(), paired[f"{col}_og"].tolist())

In [34]:
import numpy as np
from scipy import stats

def sig_test(a1, a2, alpha=0.05, normality_alpha=0.05):
    """Run a paired significance test on two samples and print the verdict.

    The paired differences are checked for normality with Shapiro-Wilk. If
    normality is not rejected a paired t-test is used, otherwise the Wilcoxon
    signed-rank test.

    Args:
        a1: First sample (sequence of numbers).
        a2: Second sample, paired element-wise with ``a1``.
        alpha: Significance level for the final test.
        normality_alpha: Significance level for the Shapiro-Wilk normality
            check.

    Returns:
        tuple: ``(test_name, p_value)``. ``test_name`` is ``None`` and
        ``p_value`` is NaN when fewer than three valid pairs are available.

    Raises:
        ValueError: If ``a1`` and ``a2`` do not have the same shape.
    """
    x = np.asarray(a1, dtype=float)
    y = np.asarray(a2, dtype=float)
    if x.shape != y.shape:
        raise ValueError(
            f"Paired samples must have the same shape, got {x.shape} and {y.shape}"
        )

    # Drop pairs with a missing value; otherwise a single NaN turns the
    # p-value into NaN, which would be reported as "not significant".
    valid = ~(np.isnan(x) | np.isnan(y))
    x, y = x[valid], y[valid]
    diff = x - y

    # Shapiro-Wilk needs at least three observations.
    if diff.size < 3:
        print(f"\t\tResult: Not enough paired samples (n = {diff.size}) to run a significance test")
        return None, float("nan")

    if not np.any(diff):
        # Identical samples: there is no difference to test, and both the
        # normality check and the tests themselves are degenerate here.
        test_name = "no test (identical samples)"
        p_value = 1.0
    else:
        # Check the differences for normality (Shapiro-Wilk test)
        _, p_norm = stats.shapiro(diff)
        if p_norm > normality_alpha:
            # Differences look normal -> paired t-test
            test_name = "Paired T-test"
            _, p_value = stats.ttest_rel(x, y)
        else:
            # Differences are not normal -> Wilcoxon signed-rank test
            test_name = "Wilcoxon Signed-Rank Test"
            _, p_value = stats.wilcoxon(x, y)

    p_value = float(p_value)

    # Determine significance
    if p_value < alpha:
        print(f"\t\tResult: Statistically Significant (p < {alpha}) - Reject Null Hypothesis using {test_name}; p = {p_value:.4f}")
    else:
        print(f"\t\tResult: NOT Statistically Significant (p >= {alpha})")

    return test_name, p_value

In [37]:
for model in ["gemini"]:
    # olmo-7b uses the 5-round result files, every other model the 10-round ones.
    # The variable is deliberately not called `round`: a top-level assignment
    # to that name would shadow the builtin round() for all later cells.
    n_rounds = 5 if model == "olmo-7b" else 10
    for component in ["success", "reliability", "hallucination", "rank"]:
        print(f"{model} METRICS FOR ABLATION {component}:")
        compute_sig_test(model, n_rounds, component)

        print("\n")

	Significance test for success_score column:
		Result: NOT Statistically Significant (p >= 0.05)
	Significance test for reliability_score column:
		Result: NOT Statistically Significant (p >= 0.05)


