In [2]:
import pandas as pd
from analysis.diversity import gini_index, compute_normalized_entropy
import ast

In [25]:
# Load the OLMo-7B "success" ablation scores. The distinct query ids found in
# this file are collected into `olmo_ids`, which `get_es_df` uses as its
# default filter.
df_olmo = pd.read_csv(
    "../data/collab-rec-2026/analysis/ablation/"
    "olmo-7b_aggressive_5_rounds_ablated_success.csv"
)
olmo_ids = df_olmo["query_id"].unique().tolist()

In [26]:
def get_es_df(df, olmo_ids = olmo_ids):
    """Keep, per query, the moderator row(s) from its best round in rounds 2-5.

    Args:
        df: DataFrame with at least the columns ``query_id``, ``agent_name``,
            ``round_nr`` and ``success_score``.
        olmo_ids: Query ids to keep. The default is the notebook-level
            ``olmo_ids`` as it was when this cell was executed (default
            arguments are bound at definition time), so re-run this cell
            after changing ``olmo_ids``.

    Returns:
        The rows of ``df`` (restricted to ``olmo_ids``) whose ``agent_name``
        is ``"moderator"`` and whose ``round_nr`` equals the round in which
        the moderator reached its highest ``success_score`` within rounds 2-5
        of the same query. Ties go to the first such row, as ``idxmax`` does.
        Queries without a moderator row in that window contribute no rows.
    """
    df = df.loc[df["query_id"].isin(olmo_ids)]

    is_moderator = df["agent_name"] == "moderator"
    moderator_df = df[is_moderator & df["round_nr"].between(2, 5)]

    # Index label of the highest-scoring moderator row for every query.
    best_idx = moderator_df.groupby("query_id")["success_score"].idxmax()
    best_rounds = moderator_df.loc[best_idx]
    best_round_mapping = best_rounds.set_index("query_id")["round_nr"].to_dict()

    # Vectorised replacement for a row-wise ``df.apply(..., axis=1)``: query
    # ids missing from the mapping become NaN, which never equals a round
    # number, so those rows are dropped. This also yields a proper (empty)
    # boolean mask when ``df`` has no rows.
    at_best_round = df["round_nr"] == df["query_id"].map(best_round_mapping)

    return df.loc[at_best_round & is_moderator]

In [27]:
def compute_diversity_metrics(df):
    """Compute the Gini index and normalized entropy of the recommendations.

    Args:
        df: DataFrame with a ``candidates`` column. String entries are parsed
            with ``ast.literal_eval``; any other value (for example an
            already-parsed list) is passed through unchanged.

    Returns:
        ``(gini, entropy)``: the results of ``gini_index`` and
        ``compute_normalized_entropy`` called on the list of per-row
        candidates.

    Note:
        ``df`` is not modified. Parsing into a local list avoids writing to
        what is usually a slice of another frame (SettingWithCopyWarning) and
        keeps the function safe to call more than once on the same frame.
    """
    recommendations = [
        ast.literal_eval(cands) if isinstance(cands, str) else cands
        for cands in df["candidates"]
    ]
    gini = gini_index(recommendations)
    entropy = compute_normalized_entropy(recommendations)
    return gini, entropy

In [32]:
def compute_ablation_metrics(model, round, ablation_component):
    """Print early-stopping quality and diversity metrics for one ablation run.

    Reads the ablation CSV identified by ``model``, ``round`` and
    ``ablation_component``, reduces it with ``get_es_df`` and prints the row
    count, mean ``success_score``, Gini index, normalized entropy and mean
    ``reliability_score`` of the result.

    Args:
        model: Model name used in the CSV file name.
        round: Number of rounds used in the CSV file name (the parameter name
            shadows the builtin inside this function only).
        ablation_component: Name of the ablated component used in the CSV
            file name.

    Returns:
        None. All results are written to stdout.
    """
    csv_path = f"../data/collab-rec-2026/analysis/ablation/{model}_aggressive_{round}_rounds_ablated_{ablation_component}.csv"
    early_stop = get_es_df(pd.read_csv(csv_path))

    # The label says "queries", but the figure printed is the row count.
    row_count = len(early_stop)
    print(f"\tNumber of queries in early stopping dataset: {row_count}")
    mean_success = early_stop['success_score'].mean()
    print(f"\tAvg success score in early stopping dataset: {mean_success:.2f}")

    gini, entropy = compute_diversity_metrics(early_stop)
    print(f"\tGini Index: {gini:.2f}")
    normalized_entropy = entropy['normalized_entropy']
    print(f"\tNormalized Entropy: {normalized_entropy:.2f}")
    mean_reliability = early_stop['reliability_score'].mean()
    print(f"\tavg reliability score: {mean_reliability:.2f}")

In [40]:
def compute_og_metrics(model, round):
    """Print early-stopping quality and diversity metrics for the original run.

    Reads ``{model}_mami_aggressive_scores.csv`` from the analysis directory,
    reduces it with ``get_es_df`` and prints the row count, mean
    ``success_score``, Gini index, normalized entropy and mean
    ``reliability_score`` of the result under an "Original Metrics:" header.

    Args:
        model: Model name used in the CSV file name.
        round: Accepted for signature parity with the other metric helpers;
            it is not used by this function.

    Returns:
        None. All results are written to stdout.
    """
    scores_path = f"../data/collab-rec-2026/analysis/{model}_mami_aggressive_scores.csv"
    early_stop = get_es_df(pd.read_csv(scores_path))

    print("Original Metrics:")
    # The label says "queries", but the figure printed is the row count.
    row_count = len(early_stop)
    print(f"\tNumber of queries in early stopping dataset: {row_count}")
    mean_success = early_stop['success_score'].mean()
    print(f"\tAvg success score in early stopping dataset: {mean_success:.2f}")

    gini, entropy = compute_diversity_metrics(early_stop)
    print(f"\tGini Index: {gini:.2f}")
    normalized_entropy = entropy['normalized_entropy']
    print(f"\tNormalized Entropy: {normalized_entropy:.2f}")
    mean_reliability = early_stop['reliability_score'].mean()
    print(f"\tavg reliability score: {mean_reliability:.2f}")

In [41]:
# Number of rounds each model was run for; it is part of the ablation CSV
# file names. Dict order fixes the reporting order (gemini, then olmo-7b).
ROUNDS_PER_MODEL = {"gemini": 10, "olmo-7b": 5}
ABLATED_COMPONENTS = ["success", "reliability", "hallucination", "rank"]

# The loop variable is deliberately not called `round`: a notebook-level
# assignment to that name would shadow the builtin for every later cell.
for model, n_rounds in ROUNDS_PER_MODEL.items():
    compute_og_metrics(model, n_rounds)
    for component in ABLATED_COMPONENTS:
        print(f"{model} Metrics for ablation of {component}:")
        compute_ablation_metrics(model, n_rounds, component)

        print("\n")

	Number of queries in early stopping dataset: 150
	Avg success score in early stopping dataset: 0.62
	Gini Index: 0.54
	Normalized Entropy: 0.88
	avg reliability score: 0.48


olmo-7b Metrics for ablation of hallucination:
	Number of queries in early stopping dataset: 150
	Avg success score in early stopping dataset: 0.64
	Gini Index: 0.59
	Normalized Entropy: 0.86
	avg reliability score: 0.67


olmo-7b Metrics for ablation of rank:
	Number of queries in early stopping dataset: 150
	Avg success score in early stopping dataset: 0.63
	Gini Index: 0.60
	Normalized Entropy: 0.85
	avg reliability score: 0.67




In [48]:
import os
from analysis.get_scores import get_scores
def compute_temp_metrics(model, round, temp):
    """Print early-stopping metrics for a run at a given sampling temperature.

    Uses the scores CSV for ``(model, round, temp)`` when it already exists;
    otherwise calls ``get_scores`` on the matching raw JSON results, passing
    the CSV path as ``output_file``. The frame is reduced with ``get_es_df``
    and the row count, mean ``success_score``, Gini index, normalized
    entropy, mean ``hallucination_rate`` and mean ``reliability_score`` are
    printed.

    Args:
        model: Model name used in both file paths and passed to
            ``get_scores``.
        round: Number of rounds used in both file names (the parameter name
            shadows the builtin inside this function only).
        temp: Temperature value used in both file names.

    Returns:
        None. All results are written to stdout.
    """
    json_file_path = f"../data/collab-rec-2026/llm-results/{model}/mami/ablated/{model}_aggressive_{round}_rounds_temp_{temp}.json"

    file_name = f"../data/collab-rec-2026/analysis/ablation/{model}_aggressive_{round}_rounds_temp_{temp}.csv"
    if not os.path.exists(file_name):
        print(f"File {file_name} does not exist.")
        # NOTE(review): assumes get_scores returns the scored DataFrame and
        # also writes it to `output_file` - confirm against analysis.get_scores.
        df = get_scores(model_name=model, rejection_strategy= "aggressive", input_file=json_file_path, output_file=file_name)
    else:
        df = pd.read_csv(file_name)
    df_es = get_es_df(df)
    # The label says "queries", but the figure printed is the row count.
    print(f"\tNumber of queries in early stopping dataset: {len(df_es)}")
    print(f"\tAvg success score in early stopping dataset: {df_es['success_score'].mean():.2f}")
    gini , entropy = compute_diversity_metrics(df_es)
    print(f"\tGini Index: {gini:.2f}")
    print(f"\tNormalized Entropy: {entropy['normalized_entropy']:.2f}")
    # Tab-prefixed like every other metric line so the report stays aligned.
    print(f"\thallucination score: {df_es['hallucination_rate'].mean():.2f}")
    print(f"\tavg reliability score: {df_es['reliability_score'].mean():.2f}")

In [49]:
# Gemini, 10 rounds, sampling temperature 0.2.
compute_temp_metrics(model="gemini", round=10, temp=0.2)

	Number of queries in early stopping dataset: 150
	Avg success score in early stopping dataset: 0.64
	Gini Index: 0.57
	Normalized Entropy: 0.88
hallucination score: 0.00
	avg reliability score: 0.74


In [50]:
# Gemini, 10 rounds, sampling temperature 0.8.
compute_temp_metrics(model="gemini", round=10, temp=0.8)

Saved 36000 records to ../data/collab-rec-2026/analysis/ablation/gemini_aggressive_10_rounds_temp_0.8.csv

=== Summary Statistics ===
Total records: 36000
Unique queries: 900
Unique agents: 4
Agents: ['moderator', 'personalization', 'popularity', 'sustainability']
Round numbers: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10)]

=== Average Success Score by Agent ===
agent_name
moderator          0.614591
personalization    0.614100
popularity         0.613354
sustainability     0.613060
Name: success_score, dtype: float64

=== Average Success Score by Round ===
round_nr
1     0.615432
2     0.617495
3     0.615845
4     0.616412
5     0.614523
6     0.614542
7     0.611340
8     0.610836
9     0.611056
10    0.610279
Name: success_score, dtype: float64

=== Average Reliability Score by Agent (rounds 2+) ===
agent_name
popularity         0.874604
sustainability     0.874316
moderator          0.865111
per