In [4]:
import pandas as pd
import numpy as np

In [2]:
# Model identifiers; each one is interpolated into the result-file paths
# read by the loader functions in this notebook.
MODELS = [
    "claude",
    "gemini",
    "gpt",
    "gemma-12b",
    "olmo-7b",
    "gemma-4b",
]

# Rejection-strategy names; each one is part of the analysis CSV file name.
REJECTION_STRATEGIES = [
    "aggressive",
    "majority",
]

In [48]:
def load_data(model: str, rejection_strategy: str) -> pd.DataFrame:
    """Read the analysis CSV for one model / rejection-strategy combination.

    Args:
        model: Model identifier used as the file-name prefix.
        rejection_strategy: Strategy name used in the middle of the file name.

    Returns:
        The CSV contents as returned by ``pd.read_csv``.
    """
    analysis_dir = "../data/collab-rec-2026/analysis"
    csv_name = f"{model}_mami_{rejection_strategy}_scores_with_relevance.csv"
    return pd.read_csv(f"{analysis_dir}/{csv_name}")

In [53]:
def hall_rel_score(model: str, rejection_strategy: str) -> None:
    """Print mean hallucination rate and reliability score for one run.

    Loads the score table for ``model`` / ``rejection_strategy`` through
    ``load_data``, keeps only rows whose ``agent_name`` is ``personalization``,
    ``sustainability`` or ``popularity``, and prints:

    * the mean ``hallucination_rate`` of round 1 (labelled "MASI") and of
      round 10 (labelled "MAMI");
    * the mean ``reliability_score`` of round 10.

    Args:
        model: Model identifier forwarded to ``load_data``.
        rejection_strategy: Strategy name forwarded to ``load_data``.

    Returns:
        None. The results are only printed, not returned.
    """
    print(f"== {model} {rejection_strategy} ===")
    df = load_data(model, rejection_strategy)

    # Build the agent filter once; both round selections share it.
    is_agent = df["agent_name"].isin(["personalization", "sustainability", "popularity"])
    # NOTE(review): round 1 / round 10 are presumably the first and final
    # rounds of the run -- confirm against the experiment configuration.
    masi_df = df.loc[is_agent & (df["round_nr"] == 1)]
    mami_df = df.loc[is_agent & (df["round_nr"] == 10)]

    print(f"\t Hall. Rate:\n MASI: {np.mean(masi_df['hallucination_rate'])} \n MAMI: {np.mean(mami_df['hallucination_rate'])}")
    print(f"\t Rel. Score: MAMI: {np.mean(mami_df['reliability_score']):.4f}")

In [54]:
# Report hallucination / reliability numbers for every
# (rejection strategy, model) combination, strategy-major.
for rs in REJECTION_STRATEGIES:
    for model in MODELS:
        # Plain "model strategy" line; hall_rel_score also prints its own
        # "== model strategy ===" header before the numbers.
        print(model, rs)
        hall_rel_score(model, rs)

	 Hall. Rate:
 MASI: 0.0023703703703703686 
 MAMI: 0.0
	 Rel. Score: MAMI: 0.9613


In [71]:
from experiments.helpers import load_queries
from constants import CITIES

def get_hall_score(cities: list):
    """Return the fraction of ``cities`` that are not members of ``CITIES``.

    Args:
        cities: Recommended city names to check.

    Returns:
        A float in [0, 1] giving the share of entries missing from ``CITIES``,
        or ``None`` when ``cities`` is empty (or otherwise falsy).
    """
    if cities:
        # Entries that cannot be found in the reference collection.
        unknown = [name for name in cities if name not in CITIES]
        return len(unknown) / len(cities)
    return None

def hall_score_sasi(model):
    """Print the average hallucination score over a model's SASI results.

    Loads ``{model}_sasi.json`` through ``load_queries``. For each item, the
    first entry of its ``response`` list is inspected; its ``cities`` value
    (or ``candidates`` when ``cities`` is missing or empty) is scored with
    ``get_hall_score``. Items without a response or without any cities are
    skipped and do not count towards the average.

    Args:
        model: Model identifier used to build the results-file path.

    Returns:
        None. The average (0 when nothing was scored) is printed.
    """
    queries = load_queries(f"../data/collab-rec-2026/llm-results/{model}/sasi/{model}_sasi.json")

    score_sum = 0
    n_scored = 0

    for query in queries:
        replies = query.get('response', [])
        if replies:
            # Only the first response of each item is considered.
            first_reply = replies[0]
            recommended = first_reply.get('cities') or first_reply.get('candidates')
            if recommended:
                score_sum += get_hall_score(recommended)
                n_scored += 1

    if n_scored > 0:
        avg_hall_score = score_sum / n_scored
    else:
        avg_hall_score = 0

    print(f"avg hall score sasi: {model}: {avg_hall_score}")

In [73]:
# Print the average SASI hallucination score for every model in MODELS.
for model in MODELS:
    hall_score_sasi(model)

avg hall score sasi: olmo-7b: 0.04751930896292888
avg hall score sasi: gemma-4b: 0.00655555555555555


In [74]:
# Single-model run for "gemma-12b" (this model is also part of MODELS).
hall_score_sasi("gemma-12b")

avg hall score sasi: gemma-12b: 0.005555555555555553
