In [1]:
# Maps the model identifier stored in each results file to the display name
# used in the exported tables.
id_to_model = {
    "openai/gpt-oss-120b": "GPT-OSS-120b",
    "gpt-5-mini-2025-08-07": "GPT-5-mini",
    "gemini-2.5-flash": "gemini-2.5-flash",
    "openai/gpt-oss-20b": "GPT-OSS-20b",
    "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8": "Qwen3-235B-A22B-Instruct-2507",
    "gpt-5-nano-2025-08-07": "GPT-5-Nano",
    "Qwen/Qwen3-30B-A3B-Instruct-2507": "Qwen3-30B-A3B-Instruct-2507",
    "mistralai/Mistral-Small-3.2-24B-Instruct-2506": "Mistral-Small-3.2-24B-Instruct-2506",
    "gemini-2.5-flash-lite": "gemini-2.5-flash-lite",
    "gpt-4.1-mini-2025-04-14": "GPT-4.1-mini",
    "google/gemma-3-27b-it": "gemma-3-27b-it",
    "mistralai/Mistral-Large-Instruct-2411": "Mistral-Large-Instruct-2411",
    "google/gemma-3-12b-it": "gemma-3-12b-it",
    "gpt-4.1-nano-2025-04-14": "GPT-4.1-Nano",
    "Qwen/Qwen3-4B-Instruct-2507": "Qwen3-4B-Instruct-2507",
    "RedHatAI/Llama-3.3-70B-Instruct-quantized.w8a8": "Llama-3.3-70B-Instruct",
    "google/gemma-3-4b-it": "gemma-3-4b-it",
    "mistralai/Ministral-8B-Instruct-2410": "Ministral-8B-Instruct-2410",
    "meta-llama/Llama-3.1-8B-Instruct": "Llama-3.1-8B-Instruct",
    "google/gemma-3-1b-it": "gemma-3-1b-it",
    "meta-llama/Llama-3.2-3B-Instruct": "Llama-3.2-3B-Instruct",
    "meta-llama/Llama-3.2-1B-Instruct": "Llama-3.2-1B-Instruct",
    "google/gemma-3-270m-it": "gemma-3-270m-it",
}

# Row order of the exported significance tables (display names).
model_order = [
    "GPT-5-Nano",
    "GPT-5-mini",
    "GPT-4.1-Nano",
    "GPT-4.1-mini",
    "GPT-OSS-120b",
    "GPT-OSS-20b",
    "Llama-3.3-70B-Instruct",
    "Llama-3.1-8B-Instruct",
    "Llama-3.2-3B-Instruct",
    "Llama-3.2-1B-Instruct",
    "gemini-2.5-flash",
    "gemini-2.5-flash-lite",
    "gemma-3-27b-it",
    "gemma-3-12b-it",
    "gemma-3-4b-it",
    "gemma-3-1b-it",
    "gemma-3-270m-it",
    "Mistral-Large-Instruct-2411",
    "Mistral-Small-3.2-24B-Instruct-2506",
    "Ministral-8B-Instruct-2410",
    "Qwen3-235B-A22B-Instruct-2507",
    "Qwen3-30B-A3B-Instruct-2507",
    "Qwen3-4B-Instruct-2507",
]

# The tables below are built with .map(id_to_model) followed by
# .reindex(model_order): a display name missing from model_order is silently
# dropped, and an entry of model_order without a mapping becomes an all-NaN
# row. Fail fast here if the two tables ever drift apart.
assert len(set(model_order)) == len(model_order), "model_order contains duplicate display names"
assert set(id_to_model.values()) == set(model_order), (
    "id_to_model and model_order disagree on: "
    f"{sorted(set(id_to_model.values()) ^ set(model_order))}"
)


In [2]:
def get_significance_level(p_value):
    """Translate a p-value into the conventional star notation.

    Returns "***" for values below 0.001, "**" below 0.01, "*" below 0.05
    and an empty string for any value that is not below one of these bounds.
    """
    # Ordered from strictest to loosest so the first matching bound wins.
    star_thresholds = ((0.001, "***"), (0.01, "**"), (0.05, "*"))
    for upper_bound, stars in star_thresholds:
        if p_value < upper_bound:
            return stars
    return ""

# Lexical Perturbations
## MMLU

In [3]:
import pandas as pd
import json
import os

# Find the MMLU items whose question or answer choices differ between the
# original dataset and its lexical (synonym) perturbation.
mmlu_data = "../../data/mmlu"

original_path = os.path.join(mmlu_data, "original.json")
with open(original_path) as f:
    original_data = json.load(f)
original_data_df = pd.DataFrame(original_data["data"])

lexical_path = os.path.join(mmlu_data, "lexical", "llm_synonym_perturbation.json")
with open(lexical_path) as f:
    lexical_data = json.load(f)
lexical_data_df = pd.DataFrame(lexical_data["data"])

# An item counts as changed as soon as one of these fields differs.
compared_fields = ("question", "choices")
changed_rows = []
for index, row in lexical_data_df.iterrows():
    # The row label of the perturbed frame is used as a position in the original frame.
    reference_row = original_data_df.iloc[index]
    if any(row[field] != reference_row[field] for field in compared_fields):
        changed_rows.append(index)

In [4]:
from statsmodels.stats.contingency_tables import mcnemar

# Execute McNemar's test for paired nominal data for each model.
# Only the items listed in changed_rows (those the perturbation actually
# altered) take part in the comparison.
model_names = []
p_values = []
test_statistics = []
results_dir = "../../results"
for model in os.listdir(results_dir):
    if os.path.isdir(os.path.join(results_dir, model)):
        with open(os.path.join(results_dir, model, "mmlu", "original.json")) as f:
            original_results = json.load(f)
        # .copy() gives an independent frame, so the column added below is not
        # written onto an .iloc selection (pandas' SettingWithCopyWarning case).
        original_results_df = pd.DataFrame(original_results["predictions"]).iloc[changed_rows].copy()
        original_results_df["correct_original"] = original_results_df["prediction"] == original_results_df["answer"]

        with open(os.path.join(results_dir, model, "mmlu", "lexical", "llm_synonym.json")) as f:
            lexical_results = json.load(f)
        lexical_results_df = pd.DataFrame(lexical_results["predictions"]).iloc[changed_rows].copy()
        lexical_results_df["correct_lexical"] = lexical_results_df["prediction"] == lexical_results_df["answer"]

        # Contingency table (rows: original correct?, columns: perturbed correct?).
        # crosstab only emits the categories it observes, so a model that is
        # always right or always wrong under one condition would yield a
        # non-2x2 table; reindexing forces the full layout with zero counts.
        contingency_table = pd.crosstab(
            original_results_df["correct_original"], lexical_results_df["correct_lexical"]
        ).reindex(index=[False, True], columns=[False, True], fill_value=0)

        # McNemar's test. Its chi-square statistic is normalised by the number
        # of discordant pairs; with none the statistic is undefined, so report
        # "no evidence of a difference" instead of dividing by zero.
        discordant_pairs = contingency_table.iat[0, 1] + contingency_table.iat[1, 0]
        if discordant_pairs == 0:
            p_value, test_statistic = 1.0, 0.0
        else:
            test_result = mcnemar(contingency_table, exact=False, correction=True)
            p_value, test_statistic = test_result.pvalue, test_result.statistic

        # Append together so the three result lists always stay aligned.
        model_names.append(original_results["model"])
        p_values.append(p_value)
        test_statistics.append(test_statistic)

mmlu_lexical_significance_df = pd.DataFrame({
    "model": model_names,
    "p_value": p_values,
    "test_statistic": test_statistics
})
# Switch to display names and impose the reporting order before exporting.
mmlu_lexical_significance_df["model"] = mmlu_lexical_significance_df["model"].map(id_to_model)
mmlu_lexical_significance_df = mmlu_lexical_significance_df.set_index("model").reindex(model_order).reset_index()
mmlu_lexical_significance_df["significance_level"] = mmlu_lexical_significance_df["p_value"].apply(get_significance_level)
mmlu_lexical_significance_df.to_csv("../../data/result_tables/mmlu_lexical_significance.csv")


## SQuAD

In [5]:
from datasets import load_dataset


# Find the SQuAD samples whose question or context differ between the
# original subset and its lexical (synonym) perturbation.
squad_data = "../../data/squad"
# Validation split, shuffled with seed 77 and cut to the first 1000 samples.
squad_dataset = (
    load_dataset("rajpurkar/squad", split="validation")
    .shuffle(seed=77)
    .select(range(1000))
    .to_pandas()
)
original_data_df = squad_dataset

lexical_path = os.path.join(squad_data, "lexical", "llm_synonym_perturbation.json")
with open(lexical_path) as f:
    lexical_data = json.load(f)
lexical_data_df = pd.DataFrame(lexical_data["data"])

# A sample counts as changed as soon as one of these fields differs.
compared_fields = ("question", "context")
changed_rows = []
for index, row in lexical_data_df.iterrows():
    # The row label of the perturbed frame is used as a position in the original frame.
    reference_row = original_data_df.iloc[index]
    if any(row[field] != reference_row[field] for field in compared_fields):
        changed_rows.append(index)

In [6]:
import evaluate
from tqdm.notebook import tqdm

# Initialize evaluators
squad_evaluator = evaluate.load("squad")

# Execute McNemar's test for paired nominal data for each model on exact match metric.
# Only the samples listed in changed_rows take part in the comparison.
model_names = []
p_values = []
test_statistics = []
results_dir = "../../results"
for model in tqdm(os.listdir(results_dir), desc="Processing models"):
    if os.path.isdir(os.path.join(results_dir, model)):
        with open(os.path.join(results_dir, model, "squad", "original.json")) as f:
            original_results = json.load(f)
        with open(os.path.join(results_dir, model, "squad", "lexical", "llm_synonym.json")) as f:
            lexical_results = json.load(f)

        # .copy() gives independent frames, so the columns added below are not
        # written onto an .iloc selection (pandas' SettingWithCopyWarning case).
        original_results_df = pd.DataFrame(original_results["predictions"]).iloc[changed_rows].copy()
        lexical_results_df = pd.DataFrame(lexical_results["predictions"]).iloc[changed_rows].copy()

        # Score both conditions with the same loop. Every prediction is
        # evaluated on its own, and it counts as correct when the evaluator
        # reports an exact_match of 100 for that single item.
        for flag_column, predictions_df in (
            ("correct_original", original_results_df),
            ("correct_lexical", lexical_results_df),
        ):
            exact_match_flags = []
            for idx, row in predictions_df.iterrows():
                squad_eval = squad_evaluator.compute(
                    predictions=[{"id": str(idx), "prediction_text": row["prediction"]}],
                    # answer_start is filled with zeros, one per reference answer.
                    references=[{"id": str(idx), "answers": {"text": row["answers"], "answer_start": [0] * len(row["answers"])}}]
                )
                exact_match_flags.append(int(squad_eval["exact_match"]) == 100)
            predictions_df[flag_column] = exact_match_flags

        # Contingency table (rows: original correct?, columns: perturbed correct?).
        # crosstab only emits the categories it observes, so a model that is
        # always right or always wrong under one condition would yield a
        # non-2x2 table; reindexing forces the full layout with zero counts.
        contingency_table = pd.crosstab(
            original_results_df["correct_original"], lexical_results_df["correct_lexical"]
        ).reindex(index=[False, True], columns=[False, True], fill_value=0)

        # McNemar's test. Its chi-square statistic is normalised by the number
        # of discordant pairs; with none the statistic is undefined, so report
        # "no evidence of a difference" instead of dividing by zero.
        discordant_pairs = contingency_table.iat[0, 1] + contingency_table.iat[1, 0]
        if discordant_pairs == 0:
            p_value, test_statistic = 1.0, 0.0
        else:
            test_result = mcnemar(contingency_table, exact=False, correction=True)
            p_value, test_statistic = test_result.pvalue, test_result.statistic

        # Append together so the three result lists always stay aligned.
        model_names.append(original_results["model"])
        p_values.append(p_value)
        test_statistics.append(test_statistic)

squad_lexical_significance_df = pd.DataFrame({
    "model": model_names,
    "p_value": p_values,
    "test_statistic": test_statistics
})
# Switch to display names and impose the reporting order before exporting.
squad_lexical_significance_df["model"] = squad_lexical_significance_df["model"].map(id_to_model)
squad_lexical_significance_df = squad_lexical_significance_df.set_index("model").reindex(model_order).reset_index()
squad_lexical_significance_df["significance_level"] = squad_lexical_significance_df["p_value"].apply(get_significance_level)
squad_lexical_significance_df.to_csv("../../data/result_tables/squad_lexical_significance.csv")


Processing models:   0%|          | 0/24 [00:00<?, ?it/s]

## AMEGA

In [7]:
from itertools import chain

# Execute McNemar's test for paired nominal data for each model on criterion satisfaction.
# Each criterion contributes its possible score as a weight, so the table
# holds weighted totals rather than plain counts.
model_names = []
p_values = []
test_statistics = []
results_dir = "../../results"
criteria_file = "../../AMEGA-benchmark/data/criteria.csv"
criteria_df = pd.read_csv(criteria_file, sep=";", decimal=",")
criteria_df["criteria_score_possible"] = criteria_df["criteria_score_possible"].astype(float)
# Loop-invariant: the weights are the same for every model.
criteria_weights = criteria_df["criteria_score_possible"].values
for model in os.listdir(results_dir):
    if os.path.isdir(os.path.join(results_dir, model)):
        with open(os.path.join(results_dir, model, "amega", "original_v3.json")) as f:
            original_results = json.load(f)
        original_results_df = pd.DataFrame(original_results["predictions"])

        # Flatten the per-prediction vote lists into one flat sequence of flags.
        correct_original = list(chain.from_iterable(original_results_df["majority_vote"].values))

        with open(os.path.join(results_dir, model, "amega", "lexical", "llm_synonym_v3.json")) as f:
            lexical_results = json.load(f)
        lexical_results_df = pd.DataFrame(lexical_results["predictions"])

        correct_lexical = list(chain.from_iterable(lexical_results_df["majority_vote"].values))

        # zip() would silently truncate to the shorter list and the test would
        # then run on an incomplete set of pairs.
        if len(correct_original) != len(correct_lexical):
            raise ValueError(
                f"{model}: {len(correct_original)} original votes vs "
                f"{len(correct_lexical)} perturbed votes; the results cannot be paired"
            )

        # Contingency table: first index 0 = original satisfied, 1 = not;
        # second index likewise for the perturbed run.
        # NOTE(review): assumes the rows of criteria.csv follow the same order as
        # the flattened votes and cover all of them (zip stops at the shortest
        # input) - confirm against the benchmark files.
        contingency_table = [[0, 0], [0, 0]]
        for original, lexical, weight in zip(correct_original, correct_lexical, criteria_weights):
            contingency_table[int(not original)][int(not lexical)] += weight

        # McNemar's test. Its chi-square statistic is normalised by the
        # discordant total; with none the statistic is undefined, so report
        # "no evidence of a difference" instead of dividing by zero.
        discordant_weight = contingency_table[0][1] + contingency_table[1][0]
        if discordant_weight == 0:
            p_value, test_statistic = 1.0, 0.0
        else:
            test_result = mcnemar(contingency_table, exact=False, correction=True)
            p_value, test_statistic = test_result.pvalue, test_result.statistic

        # Append together so the three result lists always stay aligned.
        model_names.append(original_results["model"])
        p_values.append(p_value)
        test_statistics.append(test_statistic)

amega_lexical_significance_df = pd.DataFrame({
    "model": model_names,
    "p_value": p_values,
    "test_statistic": test_statistics
})
# Switch to display names and impose the reporting order before exporting.
amega_lexical_significance_df["model"] = amega_lexical_significance_df["model"].map(id_to_model)
amega_lexical_significance_df = amega_lexical_significance_df.set_index("model").reindex(model_order).reset_index()
amega_lexical_significance_df["significance_level"] = amega_lexical_significance_df["p_value"].apply(get_significance_level)
amega_lexical_significance_df.to_csv("../../data/result_tables/amega_lexical_significance.csv")


# Syntactic Perturbations
## MMLU

In [8]:
import pandas as pd
import json
import os

# Find the MMLU items whose question or answer choices differ between the
# original dataset and its syntactic perturbation.
mmlu_data = "../../data/mmlu"

original_path = os.path.join(mmlu_data, "original.json")
with open(original_path) as f:
    original_data = json.load(f)
original_data_df = pd.DataFrame(original_data["data"])

syntactic_path = os.path.join(mmlu_data, "syntactic", "syntactic_perturbation.json")
with open(syntactic_path) as f:
    syntactic_data = json.load(f)
syntactic_data_df = pd.DataFrame(syntactic_data["data"])

# An item counts as changed as soon as one of these fields differs.
compared_fields = ("question", "choices")
changed_rows = []
for index, row in syntactic_data_df.iterrows():
    # The row label of the perturbed frame is used as a position in the original frame.
    reference_row = original_data_df.iloc[index]
    if any(row[field] != reference_row[field] for field in compared_fields):
        changed_rows.append(index)

In [9]:
# Execute McNemar's test for paired nominal data for each model.
# Only the items listed in changed_rows (those the perturbation actually
# altered) take part in the comparison.
model_names = []
p_values = []
test_statistics = []
results_dir = "../../results"
for model in os.listdir(results_dir):
    if os.path.isdir(os.path.join(results_dir, model)):
        with open(os.path.join(results_dir, model, "mmlu", "original.json")) as f:
            original_results = json.load(f)
        # .copy() gives an independent frame, so the column added below is not
        # written onto an .iloc selection (pandas' SettingWithCopyWarning case).
        original_results_df = pd.DataFrame(original_results["predictions"]).iloc[changed_rows].copy()
        original_results_df["correct_original"] = original_results_df["prediction"] == original_results_df["answer"]

        with open(os.path.join(results_dir, model, "mmlu", "syntactic", "syntactic.json")) as f:
            syntactic_results = json.load(f)
        syntactic_results_df = pd.DataFrame(syntactic_results["predictions"]).iloc[changed_rows].copy()
        syntactic_results_df["correct_syntactic"] = syntactic_results_df["prediction"] == syntactic_results_df["answer"]

        # Contingency table (rows: original correct?, columns: perturbed correct?).
        # crosstab only emits the categories it observes, so a model that is
        # always right or always wrong under one condition would yield a
        # non-2x2 table; reindexing forces the full layout with zero counts.
        contingency_table = pd.crosstab(
            original_results_df["correct_original"], syntactic_results_df["correct_syntactic"]
        ).reindex(index=[False, True], columns=[False, True], fill_value=0)

        # McNemar's test. Its chi-square statistic is normalised by the number
        # of discordant pairs; with none the statistic is undefined, so report
        # "no evidence of a difference" instead of dividing by zero.
        discordant_pairs = contingency_table.iat[0, 1] + contingency_table.iat[1, 0]
        if discordant_pairs == 0:
            p_value, test_statistic = 1.0, 0.0
        else:
            test_result = mcnemar(contingency_table, exact=False, correction=True)
            p_value, test_statistic = test_result.pvalue, test_result.statistic

        # Append together so the three result lists always stay aligned.
        model_names.append(original_results["model"])
        p_values.append(p_value)
        test_statistics.append(test_statistic)

mmlu_syntactic_significance_df = pd.DataFrame({
    "model": model_names,
    "p_value": p_values,
    "test_statistic": test_statistics
})
# Switch to display names and impose the reporting order before exporting.
mmlu_syntactic_significance_df["model"] = mmlu_syntactic_significance_df["model"].map(id_to_model)
mmlu_syntactic_significance_df = mmlu_syntactic_significance_df.set_index("model").reindex(model_order).reset_index()
mmlu_syntactic_significance_df["significance_level"] = mmlu_syntactic_significance_df["p_value"].apply(get_significance_level)
mmlu_syntactic_significance_df.to_csv("../../data/result_tables/mmlu_syntactic_significance.csv")


## SQuAD

In [10]:
# Find the SQuAD samples whose question or context differ between the
# original subset and its syntactic perturbation.
squad_data = "../../data/squad"
# Validation split, shuffled with seed 77 and cut to the first 1000 samples.
squad_dataset = (
    load_dataset("rajpurkar/squad", split="validation")
    .shuffle(seed=77)
    .select(range(1000))
    .to_pandas()
)
original_data_df = squad_dataset

syntactic_path = os.path.join(squad_data, "syntactic", "syntactic_perturbation.json")
with open(syntactic_path) as f:
    syntactic_data = json.load(f)
syntactic_data_df = pd.DataFrame(syntactic_data["data"])

# A sample counts as changed as soon as one of these fields differs.
compared_fields = ("question", "context")
changed_rows = []
for index, row in syntactic_data_df.iterrows():
    # The row label of the perturbed frame is used as a position in the original frame.
    reference_row = original_data_df.iloc[index]
    if any(row[field] != reference_row[field] for field in compared_fields):
        changed_rows.append(index)

In [11]:
# Execute McNemar's test for paired nominal data for each model on exact match metric.
# Only the samples listed in changed_rows take part in the comparison.
model_names = []
p_values = []
test_statistics = []
results_dir = "../../results"
for model in tqdm(os.listdir(results_dir), desc="Processing models"):
    if os.path.isdir(os.path.join(results_dir, model)):
        with open(os.path.join(results_dir, model, "squad", "original.json")) as f:
            original_results = json.load(f)
        with open(os.path.join(results_dir, model, "squad", "syntactic", "syntactic.json")) as f:
            syntactic_results = json.load(f)

        # .copy() gives independent frames, so the columns added below are not
        # written onto an .iloc selection (pandas' SettingWithCopyWarning case).
        original_results_df = pd.DataFrame(original_results["predictions"]).iloc[changed_rows].copy()
        syntactic_results_df = pd.DataFrame(syntactic_results["predictions"]).iloc[changed_rows].copy()

        # Score both conditions with the same loop. Every prediction is
        # evaluated on its own, and it counts as correct when the evaluator
        # reports an exact_match of 100 for that single item.
        for flag_column, predictions_df in (
            ("correct_original", original_results_df),
            ("correct_syntactic", syntactic_results_df),
        ):
            exact_match_flags = []
            for idx, row in predictions_df.iterrows():
                squad_eval = squad_evaluator.compute(
                    predictions=[{"id": str(idx), "prediction_text": row["prediction"]}],
                    # answer_start is filled with zeros, one per reference answer.
                    references=[{"id": str(idx), "answers": {"text": row["answers"], "answer_start": [0] * len(row["answers"])}}]
                )
                exact_match_flags.append(int(squad_eval["exact_match"]) == 100)
            predictions_df[flag_column] = exact_match_flags

        # Contingency table (rows: original correct?, columns: perturbed correct?).
        # crosstab only emits the categories it observes, so a model that is
        # always right or always wrong under one condition would yield a
        # non-2x2 table; reindexing forces the full layout with zero counts.
        contingency_table = pd.crosstab(
            original_results_df["correct_original"], syntactic_results_df["correct_syntactic"]
        ).reindex(index=[False, True], columns=[False, True], fill_value=0)

        # McNemar's test. Its chi-square statistic is normalised by the number
        # of discordant pairs; with none the statistic is undefined, so report
        # "no evidence of a difference" instead of dividing by zero.
        discordant_pairs = contingency_table.iat[0, 1] + contingency_table.iat[1, 0]
        if discordant_pairs == 0:
            p_value, test_statistic = 1.0, 0.0
        else:
            test_result = mcnemar(contingency_table, exact=False, correction=True)
            p_value, test_statistic = test_result.pvalue, test_result.statistic

        # Append together so the three result lists always stay aligned.
        model_names.append(original_results["model"])
        p_values.append(p_value)
        test_statistics.append(test_statistic)

squad_syntactic_significance_df = pd.DataFrame({
    "model": model_names,
    "p_value": p_values,
    "test_statistic": test_statistics
})
# Switch to display names and impose the reporting order before exporting.
squad_syntactic_significance_df["model"] = squad_syntactic_significance_df["model"].map(id_to_model)
squad_syntactic_significance_df = squad_syntactic_significance_df.set_index("model").reindex(model_order).reset_index()
squad_syntactic_significance_df["significance_level"] = squad_syntactic_significance_df["p_value"].apply(get_significance_level)
squad_syntactic_significance_df.to_csv("../../data/result_tables/squad_syntactic_significance.csv")


Processing models:   0%|          | 0/24 [00:00<?, ?it/s]

## AMEGA

In [12]:
# Execute McNemar's test for paired nominal data for each model on criterion satisfaction.
# Each criterion contributes its possible score as a weight, so the table
# holds weighted totals rather than plain counts.
model_names = []
p_values = []
test_statistics = []
results_dir = "../../results"
criteria_file = "../../AMEGA-benchmark/data/criteria.csv"
criteria_df = pd.read_csv(criteria_file, sep=";", decimal=",")
criteria_df["criteria_score_possible"] = criteria_df["criteria_score_possible"].astype(float)
# Loop-invariant: the weights are the same for every model.
criteria_weights = criteria_df["criteria_score_possible"].values
for model in os.listdir(results_dir):
    if os.path.isdir(os.path.join(results_dir, model)):
        with open(os.path.join(results_dir, model, "amega", "original_v3.json")) as f:
            original_results = json.load(f)
        original_results_df = pd.DataFrame(original_results["predictions"])
        # Flatten the per-prediction vote lists into one flat sequence of flags.
        correct_original = list(chain.from_iterable(original_results_df["majority_vote"].values))

        with open(os.path.join(results_dir, model, "amega", "syntactic", "syntactic_v3.json")) as f:
            syntactic_results = json.load(f)
        syntactic_results_df = pd.DataFrame(syntactic_results["predictions"])
        correct_syntactic = list(chain.from_iterable(syntactic_results_df["majority_vote"].values))

        # zip() would silently truncate to the shorter list and the test would
        # then run on an incomplete set of pairs.
        if len(correct_original) != len(correct_syntactic):
            raise ValueError(
                f"{model}: {len(correct_original)} original votes vs "
                f"{len(correct_syntactic)} perturbed votes; the results cannot be paired"
            )

        # Contingency table: first index 0 = original satisfied, 1 = not;
        # second index likewise for the perturbed run.
        # NOTE(review): assumes the rows of criteria.csv follow the same order as
        # the flattened votes and cover all of them (zip stops at the shortest
        # input) - confirm against the benchmark files.
        contingency_table = [[0, 0], [0, 0]]
        for original, syntactic, weight in zip(correct_original, correct_syntactic, criteria_weights):
            contingency_table[int(not original)][int(not syntactic)] += weight

        # McNemar's test. Its chi-square statistic is normalised by the
        # discordant total; with none the statistic is undefined, so report
        # "no evidence of a difference" instead of dividing by zero.
        discordant_weight = contingency_table[0][1] + contingency_table[1][0]
        if discordant_weight == 0:
            p_value, test_statistic = 1.0, 0.0
        else:
            test_result = mcnemar(contingency_table, exact=False, correction=True)
            p_value, test_statistic = test_result.pvalue, test_result.statistic

        # Append together so the three result lists always stay aligned.
        model_names.append(original_results["model"])
        p_values.append(p_value)
        test_statistics.append(test_statistic)

amega_syntactic_significance_df = pd.DataFrame({
    "model": model_names,
    "p_value": p_values,
    "test_statistic": test_statistics
})
# Switch to display names and impose the reporting order before exporting.
amega_syntactic_significance_df["model"] = amega_syntactic_significance_df["model"].map(id_to_model)
amega_syntactic_significance_df = amega_syntactic_significance_df.set_index("model").reindex(model_order).reset_index()
amega_syntactic_significance_df["significance_level"] = amega_syntactic_significance_df["p_value"].apply(get_significance_level)
amega_syntactic_significance_df.to_csv("../../data/result_tables/amega_syntactic_significance.csv")
