In [1]:
id_to_model = {
    "openai/gpt-oss-120b": "GPT-OSS-120b",
    "gpt-5-mini-2025-08-07": "GPT-5-mini",
    "gemini-2.5-flash": "gemini-2.5-flash",
    "openai/gpt-oss-20b": "GPT-OSS-20b",
    "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8": "Qwen3-235B-A22B-Instruct-2507",
    "gpt-5-nano-2025-08-07": "GPT-5-Nano",
    "Qwen/Qwen3-30B-A3B-Instruct-2507": "Qwen3-30B-A3B-Instruct-2507",
    "mistralai/Mistral-Small-3.2-24B-Instruct-2506": "Mistral-Small-3.2-24B-Instruct-2506",
    "gemini-2.5-flash-lite": "gemini-2.5-flash-lite",
    "gpt-4.1-mini-2025-04-14": "GPT-4.1-mini",
    "google/gemma-3-27b-it": "gemma-3-27b-it",
    "mistralai/Mistral-Large-Instruct-2411": "Mistral-Large-Instruct-2411",
    "google/gemma-3-12b-it": "gemma-3-12b-it",
    "gpt-4.1-nano-2025-04-14": "GPT-4.1-Nano",
    "Qwen/Qwen3-4B-Instruct-2507": "Qwen3-4B-Instruct-2507",
    "RedHatAI/Llama-3.3-70B-Instruct-quantized.w8a8": "Llama-3.3-70B-Instruct",
    "google/gemma-3-4b-it": "gemma-3-4b-it",
    "mistralai/Ministral-8B-Instruct-2410": "Ministral-8B-Instruct-2410",
    "meta-llama/Llama-3.1-8B-Instruct": "Llama-3.1-8B-Instruct",
    "google/gemma-3-1b-it": "gemma-3-1b-it",
    "meta-llama/Llama-3.2-3B-Instruct": "Llama-3.2-3B-Instruct",
    "meta-llama/Llama-3.2-1B-Instruct": "Llama-3.2-1B-Instruct",
    "google/gemma-3-270m-it": "gemma-3-270m-it",
}

model_order = [
    "GPT-5-Nano",
    "GPT-5-mini",
    "GPT-4.1-Nano",
    "GPT-4.1-mini",
    "GPT-OSS-120b",
    "GPT-OSS-20b",
    "Llama-3.3-70B-Instruct",
    "Llama-3.1-8B-Instruct",
    "Llama-3.2-3B-Instruct",
    "Llama-3.2-1B-Instruct",
    "gemini-2.5-flash",
    "gemini-2.5-flash-lite",
    "gemma-3-27b-it",
    "gemma-3-12b-it",
    "gemma-3-4b-it",
    "gemma-3-1b-it",
    "gemma-3-270m-it",
    "Mistral-Large-Instruct-2411",
    "Mistral-Small-3.2-24B-Instruct-2506",
    "Ministral-8B-Instruct-2410",
    "Qwen3-235B-A22B-Instruct-2507",
    "Qwen3-30B-A3B-Instruct-2507",
    "Qwen3-4B-Instruct-2507",
]


In [2]:
import json
from itertools import chain
from statistics import mean


def update_amega_metrics(amega_experiment_file, amega_criteria_df):
    """
    Compute per-case scores and mean score for AMEGA predictions and write them back to the JSON file.
    """
    # Read source JSON
    with open(amega_experiment_file, "r") as f:
        amega_data = json.load(f)
    majority_votes = amega_data["predictions"]["majority_vote"]

    # Compute case scores
    current_case_id = None
    current_case_score = 0.0
    case_scores = []

    for (_, criterion_row), criterion_met in zip(amega_criteria_df.iterrows(), chain.from_iterable(majority_votes)):
        row_case_id = criterion_row["case_id"]
        row_score_possible = criterion_row["criteria_score_possible"]

        if current_case_id is None:
            current_case_id = row_case_id

        # New case encountered: close out the previous one
        if row_case_id != current_case_id:
            case_scores.append(current_case_score)
            current_case_id = row_case_id
            current_case_score = 0.0

        if criterion_met:
            current_case_score += row_score_possible

    # Append the final case's score
    if current_case_id is not None:
        case_scores.append(current_case_score)

    mean_score = mean(case_scores)

    # Write back into JSON
    amega_data.setdefault("metrics", {})
    amega_data["metrics"]["case_scores"] = case_scores
    amega_data["metrics"]["mean_score"] = mean_score

    with open(amega_experiment_file, "w") as f:
        json.dump(amega_data, f)

# Lexical Perturbations
## MMLU

In [3]:
import os
import json

import pandas as pd

# Identify changed rows
mmlu_data_dir = "../../data/mmlu"

with open(os.path.join(mmlu_data_dir, "original.json")) as f:
    original_data = json.load(f)
original_data_df = pd.DataFrame(original_data["data"])

with open(os.path.join(mmlu_data_dir, "lexical", "llm_synonym_perturbation.json")) as f:
    lexical_data = json.load(f)
lexical_data_df = pd.DataFrame(lexical_data["data"])

changed_rows = []
for index, row in lexical_data_df.iterrows():
    if row["question"] != original_data_df.iloc[index]["question"] or row["choices"] != original_data_df.iloc[index]["choices"]:
        changed_rows.append(index)


# Extract results for each model
results_dir = "../../results"

model_names = []
original_scores = []
lexical_scores = []
score_differences = []
for model_dir in os.listdir(results_dir):
    if os.path.isdir(os.path.join(results_dir, model_dir)):
        with open(os.path.join(results_dir, model_dir, "mmlu", "original.json")) as f:
            original_results = json.load(f)
        model_names.append(original_results["model"])

        original_results_df = pd.DataFrame(original_results["predictions"]).iloc[changed_rows]
        original_results_df["correct"] = original_results_df["prediction"] == original_results_df["answer"]
        original_score = original_results_df["correct"].mean()
        original_scores.append(original_score)

        with open(os.path.join(results_dir, model_dir, "mmlu", "lexical", "llm_synonym.json")) as f:
            lexical_results = json.load(f)
        lexical_results_df = pd.DataFrame(lexical_results["predictions"]).iloc[changed_rows]
        lexical_results_df["correct"] = lexical_results_df["prediction"] == lexical_results_df["answer"]
        lexical_score = lexical_results_df["correct"].mean()
        lexical_scores.append(lexical_score)

        score_differences.append(original_score - lexical_score)

mmlu_df_lexical = pd.DataFrame({
    "model": model_names,
    "original": original_scores,
    "lexical": lexical_scores,
    "score_difference": score_differences
})
mmlu_df_lexical["model"] = mmlu_df_lexical["model"].map(id_to_model)
mmlu_df_lexical = mmlu_df_lexical.set_index("model").reindex(model_order).reset_index()
mmlu_df_lexical["rank_original"] = mmlu_df_lexical["original"].rank(method="min", ascending=False)
mmlu_df_lexical["rank_lexical"] = mmlu_df_lexical["lexical"].rank(method="min", ascending=False)
mmlu_df_lexical.to_csv("../../data/result_tables/mmlu_lexical.csv")

In [4]:
for idx, row in mmlu_df_lexical.iterrows():
    print(f"{row['model']} & {row['original']*100:.2f}\\% & {row['lexical']*100:.2f}\\% & {row['score_difference']*100:.2f}pp \\\\\\addlinespace[3pt]")

GPT-5-Nano & 69.62\% & 59.44\% & 10.18pp \\\addlinespace[3pt]
GPT-5-mini & 80.07\% & 70.78\% & 9.29pp \\\addlinespace[3pt]
GPT-4.1-Nano & 69.97\% & 62.16\% & 7.81pp \\\addlinespace[3pt]
GPT-4.1-mini & 80.76\% & 72.51\% & 8.25pp \\\addlinespace[3pt]
GPT-OSS-120b & 86.20\% & 76.48\% & 9.71pp \\\addlinespace[3pt]
GPT-OSS-20b & 81.46\% & 71.87\% & 9.59pp \\\addlinespace[3pt]
Llama-3.3-70B-Instruct & 80.49\% & 71.22\% & 9.27pp \\\addlinespace[3pt]
Llama-3.1-8B-Instruct & 62.86\% & 54.78\% & 8.08pp \\\addlinespace[3pt]
Llama-3.2-3B-Instruct & 58.05\% & 51.03\% & 7.02pp \\\addlinespace[3pt]
Llama-3.2-1B-Instruct & 25.71\% & 24.87\% & 0.85pp \\\addlinespace[3pt]
gemini-2.5-flash & 84.97\% & 76.19\% & 8.78pp \\\addlinespace[3pt]
gemini-2.5-flash-lite & 63.49\% & 54.91\% & 8.59pp \\\addlinespace[3pt]
gemma-3-27b-it & 76.63\% & 67.17\% & 9.46pp \\\addlinespace[3pt]
gemma-3-12b-it & 71.19\% & 62.52\% & 8.68pp \\\addlinespace[3pt]
gemma-3-4b-it & 57.25\% & 51.56\% & 5.69pp \\\addlinespace[3pt]
gemm

## SQuAD

In [5]:
import statistics

from haystack.components.evaluators import SASEvaluator
from haystack.utils import ComponentDevice
from datasets import load_dataset
import evaluate
from tqdm.notebook import tqdm


# Identify changed rows
squad_data = "../../data/squad"
original_data_df = load_dataset("rajpurkar/squad", split="validation").shuffle(seed=77).select(range(1000)).to_pandas()
with open(os.path.join(squad_data, "lexical", "llm_synonym_perturbation.json")) as f:
    lexical_data = json.load(f)
lexical_data_df = pd.DataFrame(lexical_data["data"])

changed_rows = []
for index, row in lexical_data_df.iterrows():
    if (row["question"] != original_data_df.iloc[index]["question"]) or (row["context"] != original_data_df.iloc[index]["context"]):
        changed_rows.append(index)

# Initialize evaluators
squad_evaluator = evaluate.load("squad")
sas_evaluator = SASEvaluator(device=ComponentDevice.from_str("mps"))
sas_evaluator.warm_up()


# Extract results for each model
model_names = []
original_em_scores = []
original_f1_scores = []
original_sas_scores = []
lexical_em_scores = []
lexical_f1_scores = []
lexical_sas_scores = []
difference_em = []
difference_f1 = []
difference_sas = []
for model in tqdm(os.listdir(results_dir), desc="Processing models"):
    if os.path.isdir(os.path.join(results_dir, model)):
        with open(os.path.join(results_dir, model, "squad", "original.json")) as f:
            original_results = json.load(f)
        original_results_df = pd.DataFrame(original_results["predictions"]).iloc[changed_rows]
        model_names.append(original_results["model"])
        
        em_scores = []
        f1_scores = []
        sas_scores = []
        for idx, row in tqdm(original_results_df.iterrows(), desc="Processing original rows", total=len(original_results_df)):
            squad_eval = squad_evaluator.compute(
                predictions=[{"id": str(idx), "prediction_text": row["prediction"]}],
                references=[{"id": str(idx), "answers": {"text": row["answers"], "answer_start": [0] * len(row["answers"])}}]
            )
            sas_eval = sas_evaluator.run(row["answers"], [row["prediction"]] * len(row["answers"]))
            em_scores.append(squad_eval["exact_match"])
            f1_scores.append(squad_eval["f1"])
            sas_scores.append(max(sas_eval["individual_scores"]))

        original_em_score = statistics.mean(em_scores)
        original_f1_score = statistics.mean(f1_scores)
        original_sas_score = statistics.mean(sas_scores)
        original_em_scores.append(original_em_score)
        original_f1_scores.append(original_f1_score)
        original_sas_scores.append(original_sas_score)

        with open(os.path.join(results_dir, model, "squad", "lexical", "llm_synonym.json")) as f:
            lexical_results = json.load(f)
        lexical_results_df = pd.DataFrame(lexical_results["predictions"]).iloc[changed_rows]
        
        em_scores = []
        f1_scores = []
        sas_scores = []
        for idx, row in tqdm(lexical_results_df.iterrows(), desc="Processing lexical rows", total=len(lexical_results_df)):
            squad_eval = squad_evaluator.compute(
                predictions=[{"id": str(idx), "prediction_text": row["prediction"]}],
                references=[{"id": str(idx), "answers": {"text": row["answers"], "answer_start": [0] * len(row["answers"])}}]
            )
            sas_eval = sas_evaluator.run(row["answers"], [row["prediction"]] * len(row["answers"]))
            em_scores.append(squad_eval["exact_match"])
            f1_scores.append(squad_eval["f1"])
            sas_scores.append(max(sas_eval["individual_scores"]))

        lexical_em_score = statistics.mean(em_scores)
        lexical_f1_score = statistics.mean(f1_scores)
        lexical_sas_score = statistics.mean(sas_scores)
        lexical_em_scores.append(lexical_em_score)
        lexical_f1_scores.append(lexical_f1_score)
        lexical_sas_scores.append(lexical_sas_score)

        difference_em.append(original_em_score - lexical_em_score)
        difference_f1.append(original_f1_score - lexical_f1_score)
        difference_sas.append(original_sas_score - lexical_sas_score)

squad_df_lexical = pd.DataFrame({
    "model": model_names,
    "original_em": original_em_scores,
    "original_f1": original_f1_scores,
    "original_sas": original_sas_scores,
    "lexical_em": lexical_em_scores,
    "lexical_f1": lexical_f1_scores,
    "lexical_sas": lexical_sas_scores,
    "difference_em": difference_em,
    "difference_f1": difference_f1,
    "difference_sas": difference_sas
})
squad_df_lexical["model"] = squad_df_lexical["model"].map(id_to_model)
squad_df_lexical = squad_df_lexical.set_index("model").reindex(model_order).reset_index()
squad_df_lexical["rank_original_em"] = squad_df_lexical["original_em"].rank(method="min", ascending=False)
squad_df_lexical["rank_original_f1"] = squad_df_lexical["original_f1"].rank(method="min", ascending=False)
squad_df_lexical["rank_original_sas"] = squad_df_lexical["original_sas"].rank(method="min", ascending=False)
squad_df_lexical["rank_lexical_em"] = squad_df_lexical["lexical_em"].rank(method="min", ascending=False)
squad_df_lexical["rank_lexical_f1"] = squad_df_lexical["lexical_f1"].rank(method="min", ascending=False)
squad_df_lexical["rank_lexical_sas"] = squad_df_lexical["lexical_sas"].rank(method="min", ascending=False)
squad_df_lexical.to_csv("../../data/result_tables/squad_lexical.csv")


Processing models:   0%|          | 0/24 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing lexical rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing lexical rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing lexical rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing lexical rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing lexical rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing lexical rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing lexical rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing lexical rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing lexical rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing lexical rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing lexical rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing lexical rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing lexical rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing lexical rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing lexical rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing lexical rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing lexical rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing lexical rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing lexical rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing lexical rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing lexical rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing lexical rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing lexical rows:   0%|          | 0/971 [00:00<?, ?it/s]

In [6]:
for idx, row in squad_df_lexical.iterrows():
    print(f"{row['model']} & {row['original_em']:.2f}\\% & {row['lexical_em']:.2f}\\% & {row['difference_em']:.2f}pp & {row['original_f1']:.2f}\\% & {row['lexical_f1']:.2f}\\% & {row['difference_f1']:.2f}pp & {row['original_sas']*100:.2f}\\% & {row['lexical_sas']*100:.2f}\\% & {row['difference_sas']*100:.2f}pp\\\\n\\addlinespace[3pt]")

GPT-5-Nano & 66.43\% & 62.92\% & 3.50pp & 83.67\% & 80.11\% & 3.55pp & 90.98\% & 88.81\% & 2.17pp\\n\addlinespace[3pt]
GPT-5-mini & 75.08\% & 71.06\% & 4.02pp & 89.44\% & 85.99\% & 3.45pp & 93.68\% & 92.07\% & 1.61pp\\n\addlinespace[3pt]
GPT-4.1-Nano & 76.73\% & 71.47\% & 5.25pp & 89.56\% & 85.60\% & 3.96pp & 93.98\% & 91.91\% & 2.07pp\\n\addlinespace[3pt]
GPT-4.1-mini & 77.03\% & 72.50\% & 4.53pp & 90.59\% & 86.85\% & 3.74pp & 94.36\% & 92.51\% & 1.85pp\\n\addlinespace[3pt]
GPT-OSS-120b & 71.78\% & 67.35\% & 4.43pp & 87.18\% & 83.73\% & 3.44pp & 93.37\% & 91.48\% & 1.89pp\\n\addlinespace[3pt]
GPT-OSS-20b & 70.75\% & 65.91\% & 4.84pp & 87.09\% & 83.05\% & 4.04pp & 92.36\% & 90.54\% & 1.83pp\\n\addlinespace[3pt]
Llama-3.3-70B-Instruct & 82.18\% & 77.14\% & 5.05pp & 92.50\% & 88.96\% & 3.53pp & 95.86\% & 94.00\% & 1.86pp\\n\addlinespace[3pt]
Llama-3.1-8B-Instruct & 72.19\% & 66.94\% & 5.25pp & 86.51\% & 83.16\% & 3.35pp & 91.95\% & 90.36\% & 1.60pp\\n\addlinespace[3pt]
Llama-3.2-3B-Instr

## AMEGA

In [8]:
# Calculate AMEGA score based on weighted criteria
amega_criteria_df = pd.read_csv("../../AMEGA-benchmark/data/criteria.csv", sep=";", decimal=",")
amega_criteria_df["criteria_score_possible"] = amega_criteria_df["criteria_score_possible"].astype(float)

for item in os.listdir(results_dir):
    current_dir = os.path.join(results_dir, item)
    if os.path.isdir(current_dir):
        original_amega_file = os.path.join(current_dir, "amega/original_v3.json")
        syntactic_variation_file = os.path.join(current_dir, "amega/syntactic/syntactic_v3.json")
        lexical_variation_file = os.path.join(current_dir, "amega/lexical/llm_synonym_v3.json")
        
        update_amega_metrics(original_amega_file, amega_criteria_df)
        update_amega_metrics(syntactic_variation_file, amega_criteria_df)
        update_amega_metrics(lexical_variation_file, amega_criteria_df)

In [9]:
# Extract results for each model
model_names = []
amega_scores_original = []
amega_scores_lexical = []
score_differences = []
for model in os.listdir(results_dir):
    if os.path.isdir(os.path.join(results_dir, model)):
        with open(os.path.join(results_dir, model, "amega", "original_v3.json")) as f:
            original_amega_data = json.load(f)
        with open(os.path.join(results_dir, model, "amega", "lexical", "llm_synonym_v3.json")) as f:
            lexical_amega_data = json.load(f)

        model_names.append(original_amega_data["model"])
        original_score = original_amega_data["metrics"]["mean_score"]
        lexical_score = lexical_amega_data["metrics"]["mean_score"]
        amega_scores_original.append(original_score)
        amega_scores_lexical.append(lexical_score)
        score_differences.append(original_score - lexical_score)

amega_df_lexical = pd.DataFrame({
    "model": model_names,
    "original": amega_scores_original,
    "lexical": amega_scores_lexical,
    "score_difference": score_differences
})
amega_df_lexical["model"] = amega_df_lexical["model"].map(id_to_model)
amega_df_lexical = amega_df_lexical.set_index("model").reindex(model_order).reset_index()
amega_df_lexical["rank_original"] = amega_df_lexical["original"].rank(method="min", ascending=False)
amega_df_lexical["rank_lexical"] = amega_df_lexical["lexical"].rank(method="min", ascending=False)
amega_df_lexical.to_csv("../../data/result_tables/amega_lexical.csv")



In [10]:
for idx, row in amega_df_lexical.iterrows():
    print(f"{row['model']} & {row['original']:.2f} & {row['lexical']:.2f} & {row['score_difference']:.2f} \\\\n\\addlinespace[3pt]")

GPT-5-Nano & 37.44 & 35.51 & 1.93 \\n\addlinespace[3pt]
GPT-5-mini & 39.64 & 37.49 & 2.14 \\n\addlinespace[3pt]
GPT-4.1-Nano & 34.12 & 33.41 & 0.72 \\n\addlinespace[3pt]
GPT-4.1-mini & 35.99 & 35.65 & 0.34 \\n\addlinespace[3pt]
GPT-OSS-120b & 39.83 & 39.36 & 0.47 \\n\addlinespace[3pt]
GPT-OSS-20b & 37.74 & 36.07 & 1.68 \\n\addlinespace[3pt]
Llama-3.3-70B-Instruct & 32.70 & 32.21 & 0.49 \\n\addlinespace[3pt]
Llama-3.1-8B-Instruct & 29.80 & 28.35 & 1.45 \\n\addlinespace[3pt]
Llama-3.2-3B-Instruct & 26.58 & 25.28 & 1.30 \\n\addlinespace[3pt]
Llama-3.2-1B-Instruct & 22.19 & 19.71 & 2.48 \\n\addlinespace[3pt]
gemini-2.5-flash & 38.05 & 37.94 & 0.11 \\n\addlinespace[3pt]
gemini-2.5-flash-lite & 36.04 & 34.96 & 1.07 \\n\addlinespace[3pt]
gemma-3-27b-it & 35.40 & 34.20 & 1.20 \\n\addlinespace[3pt]
gemma-3-12b-it & 35.14 & 34.41 & 0.73 \\n\addlinespace[3pt]
gemma-3-4b-it & 32.72 & 31.51 & 1.21 \\n\addlinespace[3pt]
gemma-3-1b-it & 26.89 & 26.17 & 0.72 \\n\addlinespace[3pt]
gemma-3-270m-it & 15.

# Syntactic Perturbations
## MMLU

In [11]:
# Identify changed rows
mmlu_data_dir = "../../data/mmlu"

with open(os.path.join(mmlu_data_dir, "original.json")) as f:
    original_data = json.load(f)
original_data_df = pd.DataFrame(original_data["data"])

with open(os.path.join(mmlu_data_dir, "syntactic", "syntactic_perturbation.json")) as f:
    syntactic_data = json.load(f)
syntactic_data_df = pd.DataFrame(syntactic_data["data"])

changed_rows = []
for index, row in syntactic_data_df.iterrows():
    if row["question"] != original_data_df.iloc[index]["question"] or row["choices"] != original_data_df.iloc[index]["choices"]:
        changed_rows.append(index)


# Extract results for each model
results_dir = "../../results"

model_names = []
original_scores = []
syntactic_scores = []
score_differences = []
for model_dir in os.listdir(results_dir):
    if os.path.isdir(os.path.join(results_dir, model_dir)):
        with open(os.path.join(results_dir, model_dir, "mmlu", "original.json")) as f:
            original_results = json.load(f)
        model_names.append(original_results["model"])

        original_results_df = pd.DataFrame(original_results["predictions"]).iloc[changed_rows]
        original_results_df["correct"] = original_results_df["prediction"] == original_results_df["answer"]
        original_score = original_results_df["correct"].mean()
        original_scores.append(original_score)

        with open(os.path.join(results_dir, model_dir, "mmlu", "syntactic", "syntactic.json")) as f:
            syntactic_results = json.load(f)
        syntactic_results_df = pd.DataFrame(syntactic_results["predictions"]).iloc[changed_rows]
        syntactic_results_df["correct"] = syntactic_results_df["prediction"] == syntactic_results_df["answer"]
        syntactic_score = syntactic_results_df["correct"].mean()
        syntactic_scores.append(syntactic_score)

        score_differences.append(original_score - syntactic_score)

mmlu_df_syntactic = pd.DataFrame({
    "model": model_names,
    "original": original_scores,
    "syntactic": syntactic_scores,
    "score_difference": score_differences
})
mmlu_df_syntactic["model"] = mmlu_df_syntactic["model"].map(id_to_model)
mmlu_df_syntactic = mmlu_df_syntactic.set_index("model").reindex(model_order).reset_index()
mmlu_df_syntactic["rank_original"] = mmlu_df_syntactic["original"].rank(method="min", ascending=False)
mmlu_df_syntactic["rank_syntactic"] = mmlu_df_syntactic["syntactic"].rank(method="min", ascending=False)
mmlu_df_syntactic.to_csv("../../data/result_tables/mmlu_syntactic.csv")

In [12]:
for idx, row in mmlu_df_syntactic.iterrows():
    print(f"{row['model']} & {row['original']*100:.2f}\\% & {row['syntactic']*100:.2f}\\% & {row['score_difference']*100:.2f}pp \\\\n\\addlinespace[3pt]")

GPT-5-Nano & 65.41\% & 63.20\% & 2.21pp \\n\addlinespace[3pt]
GPT-5-mini & 77.25\% & 75.41\% & 1.84pp \\n\addlinespace[3pt]
GPT-4.1-Nano & 65.59\% & 63.61\% & 1.98pp \\n\addlinespace[3pt]
GPT-4.1-mini & 78.04\% & 76.42\% & 1.61pp \\n\addlinespace[3pt]
GPT-OSS-120b & 82.90\% & 80.52\% & 2.39pp \\n\addlinespace[3pt]
GPT-OSS-20b & 77.75\% & 75.43\% & 2.32pp \\n\addlinespace[3pt]
Llama-3.3-70B-Instruct & 79.69\% & 78.08\% & 1.61pp \\n\addlinespace[3pt]
Llama-3.1-8B-Instruct & 60.68\% & 58.35\% & 2.33pp \\n\addlinespace[3pt]
Llama-3.2-3B-Instruct & 56.26\% & 54.28\% & 1.98pp \\n\addlinespace[3pt]
Llama-3.2-1B-Instruct & 25.60\% & 25.49\% & 0.11pp \\n\addlinespace[3pt]
gemini-2.5-flash & 82.97\% & 80.84\% & 2.13pp \\n\addlinespace[3pt]
gemini-2.5-flash-lite & 61.52\% & 61.24\% & 0.28pp \\n\addlinespace[3pt]
gemma-3-27b-it & 73.41\% & 71.87\% & 1.55pp \\n\addlinespace[3pt]
gemma-3-12b-it & 67.76\% & 65.39\% & 2.37pp \\n\addlinespace[3pt]
gemma-3-4b-it & 53.09\% & 51.84\% & 1.25pp \\n\addlines

## SQuAD

In [13]:
# Identify changed rows
squad_data = "../../data/squad"
original_data_df = load_dataset("rajpurkar/squad", split="validation").shuffle(seed=77).select(range(1000)).to_pandas()
with open(os.path.join(squad_data, "lexical", "llm_synonym_perturbation.json")) as f:
    lexical_data = json.load(f)
lexical_data_df = pd.DataFrame(lexical_data["data"])

changed_rows = []
for index, row in lexical_data_df.iterrows():
    if (row["question"] != original_data_df.iloc[index]["question"]) or (row["context"] != original_data_df.iloc[index]["context"]):
        changed_rows.append(index)

# Initialize evaluators
squad_evaluator = evaluate.load("squad")
sas_evaluator = SASEvaluator(device=ComponentDevice.from_str("mps"))
sas_evaluator.warm_up()


# Extract results for each model
model_names = []
original_em_scores = []
original_f1_scores = []
original_sas_scores = []
syntactic_em_scores = []
syntactic_f1_scores = []
syntactic_sas_scores = []
difference_em = []
difference_f1 = []
difference_sas = []
for model in tqdm(os.listdir(results_dir), desc="Processing models"):
    if os.path.isdir(os.path.join(results_dir, model)):
        with open(os.path.join(results_dir, model, "squad", "original.json")) as f:
            original_results = json.load(f)
        original_results_df = pd.DataFrame(original_results["predictions"]).iloc[changed_rows]
        model_names.append(original_results["model"])
        
        em_scores = []
        f1_scores = []
        sas_scores = []
        for idx, row in tqdm(original_results_df.iterrows(), desc="Processing original rows", total=len(original_results_df)):
            squad_eval = squad_evaluator.compute(
                predictions=[{"id": str(idx), "prediction_text": row["prediction"]}],
                references=[{"id": str(idx), "answers": {"text": row["answers"], "answer_start": [0] * len(row["answers"])}}]
            )
            sas_eval = sas_evaluator.run(row["answers"], [row["prediction"]] * len(row["answers"]))
            em_scores.append(squad_eval["exact_match"])
            f1_scores.append(squad_eval["f1"])
            sas_scores.append(max(sas_eval["individual_scores"]))

        original_em_score = statistics.mean(em_scores)
        original_f1_score = statistics.mean(f1_scores)
        original_sas_score = statistics.mean(sas_scores)
        original_em_scores.append(original_em_score)
        original_f1_scores.append(original_f1_score)
        original_sas_scores.append(original_sas_score)

        with open(os.path.join(results_dir, model, "squad", "syntactic", "syntactic.json")) as f:
            syntactic_results = json.load(f)
        syntactic_results_df = pd.DataFrame(syntactic_results["predictions"]).iloc[changed_rows]
        
        em_scores = []
        f1_scores = []
        sas_scores = []
        for idx, row in tqdm(syntactic_results_df.iterrows(), desc="Processing syntactic rows", total=len(syntactic_results_df)):
            squad_eval = squad_evaluator.compute(
                predictions=[{"id": str(idx), "prediction_text": row["prediction"]}],
                references=[{"id": str(idx), "answers": {"text": row["answers"], "answer_start": [0] * len(row["answers"])}}]
            )
            sas_eval = sas_evaluator.run(row["answers"], [row["prediction"]] * len(row["answers"]))
            em_scores.append(squad_eval["exact_match"])
            f1_scores.append(squad_eval["f1"])
            sas_scores.append(max(sas_eval["individual_scores"]))

        syntactic_em_score = statistics.mean(em_scores)
        syntactic_f1_score = statistics.mean(f1_scores)
        syntactic_sas_score = statistics.mean(sas_scores)
        syntactic_em_scores.append(syntactic_em_score)
        syntactic_f1_scores.append(syntactic_f1_score)
        syntactic_sas_scores.append(syntactic_sas_score)

        difference_em.append(original_em_score - syntactic_em_score)
        difference_f1.append(original_f1_score - syntactic_f1_score)
        difference_sas.append(original_sas_score - syntactic_sas_score)

squad_df_syntactic = pd.DataFrame({
    "model": model_names,
    "original_em": original_em_scores,
    "original_f1": original_f1_scores,
    "original_sas": original_sas_scores,
    "syntactic_em": syntactic_em_scores,
    "syntactic_f1": syntactic_f1_scores,
    "syntactic_sas": syntactic_sas_scores,
    "difference_em": difference_em,
    "difference_f1": difference_f1,
    "difference_sas": difference_sas
})
squad_df_syntactic["model"] = squad_df_syntactic["model"].map(id_to_model)
squad_df_syntactic = squad_df_syntactic.set_index("model").reindex(model_order).reset_index()
squad_df_syntactic["rank_original_em"] = squad_df_syntactic["original_em"].rank(method="min", ascending=False)
squad_df_syntactic["rank_original_f1"] = squad_df_syntactic["original_f1"].rank(method="min", ascending=False)
squad_df_syntactic["rank_original_sas"] = squad_df_syntactic["original_sas"].rank(method="min", ascending=False)
squad_df_syntactic["rank_syntactic_em"] = squad_df_syntactic["syntactic_em"].rank(method="min", ascending=False)
squad_df_syntactic["rank_syntactic_f1"] = squad_df_syntactic["syntactic_f1"].rank(method="min", ascending=False)
squad_df_syntactic["rank_syntactic_sas"] = squad_df_syntactic["syntactic_sas"].rank(method="min", ascending=False)
squad_df_syntactic.to_csv("../../data/result_tables/squad_syntactic.csv")


Processing models:   0%|          | 0/24 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing syntactic rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing syntactic rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing syntactic rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing syntactic rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing syntactic rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing syntactic rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing syntactic rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing syntactic rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing syntactic rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing syntactic rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing syntactic rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing syntactic rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing syntactic rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing syntactic rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing syntactic rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing syntactic rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing syntactic rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing syntactic rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing syntactic rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing syntactic rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing syntactic rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing syntactic rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing original rows:   0%|          | 0/971 [00:00<?, ?it/s]

Processing syntactic rows:   0%|          | 0/971 [00:00<?, ?it/s]

In [14]:
for idx, row in squad_df_syntactic.iterrows():
    print(f"{row['model']} & {row['original_em']:.2f}\\% & {row['syntactic_em']:.2f}\\% & {row['difference_em']:.2f}pp & {row['original_f1']:.2f}\\% & {row['syntactic_f1']:.2f}\\% & {row['difference_f1']:.2f}pp & {row['original_sas']*100:.2f}\\% & {row['syntactic_sas']*100:.2f}\\% & {row['difference_sas']*100:.2f}pp\\\\n\\addlinespace[3pt]")

GPT-5-Nano & 66.43\% & 63.23\% & 3.19pp & 83.67\% & 81.06\% & 2.61pp & 90.98\% & 89.27\% & 1.71pp\\n\addlinespace[3pt]
GPT-5-mini & 75.08\% & 72.09\% & 2.99pp & 89.44\% & 87.00\% & 2.44pp & 93.68\% & 92.30\% & 1.38pp\\n\addlinespace[3pt]
GPT-4.1-Nano & 76.73\% & 73.22\% & 3.50pp & 89.56\% & 86.53\% & 3.03pp & 93.98\% & 92.51\% & 1.47pp\\n\addlinespace[3pt]
GPT-4.1-mini & 77.03\% & 75.08\% & 1.96pp & 90.59\% & 88.32\% & 2.28pp & 94.36\% & 93.11\% & 1.25pp\\n\addlinespace[3pt]
GPT-OSS-120b & 71.78\% & 70.03\% & 1.75pp & 87.18\% & 85.20\% & 1.98pp & 93.37\% & 92.44\% & 0.93pp\\n\addlinespace[3pt]
GPT-OSS-20b & 70.75\% & 68.90\% & 1.85pp & 87.09\% & 85.11\% & 1.98pp & 92.36\% & 92.05\% & 0.31pp\\n\addlinespace[3pt]
Llama-3.3-70B-Instruct & 82.18\% & 79.20\% & 2.99pp & 92.50\% & 90.23\% & 2.26pp & 95.86\% & 94.43\% & 1.42pp\\n\addlinespace[3pt]
Llama-3.1-8B-Instruct & 72.19\% & 69.62\% & 2.57pp & 86.51\% & 84.45\% & 2.06pp & 91.95\% & 90.70\% & 1.25pp\\n\addlinespace[3pt]
Llama-3.2-3B-Instr

## AMEGA

In [15]:
# Extract results for each model
model_names = []
amega_scores_original = []
amega_scores_syntactic = []
score_differences = []
for model in os.listdir(results_dir):
    if os.path.isdir(os.path.join(results_dir, model)):
        with open(os.path.join(results_dir, model, "amega", "original_v3.json")) as f:
            original_amega_data = json.load(f)
        with open(os.path.join(results_dir, model, "amega", "syntactic", "syntactic_v3.json")) as f:
            syntactic_amega_data = json.load(f)

        model_names.append(original_amega_data["model"])
        original_score = original_amega_data["metrics"]["mean_score"]
        syntactic_score = syntactic_amega_data["metrics"]["mean_score"]
        amega_scores_original.append(original_score)
        amega_scores_syntactic.append(syntactic_score)
        score_differences.append(original_score - syntactic_score)

amega_df_syntactic = pd.DataFrame({
    "model": model_names,
    "original": amega_scores_original,
    "syntactic": amega_scores_syntactic,
    "score_difference": score_differences
})
amega_df_syntactic["model"] = amega_df_syntactic["model"].map(id_to_model)
amega_df_syntactic = amega_df_syntactic.set_index("model").reindex(model_order).reset_index()
amega_df_syntactic["rank_original"] = amega_df_syntactic["original"].rank(method="min", ascending=False)
amega_df_syntactic["rank_syntactic"] = amega_df_syntactic["syntactic"].rank(method="min", ascending=False)
amega_df_syntactic.to_csv("../../data/result_tables/amega_syntactic.csv")



In [16]:
for idx, row in amega_df_syntactic.iterrows():
    print(f"{row['model']} & {row['original']:.2f} & {row['syntactic']:.2f} & {row['score_difference']:.2f} \\\\n\\addlinespace[3pt]")

GPT-5-Nano & 37.44 & 36.83 & 0.61 \\n\addlinespace[3pt]
GPT-5-mini & 39.64 & 38.08 & 1.55 \\n\addlinespace[3pt]
GPT-4.1-Nano & 34.12 & 33.54 & 0.59 \\n\addlinespace[3pt]
GPT-4.1-mini & 35.99 & 35.07 & 0.93 \\n\addlinespace[3pt]
GPT-OSS-120b & 39.83 & 39.17 & 0.66 \\n\addlinespace[3pt]
GPT-OSS-20b & 37.74 & 37.72 & 0.03 \\n\addlinespace[3pt]
Llama-3.3-70B-Instruct & 32.70 & 32.31 & 0.39 \\n\addlinespace[3pt]
Llama-3.1-8B-Instruct & 29.80 & 29.80 & -0.00 \\n\addlinespace[3pt]
Llama-3.2-3B-Instruct & 26.58 & 27.35 & -0.77 \\n\addlinespace[3pt]
Llama-3.2-1B-Instruct & 22.19 & 21.83 & 0.36 \\n\addlinespace[3pt]
gemini-2.5-flash & 38.05 & 36.92 & 1.13 \\n\addlinespace[3pt]
gemini-2.5-flash-lite & 36.04 & 34.49 & 1.55 \\n\addlinespace[3pt]
gemma-3-27b-it & 35.40 & 35.46 & -0.06 \\n\addlinespace[3pt]
gemma-3-12b-it & 35.14 & 35.84 & -0.70 \\n\addlinespace[3pt]
gemma-3-4b-it & 32.72 & 32.15 & 0.58 \\n\addlinespace[3pt]
gemma-3-1b-it & 26.89 & 26.02 & 0.87 \\n\addlinespace[3pt]
gemma-3-270m-it &