In [1]:
import pandas as pd
import numpy as np
from scipy.stats import kendalltau

def test_rank_equivalence_kendall(original_scores, perturbed_scores, correlation_threshold, alpha=0.05, n_bootstraps=5000, random_state=77):
    """
    Tests for rank equivalence by checking if the Kendall's Tau correlation is
    significantly higher than a specified threshold.

    This is achieved by calculating a bootstrap confidence interval for Tau.

    :param original_scores: The first set of scores to be ranked (list or pd.Series)
    :param perturbed_scores: The second set of scores to be ranked (paired with the first) (list or pd.Series)
    :param correlation_threshold: The minimum correlation to be considered
                                  equivalent (e.g., 0.9).
    :param alpha: The significance level. Defaults to 0.05.
    :param n_bootstraps: The number of bootstrap samples. Defaults to 5000.
    :param random_state: Seed for reproducibility. Defaults to 42.
    """
    # Set seed for reproducible results
    rng = np.random.default_rng(random_state)
    
    # Combine ranks into a single DataFrame to bootstrap pairs
    data = pd.DataFrame({"original_scores": original_scores, "perturbed_scores": perturbed_scores})
    
    # Calculate the observed Kendall's Tau
    observed_tau, _ = kendalltau(
        data["original_scores"].rank(method="min", ascending=False),
        data["perturbed_scores"].rank(method="min", ascending=False)
        )

    # --- Bootstrap Procedure ---
    bootstrap_taus = []
    for _ in range(n_bootstraps):
        # Sample pairs with replacement
        sample = data.sample(n=len(data), replace=True, random_state=rng)
        
        # Calculate tau for the bootstrap sample
        tau, _ = kendalltau(
            sample["original_scores"].rank(method="min", ascending=False),
            sample["perturbed_scores"].rank(method="min", ascending=False)
        )
        bootstrap_taus.append(tau)

    # --- Calculate Confidence Interval ---
    # For a one-sided test (is Tau > threshold?), we check the lower bound
    # of the confidence interval.
    lower_bound = np.percentile(bootstrap_taus, 100 * alpha)
    upper_bound = np.percentile(bootstrap_taus, 100 * (1 - alpha))

    # --- Determine Equivalence ---
    is_equivalent = lower_bound > correlation_threshold

    return {
        "observed_tau": observed_tau,
        "correlation_threshold": correlation_threshold,
        "alpha": alpha,
        "ci_lower_bound": lower_bound,
        "ci_upper_bound": upper_bound,
        "is_equivalent": is_equivalent,
    }

In [2]:
from scipy.stats import wilcoxon

def test_score_difference_wilcoxon(original_scores, perturbed_scores, alpha=0.05, alternative='two-sided'):
    """
    Tests for a significant difference between two paired sets of scores using the
    standard Wilcoxon signed-rank test.

    The null hypothesis (H₀) is that the median of the differences between the
    paired scores is zero.

    :param original_scores: The first set of scores (list or pd.Series).
    :param perturbed_scores: The second set of scores (paired with the first).
    :param alpha: The significance level. Defaults to 0.05.
    :param alternative: Defines the alternative hypothesis.
                        'two-sided': the distribution of the differences is not symmetric about zero.
                        'less': the distribution of the differences is stochastically less than a distribution symmetric about zero.
                        'greater': the distribution of the differences is stochastically greater than a distribution symmetric about zero.
    :return: A dictionary containing the test statistic, p-value, and conclusion.
    """
    # --- Perform the Wilcoxon signed-rank test ---
    # The test works directly on the two sets of paired data. It calculates the
    # differences internally and then ranks them.
    statistic, p_value = wilcoxon(original_scores, perturbed_scores, alternative=alternative)

    # --- Determine if the difference is statistically significant ---
    is_different = p_value < alpha

    return {
        "statistic": statistic,
        "p_value": p_value,
        "alpha": alpha,
        "is_different": is_different,
    }

# Lexical Perturbations
## MMLU
### Strict Agreement

In [3]:
# Load the MMLU lexical-perturbation result table.
mmlu_lexical_results_df = pd.read_csv("../../data/result_tables/mmlu_lexical.csv")

# Rank-equivalence check at the strict threshold (τ > 0.9).
results_kendall = test_rank_equivalence_kendall(
    original_scores=mmlu_lexical_results_df["original"],
    perturbed_scores=mmlu_lexical_results_df["lexical"],
    correlation_threshold=0.9,
)

# One print with sep="\n" emits exactly one line per argument.
print(
    "--- Rank Equivalence Test using Kendall's Tau ---",
    f"Equivalence Threshold (τ >): {results_kendall['correlation_threshold']}",
    f"Significance Level (α): {results_kendall['alpha']}",
    "-" * 50,
    f"Observed Kendall's Tau: {results_kendall['observed_tau']:.4f}",
    f"95% Confidence Interval's Lower Bound: {results_kendall['ci_lower_bound']:.4f}",
    f"95% Confidence Interval's Upper Bound: {results_kendall['ci_upper_bound']:.4f}",
    "-" * 50,
    sep="\n",
)

if not results_kendall['is_equivalent']:
    print(
        "❌ Conclusion: The ranks ARE NOT statistically equivalent.",
        f"   (The CI lower bound {results_kendall['ci_lower_bound']:.4f} is not greater than the threshold {results_kendall['correlation_threshold']})",
        sep="\n",
    )
else:
    print(
        "✅ Conclusion: The ranks ARE statistically equivalent.",
        f"   (The CI lower bound {results_kendall['ci_lower_bound']:.4f} is greater than the threshold {results_kendall['correlation_threshold']})",
        sep="\n",
    )

# One-sided paired test on the raw scores (alternative='greater') at α = 0.01.
results_wilcoxon = test_score_difference_wilcoxon(
    original_scores=mmlu_lexical_results_df["original"],
    perturbed_scores=mmlu_lexical_results_df["lexical"],
    alpha=0.01,
    alternative='greater',
)

# The empty string reproduces the blank separator line.
print(
    "-" * 50,
    "",
    "--- Score Difference Test using Wilcoxon Signed-Rank Test ---",
    f"Significance Level (α): {results_wilcoxon['alpha']}",
    "-" * 50,
    f"Observed Wilcoxon Statistic: {results_wilcoxon['statistic']:.4f}",
    f"p-value: {results_wilcoxon['p_value']:.4f}",
    sep="\n",
)

if not results_wilcoxon['is_different']:
    print("❌ Conclusion: The scores ARE NOT statistically different.")
else:
    print("✅ Conclusion: The scores ARE statistically different.")




--- Rank Equivalence Test using Kendall's Tau ---
Equivalence Threshold (τ >): 0.9
Significance Level (α): 0.05
--------------------------------------------------
Observed Kendall's Tau: 0.9763
95% Confidence Interval's Lower Bound: 0.9339
95% Confidence Interval's Upper Bound: 1.0000
--------------------------------------------------
✅ Conclusion: The ranks ARE statistically equivalent.
   (The CI lower bound 0.9339 is greater than the threshold 0.9)
--------------------------------------------------

--- Score Difference Test using Wilcoxon Signed-Rank Test ---
Significance Level (α): 0.01
--------------------------------------------------
Observed Wilcoxon Statistic: 275.0000
p-value: 0.0000
✅ Conclusion: The scores ARE statistically different.


### Moderate Agreement

In [4]:
# Rank-equivalence check at the moderate threshold (τ > 0.8).
results_kendall = test_rank_equivalence_kendall(
    original_scores=mmlu_lexical_results_df["original"],
    perturbed_scores=mmlu_lexical_results_df["lexical"],
    correlation_threshold=0.8,
)

# One print with sep="\n" emits exactly one line per argument.
print(
    "--- Rank Equivalence Test using Kendall's Tau ---",
    f"Equivalence Threshold (τ >): {results_kendall['correlation_threshold']}",
    f"Significance Level (α): {results_kendall['alpha']}",
    "-" * 50,
    f"Observed Kendall's Tau: {results_kendall['observed_tau']:.4f}",
    f"95% Confidence Interval's Lower Bound: {results_kendall['ci_lower_bound']:.4f}",
    f"95% Confidence Interval's Upper Bound: {results_kendall['ci_upper_bound']:.4f}",
    "-" * 50,
    sep="\n",
)

if not results_kendall['is_equivalent']:
    print(
        "❌ Conclusion: The ranks ARE NOT statistically equivalent.",
        f"   (The CI lower bound {results_kendall['ci_lower_bound']:.4f} is not greater than the threshold {results_kendall['correlation_threshold']})",
        sep="\n",
    )
else:
    print(
        "✅ Conclusion: The ranks ARE statistically equivalent.",
        f"   (The CI lower bound {results_kendall['ci_lower_bound']:.4f} is greater than the threshold {results_kendall['correlation_threshold']})",
        sep="\n",
    )

# One-sided paired test on the raw scores (alternative='greater') at α = 0.05.
results_wilcoxon = test_score_difference_wilcoxon(
    original_scores=mmlu_lexical_results_df["original"],
    perturbed_scores=mmlu_lexical_results_df["lexical"],
    alpha=0.05,
    alternative='greater',
)

# The empty string reproduces the blank separator line.
print(
    "-" * 50,
    "",
    "--- Score Difference Test using Wilcoxon Signed-Rank Test ---",
    f"Significance Level (α): {results_wilcoxon['alpha']}",
    "-" * 50,
    f"Observed Wilcoxon Statistic: {results_wilcoxon['statistic']:.4f}",
    f"p-value: {results_wilcoxon['p_value']:.4f}",
    sep="\n",
)

if not results_wilcoxon['is_different']:
    print("❌ Conclusion: The scores ARE NOT statistically different.")
else:
    print("✅ Conclusion: The scores ARE statistically different.")

--- Rank Equivalence Test using Kendall's Tau ---
Equivalence Threshold (τ >): 0.8
Significance Level (α): 0.05
--------------------------------------------------
Observed Kendall's Tau: 0.9763
95% Confidence Interval's Lower Bound: 0.9339
95% Confidence Interval's Upper Bound: 1.0000
--------------------------------------------------
✅ Conclusion: The ranks ARE statistically equivalent.
   (The CI lower bound 0.9339 is greater than the threshold 0.8)
--------------------------------------------------

--- Score Difference Test using Wilcoxon Signed-Rank Test ---
Significance Level (α): 0.05
--------------------------------------------------
Observed Wilcoxon Statistic: 275.0000
p-value: 0.0000
✅ Conclusion: The scores ARE statistically different.


## SQuAD
### Strict Agreement

In [5]:
# Load the SQuAD lexical-perturbation result table.
squad_lexical_results_df = pd.read_csv("../../data/result_tables/squad_lexical.csv")

# Rank-equivalence check at the strict threshold (τ > 0.9).
results_kendall = test_rank_equivalence_kendall(
    original_scores=squad_lexical_results_df["original_sas"],
    perturbed_scores=squad_lexical_results_df["lexical_sas"],
    correlation_threshold=0.9,
)

# One print with sep="\n" emits exactly one line per argument.
print(
    "--- Rank Equivalence Test using Kendall's Tau ---",
    f"Equivalence Threshold (τ >): {results_kendall['correlation_threshold']}",
    f"Significance Level (α): {results_kendall['alpha']}",
    "-" * 50,
    f"Observed Kendall's Tau: {results_kendall['observed_tau']:.4f}",
    f"95% Confidence Interval's Lower Bound: {results_kendall['ci_lower_bound']:.4f}",
    f"95% Confidence Interval's Upper Bound: {results_kendall['ci_upper_bound']:.4f}",
    "-" * 50,
    sep="\n",
)

if not results_kendall['is_equivalent']:
    print(
        "❌ Conclusion: The ranks ARE NOT statistically equivalent.",
        f"   (The CI lower bound {results_kendall['ci_lower_bound']:.4f} is not greater than the threshold {results_kendall['correlation_threshold']})",
        sep="\n",
    )
else:
    print(
        "✅ Conclusion: The ranks ARE statistically equivalent.",
        f"   (The CI lower bound {results_kendall['ci_lower_bound']:.4f} is greater than the threshold {results_kendall['correlation_threshold']})",
        sep="\n",
    )

# One-sided paired test on the raw scores (alternative='greater') at α = 0.01.
results_wilcoxon = test_score_difference_wilcoxon(
    original_scores=squad_lexical_results_df["original_sas"],
    perturbed_scores=squad_lexical_results_df["lexical_sas"],
    alpha=0.01,
    alternative='greater',
)

# The empty string reproduces the blank separator line.
print(
    "-" * 50,
    "",
    "--- Score Difference Test using Wilcoxon Signed-Rank Test ---",
    f"Significance Level (α): {results_wilcoxon['alpha']}",
    "-" * 50,
    f"Observed Wilcoxon Statistic: {results_wilcoxon['statistic']:.4f}",
    f"p-value: {results_wilcoxon['p_value']:.4f}",
    sep="\n",
)

if not results_wilcoxon['is_different']:
    print("❌ Conclusion: The scores ARE NOT statistically different.")
else:
    print("✅ Conclusion: The scores ARE statistically different.")

--- Rank Equivalence Test using Kendall's Tau ---
Equivalence Threshold (τ >): 0.9
Significance Level (α): 0.05
--------------------------------------------------
Observed Kendall's Tau: 0.9289
95% Confidence Interval's Lower Bound: 0.8285
95% Confidence Interval's Upper Bound: 0.9917
--------------------------------------------------
❌ Conclusion: The ranks ARE NOT statistically equivalent.
   (The CI lower bound 0.8285 is not greater than the threshold 0.9)
--------------------------------------------------

--- Score Difference Test using Wilcoxon Signed-Rank Test ---
Significance Level (α): 0.01
--------------------------------------------------
Observed Wilcoxon Statistic: 276.0000
p-value: 0.0000
✅ Conclusion: The scores ARE statistically different.


### Moderate Agreement

In [6]:
# Rank-equivalence check at the moderate threshold (τ > 0.8).
results_kendall = test_rank_equivalence_kendall(
    original_scores=squad_lexical_results_df["original_sas"],
    perturbed_scores=squad_lexical_results_df["lexical_sas"],
    correlation_threshold=0.8,
)

# One print with sep="\n" emits exactly one line per argument.
print(
    "--- Rank Equivalence Test using Kendall's Tau ---",
    f"Equivalence Threshold (τ >): {results_kendall['correlation_threshold']}",
    f"Significance Level (α): {results_kendall['alpha']}",
    "-" * 50,
    f"Observed Kendall's Tau: {results_kendall['observed_tau']:.4f}",
    f"95% Confidence Interval's Lower Bound: {results_kendall['ci_lower_bound']:.4f}",
    f"95% Confidence Interval's Upper Bound: {results_kendall['ci_upper_bound']:.4f}",
    "-" * 50,
    sep="\n",
)

if not results_kendall['is_equivalent']:
    print(
        "❌ Conclusion: The ranks ARE NOT statistically equivalent.",
        f"   (The CI lower bound {results_kendall['ci_lower_bound']:.4f} is not greater than the threshold {results_kendall['correlation_threshold']})",
        sep="\n",
    )
else:
    print(
        "✅ Conclusion: The ranks ARE statistically equivalent.",
        f"   (The CI lower bound {results_kendall['ci_lower_bound']:.4f} is greater than the threshold {results_kendall['correlation_threshold']})",
        sep="\n",
    )

# One-sided paired test on the raw scores (alternative='greater') at α = 0.05.
results_wilcoxon = test_score_difference_wilcoxon(
    original_scores=squad_lexical_results_df["original_sas"],
    perturbed_scores=squad_lexical_results_df["lexical_sas"],
    alpha=0.05,
    alternative='greater',
)

# The empty string reproduces the blank separator line.
print(
    "-" * 50,
    "",
    "--- Score Difference Test using Wilcoxon Signed-Rank Test ---",
    f"Significance Level (α): {results_wilcoxon['alpha']}",
    "-" * 50,
    f"Observed Wilcoxon Statistic: {results_wilcoxon['statistic']:.4f}",
    f"p-value: {results_wilcoxon['p_value']:.4f}",
    sep="\n",
)

if not results_wilcoxon['is_different']:
    print("❌ Conclusion: The scores ARE NOT statistically different.")
else:
    print("✅ Conclusion: The scores ARE statistically different.")

--- Rank Equivalence Test using Kendall's Tau ---
Equivalence Threshold (τ >): 0.8
Significance Level (α): 0.05
--------------------------------------------------
Observed Kendall's Tau: 0.9289
95% Confidence Interval's Lower Bound: 0.8285
95% Confidence Interval's Upper Bound: 0.9917
--------------------------------------------------
✅ Conclusion: The ranks ARE statistically equivalent.
   (The CI lower bound 0.8285 is greater than the threshold 0.8)
--------------------------------------------------

--- Score Difference Test using Wilcoxon Signed-Rank Test ---
Significance Level (α): 0.05
--------------------------------------------------
Observed Wilcoxon Statistic: 276.0000
p-value: 0.0000
✅ Conclusion: The scores ARE statistically different.


## AMEGA
### Strict Agreement

In [7]:
# Load the AMEGA lexical-perturbation result table.
amega_lexical_results_df = pd.read_csv("../../data/result_tables/amega_lexical.csv")

# Rank-equivalence check at the strict threshold (τ > 0.9).
results_kendall = test_rank_equivalence_kendall(
    original_scores=amega_lexical_results_df["original"],
    perturbed_scores=amega_lexical_results_df["lexical"],
    correlation_threshold=0.9,
)

# One print with sep="\n" emits exactly one line per argument.
print(
    "--- Rank Equivalence Test using Kendall's Tau ---",
    f"Equivalence Threshold (τ >): {results_kendall['correlation_threshold']}",
    f"Significance Level (α): {results_kendall['alpha']}",
    "-" * 50,
    f"Observed Kendall's Tau: {results_kendall['observed_tau']:.4f}",
    f"95% Confidence Interval's Lower Bound: {results_kendall['ci_lower_bound']:.4f}",
    f"95% Confidence Interval's Upper Bound: {results_kendall['ci_upper_bound']:.4f}",
    "-" * 50,
    sep="\n",
)

if not results_kendall['is_equivalent']:
    print(
        "❌ Conclusion: The ranks ARE NOT statistically equivalent.",
        f"   (The CI lower bound {results_kendall['ci_lower_bound']:.4f} is not greater than the threshold {results_kendall['correlation_threshold']})",
        sep="\n",
    )
else:
    print(
        "✅ Conclusion: The ranks ARE statistically equivalent.",
        f"   (The CI lower bound {results_kendall['ci_lower_bound']:.4f} is greater than the threshold {results_kendall['correlation_threshold']})",
        sep="\n",
    )

# One-sided paired test on the raw scores (alternative='greater') at α = 0.01.
results_wilcoxon = test_score_difference_wilcoxon(
    original_scores=amega_lexical_results_df["original"],
    perturbed_scores=amega_lexical_results_df["lexical"],
    alpha=0.01,
    alternative='greater',
)

# The empty string reproduces the blank separator line.
print(
    "-" * 50,
    "",
    "--- Score Difference Test using Wilcoxon Signed-Rank Test ---",
    f"Significance Level (α): {results_wilcoxon['alpha']}",
    "-" * 50,
    f"Observed Wilcoxon Statistic: {results_wilcoxon['statistic']:.4f}",
    f"p-value: {results_wilcoxon['p_value']:.4f}",
    sep="\n",
)

if not results_wilcoxon['is_different']:
    print("❌ Conclusion: The scores ARE NOT statistically different.")
else:
    print("✅ Conclusion: The scores ARE statistically different.")


--- Rank Equivalence Test using Kendall's Tau ---
Equivalence Threshold (τ >): 0.9
Significance Level (α): 0.05
--------------------------------------------------
Observed Kendall's Tau: 0.8893
95% Confidence Interval's Lower Bound: 0.8033
95% Confidence Interval's Upper Bound: 0.9580
--------------------------------------------------
❌ Conclusion: The ranks ARE NOT statistically equivalent.
   (The CI lower bound 0.8033 is not greater than the threshold 0.9)
--------------------------------------------------

--- Score Difference Test using Wilcoxon Signed-Rank Test ---
Significance Level (α): 0.01
--------------------------------------------------
Observed Wilcoxon Statistic: 276.0000
p-value: 0.0000
✅ Conclusion: The scores ARE statistically different.


### Moderate Agreement

In [8]:
# Rank-equivalence check at the moderate threshold (τ > 0.8).
results_kendall = test_rank_equivalence_kendall(
    original_scores=amega_lexical_results_df["original"],
    perturbed_scores=amega_lexical_results_df["lexical"],
    correlation_threshold=0.8,
)

# One print with sep="\n" emits exactly one line per argument.
print(
    "--- Rank Equivalence Test using Kendall's Tau ---",
    f"Equivalence Threshold (τ >): {results_kendall['correlation_threshold']}",
    f"Significance Level (α): {results_kendall['alpha']}",
    "-" * 50,
    f"Observed Kendall's Tau: {results_kendall['observed_tau']:.4f}",
    f"95% Confidence Interval's Lower Bound: {results_kendall['ci_lower_bound']:.4f}",
    f"95% Confidence Interval's Upper Bound: {results_kendall['ci_upper_bound']:.4f}",
    "-" * 50,
    sep="\n",
)

if not results_kendall['is_equivalent']:
    print(
        "❌ Conclusion: The ranks ARE NOT statistically equivalent.",
        f"   (The CI lower bound {results_kendall['ci_lower_bound']:.4f} is not greater than the threshold {results_kendall['correlation_threshold']})",
        sep="\n",
    )
else:
    print(
        "✅ Conclusion: The ranks ARE statistically equivalent.",
        f"   (The CI lower bound {results_kendall['ci_lower_bound']:.4f} is greater than the threshold {results_kendall['correlation_threshold']})",
        sep="\n",
    )

# One-sided paired test on the raw scores (alternative='greater') at α = 0.05.
results_wilcoxon = test_score_difference_wilcoxon(
    original_scores=amega_lexical_results_df["original"],
    perturbed_scores=amega_lexical_results_df["lexical"],
    alpha=0.05,
    alternative='greater',
)

# The empty string reproduces the blank separator line.
print(
    "-" * 50,
    "",
    "--- Score Difference Test using Wilcoxon Signed-Rank Test ---",
    f"Significance Level (α): {results_wilcoxon['alpha']}",
    "-" * 50,
    f"Observed Wilcoxon Statistic: {results_wilcoxon['statistic']:.4f}",
    f"p-value: {results_wilcoxon['p_value']:.4f}",
    sep="\n",
)

if not results_wilcoxon['is_different']:
    print("❌ Conclusion: The scores ARE NOT statistically different.")
else:
    print("✅ Conclusion: The scores ARE statistically different.")


--- Rank Equivalence Test using Kendall's Tau ---
Equivalence Threshold (τ >): 0.8
Significance Level (α): 0.05
--------------------------------------------------
Observed Kendall's Tau: 0.8893
95% Confidence Interval's Lower Bound: 0.8033
95% Confidence Interval's Upper Bound: 0.9580
--------------------------------------------------
✅ Conclusion: The ranks ARE statistically equivalent.
   (The CI lower bound 0.8033 is greater than the threshold 0.8)
--------------------------------------------------

--- Score Difference Test using Wilcoxon Signed-Rank Test ---
Significance Level (α): 0.05
--------------------------------------------------
Observed Wilcoxon Statistic: 276.0000
p-value: 0.0000
✅ Conclusion: The scores ARE statistically different.


# Syntactic Perturbations
## MMLU
### Strict Agreement


In [9]:
# Load the MMLU syntactic-perturbation result table.
mmlu_syntactic_results_df = pd.read_csv("../../data/result_tables/mmlu_syntactic.csv")

# Rank-equivalence check at the strict threshold (τ > 0.9).
results_kendall = test_rank_equivalence_kendall(
    original_scores=mmlu_syntactic_results_df["original"],
    perturbed_scores=mmlu_syntactic_results_df["syntactic"],
    correlation_threshold=0.9,
)

# One print with sep="\n" emits exactly one line per argument.
print(
    "--- Rank Equivalence Test using Kendall's Tau ---",
    f"Equivalence Threshold (τ >): {results_kendall['correlation_threshold']}",
    f"Significance Level (α): {results_kendall['alpha']}",
    "-" * 50,
    f"Observed Kendall's Tau: {results_kendall['observed_tau']:.4f}",
    f"95% Confidence Interval's Lower Bound: {results_kendall['ci_lower_bound']:.4f}",
    f"95% Confidence Interval's Upper Bound: {results_kendall['ci_upper_bound']:.4f}",
    "-" * 50,
    sep="\n",
)

if not results_kendall['is_equivalent']:
    print(
        "❌ Conclusion: The ranks ARE NOT statistically equivalent.",
        f"   (The CI lower bound {results_kendall['ci_lower_bound']:.4f} is not greater than the threshold {results_kendall['correlation_threshold']})",
        sep="\n",
    )
else:
    print(
        "✅ Conclusion: The ranks ARE statistically equivalent.",
        f"   (The CI lower bound {results_kendall['ci_lower_bound']:.4f} is greater than the threshold {results_kendall['correlation_threshold']})",
        sep="\n",
    )

# One-sided paired test on the raw scores (alternative='greater') at α = 0.01.
results_wilcoxon = test_score_difference_wilcoxon(
    original_scores=mmlu_syntactic_results_df["original"],
    perturbed_scores=mmlu_syntactic_results_df["syntactic"],
    alpha=0.01,
    alternative='greater',
)

# The empty string reproduces the blank separator line.
print(
    "-" * 50,
    "",
    "--- Score Difference Test using Wilcoxon Signed-Rank Test ---",
    f"Significance Level (α): {results_wilcoxon['alpha']}",
    "-" * 50,
    f"Observed Wilcoxon Statistic: {results_wilcoxon['statistic']:.4f}",
    f"p-value: {results_wilcoxon['p_value']:.4f}",
    sep="\n",
)

if not results_wilcoxon['is_different']:
    print("❌ Conclusion: The scores ARE NOT statistically different.")
else:
    print("✅ Conclusion: The scores ARE statistically different.")


--- Rank Equivalence Test using Kendall's Tau ---
Equivalence Threshold (τ >): 0.9
Significance Level (α): 0.05
--------------------------------------------------
Observed Kendall's Tau: 0.9901
95% Confidence Interval's Lower Bound: 0.9654
95% Confidence Interval's Upper Bound: 1.0000
--------------------------------------------------
✅ Conclusion: The ranks ARE statistically equivalent.
   (The CI lower bound 0.9654 is greater than the threshold 0.9)
--------------------------------------------------

--- Score Difference Test using Wilcoxon Signed-Rank Test ---
Significance Level (α): 0.01
--------------------------------------------------
Observed Wilcoxon Statistic: 273.0000
p-value: 0.0000
✅ Conclusion: The scores ARE statistically different.


### Moderate Agreement


In [10]:
# Rank-equivalence check at the moderate threshold (τ > 0.8).
results_kendall = test_rank_equivalence_kendall(
    original_scores=mmlu_syntactic_results_df["original"],
    perturbed_scores=mmlu_syntactic_results_df["syntactic"],
    correlation_threshold=0.8,
)

# One print with sep="\n" emits exactly one line per argument.
print(
    "--- Rank Equivalence Test using Kendall's Tau ---",
    f"Equivalence Threshold (τ >): {results_kendall['correlation_threshold']}",
    f"Significance Level (α): {results_kendall['alpha']}",
    "-" * 50,
    f"Observed Kendall's Tau: {results_kendall['observed_tau']:.4f}",
    f"95% Confidence Interval's Lower Bound: {results_kendall['ci_lower_bound']:.4f}",
    f"95% Confidence Interval's Upper Bound: {results_kendall['ci_upper_bound']:.4f}",
    "-" * 50,
    sep="\n",
)

if not results_kendall['is_equivalent']:
    print(
        "❌ Conclusion: The ranks ARE NOT statistically equivalent.",
        f"   (The CI lower bound {results_kendall['ci_lower_bound']:.4f} is not greater than the threshold {results_kendall['correlation_threshold']})",
        sep="\n",
    )
else:
    print(
        "✅ Conclusion: The ranks ARE statistically equivalent.",
        f"   (The CI lower bound {results_kendall['ci_lower_bound']:.4f} is greater than the threshold {results_kendall['correlation_threshold']})",
        sep="\n",
    )

# One-sided paired test on the raw scores (alternative='greater') at α = 0.05.
results_wilcoxon = test_score_difference_wilcoxon(
    original_scores=mmlu_syntactic_results_df["original"],
    perturbed_scores=mmlu_syntactic_results_df["syntactic"],
    alpha=0.05,
    alternative='greater',
)

# The empty string reproduces the blank separator line.
print(
    "-" * 50,
    "",
    "--- Score Difference Test using Wilcoxon Signed-Rank Test ---",
    f"Significance Level (α): {results_wilcoxon['alpha']}",
    "-" * 50,
    f"Observed Wilcoxon Statistic: {results_wilcoxon['statistic']:.4f}",
    f"p-value: {results_wilcoxon['p_value']:.4f}",
    sep="\n",
)

if not results_wilcoxon['is_different']:
    print("❌ Conclusion: The scores ARE NOT statistically different.")
else:
    print("✅ Conclusion: The scores ARE statistically different.")


--- Rank Equivalence Test using Kendall's Tau ---
Equivalence Threshold (τ >): 0.8
Significance Level (α): 0.05
--------------------------------------------------
Observed Kendall's Tau: 0.9901
95% Confidence Interval's Lower Bound: 0.9654
95% Confidence Interval's Upper Bound: 1.0000
--------------------------------------------------
✅ Conclusion: The ranks ARE statistically equivalent.
   (The CI lower bound 0.9654 is greater than the threshold 0.8)
--------------------------------------------------

--- Score Difference Test using Wilcoxon Signed-Rank Test ---
Significance Level (α): 0.05
--------------------------------------------------
Observed Wilcoxon Statistic: 273.0000
p-value: 0.0000
✅ Conclusion: The scores ARE statistically different.


## SQuAD
### Strict Agreement


In [11]:
# Load the SQuAD syntactic-perturbation result table.
squad_syntactic_results_df = pd.read_csv("../../data/result_tables/squad_syntactic.csv")

# Rank-equivalence check at the strict threshold (τ > 0.9).
results_kendall = test_rank_equivalence_kendall(
    original_scores=squad_syntactic_results_df["original_sas"],
    perturbed_scores=squad_syntactic_results_df["syntactic_sas"],
    correlation_threshold=0.9,
)

# One print with sep="\n" emits exactly one line per argument.
print(
    "--- Rank Equivalence Test using Kendall's Tau ---",
    f"Equivalence Threshold (τ >): {results_kendall['correlation_threshold']}",
    f"Significance Level (α): {results_kendall['alpha']}",
    "-" * 50,
    f"Observed Kendall's Tau: {results_kendall['observed_tau']:.4f}",
    f"95% Confidence Interval's Lower Bound: {results_kendall['ci_lower_bound']:.4f}",
    f"95% Confidence Interval's Upper Bound: {results_kendall['ci_upper_bound']:.4f}",
    "-" * 50,
    sep="\n",
)

if not results_kendall['is_equivalent']:
    print(
        "❌ Conclusion: The ranks ARE NOT statistically equivalent.",
        f"   (The CI lower bound {results_kendall['ci_lower_bound']:.4f} is not greater than the threshold {results_kendall['correlation_threshold']})",
        sep="\n",
    )
else:
    print(
        "✅ Conclusion: The ranks ARE statistically equivalent.",
        f"   (The CI lower bound {results_kendall['ci_lower_bound']:.4f} is greater than the threshold {results_kendall['correlation_threshold']})",
        sep="\n",
    )

# One-sided paired test on the raw scores (alternative='greater') at α = 0.01.
results_wilcoxon = test_score_difference_wilcoxon(
    original_scores=squad_syntactic_results_df["original_sas"],
    perturbed_scores=squad_syntactic_results_df["syntactic_sas"],
    alpha=0.01,
    alternative='greater',
)

# The empty string reproduces the blank separator line.
print(
    "-" * 50,
    "",
    "--- Score Difference Test using Wilcoxon Signed-Rank Test ---",
    f"Significance Level (α): {results_wilcoxon['alpha']}",
    "-" * 50,
    f"Observed Wilcoxon Statistic: {results_wilcoxon['statistic']:.4f}",
    f"p-value: {results_wilcoxon['p_value']:.4f}",
    sep="\n",
)

if not results_wilcoxon['is_different']:
    print("❌ Conclusion: The scores ARE NOT statistically different.")
else:
    print("✅ Conclusion: The scores ARE statistically different.")


--- Rank Equivalence Test using Kendall's Tau ---
Equivalence Threshold (τ >): 0.9
Significance Level (α): 0.05
--------------------------------------------------
Observed Kendall's Tau: 0.8656
95% Confidence Interval's Lower Bound: 0.7531
95% Confidence Interval's Upper Bound: 0.9504
--------------------------------------------------
❌ Conclusion: The ranks ARE NOT statistically equivalent.
   (The CI lower bound 0.7531 is not greater than the threshold 0.9)
--------------------------------------------------

--- Score Difference Test using Wilcoxon Signed-Rank Test ---
Significance Level (α): 0.01
--------------------------------------------------
Observed Wilcoxon Statistic: 276.0000
p-value: 0.0000
✅ Conclusion: The scores ARE statistically different.


### Moderate Agreement


In [12]:
# Rank-equivalence check at the moderate threshold (τ > 0.8).
results_kendall = test_rank_equivalence_kendall(
    original_scores=squad_syntactic_results_df["original_sas"],
    perturbed_scores=squad_syntactic_results_df["syntactic_sas"],
    correlation_threshold=0.8,
)

# One print with sep="\n" emits exactly one line per argument.
print(
    "--- Rank Equivalence Test using Kendall's Tau ---",
    f"Equivalence Threshold (τ >): {results_kendall['correlation_threshold']}",
    f"Significance Level (α): {results_kendall['alpha']}",
    "-" * 50,
    f"Observed Kendall's Tau: {results_kendall['observed_tau']:.4f}",
    f"95% Confidence Interval's Lower Bound: {results_kendall['ci_lower_bound']:.4f}",
    f"95% Confidence Interval's Upper Bound: {results_kendall['ci_upper_bound']:.4f}",
    "-" * 50,
    sep="\n",
)

if not results_kendall['is_equivalent']:
    print(
        "❌ Conclusion: The ranks ARE NOT statistically equivalent.",
        f"   (The CI lower bound {results_kendall['ci_lower_bound']:.4f} is not greater than the threshold {results_kendall['correlation_threshold']})",
        sep="\n",
    )
else:
    print(
        "✅ Conclusion: The ranks ARE statistically equivalent.",
        f"   (The CI lower bound {results_kendall['ci_lower_bound']:.4f} is greater than the threshold {results_kendall['correlation_threshold']})",
        sep="\n",
    )

# One-sided paired test on the raw scores (alternative='greater') at α = 0.05.
results_wilcoxon = test_score_difference_wilcoxon(
    original_scores=squad_syntactic_results_df["original_sas"],
    perturbed_scores=squad_syntactic_results_df["syntactic_sas"],
    alpha=0.05,
    alternative='greater',
)

# The empty string reproduces the blank separator line.
print(
    "-" * 50,
    "",
    "--- Score Difference Test using Wilcoxon Signed-Rank Test ---",
    f"Significance Level (α): {results_wilcoxon['alpha']}",
    "-" * 50,
    f"Observed Wilcoxon Statistic: {results_wilcoxon['statistic']:.4f}",
    f"p-value: {results_wilcoxon['p_value']:.4f}",
    sep="\n",
)

if not results_wilcoxon['is_different']:
    print("❌ Conclusion: The scores ARE NOT statistically different.")
else:
    print("✅ Conclusion: The scores ARE statistically different.")


--- Rank Equivalence Test using Kendall's Tau ---
Equivalence Threshold (τ >): 0.8
Significance Level (α): 0.05
--------------------------------------------------
Observed Kendall's Tau: 0.8656
95% Confidence Interval's Lower Bound: 0.7531
95% Confidence Interval's Upper Bound: 0.9504
--------------------------------------------------
❌ Conclusion: The ranks ARE NOT statistically equivalent.
   (The CI lower bound 0.7531 is not greater than the threshold 0.8)
--------------------------------------------------

--- Score Difference Test using Wilcoxon Signed-Rank Test ---
Significance Level (α): 0.05
--------------------------------------------------
Observed Wilcoxon Statistic: 276.0000
p-value: 0.0000
✅ Conclusion: The scores ARE statistically different.


## AMEGA
### Strict Agreement


In [13]:
# Load the AMEGA syntactic-perturbation result table.
amega_syntactic_results_df = pd.read_csv("../../data/result_tables/amega_syntactic.csv")

# Rank-equivalence check at the strict threshold (τ > 0.9).
results_kendall = test_rank_equivalence_kendall(
    original_scores=amega_syntactic_results_df["original"],
    perturbed_scores=amega_syntactic_results_df["syntactic"],
    correlation_threshold=0.9,
)

# One print with sep="\n" emits exactly one line per argument.
print(
    "--- Rank Equivalence Test using Kendall's Tau ---",
    f"Equivalence Threshold (τ >): {results_kendall['correlation_threshold']}",
    f"Significance Level (α): {results_kendall['alpha']}",
    "-" * 50,
    f"Observed Kendall's Tau: {results_kendall['observed_tau']:.4f}",
    f"95% Confidence Interval's Lower Bound: {results_kendall['ci_lower_bound']:.4f}",
    f"95% Confidence Interval's Upper Bound: {results_kendall['ci_upper_bound']:.4f}",
    "-" * 50,
    sep="\n",
)

if not results_kendall['is_equivalent']:
    print(
        "❌ Conclusion: The ranks ARE NOT statistically equivalent.",
        f"   (The CI lower bound {results_kendall['ci_lower_bound']:.4f} is not greater than the threshold {results_kendall['correlation_threshold']})",
        sep="\n",
    )
else:
    print(
        "✅ Conclusion: The ranks ARE statistically equivalent.",
        f"   (The CI lower bound {results_kendall['ci_lower_bound']:.4f} is greater than the threshold {results_kendall['correlation_threshold']})",
        sep="\n",
    )

# One-sided paired test on the raw scores (alternative='greater') at α = 0.01.
results_wilcoxon = test_score_difference_wilcoxon(
    original_scores=amega_syntactic_results_df["original"],
    perturbed_scores=amega_syntactic_results_df["syntactic"],
    alpha=0.01,
    alternative='greater',
)

# The empty string reproduces the blank separator line.
print(
    "-" * 50,
    "",
    "--- Score Difference Test using Wilcoxon Signed-Rank Test ---",
    f"Significance Level (α): {results_wilcoxon['alpha']}",
    "-" * 50,
    f"Observed Wilcoxon Statistic: {results_wilcoxon['statistic']:.4f}",
    f"p-value: {results_wilcoxon['p_value']:.4f}",
    sep="\n",
)

if not results_wilcoxon['is_different']:
    print("❌ Conclusion: The scores ARE NOT statistically different.")
else:
    print("✅ Conclusion: The scores ARE statistically different.")


--- Rank Equivalence Test using Kendall's Tau ---
Equivalence Threshold (τ >): 0.9
Significance Level (α): 0.05
--------------------------------------------------
Observed Kendall's Tau: 0.8656
95% Confidence Interval's Lower Bound: 0.7540
95% Confidence Interval's Upper Bound: 0.9500
--------------------------------------------------
❌ Conclusion: The ranks ARE NOT statistically equivalent.
   (The CI lower bound 0.7540 is not greater than the threshold 0.9)
--------------------------------------------------

--- Score Difference Test using Wilcoxon Signed-Rank Test ---
Significance Level (α): 0.01
--------------------------------------------------
Observed Wilcoxon Statistic: 233.0000
p-value: 0.0014
✅ Conclusion: The scores ARE statistically different.


### Moderate Agreement


In [14]:
# AMEGA, moderate agreement: same comparison of the "original" and "syntactic"
# columns as the strict case, but with a Kendall threshold of 0.8 and a
# Wilcoxon alpha of 0.05. Reuses amega_syntactic_results_df from an earlier cell.
divider = "-" * 50

# --- Kendall's Tau rank-equivalence report ---
results_kendall = test_rank_equivalence_kendall(
    amega_syntactic_results_df["original"],
    amega_syntactic_results_df["syntactic"],
    correlation_threshold=0.8,
)

# NOTE(review): the "95%" in the two CI labels is a literal; it is not derived
# from results_kendall['alpha'] — confirm it matches the interval the helper builds.
kendall_report = [
    "--- Rank Equivalence Test using Kendall's Tau ---",
    f"Equivalence Threshold (τ >): {results_kendall['correlation_threshold']}",
    f"Significance Level (α): {results_kendall['alpha']}",
    divider,
    f"Observed Kendall's Tau: {results_kendall['observed_tau']:.4f}",
    f"95% Confidence Interval's Lower Bound: {results_kendall['ci_lower_bound']:.4f}",
    f"95% Confidence Interval's Upper Bound: {results_kendall['ci_upper_bound']:.4f}",
    divider,
]
print("\n".join(kendall_report))

# The verdict is taken from the helper's 'is_equivalent' flag; the second line
# restates the lower bound and threshold the helper reported.
if results_kendall['is_equivalent']:
    kendall_verdict = [
        "✅ Conclusion: The ranks ARE statistically equivalent.",
        f"   (The CI lower bound {results_kendall['ci_lower_bound']:.4f} is greater than the threshold {results_kendall['correlation_threshold']})",
    ]
else:
    kendall_verdict = [
        "❌ Conclusion: The ranks ARE NOT statistically equivalent.",
        f"   (The CI lower bound {results_kendall['ci_lower_bound']:.4f} is not greater than the threshold {results_kendall['correlation_threshold']})",
    ]
print("\n".join(kendall_verdict))

# --- Wilcoxon signed-rank report ---
# NOTE(review): alternative='greater' is passed through to the helper, while the
# printed conclusion below is worded as a generic "different" — confirm wording.
results_wilcoxon = test_score_difference_wilcoxon(
    amega_syntactic_results_df["original"],
    amega_syntactic_results_df["syntactic"],
    alpha=0.05,
    alternative='greater',
)
print(divider)
print()

wilcoxon_report = [
    "--- Score Difference Test using Wilcoxon Signed-Rank Test ---",
    f"Significance Level (α): {results_wilcoxon['alpha']}",
    divider,
    f"Observed Wilcoxon Statistic: {results_wilcoxon['statistic']:.4f}",
    f"p-value: {results_wilcoxon['p_value']:.4f}",
]
print("\n".join(wilcoxon_report))

wilcoxon_verdict = (
    "✅ Conclusion: The scores ARE statistically different."
    if results_wilcoxon['is_different']
    else "❌ Conclusion: The scores ARE NOT statistically different."
)
print(wilcoxon_verdict)


--- Rank Equivalence Test using Kendall's Tau ---
Equivalence Threshold (τ >): 0.8
Significance Level (α): 0.05
--------------------------------------------------
Observed Kendall's Tau: 0.8656
95% Confidence Interval's Lower Bound: 0.7540
95% Confidence Interval's Upper Bound: 0.9500
--------------------------------------------------
❌ Conclusion: The ranks ARE NOT statistically equivalent.
   (The CI lower bound 0.7540 is not greater than the threshold 0.8)
--------------------------------------------------

--- Score Difference Test using Wilcoxon Signed-Rank Test ---
Significance Level (α): 0.05
--------------------------------------------------
Observed Wilcoxon Statistic: 233.0000
p-value: 0.0014
✅ Conclusion: The scores ARE statistically different.
