In [3]:
import pandas as pd
import numpy as np
from statsmodels.stats.contingency_tables import mcnemar
import os
import glob

In [None]:
# Run banner: rule, title, rule (one item per output line).
print("=" * 70, "McNemar's Test: Fine-Tuned XLM-RoBERTa vs Baselines", "=" * 70, sep="\n")

# Every results folder is resolved relative to the current working directory.
path = os.getcwd()
FINETUNED_DIR = os.path.join(path, "xlmr_roberta_evaluation_results")  # fine-tuned model CSVs
BASELINE_BASE_DIR = os.path.join(path, "zeroshot_base_xlmr_roberta_results")  # base XLM-RoBERTa (no fine-tuning)
BASELINE_NLI_DIR = os.path.join(path, "zeroshot_baseline_xlm_roberta_nli_results")  # XLM-RoBERTa fine-tuned on NLI

# Echo the resolved folders so a wrong working directory is easy to spot.
print(FINETUNED_DIR, BASELINE_BASE_DIR, BASELINE_NLI_DIR, sep="\n")

In [5]:
def load_predictions_from_csv(filepath):
    """
    Load one model's per-sample predictions from a CSV file.

    The file must provide the columns 'Trigger' (returned as y_true),
    'prediction' (returned as y_pred) and 'correct'. Rows whose prediction
    equals 3 are treated as parse failures and dropped before anything is
    returned.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        Dictionary with 'y_true', 'y_pred' and 'correct' (arrays over the
        kept rows), 'filepath' and 'n_samples'; or None when the file cannot
        be read or a required column is missing. A message is printed in
        the None case.
    """
    required_columns = ('Trigger', 'prediction', 'correct')

    try:
        df = pd.read_csv(filepath)
    except Exception as e:
        # Best-effort loader: report the problem and let the caller skip the file.
        print(f"Error loading {filepath}: {e}")
        return None

    # Fail with an explicit message instead of a bare KeyError text such as
    # "'prediction'" when the file does not have the expected schema.
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        print(f"Error loading {filepath}: missing required column(s) {missing}")
        return None

    # filter out parse failures (prediction == 3)
    df_filtered = df[df['prediction'] != 3]

    return {
        'y_true': df_filtered['Trigger'].values,
        'y_pred': df_filtered['prediction'].values,
        'correct': df_filtered['correct'].values,
        'filepath': filepath,
        'n_samples': len(df_filtered)
    }

def find_csv_file(directory, model_name, language):
    """
    Find the most recent CSV file matching model and language.

    Files are expected to be named
    ``<model_name>_<language>_YYYYMMDD_HHMMSS.csv`` inside ``directory``.
    The file with the latest timestamp in its name is selected.

    Args:
        directory: Folder to search (not searched recursively).
        model_name: File-name prefix identifying the model.
        language: Language component of the file name.

    Returns:
        Path of the newest matching file, or None (after printing a
        warning) when nothing matches.
    """
    # Escape each component so characters such as '[' or '*' in a folder or
    # model name are matched literally; only the trailing '*' is a wildcard.
    pattern = os.path.join(
        glob.escape(directory),
        f"{glob.escape(model_name)}_{glob.escape(language)}_*.csv",
    )
    files = glob.glob(pattern)

    if not files:
        print(f"Warning: No files found matching {pattern}")
        return None

    def get_timestamp_from_filename(filepath):
        # Sort key: 'YYYYMMDDHHMMSS' taken from the last two '_' separated
        # parts of the file stem, or '0' when they are not a timestamp.
        stem = os.path.splitext(os.path.basename(filepath))[0]
        parts = stem.split('_')
        if len(parts) >= 2:
            date_part, time_part = parts[-2], parts[-1]
            stamp = date_part + time_part
            looks_like_timestamp = (
                len(date_part) == 8
                and len(time_part) == 6
                and all(ch in "0123456789" for ch in stamp)
            )
            if looks_like_timestamp:
                return stamp
        # '0' sorts below every real timestamp, so files without a valid
        # timestamp can never beat a properly stamped one.
        return '0'

    # Fixed-width digit strings compare chronologically. Sorting first makes
    # the choice among equal timestamps independent of glob's listing order.
    most_recent = max(sorted(files), key=get_timestamp_from_filename)
    return most_recent

def perform_mcnemar_test(model1_pred, model2_pred, y_true, model1_name, model2_name):
    """
    Perform McNemar's test comparing two models on the same samples.

    Element i of ``model1_pred``, ``model2_pred`` and ``y_true`` must refer
    to the same sample; the test is a paired test.

    Args:
        model1_pred: Predictions of the first model (array-like).
        model2_pred: Predictions of the second model (array-like).
        y_true: Gold labels (array-like, same length as the predictions).
        model1_name: Display name of the first model.
        model2_name: Display name of the second model.

    Returns:
        Dictionary with the model names, the test 'statistic' and 'p_value',
        the four contingency counts, both accuracies, 'significant'
        (p_value < 0.05) and 'n_samples'.

    Raises:
        AssertionError: If the three arrays differ in length.
    """
    # Coerce so that plain lists compare element-wise instead of as a whole.
    model1_pred = np.asarray(model1_pred)
    model2_pred = np.asarray(model2_pred)
    y_true = np.asarray(y_true)

    #ensure same samples
    assert len(model1_pred) == len(model2_pred) == len(y_true), \
        "Prediction arrays must have same length"

    correct1 = (model1_pred == y_true)
    correct2 = (model2_pred == y_true)

    #contingency table
    both_correct = np.sum(correct1 & correct2)
    only_1_correct = np.sum(correct1 & ~correct2)
    only_2_correct = np.sum(~correct1 & correct2)
    both_wrong = np.sum(~correct1 & ~correct2)

    # McNemar's test contingency table format
    contingency_table = np.array([[both_correct, only_1_correct],
                                   [only_2_correct, both_wrong]])

    if only_1_correct + only_2_correct == 0:
        # No discordant pairs: the chi-square statistic divides by
        # (only_1_correct + only_2_correct), so it is undefined here and
        # would otherwise surface as an infinite statistic with p = 0,
        # i.e. a spurious "significant" result. The models are
        # indistinguishable on this data, so report no difference.
        statistic, p_value = 0.0, 1.0
    else:
        #performs test (chi-square approximation with continuity correction)
        result = mcnemar(contingency_table, exact=False, correction=True)
        statistic, p_value = result.statistic, result.pvalue

    # Avoid numpy's "mean of empty slice" warning on empty input.
    n_samples = len(y_true)
    acc1 = np.mean(correct1) if n_samples else float('nan')
    acc2 = np.mean(correct2) if n_samples else float('nan')

    return {
        'model1': model1_name,
        'model2': model2_name,
        'statistic': statistic,
        'p_value': p_value,
        'both_correct': both_correct,
        'only_model1_correct': only_1_correct,
        'only_model2_correct': only_2_correct,
        'both_wrong': both_wrong,
        'model1_accuracy': acc1,
        'model2_accuracy': acc2,
        'significant': p_value < 0.05,
        'n_samples': n_samples
    }


def print_test_result(result, language):
    """
    Print a readable report for one McNemar comparison.

    Args:
        result: Dictionary as returned by perform_mcnemar_test.
        language: Language label shown in the report header.
    """
    rule = '=' * 70
    first = result['model1']
    second = result['model2']
    acc_first = result['model1_accuracy']
    acc_second = result['model2_accuracy']

    # Header
    print("\n" + rule)
    print("Language: {}".format(language))
    print("Comparing: {} vs {}".format(first, second))
    print(rule)
    print("Sample size: {}".format(result['n_samples']))

    # Accuracy of each model and their gap (first minus second)
    print("\nAccuracies:")
    print("  {}: {}".format(first, format(acc_first, '.4f')))
    print("  {}: {}".format(second, format(acc_second, '.4f')))
    print("  Difference: " + format(acc_first - acc_second, '.4f'))

    # Agreement / disagreement counts; names are cut and padded to 20 chars
    # so the counts line up.
    print("\nContingency Table:")
    print("  Both correct:              " + str(result['both_correct']))
    print("  Only " + first[:20].ljust(20) + " correct: " + str(result['only_model1_correct']))
    print("  Only " + second[:20].ljust(20) + " correct: " + str(result['only_model2_correct']))
    print("  Both wrong:                " + str(result['both_wrong']))

    # Test outcome
    print("\nMcNemar's Test:")
    print("  Chi-squared statistic: " + format(result['statistic'], '.4f'))
    print("  P-value: " + format(result['p_value'], '.4f'))
    print("  Significance level: α = 0.05")

    if not result['significant']:
        print("\nRESULT: No significant difference (p ≥ 0.05)")
        return

    # The model with more "only I was right" samples is the better one.
    if result['only_model1_correct'] > result['only_model2_correct']:
        winner = first
    else:
        winner = second
    print("\nRESULT: {} is SIGNIFICANTLY BETTER (p < 0.05)".format(winner))

In [None]:
# Fine-tuned models to compare against every baseline (file-name prefixes
# inside FINETUNED_DIR).
FINETUNED_MODELS = [
    "XLM-R-Multilingual",
    "XLM-R-Arabic", 
    "XLM-R-English"
]

# Dictionary of baselines: {display_name: (directory, file_prefix)}
BASELINES = {
    "XLM-R-Base (Zero-Shot)": (BASELINE_BASE_DIR, "ZeroShot_base_trigger_non-trigger"),
    "XLM-R-NLI (Zero-Shot)": (BASELINE_NLI_DIR, "ZeroShot_trigger_non-trigger")
}

LANGUAGES = ["Arabic", "English"]

# One summary row (dict) per (fine-tuned model, baseline, language) comparison.
all_results = []

for language in LANGUAGES:
    print(f"\n{'#'*70}")
    print(f"# TESTING: {language.upper()}")
    print(f"{'#'*70}")
    
    # Load all baseline models for this language
    baseline_data_dict = {}
    
    for baseline_display_name, (baseline_dir, baseline_prefix) in BASELINES.items():
        baseline_file = find_csv_file(baseline_dir, baseline_prefix, language)
        
        if baseline_file is None:
            print(f"Warning: Could not find {baseline_display_name} for {language}")
            continue
        
        baseline_data = load_predictions_from_csv(baseline_file)
        if baseline_data is None:
            print(f"Warning: Could not load {baseline_display_name} for {language}")
            continue
        
        baseline_data_dict[baseline_display_name] = baseline_data
        print(f"\nLoaded baseline: {baseline_display_name}")
        print(f"  File: {baseline_file}")
        print(f"  Samples: {baseline_data['n_samples']}")
    
    if not baseline_data_dict:
        print(f"Skipping {language}: No baseline files found")
        continue
    
    # Compare each fine-tuned model against each baseline
    for ft_model in FINETUNED_MODELS:
        # Load fine-tuned model predictions
        ft_file = find_csv_file(FINETUNED_DIR, ft_model, language)
        
        if ft_file is None:
            print(f"Skipping {ft_model}: File not found")
            continue
        
        ft_data = load_predictions_from_csv(ft_file)
        if ft_data is None:
            continue
        
        print(f"\n{'*'*70}")
        print(f"Fine-tuned model: {ft_model}")
        print(f"  File: {ft_file}")
        print(f"  Samples: {ft_data['n_samples']}")
        print(f"{'*'*70}")
        
        # Compare against each baseline
        for baseline_name, baseline_data in baseline_data_dict.items():
            
            if ft_data['n_samples'] != baseline_data['n_samples']:
                print(f"Warning: Sample size mismatch!")
                print(f"  Fine-tuned: {ft_data['n_samples']}, {baseline_name}: {baseline_data['n_samples']}")
                # Compare only the leading rows both files have; whether those
                # rows are really the same samples is checked below.
                min_samples = min(ft_data['n_samples'], baseline_data['n_samples'])
                ft_pred = ft_data['y_pred'][:min_samples]
                baseline_pred = baseline_data['y_pred'][:min_samples]
                y_true = ft_data['y_true'][:min_samples]
                baseline_true = baseline_data['y_true'][:min_samples]
            else:
                ft_pred = ft_data['y_pred']
                baseline_pred = baseline_data['y_pred']
                y_true = ft_data['y_true']
                baseline_true = baseline_data['y_true']
            
            if len(y_true) == 0:
                print(f"Skipping {ft_model} vs {baseline_name}: no samples to compare")
                continue
            
            # McNemar's test is a paired test: row i must be the same sample
            # in both files. Rows are matched by position only, so if the gold
            # labels of the two files disagree the rows are not paired (e.g.
            # rows were dropped from one file) and any result would be
            # meaningless.
            if not np.array_equal(y_true, baseline_true):
                print(f"Skipping {ft_model} vs {baseline_name}: gold labels of the two files "
                      f"do not match row by row, so the predictions are not paired")
                continue
            
            # McNemar's test
            result = perform_mcnemar_test(
                ft_pred,
                baseline_pred,
                y_true,
                ft_model,
                baseline_name
            )
            
            result['language'] = language
            print_test_result(result, language)
            
            all_results.append({
                'Fine-tuned Model': ft_model,
                'Baseline': baseline_name,
                'Language': language,
                'FT Accuracy': result['model1_accuracy'],
                'Baseline Accuracy': result['model2_accuracy'],
                'Improvement': result['model1_accuracy'] - result['model2_accuracy'],
                'P-value': result['p_value'],
                'Significant (α=0.05)': 'Yes' if result['significant'] else '❌ No',
                'Chi-squared': result['statistic']
            })

# Summary Table: print all comparisons, export them as CSV and LaTeX, then
# print a few headline figures.

print("\n" + "="*70)
print("SUMMARY: Fine-tuned XLM-RoBERTa vs All Baselines")
print("="*70)

summary_df = pd.DataFrame(all_results)

if len(summary_df) > 0:
    print("\n" + summary_df.to_string(index=False))
    output_path = os.getcwd()
    
    csv_path = os.path.join(output_path, "mcnemar_test_results_all_baselines.csv")
    summary_df.to_csv(csv_path, index=False)
    print(f"\nResults saved to: {csv_path}")
    
    # Create a copy for LaTeX with formatted p-values
    latex_df = summary_df[['Fine-tuned Model', 'Baseline', 'Language', 'FT Accuracy', 
                           'Baseline Accuracy', 'Improvement', 'P-value', 'Significant (α=0.05)']].copy()
    
    # Format p-values: show "<0.01" if p < 0.01, otherwise show actual value
    latex_df['P-value'] = latex_df['P-value'].apply(
        lambda x: '$<0.01$' if x < 0.01 else f'{x:.4f}'
    )
    
    # escape=False keeps the '$<0.01$' math markup intact in the output.
    latex_str = latex_df.to_latex(
        index=False,
        caption="McNemar's Test: Fine-tuned XLM-RoBERTa vs All Baselines (α=0.05)",
        label="tab:mcnemar_all_baselines",
        escape=False
    )
    
    latex_path = os.path.join(output_path, "mcnemar_test_results_all_baselines.tex")
    # The table contains non-ASCII characters (e.g. 'α'); write UTF-8
    # explicitly so the export does not depend on the platform's default
    # encoding.
    with open(latex_path, 'w', encoding='utf-8') as f:
        f.write(latex_str)
    print(f"LaTeX table saved to: {latex_path}")
    
    print("\n" + "="*70)
    print("KEY FINDINGS")
    print("="*70)
    
    # Significant improvements count: a significant test only says the models
    # differ, so additionally require the fine-tuned model to be the better one.
    is_sig_improvement = (
        (summary_df['Significant (α=0.05)'] == 'Yes') & (summary_df['Improvement'] > 0)
    )
    sig_improvements = summary_df[is_sig_improvement]
    print(f"\nSignificant improvements: {len(sig_improvements)} out of {len(summary_df)}")
    
    if len(sig_improvements) > 0:
        print("\nModels with significant improvement over baselines:")
        for _, row in sig_improvements.iterrows():
            print(f"  • {row['Fine-tuned Model']} ({row['Language']}) vs {row['Baseline']}: "
                  f"{row['Improvement']:+.4f} accuracy, p={row['P-value']:.4f}")
    
    # Best performing model
    best_model = summary_df.loc[summary_df['FT Accuracy'].idxmax()]
    print(f"\nBest performing fine-tuned model:")
    print(f"  {best_model['Fine-tuned Model']} ({best_model['Language']})")
    print(f"  Accuracy: {best_model['FT Accuracy']:.4f}")
    
    # Group by baseline to see performance against each
    print("\n" + "="*70)
    print("PERFORMANCE BY BASELINE")
    print("="*70)
    for baseline_name in summary_df['Baseline'].unique():
        baseline_subset = summary_df[summary_df['Baseline'] == baseline_name]
        avg_improvement = baseline_subset['Improvement'].mean()
        n_sig_improvements = int(
            ((baseline_subset['Significant (α=0.05)'] == 'Yes')
             & (baseline_subset['Improvement'] > 0)).sum()
        )
        print(f"\n{baseline_name}:")
        # '+' in the format spec prints the real sign (no '+-' for negatives).
        print(f"  Average improvement: {avg_improvement:+.4f}")
        print(f"  Significant improvements: {n_sig_improvements} out of {len(baseline_subset)}")
    
else:
    print("\nNo results to display. Check file paths.")

print("\n" + "="*70)
print("TESTING COMPLETE")
print("="*70)