In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy import stats
import os
import warnings
import json
from datetime import datetime
warnings.filterwarnings('ignore')

In [3]:
# Per-model evaluation inputs. Each entry maps a display name to:
#   'arabic' / 'english' : prediction CSV path, joined onto the current working directory by the loaders
#   'label_col'          : column holding the ground-truth label
#   'pred_col'           : column holding the model prediction
# Paths use forward slashes throughout so they resolve on every OS; a backslash
# separator is both an invalid string escape ("\l") and Windows-only.
MODEL_CONFIGS = {
    'Llama3-8B-Arabic': {
        'arabic': 'memo-llama3-8b-instruct-arabic_evaluation_results/test_predictions_20251125_051031.csv',
        'english': 'memo-llama3-8b-instruct-arabic_evaluation_results/test_predictions_20260110_091937.csv',
        'label_col': 'label',
        'pred_col': 'prediction'
    },
    'Meta-Llama-3-8B-Instruct': {
        'arabic': 'meta-llama-3-8b-instruct/llama_3b_base_test_predictions_arabic_20260202_112839.csv',
        'english': 'meta-llama-3-8b-instruct/llama_3b_base_test_predictions_eng_20260202_110950.csv',
        'label_col': 'label',
        'pred_col': 'prediction'
    },
    'Claude-Haiku-4.5': {
        'arabic': 'gpt and claude eval/Claude-Haiku-4.5_Arabic_20251210_104613.csv',
        'english': 'gpt and claude eval/Claude-Haiku-4.5_English_20251210_112951.csv',
        'label_col': 'Trigger',
        'pred_col': 'prediction'
    },
    'GPT-4o-mini': {
        'arabic': 'gpt and claude eval/GPT-4o-mini_Arabic_20251210_084029.csv',
        'english': 'gpt and claude eval/GPT-4o-mini_English_20251210_091410.csv',
        'label_col': 'Trigger',
        'pred_col': 'prediction'
    },
    'XLM-RoBERTa': {
        'arabic': 'zeroshot_base_xlmr_roberta_results/ZeroShot_base_trigger_non-trigger_Arabic_20251125_042246.csv',
        'english': 'zeroshot_base_xlmr_roberta_results/ZeroShot_base_trigger_non-trigger_English_20251125_042345.csv',
        'label_col': 'Trigger',
        'pred_col': 'prediction'
    },
    'xlm-roberta-large-xnli': {
        'arabic': 'zeroshot_baseline_xlm_roberta_nli_results/ZeroShot_trigger_non-trigger_Arabic_20251125_035721.csv',
        'english': 'zeroshot_baseline_xlm_roberta_nli_results/ZeroShot_trigger_non-trigger_English_20251125_040107.csv',
        'label_col': 'Trigger',
        'pred_col': 'prediction'
    },
    'XLM-R-Arabic-FT': {
        'arabic': 'xlmr_roberta_evaluation_results/XLM-R-Arabic_Arabic_20251125_022752.csv',
        'english': 'xlmr_roberta_evaluation_results/XLM-R-Arabic_English_20251125_022759.csv',
        'label_col': 'Trigger',
        'pred_col': 'prediction'
    },
    'XLM-R-English-FT': {
        'arabic': 'xlmr_roberta_evaluation_results/XLM-R-English_Arabic_20251125_022832.csv',
        'english': 'xlmr_roberta_evaluation_results/XLM-R-English_English_20251125_022839.csv',
        'label_col': 'Trigger',
        'pred_col': 'prediction'
    },
    'XLM-R-Multilingual-FT': {
        'arabic': 'xlmr_roberta_evaluation_results/XLM-R-Multilingual_Arabic_20251125_022713.csv',
        'english': 'xlmr_roberta_evaluation_results/XLM-R-Multilingual_English_20251125_022720.csv',
        'label_col': 'Trigger',
        'pred_col': 'prediction'
    }
}

# Sampling parameters
TARGET_DISTRIBUTIONS = np.arange(0.05, 1.0, 0.05)  # target class-1 proportions: 5%, 10%, ..., 95%
SAMPLE_SIZES = [300, 500, 700]                     # rows drawn per resample
N_ITERATIONS = 200                                 # resamples per (distribution, sample size)
CONFIDENCE_LEVEL = 0.95                            # level of the reported confidence intervals

# The resampling below draws from NumPy's global RNG; seed it so that a
# Restart & Run All reproduces the same samples, metrics and figures.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [4]:
def load_model_data(model_name, language, config):
    """Read one model's prediction CSV and split out labels and predictions.

    Args:
        model_name: Display name of the model (not used while loading).
        language: Key in ``config`` whose value is the CSV path to read.
        config: Mapping holding the CSV path under ``language`` plus the
            'label_col' and 'pred_col' column names.

    Returns:
        Tuple ``(true_labels, predictions, df)``: the two columns as arrays
        and the full DataFrame they were taken from.

    Raises:
        FileNotFoundError: If the CSV, resolved against the current working
            directory, does not exist.
    """
    csv_path = os.path.join(os.getcwd(), config[language])

    if os.path.exists(csv_path):
        frame = pd.read_csv(csv_path)
        label_values = frame[config['label_col']].values
        prediction_values = frame[config['pred_col']].values
        return label_values, prediction_values, frame

    raise FileNotFoundError(f"File not found: {csv_path}")

def stratified_sample_by_distribution(true_labels, predictions, target_prop_class1, sample_size, df=None, rng=None):
    """Draw a sample, without replacement, with a chosen share of class-1 rows.

    Args:
        true_labels: NumPy array of ground-truth labels; rows equal to 1 and 0
            form the two sampling pools.
        predictions: NumPy array aligned with ``true_labels``.
        target_prop_class1: Desired fraction of class-1 rows in the sample.
        sample_size: Total number of rows to draw.
        df: Unused; kept so existing callers that pass it keep working.
        rng: Optional ``numpy.random.Generator`` (or ``RandomState``) to draw
            from. Defaults to NumPy's global random state.

    Returns:
        Tuple ``(sampled_true, sampled_pred, sampled_idx)``, or
        ``(None, None, None)`` when either class has too few rows to satisfy
        the request.
    """
    sampler = np.random if rng is None else rng

    class1_idx = np.where(true_labels == 1)[0]
    class0_idx = np.where(true_labels == 0)[0]

    # Round rather than truncate: float noise (e.g. 0.29 * 100 == 28.999...)
    # would otherwise silently drop one class-1 row from the target mix.
    n_class1 = int(round(float(sample_size * target_prop_class1)))
    n_class0 = sample_size - n_class1

    if n_class1 > len(class1_idx) or n_class0 > len(class0_idx):
        return None, None, None

    sampled_class1_idx = sampler.choice(class1_idx, size=n_class1, replace=False)
    sampled_class0_idx = sampler.choice(class0_idx, size=n_class0, replace=False)

    # Shuffle so the returned rows are not grouped by class.
    sampled_idx = np.concatenate([sampled_class1_idx, sampled_class0_idx])
    sampler.shuffle(sampled_idx)

    return true_labels[sampled_idx], predictions[sampled_idx], sampled_idx

def calculate_metrics(true_labels, predictions):
    """Score predictions against labels.

    Returns:
        Dict with keys 'accuracy', 'precision', 'recall' and 'f1'. Precision,
        recall and F1 are computed with ``zero_division=0``.
    """
    scores = {'accuracy': accuracy_score(true_labels, predictions)}

    guarded_scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    for key, scorer in guarded_scorers:
        scores[key] = scorer(true_labels, predictions, zero_division=0)

    return scores

def calculate_confidence_interval(values, confidence=0.95):
    """Two-sided Student-t confidence interval for the mean of ``values``.

    Args:
        values: Sequence of numeric observations.
        confidence: Confidence level in (0, 1); defaults to 0.95.

    Returns:
        Tuple ``(lower, upper)``. An empty input yields ``(0, 0)``; a single
        observation yields the degenerate interval ``(mean, mean)``.
    """
    n = len(values)
    if n == 0:
        return 0, 0

    mean = np.mean(values)
    if n == 1:
        # The standard error and the t quantile are both undefined with zero
        # degrees of freedom; report a zero-width interval instead of NaNs.
        return mean, mean

    std_err = stats.sem(values)
    margin = std_err * stats.t.ppf((1 + confidence) / 2, n - 1)

    return mean - margin, mean + margin

def analyze_prediction_distributions(model_name, language, config):
    """Compare how often a model predicts each class with the true label mix.

    Loads the model's predictions for ``language`` and counts the rows equal
    to 1 and to 0 in both the labels and the predictions.

    Returns:
        Dict with the model name, language, total row count, the count and
        proportion of each class for labels ('true_*') and predictions
        ('pred_*'), and 'prediction_bias' (predicted minus true class-1
        proportion).
    """
    print(f"Analyzing prediction distribution for {model_name} ({language})...")

    y_true, y_pred, _ = load_model_data(model_name, language, config)
    n_total = len(y_pred)

    tallies = (
        ('true', np.sum(y_true == 1), np.sum(y_true == 0)),
        ('pred', np.sum(y_pred == 1), np.sum(y_pred == 0)),
    )

    summary = {'model': model_name, 'language': language, 'total_samples': n_total}
    for source, n_class1, n_class0 in tallies:
        summary[f'{source}_class1_count'] = n_class1
        summary[f'{source}_class1_proportion'] = n_class1 / n_total
        summary[f'{source}_class0_count'] = n_class0
        summary[f'{source}_class0_proportion'] = n_class0 / n_total

    summary['prediction_bias'] = summary['pred_class1_proportion'] - summary['true_class1_proportion']
    return summary

def evaluate_across_distributions(model_name, language, config, sample_size, save_samples=True):
    """Resample one model's predictions at every target class mix and score them.

    For each proportion in ``TARGET_DISTRIBUTIONS`` the predictions are
    resampled ``N_ITERATIONS`` times at ``sample_size`` rows and scored; the
    per-metric mean, standard deviation and confidence interval are recorded.
    Proportions for which no sample could be drawn produce no result row.

    Args:
        model_name: Display name of the model.
        language: Language key of the prediction file to evaluate.
        config: The model's entry from ``MODEL_CONFIGS``.
        sample_size: Number of rows per resample.
        save_samples: When true, keep the row indices drawn in the first 10
            iterations of each proportion.

    Returns:
        Tuple ``(results_df, sampled_indices_tracker)``: one DataFrame row per
        evaluated proportion, and the list of saved index records.
    """
    print(f"Evaluating {model_name} ({language}) with sample_size={sample_size}...")

    y_true, y_pred, frame = load_model_data(model_name, language, config)

    metric_names = ('accuracy', 'precision', 'recall', 'f1')
    summary_rows = []
    index_log = []

    for proportion in TARGET_DISTRIBUTIONS:
        collected = {name: [] for name in metric_names}
        kept_draws = []

        for draw_no in range(N_ITERATIONS):
            drawn_true, drawn_pred, drawn_idx = stratified_sample_by_distribution(
                y_true, y_pred, proportion, sample_size, frame
            )

            # A None result means this mix cannot be drawn from the data.
            if drawn_true is not None:
                if save_samples and draw_no < 10:
                    kept_draws.append({
                        'iteration': draw_no,
                        'indices': drawn_idx.tolist()
                    })

                for name, score in calculate_metrics(drawn_true, drawn_pred).items():
                    collected[name].append(score)

        if save_samples and kept_draws:
            index_log.append({
                'model': model_name,
                'language': language,
                'sample_size': sample_size,
                'target_distribution': proportion,
                'iterations': kept_draws
            })

        if not collected['accuracy']:
            continue

        row = {
            'model': model_name,
            'language': language,
            'sample_size': sample_size,
            'distribution': proportion,
        }
        for name in metric_names:
            scores = collected[name]
            lower, upper = calculate_confidence_interval(scores, CONFIDENCE_LEVEL)
            row[f'{name}_mean'] = np.mean(scores)
            row[f'{name}_std'] = np.std(scores)
            row[f'{name}_ci_lower'] = lower
            row[f'{name}_ci_upper'] = upper
        summary_rows.append(row)

    return pd.DataFrame(summary_rows), index_log

def run_full_evaluation(save_samples=True):
    """Run the prediction-distribution analysis and the full resampling sweep.

    First summarises each model/language prediction distribution and writes
    it to 'model_prediction_distributions.csv' in the current working
    directory. Then evaluates every model, language and sample size. A
    failure in any single run is printed and skipped.

    Args:
        save_samples: When true, the sampled row indices collected during the
            sweep are written to a timestamped JSON file.

    Returns:
        Tuple ``(final_results, pred_dist_df)``: the concatenated per-run
        results and the prediction-distribution table.
    """
    languages = ['arabic', 'english']
    banner = "=" * 80

    metric_frames = []
    index_records = []
    distribution_rows = []

    print("\n" + banner)
    print("ANALYZING PREDICTION DISTRIBUTIONS")
    print(banner + "\n")

    for model_name, config in MODEL_CONFIGS.items():
        for language in languages:
            try:
                distribution_rows.append(
                    analyze_prediction_distributions(model_name, language, config)
                )
            except Exception as e:
                print(f"Error analyzing {model_name} ({language}): {e}")

    pred_dist_df = pd.DataFrame(distribution_rows)
    base_dir = os.getcwd()
    pred_dist_path = os.path.join(base_dir, 'model_prediction_distributions.csv')
    pred_dist_df.to_csv(pred_dist_path, index=False)
    print(f"\nPrediction distributions saved to '{pred_dist_path}'")

    print("\n" + banner)
    print("RUNNING MAIN EVALUATION")
    print(banner + "\n")

    # Flatten the model x language x sample-size grid into one job list.
    jobs = [
        (model_name, config, language, sample_size)
        for model_name, config in MODEL_CONFIGS.items()
        for language in languages
        for sample_size in SAMPLE_SIZES
    ]
    total_runs = len(jobs)

    for run_no, (model_name, config, language, sample_size) in enumerate(jobs, start=1):
        print(f"\n[{run_no}/{total_runs}]", end=" ")
        try:
            results_df, sampled_indices = evaluate_across_distributions(
                model_name, language, config, sample_size, save_samples
            )
            metric_frames.append(results_df)
            index_records.extend(sampled_indices)
        except Exception as e:
            print(f"Error evaluating {model_name} ({language}, size={sample_size}): {e}")

    if save_samples and index_records:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        samples_path = os.path.join(base_dir, f'sampled_indices_{timestamp}.json')
        with open(samples_path, 'w') as f:
            json.dump(index_records, f, indent=2)
        print(f"\nSampled indices saved to '{samples_path}'")

    return pd.concat(metric_frames, ignore_index=True), pred_dist_df

def plot_prediction_distributions(pred_dist_df, save_path='prediction_distributions.png'):
    """Plot prediction bias and class-1 proportions per model on a 2x2 grid.

    Columns are Arabic (left) and English (right). The top row shows each
    model's prediction bias as horizontal bars (red below zero, green
    otherwise); the bottom row shows true versus predicted class-1
    proportion side by side. Models are ordered by prediction bias. The
    figure is saved to ``save_path`` and then shown.
    """
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    bar_width = 0.35
    panels = (('arabic', 'Arabic'), ('english', 'English'))

    for col, (lang_key, lang_title) in enumerate(panels):
        subset = pred_dist_df[pred_dist_df['language'] == lang_key].sort_values('prediction_bias')
        slots = np.arange(len(subset))

        # Top row: signed prediction bias per model.
        bias_ax = axes[0, col]
        bar_colors = ['red' if bias < 0 else 'green' for bias in subset['prediction_bias']]
        bias_ax.barh(slots, subset['prediction_bias'], color=bar_colors, alpha=0.6)
        bias_ax.set_yticks(slots)
        bias_ax.set_yticklabels(subset['model'], fontsize=9)
        bias_ax.set_xlabel('Prediction Bias (Predicted - True Class 1 Proportion)', fontsize=11)
        bias_ax.set_title(f'{lang_title}: Model Prediction Bias', fontsize=12, fontweight='bold')
        bias_ax.axvline(x=0, color='black', linestyle='--', linewidth=1)
        bias_ax.grid(True, alpha=0.3, axis='x')

        # Bottom row: true vs predicted class-1 share per model.
        share_ax = axes[1, col]
        model_names = subset['model'].values
        share_ax.bar(slots - bar_width/2, subset['true_class1_proportion'], bar_width,
                     label='True Class 1', alpha=0.8, color='blue')
        share_ax.bar(slots + bar_width/2, subset['pred_class1_proportion'], bar_width,
                     label='Predicted Class 1', alpha=0.8, color='orange')
        share_ax.set_ylabel('Proportion', fontsize=11)
        share_ax.set_title(f'{lang_title}: True vs Predicted Class 1 Distribution',
                           fontsize=12, fontweight='bold')
        share_ax.set_xticks(slots)
        share_ax.set_xticklabels(model_names, rotation=45, ha='right', fontsize=8)
        share_ax.legend(fontsize=9)
        share_ax.grid(True, alpha=0.3, axis='y')
        share_ax.set_ylim([0, 1])

    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f"\nPrediction distribution plot saved to {save_path}")
    plt.show()

def print_prediction_distribution_summary(pred_dist_df):
    """Print one fixed-width row per model/language with its class-1 statistics.

    Columns: model, language, true class-1 share, predicted class-1 share,
    prediction bias and total row count.
    """
    rule = "=" * 100
    print("\n" + rule)
    print("MODEL PREDICTION DISTRIBUTION SUMMARY")
    print(rule)

    header_cells = (
        f"{'Model':<25}",
        f"{'Lang':<7}",
        f"{'True C1%':<10}",
        f"{'Pred C1%':<10}",
        f"{'Bias':<10}",
        f"{'Total N'}",
    )
    print(' '.join(header_cells))
    print("-" * 100)

    for _, entry in pred_dist_df.iterrows():
        # Numeric cells carry two trailing spaces; the join adds the third.
        cells = (
            f"{entry['model']:<25}",
            f"{entry['language']:<7}",
            f"{entry['true_class1_proportion']:>8.1%}  ",
            f"{entry['pred_class1_proportion']:>8.1%}  ",
            f"{entry['prediction_bias']:>8.3f}  ",
            f"{entry['total_samples']:>6}",
        )
        print(' '.join(cells))

def plot_results_by_sample_size(results_df, metric='f1', save_path='distribution_analysis_by_sample_size.png'):
    """Plot a metric against the class-1 proportion, one row per sample size.

    Each row of the grid corresponds to an entry of ``SAMPLE_SIZES``; the
    left column shows Arabic and the right column English. Every model is
    drawn as a line of metric means with its confidence band shaded. The
    figure is saved to ``save_path`` and then shown.
    """
    n_rows = len(SAMPLE_SIZES)
    # squeeze=False keeps ``axes`` two-dimensional even for a single row.
    fig, axes = plt.subplots(n_rows, 2, figsize=(16, 6 * n_rows), squeeze=False)

    panels = (('arabic', 'Arabic'), ('english', 'English'))
    mean_col = f'{metric}_mean'
    lower_col = f'{metric}_ci_lower'
    upper_col = f'{metric}_ci_upper'

    for row, sample_size in enumerate(SAMPLE_SIZES):
        for col, (lang_key, lang_title) in enumerate(panels):
            ax = axes[row, col]
            subset = results_df[(results_df['language'] == lang_key) &
                                (results_df['sample_size'] == sample_size)]

            for model in subset['model'].unique():
                curve = subset[subset['model'] == model]
                ax.plot(curve['distribution'], curve[mean_col],
                        marker='o', label=model, linewidth=2)
                ax.fill_between(curve['distribution'], curve[lower_col], curve[upper_col],
                                alpha=0.2)

            ax.set_xlabel('Proportion of Class 1 (Triggering)', fontsize=12)
            ax.set_ylabel(f'{metric.upper()} Score', fontsize=12)
            ax.set_title(f'{lang_title} - Sample Size {sample_size} - {metric.upper()}',
                         fontsize=14, fontweight='bold')
            ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=9)
            ax.grid(True, alpha=0.3)
            ax.set_ylim([0, 1.05])

    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f"Plot saved to {save_path}")
    plt.show()

def _draw_sample_size_panel(ax, panel_data, language_title, metric, distribution):
    """Draw grouped bars on ``ax``: one group per sample size, one bar per model."""
    size_order = list(SAMPLE_SIZES)
    ticks = np.arange(len(size_order))
    models = panel_data['model'].unique()
    # max(..., 1) keeps an empty panel from dividing by zero.
    width = 0.8 / max(len(models), 1)

    for i, model in enumerate(models):
        model_data = panel_data[(panel_data['model'] == model) &
                                panel_data['sample_size'].isin(size_order)].sort_values('sample_size')
        if model_data.empty:
            continue

        # Place each bar at the tick of its own sample size, so a model with
        # a missing sample size neither shifts nor breaks the plot.
        slots = np.array([size_order.index(size) for size in model_data['sample_size']])
        # (len(models) - 1) / 2 centres the whole group on its tick; bars are
        # centre-aligned, so len(models) / 2 would shift it half a bar left.
        positions = slots + (i - (len(models) - 1) / 2) * width

        means = model_data[f'{metric}_mean'].to_numpy()
        below = means - model_data[f'{metric}_ci_lower'].to_numpy()
        above = model_data[f'{metric}_ci_upper'].to_numpy() - means

        ax.bar(positions, means, width, label=model, alpha=0.8)
        ax.errorbar(positions, means, yerr=[below, above],
                    fmt='none', ecolor='black', capsize=3, alpha=0.6)

    ax.set_xlabel('Sample Size', fontsize=12)
    ax.set_ylabel(f'{metric.upper()} Score', fontsize=12)
    ax.set_title(f'{language_title} - {metric.upper()} by Sample Size (at {distribution:.0%} Class 1)',
                 fontsize=14, fontweight='bold')
    ax.set_xticks(ticks)
    ax.set_xticklabels(size_order)
    if len(models) > 0:
        ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3, axis='y')
    ax.set_ylim([0, 1.05])


def plot_sample_size_comparison(results_df, metric='f1', distribution=0.5, save_path='sample_size_comparison.png'):
    """Compare a metric across sample sizes at one class-1 proportion.

    Draws two grouped-bar panels (Arabic left, English right) with one group
    per entry of ``SAMPLE_SIZES`` and one bar per model, with error bars
    spanning the confidence interval. Each panel uses the models present in
    its own language slice. The figure is saved to ``save_path`` and shown.

    Args:
        results_df: Results table with 'model', 'language', 'sample_size',
            'distribution' and the '<metric>_mean' / '_ci_lower' / '_ci_upper'
            columns.
        metric: Metric prefix to plot.
        distribution: Class-1 proportion to select (matched within 0.01).
        save_path: Output image path.
    """
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    data = results_df[np.isclose(results_df['distribution'], distribution, atol=0.01)]

    for ax, (lang_key, lang_title) in zip(axes, (('arabic', 'Arabic'), ('english', 'English'))):
        _draw_sample_size_panel(ax, data[data['language'] == lang_key], lang_title, metric, distribution)

    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f"Plot saved to {save_path}")
    plt.show()

def create_summary_table_with_ci(results_df, distribution=0.5, sample_size=500):
    """Print mean and confidence interval of each metric for one setting.

    Selects the rows at ``distribution`` (matched within 0.01) and
    ``sample_size``, orders them by language and then by descending F1 mean,
    and prints one fixed-width line per model.
    """
    at_distribution = np.isclose(results_df['distribution'], distribution, atol=0.01)
    at_size = results_df['sample_size'] == sample_size
    table = results_df[at_distribution & at_size].copy()
    table = table.sort_values(['language', 'f1_mean'], ascending=[True, False])

    rule = '=' * 100
    print(f"\n{rule}")
    print(f"Summary at {distribution:.0%} Class 1 Distribution (Sample Size: {sample_size})")
    print(f"{rule}")
    print(f"{'Model':<25} {'Lang':<7} {'Accuracy':<20} {'Precision':<20} {'Recall':<20} {'F1':<20}")
    print(f"{'-'*100}")

    for _, entry in table.iterrows():
        prefix = f"{entry['model']:<25} {entry['language']:<7} "
        cells = []
        for metric in ('accuracy', 'precision', 'recall', 'f1'):
            mean = entry[f'{metric}_mean']
            lower = entry[f'{metric}_ci_lower']
            upper = entry[f'{metric}_ci_upper']
            cells.append(f"{mean:.3f} [{lower:.3f}-{upper:.3f}]")
        print(prefix + '  '.join(cells))

def analyze_sample_size_stability(results_df):
    """Print how much each model's F1 moves across sample sizes at a 50% mix.

    For every language and model, takes the rows whose distribution is 0.5
    (matched within 0.01) and, when more than one such row exists, prints the
    range of the F1 means and the average confidence-interval width.
    """
    rule = '=' * 80
    print(f"\n{rule}")
    print("Sample Size Stability Analysis")
    print(f"{rule}")
    print("Shows variability in F1 score across sample sizes")
    print(f"{'-'*80}")

    balanced = results_df[np.isclose(results_df['distribution'], 0.5, atol=0.01)]
    model_names = results_df['model'].unique()

    for language in ('arabic', 'english'):
        print(f"\n{language.upper()}:")
        in_language = balanced[balanced['language'] == language]

        for model in model_names:
            rows = in_language[in_language['model'] == model]
            # A range needs at least two sample sizes to be meaningful.
            if len(rows) <= 1:
                continue

            f1_means = rows['f1_mean']
            f1_range = f1_means.max() - f1_means.min()
            avg_ci_width = (rows['f1_ci_upper'] - rows['f1_ci_lower']).mean()

            print(f"  {model:<25} F1 Range: {f1_range:.4f}, Avg CI Width: {avg_ci_width:.4f}")

In [None]:
# Driver: verify the input files, run the full evaluation, then write the
# result tables and figures into the current working directory.
if __name__ == "__main__":
    print("="*80)
    print(f"Current working directory: {os.getcwd()}")
    print("="*80)

    print("\nStarting comprehensive evaluation...")
    print(f"Target distributions: {len(TARGET_DISTRIBUTIONS)} from {TARGET_DISTRIBUTIONS[0]:.0%} to {TARGET_DISTRIBUTIONS[-1]:.0%}")
    print(f"Sample sizes: {SAMPLE_SIZES}")
    print(f"Iterations per distribution: {N_ITERATIONS}")
    print(f"Confidence level: {CONFIDENCE_LEVEL*100:.0f}%\n")

    # Check every configured CSV up front so a missing file is reported
    # before any long-running work starts.
    print("Verifying CSV files...")
    base_dir = os.getcwd()
    all_files_exist = True

    for model_name, config in MODEL_CONFIGS.items():
        for language in ['arabic', 'english']:
            filepath = os.path.join(base_dir, config[language])
            if not os.path.exists(filepath):
                print(f"  ✗ NOT FOUND: {filepath}")
                all_files_exist = False
            else:
                print(f"  ✓ Found: {config[language]}")

    if not all_files_exist:
        print("\n" + "="*80)
        print("ERROR: Some CSV files are missing.")
        print("="*80)
        # Raise SystemExit directly: the ``exit`` helper is injected by
        # site/IPython and is not guaranteed to exist or to accept a status.
        raise SystemExit(1)

    print("\n" + "="*80)
    print("All files found! Starting evaluation...")
    print("="*80 + "\n")

    results, pred_dist_df = run_full_evaluation()

    output_path = os.path.join(base_dir, 'distribution_analysis_results_full.csv')
    results.to_csv(output_path, index=False)
    print("\n" + "="*80)
    print(f"Results saved to '{output_path}'")
    print("="*80)

    print("\nGenerating visualizations...")

    plot_prediction_distributions(pred_dist_df,
                                  save_path=os.path.join(base_dir, 'prediction_distributions.png'))

    print_prediction_distribution_summary(pred_dist_df)

    for metric in ['accuracy', 'precision', 'recall', 'f1']:
        save_path = os.path.join(base_dir, f'{metric}_by_distribution_all_sizes.png')
        plot_results_by_sample_size(results, metric=metric, save_path=save_path)

    # The comparison figure already contains every sample size, so it is
    # rendered once; looping over SAMPLE_SIZES only produced identical copies.
    plot_sample_size_comparison(results, metric='f1', distribution=0.5,
                                save_path=os.path.join(base_dir, 'sample_size_comparison.png'))

    create_summary_table_with_ci(results, distribution=0.5, sample_size=500)

    analyze_sample_size_stability(results)

    print("\n" + "="*80)
    print("EVALUATION COMPLETE!")
    print("="*80)
    print("\nGenerated files:")
    print("  - distribution_analysis_results_full.csv")
    print("  - model_prediction_distributions.csv")
    print("  - prediction_distributions.png")
    print("  - accuracy_by_distribution_all_sizes.png")
    print("  - precision_by_distribution_all_sizes.png")
    print("  - recall_by_distribution_all_sizes.png")
    print("  - f1_by_distribution_all_sizes.png")
    print("  - sample_size_comparison.png")
    print("  - sampled_indices_[timestamp].json")
    print("\n" + "="*80)