# Evaluating the XLM-R base model and fine-tuned models


In [None]:
import torch
# Sanity-check the GPU before loading any model.  The rest of the notebook
# falls back to CPU, so this cell must not crash on a machine without CUDA
# (torch.cuda.get_device_name(0) raises when no CUDA device is present).
cuda_available = torch.cuda.is_available()
print(cuda_available)  # True when PyTorch can see a CUDA device
if cuda_available:
    print(torch.cuda.get_device_name(0))  # e.g. "NVIDIA RTX 2000 Ada Generation Laptop GPU"
else:
    print("No CUDA device detected; evaluation will run on CPU.")

True
NVIDIA RTX 2000 Ada Generation Laptop GPU


In [3]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from datetime import datetime
import json

In [None]:
# Configuration

# Display name -> Hugging Face Hub checkpoint id of each fine-tuned classifier
# evaluated in the main loop below.
# NOTE(review): "XLM-R-Multilingual" points at the *same* checkpoint as
# "XLM-R-English" (xlmr-english-downsampled).  This looks like a copy-paste
# slip; as written, the "Multilingual" row of the results table is just a
# second evaluation of the English model.  Confirm the intended checkpoint id.
MODELS = {
    "XLM-R-Multilingual": "anonym-author/xlmr-english-downsampled",
    "XLM-R-Arabic": "anonym-author/xlmr-arabic-downsampled",
    "XLM-R-English": "anonym-author/xlmr-english-downsampled"
}

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# Memory optimization settings
# Number of texts per forward pass in predict_batch: 32 on GPU, 4 on CPU.
BATCH_SIZE = 32 if torch.cuda.is_available() else 4  # Larger batches for RTX 2000
print(f"Batch size: {BATCH_SIZE}")

Using device: cuda
Batch size: 32


In [None]:
# Locate the dataset relative to the notebook's working directory
# (<cwd>/../Data/Memo_Dataset.csv).
path = os.getcwd()
parent = os.path.join(path, os.pardir)
memo_dataset_path = os.path.join(parent, "Data/Memo_Dataset.csv")
df = pd.read_csv(memo_dataset_path)

# Keep only the two text columns used for evaluation and the label column.
df = df[['Question', 'Question_eng', 'Trigger']]

print("Dataset unbalanced shape:", df.shape)
print(df['Trigger'].value_counts())

# Balance dataset: downsample every class to the size of the rarest one.
# Each class is sampled with its own random_state=42, exactly as the previous
# groupby(...).apply(lambda x: x.sample(...)) did, so the selected rows (and
# therefore the train/test split below) are unchanged.  Iterating the groups
# avoids GroupBy.apply's deprecated handling of the grouping column, which
# newer pandas versions exclude from the result (dropping 'Trigger').
min_count = df['Trigger'].value_counts().min()
df_balanced = pd.concat(
    [group.sample(min_count, random_state=42) for _, group in df.groupby('Trigger')]
).reset_index(drop=True)

print("\nBalanced dataset:", len(df_balanced))
print(df_balanced['Trigger'].value_counts())

# Split data: stratified 80/20 split with a fixed seed so every section of the
# notebook evaluates on the same held-out rows.
train_df, test_df = train_test_split(
    df_balanced, test_size=0.2, random_state=42, stratify=df_balanced['Trigger']
)

print(f'\nTraining samples: {len(train_df)}, Test samples: {len(test_df)}')

In [7]:
def predict_batch(model, tokenizer, texts, batch_size=None):
    """Classify ``texts`` with ``model`` in mini-batches.

    Args:
        model: Sequence-classification model. It is called as
            ``model(**inputs)`` and must return an object with a ``logits``
            attribute.
        tokenizer: Callable that converts a list of strings into model inputs
            and returns an object supporting ``.to(device)``.
        texts: Iterable of input strings.
        batch_size: Number of texts per forward pass. Defaults to the
            module-level ``BATCH_SIZE``.

    Returns:
        Tuple ``(predictions, probs)`` of NumPy arrays: the argmax class index
        for every text and the softmax probabilities for every text.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    # Materialise as a list so slicing yields plain lists of strings even if a
    # pandas Series (or any other iterable) is passed in.
    texts = list(texts)
    total = len(texts)

    # Send the inputs to wherever the model actually lives instead of the
    # global DEVICE, which a later cell rebinds to an integer pipeline index.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    all_predictions = []
    all_probs = []

    with torch.no_grad():
        for batch_number, start in enumerate(range(0, total, batch_size), start=1):
            batch_texts = texts[start:start + batch_size]

            # Tokenize (pad to the longest text in the batch, cap at 512 tokens)
            inputs = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt"
            ).to(device)

            # Get predictions
            outputs = model(**inputs)
            logits = outputs.logits
            probs = torch.softmax(logits, dim=-1)
            predictions = torch.argmax(logits, dim=-1)

            all_predictions.extend(predictions.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())

            # Report every 5th batch and once at the end.  The previous
            # "(i + batch_size) % 80 == 0" check only fired when the running
            # count happened to be a multiple of 80, i.e. rarely or never for
            # most batch sizes.
            processed = min(start + batch_size, total)
            if batch_number % 5 == 0 or processed == total:
                print(f"  Processed {processed}/{total} samples...")

    return np.array(all_predictions), np.array(all_probs)

def calculate_metrics(y_true, y_pred, probs):
    """Compute classification metrics for binary 0/1 labels.

    Args:
        y_true: Ground-truth labels (0 or 1), one per sample.
        y_pred: Predicted labels (0 or 1), one per sample.
        probs: Per-sample class probabilities, one row per sample.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``avg_confidence_correct``, ``avg_confidence_incorrect`` and
        ``confusion_matrix`` (nested list, rows = actual, columns = predicted).
    """
    # Accept lists as well as arrays; element-wise comparison and boolean
    # masking below need ndarray semantics.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    probs = np.asarray(probs)

    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1': f1_score(y_true, y_pred, zero_division=0)
    }

    # Confusion matrix.  Pin the label order so the result is always 2x2:
    # without ``labels`` a run where only one class occurs in y_true and
    # y_pred yields a 1x1 matrix and breaks the cm[1][...] lookups done when
    # printing the results.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    # Confidence (max probability), averaged separately over the correctly
    # and incorrectly classified samples; 0.0 when a group is empty.
    confidences = np.max(probs, axis=1)
    correct_mask = (y_true == y_pred)

    metrics['avg_confidence_correct'] = (
        float(confidences[correct_mask].mean()) if correct_mask.any() else 0.0
    )
    metrics['avg_confidence_incorrect'] = (
        float(confidences[~correct_mask].mean()) if (~correct_mask).any() else 0.0
    )
    metrics['confusion_matrix'] = cm.tolist()

    return metrics

def save_predictions(df, predictions, probs, model_name, language, output_dir):
    """Write per-sample predictions to a timestamped CSV and return them.

    A copy of ``df`` is extended with the columns ``prediction``, ``prob_0``,
    ``prob_1``, ``confidence`` (row-wise max of ``probs``) and ``correct``
    (whether ``prediction`` equals the ``Trigger`` column).  The file is named
    ``<model_name with '/' replaced by '_'>_<language>_<timestamp>.csv`` and
    written into ``output_dir``.  ``df`` itself is not modified.
    """
    annotated = df.copy().assign(
        prediction=predictions,
        prob_0=probs[:, 0],
        prob_1=probs[:, 1],
        confidence=np.max(probs, axis=1),
    )
    annotated['correct'] = annotated['Trigger'].eq(annotated['prediction'])

    # Timestamp keeps successive runs from overwriting each other.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    safe_model_name = model_name.replace('/', '_')
    filepath = os.path.join(output_dir, f"{safe_model_name}_{language}_{stamp}.csv")

    annotated.to_csv(filepath, index=False)
    print(f"  Saved predictions to: {filepath}")

    return annotated

def print_evaluation_results(model_name, language, metrics):
    """Pretty-print the metrics dict for one (model, language) pair.

    Prints a banner, the four headline scores with four decimals, the 2x2
    confusion matrix (rows = actual, columns = predicted) and the average
    confidence of correct and incorrect predictions.
    """
    rule = "="*70
    print("\n" + rule)
    print(f"Model: {model_name} | Language: {language}")
    print(rule)

    # Prefixes are padded so the values line up in one column.
    score_rows = (
        ("Accuracy:  ", 'accuracy'),
        ("Precision: ", 'precision'),
        ("Recall:    ", 'recall'),
        ("F1 Score:  ", 'f1'),
    )
    for prefix, key in score_rows:
        print(f"{prefix}{metrics[key]:.4f}")

    print("\nConfusion Matrix:")
    matrix = metrics['confusion_matrix']
    print(f"                Predicted 0  Predicted 1")
    for actual in (0, 1):
        row = matrix[actual]
        print(f"Actual {actual}        {row[0]:>11}  {row[1]:>11}")

    print(f"\nAvg Confidence (Correct):   {metrics['avg_confidence_correct']:.4f}")
    print(f"Avg Confidence (Incorrect): {metrics['avg_confidence_incorrect']:.4f}")


In [None]:
# Main evaluation loop: score every fine-tuned checkpoint in MODELS on both
# language versions of the held-out test set.

# Output directory for the per-model prediction CSVs and the summary tables.
output_dir = os.path.join(path, "xlmr_roberta_evaluation_results")
os.makedirs(output_dir, exist_ok=True)

# (language shown in reports, test_df column holding the text in that language)
EVAL_LANGUAGES = [("Arabic", "Question"), ("English", "Question_eng")]

# One dict per (model, language) pair; becomes the summary table.
all_results = []
# Models whose evaluation raised; reported after the loop so a failure is not
# buried in the middle of the log.
failed_models = []

for model_name, model_path in MODELS.items():
    print("\n" + "="*70)
    print(f"Loading model: {model_name}")
    print("="*70)

    # Bind the names up front so the cleanup in ``finally`` is always valid,
    # even when loading itself fails.
    model = tokenizer = None
    try:
        # Load model and tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSequenceClassification.from_pretrained(model_path)
        model.to(DEVICE)

        for language, text_column in EVAL_LANGUAGES:
            print(f"\nEvaluating on {language} text...")
            texts = test_df[text_column].tolist()
            preds, probs = predict_batch(model, tokenizer, texts)
            metrics = calculate_metrics(test_df['Trigger'].values, preds, probs)

            # Save predictions
            save_predictions(test_df, preds, probs, model_name, language, output_dir)

            # Print results
            print_evaluation_results(model_name, language, metrics)

            # Store for summary table
            all_results.append({
                'Model': model_name,
                'Language': language,
                'Accuracy': metrics['accuracy'],
                'Precision': metrics['precision'],
                'Recall': metrics['recall'],
                'F1': metrics['f1']
            })

    except Exception as e:
        # Best effort: one broken checkpoint must not abort the other models.
        print(f"Error evaluating {model_name}: {str(e)}")
        failed_models.append(model_name)

    finally:
        # Clean up memory on success *and* failure so a failed model does not
        # keep GPU memory allocated while the next one loads.
        del model
        del tokenizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

if failed_models:
    print(f"\nWARNING: evaluation failed for: {', '.join(failed_models)}")

print("\n" + "="*70)
print("CREATING SUMMARY TABLE")
print("="*70)

# Fix the column set explicitly: if every model failed, all_results is empty
# and an implicit DataFrame would have no columns at all, which makes the CSV
# header-less and breaks the pivot in the LaTeX cell.
results_df = pd.DataFrame(
    all_results,
    columns=['Model', 'Language', 'Accuracy', 'Precision', 'Recall', 'F1']
)

summary_csv_path = os.path.join(output_dir, "xlmr_roberta_evaluation_summary.csv")
results_df.to_csv(summary_csv_path, index=False)
print(f"\nSummary table saved to: {summary_csv_path}")

# Display table
print("\n" + "="*70)
print("EVALUATION SUMMARY")
print("="*70)
print(results_df.to_string(index=False))


# Generate LaTeX table

In [None]:
# Export the summary as a LaTeX table: one row per model, one column per
# (metric, language) pair.
if results_df.empty:
    # Happens when every model failed in the evaluation loop; pivoting an
    # empty, column-less frame would raise a KeyError.
    print("\nNo evaluation results available; skipping LaTeX export.")
else:
    pivot_df = results_df.pivot(index='Model', columns='Language', values=['Accuracy', 'Precision', 'Recall', 'F1'])

    # Create LaTeX string
    latex_str = pivot_df.to_latex(
        float_format="%.4f",
        caption="Model Evaluation Results on Arabic Trigger Classification",
        label="tab:model_comparison"
    )

    # Save LaTeX (explicit encoding so the file does not depend on the
    # platform's default locale)
    latex_path = os.path.join(output_dir, "evaluation_summary.tex")
    with open(latex_path, 'w', encoding='utf-8') as f:
        f.write(latex_str)
    print(f"\nLaTeX table saved to: {latex_path}")
    print("\n" + "="*70)
    print("LATEX TABLE PREVIEW")
    print("="*70)
    print(latex_str)

print("\n" + "="*70)
print("EVALUATION COMPLETE!")
print("="*70)
print(f"All results saved to: {output_dir}")

# Zero-Shot Evals for the Baseline XLM-RoBERTa Model Fine-Tuned on NLI Tasks


In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)
from transformers import pipeline
import torch
from datetime import datetime


# Keep DEVICE a torch.device, as in the fine-tuned evaluation section.
# Rebinding it to an integer index (0 / -1) changed its type for every cell
# that is re-run afterwards (e.g. ``inputs.to(DEVICE)`` in predict_batch);
# the ``pipeline(device=...)`` argument accepts a torch.device as well.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {'CUDA' if DEVICE.type == 'cuda' else 'CPU'}")

# Multiple label formulations to test.  In every pair the FIRST label is the
# one mapped to class 1 by predict_zero_shot.
LABEL_FORMULATIONS = [
    ["concerning", "normal"],
    ["urgent", "non-urgent"],
    ["trigger", "non-trigger"],
    ["distressed", "stable"],
    ["needs support", "doing well"]
]

print("\nTesting label formulations:")
for labels in LABEL_FORMULATIONS:
    print(f"  - {labels}")


# Locate the dataset relative to the notebook's working directory.
path = os.getcwd()
parent = os.path.join(path, os.pardir)
memo_dataset_path = os.path.join(parent, "Data/Memo_Dataset.csv")
df = pd.read_csv(memo_dataset_path)

df = df[['Question', 'Question_eng', 'Trigger']]

# Balance dataset: downsample every class to the size of the rarest one.
# Each class is sampled with its own random_state=42, exactly as the previous
# groupby(...).apply(lambda x: x.sample(...)) did, so the selected rows and
# the split below are unchanged.  Iterating the groups avoids GroupBy.apply's
# deprecated handling of the grouping column, which newer pandas versions
# exclude from the result (dropping 'Trigger').
min_count = df['Trigger'].value_counts().min()
df_balanced = pd.concat(
    [group.sample(min_count, random_state=42) for _, group in df.groupby('Trigger')]
).reset_index(drop=True)

# Split data (SAME SPLIT AS FINE-TUNED MODELS)
train_df, test_df = train_test_split(
    df_balanced, test_size=0.2, random_state=42, stratify=df_balanced['Trigger']
)

print(f'\nTest samples: {len(test_df)}')


print("\n" + "="*70)
print("Loading XLM-RoBERTa NLI model for zero-shot classification")
print("="*70)

# Use a model fine-tuned on multilingual NLI for better zero-shot performance
# This model is specifically trained for zero-shot classification
classifier = pipeline(
    "zero-shot-classification",
    model="joeddav/xlm-roberta-large-xnli",  # Fine-tuned on XNLI (multilingual NLI)
    device=DEVICE
)

print("Using joeddav/xlm-roberta-large-xnli (fine-tuned on XNLI)")
print("This model is optimized for zero-shot classification tasks")


def predict_zero_shot(texts, candidate_labels, clf=None):
    """Make zero-shot predictions on texts.

    Args:
        texts: Sequence of input strings, classified one at a time.
        candidate_labels: Labels offered to the classifier.  The FIRST label is
            treated as the positive ("trigger") class, i.e. class 1; every
            other label counts as class 0.
        clf: Callable ``clf(text, candidate_labels)`` returning a dict with
            parallel ``'labels'`` and ``'scores'`` lists whose first entry is
            the top prediction.  Defaults to the module-level ``classifier``
            so existing calls are unaffected; passing it explicitly avoids
            depending on whichever pipeline was loaded last.

    Returns:
        Tuple ``(predictions, probs)``: an int array of 0/1 predictions and an
        array of ``[prob_0, prob_1]`` rows.
    """
    if clf is None:
        clf = classifier

    predictions = []
    probs_list = []

    # Labels are compared case-insensitively.
    trigger_label = candidate_labels[0].lower()

    for idx, text in enumerate(texts):
        result = clf(text, candidate_labels)

        # Class 1 iff the top-ranked label is the trigger label.
        prediction = 1 if result['labels'][0].lower() == trigger_label else 0

        # Look each score up by its label instead of by rank.  For two labels
        # this gives exactly the previous [prob_0, prob_1]; with more labels
        # prob_0 is the total score of all non-trigger labels.
        prob_0 = 0.0
        prob_1 = 0.0
        for label, score in zip(result['labels'], result['scores']):
            if label.lower() == trigger_label:
                prob_1 += score
            else:
                prob_0 += score

        predictions.append(prediction)
        probs_list.append([prob_0, prob_1])

        if (idx + 1) % 25 == 0:
            print(f"  Processed {idx + 1}/{len(texts)} samples...")

    return np.array(predictions), np.array(probs_list)

def calculate_metrics(y_true, y_pred, probs):
    """Compute classification metrics for binary 0/1 labels.

    Args:
        y_true: Ground-truth labels (0 or 1), one per sample.
        y_pred: Predicted labels (0 or 1), one per sample.
        probs: Per-sample class probabilities, one row per sample.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``avg_confidence_correct``, ``avg_confidence_incorrect`` and
        ``confusion_matrix`` (nested list, rows = actual, columns = predicted).
    """
    # Accept lists as well as arrays; element-wise comparison and boolean
    # masking below need ndarray semantics.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    probs = np.asarray(probs)

    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1': f1_score(y_true, y_pred, zero_division=0)
    }

    # Confusion matrix.  Pin the label order so the result is always 2x2:
    # a zero-shot run that predicts a single class for every sample would
    # otherwise be able to yield a smaller matrix and break the cm[1][...]
    # lookups done when printing the results.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    # Confidence (max probability), averaged separately over the correctly
    # and incorrectly classified samples; 0.0 when a group is empty.
    confidences = np.max(probs, axis=1)
    correct_mask = (y_true == y_pred)

    metrics['avg_confidence_correct'] = (
        float(confidences[correct_mask].mean()) if correct_mask.any() else 0.0
    )
    metrics['avg_confidence_incorrect'] = (
        float(confidences[~correct_mask].mean()) if (~correct_mask).any() else 0.0
    )
    metrics['confusion_matrix'] = cm.tolist()

    return metrics

def save_predictions(df, predictions, probs, label_formulation, language, output_dir):
    """Write per-sample zero-shot predictions to a timestamped CSV.

    A copy of ``df`` is extended with the columns ``prediction``, ``prob_0``,
    ``prob_1``, ``confidence`` (row-wise max of ``probs``) and ``correct``
    (whether ``prediction`` equals the ``Trigger`` column).  The file name is
    built from the labels joined with '_' (spaces turned into '-'), the
    language and a timestamp.  Returns the extended copy; ``df`` is untouched.
    """
    annotated = df.copy().assign(
        prediction=predictions,
        prob_0=probs[:, 0],
        prob_1=probs[:, 1],
        confidence=np.max(probs, axis=1),
    )
    annotated['correct'] = annotated['Trigger'].eq(annotated['prediction'])

    # Timestamp keeps successive runs from overwriting each other.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    label_str = "_".join(label_formulation).replace(" ", "-")
    filename = f"ZeroShot_{label_str}_{language}_{stamp}.csv"

    annotated.to_csv(os.path.join(output_dir, filename), index=False)
    print(f"  Saved predictions to: {filename}")

    return annotated

def print_evaluation_results(label_formulation, language, metrics):
    """Pretty-print the metrics dict for one (label formulation, language) pair.

    Prints a banner, the four headline scores with four decimals, the 2x2
    confusion matrix (rows = actual, columns = predicted) and the average
    confidence of correct and incorrect predictions.
    """
    rule = "="*70
    print("\n" + rule)
    print(f"Labels: {label_formulation} | Language: {language}")
    print(rule)

    # Prefixes are padded so the values line up in one column.
    score_rows = (
        ("Accuracy:  ", 'accuracy'),
        ("Precision: ", 'precision'),
        ("Recall:    ", 'recall'),
        ("F1 Score:  ", 'f1'),
    )
    for prefix, key in score_rows:
        print(f"{prefix}{metrics[key]:.4f}")

    print("\nConfusion Matrix:")
    matrix = metrics['confusion_matrix']
    print(f"                Predicted 0  Predicted 1")
    for actual in (0, 1):
        row = matrix[actual]
        print(f"Actual {actual}        {row[0]:>11}  {row[1]:>11}")

    print(f"\nAvg Confidence (Correct):   {metrics['avg_confidence_correct']:.4f}")
    print(f"Avg Confidence (Incorrect): {metrics['avg_confidence_incorrect']:.4f}")


# Create output directory
output_dir = os.path.join(path, "zeroshot_baseline_results")
os.makedirs(output_dir, exist_ok=True)

# (language shown in reports, test_df column holding the text in that language)
EVAL_LANGUAGES = [("Arabic", "Question"), ("English", "Question_eng")]

# Store all results: one dict per (label formulation, language) pair.
all_results = []

for labels in LABEL_FORMULATIONS:
    print("\n" + "="*70)
    print(f"Testing label formulation: {labels}")
    print("="*70)

    label_str = str(labels)

    # The Arabic and English passes are identical apart from the text column,
    # so they share one loop body instead of two copy-pasted sections.
    for language, text_column in EVAL_LANGUAGES:
        print(f"\nEvaluating on {language} text...")

        texts = test_df[text_column].tolist()
        preds, probs = predict_zero_shot(texts, labels)
        metrics = calculate_metrics(test_df['Trigger'].values, preds, probs)

        # Save predictions
        save_predictions(test_df, preds, probs, labels, language, output_dir)

        # Print results
        print_evaluation_results(labels, language, metrics)

        # Store for summary table
        all_results.append({
            'Label_Formulation': label_str,
            'Language': language,
            'Accuracy': metrics['accuracy'],
            'Precision': metrics['precision'],
            'Recall': metrics['recall'],
            'F1': metrics['f1']
        })


print("\n" + "="*70)
print("CREATING ZERO-SHOT COMPARISON TABLE")
print("="*70)

# Explicit columns keep the schema stable even when all_results is empty.
results_df = pd.DataFrame(
    all_results,
    columns=['Label_Formulation', 'Language', 'Accuracy', 'Precision', 'Recall', 'F1']
)

# Save as CSV
summary_csv_path = os.path.join(output_dir, "zeroshot_comparison_summary.csv")
results_df.to_csv(summary_csv_path, index=False)
print(f"\nSummary saved to: {summary_csv_path}")

# Display table
print("\n" + "="*70)
print("ZERO-SHOT LABEL FORMULATION COMPARISON")
print("="*70)
print(results_df.to_string(index=False))


print("\n" + "="*70)
print("BEST PERFORMING FORMULATIONS")
print("="*70)

# Best formulation (highest F1) per language; one loop instead of two copies.
best_by_language = {}
for language in ('Arabic', 'English'):
    language_rows = results_df[results_df['Language'] == language]
    if language_rows.empty:
        # ``.values[0]`` below would raise IndexError on an empty selection.
        print(f"\nBest for {language}: no results")
        continue
    best = language_rows.nlargest(1, 'F1')
    best_by_language[language] = best
    print(f"\nBest for {language}:")
    print(f"  Labels: {best['Label_Formulation'].values[0]}")
    print(f"  F1 Score: {best['F1'].values[0]:.4f}")


# LaTeX format: one row per label formulation, one column per
# (metric, language) pair.
pivot_df = results_df.pivot(
    index='Label_Formulation', 
    columns='Language', 
    values=['Accuracy', 'Precision', 'Recall', 'F1']
)

# Create LaTeX string
latex_str = pivot_df.to_latex(
    float_format="%.4f",
    caption="Zero-Shot Performance Across Different Label Formulations",
    label="tab:zeroshot_comparison"
)

# Save LaTeX (explicit encoding so the file does not depend on the platform's
# default locale)
latex_path = os.path.join(output_dir, "zeroshot_comparison.tex")
with open(latex_path, 'w', encoding='utf-8') as f:
    f.write(latex_str)

print(f"\nLaTeX table saved to: {latex_path}")
print("\n" + "="*70)
print("LATEX TABLE PREVIEW")
print("="*70)
print(latex_str)

print("\n" + "="*70)
print("ZERO-SHOT EVALUATION COMPLETE!")
print("="*70)
print(f"All results saved to: {output_dir}")

# Zero-Shot Evals for the Baseline XLM-RoBERTa Base Model

In [10]:
from transformers import pipeline

In [11]:
# Candidate label pairs to compare.  Each pair is written as
# (positive label, negative label) and stored as a two-element list.
LABEL_FORMULATIONS = [
    [positive, negative]
    for positive, negative in (
        ("concerning", "normal"),
        ("urgent", "non-urgent"),
        ("trigger", "non-trigger"),
        ("distressed", "stable"),
        ("needs support", "doing well"),
    )
]

print("\nTesting label formulations:")
for labels in LABEL_FORMULATIONS:
    print("  - " + str(labels))


Testing label formulations:
  - ['concerning', 'normal']
  - ['urgent', 'non-urgent']
  - ['trigger', 'non-trigger']
  - ['distressed', 'stable']
  - ['needs support', 'doing well']


In [None]:
# Locate the dataset relative to the notebook's working directory.
path = os.getcwd()
parent = os.path.join(path, os.pardir)
memo_dataset_path = os.path.join(parent, "Data/Memo_Dataset.csv")
df = pd.read_csv(memo_dataset_path)

df = df[['Question', 'Question_eng', 'Trigger']]

# Balance dataset: downsample every class to the size of the rarest one.
# Each class is sampled with its own random_state=42, exactly as the previous
# groupby(...).apply(lambda x: x.sample(...)) did, so the selected rows and
# the split below are unchanged.  Iterating the groups avoids GroupBy.apply's
# deprecated handling of the grouping column, which newer pandas versions
# exclude from the result (dropping 'Trigger').
min_count = df['Trigger'].value_counts().min()
df_balanced = pd.concat(
    [group.sample(min_count, random_state=42) for _, group in df.groupby('Trigger')]
).reset_index(drop=True)

# Split data (SAME SPLIT AS FINE-TUNED MODELS)
train_df, test_df = train_test_split(
    df_balanced, test_size=0.2, random_state=42, stratify=df_balanced['Trigger']
)

print(f'\nTest samples: {len(test_df)}')

In [15]:
print("\n" + "="*70)
print("Loading XLM-RoBERTa base model for zero-shot baseline classification")
print("="*70)

# CAVEAT: xlm-roberta-base has no trained classification head.  The load log
# reports that the classifier weights are "newly initialized" and that no
# 'entailment' label id could be found, so the scores from this pipeline come
# from an untrained head and should be read as a chance-level reference, not
# as a meaningful zero-shot result.
# Seed the RNG first so that the randomly initialised head (and therefore the
# reported numbers) should be the same on every run.
torch.manual_seed(42)

classifier = pipeline(
    "zero-shot-classification",
    model="xlm-roberta-base",
    device=DEVICE
)


Loading XLM-RoBERTa base model for zero-shot baseline classification


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


In [16]:
def predict_zero_shot(texts, candidate_labels, clf=None):
    """Make zero-shot predictions on texts.

    Args:
        texts: Sequence of input strings, classified one at a time.
        candidate_labels: Labels offered to the classifier.  The FIRST label is
            treated as the positive ("trigger") class, i.e. class 1; every
            other label counts as class 0.
        clf: Callable ``clf(text, candidate_labels)`` returning a dict with
            parallel ``'labels'`` and ``'scores'`` lists whose first entry is
            the top prediction.  Defaults to the module-level ``classifier``
            so existing calls are unaffected; passing it explicitly avoids
            depending on whichever pipeline was loaded last.

    Returns:
        Tuple ``(predictions, probs)``: an int array of 0/1 predictions and an
        array of ``[prob_0, prob_1]`` rows.
    """
    if clf is None:
        clf = classifier

    predictions = []
    probs_list = []

    # Labels are compared case-insensitively.
    trigger_label = candidate_labels[0].lower()

    for idx, text in enumerate(texts):
        result = clf(text, candidate_labels)

        # Class 1 iff the top-ranked label is the trigger label.
        prediction = 1 if result['labels'][0].lower() == trigger_label else 0

        # Look each score up by its label instead of by rank.  For two labels
        # this gives exactly the previous [prob_0, prob_1]; with more labels
        # prob_0 is the total score of all non-trigger labels.
        prob_0 = 0.0
        prob_1 = 0.0
        for label, score in zip(result['labels'], result['scores']):
            if label.lower() == trigger_label:
                prob_1 += score
            else:
                prob_0 += score

        predictions.append(prediction)
        probs_list.append([prob_0, prob_1])

        if (idx + 1) % 25 == 0:
            print(f"  Processed {idx + 1}/{len(texts)} samples...")

    return np.array(predictions), np.array(probs_list)

def calculate_metrics(y_true, y_pred, probs):
    """Compute classification metrics for binary 0/1 labels.

    Args:
        y_true: Ground-truth labels (0 or 1), one per sample.
        y_pred: Predicted labels (0 or 1), one per sample.
        probs: Per-sample class probabilities, one row per sample.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``avg_confidence_correct``, ``avg_confidence_incorrect`` and
        ``confusion_matrix`` (nested list, rows = actual, columns = predicted).
    """
    # Accept lists as well as arrays; element-wise comparison and boolean
    # masking below need ndarray semantics.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    probs = np.asarray(probs)

    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1': f1_score(y_true, y_pred, zero_division=0)
    }

    # Confusion matrix.  Pin the label order so the result is always 2x2:
    # a zero-shot run that predicts a single class for every sample would
    # otherwise be able to yield a smaller matrix and break the cm[1][...]
    # lookups done when printing the results.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    # Confidence (max probability), averaged separately over the correctly
    # and incorrectly classified samples; 0.0 when a group is empty.
    confidences = np.max(probs, axis=1)
    correct_mask = (y_true == y_pred)

    metrics['avg_confidence_correct'] = (
        float(confidences[correct_mask].mean()) if correct_mask.any() else 0.0
    )
    metrics['avg_confidence_incorrect'] = (
        float(confidences[~correct_mask].mean()) if (~correct_mask).any() else 0.0
    )
    metrics['confusion_matrix'] = cm.tolist()

    return metrics

def save_predictions(df, predictions, probs, label_formulation, language, output_dir):
    """Write per-sample base-model zero-shot predictions to a timestamped CSV.

    A copy of ``df`` is extended with the columns ``prediction``, ``prob_0``,
    ``prob_1``, ``confidence`` (row-wise max of ``probs``) and ``correct``
    (whether ``prediction`` equals the ``Trigger`` column).  The file name is
    built from the labels joined with '_' (spaces turned into '-'), the
    language and a timestamp.  Returns the extended copy; ``df`` is untouched.
    """
    annotated = df.copy().assign(
        prediction=predictions,
        prob_0=probs[:, 0],
        prob_1=probs[:, 1],
        confidence=np.max(probs, axis=1),
    )
    annotated['correct'] = annotated['Trigger'].eq(annotated['prediction'])

    # Timestamp keeps successive runs from overwriting each other.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    label_str = "_".join(label_formulation).replace(" ", "-")
    filename = f"ZeroShot_base_{label_str}_{language}_{stamp}.csv"

    annotated.to_csv(os.path.join(output_dir, filename), index=False)
    print(f"  Saved predictions to: {filename}")

    return annotated

def print_evaluation_results(label_formulation, language, metrics):
    """Pretty-print the metrics dict for one (label formulation, language) pair.

    Prints a banner, the four headline scores with four decimals, the 2x2
    confusion matrix (rows = actual, columns = predicted) and the average
    confidence of correct and incorrect predictions.
    """
    rule = "="*70
    print("\n" + rule)
    print(f"Labels: {label_formulation} | Language: {language}")
    print(rule)

    # Prefixes are padded so the values line up in one column.
    score_rows = (
        ("Accuracy:  ", 'accuracy'),
        ("Precision: ", 'precision'),
        ("Recall:    ", 'recall'),
        ("F1 Score:  ", 'f1'),
    )
    for prefix, key in score_rows:
        print(f"{prefix}{metrics[key]:.4f}")

    print("\nConfusion Matrix:")
    matrix = metrics['confusion_matrix']
    print(f"                Predicted 0  Predicted 1")
    for actual in (0, 1):
        row = matrix[actual]
        print(f"Actual {actual}        {row[0]:>11}  {row[1]:>11}")

    print(f"\nAvg Confidence (Correct):   {metrics['avg_confidence_correct']:.4f}")
    print(f"Avg Confidence (Incorrect): {metrics['avg_confidence_incorrect']:.4f}")


In [None]:
# Create output directory
output_dir = os.path.join(path, "zeroshot_base_xlmr_roberta_results")
os.makedirs(output_dir, exist_ok=True)

# (language shown in reports, test_df column holding the text in that language)
EVAL_LANGUAGES = [("Arabic", "Question"), ("English", "Question_eng")]

# Store all results: one dict per (label formulation, language) pair.
all_results = []

for labels in LABEL_FORMULATIONS:
    print("\n" + "="*70)
    print(f"Testing label formulation: {labels}")
    print("="*70)

    label_str = str(labels)

    # The Arabic and English passes are identical apart from the text column,
    # so they share one loop body instead of two copy-pasted sections.
    for language, text_column in EVAL_LANGUAGES:
        print(f"\nEvaluating on {language} text...")

        texts = test_df[text_column].tolist()
        preds, probs = predict_zero_shot(texts, labels)
        metrics = calculate_metrics(test_df['Trigger'].values, preds, probs)

        # Save predictions
        save_predictions(test_df, preds, probs, labels, language, output_dir)

        # Print results
        print_evaluation_results(labels, language, metrics)

        # Store for summary table
        all_results.append({
            'Label_Formulation': label_str,
            'Language': language,
            'Accuracy': metrics['accuracy'],
            'Precision': metrics['precision'],
            'Recall': metrics['recall'],
            'F1': metrics['f1']
        })

print("\n" + "="*70)
print("CREATING ZERO-SHOT COMPARISON TABLE")
print("="*70)

# Explicit columns keep the schema stable even when all_results is empty.
results_df = pd.DataFrame(
    all_results,
    columns=['Label_Formulation', 'Language', 'Accuracy', 'Precision', 'Recall', 'F1']
)

# Save as CSV
summary_csv_path = os.path.join(output_dir, "zeroshot_base_comparison_summary.csv")
results_df.to_csv(summary_csv_path, index=False)
print(f"\nSummary saved to: {summary_csv_path}")

# Display table
print("\n" + "="*70)
print("ZERO-SHOT LABEL FORMULATION COMPARISON")
print("="*70)
print(results_df.to_string(index=False))

print("\n" + "="*70)
print("BEST PERFORMING FORMULATIONS")
print("="*70)

# Best formulation (highest F1) per language; one loop instead of two copies.
best_by_language = {}
for language in ('Arabic', 'English'):
    language_rows = results_df[results_df['Language'] == language]
    if language_rows.empty:
        # ``.values[0]`` below would raise IndexError on an empty selection.
        print(f"\nBest for {language}: no results")
        continue
    best = language_rows.nlargest(1, 'F1')
    best_by_language[language] = best
    print(f"\nBest for {language}:")
    print(f"  Labels: {best['Label_Formulation'].values[0]}")
    print(f"  F1 Score: {best['F1'].values[0]:.4f}")


In [None]:
# Export the comparison as a LaTeX table: one row per label formulation, one
# column per (metric, language) pair.
if results_df.empty:
    # Nothing was evaluated; pivoting a column-less frame would raise.
    print("\nNo zero-shot results available; skipping LaTeX export.")
else:
    # Pivot for better LaTeX format
    pivot_df = results_df.pivot(
        index='Label_Formulation', 
        columns='Language', 
        values=['Accuracy', 'Precision', 'Recall', 'F1']
    )

    # Create LaTeX string
    latex_str = pivot_df.to_latex(
        float_format="%.4f",
        caption="Zero-Shot Performance Across Different Label Formulations",
        label="tab:zeroshot_base_comparison"
    )

    # Save LaTeX (explicit encoding so the file does not depend on the
    # platform's default locale)
    latex_path = os.path.join(output_dir, "zeroshot_base_comparison.tex")
    with open(latex_path, 'w', encoding='utf-8') as f:
        f.write(latex_str)

    print(f"\nLaTeX table saved to: {latex_path}")
    print("\n" + "="*70)
    print("LATEX TABLE PREVIEW")
    print("="*70)
    print(latex_str)

print("\n" + "="*70)
print("ZERO-SHOT EVALUATION COMPLETE!")
print("="*70)
print(f"All results saved to: {output_dir}")