# Adversarial Bias Mitigation Experiments

This notebook implements adversarial debiasing, modeled after the implementation in:

[1] X. Han, T. Baldwin, and T. Cohn, “Towards Equal Opportunity Fairness through Adversarial Learning,” May 15, 2022, arXiv: arXiv:2203.06317. doi: 10.48550/arXiv.2203.06317.

[2] Xudong Han, Timothy Baldwin, and Trevor Cohn. 2021. Diverse Adversaries for Mitigating Bias in Training. In Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume, pages 2760–2765, Online. Association for Computational Linguistics.

https://github.com/INK-USC/Upstream-Bias-Mitigation and https://github.com/HanXudong/Diverse_Adversaries_for_Mitigating_Bias_in_Training

## NOTES
- Check which models are being saved in the src/models directory.
- Experiment with multiple discriminators, as in [2] (training with more than one discriminator currently raises an error)
- Add explicit equal opportunity criterion, as in [1]

In [3]:
from transformers import BertForSequenceClassification, BertTokenizer, RobertaForSequenceClassification, RobertaTokenizer

import os #,argparse,time
import numpy as np
import json
import pandas as pd
import torch

# Make the project's `src` directory the working directory so that the
# `utils` / `scripts_adv_debias` imports below and the relative `../data`,
# `../results` paths used throughout the notebook resolve.
def find_src_dir(start_dir):
    """Locate the project's ``src`` directory relative to ``start_dir``.

    The search first walks from ``start_dir`` up to the filesystem root and
    returns the first directory whose name is exactly ``src``. If none is
    found, a direct ``src`` child of ``start_dir`` is accepted.

    A plain substring test (``'src' in path``) is deliberately avoided: it
    would accept unrelated paths that merely contain those letters, and it
    would leave the kernel inside a sub-directory of ``src``.

    Args:
        start_dir: Directory to start searching from.

    Returns:
        Absolute path of the ``src`` directory, or ``None`` if none was found.
    """
    origin = os.path.abspath(start_dir)

    probe = origin
    while True:
        if os.path.basename(probe) == 'src':
            return probe
        parent = os.path.dirname(probe)
        if parent == probe:  # reached the filesystem root
            break
        probe = parent

    child = os.path.join(origin, 'src')
    return child if os.path.isdir(child) else None


current_dir = os.getcwd()
search_roots = [current_dir]
# `__file__` only exists when this code runs as a script; a notebook kernel
# does not define it, so the working directory is the only starting point there.
if '__file__' in globals():
    search_roots.insert(0, os.path.dirname(os.path.abspath(__file__)))

dir_path = next((found for found in map(find_src_dir, search_roots) if found), None)
if dir_path is None:
    print(f"Warning: Could not find 'src' directory. Using current directory: {current_dir}")
    dir_path = current_dir

os.chdir(dir_path)
print("Current Directory: ", os.getcwd())

from utils.dataloaders_BERT import BiosDataset, create_dataloaders
from utils.model_config import load_model_and_tokenizer, model_types
from utils.discriminator import Discriminator
from scripts_adv_debias import train_and_evaluate, eval_main, train_leakage_discriminator, adv_eval_epoch


from tqdm import tqdm, trange
from utils.customized_loss import DiffLoss

from torch.optim import Adam

from utils.eval_metrices import group_evaluation, leakage_evaluation

from pathlib import Path, PureWindowsPath

Current Directory:  /Users/pia4/Desktop/BFH/github_repos/bias_mitigation_BERT_multilingual/src


---
## Experiment Parameters

In [None]:
# Evaluate the pre-trained checkpoint as-is, without fine-tuning
# (baseline runs; keep `adversarial = False` for these).
pretrained = True

#### Experiment Settings ####
adversarial = False             # True enables adversarial training
run_training = True             # False: do not train, reuse an existing model
run_occupation_bias = True      # occupation-prediction bias evaluation
run_leakage_evaluation = True   # gender-prediction / leakage evaluation
run_intrinsic = True            # WEAT-style measures on the pretrained or fine-tuned model
balance_sensitive = True        # use splits balanced on the sensitive attribute within each occupation
use_last = False                # take the end-of-training model rather than the best-validation one
masked = True                   # use the masked text variants that hide sensitive information
language = "is"                 # Icelandic
num_epochs = 15
num_discriminators = 1

#### DataLoader Parameters ####
params = dict(batch_size=128, shuffle=True, num_workers=0)

#### Model specs ####
# Icelandic runs on a RoBERTa model; every other language uses BERT.
model_type = "roberta" if language == "is" else "bert"
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Per-language input and output locations, relative to the `src` directory.
data_path = f"../data/bios/{language}/"
output_dir = "../results/adv_debias/" + language + "/"


#### Adversarial Training Arguments ####
class Args:
    """Attribute bag with the hyper-parameters handed to the training/evaluation helpers.

    Several attributes copy notebook-level settings of the same name
    (``num_epochs``, ``use_last``, ``masked``): inside a class body the
    right-hand side is looked up in module scope when the name has not yet
    been bound in the class, so ``num_epochs = num_epochs`` snapshots the
    value that is current when this cell runs.
    """

    num_epochs = num_epochs
    use_fp16 = False
    cuda = "cuda"
    hidden_size = 768   # must match the encoder's hidden size
    emb_size = 2304     # must match the model's embedding size
    num_classes = 2     # number of private labels for each discriminator
    adv = adversarial
    # adv_level = -1   (disabled option, kept for reference)
    lr = 1e-5
    LAMBDA = 0.8        # balance coefficient for the discriminator loss
    n_discriminator = num_discriminators
    adv_units = 256
    # ratio = 0.8      (disabled option, kept for reference)
    DL = True           # True enables DiffLoss
    diff_LAMBDA = 10 ** 3.7  # coefficient for the differential loss

    # Label used in output file names: adversarial wins over the other two,
    # otherwise the run is either the untouched pretrained model or a
    # standard fine-tuning run.
    if adversarial:
        experiment_type = "adv_debiasing"
    elif pretrained:
        experiment_type = "pretrained"
    else:
        experiment_type = "standard"

    # protected_attributes = protected_attributes   (disabled option)
    diff_loss = DiffLoss()
    use_last = use_last
    masked = masked
    balanced = balance_sensitive
    # data_path = <your data path>   (disabled option)

args = Args()

# Directory the trained model is saved to. Fine-tuned runs get one suffix per
# active option (discriminator count, balanced data, masked text); a
# pretrained baseline keeps the bare name.
model_dir_suffix = ""
if not pretrained:
    suffix_flags = (
        (adversarial, f"_{num_discriminators}discriminators"),
        (balance_sensitive, "_balanced"),
        (masked, "_masked"),
    )
    model_dir_suffix = "".join(suffix for enabled, suffix in suffix_flags if enabled)

output_model_dir = os.path.join(output_dir, f"{model_type}_{args.experiment_type}_model") + model_dir_suffix

# Make sure the results directory exists before anything is written to it.
os.makedirs(output_dir, exist_ok=True)

Using device: cpu


---
## Load and prepare training data

In [5]:
# Load the pre-built train/test splits that match the current settings.
language_code = language.upper()

# File names follow `<split>_split[_balanced][_masked].csv`.
split_suffix = ("_balanced" if balance_sensitive else "") + ("_masked" if masked else "")
split_dir = os.path.dirname(data_path)

train_split_path = os.path.join(split_dir, f"train_split{split_suffix}.csv")
test_split_path = os.path.join(split_dir, f"test_split{split_suffix}.csv")
train_df = pd.read_csv(train_split_path)
test_df = pd.read_csv(test_split_path)

# For non-English runs, overwrite `hard_text` with the language-specific
# column (the masked variant when masking is on).
if language != "en":
    text_column = f"hard_text_masked_{language_code}" if masked else f"hard_text_{language_code}"
    for split_df in (train_df, test_df):
        split_df['hard_text'] = split_df[text_column]
        
    

In [8]:
# Preview the first rows of the training split; as the cell's last expression
# the frame is rendered by the notebook.
train_df.head()

Unnamed: 0.1,Unnamed: 0,hard_text,profession,gender,hard_text_masked,hard_text_masked_IS
0,0,Þau eru vottuð sem Advanced Jivamukti og hafa ...,yoga_teacher,Male,they are Advanced Jivamukti Certified and a me...,Þau eru vottuð sem Advanced Jivamukti og hafa ...
1,1,[NAME] er útskrifaður frá Háskólanum í Illinoi...,paralegal,Male,[NAME] is a graduate of the University of Illi...,[NAME] er útskrifaður frá Háskólanum í Illinoi...
2,2,Þau misstu föður sinn aðeins 11 ára gömul og f...,poet,Male,"Having lost their dad at just 11 years old, th...",Þau misstu föður sinn aðeins 11 ára gömul og f...
3,3,"Þeir koma með sinn sérstaka tegund af dökkum, ...",comedian,Male,"they bring their particular brand of dark, irr...","Þeir koma með sinn sérstaka tegund af dökkum, ..."
4,4,"Þeir eru sérfræðingar í hollri fæðu, mat og næ...",dietitian,Male,"they are an expert on healthy eating, food, an...","Þeir eru sérfræðingar í hollri fæðu, mat og næ..."


In [5]:
# Count rows per (profession, gender) pair in each split.
for heading, split_frame in (
    ("Train set distribution:", train_df),
    ("\nTest set distribution:", test_df),
):
    print(heading)
    print(split_frame.groupby(['profession', 'gender']).size())

Train set distribution:
profession         gender
accountant         Female    80
                   Male      80
architect          Female    80
                   Male      80
attorney           Female    80
                   Male      80
chiropractor       Female    80
                   Male      80
comedian           Female    80
                   Male      80
composer           Female    80
                   Male      80
dentist            Female    80
                   Male      80
dietitian          Female    80
                   Male      80
dj                 Female    80
                   Male      80
filmmaker          Female    80
                   Male      80
interior_designer  Female    80
                   Male      80
journalist         Female    80
                   Male      80
model              Female    80
                   Male      80
nurse              Female    80
                   Male      80
painter            Female    80
                   Mal

In [10]:
# Distinct professions in the training split, in order of first appearance.
# NOTE(review): later cells index this list by class id; that presumably
# matches the label encoding used by the dataloaders - confirm.
occupations = train_df['profession'].drop_duplicates().tolist()
occupation_count = len(occupations)
print(f"Number of occupations: {occupation_count}")

Number of occupations: 28


---
## Train model and measure bias

In [12]:
# Release cached GPU memory left over from earlier cells.
torch.cuda.empty_cache()

# Main classifier, its tokenizer and the checkpoint name for this language.
model, tokenizer, model_name = load_model_and_tokenizer(language, occupations)
model.to(device)

# Adversarial discriminators. The class count comes from `args.num_classes`
# instead of a literal 2, so a change in the configuration cell cannot
# silently leave the discriminators binary.
n_discriminator = args.n_discriminator
discriminators = [
    Discriminator(args, args.hidden_size, args.num_classes).to(device)
    for _ in range(n_discriminator)
]

# One Adam optimizer for the main model and one per discriminator; only
# parameters with `requires_grad` set are optimized.
# NOTE(review): the discriminator learning rate is the main rate scaled by
# args.LAMBDA, which the config describes as the loss balance coefficient -
# confirm this reuse is intended.
LEARNING_RATE = args.lr
optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=LEARNING_RATE)
adv_optimizers = [
    Adam(filter(lambda p: p.requires_grad, dis.parameters()), lr=args.LAMBDA * LEARNING_RATE)
    for dis in discriminators
]

Using device: cpu


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at mideind/IceBERT and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Build the train/test DataLoaders from the two splits.
# Depends on `tokenizer`, which is created in the model-initialisation cell;
# running this cell before that one raises NameError.
# Only the batch size is taken from `params` here; its 'shuffle' and
# 'num_workers' entries are not passed in this call.
train_loader, test_loader = create_dataloaders(train_df, test_df, tokenizer, batch_size=params['batch_size'])

NameError: name 'tokenizer' is not defined

In [None]:
# Fine-tune only when training is requested and this is not a
# pretrained-baseline run; otherwise continue with the model as loaded.
should_train = run_training and not pretrained
if not should_train:
    trained_model = model
    if not run_training:
        print("Skipping model training (run_training=False)")
else:
    trained_model, _, _ = train_and_evaluate(
        model,
        train_loader,
        test_loader,
        args,
        discriminators,
        optimizer,
        adv_optimizers,
        output_dir,
    )

In [None]:
# Write model and tokenizer in Hugging Face format so the bias-metric cells
# can reload them from `output_model_dir`.
if not (run_training or pretrained):
    print("Skipping model saving (run_training=False and not pretrained)")
else:
    for artifact in (trained_model, tokenizer):
        artifact.save_pretrained(output_model_dir)

    print("Model saved to:")
    print(f"  Full model: {output_model_dir}")

Model saved to:
  Full model: ../results/adv_debias/nb/bert_pretrained_model


### Bias in occupation prediction

In [None]:
# Evaluate the fine-tuned classifier on the test split and store the
# per-group metrics as JSON. Nothing runs for pretrained baselines.
if not run_occupation_bias:
    print("Skipping occupation prediction bias evaluation (run_occupation_bias=False)")
elif not pretrained:
    import gc

    # Free memory held by earlier cells before reloading the model.
    gc.collect()
    torch.cuda.empty_cache()

    print(f"Loading model from {output_model_dir}")

    # The architecture is chosen per language, defaulting to BERT.
    current_model_type = model_types.get(language, "bert")
    classifier_cls = (
        RobertaForSequenceClassification
        if current_model_type == "roberta"
        else BertForSequenceClassification
    )
    # num_labels is intentionally NOT passed: it is read from the saved config.
    trained_model = classifier_cls.from_pretrained(output_model_dir, output_hidden_states=True)
    trained_model.to(device)

    # Debug output: the label count should match the number of occupations.
    print(f"Loaded model config: {trained_model.config}")
    print(f"Model num_labels: {trained_model.config.num_labels}")
    print(f"Expected num_labels: {len(occupations)}")

    # Predictions, gold labels and private (group) labels on the test set.
    _, test_preds, test_labels, private_labels = eval_main(trained_model, test_loader, device, args)
    results = group_evaluation(test_preds, test_labels, private_labels, silence=True)

    # File stem: model type + experiment type + one suffix per active option.
    results_path = f"{model_type}_{args.experiment_type}" + "".join(
        suffix
        for enabled, suffix in (
            (adversarial, f"_{num_discriminators}discriminators"),
            (balance_sensitive, "_balanced"),
            (masked, "_masked"),
        )
        if enabled
    )
    results_file = os.path.join(output_dir, f"{results_path}_results.json")

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    with open(results_file, 'w') as f:
        json.dump(results, f, indent=4)

In [None]:
# Side-by-side confusion matrices for the two groups, labelled with
# profession names. Uses `results` from the occupation-bias evaluation cell.
if run_occupation_bias and not pretrained:
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Map class ids to profession names; ids beyond the occupation list fall
    # back to a generic "Class_<id>" label.
    class_labels = results.get("class_labels", list(range(len(occupations))))
    profession_names = [
        occupations[class_id] if class_id < len(occupations) else f"Class_{class_id}"
        for class_id in class_labels
    ]

    cm_0 = np.array(results["Group 0 confusion matrix"])
    cm_1 = np.array(results["Group 1 confusion matrix"])

    # Wide figure so the profession labels stay readable.
    fig, axes = plt.subplots(1, 2, figsize=(20, 8))

    # NOTE(review): the titles assume group 0 is female and group 1 is male;
    # confirm against the private-label encoding.
    panels = (
        (cm_0, 'Blues', 'Group 0 (Female) Confusion Matrix'),
        (cm_1, 'Reds', 'Group 1 (Male) Confusion Matrix'),
    )
    for ax, (matrix, palette, title) in zip(axes, panels):
        sns.heatmap(
            matrix,
            annot=True,
            fmt='d',
            cmap=palette,
            xticklabels=profession_names,
            yticklabels=profession_names,
            cbar=True,
            square=True,
            linewidths=0.5,
            ax=ax,
        )
        ax.set_title(title, fontsize=16, fontweight='bold')
        ax.set_xlabel('Predicted Profession', fontsize=14)
        ax.set_ylabel('Actual Profession', fontsize=14)
        # Vertical x labels, horizontal y labels.
        ax.tick_params(axis='x', rotation=90, labelsize=10)
        ax.tick_params(axis='y', rotation=0, labelsize=10)

    # Prevent labels from being cut off.
    plt.tight_layout()

    # NOTE(review): unlike the results JSON, this file name carries no
    # discriminator/balanced/masked suffix, so runs that differ only in those
    # options write to the same file.
    cm_file = os.path.join(output_dir, f"{model_type}_{args.experiment_type}_confusion_matrices.png")
    plt.savefig(cm_file, dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()

In [None]:
# Per-profession TPR gap analysis, exported as a LaTeX table.
# Uses the predictions produced by the occupation-bias evaluation cell.
if run_occupation_bias and not pretrained:
    from utils.eval_metrices import analyze_profession_tpr_gaps
    from results_to_latex import create_profession_tpr_gap_table

    # Profession names are taken directly from the occupation list.
    profession_names = occupations

    print("=== ANALYZING PROFESSION-LEVEL TPR GAPS ===")
    # NOTE(review): `test_labels` is passed twice (2nd and 4th positional
    # argument); confirm against the helper's signature that this is intended.
    profession_tpr_analysis = analyze_profession_tpr_gaps(
        test_preds,
        test_labels,
        private_labels,
        test_labels,
        profession_names,
        silence=False,
    )

    profession_tpr_table = create_profession_tpr_gap_table(profession_tpr_analysis, language=language)

    # The same table text is written twice: as .tex and as .txt for easy viewing.
    table_stem = os.path.join(output_dir, f"{model_type}_{args.experiment_type}_profession_tpr_gaps")
    table_file = table_stem + ".tex"
    table_txt_file = table_stem + ".txt"
    for destination in (table_file, table_txt_file):
        with open(destination, 'w') as f:
            f.write(profession_tpr_table)

    print("\nProfession TPR gap table saved to:")
    print(f"  LaTeX: {table_file}")
    print(f"  Text: {table_txt_file}")

---
### Bias in gender prediction

In [None]:
# Leakage evaluation: reload the saved classifier and pass it, with both
# dataloaders, to `leakage_evaluation`.
if run_leakage_evaluation:
    import gc

    # Drop references to models held by earlier cells so their memory can be
    # reclaimed. At notebook top level these names live in the module
    # namespace. Later cells that still need them must reload from disk.
    for stale_name in ('trained_model', 'model', 'discriminators'):
        globals().pop(stale_name, None)

    # Force garbage collection
    gc.collect()

    # The CUDA bookkeeping calls are only made when a GPU is present: the
    # notebook also supports a CPU fallback, where the stats-reset functions
    # are expected to fail.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        # Reset the allocator statistics (peak and accumulated counters).
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.reset_accumulated_memory_stats()

    print(f"Loading model from {output_model_dir}")

    # The architecture is chosen per language, defaulting to BERT.
    current_model_type = model_types.get(language, "bert")
    classifier_cls = (
        RobertaForSequenceClassification
        if current_model_type == "roberta"
        else BertForSequenceClassification
    )
    trained_model = classifier_cls.from_pretrained(
        output_model_dir,
        num_labels=len(occupations),
        output_hidden_states=True,
    )
    trained_model.to(device)

    leakage_results = leakage_evaluation(trained_model, train_loader, test_loader, device)
else:
    print("Skipping leakage evaluation (run_leakage_evaluation=False)")

Loading model from ../results/adv_debias/nb/bert_pretrained_model
Training hidden shape: (4480, 768), Private labels: 4480


In [None]:
# Write the leakage metrics to a JSON file in the results directory.
if run_leakage_evaluation:
    # NOTE(review): the suffix order here (balanced, masked, discriminators)
    # differs from the model directory and occupation results
    # (discriminators, balanced, masked), and it is applied to pretrained
    # runs as well. Kept as-is so existing file names do not change.
    leakage_suffix_flags = (
        (balance_sensitive, "_balanced"),
        (masked, "_masked"),
        (adversarial, f"_{num_discriminators}discriminators"),
    )
    leakage_results_root = f"{model_type}_{args.experiment_type}" + "".join(
        suffix for enabled, suffix in leakage_suffix_flags if enabled
    )

    leakage_results_file = os.path.join(output_dir, f"{leakage_results_root}_leakage_results.json")

    with open(leakage_results_file, 'w') as f:
        json.dump(leakage_results, f, indent=4)

---
### Intrinsic Model Bias
Here we run particular WEAT/SEAT/LPBS tests on the model.

Tests:
- male vs. female terms/surgeon vs. nurse

In [14]:
import glob # For finding test files

def find_available_tests(language):
    """Find all available WEAT and LPBS tests for the given language.

    Looks in ``../data/wordlists/<language>/WEAT/`` for ``*.txt`` files and in
    ``../data/wordlists/<language>/LPBS/`` for ``*.jsonl`` files, relative to
    the current working directory. A test's name is its file name without the
    extension.

    The names are returned sorted. ``glob`` yields files in arbitrary,
    filesystem-dependent order, so sorting keeps the list of evaluated tests
    and the order of the written results reproducible between runs.

    Args:
        language: Language code used as the wordlist sub-directory name.

    Returns:
        Tuple ``(weat_tests, lpbs_tests)`` of sorted name lists. A list is
        empty when its directory does not exist or holds no matching file.
    """

    def _test_names(kind, extension):
        # Collect the sorted test names of one kind; reports what was found.
        folder = f"../data/wordlists/{language}/{kind}/"
        if not os.path.exists(folder):
            return []
        matches = glob.glob(os.path.join(folder, f"*.{extension}"))
        names = sorted(os.path.splitext(os.path.basename(path))[0] for path in matches)
        print(f"Found {len(names)} {kind} tests: {names}")
        return names

    weat_tests = _test_names("WEAT", "txt")
    lpbs_tests = _test_names("LPBS", "jsonl")

    return weat_tests, lpbs_tests

In [None]:
from bias_metrics.utils import evaluate_combinations
from scripts_adv_debias import convert_classification_to_mlm
import os

### Run intrinsic bias tests ###
# Converts the classifier to a masked-LM, saves it, and runs every WEAT/LPBS
# test found for the current language on the saved copy.
if run_intrinsic:
    import shutil

    ### Retrieve directory of current model ###
    # Same naming rule as in the configuration cell: fine-tuned runs get one
    # suffix per active option, pretrained baselines keep the bare name.
    output_model_dir = os.path.join(output_dir, f"{model_type}_{args.experiment_type}_model")
    if not pretrained:
        if adversarial:
            output_model_dir += f"_{num_discriminators}discriminators"
        if balance_sensitive:
            output_model_dir += "_balanced"
        if masked:
            output_model_dir += "_masked"

    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Determine model type based on language
    current_model_type = model_types.get(language, "bert")  # default to BERT

    if pretrained and 'model' in globals():
        # The model loaded earlier in the notebook is still in memory.
        print("Using pretrained model for intrinsic bias evaluation...")
        trained_model = model
    else:
        # Either a fine-tuned run, or a pretrained run whose in-memory `model`
        # was deleted by the leakage-evaluation cell. In both cases the
        # classifier is reloaded from the directory the save cell wrote to.
        if pretrained:
            print("Using pretrained model for intrinsic bias evaluation...")
        print(f'Loading saved model from: {output_model_dir}...')
        classifier_cls = (
            RobertaForSequenceClassification
            if current_model_type == "roberta"
            else BertForSequenceClassification
        )
        trained_model = classifier_cls.from_pretrained(
            output_model_dir,
            num_labels=len(occupations),
            output_hidden_states=True,
        )
        trained_model.to(device)

    # Experiment configurations passed on to evaluate_combinations
    pval = True
    iterations = 10000
    fullpermut = False

    # Convert the classifier to MLM format
    print(f"Converting {current_model_type.upper()}ForSequenceClassification to {current_model_type.upper()}ForMaskedLM...")
    mlm_model = convert_classification_to_mlm(trained_model, model_name, current_model_type)

    # Pretrained runs use a temporary directory that is removed afterwards;
    # fine-tuned runs keep the MLM copy next to the classifier ("_mlm" suffix).
    if pretrained:
        mlm_output_dir = os.path.join(output_dir, f"temp_{model_type}_{args.experiment_type}_model_mlm")
    else:
        mlm_output_dir = output_model_dir + "_mlm"

    try:
        Path(mlm_output_dir).mkdir(parents=True, exist_ok=True)

        mlm_model.save_pretrained(mlm_output_dir)
        tokenizer.save_pretrained(mlm_output_dir)

        print(f"MLM model saved to: {mlm_output_dir}")
        if not pretrained:
            print(f"Original classifier preserved at: {output_model_dir}")

        # Use the converted MLM model
        mlm_model_path = mlm_output_dir

        print(f"Using converted MLM model at: {mlm_model_path}")
        print(f"Model exists: {os.path.exists(mlm_model_path)}")

        # Find all available tests for the current language
        weat_tests, lpbs_tests = find_available_tests(language)

        # One combination per test: (test name, metric, 'bert', language, model path).
        # NOTE(review): the third field is the literal 'bert' even when the
        # model is RoBERTa - confirm what evaluate_combinations expects there.
        combinations = [(test, 'WEAT', 'bert', language, mlm_model_path) for test in weat_tests]
        combinations += [(test, 'LPBS', 'bert', language, mlm_model_path) for test in lpbs_tests]

        print(f"\nTotal combinations to evaluate: {len(combinations)}")
        print("Test combinations:")
        for i, combo in enumerate(combinations, 1):
            print(f"  {i}. {combo[0]} - {combo[1]} - {combo[2]}")

        if combinations:
            print("\nEvaluating intrinsic bias on model (converted to MLM)...")

            # An empty/falsy result is handled below as "no test succeeded".
            intrinsic_results = evaluate_combinations(combinations, pval, iterations, fullpermut)

            print("\nIntrinsic bias evaluation completed!")
            if intrinsic_results:
                print(f"Results for {len(intrinsic_results)} test combination(s):")

                # Print summary of results
                for metric, result in intrinsic_results.items():
                    if isinstance(result, dict) and 'Effect Size' in result:
                        print(f"  {metric}: Effect Size = {result['Effect Size']:.4f}")
                    else:
                        print(f"  {metric}: {result}")
            else:
                print("⚠️  WARNING: No successful test results obtained!")

            # Save intrinsic results as a text file (even if partial or empty)
            intrinsic_root = f"{model_type}_{args.experiment_type}_intrinsic_results"
            if not pretrained:
                if adversarial:
                    intrinsic_root += f"_{num_discriminators}discriminators"
                if balance_sensitive:
                    intrinsic_root += "_balanced"
                if masked:
                    intrinsic_root += "_masked"

            intrinsic_results_file = os.path.join(output_dir, intrinsic_root + ".txt")
            with open(intrinsic_results_file, 'w') as f:
                if intrinsic_results:
                    for metric, result in intrinsic_results.items():
                        f.write(f"{metric}: {result}\n")
                else:
                    f.write("No successful test results obtained. All tests failed.\n")
                    f.write(f"Total tests attempted: {len(combinations)}\n")

            print(f"\nResults saved to: {intrinsic_results_file}")

        else:
            print(f"No WEAT or LPBS tests found for language '{language}'")
            print("Please check that test files exist in:")
            print(f"  - ../data/wordlists/{language}/WEAT/")
            print(f"  - ../data/wordlists/{language}/LPBS/")
    finally:
        # Remove the temporary MLM copy of a pretrained model on every exit
        # path, including "no tests found" and an interrupted evaluation.
        if pretrained and os.path.exists(mlm_output_dir):
            shutil.rmtree(mlm_output_dir)
            print(f"Cleaned up temporary MLM model directory: {mlm_output_dir}")

Using pretrained model for intrinsic bias evaluation...
Converting ROBERTAForSequenceClassification to ROBERTAForMaskedLM...
Transferred: roberta.embeddings.word_embeddings.weight
Transferred: roberta.embeddings.position_embeddings.weight
Transferred: roberta.embeddings.token_type_embeddings.weight
Transferred: roberta.embeddings.LayerNorm.weight
Transferred: roberta.embeddings.LayerNorm.bias
Transferred: roberta.encoder.layer.0.attention.self.query.weight
Transferred: roberta.encoder.layer.0.attention.self.query.bias
Transferred: roberta.encoder.layer.0.attention.self.key.weight
Transferred: roberta.encoder.layer.0.attention.self.key.bias
Transferred: roberta.encoder.layer.0.attention.self.value.weight
Transferred: roberta.encoder.layer.0.attention.self.value.bias
Transferred: roberta.encoder.layer.0.attention.output.dense.weight
Transferred: roberta.encoder.layer.0.attention.output.dense.bias
Transferred: roberta.encoder.layer.0.attention.output.LayerNorm.weight
Transferred: roberta.

KeyboardInterrupt: 