Code to process the Adult Census data, edit and retrain the models, and perform adversarial debiasing.

Necessary libraries for the notebook.

In [9]:
import os
import tensorflow as tf
import tf2onnx
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras import layers, models
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, KBinsDiscretizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from keras.utils import to_categorical
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torch.onnx
from scipy.io import savemat
import csv

### Data Preprocessing

In [27]:
def load_adult_adf(train_path='../data/adult/adult.data',
                   test_path='../data/adult/adult.test'):
    """Load the Adult Census data and return an 80/20 train/test split.

    The two raw files are concatenated, the 'fnlwgt' column is removed, rows
    with missing values are dropped, every categorical column is label-encoded
    and the income label is binarised (1 when the label contains '>50K').

    Args:
        train_path: Path to the 'adult.data' file.
        test_path: Path to the 'adult.test' file (its first line is skipped).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, protected_train,
        protected_test)`` where the X values are DataFrames of the 13 remaining
        feature columns, the y values are one-hot arrays with two columns, and
        the protected values are the label-encoded 'sex' column.
        NOTE(review): LabelEncoder orders classes alphabetically, so presumably
        0 = Female and 1 = Male -- confirm against the raw data.
    """
    column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                    'marital-status', 'occupation', 'relationship', 'race', 'sex',
                    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

    # The UCI Adult files separate fields with ", ", so a missing value is read
    # as " ?" unless the leading blank is skipped. Without skipinitialspace the
    # na_values='?' marker never matches and dropna() below keeps every row.
    train = pd.read_csv(train_path, names=column_names, na_values='?',
                        skipinitialspace=True)
    test = pd.read_csv(test_path, names=column_names, na_values='?',
                       skipinitialspace=True, skiprows=1)

    # Combine and preprocess
    df = pd.concat([train, test], ignore_index=True)
    df = df.drop(columns=['fnlwgt'])
    df = df.dropna()

    # Encode categorical features as integer codes (one encoder per column)
    categorical_features = ['workclass', 'education', 'marital-status', 'occupation',
                            'relationship', 'race', 'sex', 'native-country']
    for col in categorical_features:
        df[col] = LabelEncoder().fit_transform(df[col])

    # Substring match so that both '>50K' and '>50K.' map to 1
    df['income'] = df['income'].apply(lambda x: 1 if '>50K' in x.strip() else 0)

    # Features / one-hot labels
    X = df.drop('income', axis=1)
    y = to_categorical(df['income'], num_classes=2)

    # Extract the protected attribute ('sex'); it also stays in X as a feature
    protected_attribute = X['sex'].values

    # Fixed random_state keeps the split identical across cells
    X_train, X_test, y_train, y_test, protected_train, protected_test = train_test_split(
        X, y, protected_attribute, test_size=0.2, random_state=42
    )

    return X_train, X_test, y_train, y_test, protected_train, protected_test

In [None]:
# Scales the Adult split and writes the test portion to disk for verification
def load_and_save_adult_data():
    """Load the Adult split, min-max scale it and save the test part as a .mat file.

    The scaler is fitted on the training features only and then applied to the
    test features. The scaled test features and the test labels are written to
    'adult_fairify_data.mat' under the keys 'X' and 'y'.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` with scaled features.
    """
    (train_features, test_features,
     train_labels, test_labels,
     _protected_train, _protected_test) = load_adult_adf()

    feature_scaler = MinMaxScaler()
    train_features = feature_scaler.fit_transform(train_features)
    test_features = feature_scaler.transform(test_features)

    # Only the held-out split is exported for use in MATLAB
    savemat("adult_fairify_data.mat", {'X': test_features, 'y': test_labels})
    print("Data saved to adult_fairify_data.mat")

    return train_features, test_features, train_labels, test_labels

### Model Editing

Helper function to save a model as an ONNX file for verification.

In [None]:
# Function to save the model as ONNX format
def save_model_onnx(model, input_shape, onnx_file_path):
    """Convert a Keras model to ONNX (opset 13) and write it to disk.

    Args:
        model: The Keras model to convert.
        input_shape: Shape of one input sample without the batch dimension,
            e.g. ``(13,)``. The batch dimension is exported as dynamic.
        onnx_file_path: Destination path of the .onnx file.
    """
    # The batch dimension is left as None so the exported model accepts any batch size
    input_signature = (
        tf.TensorSpec(shape=[None] + list(input_shape), dtype=tf.float32),
    )

    # from_keras returns (model_proto, external_tensor_storage); only the proto is needed
    model_proto, _ = tf2onnx.convert.from_keras(model,
                                                input_signature=input_signature,
                                                opset=13)

    # Save the ONNX model to the specified path
    with open(onnx_file_path, "wb") as f:
        f.write(model_proto.SerializeToString())

    print(f"Model has been saved in ONNX format at {onnx_file_path}")

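Modify the models so they can be used with FairNNV. FairNNV cannot handle a sigmoid output, so the final layer of each model is replaced with a two-class softmax layer. The new layer is freshly initialised, so the models are re-trained in the next step.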

In [None]:
# Input directory with the original .h5 models and output directories for the
# converted .keras models and their ONNX exports.
model_dir = './adult/adult_h5'
save_dir = './adult/adult_keras'
onnx_save_dir = './adult/adult_onnx'
num_classes = 2  # width of the softmax layer that replaces the original output layer

# Ensure the save directories exist. exist_ok avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(save_dir, exist_ok=True)
os.makedirs(onnx_save_dir, exist_ok=True)

def modify_model_for_multiclass(model_path, num_classes, input_shape=(13,)):
    """Rebuild a saved model with a softmax output layer in place of its last layer.

    All layers except the last one are reused (with their loaded weights) and
    chained behind a new input layer. A new Dense softmax layer named
    'new_output' is appended; it is freshly initialised, so the returned model
    must be trained again before use.

    Args:
        model_path: Path of the saved Keras model to load.
        num_classes: Number of units of the new softmax output layer.
        input_shape: Shape of one input sample without the batch dimension.
            Defaults to ``(13,)``, the value that was previously hard-coded.

    Returns:
        The new, uncompiled Keras model.
    """
    model = load_model(model_path)

    # Create a new input layer with the correct shape
    new_input = tf.keras.layers.Input(shape=input_shape)
    x = new_input

    # Transfer the layers except the last one
    for layer in model.layers[:-1]:
        # Functional models list their InputLayer in `layers`; new_input replaces it
        if isinstance(layer, tf.keras.layers.InputLayer):
            continue
        x = layer(x)

    # Create a new output layer
    output = tf.keras.layers.Dense(num_classes, activation='softmax', name='new_output')(x)

    return tf.keras.models.Model(inputs=new_input, outputs=output)

# Convert every .h5 model in model_dir to a softmax-output .keras model
for model_file in os.listdir(model_dir):
    # Anything that is not an .h5 model is ignored
    if not model_file.endswith('.h5'):
        continue

    model_path = os.path.join(model_dir, model_file)
    new_model = modify_model_for_multiclass(model_path, num_classes)

    # One-hot labels and a softmax output call for categorical cross-entropy
    new_model.compile(
        optimizer=Adam(),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    # Same base name, stored in save_dir with the .keras extension
    save_path = os.path.join(save_dir, model_file.replace('.h5', '.keras'))
    new_model.save(save_path)

Re-train models. 

In [None]:
X_train, X_test, y_train, y_test,_,_ = load_adult_adf()

# Scale the features to [0, 1] the same way load_and_save_adult_data() and the
# adversarial debiasing cells do. Without this the models are trained on raw
# feature values but verified and evaluated on min-max scaled inputs.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

for model_file in os.listdir(save_dir):
    if model_file.endswith('.keras'):
        model_path = os.path.join(save_dir, model_file)

        # A failure on one model is reported and the loop moves on to the next file
        try:
            # Load the modified model
            print(f"Loading model {model_file}")
            model = load_model(model_path)

            # Compile the model
            model.compile(
                optimizer=Adam(),
                loss='categorical_crossentropy',  # Update loss function if needed
                metrics=['accuracy']
            )

            # Fit the model; 20% of the training split is held out for validation
            print(f"Training model {model_file}")
            model.fit(X_train, y_train, epochs=50, validation_split=0.2)

            # Evaluate the model on the test split (labels are one-hot, hence argmax)
            y_pred = model.predict(X_test)
            y_pred_classes = np.argmax(y_pred, axis=1)
            accuracy = accuracy_score(np.argmax(y_test, axis=1), y_pred_classes)

            print(f"Model {model_file} - Accuracy: {accuracy}")
            # Save the retrained model (overwrites the untrained converted model)
            model.save(model_path)
            print(f"Model {model_file} retrained and saved successfully.")

            # Save the model as ONNX
            onnx_save_path = os.path.join(onnx_save_dir, model_file.replace('.keras', '.onnx'))
            save_model_onnx(model, X_train.shape[1:], onnx_save_path)

        except Exception as e:
            print(f"Failed to process {model_file}. Error: {e}")

### Adversarial Debiasing

In [1]:
# Switches that select which adversarial debiasing cell below does any work.
# Each cell is wrapped in `if <flag>:`, so a cell whose flag is False is a no-op.
#   multiple_runs - repeat plain training + debiasing several times and report mean/std
#   singular_run  - debias each model once and report a single set of metrics
multiple_runs = True
singular_run = False

In [37]:
# function to save model metrics to csv
def save_metrics_to_csv(filename, model_file, model_name, classification_accuracy, balanced_accuracy, disparate_impact, equal_opportunity_difference, average_odds_difference,precision,recall,f1):
    """Append one row of model metrics to a CSV file.

    The parent directory is created when missing. The header row is written
    only when the file does not exist yet or is still empty.

    Args:
        filename: Path of the CSV file to append to.
        model_file: Name of the model file the metrics belong to.
        model_name: Label of the model variant (e.g. 'Plain Model').
        classification_accuracy, balanced_accuracy, disparate_impact,
        equal_opportunity_difference, average_odds_difference, precision,
        recall, f1: Metric values, written in this order.
    """
    # open(..., 'a') creates the file but not its directory
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # An existing but empty file still needs its header
    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0

    with open(filename, mode='a', newline='') as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(['Model File', 'Model', 'Classification Accuracy', 'Balanced Accuracy', 'Disparate Impact', 'Equal Opportunity Difference', 'Average Odds Difference', 'Precision', 'Recall','F1'])

        # Write the metrics
        writer.writerow([model_file, model_name, classification_accuracy, balanced_accuracy, disparate_impact, equal_opportunity_difference, average_odds_difference, precision, recall, f1])

# function to save the mean/std dev of model metrics for multiple runs to csv
def save_metrics_to_csv_mr(filename, model_name, model_type, means, stds):
    """Append the mean and standard deviation of each metric to a CSV file.

    One row is written per entry of ``means``. The parent directory is created
    when missing and the header row is written only while the file is empty.

    Args:
        filename: Path of the CSV file to append to.
        model_name: Value for the 'model_name' column.
        model_type: Value for the 'model_type' column (e.g. 'Plain Model').
        means: Mapping of metric name to mean value.
        stds: Mapping of metric name to standard deviation; must contain every
            key of ``means``.

    Raises:
        KeyError: If a metric in ``means`` is missing from ``stds``.
    """
    headers = [
        'model_name', 'model_type', 'metric', 'mean', 'std_dev'
    ]

    # open(..., 'a') creates the file but not its directory
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        # Nothing has been written through `writer` yet, so a size of 0 means a new or empty file
        if os.path.getsize(filename) == 0:
            writer.writerow(headers)
        for metric, mean_value in means.items():
            writer.writerow([model_name, model_type, metric, mean_value, stds[metric]])

Various metrics for evaluation including accuracy and fairness.

In [38]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Metrics calculation functions
def precision(y_true, y_pred, average='binary'):
    """Return sklearn's precision score of y_pred against y_true for the given averaging mode."""
    score = precision_score(y_true, y_pred, average=average)
    return score

def recall(y_true, y_pred, average='binary'):
    """Return sklearn's recall score of y_pred against y_true for the given averaging mode."""
    score = recall_score(y_true, y_pred, average=average)
    return score

def f1(y_true, y_pred, average='binary'):
    """Return sklearn's F1 score of y_pred against y_true for the given averaging mode."""
    score = f1_score(y_true, y_pred, average=average)
    return score

def classification_accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Inputs are converted to arrays first, so plain Python lists are compared
    element-wise instead of as whole lists (which would yield only 0.0 or 1.0).

    Args:
        y_true: Array-like of true class labels.
        y_pred: Array-like of predicted class labels, same shape as y_true.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.mean(y_true == y_pred)

def balanced_accuracy(y_true, y_pred):
    """Return the unweighted mean of the per-class recalls.

    Classes are taken from the distinct values of y_true, so every class has
    at least one true sample.
    """
    per_class_recall = [
        np.sum((y_true == label) & (y_pred == label)) / np.sum(y_true == label)
        for label in np.unique(y_true)
    ]
    return np.mean(per_class_recall)

def disparate_impact(y_true, y_pred, protected_attribute):
    """Return the ratio of mean predictions: unprivileged group over privileged group.

    The privileged group is protected_attribute == 1, the unprivileged group is
    protected_attribute == 0. y_true is accepted for a uniform metric signature
    but not used. NaN is returned when either group is empty or when the
    privileged group's mean prediction is 0.
    """
    priv_mask = protected_attribute == 1
    unpriv_mask = protected_attribute == 0

    # Both groups must be present for the ratio to be meaningful
    if not (np.any(priv_mask) and np.any(unpriv_mask)):
        return np.nan

    priv_rate = np.mean(y_pred[priv_mask])
    unpriv_rate = np.mean(y_pred[unpriv_mask])

    # Avoid dividing by a zero positive rate
    return np.nan if priv_rate == 0 else unpriv_rate / priv_rate

def equal_opportunity_difference(y_true, y_pred, protected_attribute):
    """Return TPR(unprivileged) - TPR(privileged).

    The privileged group is protected_attribute == 1, the unprivileged group is
    protected_attribute == 0; the positive class is label 1.
    """
    def _true_positive_rate(group_mask):
        # Share of the group's actual positives that were predicted positive
        actual = y_true[group_mask]
        predicted = y_pred[group_mask]
        return np.sum((actual == 1) & (predicted == 1)) / np.sum(actual == 1)

    tpr_priv = _true_positive_rate(protected_attribute == 1)
    tpr_unpriv = _true_positive_rate(protected_attribute == 0)
    return tpr_unpriv - tpr_priv

def average_odds_difference(y_true, y_pred, protected_attribute):
    """Return the unprivileged minus privileged average of TPR and FPR.

    The privileged group is protected_attribute == 1, the unprivileged group is
    protected_attribute == 0; the positive class is label 1.
    """
    def _positive_rate(group_mask, actual_label):
        # Share of the group's samples with the given true label that were predicted 1.
        # actual_label=1 gives the TPR, actual_label=0 gives the FPR.
        actual = y_true[group_mask]
        predicted = y_pred[group_mask]
        return np.sum((actual == actual_label) & (predicted == 1)) / np.sum(actual == actual_label)

    priv_mask = protected_attribute == 1
    unpriv_mask = protected_attribute == 0

    tpr_priv = _positive_rate(priv_mask, 1)
    tpr_unpriv = _positive_rate(unpriv_mask, 1)
    fpr_priv = _positive_rate(priv_mask, 0)
    fpr_unpriv = _positive_rate(unpriv_mask, 0)

    odds_priv = (tpr_priv + fpr_priv) / 2
    odds_unpriv = (tpr_unpriv + fpr_unpriv) / 2
    return odds_unpriv - odds_priv

Adversarial debiasing code for one run of each model. 

In [None]:
# Defined at module level (not inside `if singular_run:`) because the
# multiple-runs cell calls it as well, also when singular_run is False.
def build_adversary_model(input_shape):
    """Build and compile the adversary network.

    The adversary is a small MLP (64 -> 32 -> 1 sigmoid unit) compiled with
    Adam and binary cross-entropy. Callers feed it the classifier's output and
    train it to predict the binary protected attribute.

    Args:
        input_shape: Shape of one adversary input sample without the batch
            dimension (callers pass ``classifier_model.output_shape[1:]``).

    Returns:
        The compiled Keras adversary model.
    """
    adversary_input = layers.Input(shape=input_shape)
    x = layers.Dense(64, activation='relu')(adversary_input)
    x = layers.Dense(32, activation='relu')(x)
    adversary_output = layers.Dense(1, activation='sigmoid')(x)
    adversary_model = models.Model(inputs=adversary_input, outputs=adversary_output)
    adversary_model.compile(optimizer='adam', loss='binary_crossentropy')
    return adversary_model


if singular_run:
    # Directory paths
    input_directory = './adult/adult_keras'
    output_directory = './adult/adult_debiased_onnx'

    # Ensure the output directory exists
    os.makedirs(output_directory, exist_ok=True)

    # Load and preprocess the data
    X_train, X_test, y_train, y_test, protected_train, protected_test = load_adult_adf()

    # Scale the features to [0, 1]; the scaler is fitted on the training split only
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    metrics_filename = './model_metrics/adult_model_metrics.csv'
    # Opening the CSV in append mode does not create its directory
    os.makedirs(os.path.dirname(metrics_filename), exist_ok=True)

    # List of models to process (matched as substrings of the file name)
    model_list = ['AC-1', 'AC-4', 'AC-5', 'AC-10', 'AC-3']

    # Iterate over all .keras files in the input directory to convert to ONNX file
    for file in os.listdir(input_directory):
        if file.endswith('.keras') and any(model in file for model in model_list):
            # Full path to the current model file
            input_path = os.path.join(input_directory, file)
            output_path = os.path.join(output_directory, file.replace('.keras', '.onnx'))

            # A failure on one model is reported and the loop moves on to the next file
            try:
                # Load the model
                print(f"Loading model from {input_path}")
                classifier_model = load_model(input_path)

                # Ensure the model is compiled with the correct optimizer and metrics
                classifier_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

                # Metrics for the plain (not yet debiased) model; labels are one-hot, hence argmax
                y_test_pred_plain = classifier_model.predict(X_test).argmax(axis=1)
                y_test_true = y_test.argmax(axis=1)

                plain_classification_accuracy = classification_accuracy(y_test_true, y_test_pred_plain)
                plain_balanced_accuracy = balanced_accuracy(y_test_true, y_test_pred_plain)
                plain_disparate_impact = disparate_impact(y_test_true, y_test_pred_plain, protected_test)
                plain_equal_opportunity_difference = equal_opportunity_difference(y_test_true, y_test_pred_plain, protected_test)
                plain_average_odds_difference = average_odds_difference(y_test_true, y_test_pred_plain, protected_test)
                plain_precision = precision(y_test_true, y_test_pred_plain, average='macro')  # Use 'macro' for multi-class
                plain_recall = recall(y_test_true, y_test_pred_plain, average='macro')        # Use 'macro' for multi-class
                plain_f1 = f1(y_test_true, y_test_pred_plain, average='macro')                # Use 'macro' for multi-class

                save_metrics_to_csv(metrics_filename, file, 'Plain Model', plain_classification_accuracy, plain_balanced_accuracy, plain_disparate_impact, plain_equal_opportunity_difference, plain_average_odds_difference, plain_precision, plain_recall, plain_f1)

                # Build and compile the adversary model; it reads the classifier's output
                adversary_model = build_adversary_model(classifier_model.output_shape[1:])

                # Training parameters
                num_epochs = 50
                batch_size = 128
                learning_rate = 0.001
                adversary_loss_weight = 0.7

                # Optimizers
                classifier_optimizer = tf.keras.optimizers.Adam(learning_rate)
                adversary_optimizer = tf.keras.optimizers.Adam(learning_rate)

                # Loss functions
                classification_loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
                adversary_loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=False)

                # Training loop
                for epoch in range(num_epochs):
                    # Shuffle the training data
                    indices = np.arange(X_train.shape[0])
                    np.random.shuffle(indices)

                    # Mini-batch training
                    for start in range(0, X_train.shape[0], batch_size):
                        end = min(start + batch_size, X_train.shape[0])
                        batch_indices = indices[start:end]

                        X_batch = X_train[batch_indices]
                        y_batch = y_train[batch_indices]
                        protected_batch = protected_train[batch_indices].reshape(-1, 1)

                        # Step 1: update the classifier. It minimises its own loss while
                        # maximising the adversary's loss (hence the minus sign).
                        with tf.GradientTape() as classifier_tape:
                            # Forward pass through the classifier
                            classifier_predictions = classifier_model(X_batch, training=True)

                            # Forward pass through the adversary
                            adversary_predictions = adversary_model(classifier_predictions, training=True)

                            # Compute losses
                            classification_loss = classification_loss_fn(y_batch, classifier_predictions)
                            adversary_loss = adversary_loss_fn(protected_batch, adversary_predictions)
                            total_loss = classification_loss - adversary_loss_weight * adversary_loss

                        # Compute gradients and update classifier weights
                        classifier_gradients = classifier_tape.gradient(total_loss, classifier_model.trainable_variables)
                        classifier_optimizer.apply_gradients(zip(classifier_gradients, classifier_model.trainable_variables))

                        # Step 2: update the adversary against the just-updated classifier
                        with tf.GradientTape() as adversary_tape:
                            # Forward pass through the classifier
                            classifier_predictions = classifier_model(X_batch, training=True)

                            # Forward pass through the adversary
                            adversary_predictions = adversary_model(classifier_predictions, training=True)

                            # Compute adversary loss
                            adversary_loss = adversary_loss_fn(protected_batch, adversary_predictions)

                        # Compute gradients and update adversary weights
                        adversary_gradients = adversary_tape.gradient(adversary_loss, adversary_model.trainable_variables)
                        adversary_optimizer.apply_gradients(zip(adversary_gradients, adversary_model.trainable_variables))

                    # Losses shown are those of the last mini-batch of the epoch
                    print(f"Epoch {epoch + 1}/{num_epochs}, Classification Loss: {classification_loss.numpy()}, Adversary Loss: {adversary_loss.numpy()}")

                # Predictions for debiased model
                y_test_pred_debiased = classifier_model.predict(X_test).argmax(axis=1)

                debiased_classification_accuracy = classification_accuracy(y_test_true, y_test_pred_debiased)
                debiased_balanced_accuracy = balanced_accuracy(y_test_true, y_test_pred_debiased)
                debiased_disparate_impact = disparate_impact(y_test_true, y_test_pred_debiased, protected_test)
                debiased_equal_opportunity_difference = equal_opportunity_difference(y_test_true, y_test_pred_debiased, protected_test)
                debiased_average_odds_difference = average_odds_difference(y_test_true, y_test_pred_debiased, protected_test)
                # These must use the debiased predictions, not the plain ones
                debiased_precision = precision(y_test_true, y_test_pred_debiased, average='macro')  # Use 'macro' for multi-class
                debiased_recall = recall(y_test_true, y_test_pred_debiased, average='macro')        # Use 'macro' for multi-class
                debiased_f1 = f1(y_test_true, y_test_pred_debiased, average='macro')                # Use 'macro' for multi-class

                save_metrics_to_csv(metrics_filename, file, 'Debiased Model', debiased_classification_accuracy, debiased_balanced_accuracy, debiased_disparate_impact, debiased_equal_opportunity_difference, debiased_average_odds_difference,debiased_precision,debiased_recall,debiased_f1)

                # Save the debiased model as ONNX; the input width follows the data
                input_shape = X_train.shape[1:]
                save_model_onnx(classifier_model, input_shape, output_path)

            except Exception as e:
                print(f"Failed to convert {file}. Error: {e}")

## Adversarial Debiasing Process For Multiple Runs

In [None]:
import matplotlib.pyplot as plt
from IPython.display import clear_output
from tensorflow.keras.callbacks import EarlyStopping

if multiple_runs:
    # Number of runs
    num_runs = 10

    # Early stopping callback.
    # NOTE: the manual training loop below only reads `early_stopping.patience`;
    # the callback is never passed to Keras, so best weights are NOT restored.
    early_stopping = EarlyStopping(monitor='val_loss', patience=35, restore_best_weights=True)

    # Directory paths
    input_directory = './adult/adult_keras'
    output_directory = './adult/adult_debiased_onnx'

    # Ensure the output directory exists
    os.makedirs(output_directory, exist_ok=True)

    # Load and preprocess the data
    X_train, X_test, y_train, y_test, protected_train, protected_test = load_adult_adf()

    # Scale the features to [0, 1]; the scaler is fitted on the training split only
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    metrics_filename = './model_metrics/adult_model_metrics_multiple_runs.csv'
    # Opening the CSV in append mode does not create its directory
    os.makedirs(os.path.dirname(metrics_filename), exist_ok=True)

    # List of models to process (matched as substrings of the file name)
    model_list = ['AC-1', 'AC-4', 'AC-5', 'AC-10', 'AC-3']

    metric_names = [
        'classification_accuracy',
        'balanced_accuracy',
        'disparate_impact',
        'equal_opportunity_difference',
        'average_odds_difference',
        'precision',
        'recall',
        'f1',
    ]

    # Iterate over all .keras files in the input directory to convert to ONNX file
    for file in os.listdir(input_directory):
        if file.endswith('.keras') and any(model in file for model in model_list):
            input_path = os.path.join(input_directory, file)
            output_path = os.path.join(output_directory, file.replace('.keras', '.onnx'))

            # A failure on one model is reported and the loop moves on to the next file
            try:
                # One list of per-run values for every metric
                plain_metrics = {name: [] for name in metric_names}
                debiased_metrics = {name: [] for name in metric_names}

                # Inside the loop for each .keras file
                for run in range(num_runs):
                    print(f"Run {run + 1}/{num_runs}")

                    # A different seed per run gives run-to-run variability
                    np.random.seed(run)
                    tf.random.set_seed(run)

                    # Shuffle the training data
                    indices = np.arange(X_train.shape[0])
                    np.random.shuffle(indices)
                    X_train_shuffled = X_train[indices]
                    y_train_shuffled = y_train[indices]

                    # Load the model
                    classifier_model = load_model(input_path)

                    # Compile the model
                    classifier_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

                    # Train the model with shuffled data
                    classifier_model.fit(X_train_shuffled, y_train_shuffled, epochs=10, batch_size=128, verbose=0)

                    # Evaluate the plain model; labels are one-hot, hence argmax
                    y_test_pred_plain = classifier_model.predict(X_test).argmax(axis=1)
                    y_test_true = y_test.argmax(axis=1)

                    plain_metrics['classification_accuracy'].append(classification_accuracy(y_test_true, y_test_pred_plain))
                    plain_metrics['balanced_accuracy'].append(balanced_accuracy(y_test_true, y_test_pred_plain))
                    plain_metrics['disparate_impact'].append(disparate_impact(y_test_true, y_test_pred_plain, protected_test))
                    plain_metrics['equal_opportunity_difference'].append(equal_opportunity_difference(y_test_true, y_test_pred_plain, protected_test))
                    plain_metrics['average_odds_difference'].append(average_odds_difference(y_test_true, y_test_pred_plain, protected_test))
                    plain_metrics['precision'].append(precision_score(y_test_true, y_test_pred_plain, average='macro', zero_division=1))
                    plain_metrics['recall'].append(recall_score(y_test_true, y_test_pred_plain, average='macro'))
                    plain_metrics['f1'].append(f1_score(y_test_true, y_test_pred_plain, average='macro'))

                    # Build and compile the adversary model.
                    # build_adversary_model must already be defined when this cell runs.
                    adversary_model = build_adversary_model(classifier_model.output_shape[1:])

                    # Training parameters
                    num_epochs = 500
                    batch_size = 128
                    initial_learning_rate = 0.001
                    adversary_loss_weight_initial = 0.1
                    adversary_loss_weight_final = 0.7

                    # Learning rate scheduler
                    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
                        initial_learning_rate,
                        decay_steps=10000,
                        decay_rate=0.96,
                        staircase=True
                    )

                    # Optimizers, both driven by the learning rate scheduler
                    classifier_optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
                    adversary_optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

                    # Loss functions
                    classification_loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
                    adversary_loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=False)

                    best_val_loss = float('inf')
                    epochs_since_best = 0

                    # Training loop
                    for epoch in range(num_epochs):
                        # Shuffle the training data
                        indices = np.arange(X_train.shape[0])
                        np.random.shuffle(indices)

                        # Linearly increase the adversary loss weight over epochs
                        adversary_loss_weight = adversary_loss_weight_initial + \
                                                (adversary_loss_weight_final - adversary_loss_weight_initial) * (epoch / num_epochs)

                        # Mini-batch training
                        for start in range(0, X_train.shape[0], batch_size):
                            end = min(start + batch_size, X_train.shape[0])
                            batch_indices = indices[start:end]

                            X_batch = X_train[batch_indices]
                            y_batch = y_train[batch_indices]
                            protected_batch = protected_train[batch_indices].reshape(-1, 1)

                            # Step 1: update the classifier. It minimises its own loss while
                            # maximising the adversary's loss (hence the minus sign).
                            with tf.GradientTape() as classifier_tape:
                                # Forward pass through the classifier
                                classifier_predictions = classifier_model(X_batch, training=True)

                                # Adversary in inference mode; its weights are not updated in this step
                                adversary_predictions = adversary_model(classifier_predictions, training=False)

                                # Compute losses
                                classification_loss = classification_loss_fn(y_batch, classifier_predictions)
                                adversary_loss = adversary_loss_fn(protected_batch, adversary_predictions)

                                total_loss = classification_loss - adversary_loss_weight * adversary_loss

                            # Compute gradients and update classifier weights
                            classifier_gradients = classifier_tape.gradient(total_loss, classifier_model.trainable_variables)
                            classifier_optimizer.apply_gradients(zip(classifier_gradients, classifier_model.trainable_variables))

                            # Step 2: update the adversary against the just-updated classifier
                            with tf.GradientTape() as adversary_tape:
                                # Forward pass through the classifier
                                classifier_predictions = classifier_model(X_batch, training=True)

                                # Forward pass through the adversary
                                adversary_predictions = adversary_model(classifier_predictions, training=True)

                                # Compute adversary loss
                                adversary_loss = adversary_loss_fn(protected_batch, adversary_predictions)

                            # Compute gradients and update adversary weights
                            adversary_vars = adversary_model.trainable_variables
                            adversary_gradients = adversary_tape.gradient(adversary_loss, adversary_vars)
                            adversary_optimizer.apply_gradients(zip(adversary_gradients, adversary_vars))

                        # Losses shown are those of the last mini-batch of the epoch
                        print(f"Epoch {epoch + 1}/{num_epochs}, Classification Loss: {classification_loss.numpy()}, Adversary Loss: {adversary_loss.numpy()}")

                        # Evaluate on validation set.
                        # NOTE(review): the test split doubles as the validation set for
                        # early stopping, so the reported test metrics are not fully
                        # independent -- consider a separate validation split.
                        val_predictions = classifier_model.predict(X_test)
                        val_loss = classification_loss_fn(y_test, val_predictions).numpy()

                        # Early stopping check on the classification loss only
                        if val_loss < best_val_loss:
                            best_val_loss = val_loss
                            epochs_since_best = 0
                        else:
                            epochs_since_best += 1
                            if epochs_since_best >= early_stopping.patience:
                                print(f"Early stopping at epoch {epoch + 1}")
                                break

                    # Evaluate the debiased model (weights as of the last completed epoch)
                    y_test_pred_debiased = classifier_model.predict(X_test).argmax(axis=1)

                    debiased_metrics['classification_accuracy'].append(classification_accuracy(y_test_true, y_test_pred_debiased))
                    debiased_metrics['balanced_accuracy'].append(balanced_accuracy(y_test_true, y_test_pred_debiased))
                    debiased_metrics['disparate_impact'].append(disparate_impact(y_test_true, y_test_pred_debiased, protected_test))
                    debiased_metrics['equal_opportunity_difference'].append(equal_opportunity_difference(y_test_true, y_test_pred_debiased, protected_test))
                    debiased_metrics['average_odds_difference'].append(average_odds_difference(y_test_true, y_test_pred_debiased, protected_test))
                    debiased_metrics['precision'].append(precision_score(y_test_true, y_test_pred_debiased, average='macro', zero_division=1))
                    debiased_metrics['recall'].append(recall_score(y_test_true, y_test_pred_debiased, average='macro'))
                    debiased_metrics['f1'].append(f1_score(y_test_true, y_test_pred_debiased, average='macro'))

                # Calculate mean and std for plain metrics
                plain_means = {key: np.mean(values) for key, values in plain_metrics.items()}
                plain_stds = {key: np.std(values) for key, values in plain_metrics.items()}

                # Calculate mean and std for debiased metrics
                debiased_means = {key: np.mean(values) for key, values in debiased_metrics.items()}
                debiased_stds = {key: np.std(values) for key, values in debiased_metrics.items()}

                # Save metrics to CSV
                save_metrics_to_csv_mr(metrics_filename, file, 'Plain Model', plain_means, plain_stds)
                save_metrics_to_csv_mr(metrics_filename, file, 'Debiased Model', debiased_means, debiased_stds)

                # Save the debiased model of the final run as ONNX. The input width is
                # taken from the data; the previous hard-coded (20,) did not match these
                # 13-feature models.
                input_shape = X_train.shape[1:]
                save_model_onnx(classifier_model, input_shape, output_path)

            except Exception as e:
                print(f"Failed to convert {file}. Error: {e}")