The code to process German Credit data, edit/train models, and perform adversarial debiasing. 

Necessary libraries for the notebook.

In [1]:
import os
import tensorflow as tf
import tf2onnx
from tensorflow.keras.models import load_model, Model
from tensorflow.keras import layers, models
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score
from keras.utils import to_categorical
from scipy.io import savemat
import numpy as np
import pandas as pd
import warnings
import csv

2024-07-19 15:42:03.915178: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Data Preprocessing

In [28]:
# Custom preprocessing function for the German dataset
def german_custom_preprocessing(df):
    def group_credit_hist(x):
        if x in ['A30', 'A31', 'A32']:
            return 'None/Paid'
        elif x == 'A33':
            return 'Delay'
        elif x == 'A34':
            return 'Other'
        else:
            return 'NA'

    def group_employ(x):
        if x == 'A71':
            return 'Unemployed'
        elif x in ['A72', 'A73']:
            return '1-4 years'
        elif x in ['A74', 'A75']:
            return '4+ years'
        else:
            return 'NA'

    def group_savings(x):
        if x in ['A61', 'A62']:
            return '<500'
        elif x in ['A63', 'A64']:
            return '500+'
        elif x == 'A65':
            return 'Unknown/None'
        else:
            return 'NA'

    def group_status(x):
        if x in ['A11', 'A12']:
            return '<200'
        elif x in ['A13']:
            return '200+'
        elif x == 'A14':
            return 'None'
        else:
            return 'NA'

    status_map = {'A91': 1, 'A93': 1, 'A94': 1, 'A92': 0, 'A95': 0}  # 1: 'male'
    df['sex'] = df['personal_status'].replace(status_map)

    df['credit_history'] = df['credit_history'].apply(lambda x: group_credit_hist(x))
    df['savings'] = df['savings'].apply(lambda x: group_savings(x))
    df['employment'] = df['employment'].apply(lambda x: group_employ(x))
    df['status'] = df['status'].apply(lambda x: group_status(x))

    df.credit.replace([1, 2], [1, 0], inplace=True)

    return df

def load_german():
    filepath = '../data/german/german.data'
    column_names = ['status', 'month', 'credit_history', 'purpose', 'credit_amount', 'savings', 'employment',
                    'investment_as_income_percentage', 'personal_status', 'other_debtors', 'residence_since', 
                    'property', 'age', 'installment_plans', 'housing', 'number_of_credits', 'skill_level', 
                    'people_liable_for', 'telephone', 'foreign_worker', 'credit']
    na_values = []
    df = pd.read_csv(filepath, sep=' ', header=None, names=column_names, na_values=na_values)
    
    df = german_custom_preprocessing(df)
    feat_to_drop = ['personal_status']
    df = df.drop(feat_to_drop, axis=1)
    
    # Encode categorical features
    cat_feat = ['status', 'credit_history', 'purpose', 'savings', 'employment', 'other_debtors', 'property', 
                'installment_plans', 'housing', 'skill_level', 'telephone', 'foreign_worker']
    for col in cat_feat:
        df[col] = LabelEncoder().fit_transform(df[col])
    
    # Encode the target variable
    label_name = 'credit'
    
    X = df.drop(labels=[label_name], axis=1, inplace=False)
    y = df[label_name]
    
    # Extract the protected attribute ('sex')
    protected_attribute = X['sex'].values
    
    # Split the data into training and testing sets
    seed = 42
    X_train, X_test, y_train, y_test, protected_train, protected_test = train_test_split(
        X, y, protected_attribute, test_size=0.15, random_state=seed
    )
    
    # One-hot encode the labels
    y_train = to_categorical(y_train, num_classes=2)
    y_test = to_categorical(y_test, num_classes=2)
    
    return X_train, X_test, y_train, y_test, protected_train, protected_test

# Saves data for use in verification
def load_and_save_german_data():
    X_train, X_test, y_train, y_test, _, _ = load_german()
    
    # Scaling numerical features with MinMaxScaler
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # Prepare data dictionary to save as .mat file
    data_dict = {
        'X': X_test, 
        'y': y_test   
    }
    
    # Save to .mat file for use in MATLAB
    savemat("./processed_data/german_data.mat", data_dict)
    print("Data saved to german_data.mat")

    return X_train, X_test, y_train, y_test

### Model Editing

Method to save the models as onnx files for verification. 

In [29]:
# Function to save the model as ONNX format
def save_model_onnx(model, input_shape, onnx_file_path):
    # Create a dummy input tensor with the correct input shape (batch_size, input_shape)
    dummy_input = tf.random.normal([1] + list(input_shape))

    # Convert the model to ONNX
    model_proto, external_tensor_storage = tf2onnx.convert.from_keras(model, 
                                                                      input_signature=(tf.TensorSpec(shape=[None] + list(input_shape), dtype=tf.float32),),
                                                                      opset=13)
    
    # Save the ONNX model to the specified path
    with open(onnx_file_path, "wb") as f:
        f.write(model_proto.SerializeToString())
    
    print(f"Model has been saved in ONNX format at {onnx_file_path}")

Change the models so they are able to be used in FairNNV. FairNNV cannot handle sigmoid so shift to softmax and adjust final layers. 

In [38]:
# Function to modify a model for multiclass classification
def modify_model_for_multiclass(model_path, num_classes):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)
        model = load_model(model_path)

    # Create a new input layer with the correct shape
    new_input = tf.keras.layers.Input(shape=(20,))
    x = new_input

    # Transfer the layers except the last one
    for layer in model.layers[:-1]:
        x = layer(x)

    # Create a new output layer
    output = tf.keras.layers.Dense(num_classes, activation='softmax', name='new_output')(x)
    
    # Create a new model
    new_model = tf.keras.models.Model(inputs=new_input, outputs=output)
    
    return new_model

# Ensure the save directories exist
model_dir = './german/german_h5'
save_dir = './german/german_keras'
onnx_save_dir = './german/german_onnx'
num_classes = 2

if not os.path.exists(save_dir):
    os.makedirs(save_dir)
if not os.path.exists(onnx_save_dir):
    os.makedirs(onnx_save_dir)

# Modify each model in the directory to remove sigmoid
for model_file in os.listdir(model_dir):
    if model_file.endswith('.h5'):
        model_path = os.path.join(model_dir, model_file)
        new_model = modify_model_for_multiclass(model_path, num_classes)
        
        # Update the model's loss function
        new_model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
        
        # Save the modified model
        save_path = os.path.join(save_dir, model_file.replace('.h5', '.keras'))
        new_model.save(save_path)




Re-train models. 

In [39]:
# Load and preprocess the German dataset
X_train, X_test, y_train, y_test,_,_ = load_german()

for model_file in os.listdir(save_dir):
    if model_file.endswith('.keras'):
        model_path = os.path.join(save_dir, model_file)
        
        try:
            # Load the modified model
            print(f"Loading model {model_file}")
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", category=UserWarning)
                model = load_model(model_path)

            # Reinitialize the optimizer
            model.compile(
                optimizer=Adam(learning_rate=0.001),  # Try a lower learning rate
                loss='categorical_crossentropy', 
                metrics=['accuracy']
            )

            # Fit the model
            print(f"Training model {model_file}")
            history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

            # Evaluate the model
            y_pred = model.predict(X_test)
            y_pred_classes = np.argmax(y_pred, axis=1)
            accuracy = accuracy_score(np.argmax(y_test, axis=1), y_pred_classes)

            print(f"Model {model_file} - Accuracy: {accuracy}")

            # Save the retrained model
            model.save(model_path)
            print(f"Model {model_file} retrained and saved successfully.")

            # Save the model as ONNX
            onnx_save_path = os.path.join(onnx_save_dir, model_file.replace('.keras', '.onnx'))
            save_model_onnx(model, (20,), onnx_save_path)

        except Exception as e:
            print(f"Failed to process {model_file}. Error: {e}")

Loading model GC-1.keras
Training model GC-1.keras
Epoch 1/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.5183 - loss: 4.9311 - val_accuracy: 0.6765 - val_loss: 2.4733
Epoch 2/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6548 - loss: 1.8814 - val_accuracy: 0.6294 - val_loss: 1.1037
Epoch 3/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5923 - loss: 1.1068 - val_accuracy: 0.6353 - val_loss: 0.8916
Epoch 4/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6176 - loss: 1.1034 - val_accuracy: 0.6588 - val_loss: 0.8433
Epoch 5/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6317 - loss: 1.1994 - val_accuracy: 0.6118 - val_loss: 0.8993
Epoch 6/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5920 - loss: 1.1183 - val_accuracy: 0.6059 - val_lo

2024-07-30 11:37:59.989568: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
2024-07-30 11:37:59.989673: I tensorflow/core/grappler/clusters/single_machine.cc:361] Starting new session
2024-07-30 11:38:00.012312: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
2024-07-30 11:38:00.012483: I tensorflow/core/grappler/clusters/single_machine.cc:361] Starting new session


[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.5807 - loss: 3.4194 - val_accuracy: 0.7000 - val_loss: 1.2051
Epoch 2/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6035 - loss: 1.0058 - val_accuracy: 0.6882 - val_loss: 0.9534
Epoch 3/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6312 - loss: 0.8330 - val_accuracy: 0.6941 - val_loss: 0.8367
Epoch 4/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6717 - loss: 0.7081 - val_accuracy: 0.6941 - val_loss: 0.7396
Epoch 5/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6793 - loss: 0.6954 - val_accuracy: 0.6941 - val_loss: 0.7061
Epoch 6/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6825 - loss: 0.6775 - val_accuracy: 0.6941 - val_loss: 0.7100
Epoch 7/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━

2024-07-30 11:38:04.630712: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
2024-07-30 11:38:04.630835: I tensorflow/core/grappler/clusters/single_machine.cc:361] Starting new session
2024-07-30 11:38:04.652946: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
2024-07-30 11:38:04.653084: I tensorflow/core/grappler/clusters/single_machine.cc:361] Starting new session


[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.2925 - loss: 13.3541 - val_accuracy: 0.3000 - val_loss: 6.4780
Epoch 2/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3547 - loss: 5.3977 - val_accuracy: 0.3941 - val_loss: 3.4997
Epoch 3/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4058 - loss: 3.0216 - val_accuracy: 0.4353 - val_loss: 2.3594
Epoch 4/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4553 - loss: 2.0370 - val_accuracy: 0.4412 - val_loss: 1.7477
Epoch 5/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4847 - loss: 1.5126 - val_accuracy: 0.4471 - val_loss: 1.3551
Epoch 6/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5118 - loss: 1.1786 - val_accuracy: 0.4824 - val_loss: 1.0790
Epoch 7/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━

2024-07-30 11:38:09.077505: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
2024-07-30 11:38:09.077602: I tensorflow/core/grappler/clusters/single_machine.cc:361] Starting new session
2024-07-30 11:38:09.097125: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
2024-07-30 11:38:09.097208: I tensorflow/core/grappler/clusters/single_machine.cc:361] Starting new session


[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.2889 - loss: 48.8629 - val_accuracy: 0.3059 - val_loss: 12.0622
Epoch 2/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4571 - loss: 6.3490 - val_accuracy: 0.6941 - val_loss: 0.6790
Epoch 3/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7111 - loss: 0.6769 - val_accuracy: 0.6941 - val_loss: 0.6758
Epoch 4/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7111 - loss: 0.6734 - val_accuracy: 0.6941 - val_loss: 0.6727
Epoch 5/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7111 - loss: 0.6700 - val_accuracy: 0.6941 - val_loss: 0.6695
Epoch 6/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7111 - loss: 0.6665 - val_accuracy: 0.6941 - val_loss: 0.6663
Epoch 7/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━

2024-07-30 11:38:13.902731: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
2024-07-30 11:38:13.902824: I tensorflow/core/grappler/clusters/single_machine.cc:361] Starting new session
2024-07-30 11:38:13.924765: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
2024-07-30 11:38:13.924847: I tensorflow/core/grappler/clusters/single_machine.cc:361] Starting new session


[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.6400 - loss: 0.6912 - val_accuracy: 0.6941 - val_loss: 0.6857
Epoch 2/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7111 - loss: 0.6831 - val_accuracy: 0.6941 - val_loss: 0.6786
Epoch 3/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7111 - loss: 0.6755 - val_accuracy: 0.6941 - val_loss: 0.6720
Epoch 4/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7111 - loss: 0.6684 - val_accuracy: 0.6941 - val_loss: 0.6660
Epoch 5/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7111 - loss: 0.6620 - val_accuracy: 0.6941 - val_loss: 0.6606
Epoch 6/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7111 - loss: 0.6560 - val_accuracy: 0.6941 - val_loss: 0.6556
Epoch 7/50
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━

2024-07-30 11:38:19.480843: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
2024-07-30 11:38:19.480930: I tensorflow/core/grappler/clusters/single_machine.cc:361] Starting new session
2024-07-30 11:38:19.518669: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
2024-07-30 11:38:19.518763: I tensorflow/core/grappler/clusters/single_machine.cc:361] Starting new session
