In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.optimizers import Adam

from pathlib import Path
import itertools

2024-07-14 18:32:32.588022: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
path = Path(Path.cwd())
file_path = path / 'dataset' / 'adult_income.data'

In [3]:
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
                'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
                'hours_per_week', 'native_country', 'income']

df = pd.read_csv(file_path, names=column_names, sep=',', skipinitialspace=True)
df.replace('?', pd.NA, inplace=True)
df.dropna(inplace=True)

df['income'] = df['income'].apply(lambda x: 1 if x == '>50K' else 0)
X = df.drop('income', axis=1)
y = df['income']

categorical_columns = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
continuous_columns = [col for col in X.columns if col not in categorical_columns]

X = pd.get_dummies(X, columns=categorical_columns)
one_hot_encoded_columns = [col for col in X.columns if col not in continuous_columns]

scaler = StandardScaler()
X_continuous = pd.DataFrame(scaler.fit_transform(X[continuous_columns]), index=X.index, columns=continuous_columns)
X = X_continuous.join(X[one_hot_encoded_columns])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [8]:
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=0) 
loss, accuracy = model.evaluate(X_train, y_train, verbose=0)
print(f'Train: The loss is {loss} and the accuracy is {accuracy}')

model.fit(X_test, y_test, epochs=10, batch_size=32, validation_split=0.2, verbose=0) 
loss, accuracy = model.evaluate(X_train, y_train, verbose=0)
print(f'Test: The loss is {loss} and the accuracy is {accuracy}')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Train: The loss is 0.2926315367221832 and the accuracy is 0.8659289479255676
Test: The loss is 0.312509149312973 and the accuracy is 0.8561896681785583


In [5]:
y_pred = model.predict(X_train)
y_pred = (y_pred > 0.5).astype(int)
y_pred = y_pred.flatten()
y_pred = pd.Series(y_pred, index=X_train.index)

[1m755/755[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step


In [6]:
def fairness_calculator(protected_attribute_list, fairness_metric, X, y, y_pred):
    fairness_dictionary = {}

    for protected_attribute in protected_attribute_list:
        protected_attribute_index = X.columns.get_loc(protected_attribute)
        subset_y = []
        subset_y_pred = []
        indices = X.index[X.iloc[:, protected_attribute_index] == 1].to_list() 
        
        for i in indices:
            subset_y.append(y[i])
            subset_y_pred.append(y_pred[i])

        tn, fp, fn, tp = confusion_matrix(subset_y, subset_y_pred).ravel()
        
        if fairness_metric == 'Equal Opportunity':
            tpr = tp / (tp + fn)
            fairness = tpr
        elif fairness_metric == 'Equalized Odds':
            fpr = fp / (fp + tn)
            fairness = fpr
        elif fairness_metric == 'Disparate Impact':
            n = tn + fp + fn + tp
            ppp = (tp + fp) / n
            fairness = ppp
      
        fairness_dictionary[protected_attribute] = fairness

    fairness_difference = abs(fairness_dictionary[protected_attribute_list[0]] - fairness_dictionary[protected_attribute_list[1]])
    return fairness_difference

In [7]:
#testes para diferentes protected_attributes e diferentes metricas de fairness
fairness_metric_vector = ['Equal Opportunity', 'Equalized Odds', 'Disparate Impact']
protected_attribute_vector = ['Sex', 'Race']

complete_combinations = list(itertools.product(fairness_metric_vector, protected_attribute_vector))

start = 0 #state
for index in range(start, len(complete_combinations)):
    print (f'Starting combination: {index+1}')

    combination = complete_combinations[index]
    fairness_metric = combination[0]
    protected_attribute = combination[1]

    if protected_attribute == 'Sex':
        protected_attribute_list = ['sex_Male', 'sex_Female'] 
    elif protected_attribute == 'Race':
        protected_attribute_list = ['race_White', 'race_Black']
    else:
        print('ERROR')

    fairness_difference = fairness_calculator(protected_attribute_list, fairness_metric, X_train, y_train, y_pred)
    print(f'Regarding {protected_attribute}, the {fairness_metric} Difference between {protected_attribute_list[0]} and {protected_attribute_list[1]} is {fairness_difference}')
    print (f'Finishing combination: {index+1}')

Starting combination: 1
Regarding Sex, the Equal Opportunity Difference between sex_Male and sex_Female is 0.005795085152992763
Finishing combination: 1
Starting combination: 2
Regarding Race, the Equal Opportunity Difference between race_White and race_Black is 0.04189832396577042
Finishing combination: 2
Starting combination: 3
Regarding Sex, the Equalized Odds Difference between sex_Male and sex_Female is 0.07945495848947519
Finishing combination: 3
Starting combination: 4
Regarding Race, the Equalized Odds Difference between race_White and race_Black is 0.05492723800524168
Finishing combination: 4
Starting combination: 5
Regarding Sex, the Disparate Impact Difference between sex_Male and sex_Female is 0.18718262345138026
Finishing combination: 5
Starting combination: 6
Regarding Race, the Disparate Impact Difference between race_White and race_Black is 0.135218525012997
Finishing combination: 6
