## Pre-processing Mitigation Methods: AIF360



In [17]:
import numpy as np
from numpy.random import choice as np_choice
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
from IPython.display import Markdown, display
# from sklearn.decomposition import PCA

import fairness_helpers as fh
import global_variables as gv
from xgboost import XGBClassifier
import utilities
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, QuantileTransformer, RobustScaler, StandardScaler,MinMaxScaler

from aif360.algorithms.preprocessing import DisparateImpactRemover
from aif360.algorithms.preprocessing.reweighing import Reweighing
from aif360.datasets import StandardDataset, BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric

#### get data

In [2]:
X, X1 = fh.get_aif360_data()

In [3]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,1319-0.0,1408-0.0,1329-0.0,1448-0.0,1538-0.0,6142-0.0,2050-0.0,1508-0.0,1339-0.0,30710-0.0,1349-0.0,30750-0.0,1468-0.0,20117-0.0,30740-0.0,1160-0.0,2090-0.0,31-0.0,1488-0.0,30850-0.0,4080-0.0,1369-0.0,21000-0.0,1200-0.0,1289-0.0,30790-0.0,845-0.0,48-0.0,30630-0.0,1299-0.0,1220-0.0,1548-0.0,1528-0.0,23099-0.0,49-0.0,30690-0.0,1389-0.0,2654-0.0,1249-0.0,1309-0.0,1379-0.0,1239-0.0,21003-0.0,30780-0.0,1438-0.0,30870-0.0,1359-0.0,30770-0.0,21001-0.0,1458-0.0,23100-0.0,6138-0.0,1418-0.0,1478-0.0,4079-0.0,30760-0.0,23101-0.0,2100-0.0,1428-0.0,30640-0.0,hypertension,CVD,sex-binary,race-binary,race-grouped,age-binary
sex-binary,race-binary,race-grouped,age-binary,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1
0,1,0,1,0.0,1.0,2.0,3.0,2.0,1.0,2.0,3.0,2.0,0.3400,1.0,34.937,3.0,2.0,5.62200,7.0,1.0,0.0,6.00,0.50800,110.0,1.0,1001.0,3.0,6.0,54.4035,20.90,74.0,1.5930,10.0,0.0,2.0,2.00,35.6,102.0,6.47700,1.0,6.0,1.0,2.0,1.0,0.0,54.0,3.88800,10.0,0.97700,2.0,26.33900,24.5790,3.86,25.0,1.0,3.0,1.0,77.0,1.70600,45.2,1.0,0.0,1.21100,0,1,0,1,0,1
1,1,0,1,0.0,3.0,2.0,1.0,0.0,1.0,1.0,2.0,2.0,3.9400,4.0,40.900,5.0,2.0,5.05200,9.0,0.0,1.0,2.00,13.08800,166.0,2.0,1001.0,2.0,2.0,15.4000,16.00,120.0,1.3900,2.0,0.0,2.0,2.47,36.5,113.0,5.51200,1.0,7.0,1.0,1.0,2.0,0.0,65.0,3.52000,12.0,2.35800,3.0,10.70100,35.0861,7.00,42.9,3.0,2.0,1.0,91.0,1.17300,74.6,0.0,1.0,1.01900,1,0,1,1,0,1
0,1,0,1,0.0,3.0,3.0,2.0,1.0,2.0,1.0,2.0,2.0,0.5500,1.0,40.000,1.0,0.0,5.31000,5.0,0.0,0.0,0.00,0.51500,132.0,1.0,1001.0,3.0,2.0,32.1000,16.00,66.0,2.0050,4.0,0.0,1.0,1.00,29.5,88.0,7.07900,1.0,7.0,3.0,4.0,2.0,0.0,69.0,4.22700,8.0,0.65500,2.0,10.69300,19.3835,7.00,15.2,3.0,2.0,1.0,67.0,2.49000,36.3,0.0,1.0,1.09700,0,0,0,1,0,1
1,1,0,1,3.0,3.0,3.0,3.0,0.0,2.0,1.0,2.0,2.0,0.4500,2.0,37.300,4.0,2.0,4.44900,7.0,0.0,1.0,5.00,4.67500,178.0,2.0,1001.0,1.0,3.0,43.5620,18.00,110.0,1.4740,2.0,0.0,1.0,2.00,28.5,117.0,5.02800,0.0,7.0,1.0,1.0,2.0,1.0,66.0,3.04100,10.0,3.10800,2.0,25.31700,35.1281,7.00,31.7,3.0,2.0,1.0,84.0,1.16900,79.6,0.0,3.0,0.92300,0,0,1,1,0,1
1,1,0,0,0.0,3.0,2.0,1.0,0.0,5.0,2.0,2.0,2.0,0.7500,2.0,32.200,1.0,2.0,4.61600,6.0,0.0,1.0,3.04,20.16200,178.0,1.0,1001.0,3.0,1.0,71.1100,22.38,94.0,2.1490,1.0,0.0,2.0,2.00,24.8,100.0,7.95800,1.0,7.0,2.0,1.0,1.0,0.0,48.0,4.98300,8.0,1.17300,1.0,26.52300,25.8866,1.00,20.1,1.0,2.0,1.0,88.0,2.05300,61.0,0.0,3.0,1.44300,0,0,1,1,0,0
1,1,0,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,1,0,1,0.0,3.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,1.4500,3.0,28.000,1.0,2.0,4.50900,7.0,0.0,1.0,0.00,17.13700,147.0,1.0,1001.0,2.0,2.0,5.3000,16.00,110.0,1.3450,0.0,0.0,2.0,0.00,27.0,113.0,5.75400,1.0,4.0,1.0,2.0,1.0,2.0,63.0,3.84300,4.0,3.82000,2.0,20.77700,30.6094,7.00,29.8,3.0,1.0,3.0,91.0,1.02600,80.7,0.0,3.0,1.13700,0,0,1,1,0,1
0,1,0,1,0.0,1.0,1.0,3.0,2.0,2.0,1.0,1.0,1.0,2.7500,1.0,33.200,1.0,1.0,5.67300,6.0,1.0,0.0,0.00,0.72500,148.0,2.0,1001.0,3.0,4.0,5.0900,16.00,102.0,1.3650,3.0,0.0,3.0,3.00,46.3,110.0,4.66400,2.0,7.0,1.0,6.0,2.0,0.0,64.0,2.86300,4.0,2.13400,3.0,23.64700,31.4652,7.00,38.7,3.0,3.0,2.0,73.0,1.15900,44.9,0.0,1.0,0.89800,1,0,0,1,0,1
1,1,0,0,2.0,1.0,1.0,2.0,2.0,1.0,4.0,2.0,2.0,0.8500,3.0,31.500,3.0,2.0,4.85100,7.0,1.0,1.0,20.00,12.04000,133.0,2.0,1001.0,2.0,2.0,37.5575,22.08,99.0,1.2410,2.0,1.0,3.0,1.00,25.7,98.0,5.18400,1.0,7.0,1.0,2.0,2.0,0.0,43.0,3.35800,20.0,2.54700,3.0,22.50700,28.6196,10.00,23.6,1.0,1.0,1.0,81.0,1.04300,68.1,1.0,1.0,0.98500,1,1,1,1,0,0
0,1,0,1,3.0,3.0,1.0,3.0,0.0,1.0,1.0,2.0,2.0,3.5700,2.0,45.700,1.0,2.0,5.50600,6.0,0.0,0.0,4.00,1.18648,138.0,1.0,1001.0,3.0,4.0,27.4800,19.64,86.0,1.6690,2.0,0.0,2.0,2.00,41.3,111.0,6.82700,1.0,7.0,4.0,1.0,1.0,0.0,56.0,4.20000,12.0,2.09000,3.0,30.49800,27.3702,7.00,32.7,1.0,2.0,1.0,83.0,1.58900,46.4,0.0,0.0,1.06300,0,0,0,1,0,1


#### Step 1. Convert Pandas DataFrame into [BinaryLabelDataset datatype](https://aif360.readthedocs.io/en/latest/modules/generated/aif360.datasets.BinaryLabelDataset.html#aif360.datasets.BinaryLabelDataset)

In [4]:
dataset1b = BinaryLabelDataset(df=X1, 
                          label_names=['CVD'], 
                          protected_attribute_names=['sex-binary'])

dataset2b = BinaryLabelDataset(df=X1, 
                          label_names=['CVD'], 
                          protected_attribute_names=['race-binary'])

dataset3b = BinaryLabelDataset(df=X1, 
                          label_names=['CVD'], 
                          protected_attribute_names=['age-binary'])

#### XGBoost Model

In [5]:
df = pd.read_csv('data/entire_imputed.csv')
pd.set_option('display.max_columns', None)
df.drop('Unnamed: 0', axis=1, inplace=True)

X_train, X_val, X_test, y_train, y_val, y_test = utilities.process_features(df, 'CVD', QuantileTransformer(output_distribution='uniform'), one_hot=True, val=True)
X_train, y_train= utilities.resample_data(X_train, y_train, 'under')

n_estimators = 1000

clf_xgb = XGBClassifier(max_depth=8,
    learning_rate=0.01,
    n_estimators=n_estimators,
    verbosity=0,
    silent=None,
    objective="binary:logistic",
    booster='gbtree',
    n_jobs=-1,
    nthread=None,
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=0.7,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.5,
    random_state=0,
    seed=None,)

clf_xgb.fit(X_train, y_train,
            eval_set=[(X_val, y_val)],
            early_stopping_rounds=40,
            verbose=20)

`early_stopping_rounds` in `fit` method is deprecated for better compatibility with scikit-learn, use `early_stopping_rounds` in constructor or`set_params` instead.


[0]	validation_0-logloss:0.69122
[20]	validation_0-logloss:0.65926
[40]	validation_0-logloss:0.63727
[60]	validation_0-logloss:0.62187
[80]	validation_0-logloss:0.61095
[100]	validation_0-logloss:0.60308
[120]	validation_0-logloss:0.59757
[140]	validation_0-logloss:0.59357
[160]	validation_0-logloss:0.59081
[180]	validation_0-logloss:0.58878
[200]	validation_0-logloss:0.58708
[220]	validation_0-logloss:0.58596
[240]	validation_0-logloss:0.58508
[260]	validation_0-logloss:0.58449
[280]	validation_0-logloss:0.58400
[300]	validation_0-logloss:0.58349
[320]	validation_0-logloss:0.58319
[340]	validation_0-logloss:0.58296
[360]	validation_0-logloss:0.58269
[380]	validation_0-logloss:0.58247
[400]	validation_0-logloss:0.58235
[420]	validation_0-logloss:0.58222
[440]	validation_0-logloss:0.58221
[460]	validation_0-logloss:0.58217
[480]	validation_0-logloss:0.58191
[500]	validation_0-logloss:0.58176
[520]	validation_0-logloss:0.58180
[540]	validation_0-logloss:0.58169
[560]	validation_0-logloss

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.01, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=8, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=1000,
              n_jobs=-1, nthread=-1, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0, ...)

In [7]:
y_pred_proba = clf_xgb.predict_proba(X_test)[:,1]

In [10]:
from sklearn.model_selection import train_test_split

y = X1['CVD']
X_train, X_test, y_train, y_test = train_test_split(X1.drop('CVD', axis=1), y, test_size=0.2, shuffle=False)
X_train, X_val, y_train, y_val  = train_test_split(X_train, y_train, test_size=0.15, random_state=1)

df_test = pd.concat([X_test, y_test], axis=1)

# split dataframe by sex
women_test = df_test[df_test['sex-binary']==0]
men_test = df_test[df_test['sex-binary']==1]

X_women = women_test.drop(fh.protected_attribute_names+['CVD'], axis=1)
X_men = men_test.drop(fh.protected_attribute_names+['CVD'], axis=1)

y_boost_prob_women = clf_xgb.predict_proba(X_women)[:,1]
y_boost_prob_men = clf_xgb.predict_proba(X_men)[:,1]

In [19]:
thresh_arr_best4, best_bal_acc4, DI4, av_odds4, stat_par4,  eq_opp4, theil_4, result4 = fh.validate_visualize(pd.concat([women_test, men_test], axis=0), np.concatenate([y_boost_prob_women, y_boost_prob_men], axis=0), 'sex-binary', metric='balanced_accuracy')

Threshold corresponding to Best balanced accuracy: 0.4822
Best balanced accuracy: 0.6711
Corresponding abs(1-disparate impact) value: 0.4956
Corresponding average odds difference value: -0.2482
Corresponding statistical parity difference value: -0.2335
Corresponding equal opportunity difference value: -0.2861
Corresponding Theil index value: 0.0897


### Method 1. Disparate Impact Remover

Edits feature values to increase group fairness while preserving rank-ordering. Must specify the <font color='blue'>repair_level</font> variable to indicate how much group distributions should overlap.

We want the individual rankings within their group to be preserved after repair ie if an individual has the highest ranking in group Q, it will still have the highest ranking in group Q after repair


#### Transform dataset for each protected attribute with Disparate Impact Remover

In [20]:
# transform original dataset

di = DisparateImpactRemover(repair_level = 1.0)
dataset_transf_train = di.fit_transform(dataset1b)
transformed = dataset_transf_train.convert_to_dataframe()[0] # sex

In [51]:
dataset_transf_train2 = di.fit_transform(dataset2b)
transformed2 = dataset_transf_train.convert_to_dataframe()[0] # race

In [None]:
dataset_transf_train3 = di.fit_transform(dataset3b)
transformed3 = dataset_transf_train.convert_to_dataframe()[0] # race

#### retrain model for each transformed dataset

In [23]:
X = transformed.drop(['CVD'] , axis=1)
y = transformed.loc[:, 'CVD']


X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=1)
X_train, X_val, y_train, y_val  = train_test_split(X_train, y_train, test_size=0.15, random_state=1)    

X_train, y_train= utilities.resample_data(X_train, y_train, 'under')

In [25]:
clf_xgb.fit(X_train.drop(['sex-binary', 'race-binary', 'race-grouped', 'age-binary'], axis=1), y_train,
            eval_set=[(X_val.drop(['sex-binary', 'race-binary', 'race-grouped', 'age-binary'], axis=1), y_val)],
            early_stopping_rounds=40,
            verbose=20)

[0]	validation_0-logloss:0.69126
[20]	validation_0-logloss:0.65998
[40]	validation_0-logloss:0.63846
[60]	validation_0-logloss:0.62333
[80]	validation_0-logloss:0.61263
[100]	validation_0-logloss:0.60479
[120]	validation_0-logloss:0.59928
[140]	validation_0-logloss:0.59530
[160]	validation_0-logloss:0.59233
[180]	validation_0-logloss:0.59019
[200]	validation_0-logloss:0.58863
[220]	validation_0-logloss:0.58755
[240]	validation_0-logloss:0.58668
[260]	validation_0-logloss:0.58599
[280]	validation_0-logloss:0.58535
[300]	validation_0-logloss:0.58491
[320]	validation_0-logloss:0.58470
[340]	validation_0-logloss:0.58443
[360]	validation_0-logloss:0.58414
[380]	validation_0-logloss:0.58381
[400]	validation_0-logloss:0.58360
[420]	validation_0-logloss:0.58350
[440]	validation_0-logloss:0.58328
[460]	validation_0-logloss:0.58316
[480]	validation_0-logloss:0.58317
[500]	validation_0-logloss:0.58312
[520]	validation_0-logloss:0.58295
[540]	validation_0-logloss:0.58306
[556]	validation_0-logloss

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.01, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=8, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=1000,
              n_jobs=-1, nthread=-1, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0, ...)

### make predictions

In [31]:
y_boost_prob_women = clf_xgb.predict_proba(X_women)[:,1]
y_boost_prob_men = clf_xgb.predict_proba(X_men)[:,1]

thresh_arr_best4, best_bal_acc4, DI4, av_odds4, stat_par4,  eq_opp4, theil_4, result = fh.validate_visualize(pd.concat([women_test, men_test], axis=0), np.concatenate([y_boost_prob_women, y_boost_prob_men], axis=0), 'sex-binary', metric='balanced_accuracy')

Threshold corresponding to Best balanced accuracy: 0.5912
Best balanced accuracy: 0.7086
Corresponding abs(1-disparate impact) value: 0.5945
Corresponding average odds difference value: -0.2647
Corresponding statistical parity difference value: -0.2793
Corresponding equal opportunity difference value: -0.2713
Corresponding Theil index value: 0.0836


In [None]:
# get accuracies

acc_og =
acc_int = 

#### compare Disparate Impact before and after intervention

In [50]:
result_og_0_6 = result4.iloc[33,:]
result_int_0_6  = result.iloc[33,:]

full_results = pd.DataFrame(columns = ['model', 'protected_attribute', 'intervention']+result.columns.to_list())
full_results.loc[len(full_results.index)] = result_og_0_6
full_results.loc[len(full_results.index)] = result_int_0_6

full_results.iloc[0, :3] = ['XGBoost', 'sex', 'original']
full_results.iloc[1, :3] = ['XGBoost', 'sex', 'DI Remover']

full_results

Unnamed: 0,model,protected_attribute,intervention,threshold,sensitivity,specificity,Balanced Accuracy,Avg Odds Difference,Disparate Impact,Statistical Parity Difference,Equal Opportunity Difference,Theil Index
0,XGBoost,sex,original,0.609388,0.389306,0.872059,0.630683,-0.263849,0.245853,-0.194606,-0.363522,0.094108
1,XGBoost,sex,DI Remover,0.609388,0.670389,0.742715,0.706552,-0.259937,0.393563,-0.267514,-0.274989,0.084304


In [40]:
result_og_0_6 = result4.iloc[33,:]
result_int_0_6  = result.iloc[33,:]

df = pd.concat([result_og_0_6, result_int_0_6], axis=0)

plt.figure(figsize=(4,3))
ax = sns.barplot(x='protected attribute', y='Disparate Impact', data=sex_boost_df, color='grey', zorder=10)
plt.ylim((0, 1.5))
plt.xlim((-0.7, 1.5))
plt.xlabel('')
plt.ylabel('')
ax.spines['left'].set_color('grey')
ax.spines['bottom'].set_color('grey')
ax.spines['bottom'].set_position(('data',1))
ax.text(1.6, 0.5, 'Bias', color='red', fontsize=14)
ax.text(1.6, 0.95, 'Fair', color='#515A5A', fontsize=14)
plt.yticks(fontsize=13)
plt.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=False)
plt.axhspan(0, 0.8, facecolor='red', alpha=0.04)
plt.axhspan(0.8, 1.25, facecolor='#C0C0C0', alpha=0.4, zorder=5)
plt.title('Disparate Impact', loc='left', y=1.05, fontsize=16, fontweight='bold')
for i in ax.containers:
    ax.bar_label(i, fontsize=14)

threshold                        0.609388
sensitivity                      0.389306
specificity                      0.872059
Balanced Accuracy                0.630683
Avg Odds Difference             -0.263849
Disparate Impact                 0.245853
Statistical Parity Difference   -0.194606
Equal Opportunity Difference    -0.363522
Theil Index                      0.094108
Name: 0, dtype: float64

In [36]:
og_results = pd.read_csv('results_tables/eval_manual_threshold_1.csv').iloc[:,1:]
og_results

Unnamed: 0,model,threshold,sensitivity,specificity,Balanced Accuracy,Avg Odds Difference,Disparate Impact,Statistical Parity Difference,Equal Opportunity Difference,Theil Index
0,MLP,0.500408,0.681125,0.670772,0.675949,-0.27701,0.449364,-0.28447,-0.288023,0.087927
1,MLP,0.500408,0.681125,0.670772,0.675949,-0.424157,0.132087,-0.3965,-0.4781,0.087927
2,MLP,0.500408,0.681125,0.670772,0.675949,-0.116429,0.670257,-0.121462,-0.128103,0.087927
3,XGBoost,0.609388,0.432253,0.876578,0.654415,-0.184388,0.372229,-0.144964,-0.251267,0.088873
4,XGBoost,0.609388,0.432253,0.876578,0.654415,-0.226962,0.116936,-0.170296,-0.312152,0.088873
5,XGBoost,0.609388,0.432253,0.876578,0.654415,-0.069423,0.566454,-0.067487,-0.086812,0.088873
6,TabNet,0.609388,0.30481,0.897437,0.601123,-0.160384,0.354731,-0.120626,-0.22094,0.097189
7,TabNet,0.609388,0.30481,0.897437,0.601123,-0.155448,0.144813,-0.130475,-0.197679,0.097189
8,TabNet,0.609388,0.30481,0.897437,0.601123,-0.117227,0.454571,-0.068157,-0.181938,0.097189


In [None]:
p, u =  fh.get_att_privilege_groups('sex-binary')

dataset_pred = BinaryLabelDataset(df=pd.concat([X_test, y_test], axis=1), 
                          label_names=['CVD'], 
                          protected_attribute_names=['sex-binary'])

dataset_pred.labels = y_pred

cm = BinaryLabelDatasetMetric(dataset_pred, privileged_groups=p, unprivileged_groups=u)
print(cm.disparate_impact())

In [None]:
# retrain model after processing

DIs = []
for level in np.linspace(0., 1., 11):
    di = DisparateImpactRemover(repair_level=level)
    
    # transform original dataset
    dataset_transf_train = di.fit_transform(dataset1b)
    transformed = dataset_transf_train.convert_to_dataframe()[0]
    tranformed_df = transformed.drop(['sex-binary', 'race-binary', 'race-grouped', 'age-binary'], axis=1)
    X_train, X_val, X_test, y_train, y_val, y_test = utilities.process_features(transformed_df, 'CVD', QuantileTransformer(output_distribution='uniform'), one_hot=True, val=True)
    X_train, y_train= utilities.resample_data(X_train, y_train, 'under')

    clf_xgb.fit(X_train, y_train,
                eval_set=[(X_val, y_val)],
                early_stopping_rounds=40,
                verbose=20)
    
    y_pred_proba = clf_xgb.predict_proba(X_test)[:,1]
    y_pred = np.where(y_pred_proba > threshold, 1,0)
    
    p, u =  get_att_privilege_groups('sex-binary')

    dataset_pred = transformed.copy()
    dataset_pred.labels = y_pred

    cm = BinaryLabelDatasetMetric(dataset_pred, privileged_groups=p, unprivileged_groups=u)
    DIs.append(cm.disparate_impact())

  0%|                                                                                           | 0/11 [00:00<?, ?it/s]

In [None]:
# bias evaluation
%matplotlib notebook

plt.plot(np.linspace(0, 1, 11), DIs, marker='o')
plt.plot([0, 1], [1, 1], 'g')
plt.plot([0, 1], [0.8, 0.8], 'r')
plt.ylim([0.4, 1.2])
plt.ylabel('Disparate Impact (DI)')
plt.xlabel('repair level')
plt.show()

### Method 2. Learning Fair Representations

Finds the latent representation which encodes the data well but obfuscated information about protected attributes

### Method 3. Optimized Preprocessing

Learns a probabilistic transformation that edits the features and labels in the data with group fairness, individual distortion, and dad fidelity constraints and objectives

### Method 4. Reweighing 

Weights the examples in each (group, label) combination differently to ensure fairness before classification

In [None]:
RW = Reweighing(unprivileged_groups=unprivileged_groups,
               privileged_groups=privileged_groups)
RW.fit(dataset_orig_train)
dataset_transf_train = RW.transform(dataset1b)