## Pre-processing Mitigation Methods: AIF360



In [5]:
import numpy as np
from numpy.random import choice as np_choice
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
from IPython.display import Markdown, display
# from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split

import fairness_helpers as fh
import global_variables as gv
from xgboost import XGBClassifier
import utilities
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, QuantileTransformer, RobustScaler, StandardScaler,MinMaxScaler

from aif360.algorithms.preprocessing import DisparateImpactRemover
from aif360.algorithms.preprocessing.reweighing import Reweighing
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric

In [4]:
! pip install aif360
# ! pip install BlackBoxAuditing

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


#### get data

In [7]:
X, X1 = fh.get_aif360_data()

In [8]:
X1

Unnamed: 0,1319-0.0,1408-0.0,1329-0.0,2050-0.0,1339-0.0,30710-0.0,1349-0.0,30750-0.0,30740-0.0,1160-0.0,1488-0.0,30850-0.0,4080-0.0,1369-0.0,1200-0.0,1289-0.0,30790-0.0,845-0.0,48-0.0,30630-0.0,1299-0.0,1220-0.0,1548-0.0,1528-0.0,23099-0.0,49-0.0,30690-0.0,1389-0.0,1249-0.0,1309-0.0,1379-0.0,21003-0.0,30780-0.0,1438-0.0,30870-0.0,1359-0.0,30770-0.0,21001-0.0,1458-0.0,23100-0.0,1418-0.0,1478-0.0,4079-0.0,30760-0.0,23101-0.0,30640-0.0,1428-0.0_1.0,1428-0.0_2.0,1428-0.0_3.0,20117-0.0_1.0,20117-0.0_2.0,2100-0.0_1.0,2654-0.0_4.0,2654-0.0_5.0,2654-0.0_6.0,2654-0.0_7.0,2654-0.0_8.0,2654-0.0_9.0,21000-0.0_2.0,21000-0.0_3.0,21000-0.0_4.0,21000-0.0_5.0,21000-0.0_6.0,21000-0.0_1001.0,21000-0.0_1002.0,21000-0.0_1003.0,21000-0.0_2001.0,21000-0.0_2002.0,21000-0.0_2003.0,21000-0.0_2004.0,21000-0.0_3001.0,21000-0.0_3002.0,21000-0.0_3003.0,21000-0.0_3004.0,21000-0.0_4001.0,21000-0.0_4002.0,21000-0.0_4003.0,1538-0.0_1.0,1538-0.0_2.0,31-0.0_1.0,6138-0.0_2.0,6138-0.0_3.0,6138-0.0_4.0,6138-0.0_5.0,6138-0.0_6.0,2090-0.0_1.0,1508-0.0_2.0,1508-0.0_3.0,1508-0.0_4.0,6142-0.0_2.0,6142-0.0_3.0,6142-0.0_4.0,6142-0.0_5.0,6142-0.0_6.0,6142-0.0_7.0,1468-0.0_2.0,1468-0.0_3.0,1468-0.0_4.0,1468-0.0_5.0,1239-0.0_1.0,1239-0.0_2.0,1448-0.0_2.0,1448-0.0_3.0,1448-0.0_4.0,hypertension_1,sex-binary,race-binary,race-grouped,age-binary,CVD
0,0.0,1.0,2.0,2.0,2.0,-0.494112,1.0,-0.053673,1.096931,0.0,1.000000,-0.121472,-1.12,1.0,3.0,3.86,0.428585,0.659091,-0.855615,0.130081,4.0,0.0,2.0,0.000000,0.380952,0.000000,0.505548,1.0,1.0,0.0,1.0,-0.307692,0.273567,0.000000,-0.478302,2.0,0.758646,-0.376206,-0.279412,0.159292,3.0,1.0,-0.357143,0.511852,-0.306878,0.603390,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1
1,0.0,3.0,2.0,1.0,2.0,1.215453,4.0,1.163265,0.176090,2.0,-0.333333,1.102978,1.12,2.0,2.0,-0.14,-0.458336,-0.454545,1.604278,-0.420054,0.0,0.0,2.0,0.156667,0.452381,1.000000,-0.163662,1.0,1.0,-0.5,2.0,0.538462,-0.066543,0.222222,0.824528,3.0,-1.542752,1.453725,0.490196,1.743363,2.0,1.0,0.642857,-0.525476,1.248677,-0.047458,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0,1,0
2,0.0,3.0,3.0,1.0,2.0,-0.394387,1.0,0.979592,0.592892,-2.0,-1.000000,-0.120790,-0.24,1.0,3.0,-0.14,-0.078586,-0.454545,-1.283422,1.246612,1.0,0.0,1.0,-0.333333,-0.103175,-1.272727,0.923024,1.0,3.0,1.0,2.0,0.846154,0.586876,-0.222222,-0.782075,2.0,-1.543929,-1.281062,0.490196,-0.707965,2.0,1.0,-1.071429,2.037679,-0.777778,0.216949,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0
3,3.0,3.0,3.0,1.0,2.0,-0.441875,2.0,0.428571,-0.798061,0.0,0.666667,0.284115,1.60,2.0,1.0,0.86,0.182055,0.000000,1.069519,-0.192412,0.0,0.0,1.0,0.000000,-0.182540,1.363636,-0.499307,0.0,1.0,-0.5,2.0,0.615385,-0.509242,0.000000,1.532075,2.0,0.608241,1.461040,0.490196,0.752212,2.0,1.0,0.142857,-0.533261,1.513228,-0.372881,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,1,0,1,0
4,0.0,3.0,2.0,2.0,2.0,-0.299411,2.0,-0.612245,-0.528271,-1.0,0.013333,1.791513,1.60,1.0,3.0,-1.14,0.808483,0.995455,0.213904,1.636856,-0.5,0.0,2.0,0.000000,-0.476190,-0.181818,1.532594,1.0,2.0,-0.5,1.0,-0.769231,1.285582,-0.222222,-0.293396,1.0,0.785725,-0.148473,-0.980392,-0.274336,2.0,1.0,0.428571,1.187186,0.529101,1.389831,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502476,0.0,3.0,1.0,1.0,2.0,0.033004,3.0,-1.469388,-0.701131,0.0,-1.000000,1.497080,0.36,1.0,2.0,-0.14,-0.688005,-0.454545,1.069519,-0.542005,-1.0,0.0,2.0,-0.666667,-0.301587,1.000000,0.004161,1.0,1.0,0.0,1.0,0.384615,0.231978,-0.666667,2.203774,2.0,-0.059897,0.674057,0.490196,0.584071,1.0,3.0,0.642857,-0.811568,1.571429,0.352542,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0
502477,0.0,1.0,1.0,1.0,1.0,0.650347,1.0,-0.408163,1.179321,-1.0,-1.000000,-0.100350,0.40,2.0,3.0,1.86,-0.692780,-0.454545,0.641711,-0.487805,0.5,0.0,3.0,0.333333,1.230159,0.727273,-0.751734,2.0,1.0,2.0,2.0,0.461538,-0.673752,-0.666667,0.613208,3.0,0.362472,0.823104,0.490196,1.371681,3.0,2.0,-0.642857,-0.552723,-0.322751,-0.457627,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0
502478,2.0,1.0,1.0,4.0,2.0,-0.251923,3.0,-0.755102,-0.148627,0.0,5.666667,1.000973,-0.20,2.0,2.0,-0.14,0.045515,0.927273,0.481283,-0.823848,0.0,1.0,3.0,-0.333333,-0.404762,-0.363636,-0.391123,1.0,1.0,0.0,2.0,-1.153846,-0.216266,1.111111,1.002830,3.0,0.194702,0.327511,1.225490,0.035398,1.0,1.0,-0.071429,-0.778483,0.904762,-0.162712,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,1,0,0,1
502479,3.0,3.0,1.0,1.0,2.0,1.039747,2.0,2.142857,0.909532,-1.0,0.333333,-0.055433,0.00,1.0,3.0,1.86,-0.183642,0.372727,-0.213904,0.336043,0.0,0.0,2.0,0.000000,0.833333,0.818182,0.748266,1.0,4.0,-0.5,1.0,-0.153846,0.561922,0.222222,0.571698,3.0,1.370714,0.109913,0.490196,0.840708,2.0,1.0,0.071429,0.284146,-0.243386,0.101695,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0


#### Step 1. Convert Pandas DataFrame into [BinaryLabelDataset datatype](https://aif360.readthedocs.io/en/latest/modules/generated/aif360.datasets.BinaryLabelDataset.html#aif360.datasets.BinaryLabelDataset)

In [9]:
dataset1b = BinaryLabelDataset(df=X1, 
                          label_names=['CVD'], 
                          protected_attribute_names=['sex-binary'])

dataset2b = BinaryLabelDataset(df=X1, 
                          label_names=['CVD'], 
                          protected_attribute_names=['race-binary'])

dataset3b = BinaryLabelDataset(df=X1, 
                          label_names=['CVD'], 
                          protected_attribute_names=['age-binary'])

#### XGBoost Model

In [51]:
df = pd.read_csv('/content/gdrive/MyDrive/UB-Masters-Thesis/data/entire_imputed.csv')
pd.set_option('display.max_columns', None)
df.drop('Unnamed: 0', axis=1, inplace=True)
df

Unnamed: 0,1319-0.0,1408-0.0,1329-0.0,1448-0.0,1538-0.0,6142-0.0,2050-0.0,1508-0.0,1339-0.0,30710-0.0,1349-0.0,30750-0.0,1468-0.0,20117-0.0,30740-0.0,1160-0.0,2090-0.0,31-0.0,1488-0.0,30850-0.0,4080-0.0,1369-0.0,21000-0.0,1200-0.0,1289-0.0,30790-0.0,845-0.0,48-0.0,30630-0.0,1299-0.0,1220-0.0,1548-0.0,1528-0.0,23099-0.0,49-0.0,30690-0.0,1389-0.0,2654-0.0,1249-0.0,1309-0.0,1379-0.0,1239-0.0,21003-0.0,30780-0.0,1438-0.0,30870-0.0,1359-0.0,30770-0.0,21001-0.0,1458-0.0,23100-0.0,6138-0.0,1418-0.0,1478-0.0,4079-0.0,30760-0.0,23101-0.0,2100-0.0,1428-0.0,30640-0.0,outcome_myocardial_infarction,outcome_cardiomyopathies,outcome_ischemic_heart_disease,outcome_heart_failure,outcome_peripheral_vascular_disease,outcome_cardiac_arrest,outcome_cerebral_infarction,outcome_arrhythmia,CVD,hypertension
0,0.0,1.0,2.0,3.0,2.0,1.0,2.0,3.0,2.0,0.3400,1.0,34.937,3.0,2.0,5.62200,7.0,1.0,0.0,6.00,0.50800,110.0,1.0,1001.0,3.0,6.0,54.4035,20.90,74.0,1.5930,10.0,0.0,2.0,2.00,35.6,102.0,6.47700,1.0,6.0,1.0,2.0,1.0,0.0,54.0,3.88800,10.0,0.97700,2.0,26.33900,24.5790,3.86,25.0,1.0,3.0,1.0,77.0,1.70600,45.2,1.0,0.0,1.21100,0,0,0,0,0,0,0,1,1,0
1,0.0,3.0,2.0,1.0,0.0,1.0,1.0,2.0,2.0,3.9400,4.0,40.900,5.0,2.0,5.05200,9.0,0.0,1.0,2.00,13.08800,166.0,2.0,1001.0,2.0,2.0,15.4000,16.00,120.0,1.3900,2.0,0.0,2.0,2.47,36.5,113.0,5.51200,1.0,7.0,1.0,1.0,2.0,0.0,65.0,3.52000,12.0,2.35800,3.0,10.70100,35.0861,7.00,42.9,3.0,2.0,1.0,91.0,1.17300,74.6,0.0,1.0,1.01900,1,0,1,0,0,0,0,0,0,1
2,0.0,3.0,3.0,2.0,1.0,2.0,1.0,2.0,2.0,0.5500,1.0,40.000,1.0,0.0,5.31000,5.0,0.0,0.0,0.00,0.51500,132.0,1.0,1001.0,3.0,2.0,32.1000,16.00,66.0,2.0050,4.0,0.0,1.0,1.00,29.5,88.0,7.07900,1.0,7.0,3.0,4.0,2.0,0.0,69.0,4.22700,8.0,0.65500,2.0,10.69300,19.3835,7.00,15.2,3.0,2.0,1.0,67.0,2.49000,36.3,0.0,1.0,1.09700,0,0,0,0,0,0,0,0,0,0
3,3.0,3.0,3.0,3.0,0.0,2.0,1.0,2.0,2.0,0.4500,2.0,37.300,4.0,2.0,4.44900,7.0,0.0,1.0,5.00,4.67500,178.0,2.0,1001.0,1.0,3.0,43.5620,18.00,110.0,1.4740,2.0,0.0,1.0,2.00,28.5,117.0,5.02800,0.0,7.0,1.0,1.0,2.0,1.0,66.0,3.04100,10.0,3.10800,2.0,25.31700,35.1281,7.00,31.7,3.0,2.0,1.0,84.0,1.16900,79.6,0.0,3.0,0.92300,0,0,0,0,0,0,0,0,0,0
4,0.0,3.0,2.0,1.0,0.0,5.0,2.0,2.0,2.0,0.7500,2.0,32.200,1.0,2.0,4.61600,6.0,0.0,1.0,3.04,20.16200,178.0,1.0,1001.0,3.0,1.0,71.1100,22.38,94.0,2.1490,1.0,0.0,2.0,2.00,24.8,100.0,7.95800,1.0,7.0,2.0,1.0,1.0,0.0,48.0,4.98300,8.0,1.17300,1.0,26.52300,25.8866,1.00,20.1,1.0,2.0,1.0,88.0,2.05300,61.0,0.0,3.0,1.44300,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502476,0.0,3.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,1.4500,3.0,28.000,1.0,2.0,4.50900,7.0,0.0,1.0,0.00,17.13700,147.0,1.0,1001.0,2.0,2.0,5.3000,16.00,110.0,1.3450,0.0,0.0,2.0,0.00,27.0,113.0,5.75400,1.0,4.0,1.0,2.0,1.0,2.0,63.0,3.84300,4.0,3.82000,2.0,20.77700,30.6094,7.00,29.8,3.0,1.0,3.0,91.0,1.02600,80.7,0.0,3.0,1.13700,0,0,0,0,0,0,0,0,0,0
502477,0.0,1.0,1.0,3.0,2.0,2.0,1.0,1.0,1.0,2.7500,1.0,33.200,1.0,1.0,5.67300,6.0,1.0,0.0,0.00,0.72500,148.0,2.0,1001.0,3.0,4.0,5.0900,16.00,102.0,1.3650,3.0,0.0,3.0,3.00,46.3,110.0,4.66400,2.0,7.0,1.0,6.0,2.0,0.0,64.0,2.86300,4.0,2.13400,3.0,23.64700,31.4652,7.00,38.7,3.0,3.0,2.0,73.0,1.15900,44.9,0.0,1.0,0.89800,0,0,1,0,0,0,0,0,0,1
502478,2.0,1.0,1.0,2.0,2.0,1.0,4.0,2.0,2.0,0.8500,3.0,31.500,3.0,2.0,4.85100,7.0,1.0,1.0,20.00,12.04000,133.0,2.0,1001.0,2.0,2.0,37.5575,22.08,99.0,1.2410,2.0,1.0,3.0,1.00,25.7,98.0,5.18400,1.0,7.0,1.0,2.0,2.0,0.0,43.0,3.35800,20.0,2.54700,3.0,22.50700,28.6196,10.00,23.6,1.0,1.0,1.0,81.0,1.04300,68.1,1.0,1.0,0.98500,1,0,1,0,0,1,0,1,1,1
502479,3.0,3.0,1.0,3.0,0.0,1.0,1.0,2.0,2.0,3.5700,2.0,45.700,1.0,2.0,5.50600,6.0,0.0,0.0,4.00,1.18648,138.0,1.0,1001.0,3.0,4.0,27.4800,19.64,86.0,1.6690,2.0,0.0,2.0,2.00,41.3,111.0,6.82700,1.0,7.0,4.0,1.0,1.0,0.0,56.0,4.20000,12.0,2.09000,3.0,30.49800,27.3702,7.00,32.7,1.0,2.0,1.0,83.0,1.58900,46.4,0.0,0.0,1.06300,0,0,0,0,0,0,0,0,0,0


In [53]:
df = pd.read_csv('/content/gdrive/MyDrive/UB-Masters-Thesis/data/entire_imputed.csv')
pd.set_option('display.max_columns', None)
df.drop('Unnamed: 0', axis=1, inplace=True)

X_train, X_val, X_test, y_train, y_val, y_test = utilities.process_features(df, 'CVD', QuantileTransformer(output_distribution='uniform'), one_hot=True, val=True)
X_train, y_train= utilities.resample_data(X_train, y_train, 'under')

n_estimators = 1000

clf_xgb = XGBClassifier(max_depth=8,
    learning_rate=0.01,
    n_estimators=n_estimators,
    verbosity=0,
    silent=None,
    objective="binary:logistic",
    booster='gbtree',
    n_jobs=-1,
    nthread=None,
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=0.7,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.5,
    random_state=0,
    seed=None,)

clf_xgb.fit(X_train, y_train,
            eval_set=[(X_val, y_val)],
            early_stopping_rounds=40,
            verbose=20)

[0]	validation_0-error:0.314007
Will train until validation_0-error hasn't improved in 40 rounds.
[20]	validation_0-error:0.303393
[40]	validation_0-error:0.304305
Stopping. Best iteration:
[4]	validation_0-error:0.302862



XGBClassifier(learning_rate=0.01, max_depth=8, n_estimators=1000, n_jobs=-1,
              subsample=0.7, verbosity=0)

### Method 1. Disparate Impact Remover

Edits feature values to increase group fairness while preserving rank-ordering. Must specify the <font color='blue'>repair_level</font> variable to indicate how much group distributions should overlap.

We want the individual rankings within their group to be preserved after repair ie if an individual has the highest ranking in group Q, it will still have the highest ranking in group Q after repair

In [None]:
# build DisparateRemoverObject

# binaryLabelDataset = aif360.datasets.BinaryLabelDataset(
#     favorable_label=1,
#     unfavorable_label=0,
#     df=encoded_df,
#     label_names=['Loan_Status'],
#     protected_attribute_names=['Gender'])

In [11]:
# transform original dataset

di = DisparateImpactRemover(repair_level = 1.0)
dataset_transf_train = di.fit_transform(dataset1b)
transformed = dataset_transf_train.convert_to_dataframe()[0]

In [46]:
    X = transformed.drop(['CVD'] , axis=1)
    y = transformed.loc[:, 'CVD']


    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=1)
    X_train, X_val, y_train, y_val  = train_test_split(X_train, y_train, test_size=0.15, random_state=1)    

    X_train, y_train= utilities.resample_data(X_train, y_train, 'under')

In [42]:
X_test

Unnamed: 0,1319-0.0,1408-0.0,1329-0.0,2050-0.0,1339-0.0,30710-0.0,1349-0.0,30750-0.0,30740-0.0,1160-0.0,1488-0.0,30850-0.0,4080-0.0,1369-0.0,1200-0.0,1289-0.0,30790-0.0,845-0.0,48-0.0,30630-0.0,1299-0.0,1220-0.0,1548-0.0,1528-0.0,23099-0.0,49-0.0,30690-0.0,1389-0.0,1249-0.0,1309-0.0,1379-0.0,21003-0.0,30780-0.0,1438-0.0,30870-0.0,1359-0.0,30770-0.0,21001-0.0,1458-0.0,23100-0.0,1418-0.0,1478-0.0,4079-0.0,30760-0.0,23101-0.0,30640-0.0,1428-0.0_1.0,1428-0.0_2.0,1428-0.0_3.0,20117-0.0_1.0,20117-0.0_2.0,2100-0.0_1.0,2654-0.0_4.0,2654-0.0_5.0,2654-0.0_6.0,2654-0.0_7.0,2654-0.0_8.0,2654-0.0_9.0,21000-0.0_2.0,21000-0.0_3.0,21000-0.0_4.0,21000-0.0_5.0,21000-0.0_6.0,21000-0.0_1001.0,21000-0.0_1002.0,21000-0.0_1003.0,21000-0.0_2001.0,21000-0.0_2002.0,21000-0.0_2003.0,21000-0.0_2004.0,21000-0.0_3001.0,21000-0.0_3002.0,21000-0.0_3003.0,21000-0.0_3004.0,21000-0.0_4001.0,21000-0.0_4002.0,21000-0.0_4003.0,1538-0.0_1.0,1538-0.0_2.0,31-0.0_1.0,6138-0.0_2.0,6138-0.0_3.0,6138-0.0_4.0,6138-0.0_5.0,6138-0.0_6.0,2090-0.0_1.0,1508-0.0_2.0,1508-0.0_3.0,1508-0.0_4.0,6142-0.0_2.0,6142-0.0_3.0,6142-0.0_4.0,6142-0.0_5.0,6142-0.0_6.0,6142-0.0_7.0,1468-0.0_2.0,1468-0.0_3.0,1468-0.0_4.0,1468-0.0_5.0,1239-0.0_1.0,1239-0.0_2.0,1448-0.0_2.0,1448-0.0_3.0,1448-0.0_4.0,hypertension_1,sex-binary,race-binary,race-grouped,age-binary
338440,0.00,2.0,3.0,1.0,1.0,0.555371,2.0,-0.717347,0.346624,-1.05,-0.413333,-0.077024,-0.5232,1.0,3.0,-0.26,-0.589770,1.302273,0.481925,-0.501355,0.000,0.0,2.0,0.040000,0.150794,-0.518182,-1.038141,1.0,1.0,-0.500,1.0,0.384615,-0.893715,-0.767778,-0.266123,2.0,-0.452805,0.446480,-0.980392,0.345133,2.0,2.0,-0.401429,-0.755128,-0.541376,-0.298305,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
155245,0.16,3.0,1.0,2.0,2.0,-0.070709,3.0,-1.244898,-0.335557,2.00,0.233333,-0.077842,-0.7136,3.0,1.0,-0.26,-0.456062,-0.472727,0.122995,-1.189702,0.000,0.0,1.0,0.000000,0.087302,0.636364,1.490291,1.0,2.0,-0.500,1.0,-1.307692,1.813309,-0.160000,0.522642,3.0,0.383076,0.836671,-0.306373,0.646018,2.0,3.0,-0.760000,-1.117123,0.153439,1.708475,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
400029,0.65,3.0,1.0,1.0,2.0,-0.085716,2.0,0.836735,-1.817447,-0.02,0.246667,0.021316,-0.2804,3.0,3.0,-0.14,-0.565893,-0.454545,0.106952,0.478726,-0.075,0.0,2.0,-0.426667,-0.178968,-0.109091,0.539528,1.0,2.0,0.460,1.0,0.153846,0.322551,0.444444,0.553774,3.0,0.468558,-0.164530,-1.039216,0.433982,2.0,4.0,-0.500000,0.239675,-0.259259,0.274576,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
15236,0.58,3.0,3.0,1.0,2.0,0.094501,1.0,-0.259796,-0.865913,-1.05,0.350000,-0.068789,-0.5712,1.0,3.0,-0.26,-0.599321,-0.472727,-0.239037,0.457995,2.000,1.0,1.0,0.000000,-0.166667,0.181818,-0.467406,1.0,4.0,0.000,1.0,-0.307692,-0.655268,-0.521111,-0.116811,2.0,-0.165023,0.254537,0.394608,0.106195,4.0,1.0,-0.401429,0.180997,0.065344,-0.684746,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
379577,1.00,5.0,1.0,1.0,2.0,0.304397,2.0,0.040816,-0.130969,0.90,-0.333333,-0.073272,-0.1200,1.0,3.0,1.86,0.363000,-0.209091,-0.855615,0.642520,2.495,1.0,2.0,-0.426667,-0.197778,-1.181818,0.394792,1.0,4.0,0.995,1.0,-0.461538,0.112717,-0.888889,-0.134547,2.0,-0.190062,-0.891445,0.490196,-1.150885,2.0,3.0,0.214286,0.544899,-0.894180,0.033627,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213021,0.00,1.0,1.0,1.0,2.0,0.260946,1.0,0.632653,-0.137318,-1.00,1.666667,-0.081468,0.2400,1.0,3.0,-0.14,-0.538833,-0.454545,0.748663,-0.746206,-0.585,0.0,2.0,1.333333,0.414365,0.836364,-0.252427,1.0,1.0,1.500,1.0,0.769231,0.387246,0.444444,1.056604,3.0,-1.106549,1.491727,0.490196,0.814159,2.0,1.0,0.490714,-0.904422,-0.111111,0.589831,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
404411,0.00,3.0,3.0,2.0,2.0,-0.090987,4.0,0.508163,0.318643,0.00,0.556667,-0.079497,-0.3332,3.0,2.0,-0.26,-0.696873,0.506818,0.647059,0.728049,-0.500,0.0,2.0,0.333333,-0.309524,0.545455,1.981969,2.0,4.0,0.000,2.0,-1.000000,1.102588,-0.674444,4.481132,3.0,1.046799,0.362508,-0.737745,0.203540,3.0,2.0,-0.566429,0.464190,0.169947,1.252373,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
381024,1.00,1.0,0.0,2.0,1.0,-0.375392,1.0,0.530612,0.195477,-0.02,-1.000000,-0.098598,0.3600,2.0,1.0,-0.14,-0.193420,0.454545,-0.695187,0.270271,0.945,1.0,2.0,0.956667,-0.184683,-0.239273,1.302358,2.0,3.0,0.460,1.0,0.230769,1.427911,-0.888889,0.481538,3.0,0.285651,0.047790,0.000000,-0.041770,3.0,1.0,0.427857,0.448873,-0.529101,1.238644,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
283727,0.00,1.0,1.0,1.0,1.0,-0.589087,2.0,0.591837,0.722132,-1.00,-0.666667,-0.090325,-0.5200,1.0,2.0,-1.17,-0.616603,0.756818,-1.604278,0.274336,-0.065,1.0,2.0,0.736667,-1.554603,-1.545455,-0.644244,1.0,4.0,-0.610,1.0,-0.692308,-0.653420,2.000000,-0.389623,1.0,0.295659,-1.278798,-0.245098,-1.335575,1.0,2.0,0.071429,0.521155,-0.761905,-1.041559,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,1.0


In [47]:
clf_xgb.fit(X_train, y_train,
            eval_set=[(X_val, y_val)],
            early_stopping_rounds=40,
            verbose=20)

[0]	validation_0-error:0.334854
Will train until validation_0-error hasn't improved in 40 rounds.
[20]	validation_0-error:0.309281
[40]	validation_0-error:0.309148
Stopping. Best iteration:
[3]	validation_0-error:0.302531



XGBClassifier(learning_rate=0.01, max_depth=8, n_estimators=1000, n_jobs=-1,
              subsample=0.7, verbosity=0)

In [48]:
y_pred_proba = clf_xgb.predict_proba(X_test)[:,1]
y_pred = np.where(y_pred_proba > 0.6, 1,0)

In [27]:
p, u =  fh.get_att_privilege_groups('sex-binary')

dataset_pred = BinaryLabelDataset(df=pd.concat([X_test, y_test], axis=1), 
                          label_names=['CVD'], 
                          protected_attribute_names=['sex-binary'])

In [28]:
dataset_pred.labels = y_pred

cm = BinaryLabelDatasetMetric(dataset_pred, privileged_groups=p, unprivileged_groups=u)
print(cm.disparate_impact())

0.29401230929725636


In [18]:
# retrain model after processing

DIs = []
for level in np.linspace(0., 1., 11):
    di = DisparateImpactRemover(repair_level=level)
    
    # transform original dataset
    dataset_transf_train = di.fit_transform(dataset1b)
    transformed = dataset_transf_train.convert_to_dataframe()[0]
    X = transformed.drop(['CVD'] , axis=1)
    y = transformed.loc[:, 'CVD']


    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    X_train, X_val, y_train, y_val  = train_test_split(X_train, y_train, test_size=0.15, random_state=1)    

    X_train, y_train= utilities.resample_data(X_train, y_train, 'under')

    X_train = X_train.drop(['sex-binary', 'race-binary', 'race-grouped', 'age-binary'], axis=1)
    clf_xgb.fit(X_train, y_train,
                eval_set=[(X_val.drop(['sex-binary', 'race-binary', 'race-grouped', 'age-binary'], axis=1), y_val)],
                early_stopping_rounds=40,
                verbose=20)
    
    y_pred_proba = clf_xgb.predict_proba(X_test.drop(['sex-binary', 'race-binary', 'race-grouped', 'age-binary'], axis=1))[:,1]
    y_pred = np.where(y_pred_proba > 0.6, 1,0)
    
    p, u =  fh.get_att_privilege_groups('sex-binary')

    dataset_pred = BinaryLabelDataset(df=pd.concat([X_test, y_test], axis=1), 
                              label_names=['CVD'], 
                              protected_attribute_names=['sex-binary'])

    dataset_pred.labels = y_pred

    cm = BinaryLabelDatasetMetric(dataset_pred, privileged_groups=p, unprivileged_groups=u)
    print(cm.disparate_impact())
    DIs.append(cm.disparate_impact())

[0]	validation_0-error:0.313924
Will train until validation_0-error hasn't improved in 40 rounds.
[20]	validation_0-error:0.303327
[40]	validation_0-error:0.304272
Stopping. Best iteration:
[4]	validation_0-error:0.302796



invalid value encountered in double_scalars


nan
[0]	validation_0-error:0.304057
Will train until validation_0-error hasn't improved in 40 rounds.
[20]	validation_0-error:0.306411
[40]	validation_0-error:0.305831
Stopping. Best iteration:
[0]	validation_0-error:0.304057



invalid value encountered in double_scalars


nan
[0]	validation_0-error:0.311022
Will train until validation_0-error hasn't improved in 40 rounds.
[20]	validation_0-error:0.304222
[40]	validation_0-error:0.303675
[60]	validation_0-error:0.302912
[80]	validation_0-error:0.30278
[100]	validation_0-error:0.3021
Stopping. Best iteration:
[74]	validation_0-error:0.301702

0.27498483108224364
[0]	validation_0-error:0.321735
Will train until validation_0-error hasn't improved in 40 rounds.
[20]	validation_0-error:0.304554
[40]	validation_0-error:0.303509
Stopping. Best iteration:
[9]	validation_0-error:0.302481



invalid value encountered in double_scalars


nan
[0]	validation_0-error:0.311801
Will train until validation_0-error hasn't improved in 40 rounds.
[20]	validation_0-error:0.303841
[40]	validation_0-error:0.302166
[60]	validation_0-error:0.302597
[80]	validation_0-error:0.302481
Stopping. Best iteration:
[42]	validation_0-error:0.301055

0.10724987529544418
[0]	validation_0-error:0.304521
Will train until validation_0-error hasn't improved in 40 rounds.
[20]	validation_0-error:0.304073
[40]	validation_0-error:0.303012
[60]	validation_0-error:0.301718
[80]	validation_0-error:0.302531
[100]	validation_0-error:0.301818
Stopping. Best iteration:
[72]	validation_0-error:0.301154

0.28282842148664067
[0]	validation_0-error:0.29404
Will train until validation_0-error hasn't improved in 40 rounds.
[20]	validation_0-error:0.305035
[40]	validation_0-error:0.304604
Stopping. Best iteration:
[0]	validation_0-error:0.29404



invalid value encountered in double_scalars


nan
[0]	validation_0-error:0.348088
Will train until validation_0-error hasn't improved in 40 rounds.
[20]	validation_0-error:0.305533
[40]	validation_0-error:0.305002
Stopping. Best iteration:
[3]	validation_0-error:0.302216



invalid value encountered in double_scalars


nan
[0]	validation_0-error:0.288136
Will train until validation_0-error hasn't improved in 40 rounds.
[20]	validation_0-error:0.305068
[40]	validation_0-error:0.304786
Stopping. Best iteration:
[0]	validation_0-error:0.288136



invalid value encountered in double_scalars


nan
[0]	validation_0-error:0.300093
Will train until validation_0-error hasn't improved in 40 rounds.
[20]	validation_0-error:0.306843
[40]	validation_0-error:0.307672
Stopping. Best iteration:
[0]	validation_0-error:0.300093



invalid value encountered in double_scalars


nan
[0]	validation_0-error:0.315168
Will train until validation_0-error hasn't improved in 40 rounds.
[20]	validation_0-error:0.306047
[40]	validation_0-error:0.30598
[60]	validation_0-error:0.303808
[80]	validation_0-error:0.303542
[100]	validation_0-error:0.303061
[120]	validation_0-error:0.302862
[140]	validation_0-error:0.301818
[160]	validation_0-error:0.302332
[180]	validation_0-error:0.302282
Stopping. Best iteration:
[140]	validation_0-error:0.301818

0.47841980473091367


In [20]:
# bias evaluation

plt.plot(np.linspace(0, 1, 11), DIs, marker='o')
plt.plot([0, 1], [1, 1], 'g')
plt.plot([0, 1], [0.8, 0.8], 'r')
plt.ylim([0.4, 1.2])
plt.title('XGBoost: Disparate Impact Remover')
plt.ylabel('Disparate Impact (DI)')
plt.xlabel('repair level')
# plt.show()

Text(0.5, 15.0, 'repair level')

In [21]:
DIs

[nan,
 nan,
 0.27498483108224364,
 nan,
 0.10724987529544418,
 0.28282842148664067,
 nan,
 nan,
 nan,
 nan,
 0.47841980473091367]

### Method 2. Learning Fair Representations

Finds the latent representation which encodes the data well but obfuscated information about protected attributes

### Method 3. Optimized Preprocessing

Learns a probabilistic transformation that edits the features and labels in the data with group fairness, individual distortion, and dad fidelity constraints and objectives

### Method 4. Reweighing 

Weights the examples in each (group, label) combination differently to ensure fairness before classification

In [None]:
RW = Reweighing(unprivileged_groups=unprivileged_groups,
               privileged_groups=privileged_groups)
RW.fit(dataset_orig_train)
dataset_transf_train = RW.transform(dataset1b)