Working script for cfST

Use to greate the functions from_groups(), test_disc()... part of run_cfST()

From Salvatore's paper: "it boils down to the Manhattan distance of z-scores"

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
import math
import sys
from sklearn import preprocessing
from sklearn.neighbors import NearestNeighbors

In [2]:
# set working directory - note: all code runs from the src folder
wrk_dir = os.getcwd()
# data path
data_path = wrk_dir + '\\' + 'data' + '\\'
# results path
resu_path = wrk_dir + '\\' + 'results\\counterfactuals' + '\\'

In [3]:
df    = pd.read_csv(data_path + 'Karimi2020_v2.csv', sep='|', )
print(df.shape)
cf_df = pd.read_csv(resu_path + 'cf_Karimi2020_v2.csv', sep='|', )
print(cf_df.shape)

(4993, 6)
(4993, 6)


In [4]:
round(df[df['Gender']==1].shape[0]/df.shape[0]*100, 3) # perc. of women in df
#df[df['Gender']==0].shape[0]/df.shape[0]*100

34.288

In [5]:
# plt.hist(df['LoanApproval'])
# plt.hist(cf_df['LoanApproval']) # number of denied loans drops!

In [6]:
print(df[(df['Gender']==1) & (df['LoanApproval']==1)].shape[0]/df.shape[0]*100)
print('--- vs ---')
print(cf_df[(cf_df['Gender']==1) & (cf_df['LoanApproval']==1)].shape[0]/cf_df.shape[0]*100)

13.498898457840976
--- vs ---
21.029441217704786


In [7]:
feat_trgt = ['LoanApproval']
# feat_rlvt = ['AnnualSalary']
feat_rlvt = ['AnnualSalary', 'AccountBalance']
feat_prot = 'Gender' # extend later for more than one (intersectionality???)
feat_prot_vals = {'non_prot': 0, 'prot': 1}

# future params!
# protected_group = {'Gender': 1} 
n = 10 # determine by power analysis? (future extension)
d = 'manhattan' # ‘manhattan’ - see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.distance_metrics.html

feat_list = feat_trgt + feat_rlvt
feat_list.append(feat_prot)
feat_list

['LoanApproval', 'AnnualSalary', 'AccountBalance', 'Gender']

In [8]:
# TODO: we'll need a prepocessing step
# normalize option; weighted option

weights = None
# else weights = {} where len(weights) != len(rlvt_feat): exit
standardize = True

Under counterfactual situation testing, for the protected group we need to create two groups: a *control group* (CG) centered on the factuals and a *test group* (TG) centered on the counterfactuals. In practice, this means that we take individual women and compare them to other similar women using some distance $d_1$ to construct CG, while we take their corresponding counterfactuals and compare them to similar men using some distance $d_2$. We start off with $d_1=d_2$. We create the groups using a KNN algorithm.

Under this approach, rather than centering both CG and TG on the same (factual) instance, we construct the hypothetical group (*what would have happened had the female individual been male?*) by allowing for all variables to adjust due to the change in $A$. This is our implementation of what Kohler defined as *fairness given the difference* and what she arguess through her Eddie Murphy paper.

In [9]:
# 1) Setup the respectice search spaces for control (ctr) and test (tst) groups

df_ctr = df[df[feat_prot]==feat_prot_vals['prot']][feat_list].copy()
df_tst = cf_df[cf_df[feat_prot]==feat_prot_vals['non_prot']][feat_list].copy()
# del df, cf_df : I CANT!!! need to keep them for getting the centroids! think later of a more efficient way

search_ctr_group = df_ctr[feat_rlvt].copy()  #the search space is just the group-specific df with only rlvt features
print(search_ctr_group.shape)

search_tst_group = df_tst[feat_rlvt].copy()
print(search_tst_group.shape)

# search_ctr_group = df[df[feat_prot]==feat_prot_vals['prot']][feat_rlvt].copy()  #[feat_list]
# print(search_ctr_group.shape)

# search_tst_group = cf_df[cf_df[feat_prot]==feat_prot_vals['non_prot']][feat_rlvt].copy()
# print(search_tst_group.shape)

df_ctr.reset_index(inplace=True, )
df_ctr.rename(columns={'index': 'org_index'}, inplace=True)

df_tst.reset_index(inplace=True, )
df_tst.rename(columns={'index': 'org_index'}, inplace=True)

(1712, 2)
(3281, 2)


In [10]:
df_ctr.head(5)

Unnamed: 0,org_index,LoanApproval,AnnualSalary,AccountBalance,Gender
0,0,-1.0,35000,7947.67809,1
1,4,1.0,201000,59008.567839,1
2,5,1.0,102000,26000.811689,1
3,6,-1.0,68000,15064.447848,1
4,9,-1.0,84500,21275.931413,1


In [11]:
df_tst.head(5)

Unnamed: 0,org_index,LoanApproval,AnnualSalary,AccountBalance,Gender
0,1,1.0,120000.0,36940.1,0
1,2,-1.0,90000.0,23564.13,0
2,3,-1.0,80000.0,27596.57,0
3,7,1.0,140000.0,45716.98,0
4,8,-1.0,70000.0,22540.65,0


In [12]:
weights = {'AnnualSalary': 5, 'AccountBalance': 1}

In [None]:
# for feat_weight in weights:
#     print(feat_weight)
#     print(search_ctr_group[feat_weight])
    
#     print(weights[feat_weight] * search_ctr_group[feat_weight])

In [13]:
if standardize:
    
    print('standardizing')
    
    scaler = preprocessing.StandardScaler()
    
    search_ctr_group_scaled = scaler.fit_transform(search_ctr_group)
    search_ctr_group_scaled = pd.DataFrame(search_ctr_group_scaled, 
                                           index=search_ctr_group.index, 
                                           columns=search_ctr_group.columns)

    search_ctr_group = search_ctr_group_scaled
    del search_ctr_group_scaled
    
    search_tst_group_scaled = scaler.fit_transform(search_tst_group)
    search_tst_group_scaled = pd.DataFrame(search_tst_group_scaled, 
                                           index=search_tst_group.index, 
                                           columns=search_tst_group.columns)

    search_tst_group = search_tst_group_scaled
    del search_tst_group_scaled

if weights:
    print('weighting')
    
    if len(weights) != len(feat_rlvt):
        sys.exit('provide a weight for each relevant feature')
    
    for feat_weight in weights:
        print(feat_weight)
        search_ctr_group[feat_weight] = weights[feat_weight] * search_ctr_group[feat_weight]
        search_tst_group[feat_weight] = weights[feat_weight] * search_tst_group[feat_weight]


standardizing
weighting
AnnualSalary
AccountBalance


In [None]:
# # keep track of old / new indices: KNN seems to retunr indices that are absolute (?)
# search_ctr_group.reset_index(inplace=True, )
# search_ctr_group.rename(columns={'index': 'org_index'}, inplace=True)

# search_tst_group.reset_index(inplace=True, )
# search_tst_group.rename(columns={'index': 'org_index'}, inplace=True)

In [14]:
search_ctr_group.head(5)

Unnamed: 0,AnnualSalary,AccountBalance
0,-7.882784,-1.671135
4,18.362103,3.546198
5,2.710032,0.17351
6,-2.665427,-0.943953
9,-0.056749,-0.309272


In [15]:
search_tst_group.head(5)

Unnamed: 0,AnnualSalary,AccountBalance
1,2.975708,0.670098
2,-1.639054,-0.661134
3,-3.177308,-0.259809
7,6.052216,1.543611
8,-4.715562,-0.762996


In [16]:
# start the main function here

# store neighboors here
dict_df_neighbors = {}

In [17]:
# 2) For eahc ind(ividual) set the centers

ind = 0 # start loop or map get_neighboors()

# for storing the neighboors 
temp_dict_df_neighbors = {}

# get ctr center from factual df
center_ctr = df.loc[ind, feat_rlvt]
print(center_ctr)

# get tst center from counterfactual df 
center_tst = cf_df.loc[ind, feat_rlvt]
print(center_tst)

# prepare for knn
if len(feat_rlvt) > 1:
    center_ctr = center_ctr.values.reshape(1, -1)
    center_tst = center_tst.values.reshape(1, -1)
else:
    center_ctr = center_ctr.values.reshape(-1, 1)
    center_tst = center_tst.values.reshape(-1, 1)

print(center_ctr)
print(center_tst)

AnnualSalary      35000.00000
AccountBalance     7947.67809
Name: 0, dtype: float64
AnnualSalary      50796.35
AccountBalance    13852.05
Name: 0, dtype: float64
[[35000.          7947.67808994]]
[[50796.35 13852.05]]


In [18]:
# 3) Control Group for ind

# NOTE: by default, the control group search space will include ind, which will appear as a neighbor (distrance will be 0.0)
knn_1 = NearestNeighbors(n_neighbors = n + 1, algorithm='ball_tree', metric = d).fit(search_ctr_group[feat_rlvt])
knn_1

NearestNeighbors(algorithm='ball_tree', metric='manhattan', n_neighbors=11)

In [19]:
distances_1, indices_1 = knn_1.kneighbors(center_ctr)

In [20]:
temp_ctr_df = pd.DataFrame()
temp_ctr_df['knn_indices'] = pd.Series(indices_1[0])
temp_ctr_df['knn_distances'] = pd.Series(distances_1[0])
temp_ctr_df.sort_values(by='knn_distances', ascending=True, inplace=True)
# HERE we can drop neighbors based on the distance!
temp_ctr_df

Unnamed: 0,knn_indices,knn_distances
0,1,42925.769789
1,1204,42925.980382
2,473,42927.804478
3,1665,42928.280407
4,53,42929.199643
5,260,42930.76798
6,950,42931.292973
7,1690,42931.352006
8,910,42931.47791
9,405,42931.595695


In [21]:
# RECALL knn_indices are the same wrt the search space but not wrt org input
# we can uss the knn_indices to get the rest of information
temp_ctr_df = temp_ctr_df.merge(df_ctr, how='inner', left_on='knn_indices', right_index=True)
# temp_ctr_df = temp_ctr_df.merge(search_ctr_group, how='inner', left_on='knn_indices', right_index=True)
temp_ctr_df

# we can. e.g., test on the features of interest as well for ST!

# note, maybe join here with the df dataset? you'll get org X and Y too

Unnamed: 0,knn_indices,knn_distances,org_index,LoanApproval,AnnualSalary,AccountBalance,Gender
0,1,42925.769789,4,1.0,201000,59008.567839,1
1,1204,42925.980382,3465,1.0,199500,59268.504239,1
2,473,42927.804478,1356,1.0,190500,55342.236169,1
3,1665,42928.280407,4860,1.0,190500,50684.422553,1
4,53,42929.199643,174,1.0,183500,52519.208993,1
5,260,42930.76798,749,1.0,175000,50322.351188,1
6,950,42931.292973,2750,1.0,174000,46731.664732,1
7,1690,42931.352006,4914,1.0,169500,53116.803967,1
8,910,42931.47791,2644,1.0,169500,51884.60888,1
9,405,42931.595695,1176,1.0,169500,50731.873589,1


In [22]:
# drop ind for ctr! org index is the one we are using to loop over the search space!!!
temp_ctr_df = temp_ctr_df[temp_ctr_df['org_index'] != ind].reset_index(drop=True)
temp_ctr_df

Unnamed: 0,knn_indices,knn_distances,org_index,LoanApproval,AnnualSalary,AccountBalance,Gender
0,1,42925.769789,4,1.0,201000,59008.567839,1
1,1204,42925.980382,3465,1.0,199500,59268.504239,1
2,473,42927.804478,1356,1.0,190500,55342.236169,1
3,1665,42928.280407,4860,1.0,190500,50684.422553,1
4,53,42929.199643,174,1.0,183500,52519.208993,1
5,260,42930.76798,749,1.0,175000,50322.351188,1
6,950,42931.292973,2750,1.0,174000,46731.664732,1
7,1690,42931.352006,4914,1.0,169500,53116.803967,1
8,910,42931.47791,2644,1.0,169500,51884.60888,1
9,405,42931.595695,1176,1.0,169500,50731.873589,1


In [23]:
if temp_ctr_df.shape[0] > n:
    print(temp_ctr_df.shape)
    temp_ctr_df.drop(temp_ctr_df.tail(1).index,inplace=True)
    print(temp_ctr_df.shape)

(11, 7)
(10, 7)


In [24]:
# store it
temp_dict_df_neighbors['control'] = temp_ctr_df

In [25]:
# clean up
del center_ctr, knn_1, temp_ctr_df, indices_1, distances_1, 

In [26]:
# 3) Test Group for ind (use the corresponding search space: men!)

# NOTE: by default, the control group search space will include ind, which will appear as a neighbor (distrance will be 0.0)
knn_2 = NearestNeighbors(n_neighbors = n, algorithm='ball_tree', metric = d).fit(search_tst_group[feat_rlvt])
knn_2

NearestNeighbors(algorithm='ball_tree', metric='manhattan', n_neighbors=10)

In [27]:
distances_2, indices_2 = knn_2.kneighbors(center_tst)

In [28]:
temp_tst_df = pd.DataFrame()
temp_tst_df['knn_indices'] = pd.Series(indices_2[0])
temp_tst_df['knn_distances'] = pd.Series(distances_2[0])
temp_tst_df.sort_values(by='knn_distances', ascending=True, inplace=True)
# HERE we can drop neighbors based on the distance!
temp_tst_df

Unnamed: 0,knn_indices,knn_distances
0,592,64617.704365
1,164,64624.205159
2,610,64624.892517
3,5,64628.196312
4,149,64628.377803
5,1029,64628.535194
6,2659,64629.847007
7,909,64630.099292
8,577,64630.226964
9,2688,64630.474463


In [29]:
# RECALL knn_indices are the same wrt the search space but not wrt org input
# we can uss the knn_indices to get the rest of information
temp_tst_df = temp_tst_df.merge(df_tst, how='inner', left_on='knn_indices', right_index=True)
# temp_tst_df = temp_tst_df.merge(search_tst_group, how='inner', left_on='knn_indices', right_index=True)
temp_tst_df


Unnamed: 0,knn_indices,knn_distances,org_index,LoanApproval,AnnualSalary,AccountBalance,Gender
0,592,64617.704365,903,1.0,270000.0,76890.35,0
1,164,64624.205159,249,1.0,230000.0,73395.95,0
2,610,64624.892517,937,1.0,230000.0,66489.51,0
3,5,64628.196312,10,1.0,210000.0,64205.77,0
4,149,64628.377803,220,1.0,210000.0,62382.19,0
5,1029,64628.535194,1569,1.0,210000.0,60800.75,0
6,2659,64629.847007,4081,1.0,200000.0,63075.99,0
7,909,64630.099292,1404,1.0,200000.0,60541.08,0
8,577,64630.226964,877,1.0,200000.0,59258.25,0
9,2688,64630.474463,4118,1.0,200000.0,56771.43,0


In [30]:
# store it
temp_dict_df_neighbors['test'] = temp_tst_df

In [31]:
# clean up
del center_tst, knn_2, temp_tst_df, indices_2, distances_2, 

In [32]:
temp_dict_df_neighbors

{'control':    knn_indices  knn_distances  org_index  LoanApproval  AnnualSalary  \
 0            1   42925.769789          4           1.0        201000   
 1         1204   42925.980382       3465           1.0        199500   
 2          473   42927.804478       1356           1.0        190500   
 3         1665   42928.280407       4860           1.0        190500   
 4           53   42929.199643        174           1.0        183500   
 5          260   42930.767980        749           1.0        175000   
 6          950   42931.292973       2750           1.0        174000   
 7         1690   42931.352006       4914           1.0        169500   
 8          910   42931.477910       2644           1.0        169500   
 9          405   42931.595695       1176           1.0        169500   
 
    AccountBalance  Gender  
 0    59008.567839       1  
 1    59268.504239       1  
 2    55342.236169       1  
 3    50684.422553       1  
 4    52519.208993       1  
 5    5032

In [33]:
#... later on
dict_df_neighbors[ind] = temp_dict_df_neighbors

In [34]:
#recall that ind is the same across df and cf_df
dict_df_neighbors

{0: {'control':    knn_indices  knn_distances  org_index  LoanApproval  AnnualSalary  \
  0            1   42925.769789          4           1.0        201000   
  1         1204   42925.980382       3465           1.0        199500   
  2          473   42927.804478       1356           1.0        190500   
  3         1665   42928.280407       4860           1.0        190500   
  4           53   42929.199643        174           1.0        183500   
  5          260   42930.767980        749           1.0        175000   
  6          950   42931.292973       2750           1.0        174000   
  7         1690   42931.352006       4914           1.0        169500   
  8          910   42931.477910       2644           1.0        169500   
  9          405   42931.595695       1176           1.0        169500   
  
     AccountBalance  Gender  
  0    59008.567839       1  
  1    59268.504239       1  
  2    55342.236169       1  
  3    50684.422553       1  
  4    52519.208993

In [None]:
# 4) We need to start testing each group: do now for ind

In [35]:
target_labels = {'pos': 1, 'neg': -1}
alpha = 0.05


In [37]:
# for ind:

ctr_group = dict_df_neighbors[ind]['control']
tst_group = dict_df_neighbors[ind]['test']

p1 = ctr_group[ctr_group[feat_prot]==target_labels['neg']].shape[0] / ctr_group.shape[0]
p2 = tst_group[tst_group[feat_prot]==target_labels['neg']].shape[0] / tst_group.shape[0]
diff = p1 - p2
diff

0.0

In [38]:
def get_wald_ci(alpha: float, p1: float, p2: float, k1: int, k2: int):
    
    wald_ci_summary = {}
    
    z_score = round(st.norm.ppf(1 - (alpha/2)), 2)
    d_alpha = z_score * math.sqrt( (p1*(1 - p1)/k1) + (p2*(1 - p2)/k2) )
    
    wald_ci_summary['d_alpha'] = d_alpha
    wald_ci_summary['CIs'] = [(p1 - p2) - d_alpha, (p1 - p2) + d_alpha]
    if (p1 - p2) >= 0:
        wald_ci_summary['diff'] = max(0, p1 - p2 - d_alpha)
    else:
        wald_ci_summary['diff'] = min(0, p1 - p2 + d_alpha)
    
    print(wald_ci_summary)
    
    return wald_ci_summary

In [39]:
ind_wald_ci = get_wald_ci(alpha=alpha, p1=p1, p2=p2, k1=ctr_group.shape[0], k2=tst_group.shape[0])

{'d_alpha': 0.0, 'CIs': [0.0, 0.0], 'diff': 0}


In [40]:
print(ind)
print(ind_wald_ci)

0
{'d_alpha': 0.0, 'CIs': [0.0, 0.0], 'diff': 0}


In [41]:
# i guess i need some sort of threshold here???
diff_epsilon = 0.05 # tau in the second paper

In [42]:
if ind_wald_ci['diff'] > diff_epsilon:
    ind_wald_ci['cfST'] = 'Yes'
else:
    ind_wald_ci['cfST'] = 'No'
    
#how to return the results? get df with discrimination columns!

In [None]:
df

In [None]:
df.loc[0,]

In [None]:
search_ctr_group.loc[857,]

In [None]:
df.loc[857,] # different ppl!!!

In [None]:
from sklearn.neighbors import BallTree
rng = np.random.RandomState(0)
X = rng.random_sample((10, 3))
print(X)
print('---')
print(X[:1])
tree = BallTree(X, leaf_size=2)              
dist, ind = tree.query(X[:1], k=3)                
print(ind)  # indices of 3 closest neighbors
#[0 3 1]
print(dist)  # distances to 3 closest neighbors
#[ 0.          0.19662693  0.29473397]