Working script for cfST

Use to greate the functions from_groups(), test_disc()... part of run_cfST()

From Salvatore's paper: "it boils down to the Manhattan distance of z-scores"

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.neighbors import NearestNeighbors
from sklearn import preprocessing

In [2]:
# set working directory - note: all code runs from the src folder
wrk_dir = os.getcwd()
# data path
data_path = wrk_dir + '\\' + 'data' + '\\'
# results path
resu_path = wrk_dir + '\\' + 'results\\counterfactuals' + '\\'

In [22]:
df    = pd.read_csv(data_path + 'Karimi2020_v2.csv', sep='|', )
print(df.shape)
cf_df = pd.read_csv(resu_path + 'cf_Karimi2020_v2.csv', sep='|', )
print(cf_df.shape)

(4993, 6)
(4993, 6)


In [4]:
round(df[df['Gender']==1].shape[0]/df.shape[0]*100, 3) # perc. of women in df
#df[df['Gender']==0].shape[0]/df.shape[0]*100

34.288

In [None]:
# plt.hist(df['LoanApproval'])
# plt.hist(cf_df['LoanApproval']) # number of denied loans drops!

In [5]:
print(df[(df['Gender']==1) & (df['LoanApproval']==1)].shape[0]/df.shape[0]*100)
print('--- vs ---')
print(cf_df[(cf_df['Gender']==1) & (cf_df['LoanApproval']==1)].shape[0]/cf_df.shape[0]*100)

13.498898457840976
--- vs ---
21.029441217704786


In [6]:
feat_trgt = ['LoanApproval']
# feat_rlvt = ['AnnualSalary']
feat_rlvt = ['AnnualSalary', 'AccountBalance']
feat_prot = 'Gender' # extend later for more than one (intersectionality???)
feat_prot_vals = {'non_prot': 0, 'prot': 1}

# future params!
# protected_group = {'Gender': 1} 
n = 10 # determine by power analysis? (future extension)
d = 'manhattan' # ‘manhattan’ - see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.distance_metrics.html

feat_list = feat_trgt + feat_rlvt
feat_list.append(feat_prot)
feat_list

['LoanApproval', 'AnnualSalary', 'AccountBalance', 'Gender']

In [39]:
# TODO: we'll need a prepocessing step
# normalize option; weighted option

weights = None
# else weights = {} where len(weights) != len(rlvt_feat): exit
standardize = True

Under counterfactual situation testing, for the protected group we need to create two groups: a *control group* (CG) centered on the factuals and a *test group* (TG) centered on the counterfactuals. In practice, this means that we take individual women and compare them to other similar women using some distance $d_1$ to construct CG, while we take their corresponding counterfactuals and compare them to similar men using some distance $d_2$. We start off with $d_1=d_2$. We create the groups using a KNN algorithm.

Under this approach, rather than centering both CG and TG on the same (factual) instance, we construct the hypothetical group (*what would have happened had the female individual been male?*) by allowing for all variables to adjust due to the change in $A$. This is our implementation of what Kohler defined as *fairness given the difference* and what she arguess through her Eddie Murphy paper.

In [40]:
# 1) Setup the respectice search spaces for control (ctr) and test (tst) groups

# note: i just want the relevent features here... not all X!

search_ctr_group = df[df[feat_prot]==feat_prot_vals['prot']][feat_rlvt].copy()  #[feat_list]
print(search_ctr_group.shape)

search_tst_group = cf_df[cf_df[feat_prot]==feat_prot_vals['non_prot']][feat_rlvt].copy()
print(search_tst_group.shape)


(1712, 2)
(3281, 2)


In [41]:
search_ctr_group

Unnamed: 0,AnnualSalary,AccountBalance
0,35000,7947.678090
4,201000,59008.567839
5,102000,26000.811689
6,68000,15064.447848
9,84500,21275.931413
...,...,...
4982,69500,19934.430626
4985,46500,9090.839963
4987,99000,31274.211511
4989,125000,35574.858747


In [60]:
if standardize:

    scaler = preprocessing.StandardScaler()
    
    search_ctr_group_scaled = scaler.fit_transform(search_ctr_group)
    search_ctr_group_scaled = pd.DataFrame(search_ctr_group_scaled, 
                                           index=search_ctr_group.index, 
                                           columns=search_ctr_group.columns)

    search_ctr_group = search_ctr_group_scaled
    del search_ctr_group_scaled
    
    search_tst_group_scaled = scaler.fit_transform(search_tst_group)
    search_tst_group_scaled = pd.DataFrame(search_tst_group_scaled, 
                                           index=search_tst_group.index, 
                                           columns=search_tst_group.columns)

    search_tst_group = search_tst_group_scaled
    del search_tst_group_scaled

if weights:
    print('todo')


In [43]:
# keep track of old / new indices: KNN seems to retunr indices that are absolute (?)
search_ctr_group.reset_index(inplace=True, )
search_ctr_group.rename(columns={'index': 'org_index'}, inplace=True)

search_tst_group.reset_index(inplace=True, )
search_tst_group.rename(columns={'index': 'org_index'}, inplace=True)

In [44]:
search_ctr_group.head(5)

Unnamed: 0,org_index,AnnualSalary,AccountBalance
0,0,-1.576557,-1.671135
1,4,3.672421,3.546198
2,5,0.542006,0.17351
3,6,-0.533085,-0.943953
4,9,-0.01135,-0.309272


In [45]:
search_tst_group.head(5)

Unnamed: 0,org_index,AnnualSalary,AccountBalance
0,1,0.595142,0.670098
1,2,-0.327811,-0.661134
2,3,-0.635462,-0.259809
3,7,1.210443,1.543611
4,8,-0.943112,-0.762996


In [46]:
# start the main function here

# store neighboors here
dict_df_neighbors = {}

In [47]:
# 2) For eahc ind(ividual) set the centers

ind = 0 # start loop or map get_neighboors()

# for storing the neighboors 
temp_dict_df_neighbors = {}

# get ctr center from factual df
center_ctr = df.loc[ind, feat_rlvt]
print(center_ctr)

# get tst center from counterfactual df 
center_tst = cf_df.loc[ind, feat_rlvt]
print(center_tst)

# prepare for knn
if len(feat_rlvt) > 1:
    center_ctr = center_ctr.values.reshape(1, -1)
    center_tst = center_tst.values.reshape(1, -1)
else:
    center_ctr = center_ctr.values.reshape(-1, 1)
    center_tst = center_tst.values.reshape(-1, 1)

print(center_ctr)
print(center_tst)

AnnualSalary      35000.00000
AccountBalance     7947.67809
Name: 0, dtype: float64
AnnualSalary      50796.35
AccountBalance    13852.05
Name: 0, dtype: float64
[[35000.          7947.67808994]]
[[50796.35 13852.05]]


In [48]:
# 3) Control Group for ind

# NOTE: by default, the control group search space will include ind, which will appear as a neighbor (distrance will be 0.0)
knn_1 = NearestNeighbors(n_neighbors = n + 1, algorithm='ball_tree', metric = d).fit(search_ctr_group[feat_rlvt])
knn_1

NearestNeighbors(algorithm='ball_tree', metric='manhattan', n_neighbors=11)

In [49]:
distances_1, indices_1 = knn_1.kneighbors(center_ctr)

In [50]:
distances_1

array([[42940.45947166, 42940.48034223, 42941.16610609, 42941.64203516,
        42941.6759016 , 42942.057525  , 42942.16914649, 42942.18342902,
        42942.30121395, 42942.33593805, 42942.55497174]])

In [51]:
indices_1

array([[   1, 1204,  473, 1665,   53, 1690,  260,  910,  405, 1689,  606]],
      dtype=int64)

In [52]:
temp_ctr_df = pd.DataFrame()
temp_ctr_df['knn_indices'] = pd.Series(indices_1[0])
temp_ctr_df['knn_distances'] = pd.Series(distances_1[0])
temp_ctr_df.sort_values(by='knn_distances', ascending=True, inplace=True)
# HERE we can drop neighbors based on the distance!
temp_ctr_df

Unnamed: 0,knn_indices,knn_distances
0,1,42940.459472
1,1204,42940.480342
2,473,42941.166106
3,1665,42941.642035
4,53,42941.675902
5,1690,42942.057525
6,260,42942.169146
7,910,42942.183429
8,405,42942.301214
9,1689,42942.335938


In [53]:
# RECALL knn_indices are the same wrt the search space but not wrt org input
# we can uss the knn_indices to get the rest of information
temp_ctr_df = temp_ctr_df.merge(search_ctr_group, how='inner', left_on='knn_indices', right_index=True)
temp_ctr_df

# we can. e.g., test on the features of interest as well for ST!

# note, maybe join here with the df dataset? you'll get org X and Y too

Unnamed: 0,knn_indices,knn_distances,org_index,AnnualSalary,AccountBalance
0,1,42940.459472,4,3.672421,3.546198
1,1204,42940.480342,3465,3.62499,3.572758
2,473,42941.166106,1356,3.340407,3.171577
3,1665,42941.642035,4860,3.340407,2.695648
4,53,42941.675902,174,3.119065,2.883124
5,1690,42942.057525,4914,2.67638,2.944185
6,260,42942.169146,749,2.850292,2.658652
7,910,42942.183429,2644,2.67638,2.818281
8,405,42942.301214,1176,2.67638,2.700496
9,1689,42942.335938,4913,2.407607,2.934545


In [54]:
# drop ind for ctr! org index is the one we are using to loop over the search space!!!
temp_ctr_df = temp_ctr_df[temp_ctr_df['org_index'] != ind].reset_index(drop=True)
temp_ctr_df

Unnamed: 0,knn_indices,knn_distances,org_index,AnnualSalary,AccountBalance
0,1,42940.459472,4,3.672421,3.546198
1,1204,42940.480342,3465,3.62499,3.572758
2,473,42941.166106,1356,3.340407,3.171577
3,1665,42941.642035,4860,3.340407,2.695648
4,53,42941.675902,174,3.119065,2.883124
5,1690,42942.057525,4914,2.67638,2.944185
6,260,42942.169146,749,2.850292,2.658652
7,910,42942.183429,2644,2.67638,2.818281
8,405,42942.301214,1176,2.67638,2.700496
9,1689,42942.335938,4913,2.407607,2.934545


In [57]:
if temp_ctr_df.shape[0] > n:
    print(temp_ctr_df.shape)
    temp_ctr_df.drop(temp_ctr_df.tail(1).index,inplace=True)
    print(temp_ctr_df.shape)

(11, 5)


In [38]:
# store it
temp_dict_df_neighbors['control'] = temp_ctr_df

In [None]:
# clean up
del center_ctr, knn_1, temp_ctr_df, indices_1, distances_1, 

In [None]:
# 3) Test Group for ind (use the corresponding search space: men!)

# NOTE: by default, the control group search space will include ind, which will appear as a neighbor (distrance will be 0.0)
knn_2 = NearestNeighbors(n_neighbors = n, algorithm='ball_tree', metric = d).fit(search_tst_group[feat_rlvt])
knn_2

In [None]:
distances_2, indices_2 = knn_2.kneighbors(center_tst)

In [None]:
distances_2

In [None]:
indices_2

In [None]:
temp_tst_df = pd.DataFrame()
temp_tst_df['knn_indices'] = pd.Series(indices_2[0])
temp_tst_df['knn_distances'] = pd.Series(distances_2[0])
temp_tst_df.sort_values(by='knn_distances', ascending=True, inplace=True)
# HERE we can drop neighbors based on the distance!
temp_tst_df

In [None]:
# RECALL knn_indices are the same wrt the search space but not wrt org input
# we can uss the knn_indices to get the rest of information
temp_tst_df = temp_tst_df.merge(search_tst_group, how='inner', left_on='knn_indices', right_index=True)
temp_tst_df


In [None]:
# store it
temp_dict_df_neighbors['test'] = temp_tst_df

In [None]:
# clean up
del center_tst, knn_2, temp_tst_df, indices_2, distances_2, 

In [None]:
temp_dict_df_neighbors

In [None]:
#... later on
dict_df_neighbors[ind] = temp_dict_df_neighbors

In [None]:
#recall that ind is the same across df and cf_df
dict_df_neighbors

In [None]:


len(dict_df_neighbors)

In [None]:
# 4) We need to start testing each group: do now for ind

In [None]:
df.loc[0,]

In [None]:
search_ctr_group.loc[857,]

In [None]:
df.loc[857,] # different ppl!!!

In [None]:
from sklearn.neighbors import BallTree
rng = np.random.RandomState(0)
X = rng.random_sample((10, 3))
print(X)
print('---')
print(X[:1])
tree = BallTree(X, leaf_size=2)              
dist, ind = tree.query(X[:1], k=3)                
print(ind)  # indices of 3 closest neighbors
#[0 3 1]
print(dist)  # distances to 3 closest neighbors
#[ 0.          0.19662693  0.29473397]