Working script for cfST

Used to create the functions from_groups(), test_disc(), ... that are part of run_cfST()

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.neighbors import NearestNeighbors

In [2]:
# set working directory - note: all code runs from the src folder
wrk_dir = os.getcwd()
# data path: os.path.join picks the separator of the host OS (the hard-coded '\\' only
# worked on Windows); the trailing '' keeps the trailing separator, because file names
# are appended to these paths by plain string concatenation
data_path = os.path.join(wrk_dir, 'data', '')
# results path (same construction, ends with a separator as well)
resu_path = os.path.join(wrk_dir, 'results', 'counterfactuals', '')

In [3]:
# Load the factual data and the counterfactual data; both files are pipe-delimited.
factual_csv = data_path + 'Karimi2020_v2.csv'
counterfactual_csv = resu_path + 'cf_Karimi2020_v2.csv'

df = pd.read_csv(factual_csv, sep='|')
print(df.shape)
cf_df = pd.read_csv(counterfactual_csv, sep='|')
print(cf_df.shape)

(4993, 6)
(4993, 6)


In [4]:
# perc. of women (Gender == 1) among all rows of df, rounded to 3 decimals
n_gender_1 = int((df['Gender'] == 1).sum())
round(n_gender_1 / df.shape[0] * 100, 3)
# complementary share: df[df['Gender']==0].shape[0]/df.shape[0]*100

34.288

In [None]:
# plt.hist(df['LoanApproval'])
# plt.hist(cf_df['LoanApproval']) # number of denied loans drops!

In [5]:
# Approved women (Gender == 1 and LoanApproval == 1) as a percentage of ALL rows,
# first in the factual data and then in the counterfactual data.
approved_women_df = (df['Gender'] == 1) & (df['LoanApproval'] == 1)
approved_women_cf = (cf_df['Gender'] == 1) & (cf_df['LoanApproval'] == 1)

print(int(approved_women_df.sum()) / df.shape[0] * 100)
print('--- vs ---')
print(int(approved_women_cf.sum()) / cf_df.shape[0] * 100)

13.498898457840976
--- vs ---
21.029441217704786


In [34]:
# --- feature roles ---
# target (decision) feature
feat_trgt = ['LoanApproval']
# features used to measure similarity between individuals
# (single-feature alternative: ['AnnualSalary'])
feat_rlvt = ['AnnualSalary', 'AccountBalance']
# protected feature - extend later for more than one (intersectionality???)
feat_prot = 'Gender'
# encoding of the protected feature
feat_prot_vals = dict(non_prot=0, prot=1)

# --- future params! ---
# protected_group = {'Gender': 1}
# number of neighbors per group - determine by power analysis? (future extension)
n = 10
# distance metric handed to the neighbor search; e.g. 'manhattan' is an alternative - see
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.distance_metrics.html
d = 'euclidean'

# every column carried through the search spaces: target, relevant features, protected feature
feat_list = [*feat_trgt, *feat_rlvt, feat_prot]
feat_list

['LoanApproval', 'AnnualSalary', 'AccountBalance', 'Gender']

Under counterfactual situation testing, for the protected group we need to create two groups: a *control group* (CG) centered on the factuals and a *test group* (TG) centered on the counterfactuals. In practice, this means that we take individual women and compare them to other similar women using some distance $d_1$ to construct CG, while we take their corresponding counterfactuals and compare them to similar men using some distance $d_2$ to construct TG. We start off with $d_1=d_2$. We create the groups using a KNN algorithm.

Under this approach, rather than centering both CG and TG on the same (factual) instance, we construct the hypothetical group (*what would have happened had the female individual been male?*) by allowing all variables to adjust due to the change in $A$. This is our implementation of what Kohler defined as *fairness given the difference* and what she argues for in her Eddie Murphy paper.

In [35]:
# 1) Set up the respective search spaces for the control (ctr) and test (tst) groups

# control: rows of the factual df that belong to the protected group
ctr_mask = df[feat_prot] == feat_prot_vals['prot']
search_ctr_group = df.loc[ctr_mask, feat_list].copy()
print(search_ctr_group.shape)

# test: rows of the counterfactual df that belong to the non-protected group
tst_mask = cf_df[feat_prot] == feat_prot_vals['non_prot']
search_tst_group = cf_df.loc[tst_mask, feat_list].copy()
print(search_tst_group.shape)

# keep track of old / new indices: the original row labels move into an 'org_index' column
# while the frame index becomes positional (0..len-1), matching what the KNN search returns
search_ctr_group = search_ctr_group.reset_index().rename(columns={'index': 'org_index'})
search_tst_group = search_tst_group.reset_index().rename(columns={'index': 'org_index'})

(1712, 4)
(3281, 4)


In [36]:
# first five rows of the control search space; org_index holds the row label from df
search_ctr_group.head()

Unnamed: 0,org_index,LoanApproval,AnnualSalary,AccountBalance,Gender
0,0,-1.0,35000,7947.67809,1
1,4,1.0,201000,59008.567839,1
2,5,1.0,102000,26000.811689,1
3,6,-1.0,68000,15064.447848,1
4,9,-1.0,84500,21275.931413,1


In [37]:
# start the main function here

# container for the neighbors of every individual (filled further down, keyed by ind)
dict_df_neighbors = dict()

In [38]:
# 2) For each ind(ividual) set the centers

# first individual; later the loop variable or the argument mapped over get_neighbors()
ind = 0

# holds the neighbors found for this individual
temp_dict_df_neighbors = {}

# ctr center: relevant features of ind in the factual df
center_ctr = df.loc[ind, feat_rlvt]
print(center_ctr)

# tst center: relevant features of the same ind in the counterfactual df
center_tst = cf_df.loc[ind, feat_rlvt]
print(center_tst)

# prepare for knn: the query has to be a 2D array holding one row for the single center
query_shape = (1, -1) if len(feat_rlvt) > 1 else (-1, 1)
center_ctr = center_ctr.values.reshape(*query_shape)
center_tst = center_tst.values.reshape(*query_shape)

print(center_ctr)
print(center_tst)

AnnualSalary      35000.00000
AccountBalance     7947.67809
Name: 0, dtype: float64
AnnualSalary      50796.35
AccountBalance    13852.05
Name: 0, dtype: float64
[[35000.          7947.67808994]]
[[50796.35 13852.05]]


In [39]:
# 3) Control Group for ind

# NOTE: the control search space includes ind itself, which comes back as a neighbor at
# distance 0.0 - hence n + 1 neighbors are requested here
knn_1 = NearestNeighbors(n_neighbors=n + 1, algorithm='ball_tree', metric=d)
knn_1.fit(search_ctr_group[feat_rlvt])
knn_1

NearestNeighbors(algorithm='ball_tree', metric='euclidean', n_neighbors=11)

In [40]:
# query the control search space around the factual center of ind
distances_1, indices_1 = knn_1.kneighbors(center_ctr, return_distance=True)

In [41]:
# distances from the control center to each returned neighbor (one row per query point)
distances_1

array([[   0.        ,  123.29308024,  242.26211041,  619.62941896,
         672.13945902, 1503.55856885, 1684.29702006, 1687.30608972,
        1720.20882082, 1813.3319503 , 1816.84824881]])

In [42]:
# positions of those neighbors within search_ctr_group (not row labels of df)
indices_1

array([[   0, 1247, 1648,  887,  811, 1020,  100,  714, 1337,   64,  418]],
      dtype=int64)

In [43]:
# pair each neighbor's position in the control search space with its distance to the center
temp_ctr_df = pd.DataFrame({
    'knn_indices': indices_1[0],
    'knn_distances': distances_1[0],
})
temp_ctr_df = temp_ctr_df.sort_values(by='knn_distances', ascending=True)
# HERE we can drop neighbors based on the distance!
temp_ctr_df

Unnamed: 0,knn_indices,knn_distances
0,0,0.0
1,1247,123.29308
2,1648,242.26211
3,887,619.629419
4,811,672.139459
5,1020,1503.558569
6,100,1684.29702
7,714,1687.30609
8,1337,1720.208821
9,64,1813.33195


In [44]:
# RECALL: knn_indices are positions within the search space, not labels of the original input.
# Matching them against the index of search_ctr_group brings in org_index and the features.
temp_ctr_df = pd.merge(
    temp_ctr_df,
    search_ctr_group,
    how='inner',
    left_on='knn_indices',
    right_index=True,
)
temp_ctr_df

# e.g., we could test on the features of interest as well for ST!


Unnamed: 0,knn_indices,knn_distances,org_index,LoanApproval,AnnualSalary,AccountBalance,Gender
0,0,0.0,0,-1.0,35000,7947.67809,1
1,1247,123.29308,3591,-1.0,35000,7824.38501,1
2,1648,242.26211,4789,-1.0,35000,7705.41598,1
3,887,619.629419,2568,-1.0,34500,7581.699106,1
4,811,672.139459,2355,-1.0,35000,8619.817549,1
5,1020,1503.558569,2956,-1.0,36500,7844.293517,1
6,100,1684.29702,279,-1.0,36500,7181.612524,1
7,714,1687.30609,2082,-1.0,33500,8720.337032,1
8,1337,1720.208821,3838,-1.0,33500,7105.586416,1
9,64,1813.33195,204,-1.0,33500,6928.770458,1


In [45]:
# drop ind from its own control group: org_index carries the labels of the original df,
# which is what ind refers to when looping over the individuals
not_ind = temp_ctr_df['org_index'] != ind
temp_ctr_df = temp_ctr_df.loc[not_ind].reset_index(drop=True)
temp_ctr_df

Unnamed: 0,knn_indices,knn_distances,org_index,LoanApproval,AnnualSalary,AccountBalance,Gender
0,1247,123.29308,3591,-1.0,35000,7824.38501,1
1,1648,242.26211,4789,-1.0,35000,7705.41598,1
2,887,619.629419,2568,-1.0,34500,7581.699106,1
3,811,672.139459,2355,-1.0,35000,8619.817549,1
4,1020,1503.558569,2956,-1.0,36500,7844.293517,1
5,100,1684.29702,279,-1.0,36500,7181.612524,1
6,714,1687.30609,2082,-1.0,33500,8720.337032,1
7,1337,1720.208821,3838,-1.0,33500,7105.586416,1
8,64,1813.33195,204,-1.0,33500,6928.770458,1
9,418,1816.848249,1213,-1.0,36500,6922.525633,1


In [46]:
# keep the control group of ind under the 'control' key
temp_dict_df_neighbors.update(control=temp_ctr_df)

In [47]:
# clean up the control-group intermediates; the stored frame stays reachable through
# temp_dict_df_neighbors['control']
del center_ctr
del knn_1
del temp_ctr_df
del indices_1
del distances_1

In [48]:
# 4) Test Group for ind (use the corresponding search space: men!)

# NOTE: the test search space is built from the non-protected rows of cf_df, so exactly
# n neighbors are requested (no extra neighbor as for the control group)
knn_2 = NearestNeighbors(n_neighbors=n, algorithm='ball_tree', metric=d)
knn_2.fit(search_tst_group[feat_rlvt])
knn_2

NearestNeighbors(algorithm='ball_tree', metric='euclidean', n_neighbors=10)

In [49]:
# query the test search space around the counterfactual center of ind
distances_2, indices_2 = knn_2.kneighbors(center_tst, return_distance=True)

In [50]:
# distances from the test center to each returned neighbor (one row per query point)
distances_2

array([[796.58069987, 796.66779306, 796.92167426, 797.36501735,
        798.67828698, 802.55931376, 812.99318453, 816.78921326,
        821.83624214, 830.87982531]])

In [51]:
# positions of those neighbors within search_tst_group (not row labels of cf_df)
indices_2

array([[2218, 2221,  612,  453, 2958,   65, 2153, 1438, 2507,  300]],
      dtype=int64)

In [52]:
# pair each neighbor's position in the test search space with its distance to the center
temp_tst_df = pd.DataFrame({
    'knn_indices': indices_2[0],
    'knn_distances': distances_2[0],
})
temp_tst_df = temp_tst_df.sort_values(by='knn_distances', ascending=True)
# HERE we can drop neighbors based on the distance!
temp_tst_df

Unnamed: 0,knn_indices,knn_distances
0,2218,796.5807
1,2221,796.667793
2,612,796.921674
3,453,797.365017
4,2958,798.678287
5,65,802.559314
6,2153,812.993185
7,1438,816.789213
8,2507,821.836242
9,300,830.879825


In [53]:
# RECALL: knn_indices are positions within the search space, not labels of the original input.
# Matching them against the index of search_tst_group brings in org_index and the features.
temp_tst_df = pd.merge(
    temp_tst_df,
    search_tst_group,
    how='inner',
    left_on='knn_indices',
    right_index=True,
)
temp_tst_df


Unnamed: 0,knn_indices,knn_distances,org_index,LoanApproval,AnnualSalary,AccountBalance,Gender
0,2218,796.5807,3404,-1.0,50000.0,13832.88,0
1,2221,796.667793,3409,-1.0,50000.0,13829.55,0
2,612,796.921674,939,-1.0,50000.0,13821.87,0
3,453,797.365017,700,-1.0,50000.0,13811.83,0
4,2958,798.678287,4516,-1.0,50000.0,13791.11,0
5,65,802.559314,100,-1.0,50000.0,13752.41,0
6,2153,812.993185,3283,-1.0,50000.0,13688.39,0
7,1438,816.789213,2190,-1.0,50000.0,13670.47,0
8,2507,821.836242,3852,-1.0,50000.0,14055.13,0
9,300,830.879825,468,-1.0,50000.0,13615.01,0


In [54]:
# keep the test group of ind under the 'test' key
temp_dict_df_neighbors.update(test=temp_tst_df)

In [55]:
# clean up the test-group intermediates; the stored frame stays reachable through
# temp_dict_df_neighbors['test']
del center_tst
del knn_2
del temp_tst_df
del indices_2
del distances_2

In [56]:
# both neighbor frames collected for the current individual, under 'control' and 'test'
temp_dict_df_neighbors

{'control':    knn_indices  knn_distances  org_index  LoanApproval  AnnualSalary  \
 0         1247     123.293080       3591          -1.0         35000   
 1         1648     242.262110       4789          -1.0         35000   
 2          887     619.629419       2568          -1.0         34500   
 3          811     672.139459       2355          -1.0         35000   
 4         1020    1503.558569       2956          -1.0         36500   
 5          100    1684.297020        279          -1.0         36500   
 6          714    1687.306090       2082          -1.0         33500   
 7         1337    1720.208821       3838          -1.0         33500   
 8           64    1813.331950        204          -1.0         33500   
 9          418    1816.848249       1213          -1.0         36500   
 
    AccountBalance  Gender  
 0     7824.385010       1  
 1     7705.415980       1  
 2     7581.699106       1  
 3     8619.817549       1  
 4     7844.293517       1  
 5     718

In [57]:
#... later on: file the groups of this individual under its index
dict_df_neighbors.update({ind: temp_dict_df_neighbors})

In [58]:
# recall that ind is the same across df and cf_df: the same label was used to read the
# control center from df and the test center from cf_df
dict_df_neighbors

{0: {'control':    knn_indices  knn_distances  org_index  LoanApproval  AnnualSalary  \
  0         1247     123.293080       3591          -1.0         35000   
  1         1648     242.262110       4789          -1.0         35000   
  2          887     619.629419       2568          -1.0         34500   
  3          811     672.139459       2355          -1.0         35000   
  4         1020    1503.558569       2956          -1.0         36500   
  5          100    1684.297020        279          -1.0         36500   
  6          714    1687.306090       2082          -1.0         33500   
  7         1337    1720.208821       3838          -1.0         33500   
  8           64    1813.331950        204          -1.0         33500   
  9          418    1816.848249       1213          -1.0         36500   
  
     AccountBalance  Gender  
  0     7824.385010       1  
  1     7705.415980       1  
  2     7581.699106       1  
  3     8619.817549       1  
  4     7844.293517

In [None]:
# row labelled 0 in the factual df
df.loc[0]

In [None]:
# row at position 857 of the control search space (its index was reset; see org_index)
search_ctr_group.loc[857]

In [None]:
# different ppl!!! label 857 of df is not row 857 of search_ctr_group, whose index was reset
df.loc[857]

In [None]:
# Scratch check of what BallTree.query returns: the indices are positions within the
# array the tree was built on.
from sklearn.neighbors import BallTree
rng = np.random.RandomState(0)
X = rng.random_sample((10, 3))
print(X)
print('---')
print(X[:1])
tree = BallTree(X, leaf_size=2)
# Unpack into dedicated names: unpacking into `ind` would overwrite the individual index
# `ind` that the neighbor-search cells of this notebook rely on.
demo_dist, demo_ind = tree.query(X[:1], k=3)
print(demo_ind)  # indices of 3 closest neighbors
#[0 3 1]
print(demo_dist)  # distances to 3 closest neighbors
#[ 0.          0.19662693  0.29473397]