Working script for cfST

Used to create the functions from_groups(), test_disc(), ... that are part of run_cfST()

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.neighbors import NearestNeighbors

In [2]:
# set working directory - note: all code runs from the src folder
wrk_dir = os.getcwd()
# Build the folders with os.path.join so the notebook also runs on non-Windows
# systems (hard-coded '\\' separators only form valid paths on Windows).
# The trailing '' keeps a trailing separator, because later cells build file
# paths by plain string concatenation (data_path + '<file>.csv').
# data path
data_path = os.path.join(wrk_dir, 'data', '')
# results path
resu_path = os.path.join(wrk_dir, 'results', 'counterfactuals', '')

In [3]:
# factual data: pipe-separated file in the data folder
factual_csv = data_path + 'Karimi2020_v2.csv'
df = pd.read_csv(factual_csv, sep='|')
print(df.shape)

# counterfactual data: pipe-separated file in the results folder
counterfactual_csv = resu_path + 'cf_Karimi2020_v2.csv'
cf_df = pd.read_csv(counterfactual_csv, sep='|')
print(cf_df.shape)

(4993, 6)
(4993, 6)


In [4]:
# perc. of women (Gender == 1) in df, rounded to 3 decimals
n_women = len(df[df['Gender'] == 1])
n_total = len(df)
round(n_women / n_total * 100, 3)
#df[df['Gender']==0].shape[0]/df.shape[0]*100

34.288

In [None]:
# plt.hist(df['LoanApproval'])
# plt.hist(cf_df['LoanApproval']) # number of denied loans drops!

In [5]:
# perc. of all rows that are women (Gender == 1) with an approved loan
# (LoanApproval == 1): factual data first, counterfactual data second
approved_women = (df['Gender'] == 1) & (df['LoanApproval'] == 1)
print(int(approved_women.sum()) / len(df) * 100)
print('--- vs ---')
cf_approved_women = (cf_df['Gender'] == 1) & (cf_df['LoanApproval'] == 1)
print(int(cf_approved_women.sum()) / len(cf_df) * 100)

13.498898457840976
--- vs ---
21.029441217704786


In [6]:
# --- feature roles ---
feat_trgt = ['LoanApproval']   # target (decision) feature(s)
feat_rlvt = ['AnnualSalary']   # relevant feature(s) used for the neighbor search
# feat_rlvt = ['AnnualSalary', 'AccountBalance']
feat_prot = 'Gender'           # extend later for more than one (intersectionality???)
feat_prot_vals = {'non_prot': 0, 'prot': 1}

# --- future params! ---
# protected_group = {'Gender': 1}
n = 10           # number of neighbors; determine by power analysis? (future extension)
d = 'euclidean'  # ‘manhattan’ - see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.distance_metrics.html

# every column needed downstream: target(s), relevant feature(s), protected feature
feat_list = [*feat_trgt, *feat_rlvt, feat_prot]
feat_list

['LoanApproval', 'AnnualSalary', 'Gender']

Under counterfactual situation testing, for the protected group we need to create two groups: a *control group* (CG) centered on the factuals and a *test group* (TG) centered on the counterfactuals. In practice, this means that we take individual women and compare them to other similar women using some distance $d_1$ to construct CG, while we take their corresponding counterfactuals and compare them to similar men using some distance $d_2$ to construct TG. We start off with $d_1=d_2$. We create the groups using a KNN algorithm.

Under this approach, rather than centering both CG and TG on the same (factual) instance, we construct the hypothetical group (*what would have happened had the female individual been male?*) by allowing all variables to adjust due to the change in $A$. This is our implementation of what Kohler defined as *fairness given the difference* and what she argues through her Eddie Murphy paper.

In [7]:
# 1) Set up the respective search spaces for the control (ctr) and test (tst) groups

# control search space: protected rows of the factual data
is_prot = df[feat_prot] == feat_prot_vals['prot']
search_ctr_group = df.loc[is_prot, feat_list]
print(search_ctr_group.shape)

# test search space: non-protected rows of the counterfactual data
is_non_prot = cf_df[feat_prot] == feat_prot_vals['non_prot']
search_tst_group = cf_df.loc[is_non_prot, feat_list]
print(search_tst_group.shape)

# keep track of old / new indices: the original row labels move into an
# 'org_index' column and each search space gets a fresh 0..k-1 index
search_ctr_group = search_ctr_group.reset_index().rename(columns={'index': 'org_index'})
search_tst_group = search_tst_group.reset_index().rename(columns={'index': 'org_index'})

(1712, 3)
(3281, 3)


In [8]:
# peek at the control search space: 'org_index' holds the row labels from df,
# while the frame's own index is the fresh one created by reset_index()
search_ctr_group.head(5)

Unnamed: 0,org_index,LoanApproval,AnnualSalary,Gender
0,0,-1.0,35000,1
1,4,1.0,201000,1
2,5,1.0,102000,1
3,6,-1.0,68000,1
4,9,-1.0,84500,1


In [10]:
# start the main function here

# store neighbors here: filled further down with one entry per individual
# (key = the individual's index, value = its dict of neighbor DataFrames)
dict_df_neighbors = {}

In [11]:
# 2) For each ind(ividual), set the centers

ind = 0  # start loop or map get_neighboors()

# per-individual store for the neighbors
temp_dict_df_neighbors = dict()

# control center: relevant features of ind in the factual df
center_ctr = df.loc[ind, feat_rlvt]
print(center_ctr)

# test center: relevant features of ind in the counterfactual df
center_tst = cf_df.loc[ind, feat_rlvt]
print(center_tst)

# prepare for knn: kneighbors() expects a 2-D array, so reshape the centers
knn_shape = (1, -1) if len(feat_rlvt) > 1 else (-1, 1)
center_ctr = center_ctr.values.reshape(knn_shape)
center_tst = center_tst.values.reshape(knn_shape)

print(center_ctr)
print(center_tst)

AnnualSalary    35000.0
Name: 0, dtype: float64
AnnualSalary    50796.35
Name: 0, dtype: float64
[[35000.]]
[[50796.35]]


In [12]:
# 3) Control Group for ind

# NOTE: the control search space includes ind itself, so ind shows up as a
# neighbor at distance 0.0; n + 1 neighbors are requested to allow for that
knn_1 = NearestNeighbors(n_neighbors=n + 1, algorithm='ball_tree', metric=d)
knn_1.fit(search_ctr_group[feat_rlvt])
knn_1

NearestNeighbors(algorithm='ball_tree', metric='euclidean', n_neighbors=11)

In [13]:
# query the control-group KNN around the control center; the returned indices
# are positions within the fitted search space, not labels of df
distances_1, indices_1 = knn_1.kneighbors(center_ctr)

In [14]:
# distances from the control center to the neighbors returned by knn_1
distances_1

array([[   0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
         500., 1000.]])

In [15]:
# positions (within search_ctr_group) of the neighbors returned by knn_1
indices_1

array([[   0,  811,  857, 1648,  131,  786,   22,  359, 1247,  887,  932]],
      dtype=int64)

In [16]:
# one row per returned neighbor, ordered from closest to farthest
temp_ctr_df = pd.DataFrame(
    {
        'knn_indices': indices_1[0],
        'knn_distances': distances_1[0],
    }
).sort_values(by='knn_distances', ascending=True)
# HERE we can drop neighbors based on the distance!
temp_ctr_df

Unnamed: 0,knn_indices,knn_distances
0,0,0.0
1,811,0.0
2,857,0.0
3,1648,0.0
4,131,0.0
5,786,0.0
6,22,0.0
7,359,0.0
8,1247,0.0
9,887,500.0


In [17]:
# RECALL: knn_indices refer to positions in the search space, not to the
# original input; joining them against the search-space index brings in
# org_index and the remaining columns
temp_ctr_df = pd.merge(
    temp_ctr_df,
    search_ctr_group,
    how='inner',
    left_on='knn_indices',
    right_index=True,
)
temp_ctr_df

# we can, e.g., test on the features of interest as well for ST!


Unnamed: 0,knn_indices,knn_distances,org_index,LoanApproval,AnnualSalary,Gender
0,0,0.0,0,-1.0,35000,1
1,811,0.0,2355,-1.0,35000,1
2,857,0.0,2489,-1.0,35000,1
3,1648,0.0,4789,-1.0,35000,1
4,131,0.0,369,-1.0,35000,1
5,786,0.0,2276,-1.0,35000,1
6,22,0.0,64,-1.0,35000,1
7,359,0.0,1050,-1.0,35000,1
8,1247,0.0,3591,-1.0,35000,1
9,887,500.0,2568,-1.0,34500,1


In [18]:
# drop ind for ctr! org_index is the one we are using to loop over the search space!!!
# n + 1 neighbors were requested so that n remain once ind is removed. If ind is
# NOT among the returned rows (e.g. more than n exact ties at distance 0), the
# filter removes nothing, so cap the group at n rows. The rows are already
# ordered by knn_distances, so head(n) keeps the n closest ones.
temp_ctr_df = (
    temp_ctr_df[temp_ctr_df['org_index'] != ind]
    .head(n)
    .reset_index(drop=True)
)
temp_ctr_df

Unnamed: 0,knn_indices,knn_distances,org_index,LoanApproval,AnnualSalary,Gender
0,811,0.0,2355,-1.0,35000,1
1,857,0.0,2489,-1.0,35000,1
2,1648,0.0,4789,-1.0,35000,1
3,131,0.0,369,-1.0,35000,1
4,786,0.0,2276,-1.0,35000,1
5,22,0.0,64,-1.0,35000,1
6,359,0.0,1050,-1.0,35000,1
7,1247,0.0,3591,-1.0,35000,1
8,887,500.0,2568,-1.0,34500,1
9,932,1000.0,2697,-1.0,36000,1


In [19]:
# store the control-group neighbors of ind under the 'control' key
temp_dict_df_neighbors['control'] = temp_ctr_df

In [20]:
# clean up: remove the control-group temporaries from the namespace
# (the DataFrame stored in temp_dict_df_neighbors stays alive)
del center_ctr, knn_1, temp_ctr_df
del indices_1, distances_1

In [21]:
# 4) Test Group for ind

# NOTE: unlike the control group, only n neighbors are requested here and no
# row is dropped afterwards (presumably because ind itself is not part of the
# non-protected test search space - TODO confirm)
knn_2 = NearestNeighbors(n_neighbors=n, algorithm='ball_tree', metric=d)
knn_2.fit(search_tst_group[feat_rlvt])
knn_2

NearestNeighbors(algorithm='ball_tree', metric='euclidean', n_neighbors=10)

In [22]:
# query the test-group KNN around the test (counterfactual) center; the returned
# indices are positions within the fitted search space, not labels of cf_df
distances_2, indices_2 = knn_2.kneighbors(center_tst)

In [23]:
# distances from the test center to the neighbors returned by knn_2
distances_2

array([[796.35, 796.35, 796.35, 796.35, 796.35, 796.35, 796.35, 796.35,
        796.35, 796.35]])

In [24]:
# positions (within search_tst_group) of the neighbors returned by knn_2
indices_2

array([[1438, 1488, 1475,  739, 1420,  730,  408, 3274, 2644, 1481]],
      dtype=int64)

In [25]:
# one row per returned neighbor, ordered from closest to farthest
temp_tst_df = pd.DataFrame(
    {
        'knn_indices': indices_2[0],
        'knn_distances': distances_2[0],
    }
).sort_values(by='knn_distances', ascending=True)
# HERE we can drop neighbors based on the distance!
temp_tst_df

Unnamed: 0,knn_indices,knn_distances
0,1438,796.35
1,1488,796.35
2,1475,796.35
3,739,796.35
4,1420,796.35
5,730,796.35
6,408,796.35
7,3274,796.35
8,2644,796.35
9,1481,796.35


In [26]:
# RECALL: knn_indices refer to positions in the search space, not to the
# original input; joining them against the search-space index brings in
# org_index and the remaining columns
temp_tst_df = pd.merge(
    temp_tst_df,
    search_tst_group,
    how='inner',
    left_on='knn_indices',
    right_index=True,
)
temp_tst_df


Unnamed: 0,knn_indices,knn_distances,org_index,LoanApproval,AnnualSalary,Gender
0,1438,796.35,2190,-1.0,50000.0,0
1,1488,796.35,2273,-1.0,50000.0,0
2,1475,796.35,2248,-1.0,50000.0,0
3,739,796.35,1135,-1.0,50000.0,0
4,1420,796.35,2161,-1.0,50000.0,0
5,730,796.35,1120,-1.0,50000.0,0
6,408,796.35,626,-1.0,50000.0,0
7,3274,796.35,4980,-1.0,50000.0,0
8,2644,796.35,4050,-1.0,50000.0,0
9,1481,796.35,2262,-1.0,50000.0,0


In [27]:
# store the test-group neighbors of ind under the 'test' key
temp_dict_df_neighbors['test'] = temp_tst_df

In [28]:
# clean up: remove the test-group temporaries from the namespace
# (the DataFrame stored in temp_dict_df_neighbors stays alive)
del center_tst, knn_2, temp_tst_df
del indices_2, distances_2

In [29]:
# inspect the 'control' and 'test' neighbor frames collected for ind
temp_dict_df_neighbors

{'control':    knn_indices  knn_distances  org_index  LoanApproval  AnnualSalary  Gender
 0          811            0.0       2355          -1.0         35000       1
 1          857            0.0       2489          -1.0         35000       1
 2         1648            0.0       4789          -1.0         35000       1
 3          131            0.0        369          -1.0         35000       1
 4          786            0.0       2276          -1.0         35000       1
 5           22            0.0         64          -1.0         35000       1
 6          359            0.0       1050          -1.0         35000       1
 7         1247            0.0       3591          -1.0         35000       1
 8          887          500.0       2568          -1.0         34500       1
 9          932         1000.0       2697          -1.0         36000       1,
 'test':    knn_indices  knn_distances  org_index  LoanApproval  AnnualSalary  Gender
 0         1438         796.35       2190   

In [30]:
#... later on
# file the control/test neighbors of this individual under its index
dict_df_neighbors[ind] = temp_dict_df_neighbors

In [31]:
# inspect the neighbors stored so far, keyed by individual
dict_df_neighbors

{0: {'control':    knn_indices  knn_distances  org_index  LoanApproval  AnnualSalary  Gender
  0          811            0.0       2355          -1.0         35000       1
  1          857            0.0       2489          -1.0         35000       1
  2         1648            0.0       4789          -1.0         35000       1
  3          131            0.0        369          -1.0         35000       1
  4          786            0.0       2276          -1.0         35000       1
  5           22            0.0         64          -1.0         35000       1
  6          359            0.0       1050          -1.0         35000       1
  7         1247            0.0       3591          -1.0         35000       1
  8          887          500.0       2568          -1.0         34500       1
  9          932         1000.0       2697          -1.0         36000       1,
  'test':    knn_indices  knn_distances  org_index  LoanApproval  AnnualSalary  Gender
  0         1438         796.

In [None]:
# sanity check: the factual row with label 0 (the ind used above)
df.loc[0,]

In [None]:
# sanity check: row 857 of the control search space (857 is one of the knn_indices)
search_ctr_group.loc[857,]

In [None]:
# label 857 in df is NOT position 857 in the search space: knn_indices must be
# mapped through org_index before looking rows up in df
df.loc[857,] # different ppl!!!

In [None]:
# Toy BallTree example on random data, used to check how neighbor queries
# report indices and distances.
from sklearn.neighbors import BallTree
rng = np.random.RandomState(0)  # fixed seed so the printed values are reproducible
X = rng.random_sample((10, 3))
print(X)
print('---')
print(X[:1])
tree = BallTree(X, leaf_size=2)
# NOTE: use demo-specific names. Unpacking into `ind` would overwrite the
# notebook-level `ind` (the individual's index used by the cells above) with
# an array and break those cells if they are re-run afterwards.
toy_dist, toy_ind = tree.query(X[:1], k=3)
print(toy_ind)  # indices of 3 closest neighbors
#[0 3 1]
print(toy_dist)  # distances to 3 closest neighbors
#[ 0.          0.19662693  0.29473397]