Working script for cfST

Used to create the functions from_groups(), test_disc()... that are part of run_cfST()

From Salvatore's paper: "it boils down to the Manhattan distance of z-scores"

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.neighbors import NearestNeighbors
from sklearn import preprocessing

In [2]:
# Build the input/output directories relative to the current working directory.
# NOTE: all code is expected to run from the src folder.
wrk_dir = os.getcwd()
# os.path.join uses the separator of the host OS (the hard-coded '\\' only worked on Windows).
# The trailing '' component keeps a trailing separator, so `data_path + 'file.csv'` keeps working.
# data path
data_path = os.path.join(wrk_dir, 'data', '')
# results path
resu_path = os.path.join(wrk_dir, 'results', 'counterfactuals', '')

In [3]:
# Load the factual data and its counterfactual counterpart (both pipe-delimited),
# printing the shape of each frame right after it is read.
factual_csv = data_path + 'Karimi2020_v2.csv'
counterfactual_csv = resu_path + 'cf_Karimi2020_v2.csv'

df = pd.read_csv(factual_csv, delimiter='|')
print(df.shape)
cf_df = pd.read_csv(counterfactual_csv, delimiter='|')
print(cf_df.shape)

(4993, 6)
(4993, 6)


In [4]:
# Percentage of rows in df with Gender == 1 (women), rounded to 3 decimals.
is_woman = df['Gender'] == 1
round(len(df[is_woman]) / len(df) * 100, 3)

34.288

In [5]:
# plt.hist(df['LoanApproval'])
# plt.hist(cf_df['LoanApproval']) # number of denied loans drops!

In [6]:
# Share (in %) of all rows that are women (Gender == 1) with an approved loan (LoanApproval == 1):
# factual data first, counterfactual data second.
approved_women = (df['Gender'] == 1) & (df['LoanApproval'] == 1)
cf_approved_women = (cf_df['Gender'] == 1) & (cf_df['LoanApproval'] == 1)

print(len(df[approved_women]) / len(df) * 100)
print('--- vs ---')
print(len(cf_df[cf_approved_women]) / len(cf_df) * 100)

13.498898457840976
--- vs ---
21.029441217704786


In [7]:
# --- feature roles ---
# target (decision outcome) column
feat_trgt = ['LoanApproval']
# columns used to measure similarity between individuals
# (single-feature alternative: ['AnnualSalary'])
feat_rlvt = ['AnnualSalary', 'AccountBalance']
# protected attribute; only one for now (extension to several / intersectionality is an open question)
feat_prot = 'Gender'
# encoding of the protected attribute
feat_prot_vals = dict(non_prot=0, prot=1)

# --- future params ---
# protected_group = {'Gender': 1}
# number of neighbors per group; could be determined by power analysis (future extension)
n = 10
# distance metric name, see
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.distance_metrics.html
d = 'manhattan'

# every column carried along: target, relevant features, protected attribute (in that order)
feat_list = [*feat_trgt, *feat_rlvt, feat_prot]
feat_list

['LoanApproval', 'AnnualSalary', 'AccountBalance', 'Gender']

In [52]:
# TODO: add a proper preprocessing step with two options: standardization and feature weighting.

# When True, the search spaces are z-scored before the neighbor search.
standardize = False
# Feature weights; None disables weighting.
# Planned: a dict of per-feature weights, aborting if its length differs from len(feat_rlvt).
weights = None

Under counterfactual situation testing, for the protected group we need to create two groups: a *control group* (CG) centered on the factuals and a *test group* (TG) centered on the counterfactuals. In practice, this means that we take individual women and compare them to other similar women using some distance $d_1$ to construct CG, while we take their corresponding counterfactuals and compare them to similar men using some distance $d_2$ to construct TG. We start off with $d_1=d_2$. We create both groups using a k-nearest-neighbors (KNN) search.

Under this approach, rather than centering both CG and TG on the same (factual) instance, we construct the hypothetical group (*what would have happened had the female individual been male?*) by allowing all variables to adjust to the change in $A$. This is our implementation of what Kohler defined as *fairness given the difference* and what she argues for in her Eddie Murphy paper.

In [53]:
# 1) Set up the respective search spaces for the control (ctr) and test (tst) groups

# control: protected rows of the factual data; test: non-protected rows of the counterfactual data
is_prot = df[feat_prot] == feat_prot_vals['prot']
is_non_prot_cf = cf_df[feat_prot] == feat_prot_vals['non_prot']
df_ctr = df.loc[is_prot, feat_list].copy()
df_tst = cf_df.loc[is_non_prot_cf, feat_list].copy()
# df and cf_df cannot be deleted here: they are still needed later to read the centers
# (a more memory-efficient approach is left for later)

# each search space is the group-specific frame restricted to the relevant features;
# it keeps the original row labels as its index
search_ctr_group = df_ctr[feat_rlvt].copy()
print(search_ctr_group.shape)

search_tst_group = df_tst[feat_rlvt].copy()
print(search_tst_group.shape)

# re-index both group frames 0..m-1 and keep the original row label in 'org_index'
df_ctr = df_ctr.reset_index().rename(columns={'index': 'org_index'})
df_tst = df_tst.reset_index().rename(columns={'index': 'org_index'})

(1712, 2)
(3281, 2)


In [54]:
# preview the first five rows of the control-group frame
df_ctr.head()

Unnamed: 0,org_index,LoanApproval,AnnualSalary,AccountBalance,Gender
0,0,-1.0,35000,7947.67809,1
1,4,1.0,201000,59008.567839,1
2,5,1.0,102000,26000.811689,1
3,6,-1.0,68000,15064.447848,1
4,9,-1.0,84500,21275.931413,1


In [55]:
# preview the first five rows of the test-group frame
df_tst.head()

Unnamed: 0,org_index,LoanApproval,AnnualSalary,AccountBalance,Gender
0,1,1.0,120000.0,36940.1,0
1,2,-1.0,90000.0,23564.13,0
2,3,-1.0,80000.0,27596.57,0
3,7,1.0,140000.0,45716.98,0
4,8,-1.0,70000.0,22540.65,0


In [56]:
# Optional preprocessing of the two search spaces.
if standardize:
    print('standardizing')

    # The same scaler object is re-fitted on each group, so each search space is
    # z-scored with its own mean / standard deviation; index and column labels are kept.
    # NOTE: re-running this cell standardizes the already standardized frames again.
    scaler = preprocessing.StandardScaler()

    search_ctr_group = pd.DataFrame(
        scaler.fit_transform(search_ctr_group),
        index=search_ctr_group.index,
        columns=search_ctr_group.columns,
    )
    search_tst_group = pd.DataFrame(
        scaler.fit_transform(search_tst_group),
        index=search_tst_group.index,
        columns=search_tst_group.columns,
    )

# Weighting is not implemented yet: only the intent is logged.
if weights:
    print('weighting')


In [30]:
# # keep track of old / new indices: KNN seems to return indices that are positional w.r.t. the fitted search space (?)
# search_ctr_group.reset_index(inplace=True, )
# search_ctr_group.rename(columns={'index': 'org_index'}, inplace=True)

# search_tst_group.reset_index(inplace=True, )
# search_tst_group.rename(columns={'index': 'org_index'}, inplace=True)

In [57]:
# preview the control search space (its index still holds the original row labels)
search_ctr_group.head()

Unnamed: 0,AnnualSalary,AccountBalance
0,35000,7947.67809
4,201000,59008.567839
5,102000,26000.811689
6,68000,15064.447848
9,84500,21275.931413


In [58]:
# preview the test search space (its index still holds the original row labels)
search_tst_group.head()

Unnamed: 0,AnnualSalary,AccountBalance
1,120000.0,36940.1
2,90000.0,23564.13
3,80000.0,27596.57
7,140000.0,45716.98
8,70000.0,22540.65


In [59]:
# The main function starts here.

# Container for the neighbors found for each individual.
dict_df_neighbors = dict()

In [60]:
# 2) For each ind(ividual), set the centers of the two neighbor searches

ind = 0 # start loop or map get_neighbors()

# stores the neighbors of this individual
temp_dict_df_neighbors = {}

# control center: the individual's relevant features in the factual df
center_ctr = df.loc[ind, feat_rlvt]
print(center_ctr)

# test center: the same individual's relevant features in the counterfactual df
center_tst = cf_df.loc[ind, feat_rlvt]
print(center_tst)

# prepare for knn: kneighbors expects a 2D array of shape (n_queries, n_features).
# reshape(1, -1) yields a single query row for one or many relevant features alike,
# so no special case is needed for len(feat_rlvt) == 1.
center_ctr = center_ctr.values.reshape(1, -1)
center_tst = center_tst.values.reshape(1, -1)

# When the search spaces are standardized, the centers must live in the same z-score space;
# otherwise raw values would be compared against standardized ones.
# Each search space is standardized on its own group, so the matching statistics are
# recomputed from the raw relevant features still held in df_ctr / df_tst.
if standardize:
    center_ctr = preprocessing.StandardScaler().fit(df_ctr[feat_rlvt].to_numpy()).transform(center_ctr)
    center_tst = preprocessing.StandardScaler().fit(df_tst[feat_rlvt].to_numpy()).transform(center_tst)

print(center_ctr)
print(center_tst)

AnnualSalary      35000.00000
AccountBalance     7947.67809
Name: 0, dtype: float64
AnnualSalary      50796.35
AccountBalance    13852.05
Name: 0, dtype: float64
[[35000.          7947.67808994]]
[[50796.35 13852.05]]


In [61]:
# 3) Control group for ind

# NOTE: the control search space is expected to contain ind itself, which then shows up as a
# neighbor at distance 0.0; hence n + 1 neighbors are requested.
knn_1 = NearestNeighbors(n_neighbors=n + 1, algorithm='ball_tree', metric=d)
knn_1.fit(search_ctr_group[feat_rlvt])
knn_1

NearestNeighbors(algorithm='ball_tree', metric='manhattan', n_neighbors=11)

In [62]:
# query the fitted control search space around the control center
distances_1, indices_1 = knn_1.kneighbors(X=center_ctr, return_distance=True)

In [63]:
# One row per returned neighbor (first and only query row), ordered from closest to farthest.
# Neighbors could also be dropped here based on their distance.
temp_ctr_df = pd.DataFrame(
    {'knn_indices': indices_1[0], 'knn_distances': distances_1[0]}
).sort_values(by='knn_distances', ascending=True)
temp_ctr_df

Unnamed: 0,knn_indices,knn_distances
0,0,0.0
1,1247,123.29308
2,1648,242.26211
3,811,672.139459
4,887,865.978984
5,1020,1603.384573
6,22,1953.770991
7,100,2266.065566
8,714,2272.658942
9,1337,2342.091674


In [64]:
# knn_indices are positions within the fitted search space, not labels of the original input.
# df_ctr was re-indexed 0..m-1 in the same row order, so joining on its index brings in
# org_index, the outcome and the features of every neighbor.
# Ideas: the merged frame could also be used to test on the features of interest for ST;
# joining with df instead would bring in the original X and Y as well.
temp_ctr_df = pd.merge(
    left=temp_ctr_df,
    right=df_ctr,
    how='inner',
    left_on='knn_indices',
    right_index=True,
)
temp_ctr_df

Unnamed: 0,knn_indices,knn_distances,org_index,LoanApproval,AnnualSalary,AccountBalance,Gender
0,0,0.0,0,-1.0,35000,7947.67809,1
1,1247,123.29308,3591,-1.0,35000,7824.38501,1
2,1648,242.26211,4789,-1.0,35000,7705.41598,1
3,811,672.139459,2355,-1.0,35000,8619.817549,1
4,887,865.978984,2568,-1.0,34500,7581.699106,1
5,1020,1603.384573,2956,-1.0,36500,7844.293517,1
6,22,1953.770991,64,-1.0,35000,9901.449081,1
7,100,2266.065566,279,-1.0,36500,7181.612524,1
8,714,2272.658942,2082,-1.0,33500,8720.337032,1
9,1337,2342.091674,3838,-1.0,33500,7105.586416,1


In [65]:
# Remove ind itself from its control group. org_index holds the original row label,
# which is what ind refers to when looping over individuals.
not_ind = temp_ctr_df['org_index'].ne(ind)
temp_ctr_df = temp_ctr_df.loc[not_ind].reset_index(drop=True)
temp_ctr_df

Unnamed: 0,knn_indices,knn_distances,org_index,LoanApproval,AnnualSalary,AccountBalance,Gender
0,1247,123.29308,3591,-1.0,35000,7824.38501,1
1,1648,242.26211,4789,-1.0,35000,7705.41598,1
2,811,672.139459,2355,-1.0,35000,8619.817549,1
3,887,865.978984,2568,-1.0,34500,7581.699106,1
4,1020,1603.384573,2956,-1.0,36500,7844.293517,1
5,22,1953.770991,64,-1.0,35000,9901.449081,1
6,100,2266.065566,279,-1.0,36500,7181.612524,1
7,714,2272.658942,2082,-1.0,33500,8720.337032,1
8,1337,2342.091674,3838,-1.0,33500,7105.586416,1
9,64,2518.907632,204,-1.0,33500,6928.770458,1


In [66]:
# If more than n neighbors remain (ind was not among the returned neighbors), drop the last row.
if len(temp_ctr_df) > n:
    print(temp_ctr_df.shape)
    temp_ctr_df = temp_ctr_df.drop(index=temp_ctr_df.index[-1])
    print(temp_ctr_df.shape)

In [67]:
# keep the control group of this individual under the 'control' key
temp_dict_df_neighbors.update(control=temp_ctr_df)

In [68]:
# release the control-group intermediates (the group itself is already stored)
del center_ctr, knn_1, temp_ctr_df
del indices_1, distances_1

In [69]:
# 4) Test group for ind (use the corresponding search space: men!)

# NOTE: unlike for the control group, exactly n neighbors are requested here and no
# self-match is removed afterwards (presumably ind is not part of the test search space — confirm).
knn_2 = NearestNeighbors(n_neighbors=n, algorithm='ball_tree', metric=d)
knn_2.fit(search_tst_group[feat_rlvt])
knn_2

NearestNeighbors(algorithm='ball_tree', metric='manhattan', n_neighbors=10)

In [70]:
# query the fitted test search space around the counterfactual center
distances_2, indices_2 = knn_2.kneighbors(X=center_tst, return_distance=True)

In [71]:
# One row per returned neighbor (first and only query row), ordered from closest to farthest.
# Neighbors could also be dropped here based on their distance.
temp_tst_df = pd.DataFrame(
    {'knn_indices': indices_2[0], 'knn_distances': distances_2[0]}
).sort_values(by='knn_distances', ascending=True)
temp_tst_df

Unnamed: 0,knn_indices,knn_distances
0,2218,815.52
1,2221,818.85
2,612,826.53
3,453,836.57
4,2958,857.29
5,65,895.99
6,2153,960.01
7,1438,977.93
8,2507,999.43
9,300,1033.39


In [72]:
# knn_indices are positions within the fitted search space, not labels of the original input.
# df_tst was re-indexed 0..m-1 in the same row order, so joining on its index brings in
# org_index, the outcome and the features of every neighbor.
temp_tst_df = pd.merge(
    left=temp_tst_df,
    right=df_tst,
    how='inner',
    left_on='knn_indices',
    right_index=True,
)
temp_tst_df


Unnamed: 0,knn_indices,knn_distances,org_index,LoanApproval,AnnualSalary,AccountBalance,Gender
0,2218,815.52,3404,-1.0,50000.0,13832.88,0
1,2221,818.85,3409,-1.0,50000.0,13829.55,0
2,612,826.53,939,-1.0,50000.0,13821.87,0
3,453,836.57,700,-1.0,50000.0,13811.83,0
4,2958,857.29,4516,-1.0,50000.0,13791.11,0
5,65,895.99,100,-1.0,50000.0,13752.41,0
6,2153,960.01,3283,-1.0,50000.0,13688.39,0
7,1438,977.93,2190,-1.0,50000.0,13670.47,0
8,2507,999.43,3852,-1.0,50000.0,14055.13,0
9,300,1033.39,468,-1.0,50000.0,13615.01,0


In [73]:
# keep the test group of this individual under the 'test' key
temp_dict_df_neighbors.update(test=temp_tst_df)

In [74]:
# release the test-group intermediates (the group itself is already stored)
del center_tst, knn_2, temp_tst_df
del indices_2, distances_2

In [75]:
# inspect both neighbor groups ('control' and 'test') collected for the current individual
temp_dict_df_neighbors

{'control':    knn_indices  knn_distances  org_index  LoanApproval  AnnualSalary  \
 0         1247     123.293080       3591          -1.0         35000   
 1         1648     242.262110       4789          -1.0         35000   
 2          811     672.139459       2355          -1.0         35000   
 3          887     865.978984       2568          -1.0         34500   
 4         1020    1603.384573       2956          -1.0         36500   
 5           22    1953.770991         64          -1.0         35000   
 6          100    2266.065566        279          -1.0         36500   
 7          714    2272.658942       2082          -1.0         33500   
 8         1337    2342.091674       3838          -1.0         33500   
 9           64    2518.907632        204          -1.0         33500   
 
    AccountBalance  Gender  
 0     7824.385010       1  
 1     7705.415980       1  
 2     8619.817549       1  
 3     7581.699106       1  
 4     7844.293517       1  
 5     990

In [76]:
# ... later on: file the groups of this individual under its index
dict_df_neighbors.update({ind: temp_dict_df_neighbors})

In [77]:
# Recall that ind is the same row label in df and cf_df (both centers were read with it),
# so it identifies the individual for both the control and the test group stored here.
dict_df_neighbors

{0: {'control':    knn_indices  knn_distances  org_index  LoanApproval  AnnualSalary  \
  0         1247     123.293080       3591          -1.0         35000   
  1         1648     242.262110       4789          -1.0         35000   
  2          811     672.139459       2355          -1.0         35000   
  3          887     865.978984       2568          -1.0         34500   
  4         1020    1603.384573       2956          -1.0         36500   
  5           22    1953.770991         64          -1.0         35000   
  6          100    2266.065566        279          -1.0         36500   
  7          714    2272.658942       2082          -1.0         33500   
  8         1337    2342.091674       3838          -1.0         33500   
  9           64    2518.907632        204          -1.0         33500   
  
     AccountBalance  Gender  
  0     7824.385010       1  
  1     7705.415980       1  
  2     8619.817549       1  
  3     7581.699106       1  
  4     7844.293517

In [None]:
# 4) We need to start testing each group: do now for ind

In [None]:
# factual record of the individual with row label 0
df.loc[0]

In [None]:
# label-based lookup in the control search space (raises KeyError if label 857 is not in that group)
search_ctr_group.loc[857]

In [None]:
# Originally flagged as a different person than search_ctr_group.loc[857].
# NOTE(review): presumably observed while the search-space index had been reset — confirm.
df.loc[857]

In [None]:
# Stand-alone sanity check of how a BallTree query reports neighbors, on random data.
from sklearn.neighbors import BallTree

rng = np.random.RandomState(0)
X = rng.random_sample((10, 3))
print(X)
print('---')
print(X[:1])
tree = BallTree(X, leaf_size=2)
# Demo-specific names are used on purpose: unpacking into `ind` would overwrite the
# notebook-level individual index used by df.loc[ind, ...] and dict_df_neighbors[ind].
demo_dist, demo_ind = tree.query(X[:1], k=3)
print(demo_ind)  # indices of 3 closest neighbors
#[0 3 1]
print(demo_dist)  # distances to 3 closest neighbors
#[ 0.          0.19662693  0.29473397]