Working script for cfST

Used to create the functions from_groups(), test_disc()... part of run_cfST()

From Salvatore's paper: "it boils down to the Manhattan distance of z-scores"

In [1]:
import os
import pandas as pd
import numpy as np
import math
import sys
import matplotlib.pyplot as plt
import scipy.stats as st

from sklearn import preprocessing
from sklearn.neighbors import NearestNeighbors

In [2]:
# set working directory - all paths below are resolved relative to the current working directory
# NOTE(review): the original note said the code runs from the src folder; confirm the intended launch directory
wrk_dir = os.getcwd()
print(wrk_dir)
# data path (built with os.path.join so it works on any OS; the trailing '' keeps the
# final separator because file names are appended with `+` further down)
data_path = os.path.join(wrk_dir, 'data', '')
# results path
resu_path = os.path.join(wrk_dir, 'results', 'counterfactuals', '')

C:\Users\Jose Alvarez\Documents\Projects\CounterfactualSituationTesting


In [3]:
# factual data: pipe-delimited file in the data folder
df = pd.read_csv(f"{data_path}Karimi2020_v2.csv", sep='|')
print(df.shape)
# counterfactual data: pipe-delimited file in the results folder
cf_df = pd.read_csv(f"{resu_path}cf_Karimi2020_v2.csv", sep='|')
print(cf_df.shape)

(4993, 6)
(4993, 6)


In [None]:
# percentage of rows in df with Gender == 1 (women), rounded to 3 decimals
n_women = df[df['Gender'] == 1].shape[0]
round(n_women / df.shape[0] * 100, 3)
#df[df['Gender']==0].shape[0]/df.shape[0]*100

In [None]:
# plt.hist(df['LoanApproval'])
# plt.hist(cf_df['LoanApproval']) # number of denied loans drops!

In [None]:
# percentage of all rows that are women (Gender == 1) with an approved loan (LoanApproval == 1):
# factual data first, counterfactual data second
approved_women = (df['Gender'] == 1) & (df['LoanApproval'] == 1)
print(df[approved_women].shape[0] / df.shape[0] * 100)
print('--- vs ---')
cf_approved_women = (cf_df['Gender'] == 1) & (cf_df['LoanApproval'] == 1)
print(cf_df[cf_approved_women].shape[0] / cf_df.shape[0] * 100)

In [4]:
# --- features ---
# target column and its labels: 'pos' is the desired label, 'neg' the undesired one
feat_trgt = ['LoanApproval']
feat_trgt_vals = {'pos': 1, 'neg': -1}
# columns used to measure similarity between individuals
# feat_rlvt = ['AnnualSalary']
feat_rlvt = ['AnnualSalary', 'AccountBalance']
# protected attribute and the labels of the non-protected / protected groups
feat_prot = 'Gender'
feat_prot_vals = {'non_protected': 0, 'protected': 1}

# future params!
# protected_group = {'Gender': 1}

# --- neighborhood search ---
# neighborhood size; determine by power analysis? (future extension)
n = 10
# distance metric name, see
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.distance_metrics.html
d = 'manhattan'

# optional per-feature weights, e.g. {'AnnualSalary': 5, 'AccountBalance': 1}
weights = None
standardize = True

# every column needed downstream: target, relevant features, protected attribute
feat_list = [*feat_trgt, *feat_rlvt, feat_prot]
feat_list

['LoanApproval', 'AnnualSalary', 'AccountBalance', 'Gender']

Under counterfactual situation testing, for the protected group we need to create two groups: a *control group* (CG) centered on the factuals and a *test group* (TG) centered on the counterfactuals. In practice, this means that we take individual women and compare them to other similar women using some distance $d_1$ to construct CG, while we take their corresponding counterfactuals and compare them to similar men using some distance $d_2$ to construct TG. We start off with $d_1=d_2$. We create the groups using a KNN algorithm.

Under this approach, rather than centering both CG and TG on the same (factual) instance, we construct the hypothetical group (*what would have happened had the female individual been male?*) by allowing all variables to adjust due to the change in $A$. This is our implementation of what Kohler defined as *fairness given the difference* and what she argues for in her Eddie Murphy paper.

In [63]:
from src.cf_situation_testing import get_k_neighbors

# run the packaged neighbor search with the settings from the config cell.
# `standardize` is taken from the config variable instead of a hard-coded True,
# so changing the config actually changes this run.
# The recorded output shows a dict keyed by individual index, each holding a
# 'control' and a 'test' table of neighbors.
test_knn = get_k_neighbors(
    df=df, cf_df=cf_df, k=n,
    feat_trgt=feat_trgt, feat_trgt_vals=feat_trgt_vals, feat_rlvt=feat_rlvt,
    feat_prot=feat_prot, feat_prot_vals=feat_prot_vals,
    standardize=standardize,
)

test_knn

  1%|▍                                                                                | 9/1712 [00:00<00:20, 85.13it/s]

standardizing


100%|██████████████████████████████████████████████████████████████████████████████| 1712/1712 [00:18<00:00, 93.76it/s]


{0: {'control':     knn_indices  knn_distances  org_index
  0             0       0.000000          0
  1          1247       0.011917       3591
  2          1648       0.023416       4789
  3           887       0.050495       2568
  4          1020       0.055355       2956
  5           811       0.064967       2355
  6          1407       0.117550       4055
  7          1042       0.118574       3019
  8           100       0.119408        279
  9           714       0.120045       2082
  10         1337       0.126756       3838,
  'test':     knn_indices  knn_distances  org_index
  0             0       0.000000          0
  1          2219       0.026652       3404
  2          2222       0.026986       3409
  3           613       0.027757        939
  4           454       0.028766        700
  5          2959       0.030846       4516
  6            66       0.034732        100
  7          2154       0.041160       3283
  8          1439       0.042960       2190
  9      

In [66]:
# inspect the control-group neighbor table stored under key 6
test_knn[6]['control']

Unnamed: 0,knn_indices,knn_distances,org_index
0,3,0.0,6
1,464,0.015986,1334
2,1137,0.037291,3294
3,1676,0.039675,4886
4,51,0.059112,161
5,1361,0.067358,3900
6,38,0.081883,118
7,1636,0.084567,4754
8,762,0.102907,2221
9,1219,0.105018,3502


In [22]:
# key: individuals have the same index across df and cf_df
# index labels (not positions) of the protected and non-protected rows of df
is_protected = df[feat_prot] == feat_prot_vals['protected']
is_non_protected = df[feat_prot] == feat_prot_vals['non_protected']
protected_indices = df[is_protected].index.to_list()
non_protected_indices = df[is_non_protected].index.to_list()

In [23]:
# 1) Set up the respective search spaces for the control (ctr) and test (tst) groups,
#    restricted to the relevant features
# control: factual df
search_ctr_group = df.loc[:, feat_rlvt].copy()
print(search_ctr_group.shape)
# test: counterfactual df
search_tst_group = cf_df.loc[:, feat_rlvt].copy()
print(search_tst_group.shape)

(4993, 2)
(4993, 2)


In [24]:
# if we pre-process, the search spaces must include the centers
# TODO: do we normalize after or before partitioning by A? decide in previous cell
if standardize:
    print('standardizing')

    scaler = preprocessing.StandardScaler()

    # the scaler is re-fit on each search space, so the factual and the counterfactual
    # features are each standardized with their own statistics
    search_ctr_group = pd.DataFrame(
        scaler.fit_transform(search_ctr_group),
        index=search_ctr_group.index,
        columns=search_ctr_group.columns,
    )
    search_tst_group = pd.DataFrame(
        scaler.fit_transform(search_tst_group),
        index=search_tst_group.index,
        columns=search_tst_group.columns,
    )

if weights:
    print('weighting')

    # validate the keys (not just the count) before touching the data: a misspelled
    # feature name would otherwise fail midway and leave the frames partly weighted.
    # Raise instead of sys.exit so the notebook reports a normal error.
    if set(weights) != set(feat_rlvt):
        raise ValueError('provide a weight for each relevant feature')

    for feat_weight, feat_weight_value in weights.items():
        print(feat_weight)
        search_ctr_group[feat_weight] = feat_weight_value * search_ctr_group[feat_weight]
        search_tst_group[feat_weight] = feat_weight_value * search_tst_group[feat_weight]

standardizing


In [25]:
# protected_indices holds index *labels* taken from df, so rows are selected by label
# (.loc); positional .iloc only gave the same rows while the index was 0..N-1.
# search_ctr_group will always include the ctr center
centers_ctr = search_ctr_group.loc[protected_indices].copy()

# control search space: protected individuals only, with the original label kept in
# an 'org_index' column and a fresh positional index (what the KNN will return).
# NOTE: this cell overwrites search_ctr_group, so it must not be re-run on its own.
search_ctr_group = (
    search_ctr_group.loc[protected_indices]
    .rename_axis('org_index')
    .reset_index()
)

print(search_ctr_group.shape)
search_ctr_group.head(5)

(1712, 3)


Unnamed: 0,org_index,AnnualSalary,AccountBalance
0,0,-1.821729,-1.955836
1,4,3.198387,2.979531
2,5,0.204463,-0.210883
3,6,-0.823754,-1.267954
4,9,-0.324766,-0.667573


In [26]:
# preview the control (factual) centers; the index still carries the original labels
centers_ctr.head(5)

Unnamed: 0,AnnualSalary,AccountBalance
0,-1.821729,-1.955836
4,3.198387,2.979531
5,0.204463,-0.210883
6,-0.823754,-1.267954
9,-0.324766,-0.667573


In [27]:
# test (counterfactual) centers of the protected individuals; protected_indices holds
# index labels from df, so select by label (.loc) rather than by position (.iloc)
centers_tst = search_tst_group.loc[protected_indices].copy()

# define the search space within the loop (unfortunately...)
# search_tst_group = search_tst_group.iloc[non_protected_indices].copy()
# search_tst_group.reset_index(inplace=True, )
# search_tst_group.rename(columns={'index': 'org_index'}, inplace=True)

# print(search_tst_group.shape)
# search_tst_group.head(5)

In [28]:
# preview the test (counterfactual) centers; the index still carries the original labels
centers_tst.head(5)

Unnamed: 0,AnnualSalary,AccountBalance
0,-1.548143,-1.642224
4,3.606234,3.484847
5,0.532238,0.170509
6,-0.523478,-0.927622
9,-0.011145,-0.303922


In [12]:
# start the main function here...

# store neighbors here: filled further down as dict_df_neighbors[ind] = {'control': ..., 'test': ...}
dict_df_neighbors = {}

In [13]:
# 2) For each ind(ividual) set the centers

ind = 0 # start loop: for ind in protected_indices:

# holds ind's control and test neighborhoods
temp_dict_df_neighbors = {}

# control center: ind's row in the factual centers (looked up by index label)
ind_center_ctr = centers_ctr.loc[ind]
print(ind_center_ctr)

# test center: ind's row in the counterfactual centers
ind_center_tst = centers_tst.loc[ind]
print(ind_center_tst)

# prepare for knn: the query must be a 2D array with a single row.
# reshape(1, -1) covers both the multi-feature case and the single-feature case
# (a length-1 vector becomes shape (1, 1) either way).
ind_center_ctr = ind_center_ctr.to_numpy().reshape(1, -1)
ind_center_tst = ind_center_tst.to_numpy().reshape(1, -1)

print(ind_center_ctr)
print(ind_center_tst)

AnnualSalary     -1.821729
AccountBalance   -1.955836
Name: 0, dtype: float64
AnnualSalary     -1.548143
AccountBalance   -1.642224
Name: 0, dtype: float64
[[-1.82172855 -1.95583556]]
[[-1.54814327 -1.64222367]]


In [14]:
# 3) Control Group for ind

# NOTE: the control search space contains ind itself, so ind comes back as its own
# neighbor at distance 0.0 - hence n + 1 neighbors are requested
knn_1 = NearestNeighbors(n_neighbors=n + 1, algorithm='ball_tree', metric=d)
knn_1.fit(search_ctr_group[feat_rlvt])
knn_1

NearestNeighbors(algorithm='ball_tree', metric='manhattan', n_neighbors=11)

In [15]:
# query the fitted control KNN with ind's factual center; the returned indices are
# row positions in search_ctr_group, not the original df labels
distances_1, indices_1 = knn_1.kneighbors(ind_center_ctr)

In [16]:
# one row per control neighbor (single query point, hence element [0]), closest first
temp_ctr_df = pd.DataFrame({
    'knn_indices': indices_1[0],
    'knn_distances': distances_1[0],
})
temp_ctr_df = temp_ctr_df.sort_values(by='knn_distances', ascending=True)

# HERE we can drop neighbors based on the distance!
temp_ctr_df

Unnamed: 0,knn_indices,knn_distances
0,0,0.0
1,1247,0.011917
2,1648,0.023416
3,887,0.050495
4,1020,0.055355
5,811,0.064967
6,1407,0.11755
7,1042,0.118574
8,100,0.119408
9,714,0.120045


In [17]:
# RECALL knn_indices are positions within the search space, not indices of the original input:
# map them back by joining on the search space's positional index
temp_ctr_df = pd.merge(
    temp_ctr_df,
    search_ctr_group[['org_index']],
    how='inner',
    left_on='knn_indices',
    right_index=True,
)

temp_ctr_df

# todo: we can, e.g., test on the features of interest as well for ST!


Unnamed: 0,knn_indices,knn_distances,org_index
0,0,0.0,0
1,1247,0.011917,3591
2,1648,0.023416,4789
3,887,0.050495,2568
4,1020,0.055355,2956
5,811,0.064967,2355
6,1407,0.11755,4055
7,1042,0.118574,3019
8,100,0.119408,279
9,714,0.120045,2082


In [None]:
# let's not drop it for now...
# drop ind for ctr! org index is the one we are using to loop over the search space!!! | TODO should I keep the center?
# temp_ctr_df = temp_ctr_df[temp_ctr_df['org_index'] != ind].reset_index(drop=True)

# temp_ctr_df

In [None]:
# if temp_ctr_df.shape[0] > n:
#     print(temp_ctr_df.shape)
#     temp_ctr_df.drop(temp_ctr_df.tail(1).index,inplace=True)
#     print(temp_ctr_df.shape)

In [18]:
# store ind's control-group neighbors under the 'control' key
temp_dict_df_neighbors['control'] = temp_ctr_df

In [19]:
# clean up the control-group intermediates (the neighbor table itself stays
# reachable through temp_dict_df_neighbors['control'])
del ind_center_ctr, knn_1, temp_ctr_df, indices_1, distances_1

In [33]:
# 3) Test Group for ind

# search space = ind's own counterfactual row followed by the counterfactual rows of
# the non-protected individuals. `ind` and non_protected_indices are index labels,
# so select by label (.loc); positional .iloc only matched on a 0..N-1 index.
# The original label is kept in 'org_index'; the new index is positional.
temp_search_tst_group = (
    search_tst_group.loc[[ind] + non_protected_indices]
    .rename_axis('org_index')
    .reset_index()
)

temp_search_tst_group.head(5)

Unnamed: 0,org_index,AnnualSalary,AccountBalance
0,0,-1.548143,-1.642224
1,1,0.600662,0.676069
2,2,-0.330852,-0.667025
3,3,-0.641356,-0.262124
4,7,1.221672,1.557363


In [34]:
# the test search space also contains ind's counterfactual, hence n + 1 neighbors
knn_2 = NearestNeighbors(n_neighbors=n + 1, algorithm='ball_tree', metric=d)
knn_2.fit(temp_search_tst_group[feat_rlvt])
knn_2

NearestNeighbors(algorithm='ball_tree', metric='manhattan', n_neighbors=11)

In [37]:
# query the fitted test KNN with ind's counterfactual center; the returned indices are
# row positions in temp_search_tst_group, not the original df labels
distances_2, indices_2 = knn_2.kneighbors(ind_center_tst)

In [38]:
# one row per test neighbor (single query point, hence element [0]), closest first
temp_tst_df = pd.DataFrame({
    'knn_indices': indices_2[0],
    'knn_distances': distances_2[0],
})
temp_tst_df = temp_tst_df.sort_values(by='knn_distances', ascending=True)

# HERE we can drop neighbors based on the distance!
temp_tst_df

Unnamed: 0,knn_indices,knn_distances
0,0,0.0
1,2219,0.026652
2,2222,0.026986
3,613,0.027757
4,454,0.028766
5,2959,0.030846
6,66,0.034732
7,2154,0.04116
8,1439,0.04296
9,2508,0.045118


In [40]:
# RECALL knn_indices are positions within the search space, not indices of the original input;
# we can use the knn_indices to get the rest of the information
temp_tst_df = pd.merge(
    temp_tst_df,
    temp_search_tst_group[['org_index']],
    how='inner',
    left_on='knn_indices',
    right_index=True,
)

temp_tst_df

Unnamed: 0,knn_indices,knn_distances,org_index
0,0,0.0,0
1,2219,0.026652,3404
2,2222,0.026986,3409
3,613,0.027757,939
4,454,0.028766,700
5,2959,0.030846,4516
6,66,0.034732,100
7,2154,0.04116,3283
8,1439,0.04296,2190
9,2508,0.045118,3852


In [41]:
# store ind's test-group neighbors under the 'test' key
temp_dict_df_neighbors['test'] = temp_tst_df

In [43]:
# clean up the test-group intermediates (the neighbor table itself stays
# reachable through temp_dict_df_neighbors['test'])
del ind_center_tst, knn_2, temp_tst_df, indices_2, distances_2, temp_search_tst_group

In [44]:
# inspect ind's neighborhoods: {'control': <neighbor table>, 'test': <neighbor table>}
temp_dict_df_neighbors

{'control':     knn_indices  knn_distances  org_index
 0             0       0.000000          0
 1          1247       0.011917       3591
 2          1648       0.023416       4789
 3           887       0.050495       2568
 4          1020       0.055355       2956
 5           811       0.064967       2355
 6          1407       0.117550       4055
 7          1042       0.118574       3019
 8           100       0.119408        279
 9           714       0.120045       2082
 10         1337       0.126756       3838,
 'test':     knn_indices  knn_distances  org_index
 0             0       0.000000          0
 1          2219       0.026652       3404
 2          2222       0.026986       3409
 3           613       0.027757        939
 4           454       0.028766        700
 5          2959       0.030846       4516
 6            66       0.034732        100
 7          2154       0.041160       3283
 8          1439       0.042960       2190
 9          2508       0.045118   

In [45]:
#... later on
# file ind's control/test neighborhoods under its index
dict_df_neighbors[ind] = temp_dict_df_neighbors

In [46]:
# recall that ind is the same across df and cf_df
dict_df_neighbors

{0: {'control':     knn_indices  knn_distances  org_index
  0             0       0.000000          0
  1          1247       0.011917       3591
  2          1648       0.023416       4789
  3           887       0.050495       2568
  4          1020       0.055355       2956
  5           811       0.064967       2355
  6          1407       0.117550       4055
  7          1042       0.118574       3019
  8           100       0.119408        279
  9           714       0.120045       2082
  10         1337       0.126756       3838,
  'test':     knn_indices  knn_distances  org_index
  0             0       0.000000          0
  1          2219       0.026652       3404
  2          2222       0.026986       3409
  3           613       0.027757        939
  4           454       0.028766        700
  5          2959       0.030846       4516
  6            66       0.034732        100
  7          2154       0.041160       3283
  8          1439       0.042960       2190
  9      

In [60]:
# 4) We need to start testing each group: do now for ind
# alpha is passed to get_wald_ci below as the significance level of the confidence interval
alpha = 0.05
# NOTE(review): the recorded output of this cell (4990) does not match `ind = 0` set above;
# the cell appears to have been executed with a different `ind` left in the kernel - re-run in order
int(ind)

4990

In [52]:
# for ind ... need to merge with df and cf_df for rest of features:
# control group = ind's control neighbors joined (via org_index) to their factual rows
ctr_group = pd.merge(
    dict_df_neighbors[ind]['control'],
    df[feat_list],
    how='inner',
    left_on='org_index',
    right_index=True,
)

ctr_group

Unnamed: 0,knn_indices,knn_distances,org_index,LoanApproval,AnnualSalary,AccountBalance,Gender
0,0,0.0,0,-1.0,35000,7947.67809,1
1,1247,0.011917,3591,-1.0,35000,7824.38501,1
2,1648,0.023416,4789,-1.0,35000,7705.41598,1
3,887,0.050495,2568,-1.0,34500,7581.699106,1
4,1020,0.055355,2956,-1.0,36500,7844.293517,1
5,811,0.064967,2355,-1.0,35000,8619.817549,1
6,1407,0.11755,4055,-1.0,38000,8225.204699,1
7,1042,0.118574,3019,-1.0,38000,7659.554391,1
8,100,0.119408,279,-1.0,36500,7181.612524,1
9,714,0.120045,2082,-1.0,33500,8720.337032,1


In [53]:
# test group = ind's test neighbors joined (via org_index) to their counterfactual rows
tst_group = pd.merge(
    dict_df_neighbors[ind]['test'],
    cf_df[feat_list],
    how='inner',
    left_on='org_index',
    right_index=True,
)

tst_group

Unnamed: 0,knn_indices,knn_distances,org_index,LoanApproval,AnnualSalary,AccountBalance,Gender
0,0,0.0,0,-1.0,50796.35,13852.05,1
1,2219,0.026652,3404,-1.0,50000.0,13832.88,0
2,2222,0.026986,3409,-1.0,50000.0,13829.55,0
3,613,0.027757,939,-1.0,50000.0,13821.87,0
4,454,0.028766,700,-1.0,50000.0,13811.83,0
5,2959,0.030846,4516,-1.0,50000.0,13791.11,0
6,66,0.034732,100,-1.0,50000.0,13752.41,0
7,2154,0.04116,3283,-1.0,50000.0,13688.39,0
8,1439,0.04296,2190,-1.0,50000.0,13670.47,0
9,2508,0.045118,3852,-1.0,50000.0,14055.13,0


In [54]:
# share of negative (undesired) outcomes in each group.
# The comparison must be made on the target column: comparing the protected attribute
# (values 0/1) against the negative target label (-1) never matches and always gave 0.
trgt_col = feat_trgt[0]  # feat_trgt is a one-element list; index it to get a Series, not a DataFrame
neg_label = feat_trgt_vals['neg']
p1 = ctr_group[ctr_group[trgt_col] == neg_label].shape[0] / ctr_group.shape[0]
p2 = tst_group[tst_group[trgt_col] == neg_label].shape[0] / tst_group.shape[0]
# difference in negative-outcome rates: control (factual) minus test (counterfactual)
diff = p1 - p2
diff

0.0

In [55]:
def get_wald_ci(alpha: float, p1: float, p2: float, k1: int, k2: int):
    """Wald confidence interval for the difference of two proportions, p1 - p2.

    Parameters
    ----------
    alpha : significance level, strictly between 0 and 1 (e.g. 0.05 for a 95% interval).
    p1, p2 : the two observed proportions, each in [0, 1].
    k1, k2 : sizes of the groups p1 and p2 were computed on; must be positive.

    Returns
    -------
    dict with keys
        'd_alpha' : half-width of the interval,
        'CIs'     : [lower, upper] bounds of the interval around p1 - p2,
        'diff'    : the interval bound closest to zero, clipped at 0 when the
                    interval contains zero.

    Raises
    ------
    ValueError if alpha, the proportions or the group sizes are out of range.

    The summary dict is also printed.
    """
    # fail early with a clear message instead of a ZeroDivisionError (empty group)
    # or a silent NaN/inf result (alpha outside (0, 1))
    if not 0 < alpha < 1:
        raise ValueError('alpha must lie strictly between 0 and 1')
    if k1 <= 0 or k2 <= 0:
        raise ValueError('k1 and k2 must be positive group sizes')
    if not (0 <= p1 <= 1 and 0 <= p2 <= 1):
        raise ValueError('p1 and p2 must be proportions in [0, 1]')

    wald_ci_summary = {}

    # two-sided critical value, rounded to two decimals (1.96 for alpha = 0.05)
    z_score = round(st.norm.ppf(1 - (alpha/2)), 2)
    d_alpha = z_score * math.sqrt( (p1*(1 - p1)/k1) + (p2*(1 - p2)/k2) )

    wald_ci_summary['d_alpha'] = d_alpha
    wald_ci_summary['CIs'] = [(p1 - p2) - d_alpha, (p1 - p2) + d_alpha]
    # shrink the observed difference towards zero by the half-width, never crossing zero
    if (p1 - p2) >= 0:
        wald_ci_summary['diff'] = max(0, p1 - p2 - d_alpha)
    else:
        wald_ci_summary['diff'] = min(0, p1 - p2 + d_alpha)

    print(wald_ci_summary)

    return wald_ci_summary

In [56]:
# Wald CI for ind: p1/p2 are the control/test proportions, k1/k2 the number of rows in each group
ind_wald_ci = get_wald_ci(alpha=alpha, p1=p1, p2=p2, k1=ctr_group.shape[0], k2=tst_group.shape[0])

{'d_alpha': 0.0, 'CIs': [0.0, 0.0], 'diff': 0}


In [None]:
# todo: what's the output for the ST: yes/no where?

In [None]:
# show which individual was tested and its Wald CI summary
print(ind)
print(ind_wald_ci)

In [None]:
# I guess I need some sort of threshold here???
# minimum 'diff' required below to flag ind with cfST = 'Yes'
diff_epsilon = 0.05 # tau in the second paper

In [None]:
# flag ind when its 'diff' exceeds the threshold
ind_wald_ci['cfST'] = 'Yes' if ind_wald_ci['diff'] > diff_epsilon else 'No'

#how to return the results? get df with discrimination columns!

In [None]:
# for ind, row in df[df[feat_prot]==feat_prot_vals['prot']].iterrows():
#     print(ind)
    
#     # for storing ind's ctr and tst neighborhoods 
#     temp_dict_df_neighbors = {}

#     # get ctr center from factual df
#     center_ctr = df.loc[ind, feat_rlvt]

#     # get tst center from counterfactual df 
#     center_tst = cf_df.loc[ind, feat_rlvt]

#     # prepare for knn
#     if len(feat_rlvt) > 1:
#         center_ctr = center_ctr.values.reshape(1, -1)
#         center_tst = center_tst.values.reshape(1, -1)
#     else:
#         center_ctr = center_ctr.values.reshape(-1, 1)
#         center_tst = center_tst.values.reshape(-1, 1)
    
#     #--- knn for ctr
#     knn_1 = NearestNeighbors(n_neighbors = n + 1, algorithm='ball_tree', metric = d).fit(search_ctr_group[feat_rlvt])
#     distances_1, indices_1 = knn_1.kneighbors(center_ctr)
    
#     # store ind's ctr neighbors
#     temp_ctr_df = pd.DataFrame()
#     temp_ctr_df['knn_indices'] = pd.Series(indices_1[0])
#     temp_ctr_df['knn_distances'] = pd.Series(distances_1[0])
#     temp_ctr_df.sort_values(by='knn_distances', ascending=True, inplace=True)
#     # keep track of indices wrt to df_ctr
#     temp_ctr_df = temp_ctr_df.merge(df_ctr, how='inner', left_on='knn_indices', right_index=True)
#     # for ctr ind will always belong to the search space: hence, why n_neighbors = n + 1 in knn_1
#     temp_ctr_df = temp_ctr_df[temp_ctr_df['org_index'] != ind].reset_index(drop=True)
#     # insure nrows()== n
#     if temp_ctr_df.shape[0] > n:
#         print(temp_ctr_df.shape)
#         temp_ctr_df.drop(temp_ctr_df.tail(1).index,inplace=True)
#         print(temp_ctr_df.shape)

#     # store
#     temp_dict_df_neighbors['control'] = temp_ctr_df
    
#     # clean up
#     del center_ctr, knn_1, temp_ctr_df, indices_1, distances_1,
    
#     #--- knn for tst
#     knn_2 = NearestNeighbors(n_neighbors = n, algorithm='ball_tree', metric = d).fit(search_tst_group[feat_rlvt])
#     distances_2, indices_2 = knn_2.kneighbors(center_tst)
    
#     # store ind's tst neighbors
#     temp_tst_df = pd.DataFrame()
#     temp_tst_df['knn_indices'] = pd.Series(indices_2[0])
#     temp_tst_df['knn_distances'] = pd.Series(distances_2[0])
#     temp_tst_df.sort_values(by='knn_distances', ascending=True, inplace=True)
#     # keep track of indices wrt to df_tst
#     temp_tst_df = temp_tst_df.merge(df_tst, how='inner', left_on='knn_indices', right_index=True)
    
#     # store
#     temp_dict_df_neighbors['test'] = temp_tst_df
    
#     # clean up
#     del center_tst, knn_2, temp_tst_df, indices_2, distances_2, 
    
#     # store ind's neighborhoods
#     dict_df_neighbors[ind] = temp_dict_df_neighbors
    
#     # clean up
#     del temp_dict_df_neighbors

# print('done')

In [None]:
# standalone BallTree example: 10 random 3-D points, query the 3 nearest neighbors of the first one
from sklearn.neighbors import BallTree
rng = np.random.RandomState(0)
X = rng.random_sample((10, 3))
print(X)
print('---')
print(X[:1])
tree = BallTree(X, leaf_size=2)
# dedicated result names: assigning to `ind` here would overwrite the individual
# index that the per-individual cells above rely on
bt_dist, bt_ind = tree.query(X[:1], k=3)
print(bt_ind)  # indices of 3 closest neighbors
#[0 3 1]
print(bt_dist)  # distances to 3 closest neighbors
#[ 0.          0.19662693  0.29473397]