Working script for cfST

Used to create the functions from_groups(), test_disc(), etc., which are part of run_cfST().

From Salvatore's paper: "it boils down to the Manhattan distance of z-scores"

Under counterfactual situation testing, for the protected group we need to create two groups: a *control group* (CG) centered on the factuals and a *test group* (TG) centered on the counterfactuals. In practice, this means that we take individual women and compare them to other similar women using some distance $d_1$ to construct CG, while we take their corresponding counterfactuals and compare them to similar men using some distance $d_2$ to construct TG. We start off with $d_1=d_2$. We create the groups using a KNN algorithm.

Under this approach, rather than centering both CG and TG on the same (factual) instance, we construct the hypothetical group (*what would have happened had the female individual been male?*) by allowing all variables to adjust to the change in $A$. This is our implementation of what Kohler defined as *fairness given the difference* and what she argues for in her Eddie Murphy paper.

In [1]:
import os
import pandas as pd
import numpy as np
import math
import sys
import matplotlib.pyplot as plt
import scipy.stats as st

from sklearn import preprocessing
from sklearn.neighbors import NearestNeighbors

In [2]:
# set working directory - note: all code runs from the src folder
# NOTE(review): a later cell imports `from src.cf_situation_testing`, which suggests the
# working directory is the parent of src -- confirm which one is intended.
wrk_dir = os.getcwd()
# data path: os.path.join picks the right separator for the current OS; the trailing ''
# keeps a trailing separator so that `data_path + <file name>` keeps working
data_path = os.path.join(wrk_dir, 'data', '')
# results path (same trailing-separator convention)
resu_path = os.path.join(wrk_dir, 'results', 'counterfactuals', '')

In [3]:
# factual data set ('|'-separated file in the data folder)
factual_file = data_path + 'Karimi2020_v2.csv'
df = pd.read_csv(factual_file, sep='|')
print(df.shape)
# counterfactual data set ('|'-separated file in the results folder)
counterfactual_file = resu_path + 'cf_Karimi2020_v2.csv'
cf_df = pd.read_csv(counterfactual_file, sep='|')
print(cf_df.shape)

(4993, 6)
(4993, 6)


In [4]:
cf_df

Unnamed: 0,u_AnnualSalary,u_AccountBalance,AnnualSalary,AccountBalance,LoanApproval,Gender
0,-49858.94,-1452.13,50796.35,13852.05,-1.0,1
1,19344.71,950.86,120000.00,36940.10,1.0,0
2,-10655.29,-3458.07,90000.00,23564.13,-1.0,0
3,-20655.29,3563.38,80000.00,27596.57,-1.0,0
4,116141.06,-8.85,216796.35,64912.94,1.0,1
...,...,...,...,...,...,...
4988,-655.29,395.28,100000.00,30406.49,1.0,0
4989,40141.06,-726.06,140796.35,41479.24,1.0,1
4990,-22858.94,906.36,77796.35,24280.87,-1.0,1
4991,-655.29,580.31,100000.00,30591.52,1.0,0


In [5]:
df

Unnamed: 0,LoanApproval,AnnualSalary,AccountBalance,u1,u2,Gender
0,-1.0,35000,7947.678090,50000,-973.152642,1
1,1.0,120000,36940.097383,120000,940.097383,0
2,-1.0,90000,23564.129008,90000,-3435.870992,0
3,-1.0,80000,27596.570524,80000,3596.570524,0
4,1.0,201000,59008.567839,210000,-705.778380,1
...,...,...,...,...,...,...
4988,1.0,100000,30406.498452,100000,406.498452,0
4989,1.0,125000,35574.858747,140000,-1865.003409,1
4990,-1.0,62000,18376.501668,80000,424.383234,1
4991,1.0,100000,30591.525522,100000,591.525522,0


In [4]:
# perc. of women (Gender == 1) in df
n_women = df[df['Gender'] == 1].shape[0]
n_total = df.shape[0]
round(n_women / n_total * 100, 3)

34.288

In [5]:
# plt.hist(df['LoanApproval'])
# plt.hist(cf_df['LoanApproval']) # number of denied loans drops!

In [6]:
# women with an approved loan, as a percentage of ALL individuals (not only of women):
# factual vs counterfactual data
approved_women = (df['Gender'] == 1) & (df['LoanApproval'] == 1)
print(df[approved_women].shape[0] / df.shape[0] * 100)
print('--- vs ---')
cf_approved_women = (cf_df['Gender'] == 1) & (cf_df['LoanApproval'] == 1)
print(cf_df[cf_approved_women].shape[0] / cf_df.shape[0] * 100)

13.498898457840976
--- vs ---
21.029441217704786


In [10]:
from src.cf_situation_testing import get_k_neighbors, get_wald_ci

# --- data description ---
# target feature and its labels
feat_trgt = 'LoanApproval'
feat_trgt_vals = {'pos': 1, 'neg': -1}
# protected feature and its labels
feat_prot = 'Gender'
feat_prot_vals = {'non_protected': 0, 'protected': 1}
# features used to measure similarity between individuals (alternative: ['AnnualSalary'])
feat_rlvt = ['AnnualSalary', 'AccountBalance']

# --- search / test settings ---
# number of neighbors: determine by power analysis? (future extension)
n = 10
# distance metric name - see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.distance_metrics.html
d = 'manhattan'
# optional per-feature weights, e.g. {'AnnualSalary': 5, 'AccountBalance': 1}
weights = None
# set to True to compare with Salvatore's DD code
standardize = True
# significance level
alpha = 0.05


In [11]:
# Neighbor search via the packaged helper; the "old script(s)" section further down
# walks through what is presumably the same procedure step by step for one individual.
# NOTE(review): `d` and `weights` from the configuration cell are NOT passed here, so
# get_k_neighbors presumably falls back on its own defaults for the distance metric and
# the feature weights -- confirm against src.cf_situation_testing.
dict_df_neighbors = get_k_neighbors(df=df, cf_df=cf_df, 
                                    k=n, 
                                    feat_trgt=feat_trgt, feat_trgt_vals=feat_trgt_vals, 
                                    feat_rlvt=feat_rlvt, 
                                    feat_prot=feat_prot, feat_prot_vals=feat_prot_vals, 
                                    standardize=standardize, 
                                   )

target feature LoanApproval with values {'pos': 1, 'neg': -1}
protected feature Gender with values {'non_protected': 0, 'protected': 1}
with relevant features ['AnnualSalary', 'AccountBalance']
all features: ['LoanApproval', 'AnnualSalary', 'AccountBalance', 'Gender']


In [12]:
# Run the packaged test on the neighborhoods found above at significance level `alpha`.
# NOTE(review): the "old script(s)" section further down re-defines get_wald_ci with a
# different signature (alpha, p1, p2, k1, k2); once that cell has been executed this call
# fails until the function is imported again.
test_disc = get_wald_ci(dict_df_neighbors=dict_df_neighbors,
                       feat_trgt=feat_trgt, feat_trgt_vals=feat_trgt_vals,
                       alpha=alpha,
                       )

using significance level of 5.0%


In [13]:
test_disc

Unnamed: 0,individual,p1,p2,org_diff,d_alpha,diff,CIs,cfST
0,0,1.0,1.0,0.0,0.0,0.0,"[0.0, 0.0]",No
1,4,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0]",No
2,5,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0]",No
3,6,1.0,1.0,0.0,0.0,0.0,"[0.0, 0.0]",No
4,9,1.0,0.0,1.0,0.0,1.0,"[1.0, 1.0]",Yes
...,...,...,...,...,...,...,...,...
1707,4982,1.0,1.0,0.0,0.0,0.0,"[0.0, 0.0]",No
1708,4985,1.0,1.0,0.0,0.0,0.0,"[0.0, 0.0]",No
1709,4987,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0]",No
1710,4989,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0]",No


In [15]:
# rows of the results table that were flagged by the test (cfST == 'Yes')
flagged = test_disc[test_disc['cfST'] == 'Yes']
print(flagged.shape)
flagged

(443, 8)


Unnamed: 0,individual,p1,p2,org_diff,d_alpha,diff,CIs,cfST
4,9,1.000000,0.000000,1.000,0.000000,1.000000,"[1.0, 1.0]",Yes
14,44,1.000000,0.000000,1.000,0.000000,1.000000,"[1.0, 1.0]",Yes
15,47,1.000000,0.000000,1.000,0.000000,1.000000,"[1.0, 1.0]",Yes
17,50,1.000000,0.000000,1.000,0.000000,1.000000,"[1.0, 1.0]",Yes
19,55,0.818182,0.000000,0.818,0.227931,0.590251,"[0.5902508705143656, 1.0461127658492708]",Yes
...,...,...,...,...,...,...,...,...
1691,4918,1.000000,0.181818,0.818,0.227931,0.590251,"[0.5902508705143655, 1.0461127658492708]",Yes
1697,4946,1.000000,0.000000,1.000,0.000000,1.000000,"[1.0, 1.0]",Yes
1700,4958,1.000000,0.000000,1.000,0.000000,1.000000,"[1.0, 1.0]",Yes
1703,4973,1.000000,0.000000,1.000,0.000000,1.000000,"[1.0, 1.0]",Yes


In [18]:
test_disc.loc[test_disc['individual']==502, ]

Unnamed: 0,individual,p1,p2,org_diff,d_alpha,diff,CIs,cfST
176,502,0.727273,0.0,0.727,0.263192,0.464081,"[0.4640807392878268, 0.9904647152576278]",Yes


In [19]:
df.loc[502, ]

LoanApproval          -1.000000
AnnualSalary       96500.000000
AccountBalance     24589.519535
u1                110000.000000
u2                 -2874.941594
Gender                 1.000000
Name: 502, dtype: float64

In [26]:
# do I pick up positive results?
temp_df = test_disc.merge(df[['LoanApproval']], how='inner', left_on='individual', right_index=True)

print(temp_df[(temp_df['cfST'] == 'Yes') & (temp_df['LoanApproval'] == 1.0)].shape) # border cases?
print(temp_df[(temp_df['cfST'] == 'Yes') & (temp_df['LoanApproval'] != 1.0)].shape)

temp_df.head(5)

(49, 9)
(394, 9)


Unnamed: 0,individual,p1,p2,org_diff,d_alpha,diff,CIs,cfST,LoanApproval
0,0,1.0,1.0,0.0,0.0,0.0,"[0.0, 0.0]",No,-1.0
1,4,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0]",No,1.0
2,5,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0]",No,1.0
3,6,1.0,1.0,0.0,0.0,0.0,"[0.0, 0.0]",No,-1.0
4,9,1.0,0.0,1.0,0.0,1.0,"[1.0, 1.0]",Yes,-1.0


old script(s):

In [None]:
# target feature: 'pos' for the desired label, 'neg' for the undesired label
feat_trgt = 'LoanApproval'
feat_trgt_vals = {'pos': 1, 'neg': -1}
# protected feature: labels for the non-protected and protected groups
feat_prot = 'Gender'
feat_prot_vals = {'non_protected': 0, 'protected': 1}
# relevant features used for the neighbor search (alternative: ['AnnualSalary'])
feat_rlvt = ['AnnualSalary', 'AccountBalance']

# future params!
# protected_group = {'Gender': 1}
# number of neighbors: determine by power analysis? (future extension)
n = 10
# distance metric name - see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.distance_metrics.html
d = 'manhattan'

# optional per-feature weights, e.g. {'AnnualSalary': 5, 'AccountBalance': 1}
weights = None
standardize = True


# delete once functions have been tested
# all features of interest: target, then relevant, then protected
feat_list = [feat_trgt, *feat_rlvt, feat_prot]
feat_list

In [None]:
# key: individuals have the same index across df and cf_df
protected_indices     = df[df[feat_prot]==feat_prot_vals['protected']].index.to_list()
non_protected_indices = df[df[feat_prot]==feat_prot_vals['non_protected']].index.to_list() 

In [None]:
# 1) Set up the respective search spaces for the control (ctr) and test (tst) groups
# ctr: relevant features of the factual df
search_ctr_group = df.loc[:, feat_rlvt].copy()
print(search_ctr_group.shape)
# tst: relevant features of the counterfactual df
search_tst_group = cf_df.loc[:, feat_rlvt].copy()
print(search_tst_group.shape)

In [None]:
# if we pre-process, the search_spaces must include the centers | TODO: do we normalize after or before partitioning by A? decide in previous cell
if standardize:
    print('standardizing')

    scaler = preprocessing.StandardScaler()

    # each search space is standardized with its own fit (the scaler is re-fitted for tst);
    # index and column labels are kept so rows can still be looked up by their df labels
    search_ctr_group = pd.DataFrame(
        scaler.fit_transform(search_ctr_group),
        index=search_ctr_group.index,
        columns=search_ctr_group.columns,
    )
    search_tst_group = pd.DataFrame(
        scaler.fit_transform(search_tst_group),
        index=search_tst_group.index,
        columns=search_tst_group.columns,
    )

if weights:
    print('weighting')

    # compare the keys, not just the count: a misspelled or missing feature name is
    # reported here instead of surfacing as a KeyError in the loop below.
    # Raise instead of sys.exit() so the notebook kernel gets a regular traceback.
    if set(weights) != set(feat_rlvt):
        raise ValueError('provide a weight for each relevant feature')

    # NOTE: this rescales the search spaces in place, so re-running the cell applies the
    # weights a second time -- re-run the search-space cell first.
    for feat_weight, feat_weight_val in weights.items():
        print(feat_weight)
        search_ctr_group[feat_weight] = feat_weight_val * search_ctr_group[feat_weight]
        search_tst_group[feat_weight] = feat_weight_val * search_tst_group[feat_weight]

In [None]:
# protected_indices holds index LABELS of df, so select with .loc (label-based);
# .iloc (position-based) is only equivalent while df carries a default 0..n-1 index
centers_ctr = search_ctr_group.loc[protected_indices].copy() # search_ctr_group will always include the ctr center

# the ctr search space is restricted to the protected individuals; their original labels
# are kept in 'org_index' because the knn step returns positions within this reduced frame
# NOTE: this overwrites search_ctr_group, so the cell cannot be re-run on its own --
# re-run from the search-space cell
search_ctr_group = (
    search_ctr_group.loc[protected_indices]
    .copy()
    .reset_index()
    .rename(columns={'index': 'org_index'})
)

print(search_ctr_group.shape)
search_ctr_group.head(5)

In [None]:
centers_ctr.head(5)

In [None]:
# counterfactual centers of the protected individuals; protected_indices holds index
# LABELS, so select with .loc (.iloc is only equivalent for a default 0..n-1 index)
centers_tst = search_tst_group.loc[protected_indices].copy()

# define the search space within the loop (unfortunately...)
# search_tst_group = search_tst_group.iloc[non_protected_indices].copy()
# search_tst_group.reset_index(inplace=True, )
# search_tst_group.rename(columns={'index': 'org_index'}, inplace=True)

# print(search_tst_group.shape)
# search_tst_group.head(5)

In [None]:
centers_tst.head(5)

In [None]:
# start the main function here...

# store neighbors here
dict_df_neighbors = {}

In [None]:
# 2) For each ind(ividual) set the centers

ind = 0 # start loop: for ind in protected_indices:

# for storing the neighbors
temp_dict_df_neighbors = {}

# ctr center: row of ind among the factual centers
ind_center_ctr = centers_ctr.loc[ind]
print(ind_center_ctr)

# tst center: row of ind among the counterfactual centers
ind_center_tst = centers_tst.loc[ind]
print(ind_center_tst)

# prepare for knn: kneighbors expects a 2-D array holding a single query row.
# reshape(1, -1) gives shape (1, n_features) for any number of relevant features
# (for a single feature this is the same (1, 1) array that reshape(-1, 1) produces)
ind_center_ctr = ind_center_ctr.values.reshape(1, -1)
ind_center_tst = ind_center_tst.values.reshape(1, -1)

print(ind_center_ctr)
print(ind_center_tst)

In [None]:
# 3) Control Group for ind

# NOTE: by default, the control group search space will include ind, which will appear as a neighbor (distance will be 0.0)
knn_1 = NearestNeighbors(n_neighbors = n + 1, algorithm='ball_tree', metric = d).fit(search_ctr_group[feat_rlvt])
knn_1

In [None]:
distances_1, indices_1 = knn_1.kneighbors(ind_center_ctr)

In [None]:
# one row per returned neighbor: its position in the ctr search space and its distance to the center
temp_ctr_df = pd.DataFrame({
    'knn_indices': pd.Series(indices_1[0]),
    'knn_distances': pd.Series(distances_1[0]),
})
temp_ctr_df = temp_ctr_df.sort_values(by='knn_distances', ascending=True)

# HERE we can drop neighbors based on the distance!
temp_ctr_df

In [None]:
# RECALL: knn_indices are positions within the search space, not labels of the original input;
# look up each neighbor's original label through the search space's org_index column
# todo: we can, e.g., test on the features of interest as well for ST!
temp_ctr_df = pd.merge(
    temp_ctr_df,
    search_ctr_group[['org_index']],
    how='inner',
    left_on='knn_indices',
    right_index=True,
)

temp_ctr_df


In [None]:
# let's not drop it for now...
# drop ind for ctr! org index is the one we are using to loop over the search space!!! | TODO should I keep the center?
# temp_ctr_df = temp_ctr_df[temp_ctr_df['org_index'] != ind].reset_index(drop=True)

# temp_ctr_df

In [None]:
# if temp_ctr_df.shape[0] > n:
#     print(temp_ctr_df.shape)
#     temp_ctr_df.drop(temp_ctr_df.tail(1).index,inplace=True)
#     print(temp_ctr_df.shape)

In [None]:
# store it
temp_dict_df_neighbors['control'] = temp_ctr_df

In [None]:
# clean up
del ind_center_ctr, knn_1, temp_ctr_df, indices_1, distances_1, 

In [None]:
# 3) Test Group for ind

# search space for ind: its own counterfactual row plus all non-protected individuals.
# ind and non_protected_indices are index LABELS, so select with .loc (.iloc is only
# equivalent for a default 0..n-1 index); original labels are kept in 'org_index'
temp_search_tst_group = (
    search_tst_group.loc[[ind] + non_protected_indices]
    .copy()
    .reset_index()
    .rename(columns={'index': 'org_index'})
)

temp_search_tst_group.head(5)

In [None]:
knn_2 = NearestNeighbors(n_neighbors = n + 1, algorithm='ball_tree', metric = d).fit(temp_search_tst_group[feat_rlvt])
knn_2

In [None]:
distances_2, indices_2 = knn_2.kneighbors(ind_center_tst)

In [None]:
# one row per returned neighbor: its position in the tst search space and its distance to the center
temp_tst_df = pd.DataFrame({
    'knn_indices': pd.Series(indices_2[0]),
    'knn_distances': pd.Series(distances_2[0]),
})
temp_tst_df = temp_tst_df.sort_values(by='knn_distances', ascending=True)

# HERE we can drop neighbors based on the distance!
temp_tst_df

In [None]:
# RECALL: knn_indices are positions within the search space, not labels of the original input;
# we can use them to look up each neighbor's original label (org_index) and, from there,
# the rest of the information
temp_tst_df = pd.merge(
    temp_tst_df,
    temp_search_tst_group[['org_index']],
    how='inner',
    left_on='knn_indices',
    right_index=True,
)

temp_tst_df

In [None]:
# store it
temp_dict_df_neighbors['test'] = temp_tst_df

In [None]:
# clean up
del ind_center_tst, knn_2, temp_tst_df, indices_2, distances_2, temp_search_tst_group

In [None]:
temp_dict_df_neighbors

In [None]:
#... later on
dict_df_neighbors[ind] = temp_dict_df_neighbors

In [None]:
#recall that ind is the same across df and cf_df
dict_df_neighbors

In [None]:
# 4) We need to start testing each group: do now for ind


ind = 0
int(ind)

In [None]:

# control group of ind: its stored neighbors ...
ctr_group = dict_df_neighbors[ind]['control']
# ... enriched with the features of each neighbor from the factual df. Without this merge
# the frame only holds knn_indices / knn_distances / org_index, and the proportion of
# negative outcomes computed below (ctr_group[feat_trgt]) raises a KeyError.
ctr_group = ctr_group.merge(df[feat_list], how='inner', left_on='org_index', right_index=True)

ctr_group

In [None]:
# test group of ind: its stored neighbors ...
tst_group = dict_df_neighbors[ind]['test']
# ... enriched with the features of each neighbor. The test search space was built from
# cf_df, so the features are taken from cf_df as well. Without this merge
# tst_group[feat_trgt] below raises a KeyError.
tst_group = tst_group.merge(cf_df[feat_list], how='inner', left_on='org_index', right_index=True)

tst_group

In [None]:
k1=ctr_group.shape[0]
print(k1)
k2=tst_group.shape[0]
print(k2)

In [None]:
ctr_group[ctr_group[feat_trgt]==feat_trgt_vals['neg']].shape[0]

In [None]:
# share of neighbors with the undesired ('neg') outcome in the control (p1) and test (p2) groups
neg_label = feat_trgt_vals['neg']

ctr_neg = ctr_group[ctr_group[feat_trgt] == neg_label]
p1 = len(ctr_neg) / len(ctr_group)
print(p1)

tst_neg = tst_group[tst_group[feat_trgt] == neg_label]
p2 = len(tst_neg) / len(tst_group)
print(p2)

# positive when negative outcomes are more frequent in the control group
diff = p1 - p2
diff

In [None]:
def get_wald_ci(alpha: float, p1: float, p2: float, k1: int, k2: int):
    """Wald confidence interval for the difference of two proportions, p1 - p2.

    NOTE: this definition shadows the get_wald_ci imported from
    src.cf_situation_testing earlier in the notebook, which is called with
    different arguments.

    Parameters
    ----------
    alpha : significance level of the two-sided interval (e.g. 0.05)
    p1, p2 : the two proportions being compared
    k1, k2 : number of observations behind p1 and p2 respectively

    Returns
    -------
    dict with the keys
        'd_alpha' : half-width of the interval,
        'CIs'     : [lower, upper] bounds around p1 - p2 (not clipped to [-1, 1]),
        'diff'    : p1 - p2 shrunk towards zero by d_alpha, and set to 0 if the
                    shrinkage would flip its sign.
    The dict is also printed.
    """
    # two-sided critical value of the standard normal, rounded to 2 decimals (1.96 for alpha = 0.05)
    critical_value = round(st.norm.ppf(1 - (alpha/2)), 2)
    std_err = math.sqrt( (p1*(1 - p1)/k1) + (p2*(1 - p2)/k2) )
    margin = critical_value * std_err

    gap = p1 - p2
    if gap >= 0:
        shrunk_gap = max(0, gap - margin)
    else:
        shrunk_gap = min(0, gap + margin)

    wald_ci_summary = {
        'd_alpha': margin,
        'CIs': [gap - margin, gap + margin],
        'diff': shrunk_gap,
    }

    print(wald_ci_summary)

    return wald_ci_summary

In [None]:
ind_wald_ci = get_wald_ci(alpha=alpha, p1=p1, p2=p2, k1=ctr_group.shape[0], k2=tst_group.shape[0])

In [None]:
# todo: what's the output for the ST: yes/no where?

In [None]:
# Build the results table from a list of records (one dict per tested individual).
# The pasted template referenced an undefined `game` object and re-used the name `d`,
# which overwrote the distance metric used by the knn cells; both are replaced here by
# the notebook's own values and a dedicated variable name.
res_records = [
    {
        'individual': ind,
        'p1': p1,
        'p2': p2,
        **ind_wald_ci,  # d_alpha, CIs, diff
    }
]

pd.DataFrame(res_records)

In [None]:
print(ind)
print(ind_wald_ci)

In [None]:
# i guess i need some sort of threshold here???
diff_epsilon = 0.05 # tau in the second paper

In [None]:
# flag ind when the shrunk difference exceeds the threshold diff_epsilon
ind_wald_ci['cfST'] = 'Yes' if ind_wald_ci['diff'] > diff_epsilon else 'No'

#how to return the results? get df with discrimination columns!

In [None]:
# stand-alone BallTree example on random toy data (independent of df / cf_df)
from sklearn.neighbors import BallTree
rng = np.random.RandomState(0)
X = rng.random_sample((10, 3))
print(X)
print('---')
print(X[:1])
tree = BallTree(X, leaf_size=2)
# dedicated names for the query result: unpacking into `ind` would overwrite the
# individual index used by the cells above
toy_dist, toy_ind = tree.query(X[:1], k=3)
print(toy_ind)  # indices of 3 closest neighbors
#[0 3 1]
print(toy_dist)  # distances to 3 closest neighbors
#[ 0.          0.19662693  0.29473397]