In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# General plot settings.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases dropped the old names, so use whichever name this
# installation provides instead of hard-coding the legacy one.
_grid_style = 'seaborn-v0_8-whitegrid'
if _grid_style not in plt.style.available:
    _grid_style = 'seaborn-whitegrid'
plt.style.use(_grid_style)
plt.rc('font', size=11)
plt.rc('legend', fontsize=11)
plt.rc('lines', linewidth=2)
plt.rc('axes', linewidth=2)
plt.rc('axes', edgecolor='k')
plt.rc('xtick.major', width=2)
plt.rc('xtick.major', size=6)
plt.rc('ytick.major', width=2)
plt.rc('ytick.major', size=6)
# fonttype 42 = embed TrueType fonts in the exported pdf/ps figures
plt.rc('pdf', fonttype=42)
plt.rc('ps', fonttype=42)

# Path to the st modules, resolved relative to the working directory so the
# notebook is not tied to one user's machine. The data/results paths of this
# notebook are built from os.getcwd() as well, i.e. the notebook is expected
# to be started from the project root.
module_path = os.path.join(os.getcwd(), 'src')
# module_path = os.path.abspath(os.path.join('../src')) # or the path to your source code
# skip the insert when the cell is re-run so sys.path does not collect duplicates
if module_path not in sys.path:
    sys.path.insert(0, module_path)

# local files
from src.situation_testing.situation_testing import SituationTesting

In [2]:
# change data path accordingly
# Built with os.path.join so the separators are correct on every OS; the empty
# last component keeps the trailing separator that plain string concatenation
# (data_path + '<file name>') relies on.
data_path = os.path.join(os.getcwd(), 'data', '')
resu_path = os.path.join(os.getcwd(), 'results', '')

In [3]:
# factual dataset ('|'-separated file)
df    = pd.read_csv(os.path.join(data_path, 'Karimi2020_v2.csv'), sep='|')
# counterfactual dataset; the sub-folder is joined with os.path.join instead of
# a literal '\\' so the path also resolves outside Windows
cf_df = pd.read_csv(os.path.join(resu_path, 'counterfactuals', 'cf_Karimi2020_v2.csv'), sep='|')

In [None]:
# # distributions for paper: X1
# b = 100
# plt.hist(df[df['Gender']==1]['AnnualSalary'], bins = b, alpha=0.9, label=r'$X_1^F$')
# plt.hist(cf_df[cf_df['Gender']==1]['AnnualSalary'], bins = b, alpha=0.7, label=r'$X_1^{CF}$')
# plt.legend(loc='upper right')

# #plt.title('Annual salary for females')
# plt.ylabel('Frequency')
# plt.xlabel(r'Annual salary ($X_1$) for females')

In [None]:
# # distributions for paper: X2
# b = 100
# plt.hist(df[df['Gender']==1]['AccountBalance'], bins=b, alpha=0.9, label=r'$X_2^F$')
# plt.hist(cf_df[cf_df['Gender']==1]['AccountBalance'], bins=b, alpha=0.7, label=r'$X_2^{CF}$')
# plt.legend(loc='upper right')

# #plt.title(r'Account balance for females')
# plt.ylabel('Frequency')
# plt.xlabel(r'Account balance ($X_2$) for females')

In [85]:
# Two independent copies of the factual data:
#   org_df - untouched original, kept for re-running
#   res_df - the dataset in question (later cells add result columns to it)
org_df, res_df = df.copy(), df.copy()

In [86]:
# --- attribute-specific parameters ---
# target feature and the values marking its positive / negative outcome
feat_trgt = 'LoanApproval'
feat_trgt_vals = dict(positive=1, negative=-1)
# relevant features
feat_rlvt = ['AnnualSalary', 'AccountBalance']
# protected feature and its values: keep the 'non_protected' / 'protected' keys
feat_prot = 'Gender'
feat_prot_vals = dict(non_protected=0, protected=1)

# --- st-specific parameters ---
# n: size of the neighborhoods | alpha: significance level | tau: tau deviation
n, alpha, tau = 15, 0.05, 0.0

### counterfactual fairness (also included in ST class)

In [87]:
# Counterfactual fairness results for the paper, restricted to the protected
# group. Column names and outcome values come from the parameter cell
# (feat_prot, feat_prot_vals, feat_trgt, feat_trgt_vals) instead of being
# repeated here as literals.
prot_cond = df[feat_prot] == feat_prot_vals['protected']
prot_idx = df.index[prot_cond]

# factual outcome and counterfactual outcome of the same protected individuals;
# both frames are looked up by the same index labels
f_y = df.loc[prot_idx, feat_trgt]
scf_y = cf_df.loc[prot_idx, feat_trgt]

# strict CF: the outcome differs between the factual and the counterfactual
strict_cf = f_y.ne(scf_y)
# disc specific CF: rejected as a woman, accepted as a man
disc_cf = f_y.eq(feat_trgt_vals['negative']) & scf_y.eq(feat_trgt_vals['positive'])

# 0.0 / 1.0 flags over the whole of df; rows outside the protected group stay 0.0
res_cf_1 = strict_cf.reindex(df.index, fill_value=False).astype(float)
res_cf_2 = disc_cf.reindex(df.index, fill_value=False).astype(float)

print(res_cf_1.sum())
print(res_cf_2.sum())

# share (in %) of the protected group flagged by the disc specific definition
print(res_cf_2.sum() / len(prot_idx) * 100)

376.0
376.0
21.962616822429908


### standard ST

In [88]:
# Standard situation testing, run on a copy of the factual data.
# Feature names and value maps are taken from the parameter cell (feat_*)
# rather than repeated as literals; fresh list/dict copies are passed so the
# shared parameter objects are never handed to the ST object directly.
test_df = df.copy()

st = SituationTesting()
st.setup_baseline(test_df, nominal_atts=[feat_prot], continuous_atts=list(feat_rlvt))

res_df['ST'] = st.run(target_att=feat_trgt, target_val=dict(feat_trgt_vals),
                      sensitive_att=feat_prot, sensitive_val=dict(feat_prot_vals),
                      k=n, alpha=alpha, tau=tau)

# number of individuals whose ST value exceeds tau
print(res_df[res_df['ST'] > tau].shape[0])

standardizing factual dataset
55


### counterfactual ST

In [113]:
# Counterfactual situation testing on copies of the factual and counterfactual
# data. Feature names and value maps are taken from the parameter cell (feat_*)
# rather than repeated as literals; fresh list/dict copies are passed so the
# shared parameter objects are never handed to the ST object directly.
# NOTE(review): the include_centers=True cell further down assigns the same
# `cf_st` name and `res_df['cfST']` column, so on a top-to-bottom run its
# results replace the ones produced here.
test_df = df.copy()
test_cfdf = cf_df.copy()

# don't include the centers
cf_st = SituationTesting()
cf_st.setup_baseline(test_df, test_cfdf, nominal_atts=[feat_prot], continuous_atts=list(feat_rlvt))

res_df['cfST'] = cf_st.run(target_att=feat_trgt, target_val=dict(feat_trgt_vals),
                           sensitive_att=feat_prot, sensitive_val=dict(feat_prot_vals),
                           include_centers=False,
                           k=n, alpha=alpha, tau=tau)

# number of individuals whose cfST value exceeds tau
print(res_df[res_df['cfST'] > tau].shape[0])

standardizing factual dataset
standardizing counterfactual dataset
288


In [115]:
# Counterfactual unfairness is also available straight from the cfST object:
# summing it gives 376.0 in the recorded output, the same count as the manual
# counterfactual-fairness loop earlier in the notebook.
sum(cf_st.res_counterfactual_unfairness)

376.0

In [90]:
# Counterfactual situation testing again, this time including the centers.
# Feature names and value maps are taken from the parameter cell (feat_*)
# rather than repeated as literals; fresh list/dict copies are passed so the
# shared parameter objects are never handed to the ST object directly.
# NOTE(review): this reassigns `cf_st` and overwrites `res_df['cfST']` from the
# include_centers=False run above.
test_df = df.copy()
test_cfdf = cf_df.copy()

# include the centers
cf_st = SituationTesting()
cf_st.setup_baseline(test_df, test_cfdf, nominal_atts=[feat_prot], continuous_atts=list(feat_rlvt))

res_df['cfST'] = cf_st.run(target_att=feat_trgt, target_val=dict(feat_trgt_vals),
                           sensitive_att=feat_prot, sensitive_val=dict(feat_prot_vals),
                           include_centers=True,
                           k=n, alpha=alpha, tau=tau)

# number of individuals whose cfST value exceeds tau
print(res_df[res_df['cfST'] > tau].shape[0])

standardizing factual dataset
standardizing counterfactual dataset
420


In [91]:
# Per-individual results table of the cfST run held in `cf_st`; the following
# cells preview it and filter it on its 'individual' and 'cfST' columns.
test_disc = cf_st.get_test_discrimination()

In [92]:
# preview the first five rows (head() defaults to n=5)
test_disc.head()

Unnamed: 0,individual,p1,p2,org_diff,d_alpha,diff,CIs,cfST
0,0,1.0,1.0,0.0,0.0,0.0,"[0.0, 0.0]",No
1,4,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0]",No
2,5,0.0625,0.0,0.062,0.11861,0.0,"[-0.056, 0.181]",Yes
3,6,1.0,1.0,0.0,0.0,0.0,"[0.0, 0.0]",No
4,9,1.0,0.875,0.125,0.162052,0.0,"[-0.037, 0.287]",Yes


In [96]:
# Hard-coded reference list of individual ids; it is compared below against
# new_list (the individuals whose cfST flag is 'Yes' in test_disc).
# NOTE(review): the notebook does not show where these ids come from -
# presumably the individuals flagged by an earlier cfST run; confirm.
org_list = [5,
 9,
 44,
 47,
 50,
 55,
 89,
 94,
 121,
 134,
 141,
 147,
 150,
 160,
 169,
 177,
 181,
 187,
 203,
 207,
 209,
 214,
 218,
 232,
 233,
 242,
 245,
 253,
 262,
 264,
 299,
 302,
 315,
 330,
 332,
 336,
 344,
 374,
 389,
 390,
 412,
 419,
 420,
 435,
 441,
 465,
 484,
 502,
 508,
 513,
 522,
 549,
 558,
 584,
 586,
 593,
 596,
 598,
 604,
 620,
 644,
 674,
 677,
 697,
 719,
 725,
 750,
 768,
 786,
 788,
 798,
 804,
 814,
 845,
 881,
 886,
 893,
 895,
 904,
 905,
 907,
 916,
 923,
 926,
 929,
 959,
 967,
 983,
 991,
 1037,
 1038,
 1044,
 1053,
 1054,
 1096,
 1100,
 1101,
 1114,
 1149,
 1190,
 1201,
 1211,
 1214,
 1217,
 1232,
 1245,
 1257,
 1280,
 1306,
 1313,
 1316,
 1318,
 1337,
 1343,
 1353,
 1354,
 1364,
 1368,
 1373,
 1382,
 1385,
 1388,
 1389,
 1394,
 1399,
 1406,
 1419,
 1428,
 1442,
 1452,
 1460,
 1485,
 1492,
 1495,
 1499,
 1507,
 1536,
 1557,
 1573,
 1604,
 1607,
 1643,
 1649,
 1650,
 1700,
 1708,
 1713,
 1717,
 1720,
 1730,
 1782,
 1785,
 1798,
 1808,
 1813,
 1829,
 1840,
 1862,
 1868,
 1895,
 1905,
 1912,
 1913,
 1915,
 1941,
 1958,
 1988,
 1999,
 2006,
 2023,
 2026,
 2038,
 2048,
 2051,
 2052,
 2056,
 2064,
 2068,
 2083,
 2095,
 2100,
 2105,
 2111,
 2113,
 2119,
 2120,
 2125,
 2144,
 2157,
 2167,
 2169,
 2171,
 2230,
 2243,
 2258,
 2259,
 2285,
 2293,
 2295,
 2297,
 2307,
 2315,
 2321,
 2323,
 2327,
 2332,
 2338,
 2349,
 2361,
 2362,
 2419,
 2420,
 2422,
 2428,
 2446,
 2453,
 2464,
 2466,
 2477,
 2482,
 2512,
 2532,
 2544,
 2546,
 2565,
 2592,
 2621,
 2625,
 2627,
 2628,
 2646,
 2651,
 2658,
 2665,
 2679,
 2680,
 2732,
 2747,
 2751,
 2752,
 2759,
 2769,
 2787,
 2789,
 2807,
 2829,
 2839,
 2848,
 2865,
 2867,
 2880,
 2883,
 2894,
 2926,
 2929,
 2945,
 2949,
 2985,
 2991,
 2993,
 2996,
 3012,
 3026,
 3030,
 3039,
 3049,
 3065,
 3070,
 3071,
 3075,
 3105,
 3134,
 3154,
 3164,
 3166,
 3176,
 3202,
 3206,
 3217,
 3223,
 3230,
 3239,
 3247,
 3261,
 3267,
 3274,
 3281,
 3282,
 3297,
 3308,
 3312,
 3314,
 3323,
 3328,
 3340,
 3348,
 3352,
 3357,
 3374,
 3393,
 3397,
 3401,
 3442,
 3459,
 3463,
 3482,
 3513,
 3517,
 3523,
 3530,
 3531,
 3532,
 3534,
 3551,
 3552,
 3571,
 3602,
 3628,
 3633,
 3649,
 3653,
 3656,
 3675,
 3680,
 3688,
 3691,
 3693,
 3701,
 3703,
 3710,
 3711,
 3719,
 3747,
 3797,
 3802,
 3803,
 3808,
 3812,
 3816,
 3839,
 3842,
 3849,
 3857,
 3867,
 3871,
 3881,
 3902,
 3920,
 3923,
 3948,
 3951,
 3956,
 3958,
 3960,
 3966,
 3979,
 3984,
 3992,
 4059,
 4065,
 4072,
 4074,
 4107,
 4125,
 4127,
 4149,
 4157,
 4177,
 4186,
 4211,
 4216,
 4231,
 4234,
 4237,
 4242,
 4245,
 4259,
 4262,
 4265,
 4268,
 4288,
 4293,
 4297,
 4298,
 4312,
 4315,
 4334,
 4338,
 4346,
 4352,
 4390,
 4391,
 4421,
 4436,
 4471,
 4472,
 4482,
 4484,
 4489,
 4503,
 4504,
 4524,
 4530,
 4541,
 4565,
 4569,
 4571,
 4574,
 4578,
 4586,
 4607,
 4613,
 4619,
 4631,
 4714,
 4717,
 4747,
 4749,
 4758,
 4770,
 4779,
 4781,
 4784,
 4790,
 4795,
 4796,
 4799,
 4801,
 4804,
 4856,
 4862,
 4880,
 4898,
 4904,
 4907,
 4918,
 4946,
 4958,
 4973,
 4978]

In [95]:
# ids of the individuals that the cfST results table marks with 'Yes'
new_list = test_disc.loc[test_disc['cfST'] == 'Yes', 'individual'].tolist()

In [99]:
# ids present in org_list but not among the currently flagged individuals
set(org_list).difference(new_list)

{412,
 644,
 904,
 1280,
 1700,
 2628,
 2829,
 2945,
 2985,
 3374,
 3393,
 3675,
 3703,
 3710,
 3803,
 3867,
 3881,
 3902,
 4074,
 4288}

In [100]:
# factual record of individual 412 (one of the ids missing from new_list)
df.loc[412]

LoanApproval         -1.000000
AnnualSalary      71000.000000
AccountBalance    21461.017917
u1                80000.000000
u2                  769.435916
Gender                1.000000
Name: 412, dtype: float64

In [101]:
# counterfactual record of the same individual (412)
cf_df.loc[412]

u_AnnualSalary     -13858.94
u_AccountBalance     1300.76
AnnualSalary        86796.35
AccountBalance      27365.39
LoanApproval           -1.00
Gender                  1.00
Name: 412, dtype: float64

In [102]:
# cfST results row of individual 412
test_disc.loc[test_disc['individual'].eq(412)]

Unnamed: 0,individual,p1,p2,org_diff,d_alpha,diff,CIs,cfST
147,412,1.0,1.0,0.0,0.0,0.0,"[0.0, 0.0]",No


In [104]:
# Neighborhoods stored for individual 412: a dict with 'ctr_idx' and 'tst_idx' lists.
# NOTE(review): this reads from `st` (the standard ST run), whereas test_disc
# was produced by `cf_st` - confirm that inspecting `st` is intended here.
st.res_dict_df_neighbors[412]


{'ctr_idx': [4074,
  3703,
  2945,
  1280,
  3374,
  907,
  3710,
  1389,
  1972,
  3902,
  3871,
  3292,
  1499,
  2058,
  859],
 'tst_idx': [621,
  4342,
  1668,
  1021,
  2739,
  1571,
  1276,
  4182,
  3860,
  2770,
  1040,
  4647,
  4331,
  686,
  1755]}

In [111]:
# Factual rows of the 'ctr_idx' neighbors of individual 412.
# NOTE(review): .iloc selects by position; this presumably relies on df keeping
# its default 0..n-1 index so that positions and labels coincide - confirm.
df.iloc[st.res_dict_df_neighbors[412]['ctr_idx'], ]

Unnamed: 0,LoanApproval,AnnualSalary,AccountBalance,u1,u2,Gender
4074,-1.0,71000,21176.63287,80000,-8.505728,1
3703,-1.0,70500,21320.276105,90000,821.306,1
2945,-1.0,70500,21632.001342,90000,2367.035118,1
1280,-1.0,70500,21221.643013,90000,556.051973,1
3374,-1.0,72000,21157.677267,90000,1139.079865,1
907,-1.0,72500,21634.570627,80000,2008.676601,1
3710,-1.0,69500,21255.622207,80000,1429.011067,1
1389,-1.0,70500,22015.314732,90000,1237.217433,1
1972,-1.0,71000,20738.12554,80000,92.158835,1
3902,-1.0,72000,21050.840741,90000,579.753025,1


In [112]:
# Factual rows of the 'tst_idx' neighbors of individual 412.
# NOTE(review): .iloc selects by position; this presumably relies on df keeping
# its default 0..n-1 index so that positions and labels coincide - confirm.
df.iloc[st.res_dict_df_neighbors[412]['tst_idx'], ]

Unnamed: 0,LoanApproval,AnnualSalary,AccountBalance,u1,u2,Gender
621,-1.0,70000,21479.744338,70000,479.744338,0
4342,-1.0,70000,21421.226891,70000,421.226891,0
1668,-1.0,70000,21390.506595,70000,390.506595,0
1021,-1.0,70000,21361.152358,70000,361.152358,0
2739,-1.0,70000,21564.042566,70000,564.042566,0
1571,-1.0,70000,21581.691804,70000,581.691804,0
1276,-1.0,70000,21337.713489,70000,337.713489,0
4182,-1.0,70000,21595.073594,70000,595.073594,0
3860,-1.0,70000,21322.622589,70000,322.622589,0
2770,-1.0,70000,21603.733904,70000,603.733904,0
