# Generate counterfactuals for Karimi_v2

Input: factual data (data folder)
Output: counterfactual data (results folder)

In [1]:
import os
import pandas as pd
import numpy as np

from pandas import DataFrame
from sklearn.linear_model import LinearRegression

In [2]:
# Set working directory - note: all code runs from the src folder.
wrk_dir = os.getcwd()
# Build the folders with os.path.join so the notebook also runs on non-Windows
# systems (the hard-coded '\\' separator only resolves on Windows).
# The trailing '' keeps a trailing path separator on both paths, as before.
# data path
data_path = os.path.join(wrk_dir, 'data', '')
# results path
resu_path = os.path.join(wrk_dir, 'results', 'counterfactuals', '')

In [3]:
# Load the factual data ('|'-separated). os.path.join picks the right separator for
# the platform and copes with data_path already ending in a separator.
org_df = pd.read_csv(os.path.join(data_path, 'Karimi2020_v2.csv'), sep='|')
org_df.head(5)

Unnamed: 0,LoanApproval,AnnualSalary,AccountBalance,u1,u2,Gender
0,-1.0,35000,7947.67809,50000,-973.152642,1
1,1.0,120000,36940.097383,120000,940.097383,0
2,-1.0,90000,23564.129008,90000,-3435.870992,0
3,-1.0,80000,27596.570524,80000,3596.570524,0
4,1.0,201000,59008.567839,210000,-705.77838,1


In [4]:
# Column roles used throughout the notebook.
feat_prot = ['Gender']                          # protected attribute
feat_rlvt = ['AnnualSalary', 'AccountBalance']  # relevant (non-protected) features
feat_trgt = ['LoanApproval']                    # decision / target
# Intervention on the protected attribute: set Gender to 0 (female to male).
do = {'Gender': 0}

In [5]:
# Keep only target, relevant and protected columns (in that order); work on a copy
# so org_df stays untouched.
keep_cols = [*feat_trgt, *feat_rlvt, *feat_prot]
df = org_df.loc[:, keep_cols].copy()
df.head(5)

Unnamed: 0,LoanApproval,AnnualSalary,AccountBalance,Gender
0,-1.0,35000,7947.67809,1
1,1.0,120000,36940.097383,0
2,-1.0,90000,23564.129008,0
3,-1.0,80000,27596.570524,0
4,1.0,201000,59008.567839,1


Similar to the work on Algorithmic Recourse, we make a slight distinction between $\mathbf{X}$ and $Y$ regarding the data generating model (DGM): the occurrence of $\mathbf{X}$ is independent of $Y$, as the latter is specific to the decision-making process of interest while the former is specific to the individuals involved in said process. For example, in college admissions the high-school grades are what they are for each individual regardless of what each college says. An individual can be accepted by one college while getting rejected by a similar one. This distinction is made because $b$, the decision-maker, is external to the individual characteristics and yet crucial for testing discrimination. We can thus assume to know $b$ or try to approximate it based on the data. Here, let's assume we have access to $b$, though note that we can retrieve it from the data with some margin of error. This assumption is not too stringent for our cfST as we are mostly interested in operationalizing the **fairness given the (observed) difference** principle.


*Setup*: we have a given dataset, $\mathcal{D}$, from the decision-making process of interest and its corresponding structural causal model (SCM), $\mathcal{M}$, which explains the data generating process behind it. We also have access to, or can reconstruct, $b$ using expert knowledge.

*Assumptions*: besides knowing $\mathcal{M}$ (we can relax this assumption by saying we only know the causal graph), $\mathcal{D}$, and $b$ (though this assumption too can be relaxed by estimating it) we also assume
- Causal sufficiency, or no hidden confounder in $\mathcal{M}$;
- Each $f$ in $\mathcal{M}$ follows a linear additive noise model;
- 

Under these assumptions, the best-known (parametric) modelling choice is ordinary least squares (OLS), as it is the best linear unbiased estimator (BLUE). This poses an issue when the errors are not white noise (future extension).

In [6]:
# Empty container that will collect the counterfactual columns.
cf_df = {}

In [7]:
# First, estimate each f in M where needed according to the known causal graph:

# 1.1) create model objects
# NOTE: the `normalize` argument was deprecated in scikit-learn 1.0 and removed in 1.2,
# so passing it raises a TypeError on current versions. Its old default was False,
# so dropping it leaves the fitted model unchanged.
# f for AnnualSalary
model_sal = LinearRegression(fit_intercept=True)
# f for AccountBalance
model_acc = LinearRegression(fit_intercept=True)

In [8]:
# 1.2) prepare data for the models (plain NumPy copies of the relevant columns)

# AnnualSalary is regressed on Gender: a single-column 2-D design matrix
x_sal = df[['Gender']].to_numpy(copy=True)
y_sal = df['AnnualSalary'].to_numpy(copy=True)

# AccountBalance is regressed on AnnualSalary and Gender (column order matters later)
x_acc = df[['AnnualSalary', 'Gender']].to_numpy(copy=True)
y_acc = df['AccountBalance'].to_numpy(copy=True)

In [9]:
# 1.3) fit the two structural equations (the fits are independent of each other)
model_acc.fit(x_acc, y_acc)  # AccountBalance ~ AnnualSalary + Gender
model_sal.fit(x_sal, y_sal)  # AnnualSalary ~ Gender
print('done')

done


In [10]:
# Quick look at the AnnualSalary residuals (observed minus fitted), to 2 decimals.
(df['AnnualSalary'] - model_sal.predict(x_sal)).round(2)

0       -49858.94
1        19344.71
2       -10655.29
3       -20655.29
4       116141.06
          ...    
4988      -655.29
4989     40141.06
4990    -22858.94
4991      -655.29
4992    -40655.29
Name: AnnualSalary, Length: 4993, dtype: float64

In [11]:
# Second, generate the (structural) counterfactuals (cf) for X using Pearl's abduction, action, prediction steps:

# 2.1) Abduction (or individual error terms given each f): the noise term of each
# equation is the observed value minus the fitted value, rounded to 2 decimals.
# Build the DataFrame in one step instead of filling a dict and then rebinding the
# same name to a DataFrame: re-running this cell now always starts from a clean
# frame rather than carrying over columns added by later cells.
cf_df = pd.DataFrame({
    'u_AnnualSalary': round(df['AnnualSalary'] - model_sal.predict(x_sal), 2),
    'u_AccountBalance': round(df['AccountBalance'] - model_acc.predict(x_acc), 2),
})
cf_df.head(5)

Unnamed: 0,u_AnnualSalary,u_AccountBalance
0,-49858.94,-1452.13
1,19344.71,950.86
2,-10655.29,-3458.07
3,-20655.29,3563.38
4,116141.06,-8.85


In [12]:
# 2.2) Action + Prediction (X-wise): set Gender to the intervened value for every
# individual (female to male) and recompute AnnualSalary with each individual's own noise term.

# Action: read the intervened value from `do` instead of hard-coding 0, so the
# intervention is defined in a single place. Shape (n, 1) matches the design matrix
# model_sal was fitted on.
do_male = np.full((df.shape[0], 1), do['Gender'])
# Prediction: structural equation under the intervention plus the abducted noise.
cf_df['AnnualSalary'] = round(model_sal.predict(do_male) + cf_df['u_AnnualSalary'], 2)

cf_df.head(5)

Unnamed: 0,u_AnnualSalary,u_AccountBalance,AnnualSalary
0,-49858.94,-1452.13,50796.35
1,19344.71,950.86,120000.0
2,-10655.29,-3458.07,90000.0
3,-20655.29,3563.38,80000.0
4,116141.06,-8.85,216796.35


In [13]:
# Counterfactual inputs for the AccountBalance equation: the counterfactual
# AnnualSalary and the intervened Gender, in the same column order used for fitting.
do_male2 = cf_df[['AnnualSalary']].copy()
do_male2['Gender'] = do_male.ravel()
# model_acc was fitted on a plain NumPy array, so predict on an array as well;
# passing the DataFrame makes scikit-learn warn that X has feature names the model
# was not fitted with.
cf_df['AccountBalance'] = round(model_acc.predict(do_male2.to_numpy()) + cf_df['u_AccountBalance'], 2)

cf_df.head(5)

Unnamed: 0,u_AnnualSalary,u_AccountBalance,AnnualSalary,AccountBalance
0,-49858.94,-1452.13,50796.35,13852.05
1,19344.71,950.86,120000.0,36940.1
2,-10655.29,-3458.07,90000.0,23564.13
3,-20655.29,3563.38,80000.0,27596.57
4,116141.06,-8.85,216796.35,64912.94


In [14]:
# 2.3) Prediction (Y-wise): Generate cf_Y (when b is known)

# Coefficients; only beta_0 and beta_2 enter the decision rule below.
# NOTE(review): beta_1 is not used in this cell - confirm whether it is still needed.
beta_0, beta_1, beta_2 = 225000, (3/10), 5

# Decision rule b applied to the counterfactual features: sign of the linear score.
cf_score = cf_df['AnnualSalary'] + cf_df['AccountBalance'] * beta_2 - beta_0
cf_df['LoanApproval'] = np.sign(cf_score)

# keep track of A (the Gender column of df, i.e. before the intervention)
cf_df['Gender'] = df['Gender']

cf_df.head(5)

Unnamed: 0,u_AnnualSalary,u_AccountBalance,AnnualSalary,AccountBalance,LoanApproval,Gender
0,-49858.94,-1452.13,50796.35,13852.05,-1.0,1
1,19344.71,950.86,120000.0,36940.1,1.0,0
2,-10655.29,-3458.07,90000.0,23564.13,-1.0,0
3,-20655.29,3563.38,80000.0,27596.57,-1.0,0
4,116141.06,-8.85,216796.35,64912.94,1.0,1


In [15]:
# Store in results folder ('|'-separated, without the index column).
# os.path.join replaces the Windows-only '\\' concatenation and copes with
# resu_path already ending in a separator.
cf_df.to_csv(os.path.join(resu_path, 'cf_Karimi2020_v2.csv'), sep='|', index=False)