In [1]:
import random
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

In [2]:
# set working directory
wrk_dir = os.path.dirname(os.getcwd())
# set data path
data_path = wrk_dir + '\\' + 'data' + '\\'
# Loan Application v2 (factual) data
org_df = pd.read_csv(data_path + '\\' + 'clean_LawData.csv', sep='|', )

This scrip generates the structural counterfactuals (SCF) for the Law School data. We consider $|\mathbf{A}| = 2$ by looking at *Gender* (male vs female) and *Race* (white vs non-white). Under this setting, we generate two counterfactual datasets, which allows us to explore the case for **multiple discrimination**.

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# load scm model 
from scm_models.scm_law_school import LawSchool

In [2]:
# set relevant paths
wrk_dir = os.path.dirname(os.getcwd())
data_path = wrk_dir + '\\' + 'data' + '\\'
resu_path = wrk_dir + '\\' + 'results' + '\\'

### Pooled U

Similar to Kusner et al. (2017), we assume a hidden confounder $U$ that affects $UGPA$ and $LSAT$ (Level 2). We modify the original script by ignoring the first year law school grades. We use MCMC to draw $\hat{U}$ and, thus, perform the abduction step as done in the paper's code. Formally: 
- $Race -> UGPA$; $Race -> LSAT$; 
- $Gender -> UGPA$; $Gender -> LSAT$; 
- $U -> LSAT$ and $U -> UGPA$; 
- $Y <- f(UGPA, LSAT)$.

In [3]:
# params of interest

prefix = 'pU'  # meaning causal sufficiency is violated

# relevant vars list
vlist_protected = ['female', 'male', 'white', 'nonwhite']
vlist_latent = ['U']

In [4]:
df = pd.read_csv(resu_path + f'{prefix}_upd_LawData.csv', sep='|')
print(df.shape)
print(df.columns.tolist())
df.head(5)

(10896, 7)
['LSAT', 'UGPA', 'female', 'male', 'white', 'nonwhite', 'U']


Unnamed: 0,LSAT,UGPA,female,male,white,nonwhite,U
2,36,3.0,1,0,1,0,-0.736926
3,30,3.1,0,1,1,0,-0.267031
4,39,2.2,0,1,0,1,-1.647028
5,37,3.4,1,0,1,0,0.118742
6,30,3.6,1,0,1,0,0.50287


In [10]:
# the beta coefficients for the UGPA path(s)
ugpa_weights = pd.read_csv(resu_path + f'{prefix}_wUGPA_LawData.csv', sep='|')
ugpa_weights

Unnamed: 0,female,male,white,nonwhite,ugpa0,eta_u_ugpa
0,0.77306,0.640014,0.938756,0.713025,1.621285,0.350264


In [11]:
# the beta coefficients for the LSAT path(s)
lsat_weights = pd.read_csv(resu_path + f'{prefix}_wLSAT_LawData.csv', sep='|')
lsat_weights

Unnamed: 0,female,male,white,nonwhite,lsat0,eta_u_lsat
0,0.851098,0.867346,0.960812,0.823301,1.802908,0.032696


In [12]:
# check the diff files - todo for later...
ugpa_deltas = pd.read_csv(resu_path + f'{prefix}_delta_ugpa.csv', sep='|')
# plt.hist(ugpa_deltas['delta'])
lsat_deltas = pd.read_csv(resu_path + f'{prefix}_delta_lsat.csv', sep='|')
# plt.hist(lsat_deltas['delta'])

#### Use the SCM class: define the causal graph

In [14]:
# define the DAG
dag_law_school = [('U', 'UGPA', ugpa_weights.loc[0, 'eta_u_ugpa']),
                  ('U', 'LSAT', lsat_weights.loc[0, 'eta_u_lsat']),
                  ('female', 'UGPA', ugpa_weights.loc[0, 'female']),
                  ('male', 'UGPA', ugpa_weights.loc[0, 'male']),
                  ('white', 'UGPA', ugpa_weights.loc[0, 'white']),
                  ('nonwhite', 'UGPA', ugpa_weights.loc[0, 'nonwhite']),
                  ('female', 'LSAT', lsat_weights.loc[0, 'female']),
                  ('male', 'LSAT', lsat_weights.loc[0, 'male']),
                  ('white', 'LSAT', lsat_weights.loc[0, 'white']),
                  ('nonwhite', 'LSAT', lsat_weights.loc[0, 'nonwhite'])  
                 ]
dag_law_school

[('U', 'UGPA', 0.35026399476484804),
 ('U', 'LSAT', 0.0326959683105284),
 ('female', 'UGPA', 0.7730603366620609),
 ('male', 'UGPA', 0.6400142403982011),
 ('white', 'UGPA', 0.9387557544129591),
 ('nonwhite', 'UGPA', 0.7130248092744692),
 ('female', 'LSAT', 0.851097530659986),
 ('male', 'LSAT', 0.867345506707074),
 ('white', 'LSAT', 0.9608122538399481),
 ('nonwhite', 'LSAT', 0.823301257973325)]

In [15]:
# initiate the class LawSchool to get all the SCM methods (maybe too much for this...)
law_school = LawSchool(dag_law_school, 
                       end_vars=['UGPA', 'LSAT'], 
                       exo_vars=['U'], )

In [21]:
# it includes some nice methods
# print(law_school.nodes)
# print(law_school.weights)
# print(law_school.adjacency_mtr)
# print(law_school.adjacency_lst)

#### Use the SCM class: define the SEM

In [22]:
law_school.define_sem()

introduce the structural equation model as a dict via 'SEM'
provide def_ugpa function for UGPA
provide def_lsat function for LSAT
provide each in the form: 'lambda row: df_var(row[x1],...,row[xj])'


In [23]:
# UGPA
def pred_ugpa(v_u, v_female, v_male, v_white, v_nonwhite):
    return (ugpa_weights.loc[0, 'ugpa0'] + 
            law_school.adjacency_mtr.loc['U']['UGPA'] * v_u +
            law_school.adjacency_mtr.loc['female']['UGPA'] * v_female +
            law_school.adjacency_mtr.loc['male']['UGPA'] * v_male +
            law_school.adjacency_mtr.loc['white']['UGPA'] * v_white +
            law_school.adjacency_mtr.loc['nonwhite']['UGPA'] * v_nonwhite)

# LSAT
def pred_lsat(v_u, v_female, v_male, v_white, v_nonwhite):
    return np.exp(lsat_weights.loc[0, 'lsat0'] + 
                  law_school.adjacency_mtr.loc['U']['LSAT'] * v_u +
                  law_school.adjacency_mtr.loc['female']['LSAT'] * v_female +
                  law_school.adjacency_mtr.loc['male']['LSAT'] * v_male +
                  law_school.adjacency_mtr.loc['white']['LSAT'] * v_white +
                  law_school.adjacency_mtr.loc['nonwhite']['LSAT'] * v_nonwhite)

In [24]:
law_school.SEM['UGPA'] = lambda row: pred_ugpa(
    v_u=row['U'], v_female=row['female'], v_male=row['male'], v_white=row['white'], v_nonwhite=row['nonwhite'])

law_school.SEM['LSAT'] = lambda row: pred_lsat(
    v_u=row['U'], v_female=row['female'], v_male=row['male'], v_white=row['white'], v_nonwhite=row['nonwhite'])

In [25]:
law_school.define_sem()

class instance already has a structural equation model dict; overwrite it via 'SEM' if needed


#### Use the SCM class: generate the counterfactuals

In [27]:
# for running the SEM (i.e., get the factuals) | no need for us
test_df = law_school.run_sem(data=df)
test_df.head(5)

generating FCTs in the following order:
UGPA
LSAT
generated the new variables:
fct_UGPA
fct_LSAT


Unnamed: 0,LSAT,UGPA,female,male,white,nonwhite,U,fct_UGPA,fct_LSAT
2,36,3.0,1,0,1,0,-0.736926,3.074983,36.260282
3,30,3.1,0,1,1,0,-0.267031,3.106524,37.424839
4,39,2.2,0,1,0,1,-1.647028,2.39743,31.177707
5,37,3.4,1,0,1,0,0.118742,3.374692,37.289056
6,30,3.6,1,0,1,0,0.50287,3.509239,37.76034


In [28]:
# test_do = {'female': 0, 'male': 1}
# test_do_desc = 'do_male'
do_male = law_school.generate_scfs(do={'female': 0, 'male': 1}, 
                                   do_desc='do_male', 
                                   data=df)
do_male.head(5)

do(female=0)
do(male=1)
generating SCFs in the following order:
UGPA
LSAT
generated the new variables:
scf_UGPA
scf_LSAT


Unnamed: 0,LSAT,UGPA,female,male,white,nonwhite,U,org_female,org_male,scf_UGPA,scf_LSAT
2,36,3.0,0,1,1,0,-0.736926,1,0,2.941937,36.854251
3,30,3.1,0,1,1,0,-0.267031,0,1,3.106524,37.424839
4,39,2.2,0,1,0,1,-1.647028,0,1,2.39743,31.177707
5,37,3.4,0,1,1,0,0.118742,1,0,3.241646,37.899877
6,30,3.6,0,1,1,0,0.50287,1,0,3.376193,38.37888


In [29]:
# test_do = {'nonwhite': 0, 'white': 1}
# test_do_desc = 'do_white'
do_white = law_school.generate_scfs(do={'nonwhite': 0, 'white': 1},
                                    do_desc='do_white',
                                    data=df)
do_white.head(5)

do(nonwhite=0)
do(white=1)
generating SCFs in the following order:
UGPA
LSAT
generated the new variables:
scf_UGPA
scf_LSAT


Unnamed: 0,LSAT,UGPA,female,male,white,nonwhite,U,org_nonwhite,org_white,scf_UGPA,scf_LSAT
2,36,3.0,1,0,1,0,-0.736926,0,1,3.074983,36.260282
3,30,3.1,0,1,1,0,-0.267031,0,1,3.106524,37.424839
4,39,2.2,0,1,1,0,-1.647028,1,0,2.623161,35.773747
5,37,3.4,1,0,1,0,0.118742,0,1,3.374692,37.289056
6,30,3.6,1,0,1,0,0.50287,0,1,3.509239,37.76034


### A 'known' decision maker

To frame it as a (discrete) decision making process, we include an *admissions officer* based on [the "known" requirements of US Law Schools](https://schools.lawschoolnumbers.com/). We assume the case for Yale School. We could not find a one-to-one converter between LSAT over 48 to over 180. 173/180 is 96%; it would be abloud 46.1/48. Let's assume Yale cuts at these median values, and puts a slightly higher weight on UGPA over LSAT.

In [9]:
# Our decision maker:
b1 = 0.6
b2 = 0.4
min_score = round(b1*3.93 + b2*46.1, 2)  # 20.8
max_score = round(b1*4.00 + b2*48.00)    # 22

In [30]:
# add it to the factuals
df['Score'] = b1*df['UGPA'] + b2*df['LSAT']
df['Y'] = np.where(df['Score'] >= min_score, 1, 0)
df.head(5)

Unnamed: 0,LSAT,UGPA,female,male,white,nonwhite,U,Score,Y
2,36,3.0,1,0,1,0,-0.736926,16.2,0
3,30,3.1,0,1,1,0,-0.267031,13.86,0
4,39,2.2,0,1,0,1,-1.647028,16.92,0
5,37,3.4,1,0,1,0,0.118742,16.84,0
6,30,3.6,1,0,1,0,0.50287,14.16,0


In [31]:
# add it to the counterfactuals (gender)
do_male['Score'] = b1*do_male['UGPA'] + b2*do_male['LSAT']
do_male['Y'] = np.where(do_male['Score'] >= min_score, 1, 0)
do_male.head(5)

Unnamed: 0,LSAT,UGPA,female,male,white,nonwhite,U,org_female,org_male,scf_UGPA,scf_LSAT,Score,Y
2,36,3.0,0,1,1,0,-0.736926,1,0,2.941937,36.854251,16.2,0
3,30,3.1,0,1,1,0,-0.267031,0,1,3.106524,37.424839,13.86,0
4,39,2.2,0,1,0,1,-1.647028,0,1,2.39743,31.177707,16.92,0
5,37,3.4,0,1,1,0,0.118742,1,0,3.241646,37.899877,16.84,0
6,30,3.6,0,1,1,0,0.50287,1,0,3.376193,38.37888,14.16,0


In [32]:
# add it to the counterfactuals (race)
do_white['Score'] = b1*do_white['UGPA'] + b2*do_white['LSAT']
do_white['Y'] = np.where(do_white['Score'] >= min_score, 1, 0)
do_white.head(5)

Unnamed: 0,LSAT,UGPA,female,male,white,nonwhite,U,org_nonwhite,org_white,scf_UGPA,scf_LSAT,Score,Y
2,36,3.0,1,0,1,0,-0.736926,0,1,3.074983,36.260282,16.2,0
3,30,3.1,0,1,1,0,-0.267031,0,1,3.106524,37.424839,13.86,0
4,39,2.2,0,1,1,0,-1.647028,1,0,2.623161,35.773747,16.92,0
5,37,3.4,1,0,1,0,0.118742,0,1,3.374692,37.289056,16.84,0
6,30,3.6,1,0,1,0,0.50287,0,1,3.509239,37.76034,14.16,0


In [34]:
df.groupby('Y').count()

Unnamed: 0_level_0,LSAT,UGPA,female,male,white,nonwhite,U,Score
Y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,10656,10656,10656,10656,10656,10656,10656,10656
1,240,240,240,240,240,240,240,240


In [35]:
do_male.groupby('Y').count()

Unnamed: 0_level_0,LSAT,UGPA,female,male,white,nonwhite,U,org_female,org_male,scf_UGPA,scf_LSAT,Score
Y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,10656,10656,10656,10656,10656,10656,10656,10656,10656,10656,10656,10656
1,240,240,240,240,240,240,240,240,240,240,240,240


In [36]:
do_white.groupby('Y').count()

Unnamed: 0_level_0,LSAT,UGPA,female,male,white,nonwhite,U,org_nonwhite,org_white,scf_UGPA,scf_LSAT,Score
Y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,10656,10656,10656,10656,10656,10656,10656,10656,10656,10656,10656,10656
1,240,240,240,240,240,240,240,240,240,240,240,240


In [None]:
# we need to add a decision outcome: say, to enter Harvard (go to that website)

In [None]:
# store both datasets for cfST
do_male.to_csv(resu_path + '\\counterfactuals\\' + 'cf_male_LawSchoolData.csv', sep='|', index=False)
do_white.to_csv(resu_path + '\\counterfactuals\\' + 'cf_white_LawSchoolData.csv', sep='|', index=False)