# Generate structural counterfactuals (SCF) for Law School

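This notebook loads the law-school data and the fitted UGPA/LSAT weights, defines the DAG and the structural equation model (SEM), and uses them to generate the factuals and the structural counterfactuals (SCFs). Exploratory nearest-neighbour cells and scratch notes follow at the end.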

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scm_models.scm_law_school import LawSchool


In [2]:
# set relevant paths
# Folders are built with os.path.join so the separator matches the host OS
# (the hard-coded '\\' only produced valid paths on Windows). The final ''
# component keeps the trailing separator that the `rslt_path + filename`
# concatenations further down rely on.
wrk_dir = os.path.dirname(os.getcwd())
data_path = os.path.join(wrk_dir, 'data', '')
rslt_path = os.path.join(wrk_dir, 'results', '')

## Pooled U

In [3]:
# File-name prefix shared by every artefact loaded in this section.
prefix = "pU"

# Column groups: the protected-attribute indicators and the latent variable.
vlist_protected = [
    "female",
    "male",
    "white",
    "nonwhite",
]
vlist_latent = ["U"]

In [4]:
# Load the '|'-separated law-school data for this prefix; besides the observed
# LSAT/UGPA and the protected indicators it already carries the latent column U.
df = pd.read_csv(rslt_path + f'{prefix}_upd_LawData.csv', sep='|')
print(df.shape)  # (rows, columns)
print(df.columns.tolist())
df.head(5)

(10896, 7)
['LSAT', 'UGPA', 'female', 'male', 'white', 'nonwhite', 'U']


Unnamed: 0,LSAT,UGPA,female,male,white,nonwhite,U
2,36,3.0,1,0,1,0,-0.736926
3,30,3.1,0,1,1,0,-0.267031
4,39,2.2,0,1,0,1,-1.647028
5,37,3.4,1,0,1,0,0.118742
6,30,3.6,1,0,1,0,0.50287


In [5]:
# UGPA weights for this prefix: a single row holding one weight per protected
# indicator, the constant term (ugpa0) and the weight of U (eta_u_ugpa).
ugpa_weights = pd.read_csv(rslt_path + f'{prefix}_wUGPA_LawData.csv', sep='|')
ugpa_weights

Unnamed: 0,female,male,white,nonwhite,ugpa0,eta_u_ugpa
0,0.77306,0.640014,0.938756,0.713025,1.621285,0.350264


In [6]:
# LSAT weights for this prefix: a single row holding one weight per protected
# indicator, the constant term (lsat0) and the weight of U (eta_u_lsat).
lsat_weights = pd.read_csv(rslt_path + f'{prefix}_wLSAT_LawData.csv', sep='|')
lsat_weights

Unnamed: 0,female,male,white,nonwhite,lsat0,eta_u_lsat
0,0.851098,0.867346,0.960812,0.823301,1.802908,0.032696


In [7]:
# check the diff files...
# UGPA delta file for this prefix; it is only loaded here -- the cells that
# would plot it are commented out.
ugpa_deltas = pd.read_csv(rslt_path + f'{prefix}_delta_ugpa.csv', sep='|')

In [8]:
# LSAT delta file for this prefix (same '|'-separated format as the UGPA one).
lsat_deltas = pd.read_csv(rslt_path + f'{prefix}_delta_lsat.csv', sep='|')

In [None]:
# plt.hist(ugpa_deltas[ugpa_deltas['sex']=='Female']['delta'], label = 'Female')
# plt.hist(ugpa_deltas[ugpa_deltas['sex']=='Male']['delta'], label = 'Male')
# plt.legend(loc='upper right')

In [None]:
# plt.hist(lsat_deltas[ugpa_deltas['sex']=='Female']['delta'], label = 'Female')
# plt.hist(lsat_deltas[ugpa_deltas['sex']=='Male']['delta'], label = 'Male')
# plt.legend(loc='upper right')

### Define the DAG

In [9]:
# Weighted edge list (parent, child, weight) of the law-school DAG.
# U comes first, then every protected indicator into UGPA, then every
# protected indicator into LSAT; weights are read from the fitted weight rows.
dag_law_school = [
    ('U', 'UGPA', ugpa_weights.loc[0, 'eta_u_ugpa']),
    ('U', 'LSAT', lsat_weights.loc[0, 'eta_u_lsat']),
    *[
        (parent, child, fitted.loc[0, parent])
        for child, fitted in (('UGPA', ugpa_weights), ('LSAT', lsat_weights))
        for parent in ('female', 'male', 'white', 'nonwhite')
    ],
]
dag_law_school

[('U', 'UGPA', 0.35026399476484804),
 ('U', 'LSAT', 0.0326959683105284),
 ('female', 'UGPA', 0.7730603366620609),
 ('male', 'UGPA', 0.6400142403982011),
 ('white', 'UGPA', 0.9387557544129591),
 ('nonwhite', 'UGPA', 0.7130248092744692),
 ('female', 'LSAT', 0.851097530659986),
 ('male', 'LSAT', 0.867345506707074),
 ('white', 'LSAT', 0.9608122538399481),
 ('nonwhite', 'LSAT', 0.823301257973325)]

In [10]:
# Wrap the weighted edge list in the project's LawSchool model, declaring
# UGPA and LSAT as end variables and U as the exogenous variable.
law_school = LawSchool(
    dag_law_school,
    end_vars=['UGPA', 'LSAT'],
    exo_vars=['U'],
)

In [11]:
# Display the node names held by the model.
law_school.nodes

['U', 'UGPA', 'LSAT', 'female', 'male', 'white', 'nonwhite']

In [12]:
# Display the edge weights, keyed by (parent, child).
law_school.weights

{('U', 'UGPA'): 0.35026399476484804,
 ('U', 'LSAT'): 0.0326959683105284,
 ('female', 'UGPA'): 0.7730603366620609,
 ('male', 'UGPA'): 0.6400142403982011,
 ('white', 'UGPA'): 0.9387557544129591,
 ('nonwhite', 'UGPA'): 0.7130248092744692,
 ('female', 'LSAT'): 0.851097530659986,
 ('male', 'LSAT'): 0.867345506707074,
 ('white', 'LSAT'): 0.9608122538399481,
 ('nonwhite', 'LSAT'): 0.823301257973325}

In [13]:
# Display the weighted adjacency matrix (rows = parents, columns = children);
# the prediction functions below read their weights from it.
law_school.adjacency_mtr

Unnamed: 0,U,UGPA,LSAT,female,male,white,nonwhite
U,0,0.350264,0.032696,0,0,0,0
UGPA,0,0.0,0.0,0,0,0,0
LSAT,0,0.0,0.0,0,0,0,0
female,0,0.77306,0.851098,0,0,0,0
male,0,0.640014,0.867346,0,0,0,0
white,0,0.938756,0.960812,0,0,0,0
nonwhite,0,0.713025,0.823301,0,0,0,0


In [14]:
# Display the adjacency list: each node mapped to the list of its children.
law_school.adjacency_lst

{'U': ['UGPA', 'LSAT'],
 'UGPA': [],
 'LSAT': [],
 'female': ['UGPA', 'LSAT'],
 'male': ['UGPA', 'LSAT'],
 'white': ['UGPA', 'LSAT'],
 'nonwhite': ['UGPA', 'LSAT']}

### Define the SEM

In [15]:

# UGPA
def pred_ugpa(v_u, v_female, v_male, v_white, v_nonwhite):
    """Predict UGPA from U and the protected-attribute indicators.

    Linear structural equation: the constant ``ugpa0`` taken from
    ``ugpa_weights`` plus each parent's value multiplied by its edge weight
    into 'UGPA', read from ``law_school.adjacency_mtr``.

    Args:
        v_u: value of the latent variable U.
        v_female: value of the 'female' indicator.
        v_male: value of the 'male' indicator.
        v_white: value of the 'white' indicator.
        v_nonwhite: value of the 'nonwhite' indicator.

    Returns:
        The predicted UGPA.
    """
    # Single-step .loc[row, col] lookups: the chained .loc[row][col] form
    # materialises a whole row Series for every weight on every call, and this
    # function is invoked once per data row.
    edge_w = law_school.adjacency_mtr
    return (ugpa_weights.loc[0, 'ugpa0'] +
            edge_w.loc['U', 'UGPA'] * v_u +
            edge_w.loc['female', 'UGPA'] * v_female +
            edge_w.loc['male', 'UGPA'] * v_male +
            edge_w.loc['white', 'UGPA'] * v_white +
            edge_w.loc['nonwhite', 'UGPA'] * v_nonwhite)

# LSAT
def pred_lsat(v_u, v_female, v_male, v_white, v_nonwhite):
    """Predict LSAT from U and the protected-attribute indicators.

    Log-linear structural equation: the exponential of the constant ``lsat0``
    taken from ``lsat_weights`` plus each parent's value multiplied by its
    edge weight into 'LSAT', read from ``law_school.adjacency_mtr``.

    Args:
        v_u: value of the latent variable U.
        v_female: value of the 'female' indicator.
        v_male: value of the 'male' indicator.
        v_white: value of the 'white' indicator.
        v_nonwhite: value of the 'nonwhite' indicator.

    Returns:
        The predicted LSAT.
    """
    # Single-step .loc[row, col] lookups: the chained .loc[row][col] form
    # materialises a whole row Series for every weight on every call, and this
    # function is invoked once per data row.
    edge_w = law_school.adjacency_mtr
    return np.exp(lsat_weights.loc[0, 'lsat0'] +
                  edge_w.loc['U', 'LSAT'] * v_u +
                  edge_w.loc['female', 'LSAT'] * v_female +
                  edge_w.loc['male', 'LSAT'] * v_male +
                  edge_w.loc['white', 'LSAT'] * v_white +
                  edge_w.loc['nonwhite', 'LSAT'] * v_nonwhite)


In [16]:
# not an ideal solution... hard to scale this var-specific approach tbh
# First call: per the message it prints, the model expects one function per
# end variable to be supplied through the `SEM` dict.
law_school.define_sem()

introduce the structural equation model as a dict via 'SEM'
provide def_ugpa function for UGPA
provide def_lsat function for LSAT
provide each in the form: 'lambda row: df_var(row[x1],...,row[xj])'


In [17]:
# Register the structural equations: each entry maps a data row to the
# prediction for that end variable, pulling the parents' values out of the row.
law_school.SEM['UGPA'] = lambda row: pred_ugpa(
    v_u=row['U'],
    v_female=row['female'],
    v_male=row['male'],
    v_white=row['white'],
    v_nonwhite=row['nonwhite'],
)

law_school.SEM['LSAT'] = lambda row: pred_lsat(
    v_u=row['U'],
    v_female=row['female'],
    v_male=row['male'],
    v_white=row['white'],
    v_nonwhite=row['nonwhite'],
)

In [18]:
# Second call, after filling `SEM`: the printed message reports that the
# instance already has a structural equation model dict.
law_school.define_sem()

class instance already has a structural equation model dict; overwrite it via 'SEM' if needed


### Generate the factuals

In [19]:
# Run the registered structural equations on df; per the printed log this
# generates fct_UGPA and fct_LSAT.
test_df = law_school.run_sem(data=df)

generating FCTs in the following order:
UGPA
LSAT
generated the new variables:
fct_UGPA
fct_LSAT


In [20]:
# Inspect the first rows, now including the fct_* columns.
test_df.head(5)

Unnamed: 0,LSAT,UGPA,female,male,white,nonwhite,U,fct_UGPA,fct_LSAT
2,36,3.0,1,0,1,0,-0.736926,3.074983,36.260282
3,30,3.1,0,1,1,0,-0.267031,3.106524,37.424839
4,39,2.2,0,1,0,1,-1.647028,2.39743,31.177707
5,37,3.4,1,0,1,0,0.118742,3.374692,37.289056
6,30,3.6,1,0,1,0,0.50287,3.509239,37.76034


In [None]:
# df['fct_UGPA'] = df.apply(lambda row: pred_ugpa(
#     v_u=row['U'], v_female=row['female'], v_male=row['male'], v_white=row['white'], v_nonwhite=row['nonwhite']),
#                           axis=1
#                          )

# df['fct_LSAT'] = df.apply(lambda row: pred_lsat(
#     v_u=row['U'], v_female=row['female'], v_male=row['male'], v_white=row['white'], v_nonwhite=row['nonwhite']), 
#                           axis=1
#                          )

### Generate the structural counterfactuals

In [21]:
# Intervention to apply (female := 0, male := 1) and the label describing it.
test_do = dict(female=0, male=1)
test_do_desc = "do_male"

In [22]:
# Apply the intervention to the factual frame; per the printed log this
# generates scf_UGPA and scf_LSAT. The displayed result also shows org_female /
# org_male columns holding the pre-intervention values.
test_df2 = law_school.generate_scfs(do=test_do, do_desc=test_do_desc, data=test_df)

do(female=0)
do(male=1)
generating SCFs in the following order:
UGPA
LSAT
generated the new variables:
scf_UGPA
scf_LSAT


In [23]:
# Display the frame with factuals (fct_*) and structural counterfactuals (scf_*).
test_df2

Unnamed: 0,LSAT,UGPA,female,male,white,nonwhite,U,fct_UGPA,fct_LSAT,org_female,org_male,scf_UGPA,scf_LSAT
2,36,3.0,0,1,1,0,-0.736926,3.074983,36.260282,1,0,2.941937,36.854251
3,30,3.1,0,1,1,0,-0.267031,3.106524,37.424839,0,1,3.106524,37.424839
4,39,2.2,0,1,0,1,-1.647028,2.397430,31.177707,0,1,2.397430,31.177707
5,37,3.4,0,1,1,0,0.118742,3.374692,37.289056,1,0,3.241646,37.899877
6,30,3.6,0,1,1,0,0.502870,3.509239,37.760340,1,0,3.376193,38.378880
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21781,33,3.8,0,1,1,0,0.927105,3.657833,38.287753,1,0,3.524787,38.914933
21783,31,3.5,0,1,1,0,0.319252,3.444924,37.534322,1,0,3.311878,38.149160
21784,28,3.3,0,1,1,0,0.164339,3.257617,37.956422,0,1,3.257617,37.956422
21788,36,4.0,0,1,1,0,1.426279,3.832676,38.917775,1,0,3.699629,39.555275


In [None]:
# individual SCF method?

### KNN?

In [None]:
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import BallTree
import scipy

In [None]:
# consider the group female
# The female group is selected as the rows with male == 0.
df[df['male'] == 0].head(5)

In [None]:
# consider the female i=2
i = 2  # index label (used with .loc below), not a row position
# such that
# NOTE(review): the original read from `do_df`, which is never defined in this
# notebook. The frame returned by generate_scfs (test_df2) carries the columns
# used here, so it is used instead -- confirm this is the intended frame.
i_fct = test_df2.loc[i, ['UGPA', 'LSAT']] # or should I predict it? should I also incorporate threshold
print(i_fct)
i_scf = test_df2.loc[i, ['scf_UGPA', 'scf_LSAT']]
print(i_scf)

In [None]:
# create the control group around i_fct
X_cont = df[df['male']==0][['UGPA', 'LSAT']].copy()
X_cont.head(5)

In [None]:
# Two-column array form of the control features, as passed to BallTree next.
X_cont.to_numpy()

In [None]:
# Ball tree over the control group's (UGPA, LSAT) points.
tree = BallTree(X_cont.to_numpy(), leaf_size=2) 
tree

In [None]:
# `i` is an index label (it is used with .loc elsewhere in this section), so
# the query point is selected by label. Indexing the numpy array with `i`
# instead picked the row at *position* i, i.e. a different individual.
# The returned `ind` values are positions into X_cont (use .iloc to look them up).
dist, ind = tree.query(X_cont.loc[[i]].to_numpy(), k=11)
#dist, ind = tree.query([i_fct.to_numpy()[i]], k=11) 
print(dist)
print(ind)

In [None]:
# it returns itself
# NOTE(review): 205 is a hard-coded row position, presumably copied from the
# `ind` output of an earlier run -- confirm it still matches the query above.
print(i_fct)
X_cont.iloc[205]

In [None]:
# create the test group around i_fct
# NOTE(review): the original read from `do_df`, which is never defined in this
# notebook; test_df2 (the output of generate_scfs) has the org_male and scf_*
# columns used here, so it is used instead -- confirm.
# Counterfactual (scf_UGPA, scf_LSAT) of the rows that were originally male.
X_test = test_df2[test_df2['org_male']==1][['scf_UGPA', 'scf_LSAT']].copy()
X_test.head(5)

In [None]:
# Ball tree over the test group's (scf_UGPA, scf_LSAT) points.
tree_t = BallTree(X_test.to_numpy(), leaf_size=2) 
tree_t

In [None]:
# Query point for the test-group search: the SCF of individual i as a 1-row list.
# NOTE(review): reads test_df2 because `do_df` is never defined in this
# notebook -- confirm this is the intended frame.
[test_df2.loc[i, ['scf_UGPA', 'scf_LSAT']].to_numpy()]

In [None]:
# Search the counterfactual of individual i among the *test* group.
# NOTE(review): two changes against the original line -- confirm both:
#   * `do_df` is never defined in this notebook; test_df2 (the output of
#     generate_scfs) is used instead;
#   * the query ran against `tree` (control group) although the result is then
#     looked up in X_test, so it now uses `tree_t`, the tree built on X_test.
# The returned `ind` values are positions into X_test.
dist, ind = tree_t.query(test_df2.loc[[i], ['scf_UGPA', 'scf_LSAT']].to_numpy(), k=11)
#dist, ind = tree.query([i_fct.to_numpy()[i]], k=11) 
print(dist)
print(ind)

In [None]:
# Compare individual i's factual / counterfactual with one test-group row.
# NOTE(review): 3408 is hard-coded and looked up by *label* (.loc), while the
# `ind` values returned by a tree query are positions -- confirm which is meant.
print(i_fct)
print(i_scf)
X_test.loc[3408, ]

In [None]:
# The joint search feels a bit strange?! Ask Salvatore... we could prioritize given the causal order (i.e. the weights?)
# but this seems to be the pipeline, no? Generate i_fct and i_scf, train a ball tree on A=a and A=a' respectively,
# and then find the closest neighbors


### --- misc:

In [None]:
# male ugpa + white ugpa + ugpa_intercept
0.761571 + 0.880808 + 1.574632
# gives the intercept when I run male and white as the baseline
# but here is interesting bcs i wonder how this looks in terms of edges, no?

In [None]:
"""
same dist of Us and LSAT and UGPA for male and white as benchmarks
> sense_cols
[1] "female"   "nonwhite"
> eta_a_ugpa
[1]  0.1132429 -0.2275812
> eta_a_lsat
[1] -0.01449607 -0.13603405
> ugpa0
[1] 3.216965
> lsat0
[1] 3.627609
"""

In [None]:
# we can retrieve the weights too...
# ugpa0 would mean white and male.. thus:
3.216965 + -0.2275812 # male + non-white

# vs all vars
0.761571 + 0.652492 -0.262836

In [None]:
"""
without the intercepts but still same dist of Us and LSAT and UGPA for male and white as benchmarks
> sense_cols
[1] "female"   "male"     "white"    "nonwhite"
> eta_a_ugpa
[1] 1.621399 1.508263 1.708761 1.480915
> eta_a_lsat
[1] 1.776778 1.791329 1.836215 1.700086

> eta_u_ugpa
[1] 0.2690262
> eta_u_lsat
[1] 0.02924528
"""

#https://stats.stackexchange.com/questions/7948/when-is-it-ok-to-remove-the-intercept-in-a-linear-regression-model
#https://stats.oarc.ucla.edu/other/mult-pkg/faq/general/faq-why-are-r2-and-f-so-large-for-models-without-a-constant/

# First, there is the question of whether or not to include the intercept, which I think we should. Otherwise, we claim
# that the function passes through 0 and E[Y|X=0]=0, which is not the case here.
# Second, whether to include all the single encodings or only the diverging groups:
# based on standard ML and stats practice, if we include the intercept, we should drop the base group(s) for proper identification.
# Not sure why Kusner didn't do this in their paper... when, e.g., they ran their models, R kept dropping variables
# because they had an intercept!

In [None]:
# we need to know the functional specifications of each X
# the dag to know where to intervene 
# tbh.... I don't get the weights? why do we need to find U|evidence? the abduction step makes no sense!!!
# maybe ask karima?

In [None]:
# consider some factual of a female candidate
# Here i is a row *position* (used with .iloc), so this shows the first row of df.
i = 0 
df.iloc[i,]

In [None]:
# NOTE(review): `ugpa_w` was not defined anywhere in this notebook; it is bound
# here to the single row of the UGPA weights frame so the lookups below return
# scalars -- confirm this is what was meant.
ugpa_w = ugpa_weights.iloc[0]
# Hand-computed UGPA for row i with the male and white weights switched on:
# constant + weight of U times U + white weight + male weight.
i_ugpa_scf = ugpa_w["ugpa0"] + ugpa_w["eta_u_ugpa"]*df.iloc[i]['U'] + ugpa_w['white'] + ugpa_w['male']
i_ugpa_scf

In [None]:
# NOTE(review): `lsat_w` was not defined anywhere in this notebook; it is bound
# here to the single row of the LSAT weights frame so the lookups below return
# scalars -- confirm this is what was meant.
lsat_w = lsat_weights.iloc[0]
# Hand-computed LSAT for row i with the male and white weights switched on:
# exp(constant + weight of U times U + white weight + male weight).
i_lsat_scf = np.exp(lsat_w["lsat0"] + lsat_w["eta_u_lsat"]*df.iloc[i]['U'] + lsat_w['white'] + lsat_w['male'])
i_lsat_scf

In [None]:
# the above mapping needs to be made cleaner... like a class.. translate the dag into a mapping [check the imt scripts for inspiration]
