In [None]:
# Example - https://github.com/cornelliusyudhawijaya/Churn_Causality_Analysis/blob/main/Causal%20Analysis%20Do%20Why.ipynb
    
import numpy as np
import pandas as pd

from dowhy import CausalModel
import dowhy.datasets

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import sklearn.metrics as metrics


import sys,os
sys.path.append(os.path.abspath('..'))


from Measure import measure_final_score,calculate_recall,calculate_far,calculate_precision,calculate_accuracy
from Generate_Samples import generate_samples

# Silence ALL warnings (not only sklearn's DataConversionWarning) to keep cell output readable
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings('ignore')

# Prepare Data

In [None]:
from sklearn import preprocessing
# Load the raw CSV and drop every row that has a NULL in any column.
data = pd.read_csv('../../../data/h181.csv').dropna()

# Strip the round/year suffix from the column names referenced further down.
suffixed_to_plain = {
    'FTSTU53X' : 'FTSTU', 'ACTDTY53' : 'ACTDTY', 'HONRDC53' : 'HONRDC', 'RTHLTH53' : 'RTHLTH',
    'MNHLTH53' : 'MNHLTH', 'CHBRON53' : 'CHBRON', 'JTPAIN53' : 'JTPAIN', 'PREGNT53' : 'PREGNT',
    'WLKLIM53' : 'WLKLIM', 'ACTLIM53' : 'ACTLIM', 'SOCLIM53' : 'SOCLIM', 'COGLIM53' : 'COGLIM',
    'EMPST53' : 'EMPST', 'REGION53' : 'REGION', 'MARRY53X' : 'MARRY', 'AGE53X' : 'AGE',
    'POVCAT15' : 'POVCAT', 'INSCOV15' : 'INSCOV',
}
data = data.rename(columns=suffixed_to_plain)

# Columns in which only codes >= -1 are accepted (anything below -1 drops the row).
min_minus_one_columns = [
    'FTSTU', 'ACTDTY', 'HONRDC', 'RTHLTH', 'MNHLTH', 'HIBPDX', 'CHDDX', 'ANGIDX', 'EDUCYR', 'HIDEG',
    'MIDX', 'OHRTDX', 'STRKDX', 'EMPHDX', 'CHBRON', 'CHOLDX', 'CANCERDX', 'DIABDX',
    'JTPAIN', 'ARTHDX', 'ARTHTYPE', 'ASTHDX', 'ADHDADDX', 'PREGNT', 'WLKLIM',
    'ACTLIM', 'SOCLIM', 'COGLIM', 'DFHEAR42', 'DFSEE42', 'ADSMOK42',
    'PHQ242', 'EMPST', 'POVCAT', 'INSCOV',
]

# All row filters combined into a single mask; a row is kept only if it passes every test.
rows_to_keep = (
    (data['PANEL'] == 20)
    & (data['REGION'] >= 0)     # negative REGION codes removed
    & (data['AGE'] >= 0)        # negative AGE codes removed
    & (data['MARRY'] >= 0)      # negative MARRY codes (-1, -7, -8, -9) removed
    & (data['ASTHDX'] >= 0)     # negative ASTHDX codes (-1, -7, -8, -9) removed
    & (data[min_minus_one_columns] >= -1).all(axis=1)
)
data = data[rows_to_keep]

# ## Change symbolics to numerics
# Binary race flag: 1 only when BOTH HISPANX == 2 and RACEV2X == 1 hold, 0 for every
# other row. The HISPANX test has to be part of the final condition: writing 1 where
# RACEV2X is already 1 and then zeroing only the rows with RACEV2X != 1 would leave
# rows with RACEV2X == 1 but HISPANX != 2 encoded as 1.
# NOTE(review): presumably HISPANX == 2 means "not Hispanic" and RACEV2X == 1 means
# "White" in the MEPS codebook - confirm. This matches AIF360's MEPS preprocessing.
race_is_one = (data['HISPANX'] == 2) & (data['RACEV2X'] == 1)
data['RACEV2X'] = np.where(race_is_one, 1, 0)
data = data.rename(columns={"RACEV2X" : "RACE"})
# data['UTILIZATION'] = np.where(data['UTILIZATION'] >= 10, 1, 0)



def utilization(row):
    """Return the sum of the five utilization columns of one record.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Must provide the keys 'OBTOTV15', 'OPTOTV15', 'ERTOT15',
        'IPNGTD15' and 'HHTOTD15'.

    Returns
    -------
    The values under those keys added together, left to right.
    """
    column_names = ('OBTOTV15', 'OPTOTV15', 'ERTOT15', 'IPNGTD15', 'HHTOTD15')
    total = row[column_names[0]]
    for column_name in column_names[1:]:
        total = total + row[column_name]
    return total

# Per-row utilization total, stored (for now) in the TOTEXP15 column.
data['TOTEXP15'] = data.apply(utilization, axis=1)

# Binarize at a threshold of 10. The "below" mask is written first, so the zeros it
# produces can never match the "at or above" mask that is evaluated afterwards.
lessE = data['TOTEXP15'].lt(10.0)
data.loc[lessE, 'TOTEXP15'] = 0.0
moreE = data['TOTEXP15'].ge(10.0)
data.loc[moreE, 'TOTEXP15'] = 1.0

# The binary column is the prediction target from here on.
data = data.rename(columns={'TOTEXP15': 'UTILIZATION'})

# Columns retained for modelling, in the order they appear in the final frame.
selected_columns = [
    'REGION', 'AGE', 'SEX', 'RACE', 'MARRY',
    'FTSTU', 'ACTDTY', 'HONRDC', 'RTHLTH', 'MNHLTH', 'HIBPDX', 'CHDDX', 'ANGIDX',
    'MIDX', 'OHRTDX', 'STRKDX', 'EMPHDX', 'CHBRON', 'CHOLDX', 'CANCERDX', 'DIABDX',
    'JTPAIN', 'ARTHDX', 'ARTHTYPE', 'ASTHDX', 'ADHDADDX', 'PREGNT', 'WLKLIM',
    'ACTLIM', 'SOCLIM', 'COGLIM', 'DFHEAR42', 'DFSEE42', 'ADSMOK42',
    'PCS42', 'MCS42', 'K6SUM42', 'PHQ242', 'EMPST', 'POVCAT', 'INSCOV', 'UTILIZATION', 'PERWT15F',
]
data = data[selected_columns]

# Rename the target to "Probability" and the protected column to "race".
dataset_orig = data.rename(columns={"UTILIZATION": "Probability", "RACE": "race"})
protected_attribute = 'race'


from sklearn.preprocessing import MinMaxScaler

# Min-max scale every column; the result is rebuilt as a DataFrame with the same
# column names (and a fresh default index).
# NOTE(review): the scaler is fit on all rows before the train/test split - confirm
# that this is intended.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(dataset_orig)
dataset_orig = pd.DataFrame(scaled_values, columns=dataset_orig.columns)

# Shuffled 80/20 split with a fixed seed.
dataset_orig_train, dataset_orig_test = train_test_split(
    dataset_orig,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

dataset_orig.head(5)

# Fair-SMOTE

In [None]:
# Balance the training data across the four (class, protected attribute) subgroups.
# In every name below, the first word is the class value and the second word is the
# protected-attribute value.
label_is_0 = dataset_orig_train['Probability'] == 0
label_is_1 = dataset_orig_train['Probability'] == 1
attr_is_0 = dataset_orig_train[protected_attribute] == 0
attr_is_1 = dataset_orig_train[protected_attribute] == 1

# Build each subgroup once. `.copy()` makes them independent frames, so the dtype
# change below does not write into a slice of dataset_orig_train.
df_zero_zero = dataset_orig_train[label_is_0 & attr_is_0].copy()
df_zero_one = dataset_orig_train[label_is_0 & attr_is_1].copy()
df_one_zero = dataset_orig_train[label_is_1 & attr_is_0].copy()
df_one_one = dataset_orig_train[label_is_1 & attr_is_1].copy()

zero_zero = len(df_zero_zero)
zero_one = len(df_zero_one)
one_zero = len(df_one_zero)
one_one = len(df_one_one)
print(zero_zero,zero_one,one_zero,one_one)

# Report every subgroup whose size equals the largest size (ties print more than once).
maximum = max(zero_zero,zero_one,one_zero,one_one)
for group_name, group_size in (("zero_zero", zero_zero), ("zero_one", zero_one),
                               ("one_zero", one_zero), ("one_one", one_one)):
    if group_size == maximum:
        print(group_name + " is maximum")

# Number of rows each subgroup is short of the largest one.
# NOTE(review): the (0, 1) subgroup is never oversampled, so the result is balanced
# only when that subgroup is the largest - check the "is maximum" output above.
zero_zero_to_be_incresed = maximum - zero_zero ## where both are 0
one_zero_to_be_incresed = maximum - one_zero ## where class is 1 attribute is 0
one_one_to_be_incresed = maximum - one_one ## where class is 1 attribute is 1
print(zero_zero_to_be_incresed,one_zero_to_be_incresed,one_one_to_be_incresed)

# The protected attribute is handed to generate_samples as strings
# (presumably so it is treated as categorical - confirm in Generate_Samples).
for subgroup in (df_zero_zero, df_one_zero, df_one_one):
    subgroup[protected_attribute] = subgroup[protected_attribute].astype(str)

# Presumably returns the subgroup extended by the requested number of synthetic
# rows - confirm in Generate_Samples.
df_zero_zero = generate_samples(zero_zero_to_be_incresed,df_zero_zero,'MEPS')
df_one_zero = generate_samples(one_zero_to_be_incresed,df_one_zero,'MEPS')
df_one_one = generate_samples(one_one_to_be_incresed,df_one_one,'MEPS')

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames in the same
# order and keeps their original index labels. The untouched (0, 1) subgroup goes last.
df = pd.concat([df_zero_zero, df_one_zero, df_one_one, df_zero_one])

# Back to a numeric protected attribute for model training.
df[protected_attribute] = df[protected_attribute].astype(float)

# Fairness Metrics

In [None]:
from sklearn.linear_model import LogisticRegression
from Measure import measure_final_score,calculate_recall,calculate_far,calculate_precision,calculate_accuracy

# Features are every column except the target; training uses the balanced frame,
# evaluation uses the held-out split.
X_train = df.drop(columns=['Probability'])
y_train = df['Probability']
X_test = dataset_orig_test.drop(columns=['Probability'])
y_test = dataset_orig_test['Probability']

# LSR
clf = LogisticRegression(solver='liblinear', penalty='l2', C=1.0, max_iter=100)
clf.fit(X_train, y_train)

# (printed label, metric key understood by measure_final_score), in output order.
metric_reports = [
    ("aod :" + protected_attribute, 'aod'),
    ("eod :" + protected_attribute, 'eod'),
    ("SPD:", 'SPD'),
    ("DI:", 'DI'),
]
for report_label, metric_key in metric_reports:
    print(report_label,
          measure_final_score(dataset_orig_test, clf, X_train, y_train, X_test, y_test,
                              protected_attribute, metric_key))

# Causal Graph

In [None]:
# DOT description of the assumed causal structure; node U stands for unobserved
# confounders of SEX, race and Probability.
causal_graph = """
digraph {
SEX;
AGE;
race;
MARRY;
REGION;
Probability;
U[label="Unobserved Confounders"];
SEX -> Probability;
race -> REGION; REGION -> Probability;
race -> Probability;
AGE -> Probability;
SEX -> MARRY; MARRY -> Probability;
U->SEX;U->race;U->Probability;
}
"""

## dowhy works for binary inputs only

# Outcome becomes True for every value other than 0; treatment becomes True only for 1.
# `.to_numpy()` assigns by position, exactly like the array returned by np.where.
df['Probability'] = df['Probability'].ne(0).to_numpy()
df[protected_attribute] = df[protected_attribute].eq(1).to_numpy()

In [None]:
# With graph
# The DOT text is flattened to one line (every newline becomes a space) before it is
# passed to DoWhy.
graph_on_one_line = " ".join(causal_graph.split("\n"))
model = CausalModel(
    data=df,
    treatment="race",
    outcome="Probability",
    graph=graph_on_one_line,
)
model.view_model()

In [None]:
# Derive the estimand from the graph; proceed_when_unidentifiable=True is passed so the
# call carries on even though the graph contains an unobserved-confounder node
# (see the DoWhy documentation for the exact semantics).
identified_estimand = model.identify_effect(
    proceed_when_unidentifiable=True,
)
print(identified_estimand)

In [None]:
# Estimate the effect with propensity-score stratification, using the default target units.
causal_estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.propensity_score_stratification",
)
print(causal_estimate)
print(f"Causal Estimate is {causal_estimate.value!s}")

In [None]:
# Causal effect on the control group (ATC)
causal_estimate_atc = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.propensity_score_stratification",
    target_units="atc",
)
# Backward-compatible alias: this result used to be stored under a name ending in
# "_att" although target_units requests "atc"; keep the old name so any later cell
# that refers to it still works.
causal_estimate_att = causal_estimate_atc
print(causal_estimate_atc)
print("Causal Estimate is " + str(causal_estimate_atc.value))

# Add Random Common Cause

In [None]:
# Refutation run with the "random_common_cause" method against the first estimate.
refutel = model.refute_estimate(
    identified_estimand,
    causal_estimate,
    method_name="random_common_cause",
)
print(refutel)

# Replace Treatment with Placebo

In [None]:
# Refutation run with the "placebo_treatment_refuter" method, placebo_type "permute".
refutel = model.refute_estimate(
    identified_estimand,
    causal_estimate,
    method_name="placebo_treatment_refuter",
    placebo_type="permute",
)
print(refutel)

# Remove Random Subset of Data

In [None]:
# Refutation run with the "data_subset_refuter" method against the first estimate.
refutel = model.refute_estimate(
    identified_estimand,
    causal_estimate,
    method_name="data_subset_refuter",
)
print(refutel)