In [None]:
# Example - https://github.com/cornelliusyudhawijaya/Churn_Causality_Analysis/blob/main/Causal%20Analysis%20Do%20Why.ipynb
    
import numpy as np
import pandas as pd

from dowhy import CausalModel
import dowhy.datasets 

# Silence warnings for the rest of the notebook so cell output stays readable.
# Note: this is a blanket filter covering every warning category, not one
# restricted to DataConversionWarning (which is imported but not referenced).
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.simplefilter('ignore')

# Prepare Data

In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

# Column used further down as the protected attribute / causal treatment.
protected_attribute = 'sex'

# Load the CSV, discard the columns that are not used in the analysis,
# and keep only rows without any missing value.
data = (
    pd.read_csv('../../../data/Titanic.csv')
    .drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])
    .dropna()
)

# Encode sex numerically: 'male' -> 0, any other value -> 1.
data['sex'] = np.where(data['sex'].eq('male'), 0, 1)

# Replace the Embarked labels with integer codes.
le = preprocessing.LabelEncoder()
embarked_codes = le.fit_transform(data['Embarked'])
data['Embarked'] = embarked_codes

# Rescale every column to the [0, 1] range. The scaler returns a bare array,
# so the frame is rebuilt with the original column names (and a fresh index).
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data)
data = pd.DataFrame(scaled_values, columns=data.columns)
print(data.head())

In [None]:
# The dataset is very small, so it is stacked on top of itself to enlarge it.
# NOTE(review): this cell is not idempotent -- every re-run multiplies the row
# count again. The copies are exact duplicates, so the same row can end up on
# both sides of the train/test split performed later.
n_repeats = 5
data = pd.concat([data for _ in range(n_repeats)], ignore_index=True)

# Default Fairness Metrics

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from Measure import measure_final_score,calculate_recall,calculate_far,calculate_precision,calculate_accuracy

print(data.shape)
protected_attribute = 'sex'

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
data1, dataset_orig_test = train_test_split(data, test_size=0.2, random_state=0)

# Separate the 'Probability' label from the remaining feature columns.
X_train = data1.drop(columns='Probability')
y_train = data1['Probability']
X_test = dataset_orig_test.drop(columns='Probability')
y_test = dataset_orig_test['Probability']

# L2-penalised logistic regression fitted with the liblinear solver.
clf = LogisticRegression(C=1.0, penalty='l2', solver='liblinear', max_iter=100)
clf.fit(X_train, y_train)

# (printed label, metric key handed to measure_final_score), in reporting order.
# NOTE(review): earlier drafts of this cell also had disabled calls for the keys
# 'recall', 'far', 'precision' and 'F1' -- presumably also supported by Measure; confirm.
reported_metrics = [
    ("accuracy :", 'accuracy'),
    ("aod :" + protected_attribute, 'aod'),
    ("eod :" + protected_attribute, 'eod'),
    ("SPD:", 'SPD'),
    ("DI:", 'DI'),
]
for metric_label, metric_key in reported_metrics:
    metric_value = measure_final_score(
        dataset_orig_test, clf, X_train, y_train, X_test, y_test,
        protected_attribute, metric_key,
    )
    print(metric_label, metric_value)

# Graph

In [None]:
# Causal graph in DOT syntax; it is passed (with newlines replaced by spaces)
# to CausalModel in a later cell.
# Edges encoded below:
#   - Age and sex each point directly at Probability,
#   - Age and sex each point at Fare, which in turn points at Probability,
#   - U (labelled "Unobserved Confounders") points at both sex and Probability.
causal_graph = """
digraph {
Age;
sex;
Fare;
Probability;
U[label="Unobserved Confounders"];

Age -> Probability;
sex -> Probability;
Age -> Fare; sex -> Fare; Fare -> Probability;
U->sex;U->Probability;
}
"""

## dowhy works for binary inputs only, so both columns are turned into booleans:
## the outcome is True for every non-zero value, the treatment is True only when it equals 1.
outcome_is_nonzero = data1['Probability'] != 0
data1['Probability'] = outcome_is_nonzero
treatment_is_one = data1[protected_attribute] == 1
data1[protected_attribute] = treatment_is_one

In [None]:
# With graph: build the causal model, estimate the effect, then run three refuters,
# printing the cumulative elapsed time after each stage.
import time

start = time.time()
print(data1.shape)

# 'sex' is the treatment and 'Probability' the outcome; the DOT graph is
# flattened onto a single line before being handed to the model.
model = CausalModel(
    data=data1,
    treatment="sex",
    outcome="Probability",
    graph=causal_graph.replace("\n", " "),
)

# proceed_when_unidentifiable=True lets identification continue without prompting.
identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)
print(identified_estimand)

causal_estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.propensity_score_stratification",
)
print(causal_estimate)
print(f"Causal Estimate is {causal_estimate.value}")

separator = "==" * 20
print(time.time() - start, separator)

# Refuters in execution order, each with the extra keyword arguments it needs.
refuter_runs = [
    ("random_common_cause", {}),
    ("data_subset_refuter", {}),
    ("placebo_treatment_refuter", {"placebo_type": "permute"}),
]
for refuter_name, refuter_options in refuter_runs:
    refutel = model.refute_estimate(
        identified_estimand,
        causal_estimate,
        method_name=refuter_name,
        **refuter_options,
    )
    print(refutel)
    print(time.time() - start, separator)


In [None]:
# Causal effect on the control group (ATC), using the same stratification estimator.
# NOTE(review): despite the `_att` suffix, this variable holds the estimate requested
# with target_units="atc"; the name is kept in case other cells refer to it.
atc_settings = {
    "method_name": "backdoor.propensity_score_stratification",
    "target_units": "atc",
}
causal_estimate_att = model.estimate_effect(identified_estimand, **atc_settings)
print(causal_estimate_att)
print(f"Causal Estimate is {causal_estimate_att.value}")

# Add Random Common Cause

In [None]:
# Re-run the "random_common_cause" refuter against the estimate computed earlier.
refutel = model.refute_estimate(
    identified_estimand,
    causal_estimate,
    method_name="random_common_cause",
)
print(refutel)

# Replace Treatment with Placebo

In [None]:
# Re-run the placebo-treatment refuter (placebo_type="permute") against the earlier estimate.
placebo_options = {"placebo_type": "permute"}
refutel = model.refute_estimate(
    identified_estimand,
    causal_estimate,
    method_name="placebo_treatment_refuter",
    **placebo_options,
)
print(refutel)

# Remove Random Subset of Data

In [None]:
# Re-run the "data_subset_refuter" against the estimate computed earlier.
refutel = model.refute_estimate(
    identified_estimand,
    causal_estimate,
    method_name="data_subset_refuter",
)
print(refutel)