In [None]:
pip install dowhy

In [None]:
pip install econml

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dowhy
from dowhy import CausalModel
import dowhy.datasets
import econml
import plotly.express as px
import plotly.figure_factory as ff
import seaborn as sns
import requests
import io
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LassoCV
from sklearn.ensemble import GradientBoostingRegressor

# Avoiding unnecessary log messages and warnings
import logging
logging.getLogger("dowhy").setLevel(logging.WARNING)
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

#### To analyse the importance of features, feature engineering, and the counterfactual effect of having additional data and features, we compare two datasets: one with minimal relevant features and another with additional features

#### Dataset 1:

A 1980 census extract, also used in Angrist and Krueger (1991), with 329,509 observations on the following variables:

1) log weekly wage

2) quarter of birth (1-4)

3) year of birth (30-39)

4) place of birth (1980 census state codes)

5) education (highest grade completed)

Initial Exploratory Data Analysis on the data

In [None]:
# Split on any run of whitespace. A literal nine-space separator is treated by
# pandas as a regular expression (forcing the slower python engine) and only
# matches rows padded with exactly nine spaces; a row padded differently would
# be mis-split and then silently dropped by on_bad_lines='skip'.
# The file has no header row, so columns are numbered 0..4.
df = pd.read_csv("/content/sample_data/asciiqob.txt", sep=r"\s+", on_bad_lines='skip', header=None)

In [None]:
# Distinct values in column 1 (renamed to 'Education' further down)
(df[1].unique())

In [None]:
# Distinct values in column 2 (renamed to 'Year of Birth' further down)
(df[2].unique())

In [None]:
# Distinct values in column 3 (renamed to 'Quarter of Birth' further down)
(df[3].unique())

In [None]:
# Distinct values in column 4 (renamed to 'Place of Birth' further down)
(df[4].unique())

In [None]:
# Replace the positional column labels with descriptive names, modifying the
# existing frame in place.
df.rename(
    columns={
        0: 'log weekly wage',
        1: 'Education',
        2: 'Year of Birth',
        3: 'Quarter of Birth',
        4: 'Place of Birth',
    },
    inplace=True,
)

In [None]:
# Number of observations for each distinct 'Education' value
df['Education'].value_counts()

In [None]:
# Distribution plot of the 'Education' column
fig = ff.create_distplot(
    hist_data=[df['Education']],
    group_labels=['Education'],
)
fig.show()

In [None]:
# Number of observations for each distinct 'Place of Birth' value
df['Place of Birth'].value_counts()

In [None]:
# Distribution plot of the 'Place of Birth' column
fig = ff.create_distplot(
    hist_data=[df['Place of Birth']],
    group_labels=['Place of Birth'],
)
fig.show()

In [None]:
# Number of observations for each distinct 'Quarter of Birth' value
df['Quarter of Birth'].value_counts()

In [None]:
# Distribution plot of the 'Quarter of Birth' column
fig = ff.create_distplot(
    hist_data=[df['Quarter of Birth']],
    group_labels=['Quarter of Birth'],
)
fig.show()

In [None]:
# Number of observations for each distinct 'Year of Birth' value
df['Year of Birth'].value_counts()

In [None]:
# Distribution plot of the 'Year of Birth' column
fig = ff.create_distplot(
    hist_data=[df['Year of Birth']],
    group_labels=['Year of Birth'],
)
fig.show()

In [None]:
# Interactive heatmap of the pairwise column correlations, with each cell
# annotated by its value.
fig = px.imshow(
    df.corr(),
    text_auto=True,
)
fig.show()

In [None]:
# Distribution plot of the outcome variable, 'log weekly wage'
fig = ff.create_distplot(
    hist_data=[df['log weekly wage']],
    group_labels=['log weekly wage'],
)
fig.show()

In [None]:
# Pairwise correlations between every pair of columns
corr = df.corr()

# Boolean mask covering the upper triangle (diagonal included), so that each
# pair of variables is drawn only once in the lower triangle.
mask = np.triu(np.ones(corr.shape, dtype=bool))

plt.figure(figsize=(10, 6))
sns.heatmap(
    corr,
    mask=mask,
    center=0,
    annot=True,
    fmt='.2f',
    square=True,
    cmap="YlGnBu",
)

plt.show()

As seen in the correlation matrix, the only relevant signal/predictor of weekly wage is the education variable; the other variables show no relevant signal.

##### I. Creating the causal model

In [None]:
# Columns passed to the causal model below as common causes of treatment and outcome
cols = ['Place of Birth','Year of Birth']

In [None]:
# Causal model for the census extract: education is the treatment, log weekly
# wage the outcome, `cols` are the common causes and quarter of birth is the
# instrument.
model = CausalModel(
    data=df,
    treatment=["Education"],
    outcome=["log weekly wage"],
    common_causes=cols,
    instruments=["Quarter of Birth"],
)

In [None]:
from IPython.display import Image, display

# Render the causal graph, then show the image file that DoWhy writes.
# NOTE(review): relies on view_model's default output name "causal_model.png",
# as in the DoWhy tutorials. Confirm for the installed DoWhy version.
model.view_model(layout="dot")
display(Image(filename="causal_model.png"))

##### II. Identify causal effect and return target estimands

In [None]:
# Derive the target estimand from the causal graph. proceed_when_unidentifiable=True
# lets identification continue without prompting when the effect may not be
# identifiable (e.g. potential unobserved confounding).
identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)
print(identified_estimand)

##### III. Estimate the target estimand using a statistical method.
We use the backdoor estimand with EconML's double machine learning (DML) estimator

In [None]:
# Estimate the effect through EconML's DML estimator on the backdoor estimand:
# gradient boosting for the outcome and treatment models, and a cross-validated
# Lasso without intercept as the final model. Confidence intervals are disabled.
dml_estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.econml.dml.DML",
    method_params={
        "init_params": {
            "model_y": GradientBoostingRegressor(),
            "model_t": GradientBoostingRegressor(),
            "model_final": LassoCV(fit_intercept=False),
        },
        "fit_params": {},
    },
    confidence_intervals=False,
)
print(dml_estimate)

##### IV. For our refutation analysis, we use three refuters, namely
1) placebo_treatment_refuter

2) random_common_cause

3) data_subset_refuter

##### Runs for each refuter technique for 5 simulations

In [None]:
# Placebo-treatment refuter on the DML estimate, 5 simulations
refute_results = model.refute_estimate(
    identified_estimand,
    dml_estimate,
    method_name="placebo_treatment_refuter",
    num_simulations=5,
)
print(refute_results)

In [None]:
# Random-common-cause refuter on the DML estimate, 5 simulations
refute_results = model.refute_estimate(
    identified_estimand,
    dml_estimate,
    method_name="random_common_cause",
    num_simulations=5,
)
print(refute_results)

In [None]:
# Data-subset refuter on the DML estimate, 5 simulations
refute_results = model.refute_estimate(
    identified_estimand,
    dml_estimate,
    method_name="data_subset_refuter",
    num_simulations=5,
)
print(refute_results)

##### Refuter experiments for 10 simulations

In [None]:
# Placebo-treatment refuter on the DML estimate, 10 simulations
refute_results = model.refute_estimate(
    identified_estimand,
    dml_estimate,
    method_name="placebo_treatment_refuter",
    num_simulations=10,
)
print(refute_results)

In [None]:
# Random-common-cause refuter on the DML estimate, 10 simulations
refute_results = model.refute_estimate(
    identified_estimand,
    dml_estimate,
    method_name="random_common_cause",
    num_simulations=10,
)
print(refute_results)

In [None]:
# Data-subset refuter on the DML estimate, 10 simulations
refute_results = model.refute_estimate(
    identified_estimand,
    dml_estimate,
    method_name="data_subset_refuter",
    num_simulations=10,
)
print(refute_results)

#### Dataset 2: Wage data from UCI's ML repository

Contains the following features

1)year

2)age

3)sex

4)maritl

5)race

6)education

7)region

8)jobclass

9)health

10)health_ins

11)logwage

12)wage

Initial Exploratory Data Analysis from the data

In [None]:
# Raw (not HTML) GitHub URL of the Wage CSV file
url = "https://raw.githubusercontent.com/selva86/datasets/master/Wage.csv"

# Bound the wait on a stalled connection and fail loudly on a non-2xx reply,
# so an HTTP error page is never parsed as if it were the CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()
download = response.content

# Reading the downloaded content and turning it into a pandas dataframe

df = pd.read_csv(io.StringIO(download.decode('utf-8')))

# Printing out the first 5 rows of the dataframe

In [None]:
# Show only the first 5 rows, as the preceding comment describes
df.head()

In [None]:
# Distinct values in the 'sex' column
df['sex'].unique()

In [None]:
# Number of rows for each 'sex' value
df['sex'].value_counts()

In [None]:
# Distinct values in the 'maritl' column
df['maritl'].unique()

In [None]:
# Number of rows for each 'maritl' value
df['maritl'].value_counts()

In [None]:
# Distinct values in the 'race' column
df['race'].unique()

In [None]:
# Number of rows for each 'race' value
df['race'].value_counts()

In [None]:
# Distinct values in the 'education' column (the treatment variable below)
df['education'].unique()

In [None]:
# Number of rows for each 'education' value
df['education'].value_counts()

In [None]:
# Distinct values in the 'region' column
df['region'].unique()

In [None]:
# Number of rows for each 'region' value
df['region'].value_counts()

In [None]:
# Distinct values in the 'jobclass' column
df['jobclass'].unique()

In [None]:
# Number of rows for each 'jobclass' value
df['jobclass'].value_counts()

In [None]:
# Distinct values in the 'health' column
df['health'].unique()

In [None]:
# Number of rows for each 'health' value
df['health'].value_counts()

In [None]:
# Distinct values in the 'health_ins' column
df['health_ins'].unique()

In [None]:
# Work on an independent copy: plain assignment would only alias `df`, so the
# in-place encoding applied to `data` afterwards would also rewrite `df`.
data = df.copy()

Processing and encoding the data so the model can work with categorical values

In [None]:
# Map each categorical label to the integer prefix embedded in the label.
# NOTE(review): the grouping of labels by column below is inferred from the
# label text; verify against the unique() outputs of the raw frame.
label_codes = {
    # sex
    '1. Male': 1,
    # marital status
    '1. Never Married': 1,
    '2. Married': 2,
    '3. Widowed': 3,
    '4. Divorced': 4,
    '5. Separated': 5,
    # race
    '1. White': 1,
    '2. Black': 2,
    '3. Asian': 3,
    '4. Other': 4,
    # education
    '1. < HS Grad': 1,
    '2. HS Grad': 2,
    '3. Some College': 3,
    '4. College Grad': 4,
    '5. Advanced Degree': 5,
    # region
    '2. Middle Atlantic': 2,
    # job class
    '1. Industrial': 1,
    '2. Information': 2,
    # health
    '1. <=Good': 1,
    '2. >=Very Good': 2,
    # health insurance
    '1. Yes': 1,
    '2. No': 2,
}

# Rebind instead of mutating in place so the cell can be re-run safely, and
# call infer_objects() so the recoded columns get numeric dtypes explicitly
# rather than through replace()'s deprecated silent downcasting.
data = data.replace(label_codes).infer_objects()

In [None]:
# Display the frame after the categorical labels have been replaced with integer codes
data

In [None]:
# Pairwise correlations between every pair of columns of the encoded frame
corr = data.corr()

# Boolean mask covering the upper triangle (diagonal included), so that each
# pair of variables is drawn only once in the lower triangle.
mask = np.triu(np.ones(corr.shape, dtype=bool))

plt.figure(figsize=(10, 6))
sns.heatmap(
    corr,
    mask=mask,
    center=0,
    annot=True,
    fmt='.2f',
    square=True,
    cmap="YlGnBu",
)

plt.show()

As we see in the figure above, in contrast to the first dataset, we have additional signals/predictors in this dataset, which will be beneficial for the modelling efforts

In [None]:
# Column labels available for building the causal model
data.columns

In [None]:
# Columns passed to the causal model below as common causes of treatment and outcome
cols = ['jobclass','year','age','sex','maritl','race']

##### I. Creating the causal model

In [None]:
# Causal model for the Wage data: education is the treatment and log wage the
# outcome, with `cols` as the common causes.
# 'year' is deliberately not listed as an instrument: it is already declared a
# common cause in `cols`, and a variable with a direct edge to the outcome
# cannot also serve as an instrument.
model = CausalModel(
    data=data,
    treatment=['education'],
    outcome=['logwage'],
    common_causes=cols,
    instruments=['region', 'health', 'health_ins'])

In [None]:
from IPython.display import Image, display

# Render the causal graph, then show the image file that DoWhy writes.
# NOTE(review): relies on view_model's default output name "causal_model.png",
# as in the DoWhy tutorials. Confirm for the installed DoWhy version.
model.view_model(layout="dot")
display(Image(filename="causal_model.png"))

##### II. Identify causal effect and return target estimands

In [None]:
# Derive the target estimand from the causal graph. proceed_when_unidentifiable=True
# lets identification continue without prompting when the effect may not be
# identifiable (e.g. potential unobserved confounding).
identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)
print(identified_estimand)

##### III. Estimate the target estimand using a statistical method.
We use the backdoor linear-regression estimator to obtain the estimate with 95% bootstrap confidence intervals

In [None]:
# Linear-regression estimate of the backdoor estimand, without conditional
# estimates and with confidence intervals requested via "bootstrap".
estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.linear_regression",
    method_params={"need_conditional_estimates": False},
    confidence_intervals="bootstrap",
)
print(estimate)

##### IV. Refutation analysis

In [None]:
# Refute the estimate produced by THIS model (the linear-regression `estimate`
# for the Wage data). Passing `dml_estimate` would test the Dataset 1 estimate
# against a model built on different data and variables.
refute_results = model.refute_estimate(
    identified_estimand,
    estimate,
    method_name="placebo_treatment_refuter",
    num_simulations=5,
)
print(refute_results)

#### Conclusion:

Null hypothesis from Dataset 1: Education can be a sole indicator of weekly wages

Since our p-value is greater than 0.05, we fail to reject the null hypothesis

Null hypothesis from Dataset 2: Education along with factors such as age, job class, health are indicators of wage

Since the obtained p-value is greater than 0.05, we fail to reject the null hypothesis

Hence, we can conclude that having additional features and data can help provide more context into the causal analysis results