In [1]:
!pip install pgmpy
!pip install dowhy
!apt-get install python3-dev graphviz libgraphviz-dev pkg-config
!pip install pygraphviz

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Reading package lists... Done
Building dependency tree       
Reading state information... Done
pkg-config is already the newest version (0.29.1-0ubuntu2).
graphviz is already the newest version (2.40.1-2).
libgraphviz-dev is already the newest version (2.40.1-2).
python3-dev is already the newest version (3.6.7-1~18.04).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 20 not upgraded.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!if [ ! -e social_media_advertisements.csv ] ; then wget social_media_advertisements.csv https://raw.githubusercontent.com/altdeep/causalML/1ecf8b12a170a3b3e44553070a7f44c076fffb93/datasets/social_media_advertisements.csv; fi

In [3]:
import pandas as pd
import numpy as np

from pgmpy.base.DAG import DAG
from pgmpy.inference.CausalInference import CausalInference
from pgmpy.estimators import ExpectationMaximization
from pgmpy.models import BayesianNetwork
from pgmpy.sampling import BayesianModelSampling
from pgmpy.factors.discrete.CPD import TabularCPD
from pgmpy.inference import VariableElimination as Inference

import warnings
warnings.filterwarnings('ignore')

from dowhy import CausalModel
from dowhy import datasets
from dowhy.causal_estimators.linear_regression_estimator import LinearRegressionEstimator
import dowhy

import graphviz
import pygraphviz

import time

# Task 0:
Creating the model and loading the dataset. 

In [4]:
model = BayesianNetwork([
      ('U','X'),
      ('X','Z'),
      ('Z','Y'),
      ('U','Y'),
    ], latents={'U'})

In [5]:
data = pd.read_csv('social_media_advertisements.csv')

# Helpe function for turning string values to a binary indicator (0 or 1)
def alias(entry):
    return {'no_ad': 0, 'ad': 1, 'no_purchase': 0, 'no_social': 0, 'social': 1, 'purchase': 1}[entry]

In [6]:
data.head()

Unnamed: 0,Z,X,Y
0,no_ad,no_social,purchase
1,no_ad,no_social,purchase
2,ad,social,no_purchase
3,ad,social,no_purchase
4,no_ad,no_social,purchase


# Task 1:
Train the parameters of the new model using the data. Use EM to learn the causal Markov kernel for U.

In [7]:
start_time = time.time()
model.fit(data, estimator=ExpectationMaximization)
end_time = time.time()
model_fit_time = end_time - start_time
print('Time (s) to fit model: ', model_fit_time)
for cpd in model.get_cpds():
  print(cpd)

  0%|          | 0/100 [00:00<?, ?it/s]

Time (s) to fit model:  7.580073595046997
+------+----------+
| U(0) | 0.499999 |
+------+----------+
| U(1) | 0.500001 |
+------+----------+
+--------------+---------------------+--------------------+
| U            | U(0)                | U(1)               |
+--------------+---------------------+--------------------+
| X(no_social) | 0.49999947936084643 | 0.5000005206376316 |
+--------------+---------------------+--------------------+
| X(social)    | 0.5000005206391536  | 0.4999994793623683 |
+--------------+---------------------+--------------------+
+----------+--------------+-----------+
| X        | X(no_social) | X(social) |
+----------+--------------+-----------+
| Z(ad)    | 0.05         | 0.95      |
+----------+--------------+-----------+
| Z(no_ad) | 0.95         | 0.05      |
+----------+--------------+-----------+
+----------------+-----+---------------------+
| U              | ... | U(1)                |
+----------------+-----+---------------------+
| Z              

# Task 2:
Use graph mutilation and Monte Carlo estimation to estimate the causal effect of X on Y

In [8]:
model_do_X0 = model.do("X")
model_do_X1 = model.do("X")

In [9]:
# Monte Carlo Estimate

# causal effect = P(Y|do(X=1)) - P(Y|do(X=0))

start_time = time.time()

generator_0 = BayesianModelSampling(model_do_X0)
generator_1 = BayesianModelSampling(model_do_X1)

n_iterations = 100
n_samples = 1000
deltas = np.zeros(n_iterations)
for i in range(n_iterations):
  generated_samples_0 = generator_0.forward_sample(size=n_samples, show_progress=False)
  # Sample Estimate for P(Y|do(X=0))
  pY_doX0 = generated_samples_0['Y'].apply(alias).mean()

  generated_samples_1 = generator_1.forward_sample(size=n_samples, show_progress=False)
  # Sample Estimate for P(Y|do(X=1))
  pY_doX1 = generated_samples_1['Y'].apply(alias).mean()

  # print(pY_doX0, pY_doX1)

  # print(pY_doX1, pY_doX0)
  deltas[i] = pY_doX1 - pY_doX0

causal_estimate = deltas.mean()
confidence_interval = np.quantile(deltas, [.25, .975])
end_time = time.time()
monte_carlo_time = end_time - start_time
print('Monte Carlo elapsed time (s): ', monte_carlo_time)
print('Causal Estaimate: ', causal_estimate)
print('Confidence Interval: ', confidence_interval)


Monte Carlo elapsed time (s):  3.9794533252716064
Causal Estaimate:  0.0036000000000000025
Confidence Interval:  [-0.01125  0.04505]


# Task 3: 
Calculate the front-door estimator in DoWhy

In [None]:
temp_string = 'digraph { U -> X U -> Y X -> Z Z -> Y }'

numerical_data = pd.DataFrame()
numerical_data['X'] = data['X'].apply(alias)
numerical_data['Y'] = data['Y'].apply(alias)
numerical_data['Z'] = data['Z'].apply(alias)

dowhy_model = CausalModel(
        data=numerical_data,
        treatment='X',
        outcome='Y',
        graph=temp_string
    )

dowhy_model.view_model()

from IPython.display import Image, display
display(Image(filename="causal_model.png"))

In [None]:
identified_estimand = dowhy_model.identify_effect(proceed_when_unidentifiable=True)
print(identified_estimand)

In [None]:
start_time = time.time()
estimate = dowhy_model.estimate_effect(identified_estimand, method_name='frontdoor.two_stage_regression',
                                       confidence_intervals=True, 
                                       method_params = {
                                            'first_stage_model': LinearRegressionEstimator,
                                            'second_stage_model': LinearRegressionEstimator
                                        })
end_time = time.time()
dowhy_estimate_time = end_time - start_time
print(estimate)

In [13]:
print('DoWhy Estimate Elapsed Time (s): ', dowhy_estimate_time)
print('Causal Estimaet: ', estimate.value)
print('Confidence Interval: ', estimate.get_confidence_intervals())

DoWhy Estimate Elapsed Time (s):  12.672666549682617
Causal Estimaet:  0.044999999999999166
Confidence Interval:  (-0.013808282808236524, 0.10212563957012252)


# Task 4:
Calculate the front-door estimate by hand

In [14]:
# FORMULA
# P(Y|do(X)) = Sum(Z, Sum(X', P(Y|Z, X')*P(X')*P(Z|X))
def frontdoor_calc(dataset):
  pY_doX1 = 0
  pY_doX0 = 0
  for x_prime in dataset['X'].unique():
    for z in data['Z'].unique():
      #P(Y|Z, X')
      # Random Sampling Means sometimes the groups dont exist
      try:
        Y0_xz_count = dataset.groupby(['X', 'Y', 'Z']).get_group((x_prime, 'no_purchase', z)).count()[0]
      except KeyError:
        Y0_xz_count = 0
      try:
        Y1_xz_count = dataset.groupby(['X', 'Y', 'Z']).get_group((x_prime, 'purchase', z)).count()[0]
      except KeyError:
        Y1_xz_count = 0

      py_ZX = Y1_xz_count / (Y0_xz_count + Y1_xz_count)

      # P(X') 
      x_prime_count = dataset.groupby(['X']).get_group(x_prime).count()[0]

      pX_prime = x_prime_count / (len(dataset['X']))

      # P(Z|X)
      z0_X1_count = dataset.groupby(['X', 'Z']).get_group(('social', 'no_ad')).count()[0]
      z0_X0_count = dataset.groupby(['X', 'Z']).get_group(('no_social', 'no_ad')).count()[0]
      z1_X1_count = dataset.groupby(['X', 'Z']).get_group(('social', 'ad')).count()[0]
      z1_X0_count = dataset.groupby(['X', 'Z']).get_group(('no_social', 'ad')).count()[0]

      if z == 'ad':
        pZ_X1 = z1_X1_count / (z0_X1_count + z1_X1_count)
        pZ_X0 = z1_X0_count / (z0_X0_count + z1_X0_count)
      else: 
        pZ_X1 = z0_X1_count / (z0_X1_count + z1_X1_count)
        pZ_X0 = z0_X0_count / (z0_X0_count + z1_X0_count)

      # Multiply
      pY_doX1 += (py_ZX * pX_prime * pZ_X1)
      pY_doX0 += (py_ZX * pX_prime * pZ_X0)

  return (pY_doX1 - pY_doX0)

In [15]:
start_time = time.time()
print('Confidence Interval: ', frontdoor_calc(data))
end_time = time.time()
print('Manual Elapsed Time (s): ', end_time - start_time)

Confidence Interval:  0.04500000000000004
Manual Elapsed Time (s):  0.08831453323364258


Using Bootstrapping to determine aconfidence interval, and more accurate estiamte

In [16]:
bootstrap_iterations = 100
causal_estimates = np.zeros(bootstrap_iterations)

start_time = time.time()
for n in range(bootstrap_iterations):
  sampled_data = data.sample(len(data), replace=True)
  causal_estimates[n] = frontdoor_calc(sampled_data)
end_time = time.time()

print('Manual Bootstrap Elapsed Time (s): ', end_time - start_time)
print('Causal Estimate: ', causal_estimates.mean())
print('Confidence Interval: ', np.quantile(causal_estimates, [.25, .975]))

Manual Bootstrap Elapsed Time (s):  4.852885723114014
Causal Estimate:  0.04451406322918086
Confidence Interval:  [0.01636701 0.11112524]


# Task 5
Compare your Monte Carlo estimate to the DoWhy estimate and you hand-calculated estimate.

Total Results: <br>

| Method | Causal Estimate | Confidence Interval | Elapsed Time (s) |
| --- | --- | --- | --- |
| Monte Carlo | 0.0036 |  [-0.01125, 0.04505] | 3.9795 |
| DoWhy |  0.0450 | [-0.0138, 0.1021] | 12.6727 |
| Manual | 0.0445 | [0.0164,  0.1111] | 4.8529 |

## Task 5.1
Are the estimates the close in value?

Yes the estimates are close in value. In fact, the DoWhy estimate and the non-bootstrapped Manual version give the exact same results. The Monte Carlo estimate was farthest from the other two. 

## Task 5.2
How do the DoWhy and Monte Carlo estimates compare in terms of confidence interval?

The Monte Carlo estimate had a confidence interval slightly shifted down from the other two. This makes sense as the estimation method has the most random sampling. Furthermore, the confidence intervals for all estimates contain the other estimates. 

## Task 5.3
How do they compare in terms of computation time?  For the Monte Carlo estimate, separately calculate the time it takes to train the model and the time it takes to run the Monte Carlo estimation procedure.  This is because training only has to be done once, then you can run different inference queries on the trained model.

The fastest algorithm was Monte Carlo followed closely by the bootstrapped manual version. The manual version without bootstrapping was by far the fastest as the calculations are already known, as opposed to regressing to the value. 