<a href="https://colab.research.google.com/github/careychou/exploration/blob/master/Use_Causal_Model_to_calculate_debias_segment_mean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install econml

In [2]:
import econml
from econml.orf import DMLOrthoForest, DROrthoForest
from econml.dml import CausalForestDML
from econml.sklearn_extensions.linear_model import WeightedLassoCVWrapper, WeightedLasso, WeightedLassoCV

# Helper imports
import numpy as np
from itertools import product
from sklearn.linear_model import Lasso, LassoCV, LogisticRegression, LogisticRegressionCV
import matplotlib.pyplot as plt

%matplotlib inline

  import pandas.util.testing as tm


In [130]:
# Treatment effect function
def exp_te(x):
    """Return the exponential treatment effect ``exp(2 * max(x))``.

    ``x`` is an array-like of feature values (in this notebook, one row of
    the segment matrix); only its largest entry determines the effect.
    """
    largest = np.max(x)
    return np.exp(2 * largest)

# DGP constants
np.random.seed(123)
n = 1000
n_w = 30
support_size = 5
n_x = 1
# Number of treated units: the first n_treated rows form the test arm and the
# remaining rows the control arm. Some later cells index rows 0:500 / 500:1000
# directly, so keep n at 1000 unless those cells are updated as well.
n_treated = n // 2

# Outcome support: the subset of confounder columns that enters Y.
support_Y = np.random.choice(range(n_w), size=support_size, replace=False)
coefs_Y = np.random.uniform(0, 1, size=support_size)


def epsilon_sample(n):
    """Draw ``n`` outcome-noise values uniformly from [-1, 1)."""
    return np.random.uniform(-1, 1, size=n)


# Treatment support.
# NOTE: T below is assigned deterministically, so support_T, coefs_T and
# eta_sample do not influence the data generated in this cell. The coefs_T
# draw is kept anyway because removing it would shift the random stream and
# change W, X and Y.
support_T = support_Y
coefs_T = np.random.uniform(0, 1, size=support_size)


def eta_sample(n):
    """Draw ``n`` treatment-noise values uniformly from [-1, 1)."""
    return np.random.uniform(-1, 1, size=n)


# Generate controls, covariates, treatments and outcomes
# W ~ confounders
W = np.random.normal(0, 1, size=(n, n_w))
# Segment label per unit, drawn from {1, 2} (randint's upper bound is
# exclusive) - the final CATE is estimated per segment.
X = np.random.randint(1, 3, size=(n, n_x))
# Heterogeneous treatment effects: exp(2) for segment 1, exp(4) for segment 2
TE = np.array([exp_te(x_i) for x_i in X])
# Treatment indicator: ones for the test arm followed by zeros for control.
T = np.concatenate([np.ones(n_treated), np.zeros(n - n_treated)])
Y = TE * T + np.dot(W[:, support_Y], coefs_Y) + epsilon_sample(n)

# ORF parameters and test data
subsample_ratio = 0.3
lambda_reg = np.sqrt(np.log(n_w) / (10 * subsample_ratio * n))
# NOTE(review): this grid spans [0, 1) while the segment labels are 1 and 2;
# it does not appear to be used by the cells visible here - confirm before
# relying on it.
X_test = np.array(list(product(np.arange(0, 1, 0.01), repeat=n_x)))

In [131]:
# Raw outcome means: overall per arm first, then per (segment, arm) cell.
# Rows 0-499 are the test arm and rows 500-999 the control arm.
print('test mean: ', Y[:500].mean(), 'control mean: ', Y[500:1000].mean())

in_seg1 = np.squeeze(X == 1)
in_seg2 = np.squeeze(X == 2)
is_test = T == 1
is_control = T == 0
for label, rows in (
    ('seg2 test mean: ', in_seg2 & is_test),
    ('seg2 control mean: ', in_seg2 & is_control),
    ('seg1 test mean: ', in_seg1 & is_test),
    ('seg1 control mean: ', in_seg1 & is_control),
):
    print(label, Y[rows].mean())

test mean:  31.423632545027758 control mean:  0.02517086269843063
seg2 test mean:  54.637915187817285
seg2 control mean:  0.03991672512022674
seg1 test mean:  7.261828161716206
seg1 control mean:  0.012405787766129552


In [142]:
# Causal forest DML estimator. Both nuisance models (outcome and treatment)
# are Lasso regressions sharing the penalty lambda_reg; the forest settings
# and the seed are fixed so repeated fits are comparable.
est = CausalForestDML(
    model_y=Lasso(alpha=lambda_reg),
    model_t=Lasso(alpha=lambda_reg),
    n_estimators=4000,
    max_depth=50,
    min_samples_leaf=5,
    random_state=123,
    verbose=0,
)

In [133]:
# Fit on the current outcomes, with X as the effect modifier and W as controls.
est.fit(Y, T, X=X, W=W)

# treatment effect for segment 1 and 2
for label, segment_point in (
    ('segment1 CATE: ', [[1]]),
    ('segment2 CATE: ', [[2]]),
):
    print(label, est.effect(X=segment_point))

segment1 CATE:  [7.81237808]
segment2 CATE:  [54.16055064]


In [134]:
# hack control
# Zero out the control-arm outcomes so that a subsequent fit yields an effect
# that can be read as the de-biased test-arm mean. The untouched outcomes are
# saved in Y_proxy first.
#
# Guard: this cell overwrites Y in place. If it is executed when Y has already
# been zeroed (e.g. run twice, or after the "hack test" cell), Y_proxy would
# capture the modified outcomes and the originals would be lost silently.
assert np.any(Y[T == 1] != 0) and np.any(Y[T == 0] != 0), (
    'Y has already been modified; re-run the data generation cell first'
)
Y_proxy = Y.copy()
# Select the control arm through T rather than a hard-coded row range.
Y[T == 0] = 0

print('test mean: ', Y[T == 1].mean(), 'control mean: ', Y[T == 0].mean())
print('seg2 test mean: ', Y[np.squeeze(X == 2) & (T == 1)].mean())
print('seg2 control mean: ', Y[np.squeeze(X == 2) & (T == 0)].mean())
print('seg1 test mean: ', Y[np.squeeze(X == 1) & (T == 1)].mean())
print('seg1 control mean: ', Y[np.squeeze(X == 1) & (T == 0)].mean())

test mean:  31.423632545027758 control mean:  0.0
seg2 test mean:  54.637915187817285
seg2 control mean:  0.0
seg1 test mean:  7.261828161716206
seg1 control mean:  0.0


In [135]:
# Refit on the current Y (control-arm outcomes set to 0 when the notebook is
# run top to bottom).
est.fit(Y, T, X=X, W=W)

# treatment effect for segment 1 and 2
# with control outcomes at 0, the effect is read as the de-biased test mean
treatment_seg1_test_mean, treatment_seg2_test_mean = [
    est.effect(X=[[segment_label]]) for segment_label in (1, 2)
]

for label, estimate in (
    ('segment1 debias test mean: ', treatment_seg1_test_mean),
    ('segment2 debias test mean: ', treatment_seg2_test_mean),
):
    print(label, estimate)

segment1 debias test mean:  [7.75939504]
segment2 debias test mean:  [54.2083246]


In [136]:
# hack test
# Start again from the saved outcomes and zero out the test arm (rows 0-499),
# leaving only the control-arm outcomes in Y.
Y = Y_proxy.copy()
Y[:500] = 0

print('test mean: ', Y[:500].mean(), 'control mean: ', Y[500:1000].mean())

in_seg1 = np.squeeze(X == 1)
in_seg2 = np.squeeze(X == 2)
is_test = T == 1
is_control = T == 0
for label, rows in (
    ('seg2 test mean: ', in_seg2 & is_test),
    ('seg2 control mean: ', in_seg2 & is_control),
    ('seg1 test mean: ', in_seg1 & is_test),
    ('seg1 control mean: ', in_seg1 & is_control),
):
    print(label, Y[rows].mean())

test mean:  0.0 control mean:  0.02517086269843063
seg2 test mean:  0.0
seg2 control mean:  0.03991672512022674
seg1 test mean:  0.0
seg1 control mean:  0.012405787766129552


In [138]:
# Refit on the current Y (test-arm outcomes set to 0 when the notebook is run
# top to bottom).
est.fit(Y, T, X=X, W=W)

# treatment effect for segment 1, 2
# the estimated effect is treatment - control, i.e. 0 - control here, so
# negate it to recover the control mean
treatment_seg1_ctl_mean = -est.effect(X=[[1]])
treatment_seg2_ctl_mean = -est.effect(X=[[2]])

for label, estimate in (
    ('segment1 debias control mean: ', treatment_seg1_ctl_mean),
    ('segment2 debias control mean: ', treatment_seg2_ctl_mean),
):
    print(label, estimate)

segment1 debias control mean:  [-0.04945803]
segment2 debias control mean:  [0.03223431]


In [141]:
# Approximate each segment's CATE as (de-biased test mean) minus
# (de-biased control mean); the values should land close to the CATE
# estimated directly from the unmodified outcomes.
for label, test_mean, ctl_mean in (
    ('segment1 CATE: ', treatment_seg1_test_mean, treatment_seg1_ctl_mean),
    ('segment2 CATE: ', treatment_seg2_test_mean, treatment_seg2_ctl_mean),
):
    print(label, test_mean - ctl_mean)

segment1 CATE:  [7.80885307]
segment2 CATE:  [54.17609029]
