In [15]:
import pandas
from econml.metalearners import XLearner
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
import numpy as np
from tqdm import tqdm

In [17]:
# Read the training table and echo it so the available columns can be checked.
df = pandas.read_csv('data/train_new.csv')
print(df)

# Outcome columns; the estimators below produce one effect per outcome.
outcome_columns = ['booking_bool', 'click_bool']

# Covariates used as X by every estimator in this notebook.
feature_columns = [
    'visitor_hist_starrating', 'visitor_hist_adr_usd',
    'prop_starrating', 'prop_review_score', 'prop_brand_bool',
    'prop_location_score1', 'prop_log_historical_price', 'position', 'price_usd',
    'promotion_flag', 'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count',
    'srch_children_count', 'srch_room_count', 'srch_saturday_night_bool',
    'orig_destination_distance',
]

Y = df[outcome_columns]
X = df[feature_columns]

# Treatment indicator: 1 where random_bool is 0 (the "sorted" case) and
# 0 where random_bool is 1 (the "random" case).
T = 1 - df['random_bool']

         Unnamed: 0  click_bool  booking_bool  random_bool  \
0                 0           0             0            1   
1                 1           0             0            1   
2                 2           0             0            1   
3                 3           0             0            1   
4                 4           0             0            1   
...             ...         ...           ...          ...   
9917525     9917525           0             0            0   
9917526     9917526           1             1            0   
9917527     9917527           0             0            0   
9917528     9917528           0             0            0   
9917529     9917529           0             0            0   

         visitor_hist_starrating  visitor_hist_adr_usd  prop_starrating  \
0                       3.374634            176.588512                3   
1                       3.374634            176.588512                4   
2                       3.3746

In [18]:
from sklearn.model_selection import train_test_split

# Pack covariates, the two outcome columns and the treatment into one array so
# that a single split keeps every row aligned. Missing covariates become 0.
X_np = X.to_numpy(na_value=0)
Y_np = Y.to_numpy()
T_np = T.to_numpy()
data = np.concatenate((X_np, Y_np, T_np[..., None]), axis=-1)
n_data = data.shape[0]

# Only the leading 1% of the rows is used, split 70/30 into train and test.
# NOTE(review): this is a head slice, not a random sample; if the CSV is
# ordered (e.g. by random_bool) the subset may be unrepresentative. Confirm.
# random_state pins the shuffle so the partition, and therefore every estimate
# computed from it, is the same on each run.
train_data, test_data = train_test_split(
    data[:int(n_data * 0.01)], train_size=0.7, random_state=0)

# Column layout of `data`: [covariates..., outcome 0, outcome 1, treatment].
X_train = train_data[:, :-3]
Y_train = train_data[:, -3:-1]  # N x 2
T_train = train_data[:, -1]

X_test = test_data[:, :-3]
Y_test = test_data[:, -3:-1]
T_test = test_data[:, -1]

In [33]:
# XLearner: fit one independent estimator per outcome column of Y_train.
xlearners = []
for outcome in tqdm(Y_train.T):  # rows of the transpose are the outcome columns
    xlearner = XLearner(
        models=GradientBoostingRegressor(),
        propensity_model=GradientBoostingClassifier(),
        cate_models=GradientBoostingRegressor(),
    )
    xlearner.fit(outcome, T_train, X=X_train)
    xlearners.append(xlearner)

100%|██████████| 2/2 [01:06<00:00, 33.24s/it]


In [34]:
# Evaluate each fitted X-learner on the held-out covariates:
# per-sample effects (HTE) and their average (ATE), one entry per outcome.
HTEs_xlearner, ATEs_xlearner = [], []
for fitted_xlearner in tqdm(xlearners):
    HTEs_xlearner.append(fitted_xlearner.effect(X_test))
    ATEs_xlearner.append(fitted_xlearner.ate(X_test))

100%|██████████| 2/2 [00:00<00:00,  3.75it/s]


In [35]:
# Show both result lists: per-sample effects and average effects, one entry per outcome.
HTEs_xlearner, ATEs_xlearner

([array([0.01859478, 0.01362686, 0.00848823, ..., 0.00598069, 0.01079034,
         0.01703112]),
  array([-0.00800371, -0.01388718, -0.00694866, ..., -0.01778324,
         -0.01865235, -0.01219883])],
 [0.0312565069827455, -0.003927284023636])

In [11]:
# Disabled experiment: everything below sits inside a string literal, so none of
# it is executed. It sketches fitting `xlearner` with bootstrap inference and
# reading 95% effect intervals on X_test.
'''
# Fit with bootstrap confidence interval construction enabled
xlearner.fit(Y_train, T_train, X=X_train, inference='bootstrap')
TE_xlearner = xlearner.effect(X_test)
lb, ub = xlearner.effect_interval(X_test, alpha=0.05) # Bootstrap CIs
TE_xlearner, lb, ub
'''

(array([-0.0048295 , -0.01718111, -0.02596979, ..., -0.04902052,
        -0.02031294, -0.02661698]),
 array([-0.0126477 , -0.02419306, -0.03425615, ..., -0.05796047,
        -0.02388705, -0.0405591 ]),
 array([ 0.00516105, -0.00903868, -0.01623263, ..., -0.03808922,
        -0.01357785, -0.02108641]))

In [28]:
# TLearner: a single estimator fitted on both outcome columns at once,
# with ordinary least squares as the outcome model.
from econml.metalearners import TLearner
from sklearn.linear_model import LinearRegression

# fit() hands back the estimator itself, so construction and fitting can be chained.
tlearner = TLearner(models=LinearRegression()).fit(Y_train, T_train, X=X_train)
tlearner

<econml.metalearners._metalearners.TLearner at 0x7fba35b8b070>

In [36]:
# Per-sample effects and their average on the held-out covariates.
HTE_tlearner, ATE_tlearner = tlearner.effect(X_test), tlearner.ate(X_test)
(HTE_tlearner, ATE_tlearner)

(array([[ 0.02757165, -0.00420639],
        [ 0.03467252, -0.00063656],
        [ 0.02378496, -0.00280876],
        ...,
        [ 0.00323927, -0.0126021 ],
        [ 0.0298636 , -0.01397227],
        [ 0.05299882,  0.00560695]]),
 array([ 0.03269705, -0.00303887]))

In [37]:
# SLearner: a single random-forest model fitted on both outcome columns at once.
from econml.metalearners import SLearner
from sklearn.ensemble import RandomForestRegressor

# The forest is seeded so that fitting it on the same training data is repeatable;
# an unseeded forest produces different effect estimates on every run.
slearner = SLearner(overall_model=RandomForestRegressor(random_state=0))
# inference='bootstrap' asks for bootstrap-based interval construction at fit time.
slearner.fit(Y_train, T_train, X=X_train, inference='bootstrap')

<econml.metalearners._metalearners.SLearner at 0x7fba35bdcbe0>

In [38]:
# Per-sample effects and their average on the held-out covariates.
HTE_slearner, ATE_slearner = slearner.effect(X_test), slearner.ate(X_test)
(HTE_slearner, ATE_slearner)

(array([[-0.01, -0.04],
        [ 0.05,  0.1 ],
        [-0.02,  0.03],
        ...,
        [ 0.01,  0.01],
        [ 0.03,  0.  ],
        [ 0.07,  0.06]]),
 array([ 0.03955635, -0.0064128 ]))

In [39]:
# DRLearner: fit one estimator per outcome column of Y_train.
from econml.dr import DRLearner
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

drlearners = []
for i in tqdm(range(Y_train.shape[1])):
    # Both random-forest nuisance models are seeded so that repeated runs on the
    # same training data give the same fit; the learner itself is already given
    # random_state=0.
    drlearner = DRLearner(
        model_propensity=RandomForestClassifier(
            n_estimators=100, min_samples_leaf=10, random_state=0),
        model_regression=RandomForestRegressor(
            n_estimators=100, min_samples_leaf=10, random_state=0),
        model_final=LassoCV(cv=3),
        random_state=0)
    drlearner.fit(Y_train[:, i], T_train, X=X_train, W=None)
    drlearners.append(drlearner)

100%|██████████| 2/2 [01:10<00:00, 35.46s/it]


In [40]:
# Evaluate each fitted DR-learner on the held-out covariates:
# per-sample effects (HTE) and their average (ATE), one entry per outcome.
HTEs_drlearner, ATEs_drlearner = [], []
for fitted_drlearner in tqdm(drlearners):
    HTEs_drlearner.append(fitted_drlearner.effect(X_test))
    ATEs_drlearner.append(fitted_drlearner.ate(X_test))

(HTEs_drlearner, ATEs_drlearner)

100%|██████████| 2/2 [00:00<00:00, 20.81it/s]


([array([0.02547601, 0.02809258, 0.02184279, ..., 0.00618645, 0.01959   ,
         0.05491278]),
  array([-0.00577698, -0.00407257, -0.00819839, ..., -0.01036014,
         -0.00813854, -0.0018023 ])],
 [0.03191912772342984, -0.0050003292726500575])

In [41]:
# DML: a single estimator fitted on both outcome columns at once, with
# random-forest first-stage models and a linear final stage.
from econml.dml import DML
from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

# The first-stage forests are seeded so that repeated runs on the same training
# data give the same fit; the estimator itself is already given random_state=0.
dml = DML(model_y=RandomForestRegressor(random_state=0),
          model_t=RandomForestClassifier(random_state=0),
          model_final=StatsModelsLinearRegression(fit_intercept=False), 
          linear_first_stages=False, 
          discrete_treatment=True,
          random_state=0)

dml.fit(Y_train, T_train, X=X_train)

<econml.dml.dml.DML at 0x7fbaf8d0f790>

In [42]:
# Per-sample effects and their average on the held-out covariates.
HTE_dml, ATE_dml = dml.effect(X_test), dml.ate(X_test)

(HTE_dml, ATE_dml)

(array([[ 0.03744712, -0.00340578],
        [ 0.04784964, -0.01148044],
        [ 0.02912355, -0.00689383],
        ...,
        [ 0.01195178, -0.02211793],
        [ 0.03855042, -0.02313146],
        [ 0.0655047 ,  0.00644256]]),
 array([ 0.04548884, -0.00901736]))