Merge pull request #432 from Everglow00/master
Create plot_3_double_machine_learning.py
Mamba413 committed May 30, 2022
2 parents ee73807 + 7ee7a21 commit 4b4a65b
Showing 2 changed files with 193 additions and 0 deletions.
@@ -0,0 +1,192 @@
"""
================================
Work with DoubleML
================================
Double machine learning offers a debiased way to estimate a low-dimensional parameter of interest in the presence of
high-dimensional nuisance parameters. Many machine learning methods can be used to estimate the nuisance parameters,
such as random forests, lasso or post-lasso, neural nets, boosted regression trees, and so on. The Python package
``DoubleML`` provides an implementation of double machine learning. It is built on top of scikit-learn and is an
excellent package. The object-oriented implementation of ``DoubleML`` is very flexible; in particular, it provides
functionalities to estimate double machine learning models and to perform statistical inference via the methods
``fit``, ``bootstrap``, ``confint``, ``p_adjust`` and ``tune``.
"""

###############################################################################
#
# In fact, ``abess`` also works well with the package ``DoubleML``. Here is an example of using ``abess`` as the
# nuisance learner within ``DoubleML``, and we will compare it with the lasso regression.


import time
import warnings

import numpy as np
import matplotlib.pyplot as plt
from sklearn.base import clone
from sklearn.linear_model import LassoCV
from abess.linear import LinearRegression
from doubleml import DoubleMLPLR

warnings.filterwarnings('ignore')  # suppress warnings from the nuisance learners

###############################################################################
# Partially linear regression (PLR) model
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# PLR models take the form
#
# .. math::
#
#     Y = D \theta_{0} + g_{0}(X) + U, & \quad \mathbb{E}(U \mid X, D) = 0, \\
#     D = m_{0}(X) + V, & \quad \mathbb{E}(V \mid X) = 0,
#
# where :math:`Y` is the outcome variable and :math:`D` is the policy/treatment variable. :math:`\theta_0` is the main
# regression coefficient that we would like to infer, which has the interpretation of the treatment effect parameter.
# The high-dimensional vector :math:`X=(X_1,\dots, X_p)` consists of other confounding covariates, and :math:`U` and
# :math:`V` are stochastic errors. Usually, :math:`p` is not vanishingly small relative to the sample size, so it is
# difficult to estimate the nuisance parameters :math:`\eta_0 = (m_0, g_0)`. ``abess`` aims to solve the general best
# subset selection problem; in PLR models, it is applicable when the nuisance parameters are sparse. Here, we are
# going to use ``abess`` to estimate the nuisance parameters, and then combine it with ``DoubleML`` to estimate the
# treatment effect parameter.


###############################################################################
# Data
# """""""""""""""
# We simulate data from a PLR model in which both :math:`m_0` and :math:`g_0` are low-dimensional linear combinations
# of :math:`X`, and we store the data in a ``DoubleMLData`` object.



from doubleml import DoubleMLData
np.random.seed(1234)
n_obs = 200
n_vars = 600
theta = 3
X = np.random.normal(size=(n_obs, n_vars))
d = np.dot(X[:, :3], np.array([5]*3)) + np.random.standard_normal(size=(n_obs,))
y = theta * d + np.dot(X[:, :3], np.array([5]*3)) + np.random.standard_normal(size=(n_obs,))
dml_data_sim = DoubleMLData.from_arrays(X, y, d)



###############################################################################
# Model fitting with ``abess``
# """""""""""""""
# Based on the simulated data, now we are going to illustrate how to integrate the ``abess`` with ``DoubleML``. To
# estimate the PLR model with the double machine learning algorithm, first we need to choose a learner to estimate the
# nuisance parameters :math:`\eta_0 = (m_0, g_0)`. Considering the sparsity of the data, we can use the adaptive best
# subset selection model. Then fitting the model to learn the average treatment effct parameter :math:`\theta_0`.


abess = LinearRegression(cv = 5) # abess learner
ml_g_abess = clone(abess)
ml_m_abess = clone(abess)

obj_dml_plr_abess = DoubleMLPLR(dml_data_sim, ml_g_abess, ml_m_abess) # model fitting
obj_dml_plr_abess.fit();
print("thetahat:", obj_dml_plr_abess.coef)
print("sd:", obj_dml_plr_abess.se)

# %%
# The estimated value is close to the true parameter, and the standard error is very small. ``abess`` integrates
# easily with ``DoubleML`` and works well for estimating the nuisance parameters.
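
# %%
# Since the fitted ``DoubleMLPLR`` object also exposes the inference methods mentioned above (such as ``confint``),
# a confidence interval for :math:`\theta_0` can be requested directly. The snippet below is a small optional sketch;
# the ``level`` argument follows the ``DoubleML`` documentation.

print("95% confidence interval:\n", obj_dml_plr_abess.confint(level=0.95))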


###############################################################################
# Comparison with lasso
# ^^^^^^^^^^^^^^^^^^^^^^^^^
# The lasso regression is a shrinkage and variable selection method for regression models, which can also be used in
# high-dimensional settings. Here, we compare the abess learner with the lasso learner at different dimensions of the
# confounding covariates.

# %%
# The following figures show the absolute bias of the abess learner and the lasso learner.
lasso = LassoCV(cv = 5) # lasso learner
ml_g_lasso = clone(lasso)
ml_m_lasso = clone(lasso)

M = 15 # number of repetitions at each dimension
n_obs = 200
n_vars_range = range(100,1100,300) # different dimensions of confounding covariates
theta_lasso = np.zeros(len(n_vars_range)*M)
theta_abess = np.zeros(len(n_vars_range)*M)
time_lasso = np.zeros(len(n_vars_range)*M)
time_abess = np.zeros(len(n_vars_range)*M)
j = 0

for n_vars in n_vars_range:
    for i in range(M):
        np.random.seed(i)
        # simulated data: three true variables
        X = np.random.normal(size=(n_obs, n_vars))
        d = np.dot(X[:, :3], np.array([5]*3)) + np.random.standard_normal(size=(n_obs,))
        y = theta * d + np.dot(X[:, :3], np.array([5]*3)) + np.random.standard_normal(size=(n_obs,))
        dml_data_sim = DoubleMLData.from_arrays(X, y, d)

        # estimate double/debiased machine learning models with the lasso learner
        starttime = time.time()
        obj_dml_plr_lasso = DoubleMLPLR(dml_data_sim, ml_g_lasso, ml_m_lasso)
        obj_dml_plr_lasso.fit()
        endtime = time.time()
        time_lasso[j*M + i] = endtime - starttime
        theta_lasso[j*M + i] = obj_dml_plr_lasso.coef

        # estimate double/debiased machine learning models with the abess learner
        starttime = time.time()
        obj_dml_plr_abess = DoubleMLPLR(dml_data_sim, ml_g_abess, ml_m_abess)
        obj_dml_plr_abess.fit()
        endtime = time.time()
        time_abess[j*M + i] = endtime - starttime
        theta_abess[j*M + i] = obj_dml_plr_abess.coef
    j = j + 1

# absolute bias
abs_bias1 = [abs(theta_lasso-theta)[:M],abs(theta_abess-theta)[:M]]
abs_bias2 = [abs(theta_lasso-theta)[M:2*M],abs(theta_abess-theta)[M:2*M]]
abs_bias3 = [abs(theta_lasso-theta)[2*M:3*M],abs(theta_abess-theta)[2*M:3*M]]
abs_bias4 = [abs(theta_lasso-theta)[3*M:4*M],abs(theta_abess-theta)[3*M:4*M]]
labels = ["lasso", "abess"]

fig, ([ax1, ax2], [ax3, ax4]) = plt.subplots(nrows=2, ncols=2, figsize=(10,5))
bplot1 = ax1.boxplot(abs_bias1, vert=True, patch_artist=True, labels=labels)
ax1.set_title("p = 100")
bplot2 = ax2.boxplot(abs_bias2, vert=True, patch_artist=True, labels=labels)
ax2.set_title("p = 400")
bplot3 = ax3.boxplot(abs_bias3, vert=True, patch_artist=True, labels=labels)
ax3.set_title("p = 700")
bplot4 = ax4.boxplot(abs_bias4, vert=True, patch_artist=True, labels=labels)
ax4.set_title("p = 1000")
colors = ["lightblue", "orange"]

for bplot in (bplot1, bplot2, bplot3, bplot4):
    for patch, color in zip(bplot["boxes"], colors):
        patch.set_facecolor(color)
for ax in [ax1, ax2, ax3, ax4]:
    ax.yaxis.grid(True)
    ax.set_ylabel("absolute bias")
plt.show();


# %%
# The following figure shows the running time of the abess learner and the lasso learner.
plt.plot(np.repeat(n_vars_range, M),time_lasso, "o", color = "lightblue", label="lasso", markersize=3);
plt.plot(np.repeat(n_vars_range, M),time_abess, "o", color = "orange", label="abess", markersize=3);
slope_lasso, intercept_lasso = np.polyfit(np.repeat(n_vars_range, M),time_lasso, 1)
slope_abess, intercept_abess = np.polyfit(np.repeat(n_vars_range, M),time_abess, 1)
plt.axline(xy1=(0,intercept_lasso), slope = slope_lasso, color = "lightblue", lw = 2)
plt.axline(xy1=(0,intercept_abess), slope = slope_abess, color = "orange", lw = 2)
plt.grid()
plt.xlabel("number of variables")
plt.ylabel("running time")
plt.legend(loc="upper left")
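
# %%
# As a quick numeric complement to the scatter plot, the average running time of each learner at every dimension can
# be printed directly. This is a small optional sketch that only reuses the timing arrays computed above.

for k, p in enumerate(n_vars_range):
    print("p = %4d: lasso %6.2fs, abess %6.2fs"
          % (p, time_lasso[k*M:(k+1)*M].mean(), time_abess[k*M:(k+1)*M].mean()))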

# %%
# At each dimension, we repeat the double machine learning procedure 15 times for each of the two learners. As can be
# seen from the above figures, the parameters estimated by both learners are very close to the true parameter
# :math:`\theta_0`, but the running time of the abess learner is much shorter than that of the lasso learner. Besides,
# in high-dimensional settings, the mean absolute bias of the abess learner is relatively smaller.


###############################################################################
# References
# ^^^^^^^^^^^^^^^^^
# .. rubric:: References
# .. [1] Chernozhukov V, Chetverikov D, Demirer M, et al. Double/debiased machine learning for treatment and structural parameters[J]. The Econometrics Journal, 2018, 21(1): C1-C68.
# .. [2] Bach P, Chernozhukov V, Kurz M S, et al. DoubleML: An object-oriented implementation of double machine learning in Python[J]. Journal of Machine Learning Research, 2022, 23(53): 1-6.
# .. [3] Zhu J, Hu L, Huang J, et al. abess: A fast best subset selection library in Python and R[J]. arXiv preprint arXiv:2110.09697, 2021.
#
1 change: 1 addition & 0 deletions docs/requirements.txt
@@ -3,3 +3,4 @@ sphinx-gallery
sphinxcontrib-phpdomain
sphinx-autoapi
geomstats
DoubleML
