In [1]:
import numpy as np
import pandas as pd
import sklearn
import sklearn.pipeline as skpipe
import celer as cel
import matplotlib.pyplot as plt
import statsmodels.api as sm

# dml dgps
from doubleml import datasets 
# this module
from aipyw import aipyw


## Generate synthetic data with omitted-variable bias (OVB)

In [2]:
# Make some data - the true treatment effect is 1 (theta=1).
# Seed NumPy's global RNG first so the simulated sample, and every estimate
# computed from it below, is reproducible under Restart & Run All.
# NOTE(review): relies on make_irm_data drawing from NumPy's global RNG, as
# in the doubleml documentation examples - confirm for the installed version.
np.random.seed(42)
X, y, w = datasets.make_irm_data(1_000, theta=1, return_type='array')

In [3]:
# Feature pipeline: min-max scale each covariate, then expand the scaled
# covariates into a degree-2 polynomial basis (the "sieve").
ppl = skpipe.Pipeline(
    steps=[
        ("minmax", sklearn.preprocessing.MinMaxScaler()),
        ("sieve", sklearn.preprocessing.PolynomialFeatures(degree=2)),
    ]
)
XX = ppl.fit_transform(X)

# Naive estimate (biased): raw difference in mean outcome between the
# w == 1 and w == 0 groups, with no adjustment for X.
np.mean(y[w == 1]) - np.mean(y[w == 0])

1.5326252413874115

Badly biased: the naive difference in means (about 1.53 here) is well above the true effect of 1.

### Regression adjustment

In [4]:
# Regression adjustment: OLS of y on [intercept, w, X]; the coefficient at
# index 1 is the one on the treatment indicator w.
# statsmodels selects the robust covariance with `cov_type`; the previous
# `vcov_type` keyword is not a recognised option, so HC2 was never applied.
# The point estimate itself does not depend on the covariance type.
sm.OLS(y, np.c_[np.ones(w.shape[0]), w, X]).fit(cov_type="HC2").params[1]

0.9118774657828757

Also biased: the regression-adjusted estimate (about 0.91 here) still misses the true effect of 1.

### AIPW

In [8]:
# AIPW on the polynomial features XX, using celer models for the two
# nuisance functions (omod / pmod - presumably outcome and propensity models).
# Alternative pmod noted by the author:
#   sklearn.svm.SVC(gamma='auto', probability=True)
# NOTE(review): the recorded run warns about poor overlap (pscores < 0.02)
# and suggests a trimming threshold in summary() or the ATT estimand.
outcome_enet = cel.ElasticNetCV(
    l1_ratio=[.5, .7, .9],
    n_alphas=20,
    cv=5,
    n_jobs=8,
)
pscore_logit = cel.LogisticRegression(C=1)

doubledouble = aipyw(y, w, XX, omod=outcome_enet, pmod=pscore_logit)
doubledouble.fit()
doubledouble.summary()

Poor overlap - some pscores are < 0.02; Either call summary() with a trimming threshold 
 or change the estimand to ATT.
                                    ATE        SE  95% CI-LB  95% CI-UB
Treat level 1 - Treat level 0  1.062369  0.071074   0.923063   1.201675


In [7]:
# Same AIPW estimator, now with random-forest nuisance models.
# NOTE(review): this import belongs with the other imports in the first cell.
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

# Pin the forests' seeds so their fits do not change from run to run.
# Any randomness inside aipyw itself is not controlled here.
doubledouble2 = aipyw(
    y, w, XX,
    omod=RandomForestRegressor(random_state=0),
    pmod=RandomForestClassifier(random_state=0),
)
doubledouble2.fit()
doubledouble2.summary()

                                    ATE        SE  95% CI-LB  95% CI-UB
Treat level 1 - Treat level 0  1.073689  0.074892   0.926902   1.220477


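Better: both AIPW estimates (about 1.06 and 1.07 here) are close to the true effect of 1, and their 95% confidence intervals cover it.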