# Chapter 9: Bridging Finite and Super-population Causal Inference

In [36]:
from joblib import Parallel, delayed

import numpy as np
import statsmodels.api as sm

# Seed NumPy's global RNG in this kernel process; direct calls such as
# onerepl() draw from this state.
# NOTE(review): joblib worker processes presumably do not inherit this state,
# so this line alone may not make the parallel simulation reproducible — confirm.
np.random.seed(42)


In [37]:
def linestimator(Z, Y, X):
    """Fully interacted OLS treatment-effect estimate with two standard errors.

    Regresses Y on an intercept, Z, the standardized covariates and the
    Z-by-covariate interactions, with HC2 heteroskedasticity-robust
    covariance. Because the covariates are centered, the coefficient on Z is
    the estimated average treatment effect.

    Parameters
    ----------
    Z : array-like, shape (n,)
        Treatment indicator. Must take at least two distinct values.
    Y : array-like, shape (n,)
        Observed outcomes.
    X : array-like, shape (n, p) or (n,)
        Covariates. A 1-D input is treated as a single covariate.

    Returns
    -------
    est : float
        Coefficient on Z.
    se_ehw : float
        HC2 (EHW) standard error of ``est``.
    se_super : float
        Standard error after adding the super-population correction
        (β_1 - β_0)' Σ (β_1 - β_0) / n to the EHW variance.

    Raises
    ------
    ValueError
        If Z is constant (only one treatment arm is present), in which case
        the coefficient on Z is not identified.
    """
    Z = np.asarray(Z, dtype=float).ravel()
    Y = np.asarray(Y, dtype=float).ravel()
    X = np.asarray(X, dtype=float)
    if X.ndim == 1:
        # a single covariate passed as a vector
        X = X[:, None]
    if Z.min() == Z.max():
        raise ValueError("Z must contain both treated and control units")

    X = (X - X.mean(axis=0))/X.std(axis=0)
    n, p = X.shape
    # fully interacted OLS; the intercept column is built explicitly so the
    # coefficient on Z is always at position 1
    Xmat = np.column_stack([np.ones(n), Z, X, Z[:, None] * X])
    m = sm.OLS(Y, Xmat).fit(cov_type="HC2")
    est, vehw = m.params[1], m.bse[1]**2
    # super-population correction
    inter = m.params[-p:] # (β_1 - β_0) term - last p elements of coef
    # np.cov collapses to a 0-d array when p == 1; restore the (p, p) shape
    # so the matrix product below is defined
    Sigma = np.atleast_2d(np.cov(X.T))
    # (β_1 - β_0)' Σ (β_1 - β_0) / n
    superCorr = np.sum(inter * (Sigma @ inter))/n
    vsuper = vehw + superCorr
    return est, np.sqrt(vehw), np.sqrt(vsuper)


In [38]:
def onerepl(*args):
    """Simulate one data set of size 500 and return linestimator's result.

    Two independent standard-normal covariates are drawn; Y0 depends on the
    first and Y1 on the second, each as x + x**2 plus uniform noise, so both
    potential outcomes have mean 1 and the true average effect is 0.
    Treatment is Bernoulli(0.6).

    Parameters
    ----------
    *args
        Optional. When given, the first positional argument seeds a dedicated
        NumPy Generator for this replicate, so the result depends only on
        that value and not on which process runs it. With no arguments the
        draws come from the global ``np.random`` state.

    Returns
    -------
    tuple
        Whatever ``linestimator(Z, Y, X)`` returns.
    """
    n = 500
    # The np.random module and a Generator expose the same normal / uniform /
    # binomial call signatures used below.
    rng = np.random.default_rng(args[0]) if args else np.random
    X = rng.normal(0, 1, n*2).reshape(n, 2)
    Y0 = X[:, 0] + X[:, 0]**2 + rng.uniform(-.5, .5, n)
    Y1 = X[:, 1] + X[:, 1]**2 + rng.uniform(-1, 1, n)
    Z = rng.binomial(1, .6, n)
    Y = Y0 * (1 - Z) + Y1 * Z
    return linestimator(Z, Y, X)


In [39]:
# Single replicate as a smoke test: (estimate, EHW SE, super-population SE)
onerepl()


(0.052230404017171474, 0.02176516995732536, 0.026679530224550992)

In [40]:
# nrep Monte Carlo replicates spread over k joblib workers; the replicate
# index i is passed to onerepl as its only argument.
nrep, k = 2000, 8
results = Parallel(n_jobs = k)(delayed(onerepl)(i) for i in range(nrep))
# one row per replicate; columns: estimate, EHW SE, super-population SE
simres = np.vstack(results)


In [41]:
# Column means over replicates: mean estimate (= bias, because the simulated
# true effect E[Y1] - E[Y0] is 0), mean EHW SE, mean super-population SE.
# NOTE(review): the saved output of this cell (~0.018, ~0.023) is close to the
# square of the empirical SD saved further down (0.151**2 ~ 0.023), which
# suggests the stored outputs predate the code shown here and hold variances;
# re-run the notebook to refresh them — confirm.
tuple(simres[:, col].mean() for col in range(3))


(0.007213145049286639, 0.01843410232910921, 0.022661715589192874)

In [42]:
# Monte Carlo standard deviation of the point estimates (NumPy default, ddof=0)
np.std(simres[:, 0])


0.15129734780104556

In [43]:
# EHW coverage: share of replicates whose interval est ± 1.96·SE contains the
# true effect 0, i.e. |est| <= 1.96·SE
np.mean(np.abs(simres[:, 0]) <= 1.96 * simres[:, 1])


0.1795

In [44]:
# Super-population coverage: share of replicates whose interval
# est ± 1.96·SE_super contains the true effect 0, i.e. |est| <= 1.96·SE_super
np.mean(np.abs(simres[:, 0]) <= 1.96 * simres[:, 2])


0.218