# Chapter 15: Matching in Observational Studies

In [1]:
from joblib import Parallel, delayed

import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl

# Matplotlib defaults applied to every figure in this notebook:
# font family/weight/size and a 6x6 default figure size.
font = {'family' : 'IBM Plex Sans Condensed',
               'weight' : 'normal',
               'size'   : 10}
plt.rc('font', **font)
plt.rcParams['figure.figsize'] = (6, 6)
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Seed NumPy's legacy global random state.
np.random.seed(42)
%load_ext autoreload
%autoreload 1

# Load the watermark extension and print the versions of the imported packages.
%load_ext watermark
%watermark --iversions



numpy            : 1.24.3
matplotlib       : 3.7.2
sklearn          : 1.3.2
statsmodels      : 0.14.0
pandas           : 2.0.3
matplotlib_inline: 0.1.6
seaborn          : 0.12.2



## experimental data

In [2]:
import empirical_calibration as ec
import empirical_calibration.data.lalonde as lalonde
# Stack the experimental treated and control samples (treated rows first)
# into a single frame, then preview the first rows.
treat = lalonde.experimental_treated()
ctrl = lalonde.experimental_control()
lalonde_exp = pd.concat([treat, ctrl])
lalonde_exp.head()


Unnamed: 0,treatment,age,education,black,hispanic,married,nodegree,earnings1974,earnings1975,earnings1978
0,1.0,37.0,11.0,1.0,0.0,1.0,1.0,0.0,0.0,9930.046
1,1.0,22.0,9.0,0.0,1.0,0.0,1.0,0.0,0.0,3595.894
2,1.0,30.0,12.0,1.0,0.0,0.0,0.0,0.0,0.0,24909.45
3,1.0,27.0,11.0,1.0,0.0,0.0,1.0,0.0,0.0,7506.146
4,1.0,33.0,8.0,1.0,0.0,0.0,1.0,0.0,0.0,289.7899


In [3]:
# Outcome, treatment indicator and covariate matrix as NumPy arrays;
# the covariates are every column except the outcome and the treatment.
y = lalonde_exp['earnings1978'].values
z = lalonde_exp['treatment'].values
X = lalonde_exp.drop(columns=['earnings1978', 'treatment']).values


In [4]:
def reg_adjust(z, y, X):
    """Fit three OLS specifications and report the coefficient on ``z`` for each.

    The specifications, all fitted with ``cov_type="HC2"`` standard errors, are:

    * ``neyman`` - ``y`` on a constant and ``z``;
    * ``fisher`` - ``y`` on a constant, ``z`` and the covariates ``X``;
    * ``lin``    - ``y`` on a constant, ``z``, the column-centred covariates,
      and the products of ``z`` with the column-centred covariates.

    Parameters
    ----------
    z : 1-D array
        Treatment indicator.
    y : 1-D array
        Outcome.
    X : 2-D array
        Covariates, one row per unit.

    Returns
    -------
    pandas.DataFrame
        One row per specification (``neyman``, ``fisher``, ``lin``) with the
        columns ``coef``, ``se`` and ``t`` taken from position 1 of each fit,
        i.e. the ``z`` column that follows the constant added by
        ``sm.add_constant``.
    """
    const_and_z = sm.add_constant(z)
    X_centred = X - X.mean(axis=0)
    designs = {
        'neyman': const_and_z,
        'fisher': np.c_[const_and_z, X],
        'lin': np.c_[const_and_z, X_centred, z.reshape(-1, 1) * X_centred],
    }

    rows = []
    for design in designs.values():
        fit = sm.OLS(y, design).fit(cov_type="HC2")
        rows.append([fit.params[1], fit.bse[1], fit.tvalues[1]])

    return pd.DataFrame(np.array(rows), index=list(designs), columns=['coef', 'se', 't'])

reg_adjust(z, y, X)


Unnamed: 0,coef,se,t
neyman,1794.342404,670.996546,2.674146
fisher,1676.342644,677.049148,2.475954
lin,1621.583101,694.721574,2.334148


### Matching

In [8]:
from sklearn.neighbors import KNeighborsRegressor

def one_nn_att(X, z, y, k = 1, bias_corr_mod=None, n_boot = None):
    """Nearest-neighbour matching estimate of the ATT.

    A ``KNeighborsRegressor`` is fitted on the control units (``z == 0``) and
    used to impute each treated unit's control outcome as the average outcome
    of its ``k`` nearest controls. The estimate is the mean treated outcome
    minus the mean imputed outcome, optionally bias-corrected with an outcome
    model fitted on the controls.

    Parameters
    ----------
    X : 2-D array
        Covariates, one row per unit. Distances are computed on these raw
        columns with ``KNeighborsRegressor``'s default metric.
    z : 1-D array
        Treatment indicator; rows with ``z == 1`` are treated, ``z == 0`` controls.
    y : 1-D array
        Outcome.
    k : int, default 1
        Number of control neighbours matched to each treated unit.
    bias_corr_mod : regressor with ``fit``/``predict``, optional
        Outcome model used for bias correction. It is fitted in place on the
        control units. ``None`` (the default) disables the correction.
    n_boot : ignored
        Unused; kept so the signature stays compatible with existing calls.

    Returns
    -------
    float
        The (optionally bias-corrected) matching estimate.
    """
    X_ctrl, y_ctrl = X[z == 0, :], y[z == 0]
    X_trt, y_trt = X[z == 1, :], y[z == 1]

    matcher = KNeighborsRegressor(n_neighbors=k).fit(X_ctrl, y_ctrl)
    Y0hat = matcher.predict(X_trt)
    att = y_trt.mean() - Y0hat.mean()

    # Compare against None explicitly: the truthiness of an estimator is not
    # reliable (estimators that define __len__ can raise before being fitted).
    if bias_corr_mod is None:
        return att

    # neighbours[i, j] is the row position, within the control rows, of the
    # j-th nearest control of treated unit i; shape (n_treated, k).
    _, neighbours = matcher.kneighbors(X_trt)
    # outcome model fitted on controls only
    muhat = bias_corr_mod.fit(X_ctrl, y_ctrl)
    mu_trt = muhat.predict(X_trt)
    # Average the model prediction over each treated unit's k matches so the
    # correction has one entry per treated unit for any k (with k = 1 this is
    # exactly the single matched control's prediction).
    mu_matched = (muhat.predict(X_ctrl[neighbours.ravel(), :])
                  .reshape(neighbours.shape)
                  .mean(axis=1))
    # bias correction term is mu0(x_i) - mean_j mu0(x_j) for each treated unit i
    bias_corr_term = mu_trt - mu_matched
    return att - bias_corr_term.mean()


In [18]:
def nn_att(X, z, y, k = 1, bias_corr_mod=None, n_boot = 1e3):
    """Matching ATT estimate with a nonparametric-bootstrap standard error.

    The point estimate is ``one_nn_att`` on the full sample. The standard
    error is the standard deviation of ``one_nn_att`` over ``n_boot`` resamples
    of the rows drawn with replacement, evaluated in parallel with joblib.

    Parameters
    ----------
    X, z, y : arrays
        Covariates, treatment indicator and outcome, passed to ``one_nn_att``.
    k : int, default 1
        Number of neighbours; used for both the point estimate and every
        bootstrap replicate.
    bias_corr_mod : regressor with ``fit``/``predict``, optional
        Outcome model for bias correction; refitted in every replicate.
    n_boot : int or float, default 1e3
        Number of bootstrap replicates (converted with ``int``).

    Returns
    -------
    tuple
        ``(point_est, boot_se)``.
    """
    point_est = one_nn_att(X, z, y, k = k, bias_corr_mod=bias_corr_mod)
    n = len(z)
    n_boot = int(n_boot)

    # Draw one seed per replicate here, in the parent process, from the global
    # NumPy state. Worker processes do not share that state, so sampling with
    # np.random inside the workers would not be controlled by np.random.seed.
    seeds = np.random.randint(0, 2**31 - 1, size=n_boot)

    def bootfn(seed):
        # resample row indices with replacement using a replicate-specific generator
        rng = np.random.default_rng(int(seed))
        ids = rng.choice(n, size=n, replace=True)
        # use the caller's k so the SE matches the estimator behind point_est
        return one_nn_att(X[ids, :], z[ids], y[ids], k = k, bias_corr_mod=bias_corr_mod)

    boot_est = Parallel(n_jobs=-1)(
        delayed(bootfn)(seed) for seed in seeds
    )
    boot_se = np.asarray(boot_est).std()
    return point_est, boot_se


No bias correction

In [19]:
# Matching estimate and bootstrap SE using nn_att's defaults (k = 1, no bias-correction model).
nn_att(X, z, y)


(2011.1531874594612, 846.732689517417)

bias correction with OLS

In [20]:
from sklearn.linear_model import LinearRegression
# Same call, with a LinearRegression outcome model passed as the bias-correction model.
nn_att(X, z, y, bias_corr_mod=LinearRegression())


(1922.5641252847165, 822.57824805314)

### observational

In [21]:
# read CPS data
dat = pd.read_table("cps1re74.csv", delimiter = " ")
dat['u74'], dat['u75'] = 1 * (dat.re74 == 0), 1 * (dat.re75 == 0)
z, y = dat.treat.values, dat.re78.values
X = dat.drop(columns=['treat', 're78']).values


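Regression adjustment is unreliable here: the three specifications disagree in both sign and magnitude, with coefficients ranging from about -8,506 to +1,068.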

In [22]:
# Same three regression specifications, now on the z, y, X built from the CPS file.
reg_adjust(z, y, X)


Unnamed: 0,coef,se,t
neyman,-8506.495361,583.442609,-14.579832
fisher,1067.546135,628.438879,1.698727
lin,-4265.800513,3211.771843,-1.328177


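No bias correction: much better than OLS. The matching estimate is positive and of the same order of magnitude as the estimates from the experimental data.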

In [23]:
# Matching estimate and bootstrap SE on the CPS-based z, y, X, using nn_att's defaults.
nn_att(X, z, y)


(1521.3765032432439, 866.0921246882162)

bias correction with OLS

In [25]:
# Same call on the CPS-based data, with a LinearRegression bias-correction model.
nn_att(X, z, y, bias_corr_mod=LinearRegression())


(1929.466878721527, 860.0536173156096)

**TODO:** Abadie-Imbens variance estimators (not yet implemented).