In [None]:
import pandas as pd 
import numpy as np 
import statsmodels.api as sm
import statsmodels.formula.api as smf
from pathlib import Path
from dowhy import CausalModel
from linearmodels.panel import PanelOLS
from linearmodels.iv.model import IV2SLS
from sklearn.linear_model import LogisticRegression
from statsmodels.api import add_constant

base_data_loc = Path("../data/all_data")

In [47]:
def diff_mean(treat_data, control_data):
    
    tau = np.mean(treat_data) - np.mean(control_data)
    n1 = treat_data.shape[0]
    n2 = control_data.shape[0]
    std = (np.var(treat_data, ddof=1) / n1 + np.var(control_data, ddof=1) / n2) ** 0.5
    
    return tau, std

## Social Pressure and Voter Turnout: Evidence from a Large-Scale Field Experiment

In [48]:
df_voter = pd.read_csv(base_data_loc / "voter_turnout_data.csv")
model_3b = smf.ols(formula='voted ~ C(treatment, Treatment("Control")) + g2000 + g2002 + p2000 + p2002 + p2004', 
                 data=df_voter).fit()
model_3b.summary()

0,1,2,3
Dep. Variable:,voted,R-squared:,0.075
Model:,OLS,Adj. R-squared:,0.075
Method:,Least Squares,F-statistic:,3101.0
Date:,"Fri, 25 Jul 2025",Prob (F-statistic):,0.0
Time:,14:29:33,Log-Likelihood:,-211250.0
No. Observations:,344084,AIC:,422500.0
Df Residuals:,344074,BIC:,422600.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0774,0.002,33.462,0.000,0.073,0.082
"C(treatment, Treatment(""Control""))[T.Civic Duty]",0.0179,0.003,7.151,0.000,0.013,0.023
"C(treatment, Treatment(""Control""))[T.Hawthorne]",0.0246,0.003,9.807,0.000,0.020,0.029
"C(treatment, Treatment(""Control""))[T.Neighbors]",0.0807,0.003,32.213,0.000,0.076,0.086
"C(treatment, Treatment(""Control""))[T.Self]",0.0479,0.003,19.102,0.000,0.043,0.053
g2000,-0.0026,0.002,-1.121,0.262,-0.007,0.002
g2002,0.1015,0.002,46.554,0.000,0.097,0.106
p2000,0.0995,0.002,55.600,0.000,0.096,0.103
p2002,0.1330,0.002,81.969,0.000,0.130,0.136

0,1,2,3
Omnibus:,280628.082,Durbin-Watson:,1.429
Prob(Omnibus):,0.0,Jarque-Bera (JB):,47992.65
Skew:,0.699,Prob(JB):,0.0
Kurtosis:,1.819,Cond. No.,7.36


## Propensity Score Matching for methods for non-experimental causal studies

In [49]:
df_lalonde = pd.read_csv(base_data_loc / "lalonde_data.csv", index_col=0)
treat_78 = df_lalonde[df_lalonde["treat"] == 1]["re78"]
control_78 = df_lalonde[df_lalonde["treat"] == 0]["re78"]
diff_mean_lalonde = diff_mean(treat_78.to_numpy(), control_78.to_numpy())

## The coefficient matches but the stadard error is off by 22 
print("Unadjusted regression results\ntau:{}, std error:{}".format(diff_mean_lalonde[0], 
                                                                  diff_mean_lalonde[1]))
reg_with_covar = smf.ols(formula="re78 ~ treat + age + age**2 + education + black + hispanic +nodegree + re74 + re75",
                        data=df_lalonde).fit()

## the standard error differs by 0.3, the coefficient differs by 1.5 
reg_with_covar.summary()

Unadjusted regression results
tau:1794.3430848752596, std error:670.9967296585894


0,1,2,3
Dep. Variable:,re78,R-squared:,0.055
Model:,OLS,Adj. R-squared:,0.037
Method:,Least Squares,F-statistic:,3.158
Date:,"Fri, 25 Jul 2025",Prob (F-statistic):,0.00173
Time:,14:29:33,Log-Likelihood:,-4534.2
No. Observations:,445,AIC:,9086.0
Df Residuals:,436,BIC:,9123.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,845.9411,3349.068,0.253,0.801,-5736.384,7428.266
treat,1673.4848,637.711,2.624,0.009,420.116,2926.854
age,53.8823,44.311,1.216,0.225,-33.208,140.973
education,392.9070,226.454,1.735,0.083,-52.169,837.983
black,-2169.1153,1166.146,-1.860,0.064,-4461.082,122.851
hispanic,150.1738,1545.233,0.097,0.923,-2886.857,3187.205
nodegree,-74.6412,1002.950,-0.074,0.941,-2045.859,1896.577
re74,0.0825,0.077,1.068,0.286,-0.069,0.234
re75,0.0481,0.132,0.364,0.716,-0.211,0.308

0,1,2,3
Omnibus:,285.06,Durbin-Watson:,2.058
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3783.256
Skew:,2.547,Prob(JB):,0.0
Kurtosis:,16.345,Cond. No.,71400.0


## Propensity Score Matching for methods for non-experimental causal studies (Matching)

In [50]:
from sklearn.neighbors import NearestNeighbors
from causalinference import CausalModel
#from dowhy import CausalModel

lalonde = pd.read_csv(base_data_loc / "lalonde_data_psid.csv")  # Adjust path if necessary
print(lalonde.shape)

# Estimate the propensity model
lalonde['age2'] = lalonde['age'] ** 2
lalonde['education2'] = lalonde['education'] ** 2
lalonde['re742'] = lalonde['re74'] ** 2
lalonde['re752'] = lalonde['re75'] ** 2
lalonde['u74'] = (lalonde['re74'] == 0).astype(int)
lalonde['u75'] = (lalonde['re75'] == 0).astype(int)

covariates = ['age', 'age2', 'education', 'education2', 'black', 'hispanic', 'married', 
              'nodegree', 're74', 're742', 're75', 're752', 'u74', 'u75']


X = lalonde[covariates]
X = sm.add_constant(X)
Y = lalonde['re78']
Tr = lalonde['treat']

logit = sm.Logit(Tr, X)
propensity_model = logit.fit()
lalonde['pscore'] = propensity_model.predict(X)

min_pscore_treated = lalonde.loc[Tr == 1, 'pscore'].min()
max_pscore_treated = lalonde.loc[Tr == 1, 'pscore'].max()
lalonde = lalonde[(Tr == 1) | ((Tr == 0) & (lalonde['pscore'] >= min_pscore_treated) & (lalonde['pscore'] <= max_pscore_treated))]

nn = NearestNeighbors(n_neighbors=1, metric='euclidean')
nn.fit(lalonde.loc[Tr == 0, ['pscore']])
_, indices = nn.kneighbors(lalonde.loc[Tr == 1, ['pscore']])
matched_controls = lalonde.loc[Tr == 0].iloc[indices.flatten()]

matched_data = pd.concat([lalonde.loc[Tr == 1], matched_controls], axis=0)
cm = CausalModel(Y=matched_data['re78'].values, D=matched_data['treat'].values, X=matched_data[covariates].values)
cm.est_via_matching()

print(cm.estimates)
print(lalonde.shape)

(573, 11)
Optimization terminated successfully.
         Current function value: 0.583264
         Iterations 7

Treatment Effect Estimates: Matching

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE   1238.586   1095.906      1.130      0.258   -909.389   3386.561
           ATC   1182.083   1260.702      0.938      0.348  -1288.894   3653.059
           ATT   1295.089   1206.056      1.074      0.283  -1068.781   3658.958

(500, 18)


## Can immigrants counteract employer discrimination? A factorial field experiment reveals the immutability of ethnic hierarchies

In [51]:
data = pd.read_csv(base_data_loc / "vernby_2019.csv", index_col=0)
data.to_csv("example.csv")
model = smf.ols("invited ~ immigrant + citizen + woman+ religious+ experience+skilledjob+time + C(stad)", data=data).fit(cov_type="HC3")
model.summary()

0,1,2,3
Dep. Variable:,invited,R-squared:,0.075
Model:,OLS,Adj. R-squared:,0.067
Method:,Least Squares,F-statistic:,7.099
Date:,"Fri, 25 Jul 2025",Prob (F-statistic):,1.49e-13
Time:,14:29:33,Log-Likelihood:,-392.51
No. Observations:,1492,AIC:,813.0
Df Residuals:,1478,BIC:,887.3
Df Model:,13,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.0107,0.045,0.238,0.812,-0.077,0.099
C(stad)[T.LKPGNKPG],0.0343,0.052,0.666,0.506,-0.067,0.135
C(stad)[T.Malmo],-0.0664,0.026,-2.531,0.011,-0.118,-0.015
C(stad)[T.Orebro],-0.0435,0.048,-0.912,0.362,-0.137,0.050
C(stad)[T.Stockholm],-0.0070,0.021,-0.330,0.741,-0.049,0.035
C(stad)[T.Uppsala],0.0070,0.044,0.158,0.875,-0.080,0.094
C(stad)[T.Vasteras],-0.0826,0.041,-2.001,0.045,-0.163,-0.002
immigrant,-0.0867,0.030,-2.869,0.004,-0.146,-0.027
citizen,0.0159,0.017,0.940,0.347,-0.017,0.049

0,1,2,3
Omnibus:,565.534,Durbin-Watson:,2.039
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1561.079
Skew:,2.05,Prob(JB):,0.0
Kurtosis:,5.881,Cond. No.,32.5


In [52]:
model = smf.ols("invited ~ somalia + poland +  iraq + citizen + woman+ religious+ experience+skilledjob+time + C(stad)", data=data).fit(cov_type="HC3")
model.summary()

0,1,2,3
Dep. Variable:,invited,R-squared:,0.094
Model:,OLS,Adj. R-squared:,0.085
Method:,Least Squares,F-statistic:,8.296
Date:,"Fri, 25 Jul 2025",Prob (F-statistic):,1.9e-18
Time:,14:29:33,Log-Likelihood:,-376.81
No. Observations:,1492,AIC:,785.6
Df Residuals:,1476,BIC:,870.5
Df Model:,15,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.0170,0.045,0.382,0.703,-0.070,0.104
C(stad)[T.LKPGNKPG],0.0336,0.052,0.651,0.515,-0.068,0.135
C(stad)[T.Malmo],-0.0671,0.026,-2.572,0.010,-0.118,-0.016
C(stad)[T.Orebro],-0.0463,0.046,-1.000,0.317,-0.137,0.044
C(stad)[T.Stockholm],-0.0075,0.021,-0.356,0.722,-0.049,0.034
C(stad)[T.Uppsala],0.0070,0.044,0.160,0.873,-0.079,0.093
C(stad)[T.Vasteras],-0.0868,0.041,-2.119,0.034,-0.167,-0.007
somalia,-0.1443,0.031,-4.697,0.000,-0.205,-0.084
poland,-0.0242,0.034,-0.719,0.472,-0.090,0.042

0,1,2,3
Omnibus:,541.725,Durbin-Watson:,2.031
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1434.108
Skew:,1.98,Prob(JB):,0.0
Kurtosis:,5.719,Cond. No.,30.1


## Using geographic variation in college proximity to estimate the return to schooling

In [53]:
card_data = pd.read_csv(base_data_loc / "card_geographic.csv")
ols = smf.ols("lwage ~ educ + exper + I(exper**2) + black + south + smsa", 
              data = card_data).fit()
print(ols.summary())
iv = IV2SLS.from_formula("lwage ~ 1 + exper + black + south + smsa + [educ ~ nearc4 ]", 
                             data=card_data).fit()
iv.summary

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.291
Model:                            OLS   Adj. R-squared:                  0.289
Method:                 Least Squares   F-statistic:                     204.9
Date:                Fri, 25 Jul 2025   Prob (F-statistic):          1.52e-219
Time:                        14:29:33   Log-Likelihood:                -1308.7
No. Observations:                3010   AIC:                             2631.
Df Residuals:                    3003   BIC:                             2673.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         4.7337      0.068     70.022

0,1,2,3
Dep. Variable:,lwage,R-squared:,0.2140
Estimator:,IV-2SLS,Adj. R-squared:,0.2127
No. Observations:,3010,F-statistic:,724.38
Date:,"Fri, Jul 25 2025",P-value (F-stat),0.0000
Time:,14:29:33,Distribution:,chi2(5)
Cov. Estimator:,robust,,
,,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Intercept,3.9398,0.8176,4.8189,0.0000,2.3374,5.5422
exper,0.0623,0.0193,3.2299,0.0012,0.0245,0.1001
black,-0.1296,0.0518,-2.5023,0.0123,-0.2311,-0.0281
south,-0.1093,0.0229,-4.7662,0.0000,-0.1542,-0.0643
smsa,0.1348,0.0298,4.5317,0.0000,0.0765,0.1931
educ,0.1318,0.0488,2.7024,0.0069,0.0362,0.2275


## Randomized experiments from non-random selection in U.S. House elections

In [54]:
lee_data = pd.read_csv(base_data_loc / "lee_2008.csv")
lee_data.to_csv("lee_2008.csv")
lee_data = lee_data.dropna(subset=['demsharenext', 'demsharenext', 'difdemshare', 'difdemshare2', 'difdemshare3',
                                   'difdemshare4', 'rdifdemshare', 'rdifdemshare2', 'rdifdemshare3',
                                   'rdifdemshare4', 'right'])

formula1 = "demsharenext ~ difdemshare + difdemshare2 + difdemshare3 + difdemshare4 + " \
          "rdifdemshare + rdifdemshare2 + rdifdemshare3 + rdifdemshare4 + right"

formula2 = "demsharenext ~ difdemshare + difdemshare2 + difdemshare3 + difdemshare4 + " \
          "rdifdemshare + rdifdemshare2 + rdifdemshare3 + rdifdemshare4 + right + demofficeexp + othofficeexp"

model1 = smf.ols(formula=formula1, data=lee_data).fit(cov_type='cluster', 
                                                    cov_kwds={'groups': lee_data['statedisdec'].astype('int')})
print(model1.summary())

model2 = smf.ols(formula=formula2, data=lee_data).fit(cov_type='cluster', 
                                                    cov_kwds={'groups': lee_data['statedisdec'].astype('int')})
print(model2.summary())

                            OLS Regression Results                            
Dep. Variable:           demsharenext   R-squared:                       0.670
Model:                            OLS   Adj. R-squared:                  0.670
Method:                 Least Squares   F-statistic:                     1674.
Date:                Fri, 25 Jul 2025   Prob (F-statistic):               0.00
Time:                        14:29:33   Log-Likelihood:                 5204.3
No. Observations:                9174   AIC:                        -1.039e+04
Df Residuals:                    9164   BIC:                        -1.032e+04
Df Model:                           9                                         
Covariance Type:              cluster                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         0.4445      0.008     58.789

In [55]:
abortion_bf15 = pd.read_csv(base_data_loc / "abortion_bf15.csv")
formula = ("lnr ~ repeal + C(year) + C(fip) + acc + ir + pi + alcohol + crack + poverty + income + ur")
reg = (smf.wls(formula, data=abortion_bf15, weights=abortion_bf15.totpop.values).fit(cov_type='cluster', 
        cov_kwds={'groups': abortion_bf15.fip.values}, method='pinv'))
print(reg.summary())

                            WLS Regression Results                            
Dep. Variable:                    lnr   R-squared:                       0.838
Model:                            WLS   Adj. R-squared:                  0.821
Method:                 Least Squares   F-statistic:                     2033.
Date:                Fri, 25 Jul 2025   Prob (F-statistic):           1.73e-66
Time:                        14:29:33   Log-Likelihood:                   -inf
No. Observations:                 737   AIC:                               inf
Df Residuals:                     663   BIC:                               inf
Df Model:                          73                                         
Covariance Type:              cluster                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept             7.8378      1.12

  llf += 0.5 * np.sum(np.log(self.weights))


In [56]:

abortion_bm15 = pd.read_csv(base_data_loc / "abortion_bm15.csv")
formula = ("lnr ~ C(repeal) + C(year) + C(fip) + acc + ir + pi + alcohol + crack + poverty + income + ur")
reg = (smf.wls(formula, data=abortion_bm15, weights=abortion_bm15.totpop.values).fit(cov_type='cluster', 
        cov_kwds={'groups': abortion_bm15.fip.values}, method='pinv'))
print(reg.summary())

                            WLS Regression Results                            
Dep. Variable:                    lnr   R-squared:                       0.876
Model:                            WLS   Adj. R-squared:                  0.863
Method:                 Least Squares   F-statistic:                     459.4
Date:                Fri, 25 Jul 2025   Prob (F-statistic):           2.14e-50
Time:                        14:29:33   Log-Likelihood:                   -inf
No. Observations:                 755   AIC:                               inf
Df Residuals:                     681   BIC:                               inf
Df Model:                          73                                         
Covariance Type:              cluster                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept             6.5599      1.52

  llf += 0.5 * np.sum(np.log(self.weights))


## Black Politicians Are More Intrinsically Motivated to Advance Blacks’ Interests: A Field Experiment Manipulating Political Incentives

In [57]:
br = pd.read_csv(base_data_loc / "broockman_intrinsic.csv")
reg1 = smf.ols("responded~treat_out", data=br).fit()
reg2 = smf.ols("responded ~ treat_out * leg_black", data=br).fit()
print(reg1.summary())
print(reg2.summary())

                            OLS Regression Results                            
Dep. Variable:              responded   R-squared:                       0.073
Model:                            OLS   Adj. R-squared:                  0.072
Method:                 Least Squares   F-statistic:                     437.4
Date:                Fri, 25 Jul 2025   Prob (F-statistic):           1.44e-93
Time:                        14:29:33   Log-Likelihood:                -3781.3
No. Observations:                5593   AIC:                             7567.
Df Residuals:                    5591   BIC:                             7580.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.5551      0.009     61.881      0.0

In [58]:
from cem.coarsen import coarsen
from cem.match import match
from cem.imbalance import L1
import statsmodels.formula.api as smf

df = br
T = "leg_black"

df["leg_democrat_2"] = pd.qcut(df["leg_democrat"], q=2, duplicates='drop')
df["treatXblack"] = df["treat_out"] * df["leg_black"] 
X = df[["medianhhincom", "blackpercent", "leg_democrat", "leg_black"]]

X_coarse = coarsen(X, T)
weights = match(X_coarse, T)

df["cem_weights"] = weights

formula = ("responded ~ treat_out + treatXblack + leg_black + nonblacknonwhite + "
    "leg_democrat + leg_senator + south + blackpercent + black_medianhh + "
    "white_medianhh + statessquireindex + totalpop + urbanpercent")

model = smf.wls(formula=formula, data=df, weights=df["cem_weights"]).fit()
print(model.summary())

  for level, group in data.groupby(treatment):
  level_strata_counts[level] = tg.groupby(strata_cols, observed=True).apply(lambda g: weights.loc[g.index].sum()).to_dict()
  for level, group in data.groupby(treatment):
  level_strata_counts[level] = tg.groupby(strata_cols, observed=True).apply(lambda g: weights.loc[g.index].sum()).to_dict()
  level_strata_counts[level] = tg.groupby(strata_cols, observed=True).apply(lambda g: weights.loc[g.index].sum()).to_dict()
  for level, group in data.groupby(treatment):
  level_strata_counts[level] = tg.groupby(strata_cols, observed=True).apply(lambda g: weights.loc[g.index].sum()).to_dict()
  level_strata_counts[level] = tg.groupby(strata_cols, observed=True).apply(lambda g: weights.loc[g.index].sum()).to_dict()
  for level, group in data.groupby(treatment):
  level_strata_counts[level] = tg.groupby(strata_cols, observed=True).apply(lambda g: weights.loc[g.index].sum()).to_dict()
  level_strata_counts[level] = tg.groupby(strata_cols, observed=True

0.839714168027421


  matched = data.groupby(gb).filter(lambda x: x[treatment].nunique() == data[treatment].nunique())
  weights = pd.concat([_weight_stratum(stratum[treatment], global_level_counts) for _, stratum in matched.groupby(gb)])


                            WLS Regression Results                            
Dep. Variable:              responded   R-squared:                       0.176
Model:                            WLS   Adj. R-squared:                  0.174
Method:                 Least Squares   F-statistic:                     91.68
Date:                Fri, 25 Jul 2025   Prob (F-statistic):          6.20e-223
Time:                        14:29:34   Log-Likelihood:                   -inf
No. Observations:                5593   AIC:                               inf
Df Residuals:                    5579   BIC:                               inf
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept             0.7725      0.05

  llf += 0.5 * np.sum(np.log(self.weights))


## Does Strengthening Self-Defense Law Deter Crime or Escalate Violence? Evidence from Castle Doctrine

In [59]:
castle = pd.read_csv(base_data_loc / "castle.csv")
dd_simple = smf.ols("l_homicide ~ cdl + C(year) + C(sid)", data=castle).fit(cov_type='cluster', 
                                                                        cov_kwds={'groups':castle['sid']})
print(dd_simple.params["cdl"], dd_simple.bse["cdl"])

0.08770138681115834 0.06689912483194875


In [60]:
dd_simple = smf.ols("l_larceny ~ cdl + C(year) + C(sid)", data=castle).fit(cov_type='cluster', 
                                                                        cov_kwds={'groups':castle['sid']})
print(dd_simple.params["cdl"], dd_simple.bse["cdl"])

0.007453541783757872 0.02382419194779041


In [61]:
dd_simple = smf.ols("l_motor ~ cdl + C(year) + C(sid)", data=castle).fit(cov_type='cluster', 
                                                                        cov_kwds={'groups':castle['sid']})
print(dd_simple.params["cdl"], dd_simple.bse["cdl"])

0.07665384953479379 0.043346859238072175


In [62]:
dd_simple = smf.ols("l_burglary ~ cdl + C(year) + C(sid)", data=castle).fit(cov_type='cluster', 
                                                                        cov_kwds={'groups':castle['sid']})
print(dd_simple.params["cdl"], dd_simple.bse["cdl"])

0.05724631958241479 0.028483578574048464


## Government Transfers and Political Support

In [63]:
df = pd.read_csv(base_data_loc / "gov_transfers.csv")
m1 = smf.ols('Support~Income_Centered*Participation + I(Income_Centered**2)*Participation', data=df).fit()
print(m1.summary())

                            OLS Regression Results                            
Dep. Variable:                Support   R-squared:                       0.036
Model:                            OLS   Adj. R-squared:                  0.034
Method:                 Least Squares   F-statistic:                     14.63
Date:                Fri, 25 Jul 2025   Prob (F-statistic):           4.24e-14
Time:                        14:29:34   Log-Likelihood:                -496.76
No. Observations:                1948   AIC:                             1006.
Df Residuals:                    1942   BIC:                             1039.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------

## Don't Take 'No' For An Answer: An Experiment With Actual Organ Donor Registrations

In [64]:
od = pd.read_csv(base_data_loc / "organ_donations.csv")

od['Post'] = (od['Quarter_Num'] > 3).astype(int)
od['California'] = (od['State'] == 'California').astype(int)

od['Post_California'] = od['Post'] * od['California']
did_model = smf.ols('Rate ~ Post + California + Post_California', 
                    data=od).fit(cov_type='cluster', cov_kwds={'groups': od['State']})

print(did_model.summary())

                            OLS Regression Results                            
Dep. Variable:                   Rate   R-squared:                       0.054
Model:                            OLS   Adj. R-squared:                  0.036
Method:                 Least Squares   F-statistic:                     32.82
Date:                Fri, 25 Jul 2025   Prob (F-statistic):           7.73e-08
Time:                        14:29:34   Log-Likelihood:                 78.895
No. Observations:                 162   AIC:                            -149.8
Df Residuals:                     158   BIC:                            -137.4
Df Model:                           3                                         
Covariance Type:              cluster                                         
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept           0.4449      0.031     



## The Demand for, and Impact of, Learning Hiv Status.

In [65]:
df = pd.read_csv(base_data_loc / "thornton_hiv.csv")
df = df[["got", "any", "tinc", "male", "hiv2004", "age", "rumphi", "balaka","over", "under", "age2", "villnum"]]
df.to_csv("thornton_hiv.csv", index=False)
model = smf.ols('got ~ any + male + hiv2004 + age+ + rumphi + balaka', data=df)
results = model.fit(cov_type='cluster', cov_kwds={'groups': df['villnum']})

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                    got   R-squared:                       0.181
Model:                            OLS   Adj. R-squared:                  0.179
Method:                 Least Squares   F-statistic:                     77.85
Date:                Fri, 25 Jul 2025   Prob (F-statistic):           1.11e-38
Time:                        14:29:34   Log-Likelihood:                -1535.7
No. Observations:                2812   AIC:                             3085.
Df Residuals:                    2805   BIC:                             3127.
Df Model:                           6                                         
Covariance Type:              cluster                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.3924      0.038     10.305      0.0

## Do Voters Affect or Elect Policies: Evidence from the U.S. House

In [66]:

from stargazer.stargazer import Stargazer

def lm_robust(formula, data):
    regression = sm.OLS.from_formula(formula, data = data).fit()
    #regression = regression.fit(cov_type="cluster",cov_kwds={"groups":data['id']})
    return regression

lmb_data = pd.read_csv(base_data_loc / "close_elections.csv")
lmb_data = lmb_data[lmb_data.demvoteshare.between(.45, .55)]

lmb_data['demvoteshare_c'] = lmb_data['demvoteshare'] - 0.5
lmb_data['lagdemvoteshare_c'] = lmb_data['lagdemvoteshare'] - 0.5
lmb_data = lmb_data[~pd.isnull(lmb_data.demvoteshare_c)]

lm_2 = lm_robust('score ~ democrat*demvoteshare_c', data = lmb_data)
lm_3 = lm_robust('democrat ~ lagdemocrat*lagdemvoteshare_c', data = lmb_data)
print("Original results based on ADA Scores -- Full Sample with linear interactions")
Stargazer([lm_2, lm_3])

Original results based on ADA Scores -- Full Sample with linear interactions


0,1,2
,,
,,
,(1),(2)
,,
Intercept,18.097***,0.429***
,(1.207),(0.020)
democrat,46.778***,
,(1.735),
democrat:demvoteshare_c,-91.115,
,(60.273),


## The effects of rural electrification in India: An instrumental variable approach at the household level

In [67]:
data = pd.read_csv(base_data_loc / "electrification_data.csv")
data['Pole'] = data['Pole'].astype('category')
vars = ['total_expenditure', 'treat', 'forcing', 'gender', 'birthplace', 
               'age', 'religion', 'caste', 'Pole']

data_sub = data.dropna(subset=vars)

iv_formula = 'total_expenditure ~ 1 + gender + birthplace + age + religion + caste + C(Pole) + [treat ~ forcing]'
iv = IV2SLS.from_formula(iv_formula, data=data_sub).fit(cov_type='clustered', clusters=data_sub['Pole'])
print(iv.params["treat"], iv.std_errors["treat"])
print(iv.summary)

#iv1_formula = 'total_expenditure ~ 1 + gender + birthplace + age + religion + caste + C(Pole) + [treat ~ forcing]'
#iv1 = IV2SLS.from_formula(iv1_formula, data=data).fit()
#print(iv2.summary)

4509.398955404442 587.8142908621033
                          IV-2SLS Estimation Summary                          
Dep. Variable:      total_expenditure   R-squared:                      0.3349
Estimator:                    IV-2SLS   Adj. R-squared:                 0.1351
No. Observations:                 685   F-statistic:                -7.327e+17
Date:                Fri, Jul 25 2025   P-value (F-stat)                1.0000
Time:                        14:29:34   Distribution:                chi2(158)
Cov. Estimator:             clustered                                         
                                                                              
                                 Parameter Estimates                                 
                   Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
-------------------------------------------------------------------------------------
Intercept             6864.8     1063.5     6.4551     0.0000      4780.4 

In [68]:
vars = ['food_expenditure', 'treat', 'forcing', 'gender', 'birthplace', 
               'age', 'religion', 'caste', 'Pole']

data_sub = data.dropna(subset=vars)

iv_formula = 'food_expenditure ~ 1 + gender + birthplace + age + religion + caste + C(Pole) + [treat ~ forcing]'
iv = IV2SLS.from_formula(iv_formula, data=data_sub).fit(cov_type='clustered', clusters=data_sub['Pole'])
print(iv.params["treat"], iv.std_errors["treat"])
print(iv.summary)

3023.2639370650486 485.8723973822213
                          IV-2SLS Estimation Summary                          
Dep. Variable:       food_expenditure   R-squared:                      0.2881
Estimator:                    IV-2SLS   Adj. R-squared:                 0.0746
No. Observations:                 686   F-statistic:                 2.409e+18
Date:                Fri, Jul 25 2025   P-value (F-stat)                0.0000
Time:                        14:29:35   Distribution:                chi2(158)
Cov. Estimator:             clustered                                         
                                                                              
                                 Parameter Estimates                                 
                   Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
-------------------------------------------------------------------------------------
Intercept             6030.7     898.66     6.7107     0.0000      4269.3

In [69]:
vars = ['education_expenditure', 'treat', 'forcing', 'gender', 'birthplace', 
               'age', 'religion', 'caste', 'Pole']

data_sub = data.dropna(subset=vars)

iv_formula = 'education_expenditure ~ 1 + gender + birthplace + age + religion + caste + C(Pole) + [treat ~ forcing]'
iv = IV2SLS.from_formula(iv_formula, data=data_sub).fit(cov_type='clustered', clusters=data_sub['Pole'])
print(iv.params["treat"], iv.std_errors["treat"])
print(iv.summary)

654.9923815244526 209.76903171150985
                            IV-2SLS Estimation Summary                           
Dep. Variable:     education_expenditure   R-squared:                      0.2953
Estimator:                       IV-2SLS   Adj. R-squared:                 0.0840
No. Observations:                    686   F-statistic:                -1.187e+19
Date:                   Fri, Jul 25 2025   P-value (F-stat)                1.0000
Time:                           14:29:35   Distribution:                chi2(158)
Cov. Estimator:                clustered                                         
                                                                                 
                                 Parameter Estimates                                 
                   Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
-------------------------------------------------------------------------------------
Intercept             242.21     361.71     0.669

In [70]:
vars = ['kerosene_expenditure', 'treat', 'forcing', 'gender', 'birthplace', 
               'age', 'religion', 'caste', 'Pole']

data_sub = data.dropna(subset=vars)

iv_formula = 'kerosene_expenditure ~ 1 + gender + birthplace + age + religion + caste + C(Pole) + [treat ~ forcing]'
iv = IV2SLS.from_formula(iv_formula, data=data_sub).fit(cov_type='clustered', clusters=data_sub['Pole'])
print(iv.params["treat"], iv.std_errors["treat"])
print(iv.summary)

17.55694864864168 8.610676695544104
                           IV-2SLS Estimation Summary                           
Dep. Variable:     kerosene_expenditure   R-squared:                      0.3017
Estimator:                      IV-2SLS   Adj. R-squared:                 0.0923
No. Observations:                   686   F-statistic:                -6.264e+18
Date:                  Fri, Jul 25 2025   P-value (F-stat)                1.0000
Time:                          14:29:36   Distribution:                chi2(158)
Cov. Estimator:               clustered                                         
                                                                                
                                 Parameter Estimates                                 
                   Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
-------------------------------------------------------------------------------------
Intercept             65.893     13.477     4.8893     0.0

## Restructuring Research: Communication Costs and the Democratization of University Innovation

In [None]:
## Commenting this out. This is heavy csv file. The kernel might crash. 

from linearmodels.panel import PanelOLS
'''
df = pd.read_csv(base_data_loc / "bitnet_paired.csv")
variables = ['dcoauths1_2', 'bothbitnet', 'lag_sum_ee_rd', 'lag_sum_doc',
             'lag_sum_postdocs', 'totsoloauths', 'gpinst', 'year']
df = df.dropna(subset=variables)

# Set panel index: entity = gpinst, time = year
df = df.set_index(['gpinst', 'year'])

# Define dependent and independent variables
y = df['dcoauths1_2']
X = df[['bothbitnet', 'lag_sum_ee_rd', 'lag_sum_doc',
        'lag_sum_postdocs', 'totsoloauths']]

# Add a constant manually (PanelOLS does not do it automatically)
X = sm.add_constant(X)

# Estimate fixed effects model with entity and time fixed effects
model = PanelOLS(y, X, entity_effects=True, time_effects=True)
results = model.fit(cov_type='clustered', cluster_entity=True)

# Print results
print(results.summary)
'''


'\ndf = pd.read_csv(base_data_loc / "bitnet_paired.csv")\nvariables = [\'dcoauths1_2\', \'bothbitnet\', \'lag_sum_ee_rd\', \'lag_sum_doc\',\n             \'lag_sum_postdocs\', \'totsoloauths\', \'gpinst\', \'year\']\ndf = df.dropna(subset=variables)\n\n# Set panel index: entity = gpinst, time = year\ndf = df.set_index([\'gpinst\', \'year\'])\n\n# Define dependent and independent variables\ny = df[\'dcoauths1_2\']\nX = df[[\'bothbitnet\', \'lag_sum_ee_rd\', \'lag_sum_doc\',\n        \'lag_sum_postdocs\', \'totsoloauths\']]\n\n# Add a constant manually (PanelOLS does not do it automatically)\nX = sm.add_constant(X)\n\n# Estimate fixed effects model with entity and time fixed effects\nmodel = PanelOLS(y, X, entity_effects=True, time_effects=True)\nresults = model.fit(cov_type=\'clustered\', cluster_entity=True)\n\n# Print results\nprint(results.summary)\n'

## Minimum Wages and Employment:A Case Study of the Fast-Food Industry in New Jersey and Pennsylvania

In [72]:
df_min = pd.read_csv(base_data_loc / "min_wage_data.csv", index_col=0)
did_model = smf.ols("empft ~ after * state", data=df_min).fit()
print(did_model.summary())

                            OLS Regression Results                            
Dep. Variable:                  empft   R-squared:                       0.013
Model:                            OLS   Adj. R-squared:                  0.009
Method:                 Least Squares   F-statistic:                     3.514
Date:                Fri, 25 Jul 2025   Prob (F-statistic):             0.0149
Time:                        14:29:36   Log-Likelihood:                -2880.7
No. Observations:                 808   AIC:                             5769.
Df Residuals:                     804   BIC:                             5788.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept      10.2051      0.971     10.512      

In [73]:
import pandas as pd
import numpy as np
from dowhy import CausalModel

data = pd.read_csv(base_data_loc / "fda_carpenter.csv")
data['hospdisc'] /= 100000
data['natreg'] /= 100
data['stafcder'] /= 100
data['prevgenx'] /= 100
data['hhosleng'] /= 10
data['condavg3'] /= 10
data['orderent'] /= 10
data['vandavg3'] /= 10
data['wpnoavg3'] /= 100

treatment_var = 'demsnmaj'
outcome_var = 'acttime'
covariates = ['orderent', 'prevgenx', 'lethal', 'deathrt1', 'hosp01', 'hospdisc', 'hhosleng',
              'femdiz01', 'mandiz01', 'peddiz01', 'acutediz', 'orphdum', 'natreg', 'wpnoavg3',
              'vandavg3', 'condavg3', 'stafcder', 'hospdisc']

logit = LogisticRegression(max_iter=1000)
logit.fit(data[covariates], data[treatment_var])
data['propensity_score'] = logit.predict_proba(data[covariates])[:, 1]

# Trim to common support
#treated_ps = data.loc[data[treatment_var] == 1, 'propensity_score']
#control_ps = data.loc[data[treatment_var] == 0, 'propensity_score']
#common_support = data[(data['propensity_score'] >= treated_ps.min()) &
#                      (data['propensity_score'] <= control_ps.max())]

common_causes = data.columns.drop([treatment_var, outcome_var, 'd']).tolist()

model = CausalModel(data=data,treatment=treatment_var,outcome=outcome_var, common_causes=common_causes)
identified_estimand = model.identify_effect()
estimate = model.estimate_effect(identified_estimand, method_name="backdoor.distance_matching",
                                 target_units="att")

print('ATT Estimate:', estimate.value)
print("SE:", estimate.get_standard_error())



ATT Estimate: -17.844994641717786
SE: 8.302135786241353


## Markets: The Fulton Fish Market

In [76]:
df = pd.read_csv(base_data_loc / "fulton.csv")
iv_model = IV2SLS.from_formula("q ~ 1 + Mon+Tue+Wed+Thu + [p ~ Stormy]",data=df).fit(cov_type='robust')

print(iv_model.summary)

                          IV-2SLS Estimation Summary                          
Dep. Variable:                      q   R-squared:                      0.1391
Estimator:                    IV-2SLS   Adj. R-squared:                 0.0981
No. Observations:                 111   F-statistic:                    24.946
Date:                Fri, Jul 25 2025   P-value (F-stat)                0.0001
Time:                        14:30:45   Distribution:                  chi2(5)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept      8.5059     0.1479     57.510     0.0000      8.2160      8.7958
Mon           -0.0254     0.2154    -0.1179     0.90

##  Punishment and Deterrence: Evidence from Drunk Driving

In [74]:
from rdd import rdd

# Load data

df = pd.read_csv(base_data_loc / "hansen.csv")
df['dui'] = (df['bac1'] > 0.08).astype(int)
df = df[(df['bac1'] > 0.03) & (df['bac1'] < 0.13)]
df['bac1_orig'] = df['bac1']
#df['bac1'] = df['bac1'] - 0.08
data = df[['recidivism', 'bac1']].dropna()
est = rdd.rdd(data, 'bac1', 'recidivism', cut=0.08).fit()
print("\nRD Estimate:")
print(est.summary())

Estimation Equation:	 recidivism ~ TREATED + bac1

RD Estimate:
                            WLS Regression Results                            
Dep. Variable:             recidivism   R-squared:                       0.000
Model:                            WLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     20.97
Date:                Fri, 25 Jul 2025   Prob (F-statistic):           7.88e-10
Time:                        14:29:48   Log-Likelihood:                -21539.
No. Observations:               88373   AIC:                         4.308e+04
Df Residuals:                   88370   BIC:                         4.311e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------

## Children and Their Parent's Labor Supply: Evidence from Exogenous Variation in Family Size

In [None]:
## Commenting this out. This is heavy csv file. The kernel might crash. 
'''
df = pd.read_csv(base_data_loc / "labor_supply.csv").map(
    lambda x: x.strip().lstrip("b'").rstrip("'") if isinstance(x, str) else x
)
for col in ['AGEM','KIDCOUNT','WEEKSM']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

df['BOY1ST'] = (df['SEXK'] == '0').astype(int)
df['BOY2ND'] = (df['SEX2ND'] == '0').astype(int)
df['SAMESEX'] = ((df['BOY1ST'] & df['BOY2ND']) | (~df['BOY1ST'] & ~df['BOY2ND'])).astype(int)
df['MOREKIDS'] = (df['KIDCOUNT'] > 2).astype(int)
df['BLACKM'] = (df['RACEM'] == '02').astype(int)
df['HISPM'] = (df['RACEM'] == '12').astype(int)
df['OTHRACEM'] = (~df['RACEM'].isin(['01','02','12'])).astype(int)
df['WORKEDM'] = (df['WEEKSM'] > 0).astype(int)

sample = df[(df['AGEM'] >= 21) & (df['AGEM'] <= 35) & (df['KIDCOUNT'] >= 2)].dropna(subset=[
    'WORKEDM','MOREKIDS','AGEM','BOY1ST','BOY2ND',
    'BLACKM','HISPM','OTHRACEM','SAMESEX'
])

exog = add_constant(sample[['AGEM','BOY1ST','BOY2ND','BLACKM','HISPM','OTHRACEM']])
endog = sample['MOREKIDS']
instr = sample[['SAMESEX']]

model = IV2SLS(
    dependent=sample['WORKEDM'],
    exog=exog,
    endog=endog,
    instruments=instr
)
results = model.fit(cov_type='unadjusted')

print(results.summary)
'''

FileNotFoundError: [Errno 2] No such file or directory: 'data/labor_supply.csv'