## Solutions

In [80]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf 

### Exercise 1

In [81]:
from linearmodels.iv import IV2SLS

# Load the data set and drop the observations where `linc` is missing.
data = pd.read_stata('data/mus06data.dta').dropna(subset=['linc'])
# The bracketed term marks hi_empunion as the endogenous regressor, with
# multlc and firmsz as its instruments.
formula = (
    'ldrugexp ~ 1 + totchr + female + age + linc + blhisp'
    ' + [hi_empunion ~ multlc+firmsz]'
)
res = IV2SLS.from_formula(formula, data).fit(cov_type='robust')
print(res.first_stage)
# Show the summary text from its ninth line onward (the first eight lines are skipped).
print(*res.summary.as_text().split('\n')[8:], sep='\n')

    First Stage Estimation Results    
                           hi_empunion
--------------------------------------
R-squared                       0.0646
Partial R-squared               0.0058
Shea's R-squared                0.0058
Partial F-statistic             58.687
P-value (Partial F-stat)     1.804e-13
Partial F-stat Distn           chi2(2)
Intercept                       0.9010
                              (15.276)
totchr                          0.0109
                              (2.9662)
female                         -0.0794
                             (-8.2030)
age                            -0.0092
                             (-12.926)
linc                            0.0722
                              (11.607)
blhisp                         -0.0743
                             (-6.0078)
multlc                          0.1419
                              (6.8044)
firmsz                          0.0040
                              (2.1246)
-------------------------

In [82]:
from linearmodels.iv.results import compare

# The formula has no bracketed endogenous term, so hi_empunion enters as an
# ordinary regressor; the comparison table labels this fit as OLS.
res_ols = IV2SLS.from_formula(
    'ldrugexp ~ 1 + hi_empunion + totchr + female + age + linc + blhisp',
    data,
).fit(cov_type='robust')
compare([res, res_ols])

0,1,2
,Model 0,Model 1
Dep. Variable,ldrugexp,ldrugexp
Estimator,IV-2SLS,OLS
No. Observations,10089,10089
Cov. Est.,robust,robust
R-squared,-0.1220,0.1770
Adj. R-squared,-0.1226,0.1765
F-statistic,1643.9,2262.6
P-value (F-stat),0.0000,0.0000
==================,===========,===========


In [93]:
# Wu-Hausman test of exogeneity for the IV fit `res`
# (H0: all endogenous variables are exogenous).
res.wu_hausman()

Wu-Hausman test of exogeneity
H0: All endogenous variables are exogenous
Statistic: 21.3325
P-value: 0.0000
Distributed: F(1,10081)
WaldTestStatistic, id: 0x21fa4751c48

In [85]:
# Sargan's test of overidentification for the IV fit `res`.
res.sargan

Sargan's test of overidentification
H0: The model is not overidentified.
Statistic: 2.6073
P-value: 0.1064
Distributed: chi2(1)
WaldTestStatistic, id: 0x21fa45f2548

In [88]:
from linearmodels.iv import IVGMM

# Re-estimate the specification in `formula` with the IV-GMM estimator,
# again with a robust covariance.
res_gmm = IVGMM.from_formula(formula, data).fit(cov_type='robust')
print(res_gmm.first_stage)
# Skip the first eight lines of the summary text and print the remainder.
print(*res_gmm.summary.as_text().split('\n')[8:], sep='\n')

    First Stage Estimation Results    
                           hi_empunion
--------------------------------------
R-squared                       0.0646
Partial R-squared               0.0058
Shea's R-squared                0.0058
Partial F-statistic             58.687
P-value (Partial F-stat)     1.804e-13
Partial F-stat Distn           chi2(2)
Intercept                       0.9010
                              (15.276)
totchr                          0.0109
                              (2.9662)
female                         -0.0794
                             (-8.2030)
age                            -0.0092
                             (-12.926)
linc                            0.0722
                              (11.607)
blhisp                         -0.0743
                             (-6.0078)
multlc                          0.1419
                              (6.8044)
firmsz                          0.0040
                              (2.1246)
-------------------------

In [90]:
# C-statistic for the GMM fit (H0: all endogenous variables are exogenous).
res_gmm.c_stat()

C-statistic
H0: All endogenous variables are exogenous
Statistic: 16.8790
P-value: 0.0000
Distributed: chi2(1)
WaldTestStatistic, id: 0x21fa44ed188

In [9]:
# J statistic of the GMM fit (H0: the expected moment conditions are equal to 0).
res_gmm.j_stat

H0: Expected moment conditions are equal to 0
Statistic: 2.0259
P-value: 0.1546
Distributed: chi2(1)
WaldTestStatistic, id: 0x21fcc5fd988

### Exercise 2

In [11]:
from linearmodels.iv import IVLIML
# Fit the specification in `formula` by LIML and tabulate it next to the
# 2SLS (`res`) and GMM (`res_gmm`) fits.
res_liml = IVLIML.from_formula(
    formula,
    data,
).fit(cov_type='robust')
compare([
    res,
    res_liml,
    res_gmm,
])

0,1,2,3
,Model 0,Model 1,Model 2
Dep. Variable,ldrugexp,ldrugexp,ldrugexp
Estimator,IV-2SLS,IV-LIML,IV-GMM
No. Observations,10089,10089,10089
Cov. Est.,robust,robust,robust
R-squared,-0.1220,-0.1503,-0.1067
Adj. R-squared,-0.1226,-0.1510,-0.1074
F-statistic,1643.9,1601.9,1667.5
P-value (F-stat),0.0000,0.0000,0.0000
==================,===========,===========,===========


### Exercise 3

In [12]:
# All four fits share the same structural equation and endogenous regressor;
# only the instrument list changes, growing by one variable per specification.
# Building the formulas from a single template keeps the shared part in one
# place so the specifications cannot drift apart.
base_spec = 'ldrugexp ~ 1 + totchr + female + age + linc + blhisp'
instrument_sets = [
    'ssiratio',
    'ssiratio+lowincome',
    'ssiratio+lowincome+multlc',
    'ssiratio+lowincome+multlc+firmsz',
]
res1, res2, res3, res4 = [
    IV2SLS.from_formula(
        '{} + [hi_empunion ~ {}]'.format(base_spec, instruments), data
    ).fit(cov_type='robust')
    for instruments in instrument_sets
]
compare([res1, res2, res3, res4])

0,1,2,3,4
,Model 0,Model 1,Model 2,Model 3
Dep. Variable,ldrugexp,ldrugexp,ldrugexp,ldrugexp
Estimator,IV-2SLS,IV-2SLS,IV-2SLS,IV-2SLS
No. Observations,10089,10089,10089,10089
Cov. Est.,robust,robust,robust,robust
R-squared,0.0640,0.1030,0.0809,0.0720
Adj. R-squared,0.0634,0.1025,0.0804,0.0715
F-statistic,2000.9,2084.7,2036.3,2017.7
P-value (F-stat),0.0000,0.0000,0.0000,0.0000
==================,===========,===========,===========,===========


### Exercise 4

In [13]:
# First-stage diagnostics table for `res`: R-squared, partial R-squared,
# Shea's R-squared and the partial F-statistic with its p-value and distribution.
res.first_stage.diagnostics

Unnamed: 0,rsquared,partial.rsquared,shea.rsquared,f.stat,f.pval,f.dist
hi_empunion,0.064626,0.005774,0.005774,58.686708,1.804112e-13,chi2(2)


### Exercise 5

In [20]:
from linearmodels import IV3SLS

# Two-equation system keyed by dependent variable: each equation lists the
# other equation's dependent variable among its regressors.
# NOTE(review): neither formula uses the bracket syntax `[endog ~ instruments]`
# that the IV2SLS formulas in this notebook use, so it looks like hi_empunion
# and ldrugexp enter as ordinary regressors here and nothing is instrumented —
# confirm against the IV3SLS formula documentation whether that is intended.
ldrugexp='ldrugexp ~ 1 + hi_empunion + totchr + age + female + linc + blhisp'
hi_empunion='hi_empunion ~ 1 + ldrugexp + totchr + female + blhisp + ssiratio'
equations = dict(ldrugexp=ldrugexp, hi_empunion=hi_empunion)
res_3sls = IV3SLS.from_formula(equations, data).fit(cov_type='robust')
# Print lines 9-20 of the system summary text, then the summary of `res`
# from its ninth line onward.
print('\n'.join(res_3sls.summary.as_text().split('\n')[8:20]))
print('\n'.join(res.summary.as_text().split('\n')[8:]))

                                                                              
                Equation: ldrugexp, Dependent Variable: ldrugexp               
             Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
-------------------------------------------------------------------------------
Intercept       5.8251     0.1270     45.873     0.0000      5.5762      6.0740
hi_empunion     0.1532     0.0210     7.2982     0.0000      0.1120      0.1943
totchr          0.4399     0.0076     58.069     0.0000      0.4251      0.4548
age            -0.0034     0.0016    -2.1638     0.0305     -0.0065     -0.0003
female          0.0652     0.0205     3.1758     0.0015      0.0250      0.1054
linc            0.0071     0.0110     0.6474     0.5173     -0.0145      0.0288
blhisp         -0.1457     0.0276    -5.2791     0.0000     -0.1998     -0.0916
                                                                              
                              Parameter Es

### Exercise 6

In [25]:
# Open the Stata file as a reader only to pull out its variable labels.
# The reader holds the file open, so use it as a context manager to make sure
# the handle is closed as soon as the labels have been read.
with pd.read_stata('data/mus06klingdata.dta', iterator=True) as labels:
    variable_labels = labels.variable_labels()
variable_labels

{'black': 'Race (r0002300)',
 'grade76': "'76 Grade level",
 'smsa66': '= lived in SMSA in 1966',
 'smsa76': '= lived in SMSA in 1976',
 'col4': 'If any 4-year college nearby',
 'mcol4': 'If male 4-year college nearby',
 'col4pub': 'If public 4-year college nearby',
 'south76': 'If lived in South in 1976',
 'wage76': "'76 Wage",
 'exp76': "'76 experience (10+age66)-grade76-6",
 'expsq76': "'76 experience = exp76 ^2/100",
 'age76': "'76 age (age66 +10)",
 'agesq76': "'76 age squared (age76^2)",
 'reg1': 'region==NE',
 'reg2': '= lived in Region 2 MidAtl',
 'reg3': '= lived in Region 3 ENC',
 'reg4': '= lived in Region 4 WNC',
 'reg5': '= lived in Region 5 SA',
 'reg6': '= lived in Region 6 ESC',
 'reg7': '= lived in Region 7 WSC',
 'reg8': '= lived in Region 8 M',
 'reg9': '= lived in Region 9 P',
 'momdad14': '= lived with both parents at age 14',
 'sinmom14': '= lived with mother only at age 14',
 'nodaded': '= father has no formal education',
 'nomomed': '= mother has no formal educa

In [27]:
data = pd.read_stata('data/mus06klingdata.dta')
# Three endogenous regressors (grade76, exp76, expsq76) instrumented by
# col4, age76 and agesq76; the remaining terms are included as exogenous.
# The formula is split over adjacent literals purely for readability.
formula = (
    'wage76 ~ 1 + black+south76+smsa76'
    '+reg2+reg3+reg4+reg5+reg6+reg7+reg8+reg9'
    '+smsa66+sinmom14+nodaded+nomomed+daded+momed'
    '+famed1+famed2+famed3+famed4+famed5+famed6+famed7+famed8'
    ' + [grade76+exp76+expsq76 ~ col4+age76+agesq76]'
)
res = IV2SLS.from_formula(formula, data).fit(cov_type='robust')
print(res.first_stage)
# Skip the first eight lines of the summary text and print the remainder.
print(*res.summary.as_text().split('\n')[8:], sep='\n')

                First Stage Estimation Results                
                               grade76       exp76     expsq76
--------------------------------------------------------------
R-squared                       0.2953      0.7056      0.6754
Partial R-squared               0.0081      0.6411      0.6094
Shea's R-squared                0.0062      0.0860      0.0823
Partial F-statistic             26.010      5318.9      3562.5
P-value (Partial F-stat)      9.49e-06      0.0000      0.0000
Partial F-stat Distn           chi2(3)     chi2(3)     chi2(3)
Intercept                      -4.0120     -1.9880      6.6009
                             (-1.0468)   (-0.5187)    (7.7352)
black                          -0.4444      0.4444      0.0810
                             (-3.5571)    (3.5571)    (2.7505)
south76                        -0.1231      0.1231      0.0456
                             (-0.7702)    (0.7702)    (1.4659)
smsa76                          0.7460     -0.7460     

In [29]:
# Pairwise correlations between the three endogenous regressors and the
# three instruments used in the IV fit.
data[['grade76', 'exp76', 'expsq76', 'col4', 'age76', 'agesq76']].corr()

Unnamed: 0,grade76,exp76,expsq76,col4,age76,agesq76
grade76,1.0,-0.652956,-0.631521,0.14424,-0.00874,-0.011597
exp76,-0.652956,1.0,0.967202,-0.061621,0.763074,0.764028
expsq76,-0.631521,0.967202,1.0,-0.064001,0.738063,0.743718
col4,0.14424,-0.061621,-0.064001,1.0,0.04173,0.042734
age76,-0.00874,0.763074,0.738063,0.04173,1.0,0.998822
agesq76,-0.011597,0.764028,0.743718,0.042734,0.998822,1.0


In [30]:
# First-stage diagnostics for `res`, one row per endogenous regressor.
res.first_stage.diagnostics

Unnamed: 0,rsquared,partial.rsquared,shea.rsquared,f.stat,f.pval,f.dist
grade76,0.29527,0.008105,0.006185,26.010332,9e-06,chi2(3)
exp76,0.705598,0.641137,0.085975,5318.851936,0.0,chi2(3)
expsq76,0.675432,0.609415,0.082337,3562.538602,0.0,chi2(3)


### Exercise 7

In [31]:
# Here exp76 and expsq76 sit with the exogenous regressors; only grade76 is
# endogenous, instrumented by col4 alone.
formula = (
    'wage76 ~ 1 + black+south76+smsa76'
    '+reg2+reg3+reg4+reg5+reg6+reg7+reg8+reg9'
    '+smsa66+sinmom14+nodaded+nomomed+daded+momed'
    '+famed1+famed2+famed3+famed4+famed5+famed6+famed7+famed8'
    '+exp76+expsq76'
    ' + [grade76 ~ col4]'
)
res = IV2SLS.from_formula(formula, data).fit(cov_type='robust')
res.first_stage.diagnostics

Unnamed: 0,rsquared,partial.rsquared,shea.rsquared,f.stat,f.pval,f.dist
grade76,0.528641,0.004701,0.004701,15.14017,0.0001,chi2(1)


### Exercise 8

In [76]:
from scipy.stats import norm, multivariate_normal

# Fixed seed so that Restart & Run All reproduces the same simulated sample.
# One generator is shared by both draws: passing the same integer seed to each
# call separately would start both draws from an identical random stream.
SEED=0
rng=np.random.default_rng(SEED)

N=10000
π=1
β=2
# u and v are drawn jointly with unit variances and covariance 0.5, so y2
# (which contains v) is correlated with the error u of the y1 equation.
uv=multivariate_normal([0,0],[[1,0.5],[0.5,1]]).rvs(N,random_state=rng)
u=uv[:,0]
v=uv[:,1]
# z is drawn separately from (u, v).
z=norm().rvs(N,random_state=rng)
y2=π*z+v
# y1 depends on the square of y2.
y1=β*y2**2+u
data=pd.DataFrame({'y1':y1,'y2':y2,'z':z,'u':u,'v':v})

In [77]:
# IV fit on the simulated data: y2**2 is the endogenous regressor and z its
# instrument; the formula has no other terms.
res = IV2SLS.from_formula(
    'y1~[I(y2**2)~z]',
    data,
).fit(cov_type='unadjusted')
print(res.first_stage)
# Skip the first eight lines of the summary text and print the remainder.
print(*res.summary.as_text().split('\n')[8:], sep='\n')

    First Stage Estimation Results   
                           I(y2 ** 2)
-------------------------------------
R-squared                      0.0010
Partial R-squared              0.0010
Shea's R-squared               0.0010
Partial F-statistic            10.370
P-value (Partial F-stat)       0.0013
Partial F-stat Distn          chi2(1)
z                              0.1117
                             (3.2202)
-------------------------------------

T-stats reported in parentheses
T-stats use same covariance type as original model
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
I(y2 ** 2)     2.0059     0.0891     22.512     0.0000      1.8312      2.1805

Endogenous: I(y2 ** 2)
Instruments: z
Unadjusted Covariance (Hom

In [78]:
# Two-step version by hand: regress y2**2 on z without an intercept, store
# the fitted values, then regress y1 on those fitted values.
first_stage = smf.ols('I(y2**2)~0+z', data).fit()
data.loc[:, 'y2sqfit'] = first_stage.fittedvalues
second_stage = smf.ols('y1~0+y2sqfit', data).fit()
# Coefficient tables of both stages, first stage on top.
print(
    first_stage.summary().tables[1],
    second_stage.summary().tables[1],
    sep='\n',
)

                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
z              0.1117      0.035      3.220      0.001       0.044       0.180
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
y2sqfit        2.0059      0.627      3.198      0.001       0.776       3.236


In [79]:
# Variant: regress y2 itself on z in the first stage, then use the square of
# the fitted values as the regressor in the second stage.
first_stage = smf.ols('y2~0+z', data).fit()
data.loc[:, 'y2fit'] = first_stage.fittedvalues
second_stage = smf.ols('y1~0+I(y2fit**2)', data).fit()
# Coefficient tables of both stages, first stage on top.
print(
    first_stage.summary().tables[1],
    second_stage.summary().tables[1],
    sep='\n',
)

                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
z              0.9953      0.010     98.589      0.000       0.975       1.015
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
I(y2fit ** 2)     2.6844      0.031     86.285      0.000       2.623       2.745
