# Basic Examples

## Instrumental Variables Estimation and Two Stage Least Squares

**Chapter 15**
_Introductory Econometrics: A Modern Approach_, Jeffrey M. Wooldridge

In [1]:
%matplotlib inline
import seaborn as sns
sns.mpl.rcParams['figure.figsize'] = (20,12)

In [2]:
import numpy as np
from statsmodels.api import add_constant
from linearmodels.datasets import mroz

# Load the MROZ data, discard every row containing a missing value, and
# attach an intercept column (`const`); has_constant='add' requests the
# column even if a constant is already present.
data = add_constant(mroz.load().dropna(), has_constant='add')

# Last expression of the cell: display the column dtypes.
data.dtypes

const       float64
inlf          int64
hours         int64
kidslt6       int64
kidsge6       int64
age           int64
educ          int64
wage        float64
repwage     float64
hushrs        int64
husage        int64
huseduc       int64
huswage     float64
faminc        int64
mtr         float64
motheduc      int64
fatheduc      int64
unem        float64
city          int64
exper         int64
nwifeinc    float64
lwage       float64
expersq       int64
dtype: object

In [3]:
from linearmodels import iv

# Regress log(wage) on a constant and educ. No endogenous regressors or
# instruments are supplied, so this is the least-squares benchmark.
res = iv.IV2SLS(
    dependent=np.log(data.wage),
    exog=data[['const', 'educ']],
    endog=None,
    instruments=None,
).fit(cov_type='unadjusted')
print(res.summary)

                          IV-2SLS Estimation Summary                          
Dep. Variable:                   wage   R-squared:                      0.1179
Estimator:                    IV-2SLS   Adj. R-squared:                 0.1158
No. Observations:                 428   F-statistic:                    57.196
Date:                Tue, Mar 14 2017   P-value (F-stat)                0.0000
Time:                        12:58:03   Distribution:                  chi2(1)
Cov. Estimator:            unadjusted                                         
                                                                              
                             Parameter Estimates                              
           Parameters  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const         -0.1852     0.1848    -1.0022     0.3163     -0.5474      0.1770
educ           0.1086     0.0144     7.5628     0.00

In [4]:
# First stage: educ on a constant and fatheduc (no endogenous regressors).
res_first = iv.IV2SLS(
    dependent=data.educ,
    exog=data[['const', 'fatheduc']],
    endog=None,
    instruments=None,
).fit(cov_type='unadjusted')
print(res_first.summary)

                          IV-2SLS Estimation Summary                          
Dep. Variable:                   educ   R-squared:                      0.1726
Estimator:                    IV-2SLS   Adj. R-squared:                 0.1706
No. Observations:                 428   F-statistic:                    89.258
Date:                Tue, Mar 14 2017   P-value (F-stat)                0.0000
Time:                        12:58:03   Distribution:                  chi2(1)
Cov. Estimator:            unadjusted                                         
                                                                              
                             Parameter Estimates                              
           Parameters  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const          10.237     0.2753     37.186     0.0000      9.6975      10.777
fatheduc       0.2694     0.0285     9.4476     0.00

In [5]:
# Fitted education from the first-stage regression (educ on const and
# fatheduc): fitted value = observed educ minus the first-stage residual.
# The residuals must come from `res_first`; `res` holds the log-wage
# regression, whose residuals are unrelated to educ.
data['educ_hat'] = data.educ - res_first.resids

# 2SLS: log(wage) on a constant and educ, with fatheduc instrumenting educ.
res_second = iv.IV2SLS(
    np.log(data.wage), data[['const']], data.educ, data.fatheduc
).fit(cov_type='unadjusted')
print(res_second.summary)

                          IV-2SLS Estimation Summary                          
Dep. Variable:                   wage   R-squared:                      0.0934
Estimator:                    IV-2SLS   Adj. R-squared:                 0.0913
No. Observations:                 428   F-statistic:                    2.8487
Date:                Tue, Mar 14 2017   P-value (F-stat)                0.0914
Time:                        12:58:03   Distribution:                  chi2(1)
Cov. Estimator:            unadjusted                                         
                                                                              
                             Parameter Estimates                              
           Parameters  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const          0.4411     0.4451     0.9911     0.3216     -0.4312      1.3134
educ           0.0592     0.0351     1.6878     0.09

In [6]:
from linearmodels.datasets import wage

# Keep only the three columns used here, add an intercept column, then drop
# rows with any missing value.
men = add_constant(wage.load()[['educ', 'wage', 'sibs']]).dropna()

# First stage: educ on a constant and sibs.
res_first = iv.IV2SLS(
    dependent=men.educ,
    exog=men[['const', 'sibs']],
    endog=None,
    instruments=None,
).fit(cov_type='unadjusted')
print(res_first.summary)

                          IV-2SLS Estimation Summary                          
Dep. Variable:                   educ   R-squared:                      0.0576
Estimator:                    IV-2SLS   Adj. R-squared:                 0.0566
No. Observations:                 934   F-statistic:                    57.107
Date:                Tue, Mar 14 2017   P-value (F-stat)                0.0000
Time:                        12:58:03   Distribution:                  chi2(1)
Cov. Estimator:            unadjusted                                         
                                                                              
                             Parameter Estimates                              
           Parameters  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const          14.143     0.1131     125.02     0.0000      13.921      14.365
sibs          -0.2287     0.0303    -7.5569     0.00

In [7]:
# 2SLS: log(wage) on a constant and educ, with sibs instrumenting educ.
res = iv.IV2SLS(
    dependent=np.log(men.wage),
    exog=men.const,
    endog=men.educ,
    instruments=men.sibs,
).fit(cov_type='unadjusted')
print(res.summary)

                          IV-2SLS Estimation Summary                          
Dep. Variable:                   wage   R-squared:                     -0.0090
Estimator:                    IV-2SLS   Adj. R-squared:                -0.0101
No. Observations:                 934   F-statistic:                    21.715
Date:                Tue, Mar 14 2017   P-value (F-stat)                0.0000
Time:                        12:58:03   Distribution:                  chi2(1)
Cov. Estimator:            unadjusted                                         
                                                                              
                             Parameter Estimates                              
           Parameters  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const          5.1310     0.3539     14.497     0.0000      4.4373      5.8248
educ           0.1224     0.0263     4.6599     0.00

In [8]:
from linearmodels.datasets import birthweight

# Birth-weight data with an intercept column attached.
data = add_constant(birthweight.load())

# Regress packs on a constant and cigprice (no endogenous regressors).
res = iv.IV2SLS(
    dependent=data.packs,
    exog=data[['const', 'cigprice']],
    endog=None,
    instruments=None,
).fit(cov_type='unadjusted')
print(res)

                          IV-2SLS Estimation Summary                          
Dep. Variable:                  packs   R-squared:                      0.0015
Estimator:                    IV-2SLS   Adj. R-squared:                 0.0008
No. Observations:                1388   F-statistic:                    2.0731
Date:                Tue, Mar 14 2017   P-value (F-stat)                0.1499
Time:                        12:58:03   Distribution:                  chi2(1)
Cov. Estimator:            unadjusted                                         
                                                                              
                             Parameter Estimates                              
           Parameters  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const         -2.9746     3.2361    -0.9192     0.3580     -9.3172      3.3681
cigprice       0.0356     0.0247     1.4398     0.14

In [9]:
from linearmodels.datasets import birthweight

# Reload the birth-weight data and attach an intercept column.
data = add_constant(birthweight.load())

# 2SLS: log(bwght) on a constant and packs, with cigprice instrumenting packs.
res = iv.IV2SLS(
    dependent=np.log(data.bwght),
    exog=data.const,
    endog=data.packs,
    instruments=data.cigprice,
).fit(cov_type='unadjusted')

# Show the fit, then the dataset description and summary statistics.
print(res)
print(birthweight.DESCR)
print(data.describe())

                          IV-2SLS Estimation Summary                          
Dep. Variable:                  bwght   R-squared:                     -1.5224
Estimator:                    IV-2SLS   Adj. R-squared:                -1.5242
No. Observations:                1388   F-statistic:                    1.1356
Date:                Tue, Mar 14 2017   P-value (F-stat)                0.2866
Time:                        12:58:03   Distribution:                  chi2(1)
Cov. Estimator:            unadjusted                                         
                                                                              
                             Parameter Estimates                              
           Parameters  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const          4.7203     0.0381     123.80     0.0000      4.6456      4.7951
packs          0.0238     0.0223     1.0656     0.28

In [10]:
from linearmodels.datasets import card

# Column roles used by this cell and the two estimation cells that follow.
dep = ['wage']
endog = ['educ']
exog = ['const', 'exper', 'expersq', 'black', 'smsa', 'south', 'smsa66', 'reg662',
        'reg663', 'reg664', 'reg665', 'reg666', 'reg667', 'reg668', 'reg669']
instr = ['nearc4']

# Load the data, attach an intercept column, keep only the modelled columns
# and drop rows with any missing value among them.
data = add_constant(card.load())
data = data[dep + exog + endog + instr].dropna()

# Regress educ on the instrument plus all exogenous regressors. fit() is
# called without arguments; the summary reports the covariance estimator used.
res = iv.IV2SLS(data.educ, data[instr + exog], None, None).fit()
print(res.summary)

                          IV-2SLS Estimation Summary                          
Dep. Variable:                   educ   R-squared:                      0.4771
Estimator:                    IV-2SLS   Adj. R-squared:                 0.4745
No. Observations:                3010   F-statistic:                    3693.4
Date:                Tue, Mar 14 2017   P-value (F-stat)                0.0000
Time:                        12:58:03   Distribution:                 chi2(15)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
           Parameters  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
nearc4         0.3199     0.0848     3.7702     0.0002      0.1536      0.4862
const          16.638     0.2148     77.456     0.00

In [11]:
# Benchmark: log(wage) on all exogenous regressors plus educ, with educ
# treated as exogenous (no instruments supplied).
res = iv.IV2SLS(
    dependent=np.log(data.wage),
    exog=data[exog + endog],
    endog=None,
    instruments=None,
).fit()
print(res.summary)

                          IV-2SLS Estimation Summary                          
Dep. Variable:                   wage   R-squared:                      0.2998
Estimator:                    IV-2SLS   Adj. R-squared:                 0.2963
No. Observations:                3010   F-statistic:                    1377.0
Date:                Tue, Mar 14 2017   P-value (F-stat)                0.0000
Time:                        12:58:03   Distribution:                 chi2(15)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
           Parameters  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const          4.6208     0.0740     62.417     0.0000      4.4757      4.7659
exper          0.0848     0.0067     12.592     0.00

In [12]:
# 2SLS: log(wage) on the exogenous regressors, with the columns in `instr`
# instrumenting the columns in `endog`.
res = iv.IV2SLS(
    dependent=np.log(data.wage),
    exog=data[exog],
    endog=data[endog],
    instruments=data[instr],
).fit()
print(res.summary)

                          IV-2SLS Estimation Summary                          
Dep. Variable:                   wage   R-squared:                      0.2382
Estimator:                    IV-2SLS   Adj. R-squared:                 0.2343
No. Observations:                3010   F-statistic:                    840.83
Date:                Tue, Mar 14 2017   P-value (F-stat)                0.0000
Time:                        12:58:03   Distribution:                 chi2(15)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
           Parameters  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const          3.6662     0.9085     4.0352     0.0001      1.8855      5.4468
exper          0.1083     0.0233     4.6376     0.00

In [13]:
# Rebuild the MROZ frame: complete rows only, intercept column attached,
# plus a log-wage column.
data = add_constant(mroz.load().dropna(), has_constant='add')
data['lnwage'] = np.log(data.wage)

# Column roles for the estimation cells below. `dep` is a plain string, so
# data[dep] selects a single column rather than a one-column frame.
dep, endog = 'lnwage', ['educ']
exog = ['const', 'exper', 'expersq']
instr = ['fatheduc', 'motheduc']

In [14]:
# 2SLS using the column roles held in dep / exog / endog / instr.
res = iv.IV2SLS(
    dependent=data[dep],
    exog=data[exog],
    endog=data[endog],
    instruments=data[instr],
).fit(cov_type='unadjusted')
print(res)

                          IV-2SLS Estimation Summary                          
Dep. Variable:                 lnwage   R-squared:                      0.1357
Estimator:                    IV-2SLS   Adj. R-squared:                 0.1296
No. Observations:                 428   F-statistic:                    24.653
Date:                Tue, Mar 14 2017   P-value (F-stat)                0.0000
Time:                        12:58:03   Distribution:                  chi2(3)
Cov. Estimator:            unadjusted                                         
                                                                              
                             Parameter Estimates                              
           Parameters  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const          0.0481     0.3985     0.1207     0.9039     -0.7329      0.8291
exper          0.0442     0.0134     3.3038     0.00

In [15]:
# Display Wooldridge's regression test of exogeneity for the fitted model
# (null hypothesis: the endogenous variables are exogenous).
res.wooldridge_regression

Wooldridge's regression test of exogeneity
H0: Endogenous variables are exogenous
WaldTestStatistic(stat=2.8256012768590586, pval=0.093, dist=chi2(1))
id=0x2d0e2864518

In [16]:
import pandas as pd

# Residuals from regressing the endogenous column on the exogenous regressors
# and the instruments.
v = iv.IV2SLS(data[endog], data[exog + instr], None, None).fit().resids

# Re-estimate the 2SLS model with those residuals added as an extra exogenous
# regressor. `axis` is passed by keyword because pandas 2.x accepts only the
# list of objects positionally in concat.
res_direct = iv.IV2SLS(
    data[dep], pd.concat([v, data[exog]], axis=1), data[endog], data[instr]
).fit(cov_type='unadjusted')
print(res_direct)

                          IV-2SLS Estimation Summary                          
Dep. Variable:                 lnwage   R-squared:                      0.1624
Estimator:                    IV-2SLS   Adj. R-squared:                 0.1544
No. Observations:                 428   F-statistic:                    82.954
Date:                Tue, Mar 14 2017   P-value (F-stat)                0.0000
Time:                        12:58:03   Distribution:                  chi2(4)
Cov. Estimator:            unadjusted                                         
                                                                              
                             Parameter Estimates                              
           Parameters  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
residual       0.0582     0.0346     1.6810     0.0928     -0.0097      0.1260
const          0.0481     0.3923     0.1226     0.90

In [17]:
# Display Wooldridge's score test of overidentification for the fitted model.
res.wooldridge_overid

Wooldridge's score test of overidentification
H0: Model is not overidentified.
WaldTestStatistic(stat=0.4434612781087668, pval=0.505, dist=chi2(1))
id=0x2d0e2864550

In [18]:
# Save the residuals of the current fit, then regress them on exper, expersq
# and the instruments. Note that `res` is rebound to this auxiliary fit.
# NOTE(review): `const` is not among the regressors here - confirm intended.
u = res.resids
res = iv.IV2SLS(
    dependent=u,
    exog=data[['exper', 'expersq', *instr]],
    endog=None,
    instruments=None,
).fit()
print(res)

                          IV-2SLS Estimation Summary                          
Dep. Variable:               residual   R-squared:                      0.0009
Estimator:                    IV-2SLS   Adj. R-squared:                -0.0086
No. Observations:                 428   F-statistic:                    0.4361
Date:                Tue, Mar 14 2017   P-value (F-stat)                0.9794
Time:                        12:58:03   Distribution:                  chi2(4)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
           Parameters  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
exper          0.0006     0.0110     0.0530     0.9577     -0.0210      0.0221
expersq    -1.345e-05     0.0003    -0.0398     0.96

In [19]:
# Number of observations times R-squared of the fit currently bound to `res`
# (the regression of the saved residuals in the preceding cell).
res.nobs * res.rsquared

0.3719811106149802

In [20]:
# Extend the instrument list with huseduc and re-estimate the 2SLS model.
instr = ['fatheduc', 'motheduc', 'huseduc']
res = iv.IV2SLS(
    dependent=data[dep],
    exog=data[exog],
    endog=data[endog],
    instruments=data[instr],
).fit(cov_type='unadjusted')
print(res)

                          IV-2SLS Estimation Summary                          
Dep. Variable:                 lnwage   R-squared:                      0.1495
Estimator:                    IV-2SLS   Adj. R-squared:                 0.1435
No. Observations:                 428   F-statistic:                    34.900
Date:                Tue, Mar 14 2017   P-value (F-stat)                0.0000
Time:                        12:58:03   Distribution:                  chi2(3)
Cov. Estimator:            unadjusted                                         
                                                                              
                             Parameter Estimates                              
           Parameters  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const         -0.1869     0.2841    -0.6578     0.5107     -0.7436      0.3699
exper          0.0431     0.0132     3.2643     0.00

In [21]:
# Display Wooldridge's score test of overidentification for the fit that uses
# the three-instrument list.
res.wooldridge_overid

Wooldridge's score test of overidentification
H0: Model is not overidentified.
WaldTestStatistic(stat=1.0421330958068737, pval=0.594, dist=chi2(2))
id=0x2d0e27efe80

In [22]:
# Save the residuals of the current fit, then regress them on exper, expersq
# and the current instrument list. `res` is rebound to this auxiliary fit.
# NOTE(review): `const` is not among the regressors here - confirm intended.
u = res.resids
res = iv.IV2SLS(
    dependent=u,
    exog=data[['exper', 'expersq', *instr]],
    endog=None,
    instruments=None,
).fit()
print(res)

                          IV-2SLS Estimation Summary                          
Dep. Variable:               residual   R-squared:                      0.0026
Estimator:                    IV-2SLS   Adj. R-squared:                -0.0092
No. Observations:                 428   F-statistic:                    1.0587
Date:                Tue, Mar 14 2017   P-value (F-stat)                0.9577
Time:                        12:58:03   Distribution:                  chi2(5)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
           Parameters  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
exper          0.0003     0.0121     0.0284     0.9773     -0.0234      0.0241
expersq    -1.547e-05     0.0004    -0.0434     0.96

In [23]:
# Number of observations times R-squared of the fit currently bound to `res`
# (the regression of the saved residuals in the preceding cell).
res.nobs * res.rsquared

1.112658868197094

In [24]:
from linearmodels.datasets import jobtraining

# Keep only the rows whose year is 1987 or 1988, ordered by fcode and then
# year. A boolean row filter selects the same rows as masking with `where`
# and dropping the all-NaN rows, but leaves integer columns as integers.
data = jobtraining.load()
data = data.loc[data.year.isin((1987, 1988))].sort_values(['fcode', 'year'])

In [25]:
# Row-to-row differences of scrap, grant and hrsemp within each fcode group.
# The columns are selected with a list: selecting several columns with a bare
# tuple on a GroupBy is an error in pandas 2.x.
deltas = data.groupby('fcode')[['scrap', 'grant', 'hrsemp']].diff()

# Attach an intercept column, then drop rows with any missing value (this
# includes the first row of each group, which has no predecessor to diff).
deltas = add_constant(deltas).dropna()
print(deltas.describe())

       const      scrap      grant     hrsemp
count   45.0  45.000000  45.000000  45.000000
mean     1.0  -0.817556   0.377778  10.812321
std      0.0   2.496392   0.490310  20.523825
min      1.0 -10.000000   0.000000 -19.850180
25%      1.0  -1.000000   0.000000   0.000000
50%      1.0  -0.110000   0.000000   1.846154
75%      1.0   0.090000   1.000000  15.333330
max      1.0   5.000000   1.000000  80.000000


In [26]:
# Regress the differenced hrsemp on a constant and the differenced grant.
mod = iv.IV2SLS(
    dependent=deltas.hrsemp,
    exog=deltas[['const', 'grant']],
    endog=None,
    instruments=None,
)
print(mod.fit(cov_type='unadjusted'))

                          IV-2SLS Estimation Summary                          
Dep. Variable:                 hrsemp   R-squared:                      0.3408
Estimator:                    IV-2SLS   Adj. R-squared:                 0.3255
No. Observations:                  45   F-statistic:                    23.266
Date:                Tue, Mar 14 2017   P-value (F-stat)                0.0000
Time:                        12:58:03   Distribution:                  chi2(1)
Cov. Estimator:            unadjusted                                         
                                                                              
                             Parameter Estimates                              
           Parameters  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const          1.5806     3.1139     0.5076     0.6117     -4.5225      7.6837
grant          24.437     5.0662     4.8235     0.00

In [27]:
# 2SLS on the differenced data: scrap on a constant and hrsemp, with grant
# instrumenting hrsemp. fit() is called with its default covariance estimator.
mod = iv.IV2SLS(
    dependent=deltas.scrap,
    exog=deltas.const,
    endog=deltas.hrsemp,
    instruments=deltas.grant,
)
print(mod.fit())

                          IV-2SLS Estimation Summary                          
Dep. Variable:                  scrap   R-squared:                      0.0160
Estimator:                    IV-2SLS   Adj. R-squared:                -0.0069
No. Observations:                  45   F-statistic:                    1.5260
Date:                Tue, Mar 14 2017   P-value (F-stat)                0.2167
Time:                        12:58:03   Distribution:                  chi2(1)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
           Parameters  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const         -0.4030     0.4885    -0.8248     0.4095     -1.3605      0.5545
hrsemp        -0.0383     0.0310    -1.2353     0.21