In [1]:
# Clear every user-defined name from the kernel namespace without asking for
# confirmation (-f), so the cells below start from a clean state.
%reset -f

## if you have not installed "linearmodels", run this first

In [None]:
!conda install -c conda-forge linearmodels

## load all packages

In [2]:
import numpy             as np
import statsmodels.api   as sm
import pandas            as pd
import seaborn           as sns

from linearmodels.iv            import IV2SLS
from statsmodels.iolib.summary2 import summary_col
from collections                import OrderedDict
from linearmodels.iv.results    import compare

## load data

In [3]:
# Read the data set from CARD.csv (path is relative to the current working directory).
df = pd.read_csv("CARD.csv")

In [4]:
# Show the column names of the freshly loaded data.
df.columns

Index(['id', 'nearc2', 'nearc4', 'educ', 'age', 'fatheduc', 'motheduc',
       'weight', 'momdad14', 'sinmom14', 'step14', 'reg661', 'reg662',
       'reg663', 'reg664', 'reg665', 'reg666', 'reg667', 'reg668', 'reg669',
       'south66', 'black', 'smsa', 'south', 'smsa66', 'wage', 'enroll', 'KWW',
       'IQ', 'married', 'libcrd14', 'exper', 'lwage', 'expersq'],
      dtype='object')

## add constant terms

In [5]:
# Add an intercept column named 'const' to df. The IV2SLS calls further down
# select this column explicitly as part of `exog`.
df = sm.add_constant(df)

In [6]:
# Confirm that 'const' is now part of the column index.
df.columns

Index(['const', 'id', 'nearc2', 'nearc4', 'educ', 'age', 'fatheduc',
       'motheduc', 'weight', 'momdad14', 'sinmom14', 'step14', 'reg661',
       'reg662', 'reg663', 'reg664', 'reg665', 'reg666', 'reg667', 'reg668',
       'reg669', 'south66', 'black', 'smsa', 'south', 'smsa66', 'wage',
       'enroll', 'KWW', 'IQ', 'married', 'libcrd14', 'exper', 'lwage',
       'expersq'],
      dtype='object')

In [7]:
# Summary statistics per column; the `count` row reveals which variables have
# missing observations.
df.describe()

Unnamed: 0,const,id,nearc2,nearc4,educ,age,fatheduc,motheduc,weight,momdad14,...,smsa66,wage,enroll,KWW,IQ,married,libcrd14,exper,lwage,expersq
count,3010.0,3010.0,3010.0,3010.0,3010.0,3010.0,2320.0,2657.0,3010.0,3010.0,...,3010.0,3010.0,3010.0,2963.0,2061.0,3003.0,2997.0,3010.0,3010.0,3010.0
mean,1.0,2581.748837,0.440864,0.68206,13.263455,28.119601,10.003448,10.348137,321185.3,0.789369,...,0.649502,577.282392,0.092359,33.540668,102.449782,2.271395,0.674341,8.856146,6.261832,95.57907
std,0.0,1500.538849,0.496573,0.465753,2.676913,3.137004,3.720737,3.179671,170645.8,0.407825,...,0.477205,262.958302,0.28958,8.611619,15.423756,2.066823,0.468699,4.141672,0.443798,84.618314
min,1.0,2.0,0.0,0.0,1.0,24.0,0.0,0.0,75607.0,0.0,...,0.0,100.0,0.0,4.0,50.0,1.0,0.0,0.0,4.60517,0.0
25%,1.0,1275.5,0.0,0.0,12.0,25.0,8.0,8.0,122798.0,1.0,...,0.0,394.25,0.0,28.0,93.0,1.0,0.0,6.0,5.976985,36.0
50%,1.0,2541.0,0.0,1.0,13.0,28.0,10.0,12.0,365200.0,1.0,...,1.0,537.5,0.0,34.0,103.0,1.0,1.0,8.0,6.286928,64.0
75%,1.0,3858.75,1.0,1.0,16.0,31.0,12.0,12.0,406024.0,1.0,...,1.0,708.75,0.0,40.0,113.0,4.0,1.0,11.0,6.563503,121.0
max,1.0,5225.0,1.0,1.0,18.0,34.0,18.0,18.0,1752340.0,1.0,...,1.0,2404.0,1.0,56.0,149.0,6.0,1.0,23.0,7.784889,529.0


## (a)

In [8]:
# (a) Baseline OLS wage equation estimated with statsmodels:
#     lwage on const, educ, exper, expersq, black, smsa, south.
X_a = df[['educ','exper','expersq','black','smsa','south']]
X_a = sm.add_constant(X_a)
Y_a = df[['lwage']]

# Keep the unfitted statsmodels model under its own name. The linearmodels
# cell further down binds `M_OLS` to a fitted result that the final `compare`
# call consumes; reusing `M_OLS` here would overwrite that result with an
# unfitted statsmodels model whenever this cell is re-run on its own.
sm_OLS_model = sm.OLS(Y_a, X_a, missing='drop')  # missing='drop': skip rows with NaN

OLS_result = sm_OLS_model.fit()
print(OLS_result.summary())

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.291
Model:                            OLS   Adj. R-squared:                  0.289
Method:                 Least Squares   F-statistic:                     204.9
Date:                Sun, 28 Mar 2021   Prob (F-statistic):          1.52e-219
Time:                        21:59:18   Log-Likelihood:                -1308.7
No. Observations:                3010   AIC:                             2631.
Df Residuals:                    3003   BIC:                             2673.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.7337      0.068     70.022      0.0

### for final comparison, we also run it in linearmodels

In [9]:
# Same specification as (a), re-estimated through linearmodels so the fitted
# result can be placed in the model-comparison table at the end of the notebook.
ols_exog_cols = ['const', 'educ', 'exper', 'expersq', 'black', 'smsa', 'south']

# No endogenous regressors and no instruments are supplied, so the printed
# summary reports the estimator as OLS.
ols_spec = IV2SLS(
    dependent=df['lwage'],
    exog=df[ols_exog_cols],
    endog=None,
    instruments=None,
)
M_OLS = ols_spec.fit()

print(M_OLS.summary)

                            OLS Estimation Summary                            
Dep. Variable:                  lwage   R-squared:                      0.2905
Estimator:                        OLS   Adj. R-squared:                 0.2891
No. Observations:                3010   F-statistic:                    1309.5
Date:                Sun, Mar 28 2021   P-value (F-stat)                0.0000
Time:                        21:59:27   Distribution:                  chi2(6)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const          4.7337     0.0701     67.550     0.0000      4.5963      4.8710
educ           0.0740     0.0036     20.344     0.00

## (b)

In [10]:
# (b) Proxy-variable regression: the specification from (a) with IQ added as a
#     regressor. IQ has missing values, so missing='drop' shrinks the sample.
X_b = df[['educ','exper','expersq','black','smsa','south','IQ']]
X_b = sm.add_constant(X_b)
Y_b = df[['lwage']]

# Keep the unfitted statsmodels model under its own name. The linearmodels
# cell further down binds `M_Proxy` to a fitted result that the final `compare`
# call consumes; reusing `M_Proxy` here would overwrite that result with an
# unfitted statsmodels model whenever this cell is re-run on its own.
sm_Proxy_model = sm.OLS(Y_b, X_b, missing='drop')

Proxy_result = sm_Proxy_model.fit()
print(Proxy_result.summary())

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.226
Model:                            OLS   Adj. R-squared:                  0.223
Method:                 Least Squares   F-statistic:                     85.49
Date:                Sun, 28 Mar 2021   Prob (F-statistic):          2.28e-109
Time:                        21:59:28   Log-Likelihood:                -861.68
No. Observations:                2061   AIC:                             1739.
Df Residuals:                    2053   BIC:                             1784.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.4826      0.104     43.270      0.0

### for final comparison, we also run it in linearmodels

In [11]:
# Proxy specification from (b), re-estimated through linearmodels so the fitted
# result can be placed in the model-comparison table at the end of the notebook.
proxy_exog_cols = ['const', 'educ', 'exper', 'expersq',
                   'black', 'smsa', 'south', 'IQ']

# No endogenous regressors and no instruments: the summary reports plain OLS.
# Rows with missing values are dropped by linearmodels, which emits a warning.
proxy_spec = IV2SLS(
    dependent=df['lwage'],
    exog=df[proxy_exog_cols],
    endog=None,
    instruments=None,
)
M_Proxy = proxy_spec.fit()

print(M_Proxy.summary)

                            OLS Estimation Summary                            
Dep. Variable:                  lwage   R-squared:                      0.2257
Estimator:                        OLS   Adj. R-squared:                 0.2231
No. Observations:                2061   F-statistic:                    624.66
Date:                Sun, Mar 28 2021   P-value (F-stat)                0.0000
Time:                        21:59:35   Distribution:                  chi2(7)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const          4.4826     0.1095     40.931     0.0000      4.2679      4.6972
educ           0.0693     0.0051     13.637     0.00

Inputs contain missing values. Dropping rows with missing observations.


## (d)

In [12]:
# (d) Regress IQ on nearc4 and the controls black, smsa and south.
Y_d = df[['IQ']]
X_d = sm.add_constant(df[['nearc4', 'black', 'smsa', 'south']])

# missing='drop' discards rows with missing values (IQ is not observed for everyone).
reg_d = sm.OLS(endog=Y_d, exog=X_d, missing='drop')
reg_d_result = reg_d.fit()

print(reg_d_result.summary())

                            OLS Regression Results                            
Dep. Variable:                     IQ   R-squared:                       0.195
Model:                            OLS   Adj. R-squared:                  0.194
Method:                 Least Squares   F-statistic:                     124.8
Date:                Sun, 28 Mar 2021   Prob (F-statistic):           1.94e-95
Time:                        21:59:36   Log-Likelihood:                -8338.7
No. Observations:                2061   AIC:                         1.669e+04
Df Residuals:                    2056   BIC:                         1.672e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        102.9627      0.768    134.124      0.0

## (e)

In [13]:
# (e) Regress educ on nearc4 and the controls black, smsa and south.
Y_e = df[['educ']]
X_e = sm.add_constant(df[['nearc4', 'black', 'smsa', 'south']])

# missing='drop' discards rows with missing values before fitting.
reg_e = sm.OLS(endog=Y_e, exog=X_e, missing='drop')
reg_e_result = reg_e.fit()

print(reg_e_result.summary())

                            OLS Regression Results                            
Dep. Variable:                   educ   R-squared:                       0.114
Model:                            OLS   Adj. R-squared:                  0.113
Method:                 Least Squares   F-statistic:                     96.94
Date:                Sun, 28 Mar 2021   Prob (F-statistic):           1.09e-77
Time:                        21:59:37   Log-Likelihood:                -7051.7
No. Observations:                3010   AIC:                         1.411e+04
Df Residuals:                    3005   BIC:                         1.414e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         12.9578      0.115    113.045      0.0

## (f)

In [14]:
# (f) Wage equation with both IQ and nearc4 included among the regressors.
Y_f = df[['lwage']]
f_regressors = ['educ', 'exper', 'expersq', 'black', 'smsa', 'south', 'IQ', 'nearc4']
X_f = sm.add_constant(df[f_regressors])

# missing='drop' discards rows with missing values (IQ is not observed for everyone).
reg_f = sm.OLS(endog=Y_f, exog=X_f, missing='drop')
reg_f_result = reg_f.fit()

print(reg_f_result.summary())

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.226
Model:                            OLS   Adj. R-squared:                  0.223
Method:                 Least Squares   F-statistic:                     74.83
Date:                Sun, 28 Mar 2021   Prob (F-statistic):          1.82e-108
Time:                        21:59:37   Log-Likelihood:                -861.49
No. Observations:                2061   AIC:                             1741.
Df Residuals:                    2052   BIC:                             1792.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.4804      0.104     43.215      0.0

## (g)

In [15]:
# (g) IV estimate of the wage equation: educ is treated as endogenous and
#     instrumented with nearc4; the remaining regressors enter as exogenous.
iv_exog_cols = ['const', 'exper', 'expersq', 'black', 'smsa', 'south']

iv_spec = IV2SLS(
    dependent=df['lwage'],
    exog=df[iv_exog_cols],
    endog=df[['educ']],
    instruments=df[['nearc4']],
)
M_IV = iv_spec.fit()

print(M_IV.summary)

                          IV-2SLS Estimation Summary                          
Dep. Variable:                  lwage   R-squared:                      0.2252
Estimator:                    IV-2SLS   Adj. R-squared:                 0.2237
No. Observations:                3010   F-statistic:                    792.07
Date:                Sun, Mar 28 2021   P-value (F-stat)                0.0000
Time:                        21:59:38   Distribution:                  chi2(6)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const          3.7528     0.8167     4.5948     0.0000      2.1520      5.3536
exper          0.1075     0.0211     5.0916     0.00

## (h): in Python, you can test for endogeneity using four tests, unlike Stata, which can only do one.
## Please note that 
* durbin and hausman are very similar
* wooldridge is more robust to heteroskedasticity

In [16]:
# Wu-Hausman test on the IV fit from (g); H0: the endogenous regressor is exogenous.
M_IV.wu_hausman()

Wu-Hausman test of exogeneity
H0: All endogenous variables are exogenous
Statistic: 1.5390
P-value: 0.2149
Distributed: F(1,3002)
WaldTestStatistic, id: 0x7fe381f567d0

In [17]:
# Durbin test on the IV fit from (g); H0: the endogenous regressor is exogenous.
M_IV.durbin()

Durbin test of exogeneity
H0: All endogenous variables are exogenous
Statistic: 1.5423
P-value: 0.2143
Distributed: chi2(1)
WaldTestStatistic, id: 0x7fe3788e9bd0

In [18]:
# Wooldridge's regression-based exogeneity test. This is accessed as an
# attribute (no call parentheses) and evaluates to the test result.
M_IV.wooldridge_regression

Wooldridge's regression test of exogeneity
H0: Endogenous variables are exogenous
Statistic: 1.6104
P-value: 0.2044
Distributed: chi2(1)
WaldTestStatistic, id: 0x7fe308041150

In [19]:
# Wooldridge's score-based exogeneity test. This is accessed as an attribute
# (no call parentheses) and evaluates to the test result.
M_IV.wooldridge_score

Wooldridge's score test of exogeneity
H0: Endogenous variables are exogenous
Statistic: 1.6091
P-value: 0.2046
Distributed: chi2(1)
WaldTestStatistic, id: 0x7fe308041490

## Extra : 2SLS with two instruments (nearc4 and nearc2) for educ

In [20]:
# Extra: 2SLS with educ instrumented by two instruments, nearc4 and nearc2.
tsls_exog_cols = ['const', 'exper', 'expersq', 'black', 'smsa', 'south']
tsls_instrument_cols = ['nearc4', 'nearc2']

tsls_spec = IV2SLS(
    dependent=df['lwage'],
    exog=df[tsls_exog_cols],
    endog=df[['educ']],
    instruments=df[tsls_instrument_cols],
)
M_2SLS = tsls_spec.fit()

print(M_2SLS.summary)

                          IV-2SLS Estimation Summary                          
Dep. Variable:                  lwage   R-squared:                      0.1455
Estimator:                    IV-2SLS   Adj. R-squared:                 0.1438
No. Observations:                3010   F-statistic:                    720.37
Date:                Sun, Mar 28 2021   P-value (F-stat)                0.0000
Time:                        21:59:42   Distribution:                  chi2(6)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const          3.2721     0.8169     4.0056     0.0001      1.6711      4.8732
exper          0.1192     0.0213     5.5959     0.00

## Finally, let's compare them all

In [21]:
# Gather the four linearmodels fits, in the order they should appear as
# columns, and print them side by side.
IV_result = OrderedDict(
    [
        ('OLS', M_OLS),
        ('Proxy', M_Proxy),
        ('IV', M_IV),
        ('2SLS', M_2SLS),
    ]
)

# precision='std_errors' selects standard errors as the precision measure shown
# with each coefficient; stars=True adds significance stars.
comparison_table = compare(IV_result, precision='std_errors', stars=True)
print(comparison_table)

                                Model Comparison                                
                                OLS          Proxy             IV           2SLS
--------------------------------------------------------------------------------
Dep. Variable                 lwage          lwage          lwage          lwage
Estimator                       OLS            OLS        IV-2SLS        IV-2SLS
No. Observations               3010           2061           3010           3010
Cov. Est.                    robust         robust         robust         robust
R-squared                    0.2905         0.2257         0.2252         0.1455
Adj. R-squared               0.2891         0.2231         0.2237         0.1438
F-statistic                  1309.5         624.66         792.07         720.37
P-value (F-stat)             0.0000         0.0000         0.0000         0.0000
const                     4.7337***      4.4826***      3.7528***      3.2721***
                           (

In [22]:
# Export the notebook file W4_Python.ipynb to HTML via nbconvert.
# NOTE(review): this presumably converts the copy saved on disk, so save the
# notebook before running this cell — confirm.
!jupyter nbconvert --to html W4_Python.ipynb

[NbConvertApp] Converting notebook W4_Python.ipynb to html
[NbConvertApp] Writing 633607 bytes to W4_Python.html
