## Solutions

### Exercise 1

In [1]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# Load the Stata dataset that every exercise below works from.
data = pd.read_stata('data/mus03data.dta')

In [2]:
from statsmodels.iolib.summary2 import summary_col

# Exercise 1: fit the same specification on the first 100 rows three times,
# changing only the covariance estimator used for the standard errors.
data1 = data.iloc[:100, :]
f = 'ltotexp~suppins+phylim+actlim+totchr+age+female+income'

res1 = smf.ols(formula=f, data=data1).fit()
res2 = smf.ols(formula=f, data=data1).fit(cov_type='HC1')
res3 = smf.ols(formula=f, data=data1).fit(
    cov_type='cluster',
    cov_kwds={'groups': data1.totchr},  # cluster groups are taken from the totchr column
)

# Side-by-side table; the cell's last expression is what gets displayed.
summary_col(
    results=[res1, res2, res3],
    stars=True,
    model_names=['default', 'heteroskedastic', 'cluster'],
    drop_omitted=True,
)

0,1,2,3
,default,heteroskedastic,cluster
Intercept,5.6756***,5.6756***,5.6756***
,(1.0273),(0.9988),(0.4193)
suppins,0.0338,0.0338,0.0338
,(0.1772),(0.1735),(0.1578)
phylim,-0.5777**,-0.5777*,-0.5777*
,(0.2767),(0.3445),(0.2992)
actlim,0.7339**,0.7339**,0.7339***
,(0.3180),(0.3440),(0.1884)
totchr,0.1823,0.1823,0.1823***


In [3]:
# Refit with the HC2 and HC3 covariance estimators and tabulate them
# next to the HC1 fit (res2) from the previous cell.
res4 = smf.ols(formula=f, data=data1).fit(cov_type='HC2')
res5 = smf.ols(formula=f, data=data1).fit(cov_type='HC3')
summary_col(
    results=[res2, res4, res5],
    stars=True,
    model_names=['HC1', 'HC2', 'HC3'],
    drop_omitted=True,
)

0,1,2,3
,HC1,HC2,HC3
Intercept,5.6756***,5.6756***,5.6756***
,(0.9988),(1.0217),(1.0936)
suppins,0.0338,0.0338,0.0338
,(0.1735),(0.1737),(0.1818)
phylim,-0.5777*,-0.5777,-0.5777
,(0.3445),(0.3611),(0.3951)
actlim,0.7339**,0.7339**,0.7339*
,(0.3440),(0.3638),(0.4026)
totchr,0.1823,0.1823,0.1823


### Exercise 2

In [4]:
# Exercise 2: full-sample fit with HC1 standard errors, followed by a joint
# test that the age, female and income coefficients are all zero.
res = smf.ols(formula=f, data=data).fit(cov_type='HC1')
res.f_test('age=0,female=0,income=0')

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[3.64985035]]), p=0.012104264047142514, df_denom=2947, df_num=3>

In [5]:
# Work on a copy of the data with age divided by 10, so `data` itself is left untouched.
data2 = data.assign(age10=data['age'] / 10)

# age10*female expands to both main effects plus the age10:female interaction.
res2 = smf.ols(
    formula='ltotexp~suppins+phylim+actlim+totchr+age10*female+income',
    data=data2,
).fit(cov_type='HC1')

# Test the interaction coefficient on its own.
res2.t_test('age10:female')

<class 'statsmodels.stats.contrast.ContrastResults'>
                             Test for Constraints                             
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
c0            -0.2025      0.072     -2.799      0.005      -0.344      -0.061

In [6]:
# Enter phylim and actlim as a single summed regressor, which forces them
# to share one coefficient; then show only the coefficient table.
res = smf.ols(
    formula='ltotexp~suppins+I(phylim+actlim)+totchr+age+female+income',
    data=data,
).fit(cov_type='HC1')
res.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,6.7051,0.282,23.735,0.000,6.151,7.259
suppins,0.2545,0.047,5.471,0.000,0.163,0.346
I(phylim + actlim),0.3275,0.031,10.682,0.000,0.267,0.388
totchr,0.3758,0.019,20.079,0.000,0.339,0.413
age,0.0038,0.004,1.019,0.308,-0.003,0.011
female,-0.0856,0.046,-1.876,0.061,-0.175,0.004
income,0.0025,0.001,2.420,0.016,0.000,0.005


### Exercise 3

In [7]:
# Exercise 3: RESET-style test done by hand -- add powers of the fitted values
# to the original specification and test them jointly.
res = smf.ols(formula=f, data=data).fit(cov_type='HC1')

# Rebuild a frame from the arrays the fitted model holds: the response, the
# regressors (skipping the first exog column, the intercept) and the fitted values.
data3 = pd.DataFrame(
    np.column_stack([res.model.endog, res.model.exog[:, 1:], res.predict()]),
    columns=['ltotexp', 'suppins', 'phylim', 'actlim', 'totchr', 'age', 'female', 'income', 'yhat'],
)

reset_formula = f + '+I(yhat**2)+I(yhat**3)+I(yhat**4)'
reset_hypothesis = "I(yhat ** 2)=0,I(yhat ** 3)=0,I(yhat ** 4)=0"

# Same auxiliary regression under the default and the HC1 covariance.
res_default = smf.ols(formula=reset_formula, data=data3).fit()
res_HC1 = smf.ols(formula=reset_formula, data=data3).fit(cov_type='HC1')
print(res_default.f_test(reset_hypothesis))
print(res_HC1.f_test(reset_hypothesis))

<F test: F=array([[9.03663802]]), p=5.914607556874138e-06, df_denom=2944, df_num=3>
<F test: F=array([[11.3182544]]), p=2.2196435866635983e-07, df_denom=2944, df_num=3>


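The two fits give different F statistics, so the choice of covariance matters here. We need to use the default `cov_type`: if you look at the source code of the `reset_ramsey` function, you will find that it also fits the auxiliary regression with the default covariance.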

### Exercise 4

In [8]:
from statsmodels.compat import lzip
import statsmodels.stats.api as sms

# Exercise 4: Breusch-Pagan test on the default-covariance fit.
res = smf.ols(formula=f, data=data).fit()

name = ['Lagrange multiplier statistic', 'p-value']
test = sms.het_breuschpagan(res.resid, res.model.exog)

# Pair each label with the matching leading element of the test result.
lzip(name, test)

[('Lagrange multiplier statistic', 93.12802905335813),
 ('p-value', 2.8150607155258386e-17)]

In [9]:
# Reproduce the LM statistic by hand: regress the squared residuals on the
# regressors and multiply the number of observations by that R-squared.
data4 = pd.DataFrame(
    np.column_stack([res.model.exog[:, 1:], res.resid]),
    columns=['suppins', 'phylim', 'actlim', 'totchr', 'age', 'female', 'income', 'u'],
)
res2 = smf.ols(
    formula='I(u**2)~suppins+phylim+actlim+totchr+age+female+income',
    data=data4,
).fit()
res2.nobs * res2.rsquared

93.12802905335813

### Exercise 5

In [10]:
# Exercise 5: model totexp in levels instead of ltotexp.
# Note that `f` is rebound here, so later cells that use `f` get this formula.
f = "totexp~suppins+phylim+actlim+totchr+age+female+income"
res = smf.ols(formula=f, data=data).fit(cov_type="HC1")
res.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,7370.5665,2734.062,2.696,0.007,2011.903,1.27e+04
suppins,829.3004,411.543,2.015,0.044,22.691,1635.910
phylim,2426.5716,536.920,4.519,0.000,1374.228,3478.915
actlim,3695.0459,687.586,5.374,0.000,2347.403,5042.689
totchr,1939.7297,180.914,10.722,0.000,1585.144,2294.315
age,-77.3720,36.575,-2.115,0.034,-149.057,-5.687
female,-1257.1738,416.639,-3.017,0.003,-2073.771,-440.576
income,7.2231,8.369,0.863,0.388,-9.179,23.625


In [11]:
# Correlation matrix between observed totexp and the in-sample fitted values;
# the off-diagonal entry is the correlation of interest.
yhat = res.predict()
cor = np.corrcoef(data.totexp, yhat)
cor

array([[1.        , 0.35121681],
       [0.35121681, 1.        ]])

In [12]:
# R-squared as reported by the fitted model `res`.
res.rsquared

0.12335324640042522

In [13]:
# Check that the squared correlation between totexp and the fitted values
# matches the model's R-squared to within 1e-7.
# abs() is required: a one-sided `difference <= tol` would also evaluate True
# for any negative difference, however large, and so would not detect a mismatch.
abs(cor[0, 1]**2 - res.rsquared) <= 1e-7

True

### Exercise 6

In [14]:
# Exercise 6: estimate on the first 2000 rows and predict the remaining rows,
# using whatever formula is currently bound to `f`.
data6a = data.iloc[:2000, :]
data6b = data.iloc[2000:, :]

res = smf.ols(formula=f, data=data6a).fit()
yhat = res.predict(data6b)

# Summary statistics of observed vs. predicted values on the held-out rows.
comparison = pd.DataFrame({'totexp': data6b.totexp, 'predict': yhat})
comparison.describe(percentiles=[])

Unnamed: 0,totexp,predict
count,1064.0,1064.0
mean,16007.863722,2642.626445
std,16637.93387,729.354163
min,0.0,1211.670976
50%,10393.5,2615.392192
max,125610.0,4686.64809
