## Solutions

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf 

### Exercise 1

In [2]:
from scipy import stats

np.random.seed(10101)
N=500
nd=stats.norm(0,5)
x2,x3,epsilon=np.split(nd.rvs(3*N),3)
u=np.sqrt(np.exp(-1+0.2*x2))*epsilon
y=1+x2+x3+u
data=pd.DataFrame({'x2':x2,'x3':x3,'e':epsilon,'u':u,'y':y})

In [3]:
from scipy.optimize import least_squares

res=smf.ols(formula='y~x2+x3',data=data).fit()

def func(g,x,y):
    return np.exp(g[0]+g[1]*x)-y

data['uhatsq']=res.resid**2
g0=np.ones(2)
res_lsq = least_squares(func, g0, args=(data.x2, data.uhatsq))
data['varu']=np.exp(res_lsq.x@[np.ones(len(data.x2)),data.x2])

In [4]:
# 1
res1=smf.wls(formula='y~x2+x3',data=data,weights=1./data.varu).fit()
res1.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.0664,0.144,7.415,0.000,0.784,1.349
x2,1.0065,0.024,41.193,0.000,0.959,1.055
x3,0.9839,0.023,42.090,0.000,0.938,1.030


In [5]:
# 2
tr_y=y/np.sqrt(data.varu) 
tr_x2=x2/np.sqrt(data.varu)
tr_x3=x3/np.sqrt(data.varu) 
tr_one=np.ones(len(data.x2))/np.sqrt(data.varu)
res2=smf.ols(formula='tr_y~tr_one+tr_x2+tr_x3-1',data=data).fit()
res2.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
tr_one,1.0664,0.144,7.415,0.000,0.784,1.349
tr_x2,1.0065,0.024,41.193,0.000,0.959,1.055
tr_x3,0.9839,0.023,42.090,0.000,0.938,1.030


### Exercise 2

In [6]:
w=np.sqrt(np.exp(-1+0.2*x2))
tr_y=y/w
tr_x2=x2/w
tr_x3=x3/w 
tr_one=np.ones(len(data.x2))/w
res3=smf.ols(formula='tr_y~tr_one+tr_x2+tr_x3-1',data=data).fit()
res3.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
tr_one,1.0513,0.148,7.111,0.000,0.761,1.342
tr_x2,1.0028,0.022,46.398,0.000,0.960,1.045
tr_x3,0.9890,0.020,50.023,0.000,0.950,1.028


### Exercise 3

In [7]:
x2sq=x2**2
res4=smf.wls(formula='y~x2+x3',data=data,weights=1./x2sq).fit(cov_type='HC1')
res4.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.0645,0.472,0.137,0.891,-0.861,0.990
x2,1.9148,1.184,1.618,0.106,-0.405,4.234
x3,1.0176,0.366,2.782,0.005,0.301,1.735


In [8]:
res.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.0715,0.157,6.827,0.000,0.763,1.380
x2,1.0121,0.031,32.886,0.000,0.952,1.073
x3,0.9638,0.031,30.685,0.000,0.902,1.026


In [9]:
from statsmodels.stats.diagnostic import het_breuschpagan

het_breuschpagan(res4.resid, res4.model.exog)[1]

0.5968487611275441

### Exercise 4

In [10]:
from statsmodels.iolib.summary2 import summary_col

data=pd.read_stata('data/mus05surdata.dta')
res=smf.ols(formula='drugexp ~ age+age2+actlim+totchr+medicaid+private',data=data).fit()
res1=smf.ols(formula='drugexp ~ age+age2+actlim+totchr+medicaid+private',data=data).fit(cov_type='HC1')
res2=smf.ols(formula='totothr ~ age+age2+educyr+actlim+totchr+private',data=data).fit()
res3=smf.ols(formula='totothr ~ age+age2+educyr+actlim+totchr+private',data=data).fit(cov_type='HC1')
summary_col(results=[res,res1,res2,res3], stars = True, model_names=['Reg', 'Reg 1', 'Reg 2', 'Reg 3'], drop_omitted=True)

0,1,2,3,4
,Reg,Reg 1,Reg 2,Reg 3
Intercept,-11358.6034**,-11358.6034**,-42445.6035,-42445.6035
,(4884.9018),(4466.4852),(28975.2153),(27116.6104)
R-squared,0.1959,0.1959,0.0820,0.0820
R-squared Adj.,0.1945,0.1945,0.0804,0.0804
actlim,704.0432***,704.0432***,4778.5762***,4778.5762***
,(77.0857),(87.4941),(454.3489),(547.3765)
age,330.6778**,330.6778***,1159.3610,1159.3610
,(130.5802),(118.8841),(774.7039),(720.4713)
age2,-2.2975***,-2.2975***,-8.0282,-8.0282*


In [11]:
from linearmodels.system import SUR

formula = '{drugexp ~ 1 + age+age2+actlim+totchr+medicaid+private} {totothr ~ 1 + age+age2+educyr+actlim+totchr+private}'
res4 = SUR.from_formula(formula, data).fit(cov_type='unadjusted')
print('\n'.join(res4.summary.as_text().split('\n')[8:]))

                                        Num. Constraints:                      None
                Equation: drugexp, Dependent Variable: drugexp                
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept  -1.136e+04     4879.8    -2.3279     0.0199  -2.092e+04     -1795.3
age            330.69     130.45     2.5351     0.0112      75.025      586.36
age2          -2.2975     0.8673    -2.6491     0.0081     -3.9974     -0.5977
actlim         703.88     76.990     9.1424     0.0000      552.98      854.78
totchr         549.09     26.406     20.794     0.0000      497.33      600.84
medicaid       405.72     98.351     4.1252     0.0000      212.96      598.49
private        64.970     71.461     0.9092     0.3633     -75.090      205.03
                Equation: totothr, Dependent Variable: totothr                
            Parameter  Std. Err.     T-stat    

  if is_categorical(s):


In [12]:
res5 = SUR.from_formula(formula, data).fit(cov_type='robust')
print('\n'.join(res5.summary.as_text().split('\n')[8:]))

                                        Num. Constraints:                      None
                Equation: drugexp, Dependent Variable: drugexp                
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept  -1.136e+04     2.3428    -4848.7     0.0000  -1.136e+04  -1.135e+04
age            330.69     0.0624     5301.5     0.0000      330.57      330.82
age2          -2.2975     0.0004    -5565.1     0.0000     -2.2984     -2.2967
actlim         703.88     0.0460  1.529e+04     0.0000      703.79      703.97
totchr         549.09     0.0164  3.354e+04     0.0000      549.05      549.12
medicaid       405.72     0.0637     6366.8     0.0000      405.60      405.85
private        64.970     0.0343     1893.1     0.0000      64.903      65.037
                Equation: totothr, Dependent Variable: totothr                
            Parameter  Std. Err.     T-stat    

### Exercise 5

In [13]:
data=pd.read_stata('data/mus05nhanes2.dta')
data.hgb.mean()

14.26046085357666

In [14]:
np.average(data.hgb,weights=data.finalwgt)

14.28339958862687

In [15]:
data.loc[:,'uniqpsu']=2*data.strata+data.psu
res=smf.wls(formula='hgb~age+female',data=data,weights=data.finalwgt).fit(cov_type='cluster',cov_kwds={'groups': data.uniqpsu})
res.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,15.1171,0.062,242.750,0.000,14.995,15.239
age,0.0007,0.001,0.777,0.437,-0.001,0.002
female,-1.6572,0.031,-53.275,0.000,-1.718,-1.596


In [16]:
res2=smf.ols(formula='hgb~age+female',data=data).fit()
res2.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,15.0862,0.035,428.325,0.000,15.017,15.155
age,-0.0003,0.001,-0.507,0.612,-0.002,0.001
female,-1.5422,0.023,-68.084,0.000,-1.587,-1.498


### Exercise 6

In [17]:
from statsmodels.iolib.summary2 import summary_col

data=pd.read_stata('data/mus05surdata.dta')
res=smf.ols(formula='ldrugexp ~ age+age2+actlim+totchr+medicaid+private',data=data).fit()
res1=smf.ols(formula='ldrugexp ~ age+age2+actlim+totchr+medicaid+private',data=data).fit(cov_type='HC1')
res2=smf.ols(formula='ltotothr ~ age+age2+educyr+actlim+totchr+private',data=data).fit()
res3=smf.ols(formula='ltotothr ~ age+age2+educyr+actlim+totchr+private',data=data).fit(cov_type='HC1')
summary_col(results=[res,res1,res2,res3], stars = True, model_names=['Reg', 'Reg 1', 'Reg 2', 'Reg 3'], drop_omitted=True)

0,1,2,3,4
,Reg,Reg 1,Reg 2,Reg 3
Intercept,-4.4022,-4.4022,-6.1414,-6.1414
,(2.9867),(2.9724),(3.8382),(3.8531)
R-squared,0.2270,0.2270,0.1540,0.1540
R-squared Adj.,0.2256,0.2256,0.1525,0.1525
actlim,0.3574***,0.3574***,0.7421***,0.7421***
,(0.0468),(0.0455),(0.0601),(0.0636)
age,0.2764***,0.2764***,0.3174***,0.3174***
,(0.0798),(0.0793),(0.1026),(0.1030)
age2,-0.0018***,-0.0018***,-0.0021***,-0.0021***


In [18]:
from linearmodels.system import SUR

formula = '{ldrugexp ~ 1 + age+age2+actlim+totchr+medicaid+private} {ltotothr ~ 1 + age+age2+educyr+actlim+totchr+private}'
res4 = SUR.from_formula(formula, data).fit(cov_type='unadjusted')
print('\n'.join(res4.summary.as_text().split('\n')[8:]))

                                        Num. Constraints:                      None
               Equation: ldrugexp, Dependent Variable: ldrugexp               
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept     -3.8913     2.9759    -1.3076     0.1910     -9.7239      1.9414
age            0.2630     0.0795     3.3074     0.0009      0.1072      0.4189
age2          -0.0017     0.0005    -3.2964     0.0010     -0.0028     -0.0007
actlim         0.3547     0.0466     7.6079     0.0000      0.2633      0.4460
totchr         0.4005     0.0161     24.810     0.0000      0.3689      0.4322
medicaid       0.1068     0.0592     1.8028     0.0714     -0.0093      0.2229
private        0.0810     0.0436     1.8598     0.0629     -0.0044      0.1664
               Equation: ltotothr, Dependent Variable: ltotothr               
            Parameter  Std. Err.     T-stat    

  if is_categorical(s):
Inputs contain missing values. Dropping rows with missing observations.


In [19]:
res5 = SUR.from_formula(formula, data).fit(cov_type='robust')
print('\n'.join(res5.summary.as_text().split('\n')[8:]))

                                        Num. Constraints:                      None
               Equation: ldrugexp, Dependent Variable: ldrugexp               
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept     -3.8913     2.6290    -1.4802     0.1388     -9.0439      1.2614
age            0.2630     0.0702     3.7478     0.0002      0.1255      0.4006
age2          -0.0017     0.0005    -3.7387     0.0002     -0.0027     -0.0008
actlim         0.3547     0.0400     8.8715     0.0000      0.2763      0.4330
totchr         0.4005     0.0144     27.828     0.0000      0.3723      0.4287
medicaid       0.1068     0.0539     1.9798     0.0477      0.0011      0.2125
private        0.0810     0.0390     2.0788     0.0376      0.0046      0.1574
               Equation: ltotothr, Dependent Variable: ltotothr               
            Parameter  Std. Err.     T-stat    