In [4]:

# Install statsmodels and linearmodels with pip, invoked through the Python
# interpreter running this kernel (sys.executable) so the packages land in
# the kernel's own environment.
import sys
!{sys.executable} -m pip install statsmodels linearmodels
# Optional (left disabled): upgrade pip itself in the same environment.
#!{sys.executable} -m pip install --upgrade pip

Collecting linearmodels
[?25l  Downloading https://files.pythonhosted.org/packages/0c/e5/cbf041eb2606735e8b8305460dbcda1e8873205797eebf9d3ef87683d543/linearmodels-4.9-py2.py3-none-any.whl (940kB)
[K    100% |████████████████████████████████| 942kB 2.6MB/s 
Installing collected packages: linearmodels
Successfully installed linearmodels-4.9


https://github.com/bashtage/linearmodels

In [5]:
import numpy as np
from linearmodels import PanelOLS
from statsmodels.datasets import grunfeld

# Load the Grunfeld data, cast `year` to int64, and index the frame by
# (firm, year) so it carries an entity-time MultiIndex.
data = (
    grunfeld.load_pandas().data
    .assign(year=lambda frame: frame['year'].astype(np.int64))
    .set_index(['firm', 'year'])
)

# Regress invest on value and capital with entity effects, then fit using a
# covariance estimator clustered by entity.
mod = PanelOLS(
    dependent=data['invest'],
    exog=data[['value', 'capital']],
    entity_effects=True,
)
res = mod.fit(cov_type='clustered', cluster_entity=True)

In [7]:
# Preview the first ten rows instead of rendering the entire panel.
data.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,invest,value,capital
firm,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
b'General Motors',1935,317.600,3078.500,2.800
b'General Motors',1936,391.800,4661.700,52.600
b'General Motors',1937,410.600,5387.100,156.900
b'General Motors',1938,257.700,2792.200,209.200
b'General Motors',1939,330.800,4313.200,203.400
b'General Motors',1940,461.200,4643.900,207.200
b'General Motors',1941,512.000,4551.200,255.200
b'General Motors',1942,448.000,3244.100,303.700
b'General Motors',1943,499.600,4053.700,264.100
b'General Motors',1944,547.500,4379.300,201.600


The formula interface for PanelOLS supports the special values EntityEffects and TimeEffects which add entity (fixed) and time effects, respectively.

In [8]:
from linearmodels import PanelOLS

# Same regression expressed as a formula; the EntityEffects term requests
# entity effects.
mod = PanelOLS.from_formula(
    formula='invest ~ value + capital + EntityEffects',
    data=data,
)
res = mod.fit(cov_type='clustered', cluster_entity=True)

In [9]:
# Display the fitted result: a summary table followed by the parameter table.
res

0,1,2,3
Dep. Variable:,invest,R-squared:,0.7667
Estimator:,PanelOLS,R-squared (Between):,0.8223
No. Observations:,220,R-squared (Within):,0.7667
Date:,"Tue, Sep 04 2018",R-squared (Overall):,0.8132
Time:,20:38:10,Log-likelihood,-1167.4
Cov. Estimator:,Clustered,,
,,F-statistic:,340.08
Entities:,11,P-value,0.0000
Avg Obs:,20.000,Distribution:,"F(2,207)"
Min Obs:,20.000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
value,0.1101,0.0144,7.6453,0.0000,0.0817,0.1385
capital,0.3100,0.0500,6.1970,0.0000,0.2114,0.4087




The data set consists of wages and characteristics for men during the 1980s. The entity identifier is nr and the time identifier is year. This data is used extensively in Chapter 14 of Introduction to Econometrics by Jeffrey Wooldridge.

Here a MultiIndex DataFrame is used to hold the data in a format that can be understood as a panel. Before setting the index, a year Categorical is created, which facilitates making dummies.

In [3]:


import pandas as pd
from linearmodels.datasets import wage_panel

data = wage_panel.load()
# Capture year as a Categorical before the column is moved into the index,
# then re-attach it as a regular column on the (nr, year)-indexed frame.
year = pd.Categorical(data['year'])
data = data.set_index(['nr', 'year']).assign(year=year)

# Print the data set description followed by the first rows of the panel.
print(wage_panel.DESCR)
print(data.head())




F. Vella and M. Verbeek (1998), "Whose Wages Do Unions Raise? A Dynamic Model
of Unionism and Wage Rate Determination for Young Men," Journal of Applied
Econometrics 13, 163-183.

nr                       person identifier
year                     1980 to 1987
black                    =1 if black
exper                    labor market experience
hisp                     =1 if Hispanic
hours                    annual hours worked
married                  =1 if married
educ                     years of schooling
union                    =1 if in union
lwage                    log(wage)
expersq                  exper^2
occupation               Occupation code

         black  exper  hisp  hours  married  educ  union     lwage  expersq  \
nr year                                                                       
13 1980      0      1     0   2672        0    14      0  1.197540        1   
   1981      0      2     0   2320        0    14      1  1.853060        4   
   1982      0    

### PooledOLS 

PooledOLS is just plain OLS that understands various panel data structures. It is useful as a base model. Here the log wage is modeled using all of the variables and time dummies.

In [2]:

import statsmodels.api as sm
from linearmodels.panel import PooledOLS

# Regressors for the pooled model; add_constant prepends an intercept column.
exog_vars = [
    'black', 'hisp', 'exper', 'expersq',
    'married', 'educ', 'union', 'year',
]
exog = sm.add_constant(data[exog_vars])

mod = PooledOLS(dependent=data['lwage'], exog=exog)
pooled_res = mod.fit()
print(pooled_res)



  from pandas.core import datetools


                          PooledOLS Estimation Summary                          
Dep. Variable:                  lwage   R-squared:                        0.1893
Estimator:                  PooledOLS   R-squared (Between):              0.2066
No. Observations:                4360   R-squared (Within):               0.1692
Date:                Fri, Sep 07 2018   R-squared (Overall):              0.1893
Time:                        22:34:52   Log-likelihood                   -2982.0
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      72.459
Entities:                         545   P-value                           0.0000
Avg Obs:                       8.0000   Distribution:                 F(14,4345)
Min Obs:                       8.0000                                           
Max Obs:                       8.0000   F-statistic (robust):             72.459
                            

### Random effects

The random effects model is virtually identical to the pooled OLS model except that it accounts for the structure of the model and so is more efficient. Random effects uses a quasi-demeaning strategy which subtracts the time average of the within entity values to account for the common shock.

In [4]:

from linearmodels.panel import RandomEffects

# Fit a random effects model of lwage on the regressor matrix `exog`
# that is already in the namespace.
mod = RandomEffects(dependent=data['lwage'], exog=exog)
re_res = mod.fit()
print(re_res)



                        RandomEffects Estimation Summary                        
Dep. Variable:                  lwage   R-squared:                        0.1806
Estimator:              RandomEffects   R-squared (Between):              0.1853
No. Observations:                4360   R-squared (Within):               0.1799
Date:                Fri, Sep 07 2018   R-squared (Overall):              0.1828
Time:                        22:36:42   Log-likelihood                   -1622.5
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      68.409
Entities:                         545   P-value                           0.0000
Avg Obs:                       8.0000   Distribution:                 F(14,4345)
Min Obs:                       8.0000                                           
Max Obs:                       8.0000   F-statistic (robust):             68.409
                            

The quasi-differencing in the random effects estimator depends on a quantity that depends on the relative variance of the idiosyncratic shock and the common shock. This can be accessed using variance_decomposition.

In [5]:
# Show the variance decomposition of the random-effects fit
# (effects, residual, and the share due to effects).
re_res.variance_decomposition

Effects                   0.106946
Residual                  0.123324
Percent due to Effects    0.464438
Name: Variance Decomposition, dtype: float64

In [6]:
# Show the first rows of theta from the random-effects fit, indexed by nr.
re_res.theta.head()

Unnamed: 0_level_0,theta
nr,Unnamed: 1_level_1
13,0.645059
17,0.645059
18,0.645059
45,0.645059
110,0.645059


### The between estimator

The between estimator is an alternative, usually less efficient estimator, that can be used to estimate model parameters. It is particularly simple since it first computes the time averages of $y$ and $x$ and then runs a simple regression using these averages.

The year dummies are dropped since the averaging removes differences due to the year. expersq was also dropped since it is fairly co-linear with exper. These results are broadly similar to the previous models.

In [7]:
from linearmodels.panel import BetweenOLS

# Regressors for the between model (no year dummies, no expersq),
# plus an intercept column.
exog_vars = ['black', 'hisp', 'exper', 'married', 'educ', 'union']
exog = sm.add_constant(data[exog_vars])

mod = BetweenOLS(dependent=data['lwage'], exog=exog)
be_res = mod.fit()
print(be_res)

                         BetweenOLS Estimation Summary                          
Dep. Variable:                  lwage   R-squared:                        0.2155
Estimator:                 BetweenOLS   R-squared (Between):              0.2155
No. Observations:                 545   R-squared (Within):               0.1141
Date:                Fri, Sep 07 2018   R-squared (Overall):              0.1686
Time:                        22:44:55   Log-likelihood                   -194.54
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      24.633
Entities:                         545   P-value                           0.0000
Avg Obs:                       8.0000   Distribution:                   F(6,538)
Min Obs:                       8.0000                                           
Max Obs:                       8.0000   F-statistic (robust):             24.633
                            

### Fixed effects

Entity effects are included by setting entity_effects=True. This is equivalent to including dummies for each entity. In this panel, this would add 545 dummy variables and estimation of the model would be considerably slower. PanelOLS does not actually use dummy variables and instead uses group-wise demeaning to achieve the same effect.

#### Time-invariant Variables

Time-invariant variables cannot be included when using entity effects since, once demeaned, these will all be 0. Here exper is also excluded since once entity effects and time dummies are incorporated, exper is perfectly co-linear.

In [8]:
from linearmodels.panel import PanelOLS

# Regressors for the entity-effects model, with year kept as a regressor
# and an intercept column added.
exog_vars = ['expersq', 'union', 'married', 'year']
exog = sm.add_constant(data[exog_vars])

mod = PanelOLS(dependent=data['lwage'], exog=exog, entity_effects=True)
fe_res = mod.fit()
print(fe_res)


                          PanelOLS Estimation Summary                           
Dep. Variable:                  lwage   R-squared:                        0.1806
Estimator:                   PanelOLS   R-squared (Between):             -0.0052
No. Observations:                4360   R-squared (Within):               0.1806
Date:                Fri, Sep 07 2018   R-squared (Overall):              0.0807
Time:                        22:48:05   Log-likelihood                   -1324.8
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      83.851
Entities:                         545   P-value                           0.0000
Avg Obs:                       8.0000   Distribution:                 F(10,3805)
Min Obs:                       8.0000                                           
Max Obs:                       8.0000   F-statistic (robust):             83.851
                            

### Time Effects

Time effects can be added using time_effects=True. Here the time dummies are removed. Note that the core coefficients are identical. The only change is in the test statistic for poolability since now the “effects” include both entity and time, whereas before only entity effects were included.

#### Effects vs Dummies

For variables which can be consistently estimated, such as time effects in the usual large N, fixed T panel, it isn’t necessary to include these as effects. They can instead be implemented as dummy variables.

In [9]:
from linearmodels.panel import PanelOLS

# Regressors without the year column; time variation is handled through
# time_effects=True instead.
exog_vars = ['expersq', 'union', 'married']
exog = sm.add_constant(data[exog_vars])

mod = PanelOLS(
    dependent=data['lwage'],
    exog=exog,
    entity_effects=True,
    time_effects=True,
)
fe_te_res = mod.fit()
print(fe_te_res)

                          PanelOLS Estimation Summary                           
Dep. Variable:                  lwage   R-squared:                        0.0216
Estimator:                   PanelOLS   R-squared (Between):             -0.0052
No. Observations:                4360   R-squared (Within):              -0.4809
Date:                Fri, Sep 07 2018   R-squared (Overall):             -0.2253
Time:                        22:49:31   Log-likelihood                   -1324.8
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      27.959
Entities:                         545   P-value                           0.0000
Avg Obs:                       8.0000   Distribution:                  F(3,3805)
Min Obs:                       8.0000                                           
Max Obs:                       8.0000   F-statistic (robust):             27.959
                            

https://lectures.quantecon.org/py/pandas_panel.html

