In [None]:
pip install linearmodels

Collecting linearmodels
  Downloading linearmodels-4.24-cp37-cp37m-manylinux1_x86_64.whl (1.5 MB)
[?25l[K     |▎                               | 10 kB 27.0 MB/s eta 0:00:01[K     |▌                               | 20 kB 26.5 MB/s eta 0:00:01[K     |▊                               | 30 kB 30.3 MB/s eta 0:00:01[K     |█                               | 40 kB 32.6 MB/s eta 0:00:01[K     |█▏                              | 51 kB 35.2 MB/s eta 0:00:01[K     |█▍                              | 61 kB 36.8 MB/s eta 0:00:01[K     |█▋                              | 71 kB 30.6 MB/s eta 0:00:01[K     |█▉                              | 81 kB 31.0 MB/s eta 0:00:01[K     |██                              | 92 kB 32.3 MB/s eta 0:00:01[K     |██▎                             | 102 kB 30.3 MB/s eta 0:00:01[K     |██▌                             | 112 kB 30.3 MB/s eta 0:00:01[K     |██▊                             | 122 kB 30.3 MB/s eta 0:00:01[K     |███                            

# Welcome
This is material for the **Panel Data** chapter in Scott Cunningham's book, [Causal Inference: The Mixtape](https://mixtape.scunning.com/).

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from linearmodels import PanelOLS, PooledOLS
from IPython.display import display, HTML
def pretty_print(df):
    """Display `df` as an HTML table, turning backslash-n sequences into line breaks.

    The frame is converted with `to_html()`; every two-character sequence
    backslash + "n" found in that markup is replaced by a `<br>` tag before
    the markup is handed to IPython's `display`.
    """
    markup = df.to_html()
    markup_with_breaks = markup.replace("\\n", "<br>")
    return display(HTML(markup_with_breaks))

# Prepare data

In [18]:
# Display label for every regressor, keyed by its column name.
labdict = {
    # regressors listed before the provider block (the time-variant set, see tvars)
    "unsafe": "Unprotected sex with client of any kind",
    "llength": "Ln(Length)",
    "reg": "Client was a Regular",
    "age_cl": "Age of Client",
    "asq_cl": "Age of Client Squared",
    "appearance_cl": "Client Attractiveness (Scale of 1 to 10)",
    "provider_second": "Second Provider Involved",
    "asian_cl": "Asian Client",
    "black_cl": "Black Client",
    "hispanic_cl": "Hispanic Client",
    "othrace_cl": "Other Ethnicity Client",
    "hot": "Met Client in Hotel",
    "massage_cl": "Gave Client a Massage",
    # regressors that also appear in tfixedvars (the time-invariant set)
    "age": "Age of provider",
    "asq": "Age of provider squared",
    "bmi": "Body Mass Index",
    "hispanic": "Hispanic",
    "black": "Black",
    "other": "Other",
    "asian": "Asian",
    "schooling": "Imputed Years of Schooling",
    "cohab": "Cohabitating (living with a partner) but unmarried",
    "married": "Currently married and living with your spouse",
    "divorced": "Divorced and not remarried",
    "separated": "Married but not currently living with your spouse",
}

# All regressors, in the order they are declared in labdict.
regvars = [*labdict]

# Regressors treated as time-invariant.
tfixedvars = [
    "age", "asq", "bmi",
    "hispanic", "black", "other", "asian",
    "schooling",
    "cohab", "married", "divorced", "separated",
]

# Time-variant regressors: everything in regvars that is not time-invariant (order kept).
tvars = list(filter(lambda name: name not in tfixedvars, regvars))

# Dependent variable (kept as a one-element list so it can be concatenated with regvars).
depvar = ["lnw"]

In [None]:
# Load data:
df=pd.read_stata("https://raw.github.com/scunning1975/mixtape/master/sasp_panel.dta")
# Drop every row that has a missing value in any column:
df=df.dropna()
# Keep balanced data (4 observations for each id).
# .copy() makes the filtered frame independent of the one it was sliced from, so the
# column assignment below does not raise pandas' SettingWithCopyWarning.
df=df[df.groupby('id')['session'].transform('count')==4].copy()
# Transform indicator: recode the '1. No' / '2. Yes' labels as integers 0 / 1.
# Bracket assignment is used because attribute-style assignment only works for
# columns that already exist and is easy to confuse with setting a plain attribute.
df['provider_second']=df['provider_second'].map({'1. No':0,'2. Yes':1}).astype('int64')

#Generate demeaned dataframe: each regressor and the dependent variable minus its per-id mean
dmdf=df[['id','session']].copy()
dmdf[regvars+depvar]=df[regvars+depvar]-df.groupby('id')[regvars+depvar].transform('mean')

#Declare panel setup: (id, session) becomes the row index of both frames
df=df.set_index(['id','session'])
dmdf=dmdf.set_index(['id','session'])

# Regressions

## POLS

In [33]:
# Pooled OLS of lnw on every regressor in regvars plus a constant column.
# Columns are renamed to their display labels; the covariance is clustered by entity.
reg1 = PooledOLS(
    df['lnw'],
    df[regvars].rename(columns=labdict).assign(const=1),
).fit(cov_type='clustered', cluster_entity=True)
reg1

0,1,2,3
Dep. Variable:,lnw,R-squared:,0.3026
Estimator:,PooledOLS,R-squared (Between):,0.2299
No. Observations:,1028,R-squared (Within):,0.4398
Date:,"Thu, Nov 18 2021",R-squared (Overall):,0.3026
Time:,11:56:04,Log-likelihood,-570.00
Cov. Estimator:,Clustered,,
,,F-statistic:,17.394
Entities:,257,P-value,0.0000
Avg Obs:,4.0000,Distribution:,"F(25,1002)"
Min Obs:,4.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Unprotected sex with client of any kind,0.0134,0.0424,0.3163,0.7519,-0.0698,0.0966
Ln(Length),-0.3083,0.0408,-7.5467,0.0000,-0.3884,-0.2281
Client was a Regular,-0.0470,0.0332,-1.4145,0.1575,-0.1122,0.0182
Age of Client,-0.0013,0.0098,-0.1364,0.8915,-0.0205,0.0178
Age of Client Squared,4.401e-05,0.0001,0.4391,0.6607,-0.0002,0.0002
Client Attractiveness (Scale of 1 to 10),0.0200,0.0096,2.0774,0.0380,0.0011,0.0389
Second Provider Involved,0.0554,0.0753,0.7355,0.4622,-0.0924,0.2031
Asian Client,-0.0135,0.0522,-0.2592,0.7955,-0.1160,0.0889
Black Client,0.0919,0.0689,1.3342,0.1824,-0.0433,0.2270


## Fixed Effects

In [32]:
# PanelOLS with entity effects on the time-variant regressors only,
# with a debiased covariance clustered by entity.
reg2 = PanelOLS(
    df['lnw'],
    df[tvars].rename(columns=labdict),
    entity_effects=True,
).fit(cov_type='clustered', cluster_entity=True, debiased=True)
reg2

0,1,2,3
Dep. Variable:,lnw,R-squared:,0.5160
Estimator:,PanelOLS,R-squared (Between):,-0.7005
No. Observations:,1028,R-squared (Within):,0.5160
Date:,"Thu, Nov 18 2021",R-squared (Overall):,-0.6970
Time:,11:55:58,Log-likelihood,162.25
Cov. Estimator:,Clustered,,
,,F-statistic:,62.153
Entities:,257,P-value,0.0000
Avg Obs:,4.0000,Distribution:,"F(13,758)"
Min Obs:,4.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Unprotected sex with client of any kind,0.0510,0.0282,1.8079,0.0710,-0.0044,0.1064
Ln(Length),-0.4345,0.0243,-17.899,0.0000,-0.4822,-0.3869
Client was a Regular,-0.0373,0.0187,-1.9943,0.0465,-0.0741,-0.0006
Age of Client,0.0023,0.0068,0.3409,0.7333,-0.0110,0.0156
Age of Client Squared,-1.47e-05,7.536e-05,-0.1951,0.8454,-0.0002,0.0001
Client Attractiveness (Scale of 1 to 10),0.0056,0.0058,0.9730,0.3309,-0.0057,0.0170
Second Provider Involved,0.1131,0.0603,1.8768,0.0609,-0.0052,0.2315
Asian Client,-0.0099,0.0338,-0.2924,0.7701,-0.0762,0.0564
Black Client,0.0265,0.0423,0.6273,0.5307,-0.0564,0.1095


## Demeaned OLS

In [31]:
# Pooled OLS on the demeaned frame: demeaned lnw on the demeaned time-variant
# regressors plus a constant column, with a debiased covariance clustered by entity.
reg3 = PooledOLS(
    dmdf['lnw'],
    dmdf[tvars].rename(columns=labdict).assign(const=1),
).fit(cov_type='clustered', cluster_entity=True, debiased=True)
reg3.summary

0,1,2,3
Dep. Variable:,lnw,R-squared:,0.5160
Estimator:,PooledOLS,R-squared (Between):,-0.1863
No. Observations:,1028,R-squared (Within):,0.5160
Date:,"Thu, Nov 18 2021",R-squared (Overall):,0.5160
Time:,11:55:49,Log-likelihood,162.25
Cov. Estimator:,Clustered,,
,,F-statistic:,83.143
Entities:,257,P-value,0.0000
Avg Obs:,4.0000,Distribution:,"F(13,1014)"
Min Obs:,4.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Unprotected sex with client of any kind,0.0510,0.0282,1.8070,0.0711,-0.0044,0.1065
Ln(Length),-0.4345,0.0243,-17.890,0.0000,-0.4822,-0.3868
Client was a Regular,-0.0373,0.0187,-1.9933,0.0465,-0.0741,-0.0006
Age of Client,0.0023,0.0068,0.3407,0.7334,-0.0110,0.0156
Age of Client Squared,-1.47e-05,7.539e-05,-0.1950,0.8454,-0.0002,0.0001
Client Attractiveness (Scale of 1 to 10),0.0056,0.0058,0.9725,0.3310,-0.0057,0.0170
Second Provider Involved,0.1131,0.0603,1.8759,0.0610,-0.0052,0.2315
Asian Client,-0.0099,0.0338,-0.2923,0.7701,-0.0762,0.0564
Black Client,0.0265,0.0423,0.6270,0.5308,-0.0565,0.1095


## Put them together

In [118]:
def readreg(res,sig_levels=(0.01,0.05,0.1),bnum=3,senum=3):
  """
  Function to extract information from regression results

  res: fitted result whose summary.tables[0] holds the header statistics and
       summary.tables[1] the coefficient table (e.g. a
       linearmodels.panel.results.PanelResults object)
  sig_levels: significance levels (iterable of floats); one "*" is appended for
              every level that the p-value does not exceed
  bnum: number of digits parameters are rounded to
  senum: number of digits standard errors are rounded to

  Returns a DataFrame indexed by variable name with a single column 'out'.
  Each cell holds the rounded estimate, its stars, a newline and the rounded
  standard error in parentheses; the last row is 'No. Observations'.

  Raises KeyError if the header table has no 'No. Observations:' entry.
  """
  # Materialise the levels once so that any iterable (including a generator) is accepted
  levels=sorted(sig_levels)
  # Extract stats: flatten the header table row by row, then pair each label
  # with the cell that follows it
  cells=[str(cell).strip() for row in res.summary.tables[0] for cell in row]
  stats=dict(zip(cells[0::2],cells[1::2]))
  # Extract estimates: the first row of the coefficient table is its header
  T=pd.DataFrame(res.summary.tables[1])
  T.columns=['var','b','se','tstat','pval','low','up']
  T=T[1:].astype('string')
  T[['b','se','pval']]=T[['b','se','pval']].astype('float')
  # Create stars: one per significance level with pval <= level
  T['star']=T.pval.map(lambda p: "*"*sum(1 for level in levels if p<=level))
  T['out']=T.b.map(lambda x: str(round(x,bnum)))+T.star +"\n("+T.se.map(lambda x: str(round(x,senum)))+")"
  # Add stats. DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame.
  nobs=pd.DataFrame({'var': ['No. Observations'],'out':[stats['No. Observations:']]})
  T=pd.concat([T,nobs])
  return T.set_index('var')[['out']]


In [119]:
# One formatted column per fitted model, aligned on variable name;
# variables a model does not include are shown as '(-)'.
res = pd.concat(list(map(readreg, (reg1, reg2, reg3))), axis=1)
res = res.fillna('(-)')
res.columns = ['POLS', 'FE', 'Demeaned FE']
pretty_print(res)

Unnamed: 0,POLS,FE,Demeaned FE
Unprotected sex with client of any kind,0.013 (0.042),0.051* (0.028),0.051* (0.028)
Ln(Length),-0.308*** (0.041),-0.434*** (0.024),-0.434*** (0.024)
Client was a Regular,-0.047 (0.033),-0.037** (0.019),-0.037** (0.019)
Age of Client,-0.001 (0.01),0.002 (0.007),0.002 (0.007)
Age of Client Squared,0.0 (0.0),-0.0 (0.0),-0.0 (0.0)
Client Attractiveness (Scale of 1 to 10),0.02** (0.01),0.006 (0.006),0.006 (0.006)
Second Provider Involved,0.055 (0.075),0.113* (0.06),0.113* (0.06)
Asian Client,-0.013 (0.052),-0.01 (0.034),-0.01 (0.034)
Black Client,0.092 (0.069),0.026 (0.042),0.026 (0.042)
Hispanic Client,0.052 (0.073),-0.062 (0.052),-0.062 (0.052)


# QUESTIONS
- Interpret the effect of the natural log of session length on the natural log of hourly wage. What economic theory might explain this relationship? (HINT: Consider the role that a supplier's fixed versus variable costs may have on the hourly wage.)
- Becker described discrimination as taste-based. This meant that social interactions with people of the other race were factored into marginal cost. Given that these persist, what does this imply about the effect that competition is having on discrimination?
- Hamermesh and Biddle suggest that beauty is valued on the market. What are some reasons why there is no effect of client beauty once we use the within estimators?
- What other interesting results did you find in this analysis? Which ones surprised you and which ones were intuitive and why?