# An inferential problem: The Gender Wage Gap

In [1]:
import pandas as pd
import numpy as np
import pyreadr

In [2]:
rdata_read = pyreadr.read_r("../data/wage2015_subsample_inference.Rdata")

In [3]:
# Which elements are inside this object
print( rdata_read.keys() )

odict_keys(['data'])


In [4]:
# Extracting the data frame from rdata_read
data = rdata_read[ 'data' ]

In [5]:
Z = data[ ["lwage","sex","shs","hsg","scl","clg","ad","ne","mw","so","we","exp1"] ]

data_female = data[data[ 'sex' ] == 1 ]
Z_female = data_female[ ["lwage","sex","shs","hsg","scl","clg","ad","ne","mw","so","we","exp1"] ]

data_male = data[ data[ 'sex' ] == 0 ]
Z_male = data_male[ [ "lwage","sex","shs","hsg","scl","clg","ad","ne","mw","so","we","exp1" ] ]


## Sumarize mean values

In [6]:
table = np.zeros( (12, 3) )

In [7]:
table[:, 0] = Z.mean().values
table[:, 1] = Z_male.mean().values
table[:, 2] = Z_female.mean().values

In [8]:
table_pandas = pd.DataFrame( table, columns = [ 'All', 'Men', 'Women'])
table_pandas.index = ["Log Wage","Sex","Less then High School","High School Graduate","Some College","Gollage Graduate","Advanced Degree", "Northeast","Midwest","South","West","Experience"]
table_pandas

Unnamed: 0,All,Men,Women
Log Wage,2.970787,2.98783,2.949485
Sex,0.444466,0.0,1.0
Less then High School,0.023301,0.031807,0.012669
High School Graduate,0.243883,0.294303,0.180865
Some College,0.278058,0.273331,0.283967
Gollage Graduate,0.31767,0.293953,0.347313
Advanced Degree,0.137087,0.106606,0.175186
Northeast,0.227767,0.22195,0.235037
Midwest,0.259612,0.259,0.260376
South,0.296505,0.298148,0.294452


In [9]:
data_female['lwage'].mean() - data_male['lwage'].mean()

-0.03834473367441493

In [10]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

## Nocontrol Model

In [11]:
nocontrol_model = smf.ols( formula = 'lwage ~ sex', data = data )

In [12]:
nocontrol_est = nocontrol_model.fit().summary2().tables[1]['Coef.']['sex']

In [13]:
# Principal covariables

lwage = data[[ 'lwage' ]].values
sex = data[[ 'sex' ]].values
X = sm.add_constant( sex )

In [14]:
# In Math

xtx_inv = np.linalg.inv(np.transpose( X ).dot( X ))

HCV_coefs = xtx_inv.dot(np.transpose(X)).dot(np.diag(nocontrol_model.fit().resid**2)).dot(X).dot(xtx_inv)

nocontrol_se = np.power( HCV_coefs.diagonal() , 0.5)[1]

In [15]:
# With built-in functions
HCV_coefs = nocontrol_model.fit().cov_HC0

nocontrol_se = np.power( HCV_coefs.diagonal() , 0.5)[1]

In [16]:
print( f'The estimated gender coefficient is {nocontrol_est} and the corresponding robust standard error is {nocontrol_se}' )

The estimated gender coefficient is -0.03834473367441481 and the corresponding robust standard error is 0.01590193507909572


## Control Model

In [17]:
flex = 'lwage ~ sex + (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we)'

control_model = smf.ols( formula = flex, data = data )

control_est = control_model.fit().summary2().tables[1]['Coef.']['sex']

print( f"Coefficient for OLS with controls {control_est}" )

HCV_coefs = control_model.fit().cov_HC0
control_se = np.power( HCV_coefs.diagonal() , 0.5)[1]

Coefficient for OLS with controls -0.06955320329684744


## Partialling-Out using ols

In [18]:
# models
# model for Y
flex_y = 'lwage ~  (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we)'

# model for D
flex_d = 'sex ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we)' 

In [19]:
# partialling-out the linear effect of W from Y
t_Y = smf.ols( formula = flex_y , data = data ).fit().resid

# partialling-out the linear effect of W from D
t_D = smf.ols( formula = flex_d , data = data ).fit().resid

data_res = pd.DataFrame( np.vstack(( t_Y.values , t_D.values )).T , columns = [ 't_Y', 't_D' ] )

In [20]:
# regression of Y on D after partialling-out the effect of W
partial_fit =  smf.ols( formula = 't_Y ~ t_D' , data = data_res ).fit()

In [21]:
partial_est = partial_fit.summary2().tables[1]['Coef.']['t_D']

In [22]:
print("Coefficient for D via partialling-out", partial_est)

Coefficient for D via partialling-out -0.06955320329684613


In [23]:
# standard error
HCV_coefs = partial_fit.cov_HC0
partial_se = np.power( HCV_coefs.diagonal() , 0.5)[1]

In [24]:
# confidence interval

partial_fit.conf_int( alpha=0.05 ).iloc[1, :]

0   -0.098671
1   -0.040435
Name: t_D, dtype: float64

## Partialling-Out using lasso

In [25]:
# models
# model for Y
flex_y = 'lwage ~  (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we)'

# model for D
flex_d = 'sex ~ (exp1+exp2+exp3+exp4)*(shs+hsg+scl+clg+occ2+ind2+mw+so+we)'

### With Statsmodels

In [26]:
# partialling-out the linear effect of W from Y
Y_lasso_fitted = smf.ols(formula = flex_y, data = data).fit_regularized( method='elastic_net', alpha=0.0, L1_wt=1.0 ,  start_params = None, profile_scale = False, refit = False ).fittedvalues
t_Y = lwage - Y_lasso_fitted.values.reshape( Y_lasso_fitted.values.size, 1 )


D_lasso_fitted = smf.ols( flex_d, data=data).fit_regularized( method = 'elastic_net', alpha = 0.0, L1_wt = 1.0 ,  start_params = None, profile_scale = False, refit = False ).fittedvalues
t_D = lwage - D_lasso_fitted.values.reshape( D_lasso_fitted.values.size, 1 )


In [28]:
from sklearn import linear_model

In [29]:
lasso_model = linear_model.Lasso(alpha=0.1)

### With Sklearn

In [30]:
# Flex_y

covariables_flex_y = smf.ols(formula = flex_y, data = data)

Y_lasso_fitted = lasso_model.fit( covariables_flex_y.exog, lwage ).predict( covariables_flex_y.exog )

t_Y = lwage - Y_lasso_fitted.reshape( Y_lasso_fitted.size, 1)

In [31]:
# Flex_d
covariables_flex_d = smf.ols( flex_d, data=data)

D_lasso_fitted = lasso_model.fit( covariables_flex_d.exog, lwage ).predict( covariables_flex_d.exog )

t_D = lwage - D_lasso_fitted.reshape( D_lasso_fitted.size, 1)

In [32]:
data_res = pd.DataFrame( np.hstack(( t_Y , t_D )) , columns = [ 't_Y', 't_D' ] )

In [33]:
# regression of Y on D after partialling-out the effect of W
partial_lasso_fit = smf.ols( formula = 't_Y ~ t_D' , data = data_res ).fit()
partial_lasso_est = partial_fit.summary2().tables[1]['Coef.']['t_D']

In [34]:
print( f"Coefficient for D via partialling-out using lasso, {partial_lasso_est}")

Coefficient for D via partialling-out using lasso, -0.06955320329684613


In [35]:
# standard error
HCV_coefs = partial_lasso_fit.cov_HC0
partial_lasso_se = np.power( HCV_coefs.diagonal() , 0.5)[1]

## Summarize the results

In [36]:
table2 = np.zeros( (4, 2) )

In [37]:
table2[0,0] = nocontrol_est  
table2[0,1] = nocontrol_se   
table2[1,0] = control_est
table2[1,1] = control_se    
table2[2,0] = partial_est  
table2[2,1] = partial_se  
table2[3,0] =  partial_lasso_est
table2[3,1] = partial_lasso_se 

In [38]:
table2_pandas = pd.DataFrame( table2, columns = [ "Estimate","Std. Error" ])
table2_pandas.index = [ "Without controls", "full reg", "partial reg", "partial reg via lasso" ]
table2_pandas

Unnamed: 0,Estimate,Std. Error
Without controls,-0.038345,0.01590194
full reg,-0.069553,0.144608
partial reg,-0.069553,0.01500047
partial reg via lasso,-0.069553,1.429019e-17


## "Extra" flexible model

In [39]:
extraflex = 'lwage ~ sex + (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+occ2+ind2+mw+so+we)**2'

control_fit = smf.ols( formula = extraflex, data=data).fit()



In [40]:
#summary( control_fit )
control_est = control_fit.summary2().tables[1]['Coef.']['sex']

In [41]:
print( f"Number of Extra-Flex Controls {control_fit.summary2().tables[1].shape[0]-1} \nCoefficient for OLS with extra flex controls {control_est}" )

Number of Extra-Flex Controls 979 
Coefficient for OLS with extra flex controls -0.061270463794010514


In [42]:
# standard error
HCV_coefs = control_fit.cov_HC0

In [43]:
n= len(data[ 'wage' ])

In [44]:
p = len(control_fit.summary2().tables[1]['Coef.'])

In [45]:
control_se = np.power( HCV_coefs.diagonal() , 0.5)[1]*n/(n-p)

## Laso "Extra" Flexible model

In [46]:
# models
# model for Y
extraflex_y = 'lwage ~  (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+occ2+ind2+mw+so+we)**2'

# model for 
extraflex_d = 'sex ~ (exp1+exp2+exp3+exp4+shs+hsg+scl+clg+occ2+ind2+mw+so+we)**2'

### With SKlearn

In [47]:
# extraflex_y
lasso_model = linear_model.Lasso(alpha=0.1)

extraflex_y_covariables = smf.ols(formula = extraflex_y, data = data)

Y_lasso_fitted = lasso_model.fit( extraflex_y_covariables.exog, lwage ).predict( extraflex_y_covariables.exog )

t_Y = lwage - Y_lasso_fitted.reshape( Y_lasso_fitted.size, 1)

# extraflex_d
extraflex_d_covariables = smf.ols( extraflex_d, data=data)

D_lasso_fitted = lasso_model.fit( extraflex_d_covariables.exog, lwage ).predict( extraflex_d_covariables.exog )

t_D = lwage - D_lasso_fitted.reshape( D_lasso_fitted.size, 1)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [48]:
data_res = pd.DataFrame( np.hstack(( t_Y , t_D )) , columns = [ 't_Y', 't_D' ] )

In [49]:
# regression of Y on D after partialling-out the effect of W
partial_lasso_fit = smf.ols( formula = 't_Y ~ t_D' , data = data_res ).fit()
partial_lasso_est = partial_fit.summary2().tables[1]['Coef.']['t_D']

In [50]:
print( f"Coefficient for D via partialling-out using lasso {partial_lasso_est}" )

Coefficient for D via partialling-out using lasso -0.06955320329684613


In [51]:
# standard error
HCV_coefs = partial_lasso_fit.cov_HC0
partial_lasso_se = np.power( HCV_coefs.diagonal() , 0.5)[1]

In [52]:
partial_lasso_se

2.7384487301008737e-17

## Summarize the results

In [53]:
table3 = np.zeros( ( 2, 2 ) )

In [54]:
table3[0,0] = control_est
table3[0,1] = control_se    
table3[1,0] =  partial_lasso_est
table3[1,1] = partial_lasso_se 

In [55]:
table3_pandas = pd.DataFrame( table3, columns = [ "Estimate","Std. Error" ])
table3_pandas.index = [ "full reg","partial reg via lasso" ]
table3_pandas.round(8)

Unnamed: 0,Estimate,Std. Error
full reg,-0.06127,0.212402
partial reg via lasso,-0.069553,0.0
