# Lasso para inferencia

En este laboratorio exploramos la capacidad de doble Lasso para estimar el parámetro de interés, a través del procedimiento de *partialling-out*. Usaremos una muestra restringida de los datos de la GEIH para Barranquilla AM, a fin de generar un escenario de alta dimensionalidad, $p>n$

## Brecha salarial

Queremos responder la siguiente pregunta ¿Cuál es la diferencia de salarios entre hombres y mujeres con las mismas caraceterísticas observables?

\begin{equation*}
logy=\alpha D+\boldsymbol{\beta}'W+\epsilon
\end{equation*}

donde $Y$ es el salario por hora, $D$, es la dummy que identifica a las mujeres, y $W$ es un vector de características observables así como de regresores técnicos. En W incluimos los años de experiencia, el nivel educativo, el sector económico, la ocupación, así como todas interacciones entre las variables. 

### Cargar y definir la muestra

In [36]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
import sys
from sklearn.base import BaseEstimator
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

In [None]:

file = "https://raw.githubusercontent.com/andvarga-eco/econometrics_master_uninorte/refs/heads/main/Datos%20y%20c%C3%B3digo/geih_24_baq.csv"
df = pd.read_csv(file)
# Filter df by MES == 1
print(df.shape)

#Construir variables

df=df[(df['P6040']>24)&((df['P6040']<65))]

df['lwage']=np.log(df['INGLABO']/df['P6800'])
df['exp']=df['P6040']-16
df['exp2']=df['exp']^2
df['exp3']=df['exp']^3
df['exp4']=df['exp']^4

conditions=[
    df['P3042']<3,
    (df['P3042']>=3) & (df['P3042']<8),
    (df['P3042']>=8) & (df['P3042']<11),
    (df['P3042']>=11) & (df['P3042']<14),
]
choices=[1,2,3,4]

df['educ']=np.select(conditions,choices)
df['educ']=df['educ'].astype('category')


df['P3271']=df['P3271'].astype('category')


# Create 'sex' variable: 1 if P3271==2, 0 if P3271==1
df['sex'] = np.where(df['P3271'] == 2, 1, np.where(df['P3271'] == 1, 0, np.nan))


df['sector'] = df['RAMA2D_R4'].astype(str).str[:1].astype(int)
df['sector'].value_counts()
df['occ'] = df['OFICIO_C8'].astype(str).str[:1].astype(int)


df.describe()


(5167, 18)


Unnamed: 0,PERIODO,MES,DIRECTORIO,SECUENCIA_P,ORDEN,INGLABO,P6500,P6800,RAMA2D_R4,RAMA4D_R4,...,AREA,CLASE,lwage,exp,exp2,exp3,exp4,sex,sector,occ
count,4373.0,4373.0,4373.0,4373.0,4373.0,4329.0,4329.0,4373.0,4373.0,4373.0,...,4373.0,4373.0,4329.0,4373.0,4373.0,4373.0,4373.0,4373.0,4373.0,4373.0
mean,20240640.0,6.103362,7839283.0,1.008004,2.095815,2114515.0,2076537.0,48.241024,54.238052,5451.285159,...,8.0,1.0,10.469856,23.939172,23.980791,23.978276,24.02241,0.410016,4.987651,4.821633
std,364.3551,3.490979,127828.4,0.109811,1.538828,2186301.0,2126770.0,8.124263,23.194182,2318.375329,...,0.0,0.0,0.616292,10.219277,10.428276,10.501241,10.552247,0.491892,2.247461,2.537409
min,20240100.0,1.0,7656021.0,1.0,1.0,50000.0,50000.0,6.0,1.0,111.0,...,8.0,1.0,7.824046,9.0,8.0,8.0,8.0,0.0,1.0,1.0
25%,20240310.0,3.0,7729140.0,1.0,1.0,1300000.0,1300000.0,48.0,42.0,4210.0,...,8.0,1.0,10.206674,15.0,15.0,15.0,15.0,0.0,4.0,3.0
50%,20240620.0,6.0,7800470.0,1.0,2.0,1460000.0,1400000.0,48.0,49.0,4921.0,...,8.0,1.0,10.322746,23.0,23.0,23.0,23.0,0.0,4.0,5.0
75%,20241040.0,10.0,7981937.0,1.0,3.0,2000000.0,2000000.0,48.0,77.0,7730.0,...,8.0,1.0,10.691945,31.0,31.0,31.0,31.0,1.0,7.0,7.0
max,20241250.0,12.0,8086540.0,4.0,22.0,35000000.0,35000000.0,120.0,99.0,9900.0,...,8.0,1.0,13.520711,48.0,50.0,51.0,52.0,1.0,9.0,9.0


In [19]:
# Muestra aleatoria, n=200
df_sample = df.sample(n=200, random_state=42)  # random_state for reproducibility
print(f"Original df shape: {df.shape}")
print(f"Sample df shape: {df_sample.shape}")

Original df shape: (4373, 27)
Sample df shape: (200, 27)


### Estimación por MCO

In [40]:
# Modelo extraflexbile, p>200
model_extra=('lwage~sex+(exp+exp2+exp3+exp4+C(educ)+C(sector)+C(occ))**2')
model_mco= smf.ols(model_extra, data=df_sample).fit()
sex_est=model_mco.params['sex']
sex_se=model_mco.HC3_se['sex']
model_mcoR2=model_mco.rsquared
model_mcoMSE=model_mco.mse_resid
n_params = len(model_mco.params)

print(f"The estimated sex coefficient is {sex_est:.4f} "
      f"and the corresponding robust standard error is {sex_se:.4f}")
print(f"The model R-squared is {model_mcoR2:.4f} "
      f"and the model MSE is {model_mcoMSE:.4f}")
print(f"Number of parameters estimated: {n_params}")


The estimated sex coefficient is 0.0442 and the corresponding robust standard error is 0.2245
The model R-squared is 0.8587 and the model MSE is 0.1929
Number of parameters estimated: 203


### Estimación por Lasso: *partialling-out**

In [33]:
import hdmpy

# We wrap the package so that it has the familiar sklearn API
class RLasso(BaseEstimator):

    def __init__(self, *, post=True):
        self.post = post

    def fit(self, X, y):
        self.rlasso_ = hdmpy.rlasso(X, y, post=self.post)
        return self

    def predict(self, X):
        pred = np.array(X) @ np.array(self.rlasso_.est['beta']).flatten()
        pred += np.array(self.rlasso_.est['intercept'])
        return pred

    def nsel(self):
        return sum(abs(np.array(self.rlasso_.est['beta']).flatten() > 0))


def lasso_model():
    return RLasso(post=False)

In [28]:
#Modelos
## Y sobre W
model_yW='lwage~(exp+exp2+exp3+exp4+C(educ)+C(sector)+C(occ))**2'
X_y=smf.ols(model_yW, data=df_sample).data.exog[:,1:]
y_y=smf.ols(model_yW, data=df_sample).data.endog
## Sex sobre W
model_sexW='sex~(exp+exp2+exp3+exp4+C(educ)+C(sector)+C(occ))**2'
X_sex=smf.ols(model_sexW, data=df_sample).data.exog[:,1:]
y_sex=smf.ols(model_sexW, data=df_sample).data.endog

In [38]:
# Partialling out el efecto de W sobre Y

t_y_lasso=y_y-lasso_model().fit(X_y, y_y).predict(X_y)

# Partialling out el efecto de W sobre sex
t_sex_lasso=y_sex-lasso_model().fit(X_sex, y_sex).predict(X_sex)

# Regresión de Y sobre sex después de partialling out
partial_lasso=sm.OLS(t_y_lasso, sm.add_constant(t_sex_lasso)).fit()
partial_lasso_est = partial_lasso.params[1]

print("Coefficient for sex via partialling-out using lasso " + str(partial_lasso_est))

# standard error
partial_lasso_se = partial_lasso.HC3_se[1]
print("Standard error: " + str(partial_lasso_se))

# confidence interval
print("95% CI: " + str(partial_lasso.conf_int()[1]))



Coefficient for sex via partialling-out using lasso -0.023261750366233624
Standard error: 0.0791381193037471
95% CI: [-0.1652503  0.1187268]


In [39]:
table2 = pd.DataFrame()

table2['Model'] = ["MCO", "Lasso"]

table2['Estimate'] = [sex_est, partial_lasso_est]

table2['Std. Error'] = [sex_se, partial_lasso_se]

# Show results
table2

Unnamed: 0,Model,Estimate,Std. Error
0,MCO,0.044194,0.224535
1,Lasso,-0.023262,0.079138
