# Lasso en inferencia causal

En este taller usaremos el estimador LASSO como apoyo para la inferencia causal.

In [None]:
! pip install hdmpy statsmodels pandas numpy scikit-learn pip linearmodels

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.base import BaseEstimator

In [None]:
import hdmpy as hd

# We wrap the package so that it has the familiar sklearn API
class RLasso(BaseEstimator):
    """Thin scikit-learn style wrapper around ``rlasso`` from the ``hdmpy`` package.

    Parameters
    ----------
    post : bool, default=True
        Forwarded unchanged as the ``post`` argument of ``rlasso``.

    Attributes
    ----------
    rlasso_ : object
        The object returned by ``rlasso``; set by :meth:`fit`.
    """

    def __init__(self, *, post=True):
        self.post = post

    def fit(self, X, y):
        """Fit ``rlasso`` on ``(X, y)``, store the result and return ``self``."""
        # The package is imported under the alias ``hd``; the bare name
        # ``hdmpy`` is never bound in this file, so it must not be used here.
        self.rlasso_ = hd.rlasso(X, y, post=self.post)
        return self

    def _beta(self):
        """Return the fitted slope coefficients as a flat 1-D array."""
        return np.array(self.rlasso_.est['beta']).flatten()

    def predict(self, X):
        """Return ``X @ beta + intercept`` using the stored fit."""
        pred = np.array(X) @ self._beta()
        pred += np.array(self.rlasso_.est['intercept'])
        return pred

    def nsel(self):
        """Return the number of non-zero fitted coefficients."""
        # Take the absolute value *before* comparing, so that negative
        # coefficients are counted as selected too.
        return int(np.sum(np.abs(self._beta()) > 0))


def lasso_model():
    """Build a fresh ``RLasso`` estimator configured with ``post=False``."""
    estimator = RLasso(post=False)
    return estimator

In [None]:
# Read the tab-delimited text file into a DataFrame.
df = pd.read_csv('acemoglu_col_notext.txt', delimiter='\t')

# Quick sanity check: preview the leading rows, then report the shape.
print(df.head())
print(f"Shape: {df.shape}")

In [None]:
# Variable construction and transformations

latitude = df['Latitude']

# Log of the mortality column, then the square and cube of latitude.
df['lnmort'] = np.log(df['Mort'])
for power in (2, 3):
    df[f'lat{power}'] = latitude ** power

# For each knot: the amount by which latitude exceeds the knot (zero when it
# does not exceed it), followed by the square and cube of that term.
for suffix, knot in (('08', 0.08), ('16', 0.16), ('24', 0.24)):
    excess = latitude - knot
    hinge = excess * (excess > 0)
    df[f'lat_c{suffix}'] = hinge
    df[f'lat2_c{suffix}'] = hinge ** 2
    df[f'lat3_c{suffix}'] = hinge ** 3


In [None]:
# Re-inspect the DataFrame now that the derived columns have been added.
print(df.head())  # Display first few rows after transformations
print(f"Shape: {df.shape}")  # Show dimensions after transformations

In [None]:
# Baseline IV regression
# GDP (dependent) on Exprop (endogenous), instrumented by lnmort,
# with an intercept and Latitude as exogenous regressors.

from linearmodels import IV2SLS

# Define the IV regression
# Formula layout used below: dependent ~ exogenous + [endogenous ~ instruments]
iv_model = IV2SLS.from_formula('GDP ~ 1 + Latitude+[Exprop ~ lnmort]', data=df)
iv_results = iv_model.fit(cov_type='heteroskedastic')  # request the 'heteroskedastic' covariance estimator

print("Instrumental Variables Regression Results:")
print(iv_results)


In [None]:
# First stage

# Regress Exprop on an intercept, Latitude and lnmort.
y = df['Exprop']
X = sm.add_constant(df[['Latitude', 'lnmort']])

# OLS fit with the 'HC1' covariance type
ols_model = sm.OLS(y, X).fit(cov_type='HC1')

# Header lines, one per row, followed by the full regression table.
print("Linear Regression Results:", "Exprop ~ Latitude + lnmort", "="*40, sep="\n")
print(ols_model.summary())

In [None]:
# Add a constant column to the whole DataFrame; later cells select it by the
# name 'const' when building regressor matrices.
from statsmodels.api import add_constant
df=add_constant(df)

In [None]:
print(df.head())  # Display first few rows after add_constant was applied

In [None]:
# Including all the controls

# Full control set (equivalent to a Stata local macro), with the intercept
# column name placed first.
controls = [
    "const",
    "Africa", "Asia", "Namer", "Samer",
    "Latitude", "lat2", "lat3",
    "lat_c08", "lat2_c08", "lat3_c08",
    "lat_c16", "lat2_c16", "lat3_c16",
    "lat_c24", "lat2_c24", "lat3_c24",
]
print(controls)


In [None]:

# IV regression with the full control set: GDP on Exprop, instrumented by lnmort.
iv_full = IV2SLS(
    dependent=df["GDP"],
    exog=df[controls],
    endog=df["Exprop"],
    instruments=df["lnmort"],
)
iv_full_results = iv_full.fit(cov_type='heteroskedastic')
print(iv_full_results)

In [None]:
# First stage
# Regressors: the instrument followed by every entry of `controls`.
z = ['lnmort', *controls]
X = df[z]
y = df['Exprop']

# OLS fit with the 'HC1' covariance type
ols_model = sm.OLS(y, X).fit(cov_type='HC1')

# Header lines, one per row, followed by the full regression table.
print("Linear Regression Results:", "Exprop ~ Controls", "="*40, sep="\n")
print(ols_model.summary())

In [None]:
# Control selection with Lasso

## Equation 1: GDP per capita ~ controls
y_gdp = df['GDP']
X_gdp = df[controls[1:]]  # skip the first entry of `controls` ('const')

# rlasso fit for the GDP equation, with post=False
lasso_gdp = hd.rlasso(X_gdp, y_gdp, post=False)



In [None]:

## Equation 2: Exprop ~ controls
y_exprop = df['Exprop']
X_exprop = df[controls[1:]]  # same regressors as the GDP equation

# rlasso fit for the Exprop equation, with post=False
lasso_exprop = hd.rlasso(X_exprop, y_exprop, post=False)



In [None]:

## Equation 3: lnmort (the instrument) ~ controls
y_inst = df['lnmort']
X_inst = df[controls[1:]]  # controls only; lnmort is the dependent variable here

# rlasso fit for the instrument equation, with post=False
lasso_inst = hd.rlasso(X_inst, y_inst, post=False)


In [None]:
# Print only NON-ZERO coefficients for all three equations

# Magnitude below which a coefficient is treated as zero.
_SELECTION_TOL = 1e-10


def _flat_coefs(fit):
    """Return ``fit.est['beta']`` as a flat 1-D NumPy array."""
    return np.asarray(fit.est['beta']).ravel()


def _scalar_intercept(fit):
    """Return ``fit.est['intercept']`` as a plain float.

    The first element is taken whether the value is stored as a pandas
    object, a NumPy array or a bare scalar, so it can always be formatted.
    """
    return float(np.asarray(fit.est['intercept']).ravel()[0])


def _count_selected(coefs):
    """Return how many entries of ``coefs`` exceed the tolerance in magnitude."""
    return int(np.sum(np.abs(coefs) > _SELECTION_TOL))


def _report_selection(heading, fit, var_names):
    """Print the non-zero coefficients (and intercept) of one lasso fit.

    Parameters
    ----------
    heading : str
        Line printed above the coefficients (after one blank line).
    fit : object
        Fit result exposing an ``est`` mapping with 'beta' and 'intercept'.
    var_names : sequence of str
        Regressor names, in the column order the model was fitted with.

    Returns
    -------
    numpy.ndarray
        The flat coefficient array of ``fit``.
    """
    print(f"\n{heading}")
    coefs = _flat_coefs(fit)
    for var, coef in zip(var_names, coefs):
        if abs(coef) > _SELECTION_TOL:  # Only show truly non-zero variables
            print(f"  {var}: {coef:.6f}")
    intercept = _scalar_intercept(fit)
    if abs(intercept) > _SELECTION_TOL:
        print(f"  Intercept: {intercept:.6f}")
    return coefs


print("\n" + "="*60)
print("NON-ZERO COEFFICIENTS FROM DOUBLE LASSO")
print("="*60)

# All three lasso fits were run on df[controls[1:]], so they share one list of
# regressor names. In the instrument equation lnmort is the dependent
# variable, not a regressor: putting it in the label list would shift every
# label by one position and inflate the variable count.
gdp_vars = controls[1:]
inst_vars = controls[1:]

gdp_coefs_values = _report_selection(
    "1. GDP Lasso - Selected Variables:", lasso_gdp, gdp_vars)
exprop_coefs_values = _report_selection(
    "2. Exprop Lasso - Selected Variables:", lasso_exprop, gdp_vars)
inst_coefs_values = _report_selection(
    "3. Instrument Lasso - Selected Variables:", lasso_inst, inst_vars)

# Summary
print("\n" + "-"*40)
print("SUMMARY:")
gdp_selected = _count_selected(gdp_coefs_values)
exprop_selected = _count_selected(exprop_coefs_values)
inst_selected = _count_selected(inst_coefs_values)

print(f"GDP equation: {gdp_selected}/{len(gdp_vars)} variables selected")
print(f"Exprop equation: {exprop_selected}/{len(gdp_vars)} variables selected")
print(f"Instrument equation: {inst_selected}/{len(inst_vars)} variables selected")

In [None]:
# IV2SLS with the selected control: an intercept and Africa as exogenous
# regressors, Exprop instrumented by lnmort.
iv_lasso = IV2SLS(
    dependent=df["GDP"],
    exog=df[["const", "Africa"]],
    endog=df["Exprop"],
    instruments=df["lnmort"],
)
iv_lasso_results = iv_lasso.fit(cov_type='heteroskedastic')
print(iv_lasso_results)


In [None]:
# First stage

# Regress Exprop on the intercept, the selected control and the instrument.
selected_regressors = ['const', 'Africa', 'lnmort']
X = df[selected_regressors]
y = df['Exprop']

# OLS fit with the 'HC1' covariance type
ols_model = sm.OLS(y, X).fit(cov_type='HC1')

# Header lines, one per row, followed by the full regression table.
print("Linear Regression Results:", "Exprop ~ Selected Controls", sep="\n")
print(ols_model.summary())