# MLAOIV: Sparse Instruments

This notebook demonstrates MLAOIV in a **sparse** instruments setting where only a few instruments are strong.

## Model

We have $d=500$ potential instruments, but only $s$ of them are relevant:
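- **First stage**: $y_1 = \frac{50}{s}\,\mu\,\pi' Z + u$ where $\pi = (\underbrace{1,...,1}_{s}, \underbrace{0,...,0}_{d-s})$ and $\mu$ is the instrument-strength parameter ($\mu = 0.1$ throughout this notebook)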
- **Structural equation**: $y_2 = \beta y_1 + e$

where $(u, e)$ are correlated ($\rho = 0.5$), creating endogeneity.

## Key Insight

In this setting, Lasso-based MLAOIV should perform well because the Lasso performs variable selection and can pick out the few relevant instruments.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
from linearmodels.iv import IV2SLS
from joblib import Parallel, delayed
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Core functions
def simulate_sparse_iv_data(n_obs, n_instruments, n_strong, error_covariance=None,
                            instrument_strength=0.1, true_beta=0.75, seed=None):
    """
    Simulate data for IV regression with sparse instruments.

    Parameters:
        n_obs: number of observations
        n_instruments: total number of instruments (d)
        n_strong: number of strong instruments (s); must satisfy 1 <= s <= d
        error_covariance: 2x2 covariance matrix of the errors (u, e);
            defaults to [[1, 0.5], [0.5, 1]]
        instrument_strength: strength parameter (mu)
        true_beta: structural coefficient (beta) on y1
        seed: optional seed. When given, all draws come from a private
            np.random.RandomState(seed), so the global NumPy RNG state is
            left untouched. When None, draws come from the global
            np.random state.

    Model:
        pi = (1,...,1, 0,...,0) with s ones and d-s zeros
        y1 = (50/s) * mu * Z @ pi + u
        y2 = beta * y1 + e

    Returns:
        dict with keys 'Z' (n_obs x n_instruments array), 'y1' and 'y2'
        (length-n_obs arrays) and 'pi' (length-n_instruments array).

    Raises:
        ValueError: if n_strong is not between 1 and n_instruments.
    """
    # n_strong = 0 would otherwise surface as a bare ZeroDivisionError in the
    # scale factor below; fail early with an explicit message instead.
    if not 1 <= n_strong <= n_instruments:
        raise ValueError(
            f"n_strong must be between 1 and n_instruments={n_instruments}, got {n_strong}"
        )

    # A private RandomState yields the same stream as np.random.seed(seed)
    # followed by np.random.* calls, without reseeding the global generator.
    rng = np.random if seed is None else np.random.RandomState(seed)

    if error_covariance is None:
        error_covariance = np.array([[1, 0.5], [0.5, 1]])

    # Generate errors (drawn before Z so the random stream order is stable)
    errors = rng.multivariate_normal([0, 0], error_covariance, n_obs)
    u, e = errors[:, 0], errors[:, 1]

    # Generate instruments
    Z = rng.normal(0, 1, (n_obs, n_instruments))

    # Sparse pi: only first s instruments are relevant
    pi = np.concatenate([np.ones(n_strong), np.zeros(n_instruments - n_strong)])
    scale = (50 / n_strong) * instrument_strength

    # Generate variables
    y1 = scale * (Z @ pi) + u
    y2 = true_beta * y1 + e

    return {'Z': Z, 'y1': y1, 'y2': y2, 'pi': pi}


def compute_mlaoiv(Z, y_endog, regressor, cv_folds=3):
    """Build the MLAOIV instrument as cross-validated predictions of y_endog.

    Parameters:
        Z: instrument matrix passed as the feature matrix
        y_endog: endogenous regressor used as the prediction target
        regressor: estimator handed to cross_val_predict
        cv_folds: value forwarded as cross_val_predict's ``cv`` argument

    Returns:
        The array returned by cross_val_predict.
    """
    out_of_fold_fit = cross_val_predict(
        estimator=regressor,
        X=Z,
        y=y_endog,
        cv=cv_folds,
    )
    return out_of_fold_fit


def estimate_iv2sls(y_outcome, y_endog, mlaoiv_instrument):
    """Fit IV-2SLS of y2 on y1 with an intercept, instrumenting y1 by MLAOIV.

    Parameters:
        y_outcome: outcome variable (column 'y2' in the formula)
        y_endog: endogenous regressor (column 'y1')
        mlaoiv_instrument: single constructed instrument (column 'mlaoiv')

    Returns:
        dict with 'params' and 'std_errors', each a NumPy array copied from
        the fitted model's ``params`` and ``std_errors``.
    """
    frame = pd.DataFrame(
        {'y2': y_outcome, 'y1': y_endog, 'mlaoiv': mlaoiv_instrument}
    )
    spec = IV2SLS.from_formula("y2 ~ 1 + [y1 ~ mlaoiv]", data=frame)
    fitted = spec.fit()
    coefficients = np.array(fitted.params)
    standard_errors = np.array(fitted.std_errors)
    return {'params': coefficients, 'std_errors': standard_errors}


def estimate_iv2sls_all(Z, y_outcome, y_endog):
    """Fit IV-2SLS of y2 on y1 using every column of Z as an instrument.

    The formula ends in ``- 1``, i.e. it is specified without an intercept.

    Parameters:
        Z: 2-D instrument matrix; its columns are named z0, z1, ...
        y_outcome: outcome variable (column 'y2')
        y_endog: endogenous regressor (column 'y1')

    Returns:
        dict with 'params' and 'std_errors', each a NumPy array copied from
        the fitted model's ``params`` and ``std_errors``.
    """
    # Name the instrument columns once and reuse the list for the formula.
    z_names = [f'z{j}' for j in range(Z.shape[1])]
    frame = pd.DataFrame(Z, columns=z_names).assign(y1=y_endog, y2=y_outcome)

    instruments_rhs = ' + '.join(z_names)
    formula = f"y2 ~ [y1 ~ {instruments_rhs}] - 1"
    fitted = IV2SLS.from_formula(formula, data=frame).fit()

    return {
        'params': np.array(fitted.params),
        'std_errors': np.array(fitted.std_errors),
    }

## Single Simulation: OLS vs Standard IV vs MLAOIV

In [3]:
# Design of the single-run experiment; later cells read these names.
true_beta = 0.75
n_obs, n_instruments = 1500, 500
n_strong = 25  # sparse design: 25 of the 500 instruments are strong

# Echo the configuration, one setting per line.
print(
    f"Sample size: {n_obs}",
    f"Total instruments: {n_instruments}",
    f"Strong instruments: {n_strong} (sparse!)",
    f"True beta: {true_beta}",
    sep="\n",
)

Sample size: 1500
Total instruments: 500
Strong instruments: 25 (sparse!)
True beta: 0.75


In [4]:
# Generate sparse IV data.
# true_beta is passed explicitly so the data-generating coefficient and the
# value used for the bias columns below cannot drift apart.
data = simulate_sparse_iv_data(
    n_obs=n_obs,
    n_instruments=n_instruments,
    n_strong=n_strong,
    instrument_strength=0.1,
    true_beta=true_beta,
    seed=42
)

# OLS (biased): regress y2 on a constant and y1; ols_beta[1] is the slope.
X_ols = np.column_stack([np.ones(n_obs), data['y1']])
ols_beta = np.linalg.lstsq(X_ols, data['y2'], rcond=None)[0]

# Standard IV with all instruments (no-intercept formula, so params[0] is
# the coefficient on y1).
iv_all = estimate_iv2sls_all(data['Z'], data['y2'], data['y1'])

# MLAOIV with different methods
alphas = [2000, 1000, 100, 50, 10, 1, 0.1]
regressors = {
    'Lasso': LassoCV(cv=4, alphas=alphas, max_iter=10000),
    'Ridge': RidgeCV(cv=4, alphas=alphas),
    'ElasticNet': ElasticNetCV(cv=4, l1_ratio=[0.1, 0.5, 0.9], alphas=alphas, max_iter=10000)
}

results = [
    {'Method': 'OLS', 'Beta Est': ols_beta[1], 'Beta Bias': ols_beta[1] - true_beta},
    {'Method': 'IV-All', 'Beta Est': iv_all['params'][0], 'Beta Bias': iv_all['params'][0] - true_beta}
]

for name, regr in regressors.items():
    mlaoiv = compute_mlaoiv(data['Z'], data['y1'], regressor=regr, cv_folds=2)
    est = estimate_iv2sls(data['y2'], data['y1'], mlaoiv)
    # estimate_iv2sls includes an intercept, so the y1 coefficient is read
    # from position 1.
    results.append({
        'Method': f'MLAOIV-{name}',
        'Beta Est': est['params'][1],
        'Beta Bias': est['params'][1] - true_beta
    })

df_results = pd.DataFrame(results)
# Title is built from the actual settings so it cannot go stale.
print(f"\n=== Sparse IV Comparison (n={n_obs}, d={n_instruments}, s={n_strong}) ===")
print(df_results.to_string(index=False))


=== Sparse IV Comparison (n=1500, d=500, s=25) ===
           Method  Beta Est  Beta Bias
              OLS  0.964650   0.214650
           IV-All  0.848452   0.098452
     MLAOIV-Lasso  0.744415  -0.005585
     MLAOIV-Ridge  0.687134  -0.062866
MLAOIV-ElasticNet  0.718628  -0.031372


## Monte Carlo Simulation

In [5]:
def run_single_sim(seed, regressor, n_obs=1500, n_instruments=500, n_strong=25, rho=0.5,
                   true_beta=0.75):
    """Run single simulation for sparse IV setting.

    Parameters:
        seed: seed passed to simulate_sparse_iv_data
        regressor: estimator used to build the MLAOIV instrument
        n_obs, n_instruments, n_strong: design passed to the simulator
        rho: off-diagonal entry of the 2x2 error covariance (unit variances)
        true_beta: structural coefficient used to generate y2

    Returns:
        The dict returned by estimate_iv2sls ('params', 'std_errors').
    """
    error_cov = np.array([[1, rho], [rho, 1]])
    data = simulate_sparse_iv_data(n_obs, n_instruments, n_strong, error_cov,
                                   instrument_strength=0.1, true_beta=true_beta, seed=seed)
    mlaoiv = compute_mlaoiv(data['Z'], data['y1'], regressor, cv_folds=2)
    return estimate_iv2sls(data['y2'], data['y1'], mlaoiv)


def mc_simulation(regressor_name, regressor, n_sims=10, *, true_beta=0.75, n_jobs=-1, **kwargs):
    """Run Monte Carlo simulation for a given regressor.

    Parameters:
        regressor_name: label stored under 'Method' in the result
        regressor: estimator forwarded to run_single_sim
        n_sims: number of replications; seeds 0..n_sims-1 are used
        true_beta: structural coefficient, used both to generate the data
            (forwarded to run_single_sim) and as the benchmark for Bias/RMSE
        n_jobs: forwarded to joblib.Parallel
        **kwargs: extra keyword arguments forwarded to run_single_sim
            (e.g. n_obs, n_instruments, n_strong, rho)

    Returns:
        dict with 'Method', 'Mean Beta', 'Bias', 'Std' and 'RMSE' of the
        estimated coefficient on y1 across replications.

    Raises:
        ValueError: if n_sims < 1.
    """
    # With zero replications the params array is empty and the column
    # indexing below would fail with an opaque IndexError.
    if n_sims < 1:
        raise ValueError(f"n_sims must be at least 1, got {n_sims}")

    results = Parallel(n_jobs=n_jobs)(
        delayed(run_single_sim)(seed, regressor, true_beta=true_beta, **kwargs)
        for seed in range(n_sims)
    )
    params = np.array([r['params'] for r in results])
    # Column 1 holds the y1 coefficient (column 0 is the intercept from
    # estimate_iv2sls's "y2 ~ 1 + [y1 ~ mlaoiv]" formula).
    beta_hat = params[:, 1]

    return {
        'Method': regressor_name,
        'Mean Beta': np.mean(beta_hat),
        'Bias': np.mean(beta_hat) - true_beta,
        'Std': np.std(beta_hat),
        'RMSE': np.sqrt(np.mean((beta_hat - true_beta)**2))
    }

In [6]:
# Run Monte Carlo for each method
n_sims = 10
print(f"Running {n_sims} simulations for each method...")

alphas = [2000, 1000, 100, 50, 10, 1, 0.1]
regressors = {
    'Lasso': LassoCV(cv=4, alphas=alphas, max_iter=10000),
    'Ridge': RidgeCV(cv=4, alphas=alphas),
    'ElasticNet': ElasticNetCV(cv=4, l1_ratio=[0.1, 0.5, 0.9], alphas=alphas, max_iter=10000)
}

mc_results = []
for name, regr in regressors.items():
    print(f"  {name}...")
    # Forward the notebook's design explicitly: without these keywords the
    # run silently uses run_single_sim's own defaults, and the title below
    # could describe a design that was not simulated.
    mc_results.append(
        mc_simulation(
            name, regr, n_sims=n_sims,
            n_obs=n_obs, n_instruments=n_instruments, n_strong=n_strong,
        )
    )

df_mc = pd.DataFrame(mc_results)
print(f"\n=== Monte Carlo Results: Sparse IV (n={n_obs}, d={n_instruments}, s={n_strong}) ===")
print(df_mc.to_string(index=False))

Running 10 simulations for each method...
  Lasso...
  Ridge...
  ElasticNet...

=== Monte Carlo Results: Sparse IV (n=1500, d=500, s=25) ===
    Method  Mean Beta      Bias      Std     RMSE
     Lasso   0.735308 -0.014692 0.024511 0.028577
     Ridge   0.741265 -0.008735 0.031104 0.032307
ElasticNet   0.733906 -0.016094 0.025898 0.030491


## Varying Sparsity Level

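Compare Lasso MLAOIV across different numbers of strong instruments. Because the first-stage coefficient on each strong instrument is $(50/s)\,\mu$, increasing $s$ makes each individual instrument weaker.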

In [7]:
# Compare across different sparsity levels
s_values = [10, 25, 50, 100]
lasso = LassoCV(cv=4, alphas=[2000, 1000, 100, 50, 10, 1, 0.1], max_iter=10000)

# NOTE(review): run_single_sim uses instrument_strength=0.1, so the coefficient
# on each strong instrument is (50/s)*0.1 = 5/s: 0.5, 0.2, 0.1, 0.05 for the
# s values above. The smallest alpha in the grid is 0.1, which is at or above
# that coefficient once s >= 50. The sharp deterioration at s=100 may therefore
# reflect the alpha grid rather than Lasso itself -- TODO confirm by adding
# smaller alphas.

sparsity_results = []
for s in s_values:
    print(f"Running simulations for s={s}...")
    # Reuse the replication count and design from the earlier cells rather
    # than a second hard-coded 10 and implicit function defaults, so this
    # sweep stays comparable with the Monte Carlo table above.
    result = mc_simulation(
        'Lasso', lasso, n_sims=n_sims,
        n_obs=n_obs, n_instruments=n_instruments, n_strong=s,
    )
    sparsity_results.append({**result, 's': s})

df_sparsity = pd.DataFrame(sparsity_results)
print("\n=== Lasso MLAOIV by Sparsity Level ===")
print(df_sparsity[['s', 'Mean Beta', 'Bias', 'Std', 'RMSE']].to_string(index=False))

Running simulations for s=10...
Running simulations for s=25...
Running simulations for s=50...
Running simulations for s=100...

=== Lasso MLAOIV by Sparsity Level ===
  s  Mean Beta      Bias      Std     RMSE
 10   0.749822 -0.000178 0.012991 0.012992
 25   0.735308 -0.014692 0.024511 0.028577
 50   0.735066 -0.014934 0.057528 0.059435
100   0.576868 -0.173132 0.453167 0.485113
