# MLAOIV: Many Weak Instruments

This notebook compares different ML methods for MLAOIV construction in the many-weak-instruments setting.

We compare:
- LassoCV
- RidgeCV  
- ElasticNetCV
- KernelRidge
- MLP (Neural Network)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.neural_network import MLPRegressor
from linearmodels.iv import IV2SLS
from joblib import Parallel, delayed
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Core functions
def simulate_iv_data(n_obs, n_instruments, error_covariance=None,
                      instrument_strength=0.5, seed=None):
    """Simulate data for IV regression with many weak instruments.

    The data-generating process is::

        y1 = 0.3 + scale * (Z @ 1) + u        (first stage)
        y2 = -0.9 + 0.75 * y1 + e             (outcome equation)

    where ``scale = (50 / n_instruments) * instrument_strength``, ``Z`` has
    i.i.d. standard-normal entries, and ``(u, e)`` are jointly normal with
    mean zero and covariance ``error_covariance``.

    Parameters
    ----------
    n_obs : int
        Number of observations (rows of ``Z``).
    n_instruments : int
        Number of instruments (columns of ``Z``).
    error_covariance : array-like of shape (2, 2), optional
        Covariance of ``(u, e)``. Defaults to unit variances with
        covariance 0.5.
    instrument_strength : float, default 0.5
        Multiplier on the common first-stage coefficient.
    seed : int, optional
        Seed for reproducible draws. When given, a private generator is
        used, so the process-wide NumPy random state is left untouched.
        When ``None``, draws come from the global ``np.random`` stream.

    Returns
    -------
    dict
        ``{'Z': (n_obs, n_instruments) array, 'y1': (n_obs,) array,
        'y2': (n_obs,) array}``.
    """
    if error_covariance is None:
        error_covariance = np.array([[1, 0.5], [0.5, 1]])

    # A private legacy RandomState produces exactly the same draws as
    # ``np.random.seed(seed)`` followed by module-level calls, but does not
    # reseed the global generator as a side effect of simulating data.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Draw order (errors first, then instruments) is kept so that a given
    # seed maps to the same dataset as before.
    errors = rng.multivariate_normal([0, 0], error_covariance, n_obs)
    u, e = errors[:, 0], errors[:, 1]

    Z = rng.normal(0, 1, (n_obs, n_instruments))
    pi = np.ones(n_instruments)
    # Per-instrument coefficient shrinks as 1 / n_instruments.
    scale = (50 / n_instruments) * instrument_strength

    y1 = 0.3 + scale * (Z @ pi) + u
    y2 = -0.9 + 0.75 * y1 + e

    return {'Z': Z, 'y1': y1, 'y2': y2}


def compute_mlaoiv(Z, y_endog, regressor, cv_folds=3):
    """Build the MLAOIV as cross-validated predictions of ``y_endog`` from ``Z``.

    Parameters
    ----------
    Z : array-like
        Instrument matrix passed to ``cross_val_predict`` as ``X``.
    y_endog : array-like
        Endogenous regressor passed as the prediction target.
    regressor : estimator
        Estimator handed to ``cross_val_predict``.
    cv_folds : int, default 3
        Forwarded as the ``cv`` argument.

    Returns
    -------
    The value returned by ``cross_val_predict`` (one prediction per row).
    """
    out_of_fold_fit = cross_val_predict(
        estimator=regressor,
        X=Z,
        y=y_endog,
        cv=cv_folds,
    )
    return out_of_fold_fit


def estimate_iv2sls(y_outcome, y_endog, mlaoiv_instrument):
    """Fit 2SLS of the outcome on the endogenous regressor, instrumented by the MLAOIV.

    The model is specified with the formula ``y2 ~ 1 + [y1 ~ mlaoiv]``:
    ``y2`` is the outcome, ``y1`` the endogenous regressor, and ``mlaoiv``
    the single excluded instrument.

    Parameters
    ----------
    y_outcome : array-like
        Outcome variable (stored as column ``y2``).
    y_endog : array-like
        Endogenous regressor (stored as column ``y1``).
    mlaoiv_instrument : array-like
        Constructed instrument (stored as column ``mlaoiv``).

    Returns
    -------
    dict
        ``'params'`` and ``'std_errors'`` as NumPy arrays taken from the
        fitted model. Callers in this notebook read index 1 as the
        coefficient on ``y1`` (presumably the intercept comes first —
        verify against linearmodels' parameter ordering).
    """
    columns = {'y2': y_outcome, 'y1': y_endog, 'mlaoiv': mlaoiv_instrument}
    frame = pd.DataFrame(columns)

    specification = IV2SLS.from_formula("y2 ~ 1 + [y1 ~ mlaoiv]", data=frame)
    fitted = specification.fit()

    coefficients = np.array(fitted.params)
    standard_errors = np.array(fitted.std_errors)
    return {'params': coefficients, 'std_errors': standard_errors}


def get_regressors():
    """Build the set of first-stage learners compared in this notebook.

    Returns
    -------
    dict
        Maps a display name to a freshly constructed, unfitted estimator,
        in the order: Lasso, Ridge, ElasticNet, KernelRidge, MLP. The three
        penalised linear models share one grid of candidate penalties and
        use 4-fold internal cross-validation to choose among them.
    """
    penalty_grid = [2000, 1000, 100, 50, 10, 1, 0.1]
    inner_folds = 4
    solver_iters = 10000

    # Insertion order is the order in which methods are run and reported.
    regressors = {}
    regressors['Lasso'] = LassoCV(
        cv=inner_folds, alphas=penalty_grid, max_iter=solver_iters
    )
    regressors['Ridge'] = RidgeCV(cv=inner_folds, alphas=penalty_grid)
    regressors['ElasticNet'] = ElasticNetCV(
        cv=inner_folds,
        l1_ratio=[0.1, 0.5, 0.9],
        alphas=penalty_grid,
        max_iter=solver_iters,
    )
    regressors['KernelRidge'] = KernelRidge(alpha=1.0)
    regressors['MLP'] = MLPRegressor(
        hidden_layer_sizes=(64,),
        alpha=1e-4,
        max_iter=1000,
        random_state=42,
    )
    return regressors

## Single Comparison

In [None]:
# Generate one dataset and compare every first-stage learner on it.
n_obs = 1000
n_instruments = 500
# Coefficient on y1 in simulate_iv_data's outcome equation; used for bias.
true_beta = 0.75

data = simulate_iv_data(n_obs, n_instruments, instrument_strength=0.5, seed=42)

# Compare all methods
results = []
for name, regr in get_regressors().items():
    mlaoiv = compute_mlaoiv(data['Z'], data['y1'], regressor=regr, cv_folds=3)
    est = estimate_iv2sls(data['y2'], data['y1'], mlaoiv)
    # Index 1 is read as the y1 coefficient (index 0 being the intercept).
    beta_hat = est['params'][1]
    results.append({
        'Method': name,
        'Beta Est': beta_hat,
        'Beta SE': est['std_errors'][1],
        'Bias': beta_hat - true_beta,
    })

df_results = pd.DataFrame(results)
# Report the dimensions actually used so the header cannot drift from the
# settings above when they are edited.
print(f"=== Method Comparison (n={n_obs}, d={n_instruments}) ===")
print(df_results.to_string(index=False))

## Monte Carlo Simulation

Compare methods across multiple simulation runs.

In [None]:
def run_single_sim(seed, regressor, n_obs=1000, n_instruments=500, mu=0.5, rho=0.5):
    """Simulate one dataset, build the MLAOIV, and return the 2SLS estimate.

    Parameters
    ----------
    seed : int
        Seed forwarded to ``simulate_iv_data``.
    regressor : estimator
        First-stage learner used to construct the instrument.
    n_obs, n_instruments : int
        Sample size and number of instruments for the simulated data.
    mu : float
        Instrument strength passed to ``simulate_iv_data``.
    rho : float
        Off-diagonal entry of the 2x2 error covariance (unit variances).

    Returns
    -------
    dict
        Whatever ``estimate_iv2sls`` returns for this draw.
    """
    draw = simulate_iv_data(
        n_obs,
        n_instruments,
        error_covariance=np.array([[1, rho], [rho, 1]]),
        instrument_strength=mu,
        seed=seed,
    )
    instruments, endog, outcome = draw['Z'], draw['y1'], draw['y2']

    # The instrument is always built with 3-fold cross-validated predictions.
    fitted_instrument = compute_mlaoiv(instruments, endog, regressor, cv_folds=3)
    return estimate_iv2sls(outcome, endog, fitted_instrument)


def mc_simulation(regressor_name, regressor, n_sims=20, *, n_jobs=-1, **kwargs):
    """Run a Monte Carlo study of the MLAOIV 2SLS estimator for one regressor.

    Replications use seeds ``0, 1, ..., n_sims - 1`` and are dispatched
    through joblib.

    Parameters
    ----------
    regressor_name : str
        Label stored under ``'Method'`` in the returned summary.
    regressor : estimator
        First-stage learner passed to every replication.
    n_sims : int, default 20
        Number of replications; must be at least 1.
    n_jobs : int, default -1
        Forwarded to ``joblib.Parallel`` (``-1`` uses all cores). Set to 1
        to run serially, e.g. when debugging.
    **kwargs
        Extra keyword arguments forwarded to ``run_single_sim``
        (for example ``mu``, ``rho``, ``n_obs``, ``n_instruments``).

    Returns
    -------
    dict
        ``'Method'``, ``'Mean Beta'``, ``'Bias'``, ``'Std'`` (``np.std``
        with its default ``ddof=0``) and ``'RMSE'`` of the estimated
        coefficient on ``y1``.

    Raises
    ------
    ValueError
        If ``n_sims`` is less than 1.
    """
    # Fail early with a clear message; otherwise an empty result list would
    # surface later as an opaque IndexError when slicing the coefficients.
    if n_sims < 1:
        raise ValueError(f"n_sims must be at least 1, got {n_sims}")

    results = Parallel(n_jobs=n_jobs)(
        delayed(run_single_sim)(seed, regressor, **kwargs)
        for seed in range(n_sims)
    )
    params = np.array([r['params'] for r in results])
    # Column 1 is read as the coefficient on y1 (column 0 the intercept).
    beta_hats = params[:, 1]
    # Must match the coefficient on y1 used by simulate_iv_data.
    true_beta = 0.75
    mean_beta = np.mean(beta_hats)

    return {
        'Method': regressor_name,
        'Mean Beta': mean_beta,
        'Bias': mean_beta - true_beta,
        'Std': np.std(beta_hats),
        'RMSE': np.sqrt(np.mean((beta_hats - true_beta)**2))
    }

In [None]:
# Run Monte Carlo for each method
n_sims = 20
print(f"Running {n_sims} simulations for each method...")

mc_results = []
for name, regr in get_regressors().items():
    print(f"  {name}...")
    mc_results.append(mc_simulation(name, regr, n_sims=n_sims))

df_mc = pd.DataFrame(mc_results)
# n and d are run_single_sim's defaults (not overridden in this cell); the
# replication count comes from n_sims so the header stays in sync with it.
print(f"\n=== Monte Carlo Results (n=1000, d=500, {n_sims} sims) ===")
print(df_mc.to_string(index=False))

## Varying Instrument Strength

In [None]:
# Ridge-only sweep: repeat the Monte Carlo study at several values of the
# instrument-strength parameter μ and tabulate the summaries side by side.
mu_values = [0.3, 0.5, 0.7]
ridge_alphas = [2000, 1000, 100, 50, 10, 1, 0.1]
ridge = RidgeCV(cv=4, alphas=ridge_alphas)

strength_results = []
for mu in mu_values:
    print(f"Running simulations for μ={mu}...")
    # Tag each summary with the μ it was produced at (appended as last key).
    result = {**mc_simulation('Ridge', ridge, n_sims=20, mu=mu), 'μ': mu}
    strength_results.append(result)

report_columns = ['μ', 'Mean Beta', 'Bias', 'Std', 'RMSE']
df_strength = pd.DataFrame(strength_results)
print("\n=== Ridge MLAOIV by Instrument Strength ===")
print(df_strength[report_columns].to_string(index=False))