In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats import norm
from scipy.optimize import minimize
from scipy.special import xlogy

# Seed NumPy's global random generator; generate_data draws from np.random,
# so this fixes the sequence of simulated samples for a fresh kernel run.
np.random.seed(42)


In [2]:
def generate_data(n_obs=1000, n_periods=2, treatment_effect=2, heterogeneous=False, selection=False):
    """Simulate one two-period difference-in-differences sample.

    The outcome is built as
    ``y = 1 + 2*treatment + 3*time + effect*treatment*time + epsilon``
    with standard-normal ``epsilon``; when ``selection`` is set, ``0.5 * z``
    is added as well.

    Parameters
    ----------
    n_obs : int
        Number of rows. The first ``n_obs // 2`` rows belong to period 0 and
        the remaining rows to period 1 (odd values are supported).
    n_periods : int
        Accepted for interface compatibility but not used: the generated
        data always contains exactly two periods.
    treatment_effect : float
        Effect of treatment in period 1. With ``heterogeneous=True`` this is
        the baseline around which the covariate-driven effect varies.
    heterogeneous : bool
        If True the effect is ``treatment_effect + 0.5*X1 + 0.5*X2``,
        otherwise it is the constant ``treatment_effect``.
    selection : bool
        If True, treatment is drawn with probability ``norm.cdf(z)`` for a
        standard-normal ``z`` that also enters the outcome; ``z`` is then
        included as a column.

    Returns
    -------
    pandas.DataFrame
        Columns ``y``, ``time``, ``treatment``, ``X1``, ``X2``, optionally
        ``z``, followed by the interaction columns ``treatment_time``,
        ``treatment_time_X1`` and ``treatment_time_X2``.
    """
    # Period indicator. Derived from a comparison so it always has exactly
    # n_obs entries; np.repeat([0, 1], n_obs // 2) loses a row for odd n_obs
    # and then fails to broadcast against the other n_obs-long arrays.
    time = (np.arange(n_obs) >= n_obs // 2).astype(int)

    # Treatment assignment (the order of random draws is kept as in the
    # original implementation so seeded runs stay comparable).
    if selection:
        # Selection on z: higher z means higher treatment probability.
        z = np.random.normal(0, 1, n_obs)
        prob_treatment = norm.cdf(z)
        treatment = np.random.binomial(1, prob_treatment)
    else:
        treatment = np.random.binomial(1, 0.5, n_obs)

    # Two independent standard-normal covariates.
    X = np.random.normal(0, 1, (n_obs, 2))

    # Outcome: intercept, group effect, period effect.
    epsilon = np.random.normal(0, 1, n_obs)
    y = 1.0 + 2 * treatment + 3 * time

    if heterogeneous:
        # Covariate-dependent effect centred on treatment_effect (previously
        # hard-coded to 2, which silently ignored the argument).
        y += treatment * time * (treatment_effect + 0.5 * X[:, 0] + 0.5 * X[:, 1])
    else:
        # Constant effect for every treated unit in period 1.
        y += treatment_effect * treatment * time

    y += epsilon

    if selection:
        # z drives both assignment and outcome, which creates selection bias.
        y += 0.5 * z

    df = pd.DataFrame({
        'y': y,
        'time': time,
        'treatment': treatment,
        'X1': X[:, 0],
        'X2': X[:, 1]
    })

    if selection:
        df['z'] = z

    # Interaction terms used by the estimators.
    df['treatment_time'] = df['treatment'] * df['time']
    df['treatment_time_X1'] = df['treatment_time'] * df['X1']
    df['treatment_time_X2'] = df['treatment_time'] * df['X2']

    return df

In [3]:

def traditional_did(df):
    """Return the OLS coefficient on ``treatment_time`` from the basic DiD regression.

    ``y`` is regressed on a constant, ``treatment``, ``time`` and their
    interaction ``treatment_time``; the interaction coefficient is returned.
    """
    regressors = ['treatment', 'time', 'treatment_time']
    design = sm.add_constant(df[regressors])
    fitted = sm.OLS(df['y'], design).fit()
    return fitted.params['treatment_time']


In [4]:

def heterogeneous_did(df):
    """DiD regression with covariates and covariate-by-effect interactions.

    Regresses ``y`` on a constant, ``treatment``, ``time``,
    ``treatment_time``, ``X1``, ``X2`` and the interactions of
    ``treatment_time`` with ``X1`` and ``X2``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``y``, ``treatment``, ``time``, ``treatment_time``,
        ``X1`` and ``X2``. The frame is not modified.

    Returns
    -------
    float
        The OLS coefficient on ``treatment_time``.
    """
    # Build the interaction columns on a copy: assigning them on ``df``
    # directly would overwrite columns of the caller's DataFrame.
    data = df.assign(
        treatment_time_X1=df['treatment_time'] * df['X1'],
        treatment_time_X2=df['treatment_time'] * df['X2'],
    )
    regressors = ['treatment', 'time', 'treatment_time', 'X1', 'X2',
                  'treatment_time_X1', 'treatment_time_X2']
    X = sm.add_constant(data[regressors])
    model = sm.OLS(data['y'], X).fit()
    return model.params['treatment_time']


In [5]:

def selection_correction_did(df):
    """Two-step DiD estimate with an inverse-Mills-ratio control term.

    Step 1 fits a probit of ``treatment`` on a constant and ``z``.
    Step 2 regresses ``y`` on the heterogeneous DiD regressors plus the
    inverse Mills ratio implied by the first stage.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``y``, ``treatment``, ``time``, ``treatment_time``,
        ``X1``, ``X2`` and ``z``. The frame is not modified.

    Returns
    -------
    float
        The second-stage OLS coefficient on ``treatment_time``.
    """
    # First stage: probit for treatment selection. disp=0 silences the
    # convergence message that would otherwise print on every call.
    first_stage_exog = sm.add_constant(df[['z']])
    probit_model = sm.Probit(df['treatment'], first_stage_exog).fit(disp=0)

    # The inverse Mills ratio is pdf(xb) / cdf(xb) evaluated at the LINEAR
    # index xb. predict() returns the probability cdf(xb), so feeding it to
    # norm.pdf / norm.cdf again computes the wrong quantity.
    linear_index = (np.asarray(first_stage_exog, dtype=float)
                    @ np.asarray(probit_model.params, dtype=float))
    # Ratio formed in log space so a very negative index does not cause 0/0.
    imr = np.exp(norm.logpdf(linear_index) - norm.logcdf(linear_index))
    # NOTE(review): the same pdf/cdf term is used for treated and untreated
    # rows, as in the original code; confirm this is the intended correction.

    # Second stage on a copy so the caller's DataFrame is left untouched.
    data = df.assign(
        imr=imr,
        treatment_time_X1=df['treatment_time'] * df['X1'],
        treatment_time_X2=df['treatment_time'] * df['X2'],
    )
    regressors = ['treatment', 'time', 'treatment_time', 'X1', 'X2',
                  'treatment_time_X1', 'treatment_time_X2', 'imr']
    X = sm.add_constant(data[regressors])
    model = sm.OLS(data['y'], X).fit()
    return model.params['treatment_time']


In [6]:
def gme_did(df, n_support=5):
    """Entropy-based estimate of the DiD effect (coefficient on ``treatment_time``).

    Every regression coefficient and every error term is written as the
    expectation of a discrete distribution over fixed support points
    (``[-10, 10]`` for coefficients, ``[-5, 5]`` for errors). The
    probabilities are chosen to minimise ``sum(p * log(p))`` subject to the
    data constraint ``y = X @ beta + eps`` and each distribution summing to
    one. (Presumably "gme" stands for generalised maximum entropy.)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``y``, ``treatment``, ``time``, ``treatment_time``,
        ``X1``, ``X2``, ``treatment_time_X1`` and ``treatment_time_X2``.
    n_support : int
        Number of support points per coefficient and per error term.

    Returns
    -------
    float
        Estimated coefficient on ``treatment_time``.

    Warns
    -----
    RuntimeWarning
        If the SLSQP optimiser reports that it did not converge.
    """
    import warnings

    regressors = ['treatment', 'time', 'treatment_time', 'X1', 'X2',
                  'treatment_time_X1', 'treatment_time_X2']
    design = sm.add_constant(df[regressors])
    # Locate the coefficient by name instead of a hard-coded position, which
    # would be off by one whenever add_constant does not prepend a constant.
    effect_idx = list(design.columns).index('treatment_time')

    y = df['y'].to_numpy(dtype=float)
    X = design.to_numpy(dtype=float)
    n_obs, n_params = X.shape
    n_beta = n_params * n_support

    # Support points for coefficients and error terms.
    z_beta = np.linspace(-10, 10, n_support)
    z_eps = np.linspace(-5, 5, n_support)

    # Lower bound for the probabilities: keeps log(p) finite in the gradient.
    p_floor = 1e-10

    def objective(p):
        # Negative entropy of all coefficient and error distributions.
        return np.sum(xlogy(p, p))

    def objective_grad(p):
        return np.log(np.maximum(p, p_floor)) + 1.0

    def constraint_mean(p):
        beta_hat = p[:n_beta].reshape(n_params, n_support) @ z_beta
        eps_hat = p[n_beta:].reshape(n_obs, n_support) @ z_eps
        return y - X @ beta_hat - eps_hat

    def constraint_sum_to_one(p):
        # p consists of (n_params + n_obs) consecutive blocks of n_support.
        return p.reshape(-1, n_support).sum(axis=1) - 1

    # Both constraints are linear in p, so their Jacobians are constant
    # matrices; supplying them avoids finite-differencing every variable.
    mean_jac = -np.hstack([
        np.kron(X, z_beta[None, :]),
        np.kron(np.eye(n_obs), z_eps[None, :]),
    ])
    sum_jac = np.kron(np.eye(n_params + n_obs), np.ones((1, n_support)))

    # Uniform starting distributions.
    p0 = np.full(n_beta + n_obs * n_support, 1 / n_support)

    result = minimize(
        objective, p0,
        jac=objective_grad,
        method='SLSQP',
        # Probabilities must stay inside (0, 1]; without bounds the solver
        # can step to negative values where xlogy returns NaN.
        bounds=[(p_floor, 1.0)] * p0.size,
        constraints=[
            {'type': 'eq', 'fun': constraint_mean, 'jac': lambda p: mean_jac},
            {'type': 'eq', 'fun': constraint_sum_to_one, 'jac': lambda p: sum_jac}
        ],
        options={'ftol': 1e-8, 'maxiter': 1000}
    )
    if not result.success:
        warnings.warn(f"GME optimisation did not converge: {result.message}", RuntimeWarning)

    # Coefficients are the expectations of the optimised distributions.
    p_beta_opt = result.x[:n_beta].reshape(n_params, n_support)
    beta_hat = p_beta_opt @ z_beta

    return beta_hat[effect_idx]

In [7]:
def gme_did_selection(df, n_support=5):
    """Entropy-based DiD estimate with an inverse-Mills-ratio control term.

    A first-stage probit of ``treatment`` on a constant and ``z`` supplies an
    inverse Mills ratio, which is added to the regressors. Coefficients and
    errors are then expressed as expectations over fixed support points
    (``[-10, 10]`` and ``[-5, 5]``) whose probabilities minimise
    ``sum(p * log(p))`` subject to ``y = X @ beta + eps`` and each
    distribution summing to one.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``y``, ``treatment``, ``time``, ``treatment_time``,
        ``X1``, ``X2``, ``treatment_time_X1``, ``treatment_time_X2`` and
        ``z``. The frame is not modified.
    n_support : int
        Number of support points per coefficient and per error term.

    Returns
    -------
    float
        Estimated coefficient on ``treatment_time``.

    Warns
    -----
    RuntimeWarning
        If the SLSQP optimiser reports that it did not converge.
    """
    import warnings

    # First stage: probit for treatment selection (disp=0 silences the
    # convergence message printed on every fit).
    first_stage_exog = sm.add_constant(df[['z']])
    probit_model = sm.Probit(df['treatment'], first_stage_exog).fit(disp=0)

    # Inverse Mills ratio pdf(xb) / cdf(xb) at the LINEAR index xb.
    # predict() returns the probability cdf(xb), which is the wrong argument.
    linear_index = (np.asarray(first_stage_exog, dtype=float)
                    @ np.asarray(probit_model.params, dtype=float))
    # Ratio formed in log space so a very negative index does not cause 0/0.
    imr = np.exp(norm.logpdf(linear_index) - norm.logcdf(linear_index))

    # Attach the control term on a copy; the caller's frame stays untouched.
    data = df.assign(imr=imr)
    regressors = ['treatment', 'time', 'treatment_time', 'X1', 'X2',
                  'treatment_time_X1', 'treatment_time_X2', 'imr']
    design = sm.add_constant(data[regressors])
    # Locate the coefficient by name instead of a hard-coded position, which
    # would be off by one whenever add_constant does not prepend a constant.
    effect_idx = list(design.columns).index('treatment_time')

    y = data['y'].to_numpy(dtype=float)
    X = design.to_numpy(dtype=float)
    n_obs, n_params = X.shape
    n_beta = n_params * n_support

    # Support points for coefficients and error terms.
    z_beta = np.linspace(-10, 10, n_support)
    z_eps = np.linspace(-5, 5, n_support)

    # Lower bound for the probabilities: keeps log(p) finite in the gradient.
    p_floor = 1e-10

    def objective(p):
        # Negative entropy of all coefficient and error distributions.
        return np.sum(xlogy(p, p))

    def objective_grad(p):
        return np.log(np.maximum(p, p_floor)) + 1.0

    def constraint_mean(p):
        beta_hat = p[:n_beta].reshape(n_params, n_support) @ z_beta
        eps_hat = p[n_beta:].reshape(n_obs, n_support) @ z_eps
        return y - X @ beta_hat - eps_hat

    def constraint_sum_to_one(p):
        # p consists of (n_params + n_obs) consecutive blocks of n_support.
        return p.reshape(-1, n_support).sum(axis=1) - 1

    # Both constraints are linear in p, so their Jacobians are constant
    # matrices; supplying them avoids finite-differencing every variable.
    mean_jac = -np.hstack([
        np.kron(X, z_beta[None, :]),
        np.kron(np.eye(n_obs), z_eps[None, :]),
    ])
    sum_jac = np.kron(np.eye(n_params + n_obs), np.ones((1, n_support)))

    # Uniform starting distributions.
    p0 = np.full(n_beta + n_obs * n_support, 1 / n_support)

    result = minimize(
        objective, p0,
        jac=objective_grad,
        method='SLSQP',
        # Probabilities must stay inside (0, 1]; without bounds the solver
        # can step to negative values where xlogy returns NaN.
        bounds=[(p_floor, 1.0)] * p0.size,
        constraints=[
            {'type': 'eq', 'fun': constraint_mean, 'jac': lambda p: mean_jac},
            {'type': 'eq', 'fun': constraint_sum_to_one, 'jac': lambda p: sum_jac}
        ],
        options={'ftol': 1e-8, 'maxiter': 1000}
    )
    if not result.success:
        warnings.warn(f"GME optimisation did not converge: {result.message}", RuntimeWarning)

    # Coefficients are the expectations of the optimised distributions.
    p_beta_opt = result.x[:n_beta].reshape(n_params, n_support)
    beta_hat = p_beta_opt @ z_beta

    return beta_hat[effect_idx]

In [8]:
def run_simulation(n_simulations=1000, n_obs=1000, treatment_effect=2):
    """Repeat the data generation and estimation steps ``n_simulations`` times.

    Each replication draws three fresh samples (base, heterogeneous, and
    heterogeneous with selection) and applies the matching estimators.

    Returns a dict mapping the estimator labels ``traditional``,
    ``heterogeneous``, ``selection``, ``gme`` and ``gme_selection`` to the
    list of estimates collected across replications.
    """
    labels = ('traditional', 'heterogeneous', 'selection', 'gme', 'gme_selection')
    results = {label: [] for label in labels}
    shared_args = dict(n_obs=n_obs, treatment_effect=treatment_effect)

    for _ in range(n_simulations):
        # Homogeneous effect, no selection
        base_sample = generate_data(**shared_args)
        results['traditional'].append(traditional_did(base_sample))
        results['gme'].append(gme_did(base_sample))

        # Covariate-dependent effect
        hetero_sample = generate_data(heterogeneous=True, **shared_args)
        results['heterogeneous'].append(heterogeneous_did(hetero_sample))

        # Covariate-dependent effect plus selection into treatment
        selection_sample = generate_data(heterogeneous=True, selection=True, **shared_args)
        results['selection'].append(selection_correction_did(selection_sample))
        results['gme_selection'].append(gme_did_selection(selection_sample))

    return results

In [None]:

# Execute the Monte Carlo study with run_simulation's default arguments
sim_results = run_simulation()

# Report mean, spread and the 2.5%/97.5% percentiles for each estimator
for method in sim_results:
    estimates = sim_results[method]
    ci_low, ci_high = np.percentile(estimates, [2.5, 97.5])
    report_lines = [
        f"{method.capitalize()} DiD:",
        f"  Mean estimate: {np.mean(estimates):.4f}",
        f"  Standard deviation: {np.std(estimates):.4f}",
        f"  95% CI: ({ci_low:.4f}, {ci_high:.4f})",
        "",
    ]
    print("\n".join(report_lines))

In [None]:

# Re-print the summary table from the sim_results already in memory
for method, estimates in sim_results.items():
    mean_estimate = np.mean(estimates)
    spread = np.std(estimates)
    lower = np.percentile(estimates, 2.5)
    upper = np.percentile(estimates, 97.5)
    print(
        f"{method.capitalize()} DiD:",
        f"  Mean estimate: {mean_estimate:.4f}",
        f"  Standard deviation: {spread:.4f}",
        f"  95% CI: ({lower:.4f}, {upper:.4f})",
        "",
        sep="\n",
    )

In [None]:
def run_simulation(n_simulations=1000, n_obs=1000, treatment_effect=2):
    """Monte Carlo loop without the ``gme_selection`` estimator.

    This definition replaces the earlier ``run_simulation`` in the notebook
    once the cell is executed. Each replication draws a base, a heterogeneous
    and a heterogeneous-with-selection sample and stores the estimates under
    the keys ``traditional``, ``heterogeneous``, ``selection`` and ``gme``.
    """
    results = dict(traditional=[], heterogeneous=[], selection=[], gme=[])

    replication = 0
    while replication < n_simulations:
        # Homogeneous effect, no selection
        sample_base = generate_data(n_obs=n_obs, treatment_effect=treatment_effect)
        results['traditional'].append(traditional_did(sample_base))
        results['gme'].append(gme_did(sample_base))

        # Covariate-dependent effect
        sample_hetero = generate_data(
            n_obs=n_obs, treatment_effect=treatment_effect, heterogeneous=True
        )
        results['heterogeneous'].append(heterogeneous_did(sample_hetero))

        # Covariate-dependent effect plus selection into treatment
        sample_selection = generate_data(
            n_obs=n_obs, treatment_effect=treatment_effect,
            heterogeneous=True, selection=True
        )
        results['selection'].append(selection_correction_did(sample_selection))

        replication += 1

    return results

# Shorter run: 100 replications instead of the default so the cell finishes sooner
sim_results = run_simulation(n_simulations=100)

# Summarise each estimator's distribution of estimates
for method, estimates in sim_results.items():
    header = f"{method.capitalize()} DiD:"
    mean_line = f"  Mean estimate: {np.mean(estimates):.4f}"
    std_line = f"  Standard deviation: {np.std(estimates):.4f}"
    pct_low = np.percentile(estimates, 2.5)
    pct_high = np.percentile(estimates, 97.5)
    ci_line = f"  95% CI: ({pct_low:.4f}, {pct_high:.4f})"
    for line in (header, mean_line, std_line, ci_line):
        print(line)
    print()