In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Set random seed for reproducibility
np.random.seed(42)

# Simulate data
n = 1000  # Number of observations
x = np.random.normal(0, 1, n)  # Predictor variable

# Simulate the hurdle process
# First, a binary outcome (logistic regression)
p_zero = 1 / (1 + np.exp(-0.5 * x))  # Probability of zero counts
is_zero = np.random.binomial(1, p_zero, n)

# Second, a count outcome (Poisson regression)
mean_count = np.exp(1 + 0.5 * x)  # Mean of the Poisson distribution
counts = np.random.poisson(mean_count)
counts[is_zero == 1] = 0  # Set counts to zero where is_zero == 1

# Create a DataFrame
data = pd.DataFrame({'x': x, 'counts': counts})

# Convert boolean response to integers (0 or 1)
data['count_gt_zero'] = (data['counts'] > 0).astype(int)

# Logistic regression for the hurdle (zero vs non-zero)
logit_model = smf.logit('count_gt_zero ~ x', data=data).fit()
print(logit_model.summary())

# Truncated Poisson regression for the count model (counts > 0)
# We subset the data to only those rows with counts > 0
truncated_data = data[data['counts'] > 0]
poisson_model = smf.poisson('counts ~ x', data=truncated_data).fit()
print(poisson_model.summary())

Optimization terminated successfully.
         Current function value: 0.682555
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:          count_gt_zero   No. Observations:                 1000
Model:                          Logit   Df Residuals:                      998
Method:                           MLE   Df Model:                            1
Date:                Tue, 20 Aug 2024   Pseudo R-squ.:                 0.01403
Time:                        09:36:30   Log-Likelihood:                -682.55
converged:                       True   LL-Null:                       -692.26
Covariance Type:            nonrobust   LLR p-value:                 1.049e-05
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.0803      0.064     -1.256      0.209      -0.206       0.045
x             -0.2894      0.