In [5]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.utils import resample

# Step 1: Load the dataset
# NOTE(review): this is an absolute, machine-specific path, so the notebook only runs
# where this exact directory exists; consider a path relative to a configurable
# project/data directory.
data = pd.read_csv('/Users/bishmaybarik/Library/CloudStorage/OneDrive-ShivNadarInstitutionofEminence/estimate_inc_risk/01_data/inrisk_data.csv')

# Step 2: Combine income variables to calculate total income
# A missing value in either component propagates to a missing total for that row.
# NOTE(review): if 'inc_of_mem_frm_all_srcs' already includes wage income, adding
# 'inc_of_mem_frm_wages' double counts wages -- confirm against the data dictionary.
data['total_income'] = data['inc_of_mem_frm_all_srcs'] + data['inc_of_mem_frm_wages']

# Step 3: Take the log of total income
# The log is only defined for strictly positive values. Mask every non-positive total
# (zeros AND negatives) to NaN before calling np.log, instead of replacing only exact
# zeros and letting negative totals reach np.log (RuntimeWarning + incidental NaN).
# Rows that are NaN here are later excluded wherever missing values are dropped.
data['log_total_income'] = np.log(data['total_income'].where(data['total_income'] > 0))

In [4]:
# Step 4: Regression to isolate residuals
# Design matrix, built in one pass:
#   1. dummy-encode the regressors (first level of each encoded column dropped to
#      avoid perfect collinearity with the intercept); get_dummies leaves numeric
#      columns untouched -- presumably 'age_yrs' is numeric and only 'region_type'
#      is expanded, TODO confirm,
#   2. prepend the intercept column,
#   3. cast everything (including boolean dummy columns) to float for statsmodels.
X = sm.add_constant(
    pd.get_dummies(data[['age_yrs', 'region_type']], drop_first=True)
).astype(float)

# Outcome: log of total income (NaN where income was missing or masked)
y = data['log_total_income']

# OLS fit; rows with any missing value in y or X are dropped by statsmodels.
model = sm.OLS(y, X, missing='drop').fit()

# Store the residuals back on the frame. The assignment aligns on the index, so rows
# that were excluded from the fit are left as NaN in 'residuals'.
data['residuals'] = model.resid

# Step 5: Estimating Variance Components
# Previous-row residual within each (hh_id, mem_id) group, used as the AR(1) lag.
# NOTE(review): shift() lags by row position, not by calendar period. This assumes the
# rows of each member are already in chronological order with no skipped periods --
# no sort or time column is visible in this cell, confirm upstream.
data['residuals_lagged'] = data.groupby(['hh_id', 'mem_id'])['residuals'].shift()

# Keep only observations where both the residual and its lag are available
# (drops each group's first row and rows excluded from the regression).
residuals_data = data.loc[:, ['residuals', 'residuals_lagged']].dropna()

# Sample second moments of the residual and its lag
sigma_residuals = residuals_data['residuals'].var()
sigma_residuals_lagged = residuals_data['residuals_lagged'].var()
cov_residuals = residuals_data.cov().loc['residuals', 'residuals_lagged']

# Persistence (rho): slope of the residual on its own lag, cov / var(lag)
rho = cov_residuals / sigma_residuals_lagged

# Variance split: sigma_eta^2 is the part of var(residual) not explained by the lag,
# sigma_alpha^2 is the remainder (numerically rho^2 * var(lagged residual)).
sigma_eta_squared = sigma_residuals - rho**2 * sigma_residuals_lagged
sigma_alpha_squared = sigma_residuals - sigma_eta_squared

# Step 6: Bootstrapping for Confidence Intervals
n_iterations = 500

# One seeded generator shared by every iteration: the draws differ from iteration to
# iteration, but the whole sequence (and therefore the reported intervals) is
# reproducible across reruns. Passing an integer seed to each resample() call instead
# would return the identical sample every time.
bootstrap_seed = 42
bootstrap_rng = np.random.RandomState(bootstrap_seed)


def _ar1_components(frame):
    """Return (rho, sigma_eta^2, sigma_alpha^2) for a residual / lagged-residual frame.

    `frame` must have the columns 'residuals' and 'residuals_lagged' in that order and
    contain no missing values. Uses the same formulas as the point estimates:
    rho = cov / var(lag), sigma_eta^2 = var - rho^2 * var(lag),
    sigma_alpha^2 = var - sigma_eta^2.
    """
    var_resid = frame['residuals'].var()
    var_lagged = frame['residuals_lagged'].var()
    cov_resid = frame.cov().iloc[0, 1]

    rho_hat = cov_resid / var_lagged
    eta_hat = var_resid - rho_hat**2 * var_lagged
    alpha_hat = var_resid - eta_hat
    return rho_hat, eta_hat, alpha_hat


def _percentile_interval(draws):
    """Return the (2.5th, 97.5th) percentile pair of the bootstrap draws."""
    lower, upper = np.percentile(draws, [2.5, 97.5])
    return (lower, upper)


bootstrap_rho = []
bootstrap_sigma_eta_squared = []
bootstrap_sigma_alpha_squared = []

# NOTE(review): rows are resampled independently, which ignores any dependence between
# observations of the same household member. If that dependence is material the
# intervals below are too narrow; resampling whole (hh_id, mem_id) groups would be the
# usual remedy -- confirm which is intended.
for _ in range(n_iterations):
    # Resample the rows with replacement (same size as the original sample)
    sample = resample(residuals_data, random_state=bootstrap_rng)

    rho_sample, sigma_eta_squared_sample, sigma_alpha_squared_sample = _ar1_components(sample)

    bootstrap_rho.append(rho_sample)
    bootstrap_sigma_eta_squared.append(sigma_eta_squared_sample)
    bootstrap_sigma_alpha_squared.append(sigma_alpha_squared_sample)

# Calculate 95% percentile confidence intervals
rho_ci = _percentile_interval(bootstrap_rho)
sigma_eta_ci = _percentile_interval(bootstrap_sigma_eta_squared)
sigma_alpha_ci = _percentile_interval(bootstrap_sigma_alpha_squared)

# Output results
print(f"Estimated rho (persistence of income shocks): {rho}")
print(f"95% CI for rho: {rho_ci}")
print(f"Estimated variance of transitory shocks (sigma_eta^2): {sigma_eta_squared}")
print(f"95% CI for sigma_eta^2: {sigma_eta_ci}")
print(f"Estimated variance of persistent shocks (sigma_alpha^2): {sigma_alpha_squared}")
print(f"95% CI for sigma_alpha^2: {sigma_alpha_ci}")

Estimated rho (persistence of income shocks): 0.9441899729781386
95% CI for rho: (0.9437703645442105, 0.9445955738338977)
Estimated variance of transitory shocks (sigma_eta^2): 0.0672447799508552
95% CI for sigma_eta^2: (0.06683108799733349, 0.06758979886543359)
Estimated variance of persistent shocks (sigma_alpha^2): 0.5433710067584161
95% CI for sigma_alpha^2: (0.5424839360577294, 0.5441660827569366)


In [7]:
# Define the output content
# Built from the estimates held in the kernel (rho, sigma_eta_squared,
# sigma_alpha_squared and their *_ci tuples) rather than from numbers copied by hand
# out of an earlier run, so the saved file always matches the current results. The
# wording of each line is identical to the printed summary. Running this cell before
# the estimation cell raises NameError instead of silently writing stale values.
output_content = (
    f"Estimated rho (persistence of income shocks): {rho}\n"
    f"95% CI for rho: {rho_ci}\n"
    f"Estimated variance of transitory shocks (sigma_eta^2): {sigma_eta_squared}\n"
    f"95% CI for sigma_eta^2: {sigma_eta_ci}\n"
    f"Estimated variance of persistent shocks (sigma_alpha^2): {sigma_alpha_squared}\n"
    f"95% CI for sigma_alpha^2: {sigma_alpha_ci}\n"
)

# Specify the file path
# NOTE(review): absolute, machine-specific path; the target directory must already exist.
file_path = "/Users/bishmaybarik/Library/CloudStorage/OneDrive-ShivNadarInstitutionofEminence/estimate_inc_risk/03_results/population.txt"

# Write the content to the file (overwrites any existing file); the encoding is pinned
# so the result does not depend on the platform default.
with open(file_path, "w", encoding="utf-8") as file:
    file.write(output_content)

print(f"Results saved to {file_path}")


Results saved to /Users/bishmaybarik/Library/CloudStorage/OneDrive-ShivNadarInstitutionofEminence/estimate_inc_risk/03_results/population.txt


In [1]:
pip install fuzzywuzzy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
