In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.utils import resample

# Step 1: Load the dataset
# NOTE(review): absolute, user-specific path -- the notebook only runs where this
# exact file exists. Consider reading the location from a configurable data directory.
data = pd.read_csv('/Users/bishmaybarik/Library/CloudStorage/OneDrive-ShivNadarInstitutionofEminence/estimate_inc_risk/01_data/inrisk_data.csv')

# Step 2: Filter the data
# Keep rows where relation_with_hoh is 'HOH' and gender is 'M' (presumably male heads
# of household -- confirm against the codebook), aged strictly between 25 and 60.
# .copy() yields an independent frame, so the column added in Step 3 is not written
# into a slice of `data` (avoids pandas' SettingWithCopyWarning).
filtered_data = data[
    (data['relation_with_hoh'] == 'HOH') &
    (data['gender'] == 'M') &
    (data['age_yrs'] > 25) &
    (data['age_yrs'] < 60)
].copy()

# Step 3: Combine income variables to calculate total income
# NOTE(review): if inc_of_mem_frm_all_srcs already contains wage income, adding
# inc_of_mem_frm_wages counts wages twice -- confirm against the data dictionary.
filtered_data['total_income'] = (
    filtered_data['inc_of_mem_frm_all_srcs'] + filtered_data['inc_of_mem_frm_wages']
)

# Step 4: Retain only rows with positive total_income
# Rows whose total is NaN (either component missing) fail the comparison and are
# dropped as well. .copy() again so Step 5 writes to an independent frame.
trimmed_data = filtered_data[filtered_data['total_income'] > 0].copy()

# Step 5: Take the log of total income
# Safe because Step 4 guarantees total_income > 0.
trimmed_data['log_total_income'] = np.log(trimmed_data['total_income'])


In [None]:
import pandas as pd
import numpy as np

# Step 6-7: Assign every row to exactly one social group code.
#
# np.select returns the choice of the FIRST condition that matches, so the
# conditions are listed from highest to lowest precedence. This reproduces the
# outcome of the original sequence of overwrites:
#   5  religion is Hindu and caste is Intermediate/Upper Caste
#   4  religion is Muslim (takes precedence over an SC/ST/OBC caste category)
#   3  caste is OBC
#   2  caste is ST
#   1  caste is SC
#   6  everything else (no condition matched)
caste_category = trimmed_data['caste_category']
religion = trimmed_data['religion']

group_conditions = [
    (religion == "Hindu") & caste_category.isin(["Intermediate Caste", "Upper Caste"]),
    religion == "Muslim",
    caste_category == "OBC",
    caste_category == "ST",
    caste_category == "SC",
]
group_codes = [5, 4, 3, 2, 1]

interest_soc_group = pd.Series(
    np.select(group_conditions, group_codes, default=6),
    index=trimmed_data.index,
).astype(int)

# Step 8: Define and apply labels
socgroup_labels = {
    1: "SC",
    2: "ST",
    3: "OBC",
    4: "Muslims",
    5: "Other Hindus",
    6: "Other Religions"
}

# assign() builds a new frame instead of writing columns into `trimmed_data` in
# place; this avoids SettingWithCopyWarning when `trimmed_data` is a filtered slice
# and makes the cell safe to re-run.
trimmed_data = trimmed_data.assign(
    interest_soc_group=interest_soc_group,
    interest_soc_group_label=interest_soc_group.map(socgroup_labels),
)

# The `trimmed_data` DataFrame now contains the `interest_soc_group` and its labeled version.


In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Define social groups (group code -> display label)
social_groups = {
    1: "SC",
    2: "ST",
    3: "OBC",
    4: "Muslims",
    5: "Other Hindus",
    6: "Other Religions"
}

# Initialize a dictionary to store the results: label -> dict of point estimates
results = {}

# Loop over each social group
for group, label in social_groups.items():
    # Filter the data for the current social group.
    # .copy() so the helper columns added below are written to an independent frame
    # rather than to a slice of `trimmed_data` (avoids SettingWithCopyWarning).
    group_data = trimmed_data[trimmed_data['interest_soc_group'] == group].copy()

    # Step 6: Run OLS of log income on age (plus intercept); the fitted values are
    # used as the fixed component (alpha_i)
    X = sm.add_constant(group_data['age_yrs'])  # Add constant for intercept
    y = group_data['log_total_income']
    ols_model = sm.OLS(y, X).fit()
    group_data['fitted_values'] = ols_model.fittedvalues

    # Step 7: Calculate residuals
    group_data['residuals'] = group_data['log_total_income'] - group_data['fitted_values']

    # Step 8: Estimate variance of alpha (sigma_alpha^2) as the sample variance of
    # the fitted values
    sigma_alpha_squared = np.var(group_data['fitted_values'], ddof=1)  # Use ddof=1 for sample variance

    # Step 9: Estimate the transitory income variance (sigma_nu^2) as the sample
    # variance of the residuals
    sigma_nu_squared = np.var(group_data['residuals'], ddof=1)  # Use ddof=1 for sample variance

    # Step 10: Estimate rho (autoregressive parameter)
    # NOTE(review): shift(1) lags by ROW ORDER of the frame. It is a time lag only if
    # consecutive rows are consecutive observations of the same person -- confirm the
    # sort order / consider lagging within a person identifier.
    group_data['lagged_residuals'] = group_data['residuals'].shift(1)
    # Drop only rows made unusable by the lag. A bare dropna() would also discard
    # every row with a missing value in ANY unrelated column of the dataset.
    group_data = group_data.dropna(subset=['residuals', 'lagged_residuals'])
    X_lag = sm.add_constant(group_data['lagged_residuals'])
    y_res = group_data['residuals']
    ar_model = sm.OLS(y_res, X_lag).fit()
    rho = ar_model.params['lagged_residuals']

    # Step 11: Estimate variance of permanent income at t=0 (sigma_y_p0^2)
    # The formula divides by (1 - rho^2), so it is only defined for |rho| < 1.
    if abs(rho) < 1:
        sigma_y_p0_squared = sigma_alpha_squared / (1 - rho**2)
    else:
        raise ValueError("rho must be less than 1 for the variance formula to hold.")

    # Step 12: Estimate variance of shock (sigma_xi^2) from the AR(1) innovations
    shocks = group_data['residuals'] - rho * group_data['lagged_residuals']
    sigma_xi_squared = np.var(shocks, ddof=1)  # Use ddof=1 for sample variance

    # Store the results in the dictionary
    results[label] = {
        'sigma_alpha_squared': sigma_alpha_squared,
        'sigma_nu_squared': sigma_nu_squared,
        'rho': rho,
        'sigma_y_p0_squared': sigma_y_p0_squared,
        'sigma_xi_squared': sigma_xi_squared
    }

# Display the results
for label, result in results.items():
    print(f"Results for {label}:")
    print(f"Variance of alpha (sigma_alpha^2): {result['sigma_alpha_squared']}")
    print(f"Variance of transitory income (sigma_nu^2): {result['sigma_nu_squared']}")
    print(f"Autoregressive parameter (rho): {result['rho']}")
    print(f"Variance of permanent income at t=0 (sigma_y_p0^2): {result['sigma_y_p0_squared']}")
    print(f"Variance of shock (sigma_xi^2): {result['sigma_xi_squared']}")
    print("-" * 50)


In [None]:
output_path = "/Users/bishmaybarik/Library/CloudStorage/OneDrive-ShivNadarInstitutionofEminence/estimate_inc_risk/03_results/caste_group_results.txt"

# (caption, key in the per-group result dict), in the order the lines are written
report_fields = (
    ("Variance of alpha (sigma_alpha^2)", "sigma_alpha_squared"),
    ("Variance of transitory income (sigma_nu^2)", "sigma_nu_squared"),
    ("Autoregressive parameter (rho)", "rho"),
    ("Variance of permanent income at t=0 (sigma_y_p0^2)", "sigma_y_p0_squared"),
    ("Variance of shock (sigma_xi^2)", "sigma_xi_squared"),
)

# "w" truncates any existing file, so this cell starts the report from scratch
with open(output_path, "w") as file:
    # One section per social group: header, one line per parameter, separator
    for label, result in results.items():
        lines = [f"Results for {label}:\n"]
        lines.extend(f"{caption}: {result[key]}\n" for caption, key in report_fields)
        lines.append("-" * 50 + "\n")
        file.writelines(lines)

print(f"Results successfully saved to {output_path}")


In [None]:
import numpy as np
from sklearn.utils import resample
import statsmodels.api as sm

# Bootstrap settings
# NOTE(review): 10 replications is very few for 2.5th/97.5th percentile intervals;
# the value was deliberately reduced, presumably for speed.
n_iterations = 10  # Reduced to 10 iterations
n_size = len(trimmed_data)  # number of rows in the full analysis sample (all groups)

# Social group code -> display label, codes numbered from 1 in list order
social_groups = dict(
    enumerate(
        ["SC", "ST", "OBC", "Muslims", "Other Hindus", "Other Religions"],
        start=1,
    )
)

# Function to calculate the confidence interval
def confidence_interval(data, alpha=0.05):
    """Return the percentile confidence interval of a collection of estimates.

    NaN entries are ignored. Without this, a single NaN in `data` makes
    np.percentile return NaN for both bounds.

    Parameters
    ----------
    data : array-like of float
        Estimates (e.g. bootstrap replications); may contain NaN.
    alpha : float, default 0.05
        Total tail probability; the bounds are the 100*alpha/2 and
        100*(1 - alpha/2) percentiles.

    Returns
    -------
    (lower, upper) : tuple of float
        The two percentile bounds, or (nan, nan) when `data` holds no
        non-NaN value.
    """
    values = np.asarray(data, dtype=float)
    values = values[~np.isnan(values)]  # boolean mask also flattens the input
    if values.size == 0:
        return np.nan, np.nan
    lower, upper = np.percentile(values, [100 * (alpha / 2), 100 * (1 - alpha / 2)])
    return lower, upper

# Seeded generator shared by all draws, so the printed intervals are reproducible
# from a fresh kernel.
BOOTSTRAP_SEED = 42
bootstrap_rng = np.random.RandomState(BOOTSTRAP_SEED)

# Loop over each social group
for group, label in social_groups.items():
    # Filter data for the current group
    group_data = trimmed_data[trimmed_data['interest_soc_group'] == group]
    # A bootstrap replicate must have the same size as the sample it imitates.
    # Drawing len(trimmed_data) rows from a single group would overstate the
    # group's sample size and make its intervals too narrow.
    n_group = len(group_data)

    # Arrays to store bootstrap estimates
    sigma_alpha_squared_samples = []
    sigma_nu_squared_samples = []
    rho_samples = []
    sigma_y_p0_squared_samples = []
    sigma_xi_squared_samples = []

    # Bootstrapping
    for i in range(n_iterations):
        # Resample the data with replacement. reset_index gives a fresh frame with
        # unique row labels, so the column assignments below neither warn about
        # writing to a copy nor depend on aligning duplicated labels.
        bootstrap_sample = resample(
            group_data, n_samples=n_group, replace=True, random_state=bootstrap_rng
        ).reset_index(drop=True)

        # Step 1: Re-fit log income on age and recompute fitted values / residuals
        X_boot = sm.add_constant(bootstrap_sample['age_yrs'])  # Add constant for intercept
        y_boot = bootstrap_sample['log_total_income']
        ols_model_boot = sm.OLS(y_boot, X_boot).fit()
        bootstrap_sample['fitted_values'] = ols_model_boot.fittedvalues
        bootstrap_sample['residuals'] = y_boot - bootstrap_sample['fitted_values']

        # Step 2: Variance of alpha (sigma_alpha^2)
        sigma_alpha_squared_boot = np.var(bootstrap_sample['fitted_values'], ddof=1)
        sigma_alpha_squared_samples.append(sigma_alpha_squared_boot)

        # Step 3: Variance of transitory income (sigma_nu^2)
        sigma_nu_squared_samples.append(np.var(bootstrap_sample['residuals'], ddof=1))

        # Step 4: Autoregressive parameter (rho)
        # NOTE(review): rows were drawn at random, so shift(1) pairs each residual
        # with the residual of an unrelated draw. This likely makes the bootstrap
        # distribution of rho uninformative about serial dependence -- consider a
        # block / cluster bootstrap that preserves the original ordering.
        bootstrap_sample['lagged_residuals'] = bootstrap_sample['residuals'].shift(1)
        # Drop only rows made unusable by the lag; a bare dropna() would also drop
        # rows with missing values in unrelated columns.
        bootstrap_sample = bootstrap_sample.dropna(subset=['residuals', 'lagged_residuals'])
        X_lag_boot = sm.add_constant(bootstrap_sample['lagged_residuals'])
        y_res_boot = bootstrap_sample['residuals']
        ar_model_boot = sm.OLS(y_res_boot, X_lag_boot).fit()
        rho_boot = ar_model_boot.params['lagged_residuals']
        rho_samples.append(rho_boot)

        # Step 5: Variance of permanent income at t=0 (sigma_y_p0^2)
        # Uses the same sigma_alpha^2 as Step 2 (computed before the lag row is
        # dropped), matching how the point estimate is formed.
        if abs(rho_boot) < 1:
            sigma_y_p0_squared_samples.append(sigma_alpha_squared_boot / (1 - rho_boot**2))
        else:
            sigma_y_p0_squared_samples.append(np.nan)  # Handle invalid rho values

        # Step 6: Variance of shock (sigma_xi^2)
        shocks_boot = bootstrap_sample['residuals'] - rho_boot * bootstrap_sample['lagged_residuals']
        sigma_xi_squared_samples.append(np.var(shocks_boot, ddof=1))

    # Calculate confidence intervals for each parameter.
    # Kept under its own name so the point-estimate `results` dict built earlier in
    # the notebook is not overwritten (the save cell reads it).
    ci_results = {
        "sigma_alpha_squared": confidence_interval(sigma_alpha_squared_samples),
        "sigma_nu_squared": confidence_interval(sigma_nu_squared_samples),
        "rho": confidence_interval(rho_samples),
        "sigma_y_p0_squared": confidence_interval(sigma_y_p0_squared_samples),
        "sigma_xi_squared": confidence_interval(sigma_xi_squared_samples),
    }

    # Print the results for the current group
    print(f"Bootstrap Confidence Intervals for {label}:")
    for param, (lower, upper) in ci_results.items():
        print(f"{param}: 95% CI = ({lower:.4f}, {upper:.4f})")
    print("-" * 50)


In [None]:
output_path = "/Users/bishmaybarik/Library/CloudStorage/OneDrive-ShivNadarInstitutionofEminence/estimate_inc_risk/03_results/caste_group_results.txt"

# Seeded generator shared by all draws, so the saved intervals are reproducible
# across runs.
BOOTSTRAP_SEED = 42
bootstrap_rng = np.random.RandomState(BOOTSTRAP_SEED)

# The report is built in memory and written in one go at the end, so a failure
# part-way through the bootstrap cannot leave a half-written section in the file.
report_lines = []

# Loop over each social group
for group, label in social_groups.items():
    # Filter data for the current group
    group_data = trimmed_data[trimmed_data['interest_soc_group'] == group]
    # A bootstrap replicate must have the same size as the sample it imitates.
    # Drawing len(trimmed_data) rows from a single group would overstate the
    # group's sample size and make its intervals too narrow.
    n_group = len(group_data)

    # Arrays to store bootstrap estimates
    sigma_alpha_squared_samples = []
    sigma_nu_squared_samples = []
    rho_samples = []
    sigma_y_p0_squared_samples = []
    sigma_xi_squared_samples = []

    # Bootstrapping
    for i in range(n_iterations):
        # Resample the data with replacement. reset_index gives a fresh frame with
        # unique row labels, so the column assignments below neither warn about
        # writing to a copy nor depend on aligning duplicated labels.
        bootstrap_sample = resample(
            group_data, n_samples=n_group, replace=True, random_state=bootstrap_rng
        ).reset_index(drop=True)

        # Step 1: Re-fit log income on age and recompute fitted values / residuals
        X_boot = sm.add_constant(bootstrap_sample['age_yrs'])  # Add constant for intercept
        y_boot = bootstrap_sample['log_total_income']
        ols_model_boot = sm.OLS(y_boot, X_boot).fit()
        bootstrap_sample['fitted_values'] = ols_model_boot.fittedvalues
        bootstrap_sample['residuals'] = y_boot - bootstrap_sample['fitted_values']

        # Step 2: Variance of alpha (sigma_alpha^2)
        sigma_alpha_squared_boot = np.var(bootstrap_sample['fitted_values'], ddof=1)
        sigma_alpha_squared_samples.append(sigma_alpha_squared_boot)

        # Step 3: Variance of transitory income (sigma_nu^2)
        sigma_nu_squared_samples.append(np.var(bootstrap_sample['residuals'], ddof=1))

        # Step 4: Autoregressive parameter (rho)
        # NOTE(review): rows were drawn at random, so shift(1) pairs each residual
        # with the residual of an unrelated draw. This likely makes the bootstrap
        # distribution of rho uninformative about serial dependence -- consider a
        # block / cluster bootstrap that preserves the original ordering.
        bootstrap_sample['lagged_residuals'] = bootstrap_sample['residuals'].shift(1)
        # Drop only rows made unusable by the lag; a bare dropna() would also drop
        # rows with missing values in unrelated columns.
        bootstrap_sample = bootstrap_sample.dropna(subset=['residuals', 'lagged_residuals'])
        X_lag_boot = sm.add_constant(bootstrap_sample['lagged_residuals'])
        y_res_boot = bootstrap_sample['residuals']
        ar_model_boot = sm.OLS(y_res_boot, X_lag_boot).fit()
        rho_boot = ar_model_boot.params['lagged_residuals']
        rho_samples.append(rho_boot)

        # Step 5: Variance of permanent income at t=0 (sigma_y_p0^2)
        # Uses the same sigma_alpha^2 as Step 2 (computed before the lag row is
        # dropped), matching how the point estimate is formed.
        if abs(rho_boot) < 1:
            sigma_y_p0_squared_samples.append(sigma_alpha_squared_boot / (1 - rho_boot**2))
        else:
            sigma_y_p0_squared_samples.append(np.nan)  # Handle invalid rho values

        # Step 6: Variance of shock (sigma_xi^2)
        shocks_boot = bootstrap_sample['residuals'] - rho_boot * bootstrap_sample['lagged_residuals']
        sigma_xi_squared_samples.append(np.var(shocks_boot, ddof=1))

    # Calculate confidence intervals for each parameter.
    # Kept under its own name so the point-estimate `results` dict built earlier in
    # the notebook is not overwritten.
    ci_results = {
        "sigma_alpha_squared": confidence_interval(sigma_alpha_squared_samples),
        "sigma_nu_squared": confidence_interval(sigma_nu_squared_samples),
        "rho": confidence_interval(rho_samples),
        "sigma_y_p0_squared": confidence_interval(sigma_y_p0_squared_samples),
        "sigma_xi_squared": confidence_interval(sigma_xi_squared_samples),
    }

    # Queue the report section for the current group
    report_lines.append(f"Bootstrap Confidence Intervals for {label}:\n")
    for param, (lower, upper) in ci_results.items():
        report_lines.append(f"{param}: 95% CI = ({lower:.4f}, {upper:.4f})\n")
    report_lines.append("-" * 50 + "\n")

# Open the file in append mode
with open(output_path, "a") as file:  # Use "a" to append to the existing file
    file.writelines(report_lines)

print(f"Bootstrap results successfully appended to {output_path}")
