In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import sys
import statsmodels.api as sm
import statsmodels.formula.api as smf
from IPython.display import display, clear_output

# --- CONFIGURATION ---
# Path of the Stata (.dta) file that the loading cell passes to pd.read_stata.
# It is a relative path, so it is resolved against the kernel's working directory.
INPUT_DATA_PATH = "data/analysis/data_group_interactions.dta"

In [4]:
# Load the Stata dataset
try:
    df = pd.read_stata(INPUT_DATA_PATH)
except FileNotFoundError:
    print("---")
    print(f"ERROR: Data file not found at '{INPUT_DATA_PATH}'.")
    print("Please update the 'INPUT_DATA_PATH' variable in the first cell and re-run.")
    print("---")
    # Stop here instead of falling through: without the file, `df` is either
    # undefined (NameError a few lines below) or a stale frame left over from an
    # earlier run of this kernel, which would silently feed old data downstream.
    raise
else:
    print(f"Successfully loaded data with {len(df)} rows.")

# --- Data Preparation---

# Impute missing values in the selected variables (those present in the data):
# first with the mean over rows sharing the same id, then with the overall mean
# of the id-filled column for whatever is still missing.
impute_cols = [name for name in ["paymentmean", "debt", "deposits", "assets"] if name in df.columns]
for name in impute_cols:
    id_filled = df[name].fillna(df.groupby('id')[name].transform('mean'))
    df[name] = id_filled.fillna(id_filled.mean())

# Keep only the 'Sept-Dec' rows (the counterpart of `if group == 1` in the Stata code)
is_analysis_group = df['group'] == 'Sept-Dec'
df_analysis = df[is_analysis_group].copy()
print(f"Filtered data for 'group == 1', resulting in {len(df_analysis)} rows for analysis.")


# Variable lists taken from the Stata globals
outcomes = ["fausebal", "daysunder", "avgbalance"]
strat_vars = list(filter(lambda name: name.startswith('strat_'), df.columns))
controls_formula = " + ".join(strat_vars)

# Quick visual check of the filtered frame
print("\nData preview:")
display(df_analysis.head())

Successfully loaded data with 2808000 rows.
Filtered data for 'group == 1', resulting in 108000 rows for analysis.

Data preview:


Unnamed: 0,id,group,daysunder,faamount,autobillpay,postrans,transnum,avgbalance,a_total,a_deposit,...,strat_308,strat_309,strat_310,strat_311,strat_312,strat_313,strat_314,strat_315,strat_316,strat_317
0,1,Sept-Dec,0.0,0.0,0.0,0.0,0.0,0.0,13.3625,13.3625,...,0,0,0,0,0,0,0,0,0,0
26,2,Sept-Dec,0.0,0.0,0.0,1.0,6.75,0.0,397.27,248.32,...,0,0,0,0,0,0,0,0,0,0
52,3,Sept-Dec,0.0,0.0,0.0,0.0,0.0,0.0,4.5325,4.5325,...,0,0,0,0,0,0,0,0,0,0
78,4,Sept-Dec,0.0,0.0,0.0,0.0,0.0,0.0,212.1075,212.1075,...,0,0,0,0,0,0,0,0,0,0
104,5,Sept-Dec,0.0,0.0,0.0,1.0,0.5,0.0,0.825,0.825,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Treatment regressors of the two specifications estimated for every outcome.
# The labels 1 and 3 follow the numbering used in the Stata script (the second
# specification is named '3' there).
model_specs = [
    (1, ["awareness", "message", "message_fa"]),
    (3, ["awareness", "message", "fa", "billpayfa", "debitfa"]),
]

# One record per (outcome, specification), in the order: outcome, then model 1, then model 3.
all_results = []
for outcome in outcomes:
    # Descriptive statistics of the dependent variable over the filtered sample.
    # NOTE(review): these use every non-missing outcome row of df_analysis, which can
    # differ from the estimation sample when a regressor has missing values — confirm
    # which sample the Stata table reports.
    dep_var = df_analysis[outcome]
    dep_mean = dep_var.mean()
    dep_std = f"({dep_var.std():.2f})"

    for model_id, treatment_terms in model_specs:
        # Join a single term list so the formula has no dangling " + " when the
        # data contains no strat_* columns.
        formula = f"{outcome} ~ " + " + ".join([*treatment_terms, *strat_vars])
        fitted = smf.ols(formula, data=df_analysis).fit(cov_type='HC1', use_t=True)

        all_results.append({
            'outcome': outcome,
            'model': model_id,
            'results': fitted,
            'summary': {
                # Rows actually used by this fit (rows with missing values in any
                # model variable are dropped), not just rows with a non-missing outcome.
                'Observations': int(fitted.nobs),
                'Mean of Dependent Variable': dep_mean,
                'std dev': dep_std,
            },
        })

# --- Build the Output Table (replicates esttab) ---
regressors_to_show = ['awareness', 'message', 'message_fa', 'fa', 'billpayfa', 'debitfa']


def _format_coef_cell(results, param):
    """Format one regressor of a fitted model as a two-line table cell.

    Args:
        results: Fitted results object exposing `params`, `bse` and `pvalues`
            as label-indexed Series.
        param: Name of the regressor to look up.

    Returns:
        The coefficient followed by significance stars, a line break, and the
        standard error in parentheses; an empty string when the regressor is
        not part of the model.
    """
    coef = results.params.get(param, np.nan)
    if pd.isna(coef):
        return ""  # Regressor not in this model

    se = results.bse.get(param, np.nan)
    pval = results.pvalues.get(param, np.nan)

    # Star thresholds: *** below 0.01, ** below 0.05, * below 0.10
    if pval < 0.01:
        stars = "***"
    elif pval < 0.05:
        stars = "**"
    elif pval < 0.10:
        stars = "*"
    else:
        stars = ""

    return f"{coef:8.4f}{stars}\n({se:8.4f})"


# One table column per fitted model, named v1, v2, ... in the order of all_results
final_table_cols = {}
for col_number, res in enumerate(all_results, start=1):
    col_data = {v: _format_coef_cell(res['results'], v) for v in regressors_to_show}

    # Summary rows go below the coefficient rows
    summary = res['summary']
    col_data['Mean of Dependent Variable'] = f"{summary['Mean of Dependent Variable']:.4f}"
    col_data['std dev'] = summary['std dev']
    col_data['Observations'] = f"{summary['Observations']}"

    final_table_cols[f"v{col_number}"] = col_data

# Combine into a DataFrame; the row labels become a regular 'Variable' column
final_table = pd.DataFrame(final_table_cols)
final_table = final_table.rename_axis('Variable').reset_index()

In [6]:
# Write the analysis sample to CSV so other notebooks can reuse it
from pathlib import Path

# Target location of the main analysis dataframe
csv_dir = Path("data/analysis")
analysis_csv_path = csv_dir.joinpath("analysis_df.csv")

# Create the directory if needed, then write the file
csv_dir.mkdir(parents=True, exist_ok=True)
df_analysis.to_csv(analysis_csv_path, index=False)
print(f"Saved analysis dataset to {analysis_csv_path.resolve()}")

# Best-effort export of the regression summary table: any failure is reported
# as a note and does not stop the cell
try:
    tables_dir = Path("output/tables")
    summary_csv_path = tables_dir.joinpath("analysis_regression_summary.csv")
    tables_dir.mkdir(parents=True, exist_ok=True)
    final_table.to_csv(summary_csv_path, index=False)
    print(f"Saved regression summary to {summary_csv_path.resolve()}")
except Exception as exc:
    print(f"Note: Could not save regression summary (maybe not created yet): {exc}")


Saved analysis dataset to /Users/zenofficial/Documents/statistics/pcs/turkey_python_analysis/data/analysis/analysis_df.csv
Saved regression summary to /Users/zenofficial/Documents/statistics/pcs/turkey_python_analysis/output/tables/analysis_regression_summary.csv
