# Table 2: Effects of First Two Overdraft Messages on Overdraft Usage

**Equivalent to:** `t2_direct1.do`

**Purpose:** Creates Table 2, which shows the direct effects of the first two overdraft messages on overdraft usage during the experiment period.

This table examines the main treatment effects of:
1. Overdraft availability message
2. Overdraft interest discount message

**Analysis Period:** During experiment (September - December 2012)

**Original Stata Logic:**
- Uses the campaign and post-campaign dataset
- Runs regressions with overdraft usage as dependent variable
- Treatment variables: overdraft mentions and interest discount
- Controls for baseline characteristics and randomization strata


In [None]:
# Setup
import sys
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Add project paths
project_root = Path.cwd().parent.parent
sys.path.append(str(project_root / 'src'))
sys.path.append(str(project_root / 'config'))

import config
import analysis_utils
import pyreadstat
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Announce which table this notebook reproduces and where its output is written
header_lines = (
    "Table 2: Direct Effects of Overdraft Messages",
    "=" * 50,
    "Equivalent to: t2_direct1.do",
    f"Output: {config.TABLES_DIR / 'Table2_Direct_Effects.xlsx'}",
)
for header_line in header_lines:
    print(header_line)


In [None]:
# Load the campaign / post-campaign panel and restrict it to the experiment window.
# Produces: df (full data), meta (pyreadstat metadata), df_experiment (filtered copy).
print("Loading campaign and post-campaign dataset...")

# Load dataset directly from Stata file
try:
    df, meta = pyreadstat.read_dta(str(config.DATASETS['campaign_and_postcampaign']))
except Exception as load_error:
    # The read can fail for reasons other than a missing file (wrong DATASETS key,
    # unreadable/corrupt file, ...), so show the real cause before re-raising.
    # `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt and
    # SystemExit propagate without a misleading "not found" message.
    print("❌ Could not load campaign dataset. If the file is missing, please run data conversion first.")
    print(f"   Underlying error: {type(load_error).__name__}: {load_error}")
    raise
else:
    print(f"✓ Loaded from Stata file: {df.shape}")

print(f"Date range: {df['date'].min()} to {df['date'].max()}")
print(f"Unique customers: {df['id'].nunique()}")
print(f"Sample variables: {list(df.columns[:10])}")

# Filter to experiment period (September - December 2012)
experiment_start = pd.to_datetime('2012-09-01')
experiment_end = pd.to_datetime('2012-12-31')

# By default pyreadstat returns Stata date columns as Python `datetime.date`
# objects (object dtype), which recent pandas versions refuse to compare with a
# Timestamp. Build a datetime64 view for the comparison only; the 'date' column
# itself is left unchanged.
date_values = df['date']
if date_values.dtype == object:
    date_values = pd.to_datetime(date_values)

# Use an exclusive upper bound one day after `experiment_end` so that every
# observation on 31 December is kept even if the column carries a time of day.
# For date-only data this is identical to `<= experiment_end`.
window_close = experiment_end + pd.Timedelta(days=1)
in_experiment = (date_values >= experiment_start) & (date_values < window_close)

df_experiment = df[in_experiment].copy()
print(f"\nExperiment period data: {df_experiment.shape}")
print(f"Date range after filter: {df_experiment['date'].min()} to {df_experiment['date'].max()}")


In [None]:
# Declare the candidate treatment / outcome / control columns and record which
# of them are actually present in the experiment-period data.
print("\nDefining treatment and outcome variables...")

# Treatment indicators (placeholder names; align with the real column names)
treatment_vars = [
    'overdraft_mention',      # message 1: overdraft availability
    'interest_discount',      # message 2: overdraft interest discount
    'awareness_msg',          # general awareness message
]

# Overdraft usage outcomes
outcome_vars = [
    'od_usage_rate',          # overdraft usage rate (0/1)
    'od_amount',              # overdraft amount used
    'od_frequency',           # frequency of overdraft use
    'od_balance',             # average overdraft balance
]

# Regression controls
control_vars = [
    'baseline_balance',       # baseline account balance
    'baseline_transactions',  # baseline transaction activity
    'city',                   # city fixed effects
    'cinsiyet',               # gender
    'age',                    # age
]

# Keep only the names that exist as columns, preserving the declared order
columns_present = set(df_experiment.columns)
existing_treatment_vars = [name for name in treatment_vars if name in columns_present]
existing_outcome_vars = [name for name in outcome_vars if name in columns_present]
existing_control_vars = [name for name in control_vars if name in columns_present]

for group_label, found_names in (
    ("treatment", existing_treatment_vars),
    ("outcome", existing_outcome_vars),
    ("control", existing_control_vars),
):
    print(f"Available {group_label} variables: {found_names}")

# The specifications built from these lists are illustrative only; a faithful
# replication has to mirror the original Stata code exactly.


In [None]:
# Collapse the experiment-period panel to one row per customer (`id`) and derive
# a binary overdraft-usage indicator. Produces: df_analysis.
print("\nPreparing data for regression analysis...")

# Requested aggregation per column. 'mean' averages over the treatment period;
# 'first' takes the first non-missing value within each customer.
requested_aggregations = {
    # Outcome variables (mean during treatment period)
    'faamount': 'mean',               # Average overdraft amount
    'od_usage': 'mean',               # Average usage rate

    # Treatment variables (first non-missing value)
    'treatment': 'first',
    'phase1treat': 'first',

    # Control variables (baseline characteristics)
    'cinsiyet': 'first',
    'city': 'first',
    'acctbalance': 'first',
}

# Aggregate only the columns that exist. Passing a missing column to `agg`
# raises KeyError, which would make every `if ... in df_analysis.columns`
# guard below unreachable for the missing-column case they are written for.
aggregations = {
    column: how
    for column, how in requested_aggregations.items()
    if column in df_experiment.columns
}
missing_columns = [column for column in requested_aggregations if column not in aggregations]
if missing_columns:
    print(f"⚠️  Columns not found in data and skipped: {missing_columns}")
if not aggregations:
    raise KeyError(
        f"None of the expected analysis columns are present: {list(requested_aggregations)}"
    )

df_analysis = df_experiment.groupby('id').agg(aggregations).reset_index()

print(f"Analysis dataset: {df_analysis.shape}")

has_amount = 'faamount' in df_analysis.columns

# Create binary outcome variable for overdraft usage
if has_amount:
    # NOTE(review): customers whose mean `faamount` is missing compare as False
    # here and are therefore coded 0 (non-user) - confirm this is intended.
    used_overdraft = df_analysis['faamount'] > 0
    df_analysis['od_used'] = used_overdraft.astype(int)
    print("✓ Created binary overdraft usage variable")

# Show treatment distribution
if 'treatment' in df_analysis.columns:
    treatment_dist = df_analysis['treatment'].value_counts().sort_index()
    print("\nTreatment distribution:")
    for treatment, count in treatment_dist.items():
        print(f"  Treatment {treatment}: {count} customers")

# Show outcome variable summary
if has_amount:
    print("\nOverdraft amount summary:")
    print(f"  Mean: {df_analysis['faamount'].mean():.2f}")
    print(f"  Users (>0): {used_overdraft.sum()} / {len(df_analysis)}")
    print(f"  Usage rate: {used_overdraft.mean():.3f}")


In [None]:
# Fit the template OLS specifications on the customer-level data and print the
# treatment coefficient(s) of each model. Produces: regression_results
# (model name -> fitted statsmodels result), plus model1..model4 when fitted.
print("\nRunning regression analysis...")

# This is a template - actual specifications would match original Stata code exactly
regression_results = {}


def _fit_robust_ols(formula):
    """Fit `formula` by OLS on df_analysis with HC1 (heteroskedasticity-robust) errors."""
    return smf.ols(formula, data=df_analysis).fit(cov_type='HC1')


def _significance_stars(p_value):
    """Return '***', '**', '*' or '' for p < 0.01, < 0.05, < 0.1, otherwise."""
    if p_value < 0.01:
        return "***"
    if p_value < 0.05:
        return "**"
    if p_value < 0.1:
        return "*"
    return ""


has_treatment = 'treatment' in df_analysis.columns

# Controls that survived the customer-level aggregation; empty string if none.
# NOTE(review): variables enter the formula as-is, so a numerically coded `city`
# (or `treatment`) is treated as a continuous regressor rather than as fixed
# effects / arm dummies - wrap in C(...) if that is not intended.
controls_str = " + ".join(var for var in existing_control_vars if var in df_analysis.columns)

# Specification 1: Binary outcome (overdraft usage rate)
if 'od_used' in df_analysis.columns and has_treatment:
    print("\nSpecification 1: Binary overdraft usage")

    # Basic specification
    formula1 = "od_used ~ treatment"
    model1 = _fit_robust_ols(formula1)
    regression_results['Model1_Basic'] = model1

    # With controls
    if controls_str:
        formula2 = f"od_used ~ treatment + {controls_str}"
        model2 = _fit_robust_ols(formula2)
        regression_results['Model2_Controls'] = model2

# Specification 2: Continuous outcome (overdraft amount)
if 'faamount' in df_analysis.columns and has_treatment:
    print("Specification 2: Overdraft amount")

    # Basic specification
    formula3 = "faamount ~ treatment"
    model3 = _fit_robust_ols(formula3)
    regression_results['Model3_Amount_Basic'] = model3

    # With controls
    if controls_str:
        formula4 = f"faamount ~ treatment + {controls_str}"
        model4 = _fit_robust_ols(formula4)
        regression_results['Model4_Amount_Controls'] = model4

print(f"✓ Completed {len(regression_results)} regression specifications")

# Display key results
for name, model in regression_results.items():
    # A numeric treatment yields a single 'treatment' term; a categorical one is
    # expanded into 'treatment[T.<level>]' terms. Report whatever was actually
    # estimated instead of falling back to a made-up "β = 0, p = 1".
    treatment_terms = [
        term for term in model.params.index
        if term == 'treatment' or term.startswith('treatment[')
    ]
    if not treatment_terms:
        print(f"{name}: no treatment coefficient was estimated")
        continue

    for term in treatment_terms:
        treatment_coef = model.params[term]
        treatment_pval = model.pvalues[term]
        significance = _significance_stars(treatment_pval)
        label = name if term == 'treatment' else f"{name} [{term}]"

        print(f"{label}: β = {treatment_coef:.4f}{significance} (p = {treatment_pval:.3f})")


In [None]:
# Format the fitted models as Table 2, export it to Excel and CSV, and print a
# closing summary that reflects whether anything was actually produced.
print("\nCreating formatted Table 2...")

results_available = bool(regression_results)

if results_available:
    # Create regression table
    models_list = list(regression_results.values())
    model_names = list(regression_results.keys())

    table2_df = analysis_utils.create_regression_table(
        results_list=models_list,
        model_names=model_names,
        decimal_places=4
    )

    print("✓ Table 2 created")
    print("\nTable 2: Effects of Overdraft Messages")
    print("=" * 80)
    print(table2_df.to_string())

    # Export to Excel: the formatted table plus one row of fit statistics per model
    tables_dict = {
        'Table2_Direct_Effects': table2_df,
        'Regression_Details': pd.DataFrame({
            'Model': model_names,
            'N_Obs': [int(model.nobs) for model in models_list],
            'R_Squared': [model.rsquared for model in models_list],
            'F_Statistic': [model.fvalue for model in models_list]
        })
    }

    # Make sure the output folder exists; `to_csv` below does not create it
    Path(config.TABLES_DIR).mkdir(parents=True, exist_ok=True)

    analysis_utils.export_to_excel(
        tables_dict=tables_dict,
        filename='Table2_Direct_Effects.xlsx',
        output_dir=str(config.TABLES_DIR)
    )

    # Also save as CSV
    csv_path = config.TABLES_DIR / 'Table2_Direct_Effects.csv'
    table2_df.to_csv(csv_path)
    print(f"✓ Also saved as CSV: {csv_path}")

else:
    print("⚠️  No regression results to display - check variable names and data")

# Only claim completion / saved output when the export branch above really ran
print("\n" + "="*60)
if results_available:
    print("TABLE 2 ANALYSIS COMPLETE")
    print("="*60)
    print("✓ Direct effects analysis completed")
    print(f"✓ Results saved to: {config.TABLES_DIR / 'Table2_Direct_Effects.xlsx'}")
else:
    print("TABLE 2 ANALYSIS INCOMPLETE")
    print("="*60)
    print("⚠️  No regressions were run, so no output files were written")

# Notes for replication
print("\nReplication Notes:")
print("- This is a template implementation")
print("- Actual variable names need to be matched to original Stata code")
print("- Treatment specifications should match original study design")
print("- Standard errors are heteroskedasticity-robust")
print("- Consider clustered standard errors at appropriate level")
