# Turkey Bank Overdraft Analysis - Master Notebook

**Unshrouding: Evidence from Bank Overdrafts in Turkey**  
Python Replication - June 2017  

This master notebook orchestrates the entire analysis pipeline, equivalent to the original `0_master_all.do` Stata script.

## Analysis Pipeline

1. **Setup**
2. **Data Cleaning**  
3. **Main Analysis Tables**
4. **Appendix Tables**
5. **Figures**

---


In [4]:
# Setup and imports
import sys
import os
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Add project root to path
project_root = Path.cwd().parent
sys.path.append(str(project_root / 'config'))

# Import configuration
import config
import pyreadstat
import pandas as pd
import numpy as np
from datetime import datetime

print(f"Master Analysis Started: {datetime.now()}")
print(f"Project Root: {project_root}")
print(f"Study: {config.STUDY_TITLE}")

# Configuration control flags (equivalent to Stata locals)
run_data_cleaning = True  
run_analysis = True

# Output options
create_excel_output = True
create_figures = True
verbose = True

print("\nAnalysis Configuration:")
print(f"  Data Cleaning: {run_data_cleaning}")
print(f"  Main Analysis: {run_analysis}")
print(f"  Excel Output: {create_excel_output}")
print(f"  Figures: {create_figures}")

# Load all datasets directly from original files using pyreadstat
print(f"\nLoading data from: {config.ORIGINAL_DATA_DIR}")
print("Loading datasets directly with pyreadstat...")
print("="*50)

datasets = {}
for name, file_path in config.DATASETS.items():
    if file_path.exists():
        print(f"Loading {name}...")
        try:
            df, meta = pyreadstat.read_dta(str(file_path))
            datasets[name] = df
            print(f"✓ {name}: {df.shape[0]:,} rows × {df.shape[1]} columns")
        except Exception as e:
            print(f"✗ Failed to load {name}: {e}")
    else:
        print(f"✗ File not found: {file_path}")

print(f"\nSuccessfully loaded {len(datasets)} datasets: {list(datasets.keys())}")


Master Analysis Started: 2025-08-28 00:39:43.425192
Project Root: /Users/zenofficial/Documents/statistics/pcs/turkey_python_analysis
Study: Unshrouding: Evidence from Bank Overdrafts in Turkey

Analysis Configuration:
  Data Cleaning: True
  Main Analysis: True
  Excel Output: True
  Figures: True

Loading data from: /Users/zenofficial/Documents/statistics/pcs/Turkey Dataverse Files/Turkey Dataverse Files/Data
Loading datasets directly with pyreadstat...
Loading all...
✓ all: 1,836,354 rows × 42 columns
Loading dailydata...
✓ dailydata: 1,195,591 rows × 3 columns
Loading campaign_and_postcampaign...
✓ campaign_and_postcampaign: 2,484,000 rows × 49 columns
Loading campaign_and_postcampaign_controlmerged...
✓ campaign_and_postcampaign_controlmerged: 3,389,441 rows × 54 columns
Loading orthogonality...
✓ orthogonality: 108,000 rows × 77 columns
Loading data_group...
✓ data_group: 2,808,000 rows × 35 columns
Loading data_group_interactions...
✓ data_group_interactions: 2,808,000 rows × 519

In [5]:
# Complete Orthogonality Data Preparation - Exact Stata Replication
print("\n" + "="*60)
print("COMPLETE ORTHOGONALITY DATA PREPARATION")
print("Replicating: data7_orthogonality.do")
print("="*60)

# Load 'all' dataset
if 'all' not in datasets:
    raise ValueError("'all' dataset not found in loaded datasets")

df = datasets['all'].copy()
print(f"Starting with 'all' dataset: {df.shape[0]:,} rows")

# Step 1: Filter to pre-campaign period
# Stata: keep if (date <= ym(2012,8) & date >= ym(2011,9))
print("\nStep 1: Filtering to pre-campaign period...")
initial_rows = len(df)
df = df[(df['date'] >= 621) & (df['date'] <= 632)].copy()  # ym(2011,9) to ym(2012,8)
print(f"✓ Filtered: {initial_rows:,} → {len(df):,} rows ({initial_rows - len(df):,} removed)")

# Sort by id and date
df = df.sort_values(['id', 'date'])
print("✓ Sorted by id and date")



COMPLETE ORTHOGONALITY DATA PREPARATION
Replicating: data7_orthogonality.do
Starting with 'all' dataset: 1,836,354 rows

Step 1: Filtering to pre-campaign period...
✓ Filtered: 1,836,354 → 1,296,000 rows (540,354 removed)
✓ Sorted by id and date


In [14]:
# Step 3: Create aggregated variables (Stata egen commands)
print("\nStep 3: Creating aggregated variables...")

# Check data types first
print("Checking column data types:")
cols_to_check = ['a_total', 'a_deposit', 'payment', 'debt_tot', 'transnum', 'var3', 'faamount']
for col in cols_to_check:
    if col in df.columns:
        print(f"  {col}: {df[col].dtype}")

# First ensure numeric columns are properly typed
print("\nConverting columns to numeric...")
numeric_cols = ['a_total', 'a_deposit', 'payment', 'debt_tot', 'transnum']
for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        # Fill missing values with 0 (as per Stata logic)
        df[col] = df[col].fillna(0)
        print(f"  Converted {col} to numeric")

# Create customer-level means (equivalent to Stata egen mean, by(id))
agg_vars = {
    'transactions': 'transnum',
    'assets': 'a_total',
    'deposits': 'a_deposit', 
    'paymentmean': 'payment',
    'debt': 'debt_tot'
}

for new_var, source_var in agg_vars.items():
    if source_var in df.columns:
        customer_means = df.groupby('id')[source_var].mean().reset_index()
        customer_means.columns = ['id', new_var]
        df = df.merge(customer_means, on='id', how='left')
        mean_val = customer_means[new_var].mean()
        print(f"✓ Created {new_var} from {source_var} (mean: {mean_val:.2f})")

# Handle faamount and pastuse
# Note: In Stata, egen pastuse = mean(faamount), by(id) creates a variable at observation level
# Then collapse (sum) faamount keeps the sum, while pastuse is kept as first value
df['faamount'] = df['faamount'].fillna(0)

# Create pastuse based on whether customer ever used flex account
customer_faamount_sum = df.groupby('id')['faamount'].sum()
pastuse_binary = (customer_faamount_sum > 0).astype(int)

# Create pastuse at observation level (like Stata egen)
df['pastuse'] = df['id'].map(pastuse_binary)

print(f"✓ Created pastuse: {pastuse_binary.sum():,} customers with past usage")
print(f"  Distribution: {pastuse_binary.value_counts().to_dict()}")

# Handle autobillpay (rename var3)
if 'var3' in df.columns:
    # First convert var3 to numeric if needed
    df['var3'] = pd.to_numeric(df['var3'], errors='coerce').fillna(0)
    df['autobillpay_old'] = df['var3'].copy()
    
    # Create autobillpay mean by customer
    autobillpay_mean = df.groupby('id')['autobillpay_old'].mean().reset_index()
    autobillpay_mean.columns = ['id', 'autobillpay']
    autobillpay_mean['autobillpay'] = (autobillpay_mean['autobillpay'] != 0).astype(int)
    df = df.merge(autobillpay_mean, on='id', how='left')
    print(f"✓ Created autobillpay: {autobillpay_mean['autobillpay'].sum():,} customers with autobillpay")
else:
    print("⚠️  var3 not found - skipping autobillpay creation")

print("✓ All aggregated variables created")



Step 3: Creating aggregated variables...
Checking column data types:
  a_total: float64
  a_deposit: float64
  payment: float64
  debt_tot: object
  transnum: object
  var3: int64
  faamount: float64

Converting columns to numeric...
  Converted a_total to numeric
  Converted a_deposit to numeric
  Converted payment to numeric
  Converted debt_tot to numeric
  Converted transnum to numeric
✓ Created transactions from transnum (mean: 1.64)
✓ Created assets from a_total (mean: 584.12)
✓ Created deposits from a_deposit (mean: 342.38)
✓ Created paymentmean from payment (mean: 6.96)
✓ Created debt from debt_tot (mean: 283.39)
✓ Created pastuse: 32,865 customers with past usage
  Distribution: {0: 75135, 1: 32865}
✓ Created autobillpay: 3,715 customers with autobillpay
✓ All aggregated variables created


In [15]:
# Step 4: Keep only required variables and collapse to customer level
print("\nStep 4: Selecting variables and collapsing...")

# Keep only required variables
keep_vars = ['id', 'date', 'treatment', 'phase1treat', 'cinsiyet', 'city', 'medenihal', 
             'falimit', 'autobillpay', 'creditcard', 'credit', 'transactions', 'assets', 
             'deposits', 'paymentmean', 'debt', 'faamount', 'pastuse', 'acctbalance']

available_vars = [v for v in keep_vars if v in df.columns]
df = df[available_vars].copy()
print(f"✓ Selected {len(available_vars)} variables")

# Sort before collapse
df = df.sort_values(['id', 'date'])

# Collapse to customer level
# firstnm = first non-missing, lastnm = last non-missing, sum = sum
agg_dict = {
    'treatment': 'first',
    'phase1treat': 'first',
    'cinsiyet': 'first',
    'city': 'first',
    'medenihal': 'first',
    'falimit': 'first',
    'transactions': 'first',
    'assets': 'first',
    'deposits': 'first',
    'paymentmean': 'first',
    'debt': 'first',
    'pastuse': 'first',
    'autobillpay': 'first',
    'creditcard': 'last',
    'credit': 'last',
    'faamount': 'sum'
}

# Only aggregate columns that exist
available_agg = {k: v for k, v in agg_dict.items() if k in df.columns}
df_collapsed = df.groupby('id').agg(available_agg).reset_index()

print(f"✓ Collapsed to customer level: {len(df_collapsed):,} customers")
print(f"  Shape: {df_collapsed.shape}")

# Check if we have expected 108,000 customers
if len(df_collapsed) == 108000:
    print("✓ Customer count matches expected: 108,000")
else:
    print(f"⚠️  Customer count: {len(df_collapsed):,} (expected: 108,000)")



Step 4: Selecting variables and collapsing...
✓ Selected 19 variables
✓ Collapsed to customer level: 108,000 customers
  Shape: (108000, 17)
✓ Customer count matches expected: 108,000


In [16]:
# Step 5: Create dummy variables (Stata ta var, gen(newvar_))
print("\nStep 5: Creating dummy variables...")

df = df_collapsed.copy()

# City dummies - ta city, gen(city_)
city_dummies = pd.get_dummies(df['city'], prefix='city', dtype=int)
# Stata labels: city_1 "Other City", city_2 "Istanbul", etc.
# Note: Stata numbering may differ from pandas default
df = pd.concat([df, city_dummies], axis=1)
print(f"✓ Created city dummies: {list(city_dummies.columns)}")

# Marital status dummies - ta medenihal, gen(maritalstatus_)
marital_dummies = pd.get_dummies(df['medenihal'], prefix='maritalstatus', dtype=int)
df = pd.concat([df, marital_dummies], axis=1)
print(f"✓ Created marital status dummies: {list(marital_dummies.columns)}")

# Flex limit dummies - ta falimit, gen(flexlim_)
flexlim_dummies = pd.get_dummies(df['falimit'], prefix='flexlim', dtype=int)
df = pd.concat([df, flexlim_dummies], axis=1)
print(f"✓ Created flex limit dummies: {list(flexlim_dummies.columns)}")

print("✓ All dummy variables created")



Step 5: Creating dummy variables...
✓ Created city dummies: ['city_0.0', 'city_1.0', 'city_2.0', 'city_3.0', 'city_4.0']
✓ Created marital status dummies: ['maritalstatus_1', 'maritalstatus_2', 'maritalstatus_3']
✓ Created flex limit dummies: ['flexlim_1.0', 'flexlim_2.0', 'flexlim_3.0']
✓ All dummy variables created


In [17]:
# Step 6: Create treatment-related variables
print("\nStep 6: Creating treatment-related variables...")

# Frequency treatments
freq_10_treatments = [2,5,8,11,14,17,20,23,26,29,32,35]
freq_20_treatments = [3,6,9,12,15,18,21,24,27,30,33,36]

df['freq_10'] = df['treatment'].isin(freq_10_treatments).astype(int)
df['freq_20'] = df['treatment'].isin(freq_20_treatments).astype(int)

# Create freqtreat variable
df['freqtreat'] = np.nan
df.loc[df['freq_10'] == 1, 'freqtreat'] = 0
df.loc[df['freq_20'] == 1, 'freqtreat'] = 1

# Frequency none
df['freq_none'] = ((df['freq_10'] == 0) & (df['freq_20'] == 0)).astype(int)

# Reminder variables
reminder_short_treatments = [2,3,8,9,14,15,20,21,26,27,32,33]
df['reminder_short'] = df['treatment'].isin(reminder_short_treatments).astype(int)
df['reminder_long'] = ((df['reminder_short'] == 0) & (df['freq_none'] == 0)).astype(int)

# Reminder treatment variable
df['remindtreat'] = np.nan
df.loc[df['freq_none'] == 1, 'remindtreat'] = 0
df.loc[df['reminder_short'] == 1, 'remindtreat'] = 1
df.loc[df['reminder_long'] == 1, 'remindtreat'] = 2

# Main treatment categories
df['control'] = ((df['treatment'] >= 1) & (df['treatment'] <= 6)).astype(int)
df['faonly'] = ((df['treatment'] >= 31) & (df['treatment'] <= 36)).astype(int)
df['fabp'] = ((df['treatment'] >= 25) & (df['treatment'] <= 30)).astype(int)
df['fadc'] = ((df['treatment'] >= 13) & (df['treatment'] <= 18)).astype(int)
df['bponly'] = ((df['treatment'] >= 7) & (df['treatment'] <= 12)).astype(int)
df['dconly'] = ((df['treatment'] >= 19) & (df['treatment'] <= 24)).astype(int)

# Combined treatment variable
df['treats'] = np.nan
df.loc[df['control'] == 1, 'treats'] = 0
df.loc[df['faonly'] == 1, 'treats'] = 1
df.loc[df['fabp'] == 1, 'treats'] = 2
df.loc[df['fadc'] == 1, 'treats'] = 3
df.loc[df['bponly'] == 1, 'treats'] = 4
df.loc[df['dconly'] == 1, 'treats'] = 5

print("✓ Created all treatment-related variables")
print(f"  Frequency treatments: {df['freqtreat'].value_counts().to_dict()}")
print(f"  Reminder treatments: {df['remindtreat'].value_counts().to_dict()}")
print(f"  Main treatments: {df['treats'].value_counts().to_dict()}")



Step 6: Creating treatment-related variables...
✓ Created all treatment-related variables
  Frequency treatments: {1.0: 36052, 0.0: 35985}
  Reminder treatments: {1.0: 36064, 2.0: 35973, 0.0: 35963}
  Main treatments: {0.0: 18043, 4.0: 18021, 3.0: 17995, 5.0: 17983, 2.0: 17981, 1.0: 17977}


In [18]:
# Step 7: Create treatment dummy variables (ta treatment, gen(treat_))
print("\nStep 7: Creating treatment dummy variables...")

# Create treatment dummies
treatment_dummies = pd.get_dummies(df['treatment'], prefix='treat', dtype=int)

# Drop treat_1 (Stata: drop treat_1)
if 'treat_1.0' in treatment_dummies.columns:
    treatment_dummies = treatment_dummies.drop('treat_1.0', axis=1)
elif 'treat_1' in treatment_dummies.columns:
    treatment_dummies = treatment_dummies.drop('treat_1', axis=1)

# Rename columns to remove .0 suffix if present
treatment_dummies.columns = [col.replace('.0', '') for col in treatment_dummies.columns]

# Add to main dataframe
df = pd.concat([df, treatment_dummies], axis=1)

print(f"✓ Created {len(treatment_dummies.columns)} treatment dummy variables")
print(f"  Treatment dummies: treat_2 through treat_{36}")

# Final sort by id
df = df.sort_values('id')
print("✓ Final sort by id")

# Update datasets dictionary
df_final = df.copy()
datasets['orthogonality'] = df_final

print(f"\nFinal dataset shape: {df_final.shape}")
print(f"Expected shape: (108000, 77)")



Step 7: Creating treatment dummy variables...
✓ Created 35 treatment dummy variables
  Treatment dummies: treat_2 through treat_36
✓ Final sort by id

Final dataset shape: (108000, 77)
Expected shape: (108000, 77)


In [20]:
# Step 8: Verify against existing orthogonality.dta
print("\n" + "="*80)
print("VERIFICATION AGAINST EXISTING ORTHOGONALITY.DTA")
print("="*80)

# Load existing dataset for comparison
existing_path = config.DATASETS.get('orthogonality')
if existing_path and existing_path.exists():
    print(f"Loading existing orthogonality.dta...")
    df_existing, _ = pyreadstat.read_dta(str(existing_path))
    
    # Basic comparison
    print(f"\nShape comparison:")
    print(f"  Generated: {df_final.shape}")
    print(f"  Existing:  {df_existing.shape}")
    print(f"  Match: {'✓' if df_final.shape == df_existing.shape else '✗'}")
    
    # Column comparison
    gen_cols = set(df_final.columns)
    exist_cols = set(df_existing.columns)
    missing_cols = exist_cols - gen_cols
    extra_cols = gen_cols - exist_cols
    
    if missing_cols or extra_cols:
        print(f"\nColumn differences:")
        if missing_cols:
            print(f"  Missing: {sorted(missing_cols)}")
        if extra_cols:
            print(f"  Extra: {sorted(extra_cols)}")
    
    # Value comparison for key variables
    print(f"\nKey variable comparison:")
    key_vars = ['treatment', 'pastuse', 'assets', 'debt', 'cinsiyet']
    
    df_final_sorted = df_final.sort_values('id').reset_index(drop=True)
    df_existing_sorted = df_existing.sort_values('id').reset_index(drop=True)
    
    all_match = True
    for var in key_vars:
        if var in df_final.columns and var in df_existing.columns:
            gen_vals = pd.to_numeric(df_final_sorted[var], errors='coerce')
            exist_vals = pd.to_numeric(df_existing_sorted[var], errors='coerce')
            
            if np.allclose(gen_vals, exist_vals, equal_nan=True, rtol=1e-5):
                print(f"  ✓ {var}: Perfect match")
            else:
                diff_count = (~np.isclose(gen_vals, exist_vals, equal_nan=True)).sum()
                print(f"  ✗ {var}: {diff_count:,} differences")
                all_match = False
    
    # Final assessment
    print(f"\n{'='*60}")
    if df_final.shape == df_existing.shape and all_match:
        print("🎉 SUCCESS: Dataset perfectly matches existing orthogonality.dta!")
    else:
        print("⚠️  Dataset created but has some differences from existing")
        print("   Run additional cells to investigate and fix if needed")
    
    # Update final dataset
    datasets['orthogonality'] = df_final
    
else:
    print("No existing orthogonality.dta found for comparison")
    print("Generated dataset saved as new reference")
    
print(f"\nFinal dataset: {df_final.shape[0]:,} rows × {df_final.shape[1]} columns")



VERIFICATION AGAINST EXISTING ORTHOGONALITY.DTA
Loading existing orthogonality.dta...

Shape comparison:
  Generated: (108000, 77)
  Existing:  (108000, 77)
  Match: ✓

Key variable comparison:
  ✓ treatment: Perfect match
  ✗ pastuse: 8,856 differences
  ✗ assets: 107,586 differences
  ✗ debt: 37,877 differences
  ✓ cinsiyet: Perfect match

⚠️  Dataset created but has some differences from existing
   Run additional cells to investigate and fix if needed

Final dataset: 108,000 rows × 77 columns
