# Table 1: Orthogonality Checks

**Equivalent to:** `t1_orthogonality.do`

**Purpose:** Creates Table 1 showing balance checks between treatment and control groups

This table checks that randomization worked by testing whether baseline characteristics are balanced across the treatment groups.

**Original Stata Logic:**
- Uses orthogonality dataset created by `data7_orthogonality.do`
- Runs regressions of baseline characteristics on treatment indicators
- Tests for statistical significance of treatment coefficients
- Reports means for control group and treatment effects


In [None]:
# Setup
# Imports for the whole notebook, plus registration of the project-local
# module directories so that `config` and `analysis_utils` can be imported.
import sys
from pathlib import Path
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any raised while the balance regressions are estimated.
warnings.filterwarnings('ignore')

# Add project paths
# The project root is resolved relative to the current working directory
# (two levels up), so the notebook must be launched from its own folder.
project_root = Path.cwd().parent.parent
for _subdir in ('src', 'config'):
    _module_dir = str(project_root / _subdir)
    # Re-running this cell must not keep growing sys.path with duplicates.
    if _module_dir not in sys.path:
        sys.path.append(_module_dir)

import config
import analysis_utils
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

print("Table 1: Orthogonality Checks")
print("=" * 40)
print("Equivalent to: t1_orthogonality.do")
print(f"Output: {config.TABLES_DIR / 'Table1_Orthogonality.xlsx'}")


Table 1: Orthogonality Checks
Equivalent to: t1_orthogonality.do
Output: /Users/zenofficial/Documents/statistics/pcs/turkey_python_analysis/output/tables/Table1_Orthogonality.xlsx


In [2]:
# Load orthogonality dataset
# Reads the orthogonality parquet file into `df`, reports the treatment-arm
# sizes, and selects the baseline characteristics to be balance-tested.
print("Loading orthogonality dataset...")

try:
    df = pd.read_parquet(config.ANALYSIS_DATA_DIR / 'orthogonality.parquet')
    print(f"✓ Loaded dataset: {df.shape}")
except FileNotFoundError:
    print("❌ Orthogonality dataset not found. Please run 02_orthogonality_data.ipynb first.")
    raise

print(f"Variables available: {list(df.columns)}")

# Check treatment groups
if 'treatment' in df.columns:
    treatment_counts = df['treatment'].value_counts().sort_index()
    print("\nTreatment group distribution:")
    for group, count in treatment_counts.items():
        print(f"  Group {group}: {count} observations")

    # value_counts() drops NaN by default, so unassigned rows would otherwise
    # vanish from the distribution above without any trace.
    n_missing_treatment = int(df['treatment'].isna().sum())
    if n_missing_treatment > 0:
        print(f"  ⚠️  {n_missing_treatment} observations have no treatment assignment")
else:
    print("\n⚠️  Column 'treatment' not found; the balance tests below require it.")

# Define baseline characteristics for balance tests
baseline_vars = [
    'cinsiyet',      # Gender
    'city',          # City
    'medenihal',     # Marital status
    'falimit',       # Overdraft limit
    'transactions',  # Average transactions
    'assets',        # Average assets
    'deposits',      # Average deposits
    'debt',          # Average debt
    'pastuse',       # Past overdraft use
    'autobillpay'    # Auto bill pay
]

# Filter to variables that exist in dataset
existing_baseline_vars = [var for var in baseline_vars if var in df.columns]

# Surface anything that was filtered out instead of dropping it silently;
# a shorter Table 1 should never be a surprise.
missing_baseline_vars = [var for var in baseline_vars if var not in df.columns]
if missing_baseline_vars:
    print(f"\n⚠️  Baseline variables missing from dataset (skipped): {missing_baseline_vars}")

print(f"\nBaseline variables for balance tests: {existing_baseline_vars}")


Loading orthogonality dataset...
✓ Loaded dataset: (108000, 17)
Variables available: ['id', 'treatment', 'phase1treat', 'cinsiyet', 'city', 'medenihal', 'falimit', 'transactions', 'assets', 'deposits', 'paymentmean', 'debt', 'pastuse', 'autobillpay', 'creditcard', 'credit', 'faamount']

Treatment group distribution:
  Group 1.0: 3006 observations
  Group 2.0: 3013 observations
  Group 3.0: 3014 observations
  Group 4.0: 3000 observations
  Group 5.0: 3001 observations
  Group 6.0: 3009 observations
  Group 7.0: 3003 observations
  Group 8.0: 3001 observations
  Group 9.0: 3017 observations
  Group 10.0: 2992 observations
  Group 11.0: 2997 observations
  Group 12.0: 3011 observations
  Group 13.0: 2999 observations
  Group 14.0: 2992 observations
  Group 15.0: 3010 observations
  Group 16.0: 2984 observations
  Group 17.0: 3006 observations
  Group 18.0: 3004 observations
  Group 19.0: 3002 observations
  Group 20.0: 2993 observations
  Group 21.0: 3001 observations
  Group 22.0: 2996 

In [3]:
# Perform balance tests
# Runs analysis_utils.balance_test on every baseline variable, prints one line
# per variable, and collects the variables that are imbalanced at the 5% level
# in `significant_imbalances`.
print("\nPerforming balance tests...")

# Run balance tests comparing treatment vs control
balance_results = analysis_utils.balance_test(
    data=df,
    treatment_var='treatment',
    outcome_vars=existing_baseline_vars
)

print(f"✓ Completed balance tests for {len(balance_results)} variables")

# A NaN control mean means no usable control-group average was produced for
# that variable. Say so explicitly rather than letting "nan" pass unnoticed in
# the listing below.
# NOTE(review): presumably balance_test identifies the control group by a
# specific code in 'treatment' — confirm against analysis_utils.
missing_control = balance_results[balance_results['Control_Mean'].isna()]
if len(missing_control) > 0:
    print(f"\n⚠️  Control mean is NaN for {len(missing_control)} of {len(balance_results)} variables.")
    print("   Check how the control group is coded in 'treatment' before interpreting these results.")

# Display results
print("\nBalance Test Results:")
print("=" * 60)
for _, row in balance_results.iterrows():
    var = row['Variable']
    control_mean = row['Control_Mean']
    treatment_effect = row['Treatment_Effect']
    pval = row['P_Value']

    # Conventional stars: *** p<0.01, ** p<0.05, * p<0.1 (NaN p-values get none)
    significance = ""
    if pval < 0.01:
        significance = "***"
    elif pval < 0.05:
        significance = "**"
    elif pval < 0.1:
        significance = "*"

    print(f"{var:15} | Control: {control_mean:8.3f} | Effect: {treatment_effect:8.3f}{significance:3} | p={pval:.3f}")

# Check for any significant imbalances
significant_imbalances = balance_results[balance_results['P_Value'] < 0.05]
if len(significant_imbalances) > 0:
    print(f"\n⚠️  Found {len(significant_imbalances)} significant imbalances at 5% level:")
    for _, row in significant_imbalances.iterrows():
        print(f"  - {row['Variable']}: p = {row['P_Value']:.3f}")
else:
    print("\n✓ No significant imbalances detected at 5% level")



Performing balance tests...
✓ Completed balance tests for 10 variables

Balance Test Results:
cinsiyet        | Control:      nan | Effect:   -0.000    | p=0.753
city            | Control:      nan | Effect:   -0.000    | p=0.783
medenihal       | Control:      nan | Effect:   -0.000    | p=0.784
falimit         | Control:      nan | Effect:   -0.000    | p=0.601
transactions    | Control:      nan | Effect:    0.000    | p=0.691
assets          | Control:      nan | Effect:    1.523*** | p=0.008
deposits        | Control:      nan | Effect:    0.414    | p=0.177
debt            | Control:      nan | Effect:    0.221    | p=0.295
pastuse         | Control:      nan | Effect:    0.000    | p=0.232
autobillpay     | Control:      nan | Effect:   -0.000    | p=0.824

⚠️  Found 1 significant imbalances at 5% level:
  - assets: p = 0.008


In [None]:
# Create formatted Table 1
# Turns the numeric balance-test results into display strings (three decimals,
# significance stars, parenthesised standard errors) with readable row labels.
print("\nCreating formatted Table 1...")


def _stars(p_value):
    """Return the significance marker for a p-value: *** <0.01, ** <0.05, * <0.1."""
    for cutoff, marker in ((0.01, "***"), (0.05, "**"), (0.1, "*")):
        if p_value < cutoff:
            return marker
    return ""


# Prepare formatted table: one display-ready record per tested variable
table1_data = [
    {
        'Variable': result['Variable'],
        'Control_Mean': f"{result['Control_Mean']:.3f}",
        'Treatment_Effect': f"{result['Treatment_Effect']:.3f}{_stars(result['P_Value'])}",
        'Std_Error': f"({result['Std_Error']:.3f})",
        'P_Value': f"{result['P_Value']:.3f}",
        'N': int(result['N']),
    }
    for _, result in balance_results.iterrows()
]

table1_df = pd.DataFrame(table1_data)

# Human-readable labels; variables without an entry keep their raw name
var_labels = {
    'cinsiyet': 'Gender (1=Female)',
    'city': 'City',
    'medenihal': 'Married (1=Yes)',
    'falimit': 'Overdraft Limit (TL)',
    'transactions': 'Monthly Transactions',
    'assets': 'Monthly Assets (TL)',
    'deposits': 'Monthly Deposits (TL)',
    'debt': 'Monthly Debt (TL)',
    'pastuse': 'Past Overdraft Use (1=Yes)',
    'autobillpay': 'Auto Bill Pay (1=Yes)',
}

table1_df['Variable_Label'] = (
    table1_df['Variable']
    .map(var_labels)
    .fillna(table1_df['Variable'])
)

# Select the presentation columns in order, then give them display titles
table1_final = table1_df[
    ['Variable_Label', 'Control_Mean', 'Treatment_Effect', 'Std_Error', 'P_Value', 'N']
].rename(
    columns={
        'Variable_Label': 'Variable',
        'Control_Mean': 'Control Mean',
        'Treatment_Effect': 'Treatment Effect',
        'Std_Error': 'Std. Error',
        'P_Value': 'P-Value',
    }
)

print("✓ Table 1 created")
print("\nTable 1: Orthogonality Checks")
print("=" * 80)
print(table1_final.to_string(index=False))


In [None]:
# Export Table 1 to Excel
# Writes the formatted table, the raw balance-test results and a small summary
# sheet to one workbook, saves the formatted table as CSV, and prints a recap.
print("\nExporting Table 1...")

# Create additional summary information
# Control is the arm coded 0. Every other non-missing code counts as treated,
# so the size is right for a multi-arm design and unchanged for a binary
# 0/1 indicator (counting only `treatment == 1` would report a single arm).
if 'treatment' in df.columns:
    treatment_codes = df['treatment']
    control_group_size = int((treatment_codes == 0).sum())
    treatment_group_size = int((treatment_codes.notna() & (treatment_codes != 0)).sum())
else:
    control_group_size = 0
    treatment_group_size = 0

summary_info = {
    'Total_Observations': len(df),
    'Treatment_Group_Size': treatment_group_size,
    'Control_Group_Size': control_group_size,
    'Variables_Tested': len(balance_results),
    'Significant_at_5pct': len(balance_results[balance_results['P_Value'] < 0.05]),
    'Significant_at_1pct': len(balance_results[balance_results['P_Value'] < 0.01])
}

# One row per statistic, with a single 'Value' column
summary_df = pd.DataFrame([summary_info]).T
summary_df.columns = ['Value']
summary_df.index.name = 'Statistic'

# Export to Excel
tables_dict = {
    'Table1_Orthogonality': table1_final,
    'Balance_Test_Results': balance_results,
    'Summary_Statistics': summary_df
}

analysis_utils.export_to_excel(
    tables_dict=tables_dict,
    filename='Table1_Orthogonality.xlsx',
    output_dir=str(config.TABLES_DIR)
)

# Also save as CSV for easy viewing
csv_path = config.TABLES_DIR / 'Table1_Orthogonality.csv'
table1_final.to_csv(csv_path, index=False)
print(f"✓ Also saved as CSV: {csv_path}")

# Use the count computed in this cell from balance_results, so the recap does
# not depend on a variable left behind by an earlier cell.
n_significant_5pct = summary_info['Significant_at_5pct']

print("\n" + "="*60)
print("TABLE 1 ANALYSIS COMPLETE")
print("="*60)
print(f"✓ Balance tests completed for {len(balance_results)} variables")
print(f"✓ {n_significant_5pct} significant imbalances at 5% level")
print(f"✓ Results saved to: {config.TABLES_DIR / 'Table1_Orthogonality.xlsx'}")
print("✓ Ready for main analysis (Tables 2-6)")

# Notes for replication
print("\nReplication Notes:")
print("- Significance levels: *** p<0.01, ** p<0.05, * p<0.1")
# NOTE(review): the robust-SE statement below is not checked anywhere in this
# notebook — confirm it against analysis_utils.balance_test.
print("- Standard errors are heteroskedasticity-robust")
print("- Each row represents a separate regression of baseline characteristic on treatment")
if n_significant_5pct == 0:
    print("- Randomization appears successful (no significant imbalances)")
else:
    print(f"- Found {n_significant_5pct} potential randomization issues to investigate")
