# Stage 1: Hybrid Synthetic Data Generation

This notebook implements the **hybrid approach** combining:
1. **Kaggle dataset structure** - Learn realistic patterns from Home Credit dataset
2. **RBI/Indian priors** - Calibrate for Indian market characteristics
3. **Synthetic generation** - Create privacy-preserving synthetic data

## Steps:
1. Download/prepare Kaggle dataset
2. Extract RBI priors
3. Generate synthetic data
4. Evaluate quality
5. Save data card


## Step 1: Setup and Imports


In [None]:
import sys
from pathlib import Path

# Make the project's `src/` directory importable from this notebook.
# The project root is taken to be the parent of the current working
# directory (assumes the notebook is launched from a sub-folder of the
# project — TODO confirm).
project_root = Path().absolute().parent
src_dir = str(project_root / 'src')
# Guard the append so re-running this cell does not keep stacking duplicate
# entries onto sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
from scipy.stats import ks_2samp

# Display options: render every column of a DataFrame (None removes the
# column limit) and cap rendered rows at 100.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# The status glyph was stored mis-encoded (UTF-8 bytes read as Mac Roman);
# restored to the intended check mark.
print("✓ Imports complete")


## Step 2: Check for the Kaggle Dataset (Download If Missing)


In [None]:
# Expected location of the Home Credit training table. Its presence is
# re-checked with `kaggle_path.exists()` wherever the notebook needs to decide
# between Kaggle-informed and priors-only generation.
kaggle_path = project_root / "data" / "kaggle_home_credit" / "application_train.csv"

# Status glyphs below were stored mis-encoded (UTF-8 bytes read as Mac Roman);
# restored to the intended check mark and warning sign.
if kaggle_path.exists():
    print(f"✓ Kaggle dataset found at: {kaggle_path}")
else:
    print("⚠️  Kaggle dataset not found.")
    print("\nTo download:")
    print("1. Run: python src/download_kaggle.py")
    print("2. OR manually download from: https://www.kaggle.com/c/home-credit-default-risk/data")
    print("\nFor now, we'll generate from priors only.")


## Step 3: Load RBI Priors


In [None]:
# Load the prior statistics (default rates, income, demographics, credit
# score) from the project's YAML config.
priors_path = project_root / "config" / "priors_template.yaml"

# Read explicitly as UTF-8: without `encoding`, open() uses the platform's
# locale encoding, which can mis-decode non-ASCII characters in the YAML.
with open(priors_path, 'r', encoding='utf-8') as f:
    priors = yaml.safe_load(f)

# Glyphs in the messages below were stored mis-encoded (UTF-8 bytes read as
# Mac Roman); restored to the intended chart emoji, rupee sign and light bulb.
print("📊 Loaded RBI Priors:")
print(f"  - Default Rate: {priors['default_rates']['overall_retail']*100:.2f}%")
print(f"  - Urban Income Mean: ₹{priors['income_stats']['urban']['mean']:,.0f}")
print(f"  - Average Age: {priors['demographics']['age']['mean']} years")
print(f"  - Credit Score Mean: {priors['credit_score']['mean']}")
print("\n💡 Tip: Update config/priors_template.yaml with actual values from RBI publications")


## Step 4: Initialize Hybrid Generator


In [None]:
# This import is deliberately placed after the setup cell: the module is
# presumably resolved from the `src/` directory that the setup cell appends to
# sys.path (verify against the project layout).
from hybrid_synthetic_generator import HybridSyntheticGenerator

# Build the generator. The Kaggle path is passed only when the CSV is present;
# otherwise None is passed in its place.
generator = HybridSyntheticGenerator(
    kaggle_data_path=str(kaggle_path) if kaggle_path.exists() else None,
    priors_path=str(priors_path)
)

# Status glyph was stored mis-encoded (UTF-8 bytes read as Mac Roman);
# restored to the intended check mark.
print("✓ Generator initialized")


## Step 5: Generate Synthetic Data


In [None]:
# Generate synthetic data
# Options:
# - method: 'gaussian_copula' (fast) or 'ctgan' (better quality, slower)
# - n_samples: Number of synthetic rows to generate
#
# NOTE(review): no random seed is passed here, so results may differ between
# runs — confirm whether HybridSyntheticGenerator exposes a seed parameter.

synthetic_data = generator.generate_synthetic(
    n_samples=10000,
    method='gaussian_copula',  # Start with this, use 'ctgan' for better quality
    # Use the Kaggle-derived structure only when the dataset file is present.
    use_kaggle_structure=kaggle_path.exists(),
    use_indian_priors=True
)

# Status glyph was stored mis-encoded (UTF-8 bytes read as Mac Roman);
# restored to the intended check-mark emoji.
print(f"\n✅ Generated {len(synthetic_data)} synthetic samples")
print(f"\nColumns: {list(synthetic_data.columns)}")
display(synthetic_data.head())


## Step 6: Explore and Save Synthetic Data


In [None]:
# Basic statistics
# (glyphs in the messages were stored mis-encoded — UTF-8 bytes read as
# Mac Roman — and are restored to the intended emoji)
print("📊 Synthetic Data Summary:")
display(synthetic_data.describe())

# Save synthetic data
output_path = project_root / "data" / "synthetic_credit_data_v0.1.parquet"
# Ensure the target directory exists: when the Kaggle dataset was never
# downloaded, `data/` may not have been created yet.
output_path.parent.mkdir(parents=True, exist_ok=True)
generator.save_synthetic_data(synthetic_data, str(output_path))

# Plain string: the message has no placeholders, so no f-prefix is needed.
print("\n✅ Synthetic data saved successfully!")
