# GeoSTAD Synthetic Data Pipeline Test

This notebook demonstrates the complete pipeline for generating and evaluating synthetic business location data from the GeoSTAD dataset.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from synlab.data import load_geostad, prepare_for_synthesis
from synlab.evaluation import (
    compare_basic_stats,
    compare_category_frequencies,
    evaluate_spatial_metrics
)

# Notebook-wide display defaults: never truncate the column list when a
# DataFrame is rendered, and apply seaborn's default theme to all figures.
pd.set_option('display.max_columns', None)
sns.set_theme()

## 2. Load GeoSTAD Data

In [None]:
# Read the real business records together with their domain description,
# asking the loader to filter on geocoding and to remove duplicates.
df_real, domain = load_geostad(filter_geocoded=True, remove_duplicates=True)

print(f"Loaded {len(df_real):,} businesses")
print(f"\nColumns: {df_real.columns.tolist()}")
print(f"\nDomain info:")

# One summary line per domain entry:
#   - coordinate bounds are printed as rounded X / Y ranges,
#   - 'spatial_info' and any non-dict value are printed verbatim,
#   - every other dict is reduced to its number of entries.
for key, value in domain.items():
    if key == 'coordinate_bounds':
        summary = (
            f"X=[{value['X_min']:.0f}, {value['X_max']:.0f}], "
            f"Y=[{value['Y_min']:.0f}, {value['Y_max']:.0f}]"
        )
    elif key == 'spatial_info' or not isinstance(value, dict):
        summary = value
    else:
        summary = len(value)
    print(f"  {key}: {summary}")

In [None]:
# Peek at the first ten rows of the loaded records
df_real.head(n=10)

## 3. Prepare Features for Synthesis

In [None]:
# Build the feature table used for synthesis from the real records.
# NOTE(review): per the original comment, prepare_for_synthesis presumably drops
# identifier and name columns — confirm against its implementation.
df_features = prepare_for_synthesis(
    df_real,
    sample_size=10000,  # 10k-row sample keeps the demo quick
)

print(
    f"Feature data: {df_features.shape}",
    f"Columns: {df_features.columns.tolist()}",
    sep="\n",
)

df_features.head()

## 4. Quick Visualization of Real Data

In [None]:
# Two-panel overview of the real sample: business locations on the left,
# the most frequent organization types on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Left panel: one tiny translucent marker per business
axes[0].scatter(
    df_features['X_2025'],
    df_features['Y_2025'],
    s=1,
    c='blue',
    alpha=0.3,
)
axes[0].set(
    xlabel='X Coordinate (UTM)',
    ylabel='Y Coordinate (UTM)',
    title=f'Real Business Locations (n={len(df_features):,})',
)
axes[0].grid(True, alpha=0.3)

# Right panel: the ten most common `orgf2025` values as horizontal bars
df_features['orgf2025'].value_counts().head(10).plot.barh(ax=axes[1])
axes[1].set(
    xlabel='Count',
    ylabel='Organization Type',
    title='Top 10 Organization Types',
)
axes[1].grid(True, axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

## 5. Load or Generate Synthetic Data

For this demo, we'll use the runner script to generate synthetic data:

```bash
python src/synlab/runners/run_geostad_dpmm_mst.py
```

Or load previously generated synthetic data:

In [None]:
# Load synthetic data written by a previous runner invocation, if the file exists.
# When it does not, `df_synth` is left undefined and the comparison cells below
# skip themselves.
# NOTE(review): the path is relative, so it resolves against the kernel's working
# directory — presumably the repository root; confirm.
from pathlib import Path

synth_path = Path("outputs/synthetic/population/geostad_dpmm_mst_e1/synthetic.csv")

if synth_path.exists():
    df_synth = pd.read_csv(synth_path)
    print(f"Loaded synthetic data: {df_synth.shape}")
    print(f"Columns: {df_synth.columns.tolist()}")
    # Jupyter only auto-renders a trailing *top-level* expression; a bare
    # `df_synth.head()` nested in this `if` would be silently discarded.
    display(df_synth.head())
else:
    print("No synthetic data found. Run: python src/synlab/runners/run_geostad_dpmm_mst.py")

## 6. Compare Real vs Synthetic Data

### Basic Statistics

In [None]:
# Compare basic statistics of the real feature sample and the synthetic data
# (via compare_basic_stats). Skipped when no synthetic data was loaded.
if 'df_synth' in locals():
    stats = compare_basic_stats(df_features, df_synth)
    # A bare `stats` inside the `if` body is not auto-rendered by Jupyter
    # (only a top-level trailing expression is), so display it explicitly.
    display(stats)

### Categorical Distributions

In [None]:
# Compare how often each organization type (`orgf2025`) occurs in the real
# sample versus the synthetic data. Skipped when no synthetic data was loaded.
if 'df_synth' in locals():
    cat_freq = compare_category_frequencies(
        df_features, df_synth,
        column='orgf2025',
        top_k=15
    )
    # A bare `cat_freq` inside the `if` body is not auto-rendered by Jupyter
    # (only a top-level trailing expression is), so display it explicitly.
    display(cat_freq)

### Spatial Metrics

In [None]:
# Spatial quality evaluation of synthetic vs. real data; runs only when
# synthetic data was loaded earlier in the notebook.
if "df_synth" in locals():
    spatial_metrics = evaluate_spatial_metrics(
        df_features,
        df_synth,
        x_col='X_2025',
        y_col='Y_2025',
        unit_col='grk2025',
        categorical_col='orgf2025',
    )

    # Heading plus a 60-character rule, then the transposed metrics table
    print("Spatial Quality Metrics:", "=" * 60, sep="\n")
    display(spatial_metrics.T)

### Visual Comparison

In [None]:
# Draw real and synthetic locations side by side. The panels share both axes so
# the two point clouds use the same coordinate scale; with independent limits
# the clouds could look alike while covering different extents.
# Skipped when no synthetic data was loaded.
if 'df_synth' in locals():
    fig, axes = plt.subplots(1, 2, figsize=(14, 5), sharex=True, sharey=True)

    # (panel name, data frame, marker colour) — one entry per subplot
    panels = [
        ('Real', df_features, 'blue'),
        ('Synthetic', df_synth, 'red'),
    ]
    for ax, (name, frame, colour) in zip(axes, panels):
        ax.scatter(frame['X_2025'], frame['Y_2025'],
                   alpha=0.3, s=1, c=colour, label=name)
        ax.set_xlabel('X Coordinate (UTM)')
        ax.set_ylabel('Y Coordinate (UTM)')
        ax.set_title(f'{name} Business Locations (n={len(frame):,})')
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## Summary

This notebook demonstrates:
1. ✓ Loading GeoSTAD business registry data with the new data loader
2. ✓ Preparing data for synthesis with feature selection
3. ✓ Loading synthetic data generated with DPMM-MST (produced separately via the runner script)
4. ✓ Evaluating synthetic data quality with spatial metrics
5. ✓ Visualizing and comparing real vs synthetic distributions

**Next steps:**
- Experiment with different privacy budgets (epsilon values)
- Try different sample sizes for training
- Implement additional spatial evaluation metrics
- Create automated evaluation reports