# In [None]:
"""
Which Routes/Regions Have Higher Delays?
Bayesian Hierarchical Analysis
"""

import tempfile
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
# Install an "ignore" filter for every warning raised from here on.
warnings.filterwarnings('ignore')

# Plot defaults: seaborn's white-grid theme and a wide default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

# 1. LOAD & EXPLORE DATA
# Read the delay records and print a quick overview. A single print() with a
# newline separator emits exactly what four consecutive print() calls would.
df = pd.read_csv('train_delays.csv')
print(
    f"Data shape: {df.shape}",
    f"\nColumns: {df.columns.tolist()}",
    f"\nFirst rows:\n{df.head()}",
    f"\nDelay duration stats:\n{df['delay_duration'].describe()}",
    sep='\n',
)

# 2. SAMPLE DATA (30% for efficiency)

# Seeds NumPy's legacy global generator. The sample below passes its own
# random_state, so it is reproducible independently of this call.
np.random.seed(42)

# Columns consumed by the Stan models further down.
model_cols = ['route', 'region', 'delay_duration']

# Draw the 30% sample first (so complete data yields exactly the same rows as
# sampling alone), then drop rows with a missing modelled field:
#   * pd.factorize codes a missing label as -1, which becomes 0 after the
#     1-based shift and violates the `lower=1` bound declared in the Stan data;
#   * a NaN delay leaves the normal likelihood undefined.
sampled = df.sample(frac=0.3, random_state=42)
df_sample = sampled.dropna(subset=model_cols).reset_index(drop=True)
n_dropped = len(sampled) - len(df_sample)
if n_dropped:
    print(f"\nDropped {n_dropped} sampled rows with missing route/region/delay_duration")

# Create hierarchical indices: integer codes per row plus the unique labels
# (factorize returns labels in order of first appearance, so code k <-> labels[k]).
route_idx, routes = pd.factorize(df_sample['route'])
region_idx, regions = pd.factorize(df_sample['region'])

y = df_sample['delay_duration'].values
route_idx = route_idx + 1  # Stan uses 1-based indexing
region_idx = region_idx + 1

n_obs = len(y)
n_routes = len(routes)
n_regions = len(regions)

print(f"\n{'='*70}")
print(f"SAMPLED DATA: {n_obs} records")
print(f"{'='*70}")
print(f"Routes: {n_routes}")
print(f"Regions: {n_regions}")

# Raw (unpooled) per-group summaries, for comparison with the posterior means.
print("\nDELAY BY ROUTE (Observed):")
route_summary = df_sample.groupby('route')['delay_duration'].agg(['count', 'mean', 'std']).round(2)
print(route_summary)

print("\nDELAY BY REGION (Observed):")
region_summary = df_sample.groupby('region')['delay_duration'].agg(['count', 'mean', 'std']).round(2)
print(region_summary)

# ============================================================================
# 3. DEFINE STAN MODELS
# ============================================================================

# Both programs share one structure: each group (route or region) has its own
# mean `alpha_*`, drawn from a common normal(mu, tau) population, and every
# observation is normal around its group's mean with a shared `sigma`.
# `y_pred` simulates one replicated observation per data row for the
# posterior predictive checks.
#
# The group index is declared with the `array[N] int` syntax; the legacy
# `int idx[N]` form was removed from the Stan language in release 2.33.

stan_route = """
data {
  int<lower=0> n_obs;
  int<lower=0> n_routes;
  vector[n_obs] y;
  array[n_obs] int<lower=1,upper=n_routes> route_idx;
}
parameters {
  real mu;
  real<lower=0> tau;
  vector[n_routes] alpha_route;
  real<lower=0> sigma;
}
model {
  mu ~ normal(0, 20);
  tau ~ exponential(0.1);
  alpha_route ~ normal(mu, tau);
  sigma ~ exponential(0.1);
  y ~ normal(alpha_route[route_idx], sigma);
}
generated quantities {
  vector[n_obs] y_pred;
  for (i in 1:n_obs)
    y_pred[i] = normal_rng(alpha_route[route_idx[i]], sigma);
}
"""

stan_region = """
data {
  int<lower=0> n_obs;
  int<lower=0> n_regions;
  vector[n_obs] y;
  array[n_obs] int<lower=1,upper=n_regions> region_idx;
}
parameters {
  real mu;
  real<lower=0> tau;
  vector[n_regions] alpha_region;
  real<lower=0> sigma;
}
model {
  mu ~ normal(0, 20);
  tau ~ exponential(0.1);
  alpha_region ~ normal(mu, tau);
  sigma ~ exponential(0.1);
  y ~ normal(alpha_region[region_idx], sigma);
}
generated quantities {
  vector[n_obs] y_pred;
  for (i in 1:n_obs)
    y_pred[i] = normal_rng(alpha_region[region_idx[i]], sigma);
}
"""

print("\n✓ Stan models defined")

# ============================================================================
# 4. CHECK STAN AVAILABILITY
# ============================================================================

# Every later Stan step is gated on `stan_available`. The flag is only set to
# True when both the cmdstanpy package and a CmdStan installation are present:
# cmdstan_path() raises ValueError when no usable installation is found, which
# would otherwise only surface later as a failure while compiling the models.
try:
    from cmdstanpy import CmdStanModel, cmdstan_path
except ImportError:
    stan_available = False
    print("⚠️ CmdStanPy not available")
    print("   Install: pip install cmdstanpy")
else:
    try:
        cmdstan_path()
    except ValueError as err:
        stan_available = False
        print("⚠️ CmdStanPy is installed but no CmdStan installation was found")
        print(f"   {err}")
        print("   Install: python -m cmdstanpy.install_cmdstan")
    else:
        stan_available = True
        print("✓ CmdStanPy available - using Stan models")

# ============================================================================
# 5. FIT ROUTE MODEL
# ============================================================================

if stan_available:
    print(f"\n{'='*70}")
    print("FITTING ROUTE MODEL: Which Routes Have Higher Delays?")
    print(f"{'='*70}\n")

    # CmdStanModel compiles a program from a .stan file on disk; it has no
    # `stan_code` argument. Write the program text to a scratch directory and
    # point the model at that file.
    route_model_dir = Path(tempfile.mkdtemp(prefix='route_model_'))
    route_stan_file = route_model_dir / 'route_model.stan'
    route_stan_file.write_text(stan_route, encoding='utf-8')
    model_route = CmdStanModel(stan_file=str(route_stan_file))

    # Keys must match the names declared in the Stan `data` block.
    data_route = {
        'n_obs': n_obs,
        'n_routes': n_routes,
        'y': y,
        'route_idx': route_idx
    }

    fit_route = model_route.sample(
        data=data_route,
        iter_sampling=1000,
        iter_warmup=1000,
        chains=2,
        seed=42,
        show_progress=True
    )

    print("\n✓ Route model fitted successfully")
    # One row per posterior draw, one column per scalar quantity.
    samples_route = fit_route.draws_pd()

# ============================================================================
# 6. FIT REGION MODEL
# ============================================================================

if stan_available:
    print(f"\n{'='*70}")
    print("FITTING REGION MODEL: Which Regions Have Higher Delays?")
    print(f"{'='*70}\n")

    # CmdStanModel compiles a program from a .stan file on disk; it has no
    # `stan_code` argument. Write the program text to a scratch directory and
    # point the model at that file.
    region_model_dir = Path(tempfile.mkdtemp(prefix='region_model_'))
    region_stan_file = region_model_dir / 'region_model.stan'
    region_stan_file.write_text(stan_region, encoding='utf-8')
    model_region = CmdStanModel(stan_file=str(region_stan_file))

    # Keys must match the names declared in the Stan `data` block.
    data_region = {
        'n_obs': n_obs,
        'n_regions': n_regions,
        'y': y,
        'region_idx': region_idx
    }

    fit_region = model_region.sample(
        data=data_region,
        iter_sampling=1000,
        iter_warmup=1000,
        chains=2,
        seed=42,
        show_progress=True
    )

    print("\n✓ Region model fitted successfully")
    # One row per posterior draw, one column per scalar quantity.
    samples_region = fit_region.draws_pd()

# ============================================================================
# 7. EXTRACT & ANALYZE ROUTE EFFECTS
# ============================================================================

if stan_available:
    print(f"\n{'='*70}")
    print("ROUTE ANALYSIS: Which Routes Have Systematically Higher Delays?")
    print(f"{'='*70}\n")

    # Extract route effects from posterior samples: rows are draws, columns
    # are the alpha_route elements.
    # NOTE(review): pairing column k with routes[k] assumes draws_pd() lists
    # alpha_route[1..n_routes] in index order — confirm against CmdStanPy.
    route_cols = [col for col in samples_route.columns if col.startswith('alpha_route')]
    route_effects = samples_route[route_cols].to_numpy()

    # Calculate posterior mean and 95% credible intervals (2.5th / 97.5th
    # percentiles across draws), one value per route.
    route_means = route_effects.mean(axis=0)
    route_lower, route_upper = np.percentile(route_effects, [2.5, 97.5], axis=0)

    # Create results dataframe
    route_results = pd.DataFrame({
        'Route': routes,
        'Posterior_Mean': route_means,
        'CI_Lower': route_lower,
        'CI_Upper': route_upper,
        'CI_Width': route_upper - route_lower
    })

    # Pre-formatted interval text, built before sorting so rows stay aligned.
    route_results['Credible_Interval'] = [
        f"[{lower:.2f}, {upper:.2f}]" for lower, upper in zip(route_lower, route_upper)
    ]

    route_results = route_results.sort_values('Posterior_Mean', ascending=False)

    print("ALL ROUTES (Sorted by Posterior Mean Delay):")
    print(route_results[['Route', 'Posterior_Mean', 'Credible_Interval']].to_string(index=False))

    print(f"\n{'='*70}")
    print("🔴 TOP 5 HIGHEST DELAY ROUTES (Need Attention):")
    print(f"{'='*70}")
    top_routes = route_results.head(5)
    for row in top_routes.itertuples(index=False):
        # str() keeps the ':20s' padding valid for non-string labels (e.g.
        # numeric route IDs), which would otherwise raise ValueError.
        print(f"  {str(row.Route):20s} | Mean: {row.Posterior_Mean:6.2f} mins | 95% CI: {row.Credible_Interval}")

    print(f"\n{'='*70}")
    print("🟢 TOP 5 LOWEST DELAY ROUTES (Best Performers):")
    print(f"{'='*70}")
    # The frame is sorted descending, so the tail holds the lowest means.
    bottom_routes = route_results.tail(5)
    for row in bottom_routes.itertuples(index=False):
        print(f"  {str(row.Route):20s} | Mean: {row.Posterior_Mean:6.2f} mins | 95% CI: {row.Credible_Interval}")

# ============================================================================
# 8. EXTRACT & ANALYZE REGION EFFECTS
# ============================================================================

if stan_available:
    print(f"\n{'='*70}")
    print("REGION ANALYSIS: Which Regions Have Systematically Higher Delays?")
    print(f"{'='*70}\n")

    # Extract region effects from posterior samples: rows are draws, columns
    # are the alpha_region elements.
    # NOTE(review): pairing column k with regions[k] assumes draws_pd() lists
    # alpha_region[1..n_regions] in index order — confirm against CmdStanPy.
    region_cols = [col for col in samples_region.columns if col.startswith('alpha_region')]
    region_effects = samples_region[region_cols].to_numpy()

    # Calculate posterior mean and 95% credible intervals (2.5th / 97.5th
    # percentiles across draws), one value per region.
    region_means = region_effects.mean(axis=0)
    region_lower, region_upper = np.percentile(region_effects, [2.5, 97.5], axis=0)

    # Create results dataframe
    region_results = pd.DataFrame({
        'Region': regions,
        'Posterior_Mean': region_means,
        'CI_Lower': region_lower,
        'CI_Upper': region_upper,
        'CI_Width': region_upper - region_lower
    })

    # Pre-formatted interval text, built before sorting so rows stay aligned.
    region_results['Credible_Interval'] = [
        f"[{lower:.2f}, {upper:.2f}]" for lower, upper in zip(region_lower, region_upper)
    ]

    region_results = region_results.sort_values('Posterior_Mean', ascending=False)

    print("ALL REGIONS (Sorted by Posterior Mean Delay):")
    print(region_results[['Region', 'Posterior_Mean', 'Credible_Interval']].to_string(index=False))

    print(f"\n{'='*70}")
    print("🔴 TOP 5 HIGHEST DELAY REGIONS (Need Attention):")
    print(f"{'='*70}")
    top_regions = region_results.head(5)
    for row in top_regions.itertuples(index=False):
        # str() keeps the ':20s' padding valid for non-string labels (e.g.
        # numeric region codes), which would otherwise raise ValueError.
        print(f"  {str(row.Region):20s} | Mean: {row.Posterior_Mean:6.2f} mins | 95% CI: {row.Credible_Interval}")

    print(f"\n{'='*70}")
    print("🟢 TOP 5 LOWEST DELAY REGIONS (Best Performers):")
    print(f"{'='*70}")
    # The frame is sorted descending, so the tail holds the lowest means.
    bottom_regions = region_results.tail(5)
    for row in bottom_regions.itertuples(index=False):
        print(f"  {str(row.Region):20s} | Mean: {row.Posterior_Mean:6.2f} mins | 95% CI: {row.Credible_Interval}")

# ============================================================================
# 9. POSTERIOR PREDICTIVE CHECKS
# ============================================================================

if stan_available:
    print(f"\n{'='*70}")
    print("POSTERIOR PREDICTIVE CHECKS")
    print(f"{'='*70}\n")

    # Extract predictions: each row is one posterior draw, each column one
    # simulated observation (the y_pred elements).
    route_pred_cols = [col for col in samples_route.columns if col.startswith('y_pred')]
    region_pred_cols = [col for col in samples_region.columns if col.startswith('y_pred')]

    route_preds = samples_route[route_pred_cols].values
    region_preds = samples_region[region_pred_cols].values

    # Plot posterior predictive checks: one panel per model, built by the same
    # loop so the two panels cannot drift apart.
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    ppc_panels = [
        (axes[0], route_preds, 'red', 'Route Model: Posterior Predictive Check'),
        (axes[1], region_preds, 'green', 'Region Model: Posterior Predictive Check'),
    ]

    for ax, preds, overlay_color, panel_title in ppc_panels:
        ax.hist(y, bins=30, alpha=0.6, label='Observed', color='navy', density=True)
        # Overlay the first 100 simulated datasets as very faint histograms;
        # slicing (rather than indexing 0..99) cannot run past the number of
        # available draws.
        for simulated in preds[:100]:
            ax.hist(simulated, bins=30, alpha=0.01, color=overlay_color, density=True)
        ax.set_xlabel('Delay Duration (minutes)')
        ax.set_ylabel('Density')
        ax.set_title(panel_title)
        ax.legend()

    plt.tight_layout()
    plt.savefig('posterior_predictive_checks.png', dpi=300, bbox_inches='tight')
    print("✓ Saved: posterior_predictive_checks.png\n")
    plt.show()

# ============================================================================
# 10. VISUALIZATION: FOREST PLOTS
# ============================================================================

if stan_available:
    print(f"{'='*70}")
    print("CREATING VISUALIZATIONS")
    print(f"{'='*70}\n")
    
    fig, axes = plt.subplots(1, 2, figsize=(14, 8))
    
    # Route forest plot
    route_plot = route_results.head(15).sort_values('Posterior_Mean')
    y_pos = np.arange(len(route_plot))
    
    axes[0].errorbar(route_plot['Posterior_Mean'], y_pos,
                     xerr=[route_plot['Posterior_Mean'] - route_plot['CI_Lower'],
                           route_plot['CI_Upper'] - route_plot['Posterior_Mean']],
                     fmt='o', markersize=8, capsize=5, capthick=2, color='steelblue')
    axes[0].axvline(route_results['Posterior_Mean'].mean(), color='red', linestyle='--', 
                    label='Overall Mean', linewidth=2)
    axes[0].set_yticks(y_pos)
    axes[0].set_yticklabels(route_plot['Route'])
    axes[0].set_xlabel('Posterior Mean Delay (minutes)')
    axes[0].set_title('Route Effects with 95% Credible Intervals\n(Top 15 Routes by Delay)')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)
    
    # Region forest plot
    region_plot = region_results.sort_values('Posterior_Mean')
    y_pos = np.arange(len(region_plot))
    
    axes[1].errorbar(region_plot['Posterior_Mean'], y_pos,
                     xerr=[region_plot['Posterior_Mean'] - region_plot['CI_Lower'],
                           region_plot['CI_Upper'] - region_plot['Posterior_Mean']],
                     fmt='s', markersize=8, capsize=5, capthick=2, color='darkgreen')
    axes[