# NYC Taxi Demand Prediction - All Models vs All Clusters
## Master Thesis - Vilnius University

This notebook trains XGBoost on all 100 NYC taxi clusters, and VAR and ConvLSTM on the 20 highest-demand clusters, for comparison.

**Models:**
- ConvLSTM (spatiotemporal learning)
- XGBoost (gradient boosting with features)
- Vector AutoRegression (VAR) (multivariate temporal model capturing spillovers)

**Output:** Comparative analysis showing which model works best for each cluster type

## CELL 0: Environment Setup

In [None]:
import os
import sys
import warnings
import pickle
import gc
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical models
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller, grangercausalitytests

# Machine Learning
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from xgboost import XGBRegressor

# Deep Learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import ConvLSTM2D, Conv2D, Dense, Flatten, Dropout, BatchNormalization, Input
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam

# Configuration
# NOTE: this silences every warning (including deprecation notices raised by
# the libraries used below); narrow the filter when debugging.
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')
plt.rcParams['figure.figsize'] = (14, 6)

# Set random seeds
np.random.seed(42)
tf.random.set_seed(42)

# Paths
# The defaults are the original machine-specific locations. Set the
# TAXI_INPUT_PATH / TAXI_OUTPUT_PATH environment variables to run the notebook
# on another machine without editing this cell.
INPUT_PATH = os.environ.get('TAXI_INPUT_PATH', 'C:/Users/Anya/master_thesis/output')
OUTPUT_PATH = os.environ.get('TAXI_OUTPUT_PATH', f'{INPUT_PATH}/models_all_comparison')
CHECKPOINT_PATH = os.path.join(OUTPUT_PATH, 'checkpoints')

os.makedirs(OUTPUT_PATH, exist_ok=True)
os.makedirs(CHECKPOINT_PATH, exist_ok=True)

# Hyperparameters
BATCH_SIZE = 32   # mini-batch size passed to the ConvLSTM fit call
N_LAGS = 24       # number of hourly lags per cluster in the XGBoost features
TEST_SIZE = 0.2   # fraction of the timeline (most recent hours) held out for testing
VAL_SIZE = 0.1    # fraction of the timeline, just before the test span, used for validation

print("="*80)
print("ENVIRONMENT CONFIGURED - ALL MODELS COMPARISON (XGBoost + VAR + ConvLSTM)")
print("="*80)
print(f"✓ TensorFlow version: {tf.__version__}")
print(f"✓ Output directory: {OUTPUT_PATH}")
print(f"✓ Will train 3 models: XGBoost (100 clusters), VAR (top 20), ConvLSTM (top 20)")

## CELL 1: Load & Prepare Data for All Clusters

In [None]:
print("\n" + "="*80)
print("STEP 1: Loading Data for All 100 Clusters")
print("="*80)

# Load raw data
data = pd.read_parquet(os.path.join(INPUT_PATH, 'taxi_data_with_clusters_full.parquet'))
print(f"\nRaw data loaded:")
print(f"  Shape: {data.shape}")
print(f"  Date range: {data['tpep_pickup_datetime'].min()} to {data['tpep_pickup_datetime'].max()}")
print(f"  Clusters: {data['kmeans_cluster'].nunique()}")

# Aggregate to hourly demand by cluster
# ('h' is the hourly alias; the upper-case 'H' spelling is deprecated in recent pandas)
data['time_period'] = data['tpep_pickup_datetime'].dt.floor('h')
demand = data.groupby(['time_period', 'kmeans_cluster']).size().reset_index(name='demand')
demand_matrix = demand.pivot(index='time_period', columns='kmeans_cluster', values='demand').fillna(0)
demand_matrix = demand_matrix.sort_index()

# The pivot only has rows for hours in which at least one trip occurred.
# Lag/rolling features and the VAR/ConvLSTM windows below are positional, so
# they are only true hourly lags if the index is a gap-free hourly grid.
# Insert any missing hour as an all-zero row.
# NOTE(review): the grid spans min..max pickup hour, so stray out-of-range
# timestamps in the raw data would add many empty rows — check the count below.
full_hours = pd.date_range(
    demand_matrix.index.min(),
    demand_matrix.index.max(),
    freq='h',
    name='time_period'
)
n_missing_hours = len(full_hours) - len(demand_matrix)
demand_matrix = demand_matrix.reindex(full_hours, fill_value=0)

print(f"\nDemand matrix created:")
print(f"  Shape: {demand_matrix.shape}")
print(f"  All clusters: {demand_matrix.shape[1]}")
print(f"  Empty hours filled with zero demand: {n_missing_hours}")
print(f"  Memory size: {demand_matrix.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Temporal split (70% train, 10% val, 20% test), in chronological order
n = len(demand_matrix)
train_end = int(n * (1 - TEST_SIZE - VAL_SIZE))
val_end = int(n * (1 - TEST_SIZE))

train_data_all = demand_matrix.iloc[:train_end]
val_data_all = demand_matrix.iloc[train_end:val_end]
test_data_all = demand_matrix.iloc[val_end:]

print(f"\nTrain-Test Split:")
print(f"  Training: {len(train_data_all)} hours")
print(f"  Validation: {len(val_data_all)} hours")
print(f"  Test: {len(test_data_all)} hours")

# Save checkpoint
demand_matrix.to_pickle(os.path.join(CHECKPOINT_PATH, '01_demand_matrix.pkl'))
train_data_all.to_pickle(os.path.join(CHECKPOINT_PATH, '01_train_data.pkl'))
val_data_all.to_pickle(os.path.join(CHECKPOINT_PATH, '01_val_data.pkl'))
test_data_all.to_pickle(os.path.join(CHECKPOINT_PATH, '01_test_data.pkl'))

print(f"\n✓ Checkpoint saved")

## CELL 2: Analyze Cluster Characteristics

In [None]:
print("\n" + "="*80)
print("STEP 2: Analyze Cluster Characteristics")
print("="*80)

# Calculate statistics for each cluster (one row per cluster, busiest first)
cluster_stats = pd.DataFrame({
    'cluster_id': demand_matrix.columns,
    'avg_hourly_demand': demand_matrix.mean(),
    'median_hourly_demand': demand_matrix.median(),
    'max_hourly_demand': demand_matrix.max(),
    'std_hourly_demand': demand_matrix.std(),
    'total_demand': demand_matrix.sum(),
    'sparsity_pct': (demand_matrix == 0).sum() / len(demand_matrix) * 100,
    'non_zero_hours': (demand_matrix != 0).sum()
}).sort_values('total_demand', ascending=False).reset_index(drop=True)

# Bucket clusters by average hourly demand. The top bin is open-ended and the
# bottom bin includes 0, so every cluster gets a category: with a finite upper
# edge, or without include_lowest, averages above that edge or exactly 0 would
# fall outside all bins and become NaN.
cluster_stats['demand_category'] = pd.cut(
    cluster_stats['avg_hourly_demand'],
    bins=[0, 10, 50, np.inf],
    labels=['Low', 'Medium', 'High'],
    include_lowest=True
)

print(f"\nCluster Demand Categories:")
print(f"  High-demand (>50 trips/hour): {(cluster_stats['demand_category'] == 'High').sum()} clusters")
print(f"  Medium-demand (10-50 trips/hour): {(cluster_stats['demand_category'] == 'Medium').sum()} clusters")
print(f"  Low-demand (<10 trips/hour): {(cluster_stats['demand_category'] == 'Low').sum()} clusters")

# Save statistics
cluster_stats.to_csv(os.path.join(OUTPUT_PATH, 'cluster_characteristics.csv'), index=False)
print(f"\n✓ Cluster characteristics saved")

print(f"\nTop 10 clusters:")
print(cluster_stats.head(10)[['cluster_id', 'avg_hourly_demand', 'sparsity_pct', 'demand_category']])

## CELL 3: Train XGBoost on All Clusters

In [None]:
print("\n" + "="*80)
print("MODEL 1: XGBoost Training (All 100 Clusters)")
print("="*80)

def create_xgboost_features(data_df, n_lags=24):
    """Create lag, rolling and calendar features for XGBoost.

    Args:
        data_df: DataFrame with a DatetimeIndex and one demand column per
            cluster. It is not modified.
        n_lags: Number of lagged copies (1..n_lags rows back) per column.

    Returns:
        A new DataFrame containing, in this column order: the original
        columns, ``{col}_lag_{k}`` for every column and lag,
        ``{col}_rolling_mean_6`` / ``{col}_rolling_std_6`` /
        ``{col}_rolling_mean_24`` for every column, then ``hour``,
        ``day_of_week``, ``is_weekend``, ``hour_sin`` and ``hour_cos``.
        Rows containing any NaN (the warm-up rows at the start) are dropped.
    """
    # Collect all derived columns first and concatenate once: inserting
    # thousands of columns one at a time fragments the DataFrame and is slow.
    lag_cols = {
        f'{col}_lag_{lag}': data_df[col].shift(lag)
        for col in data_df.columns
        for lag in range(1, n_lags + 1)
    }

    # Rolling statistics are computed on the series shifted by one row, so
    # they only use values strictly before the row being described.
    rolling_cols = {}
    for col in data_df.columns:
        previous = data_df[col].shift(1)
        rolling_cols[f'{col}_rolling_mean_6'] = previous.rolling(window=6).mean()
        rolling_cols[f'{col}_rolling_std_6'] = previous.rolling(window=6).std()
        rolling_cols[f'{col}_rolling_mean_24'] = previous.rolling(window=24).mean()

    # Temporal features
    hour = np.asarray(data_df.index.hour)
    day_of_week = np.asarray(data_df.index.dayofweek)
    temporal = pd.DataFrame(
        {
            'hour': hour,
            'day_of_week': day_of_week,
            'is_weekend': (day_of_week >= 5).astype(int),
            'hour_sin': np.sin(2 * np.pi * hour / 24),
            'hour_cos': np.cos(2 * np.pi * hour / 24),
        },
        index=data_df.index,
    )

    df = pd.concat(
        [
            data_df,
            pd.DataFrame(lag_cols, index=data_df.index),
            pd.DataFrame(rolling_cols, index=data_df.index),
            temporal,
        ],
        axis=1,
    )
    return df.dropna()

# Create features
print("\nCreating XGBoost features...")
# Build the features once on the full chronological matrix and then split by
# timestamp. Building them separately per split would drop the warm-up rows at
# the start of the test period (no history available inside the split), so the
# test metrics would cover fewer hours than test_data_all. Every feature only
# looks backwards (shift >= 1), so no future information leaks into a row.
all_features = create_xgboost_features(demand_matrix, N_LAGS)
train_features = all_features.loc[all_features.index.isin(train_data_all.index)]
test_features = all_features.loc[all_features.index.isin(test_data_all.index)]
del all_features

target_cols = set(demand_matrix.columns)
feature_cols = [col for col in train_features.columns if col not in target_cols]
print(f"  Total features: {len(feature_cols)}")

# Train XGBoost for each cluster (one independent regressor per target column)
xgb_models = {}
xgb_metrics = {}
xgb_predictions = pd.DataFrame(index=test_features.index)

n_clusters = len(demand_matrix.columns)
print(f"\nTraining {n_clusters} XGBoost models...")
for i, cluster in enumerate(demand_matrix.columns):
    if i % 20 == 0:
        print(f"  Progress: {i+1}/{n_clusters}")

    model = XGBRegressor(
        n_estimators=100,
        max_depth=4,
        learning_rate=0.1,
        random_state=42,
        n_jobs=1,
        verbosity=0
    )

    model.fit(
        train_features[feature_cols],
        train_features[cluster],
        verbose=False
    )

    xgb_models[cluster] = model

    # Predict
    pred = model.predict(test_features[feature_cols])
    xgb_predictions[cluster] = pred

    # Evaluate
    actual = test_features[cluster].values
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    # MAPE is undefined where the actual demand is 0, so those hours are skipped
    mask = actual != 0
    mape = np.mean(np.abs((actual[mask] - pred[mask]) / actual[mask])) * 100 if mask.sum() > 0 else np.nan

    xgb_metrics[cluster] = {'RMSE': rmse, 'MAE': mae, 'MAPE': mape}

# Summary
xgb_mape_values = [m['MAPE'] for m in xgb_metrics.values() if not np.isnan(m['MAPE'])]
print(f"\nXGBoost Summary (All 100 Clusters):")
print(f"  Mean MAPE: {np.mean(xgb_mape_values):.2f}%")
print(f"  Median MAPE: {np.median(xgb_mape_values):.2f}%")
print(f"  Std MAPE: {np.std(xgb_mape_values):.2f}%")
print(f"  Min MAPE: {np.min(xgb_mape_values):.2f}%")
print(f"  Max MAPE: {np.max(xgb_mape_values):.2f}%")

# Save
with open(os.path.join(OUTPUT_PATH, 'xgboost_models.pkl'), 'wb') as f:
    pickle.dump(xgb_models, f)
with open(os.path.join(OUTPUT_PATH, 'xgboost_metrics.pkl'), 'wb') as f:
    pickle.dump(xgb_metrics, f)

# The feature frames are large; release them before the next models are built
del train_features, test_features
gc.collect()

print(f"\n✓ XGBoost models trained and saved")

## CELL 4: Train Vector AutoRegression (VAR) on Top 20 High-Demand Clusters

In [None]:
print("\n" + "="*80)
print("MODEL 2: Vector AutoRegression (VAR) - Top 20 High-Demand Zones")
print("="*80)

# Select top 20 high-demand clusters (cluster_stats is sorted by total demand, descending)
top_20_clusters = cluster_stats.head(20)['cluster_id'].tolist()

print(f"\nVAR on {len(top_20_clusters)} high-demand clusters")
print(f"  Reason: VAR captures spatial spillovers (taxi movements between zones)")
print(f"  Example: Taxi shortage at LaGuardia → higher demand in Midtown")

# Prepare data subsets
train_var = train_data_all[top_20_clusters]
test_var = test_data_all[top_20_clusters]

print(f"\nVAR Data:")
print(f"  Training shape: {train_var.shape}")
print(f"  Test shape: {test_var.shape}")
print(f"  Clusters: {len(top_20_clusters)}")

# Check stationarity (VAR requirement)
print(f"\nChecking stationarity (Augmented Dickey-Fuller test)...")
stationary_count = 0
for col in train_var.columns:
    # With autolag set, adfuller returns six values (the last is icbest), so
    # unpacking into five names raises ValueError; index the p-value instead.
    p_value = adfuller(train_var[col].dropna(), autolag='AIC')[1]
    if p_value < 0.05:
        stationary_count += 1

print(f"  Stationary series: {stationary_count}/{len(train_var.columns)}")
# Difference if needed for stationarity
def ensure_stationary_for_var(data, max_diff=2):
    """Difference ``data`` until most of its series pass the ADF test.

    Orders d = 0, 1, ..., max_diff are tried in turn, where order d means the
    first difference applied d times. The first order at which at least 80%
    of the columns have an ADF p-value below 0.05 is accepted.

    Args:
        data: DataFrame with one time series per column.
        max_diff: Highest differencing order to try.

    Returns:
        ``(transformed, d)``: the differenced frame (leading NaN rows
        dropped) and the order used. If no order up to ``max_diff`` meets the
        80% threshold, the original ``data`` and 0 are returned.
    """
    diff_data = data.copy()
    for d in range(max_diff + 1):
        if d > 0:
            # One more first difference on top of the previous order.
            # (data.diff(d) would be a lag-d difference, not order-d differencing.)
            diff_data = diff_data.diff().dropna()

        # Check if most series stationary.
        # adfuller returns six values when autolag is set; [1] is the p-value.
        stationary_flags = [
            adfuller(diff_data[col].dropna(), autolag='AIC')[1] < 0.05
            for col in diff_data.columns
        ]

        if sum(stationary_flags) >= len(diff_data.columns) * 0.8:
            return diff_data, d
    return data, 0

train_var_stat, d_order = ensure_stationary_for_var(train_var)
print(f"  Differencing order applied: d={d_order}")

# Fit VAR model
print(f"\nFitting VAR model with optimal lag selection...")
var_model = VAR(train_var_stat)

# Select optimal lag order
lag_order_results = var_model.select_order(maxlags=24)
optimal_lag_aic = lag_order_results.aic
optimal_lag_bic = lag_order_results.bic

print(f"  Optimal lag (AIC): {optimal_lag_aic}")
print(f"  Optimal lag (BIC): {optimal_lag_bic}")

# Fit with AIC lag (captures more dynamics).
# Use at least one lag: with lag 0 the model has no dynamics and the history
# window sliced below would be empty.
var_lag = max(int(optimal_lag_aic), 1)
var_results = var_model.fit(var_lag)

print(f"\nVAR Model Summary:")
print(f"  Number of equations: {var_results.neqs}")
print(f"  Number of observations: {var_results.nobs}")
print(f"  AIC: {var_results.aic:.4f}")
print(f"  BIC: {var_results.bic:.4f}")

# Check stability
# VARResults.roots are the roots of the characteristic polynomial, i.e. the
# reciprocals of the companion-matrix eigenvalues: a stable VAR has every root
# OUTSIDE the unit circle, equivalently every eigenvalue inside it.
eigenvalues = 1.0 / var_results.roots
is_stable = bool(var_results.is_stable())
print(f"  Model stable: {'Yes ✓' if is_stable else 'No ✗ (check results)'}")
print(f"  Max eigenvalue: {np.max(np.abs(eigenvalues)):.4f}")

# Forecast on test period
print(f"\nGenerating VAR forecasts...")

# Rolling one-step-ahead forecasts: the forecast for test hour t is conditioned
# on the observed demand of the var_lag hours before t. The history spans
# train + validation + test so that (a) the first test hours are conditioned on
# the hours immediately preceding them rather than on the end of the training
# span, and (b) forecasts are never fed back in as if they were observations.
history_levels = pd.concat([train_var, val_data_all[top_20_clusters], test_var])

# Apply the same differencing as was used for fitting. Leading NaN rows are
# kept so row positions stay aligned with history_levels.
history_stat = history_levels
for _ in range(d_order):
    history_stat = history_stat.diff()

level_values = history_levels.values
stat_values = history_stat.values
test_start = len(history_levels) - len(test_var)

var_pred_list = []
for i in range(len(test_var)):
    if i % 50 == 0 and i > 0:
        print(f"  Progress: {i}/{len(test_var)}")

    # Forecast 1 step ahead from the last var_lag observed (differenced) rows
    t = test_start + i
    forecast = var_results.forecast(stat_values[t - var_lag:t], steps=1)
    var_pred_list.append(forecast[0])

var_predictions_arr = np.array(var_pred_list)

# Inverse difference if we differenced the data
if d_order > 0:
    # x_t - (order-d difference of x at t) is a combination of x_{t-1}..x_{t-d}
    # only: x_t itself cancels out. Adding it to the forecast difference
    # rebuilds the level forecast from past observations only
    # (for d=1 this is simply x_{t-1} + forecast).
    past_component = (level_values - stat_values)[test_start:]
    var_predictions_arr = var_predictions_arr + past_component

# Create predictions dataframe
var_predictions = pd.DataFrame(
    var_predictions_arr,
    index=test_var.index,
    columns=top_20_clusters
)

# Evaluate VAR: per-cluster RMSE / MAE / MAPE of the forecasts against the test data
var_metrics = {}

print(f"\nEvaluating VAR predictions...")
for cluster in top_20_clusters:
    actual = test_var[cluster].values
    pred = var_predictions[cluster].values

    # Hours with zero actual demand are left out of MAPE (division by zero)
    mask = actual != 0
    if mask.sum() > 0:
        pct_errors = np.abs((actual[mask] - pred[mask]) / actual[mask])
        mape = np.mean(pct_errors) * 100
    else:
        mape = np.nan

    var_metrics[cluster] = {
        'RMSE': np.sqrt(mean_squared_error(actual, pred)),
        'MAE': mean_absolute_error(actual, pred),
        'MAPE': mape
    }

# Clusters whose MAPE could not be computed are excluded from the summary
var_mape_values = [
    scores['MAPE'] for scores in var_metrics.values() if not np.isnan(scores['MAPE'])
]

# Summary
print(f"\nVAR Summary (Top 20 High-Demand Clusters):")
for label, stat_fn in (
    ('Mean', np.mean),
    ('Median', np.median),
    ('Std', np.std),
    ('Min', np.min),
    ('Max', np.max),
):
    print(f"  {label} MAPE: {stat_fn(var_mape_values):.2f}%")

# Granger Causality Analysis
print(f"\nGranger Causality Analysis...")
print(f"  Testing if Zone j Granger-causes Zone i (does j help predict i?)\n")

# For each ordered pair among the first 5 clusters, test whether the second
# column (cause) helps predict the first column (effect) at lags 1..7, and keep
# the pair if the smallest F-test p-value over those lags is below 0.05.
# NOTE(review): taking the minimum p-value over 7 lags and 20 pairs is not
# corrected for multiple testing, and the test is run on undifferenced demand.
causality_pairs = []
granger_candidates = top_20_clusters[:5]  # Show first 5 for brevity
for effect_cluster in granger_candidates:
    for cause_cluster in granger_candidates:
        if effect_cluster == cause_cluster:
            continue
        try:
            gc_result = grangercausalitytests(
                train_var[[effect_cluster, cause_cluster]].dropna(),
                maxlag=7,
                verbose=False
            )
        except (ValueError, RuntimeError, np.linalg.LinAlgError) as err:
            # Report pairs that cannot be tested instead of silently dropping them
            print(f"    Skipped Zone {cause_cluster} → Zone {effect_cluster}: {err}")
            continue

        # gc_result[lag] is (tests, fitted_models); tests is a dict keyed by
        # test name whose values start with (statistic, p-value, ...).
        min_p_value = min(gc_result[lag][0]['ssr_ftest'][1] for lag in range(1, 8))
        if min_p_value < 0.05:
            causality_pairs.append({
                'cause': int(cause_cluster),
                'effect': int(effect_cluster),
                'p_value': min_p_value
            })

if causality_pairs:
    causality_df = pd.DataFrame(causality_pairs).sort_values('p_value')
    print("  Significant causal relationships (p < 0.05):")
    for idx, row in causality_df.head(10).iterrows():
        print(f"    Zone {row['cause']:.0f} → Zone {row['effect']:.0f} (p={row['p_value']:.4f})")
else:
    print("  No significant causal relationships detected in top 5 clusters")

# Save
with open(os.path.join(OUTPUT_PATH, 'var_metrics.pkl'), 'wb') as f:
    pickle.dump(var_metrics, f)

var_predictions.to_csv(os.path.join(OUTPUT_PATH, 'var_predictions.csv'))

print(f"\n✓ VAR model trained and saved")
print(f"  Key insight: VAR captures spatial spillovers via Granger causality")

## CELL 5: Train ConvLSTM on Top 20 High-Demand Clusters

In [None]:
print("\n" + "="*80)
print("MODEL 3: ConvLSTM Training (Top 20 High-Demand Clusters)")
print("="*80)

print(f"\nConvLSTM on {len(top_20_clusters)} high-demand clusters")
print(f"  Reason: ConvLSTM computationally expensive, best on rich data")

# Prepare data: restrict every split to the selected clusters
train_convlstm, val_convlstm, test_convlstm = (
    split[top_20_clusters]
    for split in (train_data_all, val_data_all, test_data_all)
)

# Scale: the min/max are learned from the training split only and then applied
# unchanged to the validation and test splits
scaler = MinMaxScaler()
scaler.fit(train_convlstm)
train_scaled, val_scaled, test_scaled = (
    pd.DataFrame(scaler.transform(split), index=split.index, columns=split.columns)
    for split in (train_convlstm, val_convlstm, test_convlstm)
)

# Create grid (5x4 = 20 clusters)
def create_grid_data(data_df, grid_height=5, grid_width=4):
    """Lay the columns of ``data_df`` out on a grid, one frame per row.

    Column ``k`` is written to cell ``(k // grid_width, k % grid_width)``.
    Columns that do not fit in the grid are ignored and unused cells stay 0.

    Returns:
        Array of shape ``(len(data_df), grid_height, grid_width, 1)``.
    """
    n_frames = len(data_df)
    n_cells = grid_height * grid_width

    # Fill a flat (frame, cell) table first; row-major reshaping then puts
    # cell k at (k // grid_width, k % grid_width).
    flat = np.zeros((n_frames, n_cells))
    usable = data_df.to_numpy(dtype=float)[:, :n_cells]
    flat[:, :usable.shape[1]] = usable
    return flat.reshape(n_frames, grid_height, grid_width, 1)

grid_train = create_grid_data(train_scaled, 5, 4)
grid_val = create_grid_data(val_scaled, 5, 4)
grid_test = create_grid_data(test_scaled, 5, 4)

print(f"\nGrid shapes: {grid_train.shape}")

# Number of past hourly frames fed to the network; must match the first
# dimension of the Input shape below.
SEQ_LEN = 6


def make_convlstm_sequences(history_grid, target_grid, seq_len):
    """Pair every frame of ``target_grid`` with the ``seq_len`` frames before it.

    ``history_grid`` supplies the frames that precede ``target_grid`` in time
    (only its last ``seq_len`` frames are used), so that even the first target
    frame has a full input window.

    Returns:
        ``(X, y)`` where ``X[i]`` holds the ``seq_len`` frames preceding
        ``target_grid[i]`` and ``y`` is ``target_grid`` itself.
    """
    timeline = np.concatenate([history_grid[-seq_len:], target_grid])
    X = np.stack([timeline[i:i + seq_len] for i in range(len(target_grid))])
    return X, target_grid


# The network forecasts the next frame from the previous SEQ_LEN frames.
# Feeding single frames (and using the input itself as the target) neither
# matches the (6, 5, 4, 1) input shape nor trains a forecaster.
X_train, y_train = make_convlstm_sequences(grid_train[:SEQ_LEN], grid_train[SEQ_LEN:], SEQ_LEN)
X_val, y_val = make_convlstm_sequences(grid_train, grid_val, SEQ_LEN)
X_test, y_test = make_convlstm_sequences(grid_val, grid_test, SEQ_LEN)

print(f"Sequence shapes: X={X_train.shape}, y={y_train.shape}")

# Build ConvLSTM model
input_layer = Input(shape=(SEQ_LEN, 5, 4, 1), name='input')
x = ConvLSTM2D(32, (3, 3), padding='same', return_sequences=True, activation='relu')(input_layer)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = ConvLSTM2D(32, (3, 3), padding='same', return_sequences=False, activation='relu')(x)
x = BatchNormalization()(x)
x = Conv2D(16, (3, 3), padding='same', activation='relu')(x)
output = Conv2D(1, (1, 1), padding='same', activation='relu', name='output')(x)

convlstm_model = Model(inputs=input_layer, outputs=output)
convlstm_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

print(f"\nTraining ConvLSTM...")
history = convlstm_model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=BATCH_SIZE,
    callbacks=[
        EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=0),
    ],
    verbose=0
)

# Predict: one frame per test hour, so rows line up with test_convlstm.index
convlstm_pred = convlstm_model.predict(X_test, verbose=0)

# Extract and evaluate
# Flatten each predicted frame back to cluster order (cell k = row k // 4,
# col k % 4, the layout written by create_grid_data) and undo the scaling for
# all clusters at once, so each cluster is rescaled with its OWN min/max.
n_clusters = len(top_20_clusters)
pred_flat = convlstm_pred[..., 0].reshape(len(convlstm_pred), -1)[:, :n_clusters]
convlstm_predictions = pd.DataFrame(
    scaler.inverse_transform(pred_flat),
    index=test_convlstm.index,
    columns=top_20_clusters
)

convlstm_metrics = {}
for cluster in top_20_clusters:
    actual = test_convlstm[cluster].values
    pred = convlstm_predictions[cluster].values

    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    # MAPE is undefined where the actual demand is 0, so those hours are skipped
    mask = actual != 0
    mape = np.mean(np.abs((actual[mask] - pred[mask]) / actual[mask])) * 100 if mask.sum() > 0 else np.nan

    convlstm_metrics[cluster] = {'RMSE': rmse, 'MAE': mae, 'MAPE': mape}

# Summary
convlstm_mape_values = [m['MAPE'] for m in convlstm_metrics.values() if not np.isnan(m['MAPE'])]
print(f"\nConvLSTM Summary (Top 20 High-Demand Clusters):")
print(f"  Mean MAPE: {np.mean(convlstm_mape_values):.2f}%")
print(f"  Median MAPE: {np.median(convlstm_mape_values):.2f}%")

# Save
convlstm_model.save(os.path.join(OUTPUT_PATH, 'convlstm_model.keras'))
with open(os.path.join(OUTPUT_PATH, 'convlstm_metrics.pkl'), 'wb') as f:
    pickle.dump(convlstm_metrics, f)

print(f"\n✓ ConvLSTM model trained and saved")

## CELL 6: Comparative Analysis & Model Performance Ranking

In [None]:
print("\n" + "="*80)
print("COMPARATIVE ANALYSIS: Which Model Works Best for Each Cluster?")
print("="*80)

# Create comparison dataframe: one row per cluster with each model's MAPE
# (NaN where the model was not evaluated on that cluster) and the winner.
comparison_data = []
stats_by_cluster = cluster_stats.set_index('cluster_id')
model_metrics = {
    'XGBoost': ('xgboost_mape', xgb_metrics),
    'VAR': ('var_mape', var_metrics),            # only for top 20 clusters
    'ConvLSTM': ('convlstm_mape', convlstm_metrics),  # only for top 20 clusters
}

for cluster in demand_matrix.columns:
    row = {'cluster_id': cluster}

    # Get demand category
    cluster_info = stats_by_cluster.loc[cluster]
    row['avg_demand'] = cluster_info['avg_hourly_demand']
    row['demand_category'] = cluster_info['demand_category']
    row['sparsity_pct'] = cluster_info['sparsity_pct']

    # Per-model MAPE, NaN when the model has no entry for this cluster
    mapes = {}
    for model_name, (column, metrics) in model_metrics.items():
        row[column] = metrics[cluster]['MAPE'] if cluster in metrics else np.nan
        mapes[model_name] = row[column]

    # Determine best model among those with a usable MAPE
    mapes = {k: v for k, v in mapes.items() if not np.isnan(v)}

    if mapes:
        row['best_model'] = min(mapes, key=mapes.get)
        row['best_mape'] = min(mapes.values())
    else:
        row['best_model'] = 'N/A'
        row['best_mape'] = np.nan

    comparison_data.append(row)

comparison_df = pd.DataFrame(comparison_data)

# Save comparison
comparison_df.to_csv(os.path.join(OUTPUT_PATH, 'model_comparison_all_clusters.csv'), index=False)

print(f"\nComparison Results:")
print(f"\nBest Model Distribution:")
print(comparison_df['best_model'].value_counts())

print(f"\nPerformance by Model (MAPE):")
print(f"  XGBoost - Mean: {comparison_df['xgboost_mape'].mean():.2f}%, Median: {comparison_df['xgboost_mape'].median():.2f}%")
print(f"  VAR     - Mean: {comparison_df['var_mape'].mean():.2f}%, Median: {comparison_df['var_mape'].median():.2f}%")
print(f"  ConvLSTM - Mean: {comparison_df['convlstm_mape'].mean():.2f}%, Median: {comparison_df['convlstm_mape'].median():.2f}%")

# The figures above average each model over the clusters it was evaluated on,
# and a cluster with a single evaluated model is "won" by that model by
# default. Repeat the comparison on the clusters where all three models have a
# MAPE so the models are compared on identical data.
mape_columns = ['xgboost_mape', 'var_mape', 'convlstm_mape']
common_df = comparison_df.dropna(subset=mape_columns)
print(f"\nLike-for-like comparison ({len(common_df)} clusters evaluated by all three models):")
if len(common_df) > 0:
    for column in mape_columns:
        print(f"  {column}: Mean {common_df[column].mean():.2f}%, Median {common_df[column].median():.2f}%")
    print(f"  Best model counts: {common_df['best_model'].value_counts().to_dict()}")
else:
    print("  No cluster was evaluated by all three models")

print(f"\nBest Model by Demand Category:")
for category in ['High', 'Medium', 'Low']:
    subset = comparison_df[comparison_df['demand_category'] == category]
    if len(subset) > 0:
        best_counts = subset['best_model'].value_counts()
        print(f"  {category}: {dict(best_counts)}")

print(f"\n✓ Comparison saved to model_comparison_all_clusters.csv")

## CELL 7: Summary Tables & Insights

In [None]:
print("\n" + "="*80)
print("FINAL SUMMARY & INSIGHTS FOR THESIS")
print("="*80)

# How often each model has the lowest MAPE for a cluster
xgb_best = (comparison_df['best_model'] == 'XGBoost').sum()
var_best = (comparison_df['best_model'] == 'VAR').sum()
lstm_best = (comparison_df['best_model'] == 'ConvLSTM').sum()

# Overall statistics
# Clusters_Trained is taken from the metrics dictionaries rather than
# hard-coded, so the table stays correct if the cluster selection changes.
summary_stats = {
    'Model': ['XGBoost', 'VAR (High-Demand)', 'ConvLSTM (High-Demand)'],
    'Clusters_Trained': [len(xgb_metrics), len(var_metrics), len(convlstm_metrics)],
    'Mean_MAPE': [
        comparison_df['xgboost_mape'].mean(),
        comparison_df['var_mape'].mean(),
        comparison_df['convlstm_mape'].mean()
    ],
    'Median_MAPE': [
        comparison_df['xgboost_mape'].median(),
        comparison_df['var_mape'].median(),
        comparison_df['convlstm_mape'].median()
    ],
    'Std_MAPE': [
        comparison_df['xgboost_mape'].std(),
        comparison_df['var_mape'].std(),
        comparison_df['convlstm_mape'].std()
    ],
    'Best_for_Count': [xgb_best, var_best, lstm_best]
}

summary_df = pd.DataFrame(summary_stats)
summary_df.to_csv(os.path.join(OUTPUT_PATH, 'model_performance_summary.csv'), index=False)

print("\n" + summary_df.to_string(index=False))

# Insights
print(f"\n" + "="*80)
print("KEY INSIGHTS FOR THESIS CONCLUSIONS")
print("="*80)

print(f"\n1. OVERALL PERFORMANCE:")
print(f"   XGBoost wins for {xgb_best} clusters")
print(f"   VAR wins for {var_best} clusters (captures spillovers)")
print(f"   ConvLSTM wins for {lstm_best} clusters (spatiotemporal)")

print(f"\n2. MODEL SPECIALIZATION:")
high_dem = comparison_df[comparison_df['demand_category'] == 'High']
med_dem = comparison_df[comparison_df['demand_category'] == 'Medium']
low_dem = comparison_df[comparison_df['demand_category'] == 'Low']

if len(high_dem) > 0:
    print(f"   High-demand zones: {high_dem['best_model'].value_counts().to_dict()}")
if len(med_dem) > 0:
    print(f"   Medium-demand zones: {med_dem['best_model'].value_counts().to_dict()}")
if len(low_dem) > 0:
    print(f"   Low-demand zones: {low_dem['best_model'].value_counts().to_dict()}")

# Clusters are called sparse when more than 30% of their hours have zero demand
print(f"\n3. SPARSE VS DENSE DATA:")
sparse = comparison_df[comparison_df['sparsity_pct'] > 30]
dense = comparison_df[comparison_df['sparsity_pct'] <= 30]
if len(sparse) > 0:
    print(f"   Sparse zones (>30% zeros): {sparse['best_model'].value_counts().to_dict()}")
if len(dense) > 0:
    print(f"   Dense zones (≤30% zeros): {dense['best_model'].value_counts().to_dict()}")

print(f"\n4. PERFORMANCE RANGE:")
print(f"   XGBoost MAPE: {comparison_df['xgboost_mape'].min():.2f}% - {comparison_df['xgboost_mape'].max():.2f}%")
print(f"   VAR MAPE: {comparison_df['var_mape'].min():.2f}% - {comparison_df['var_mape'].max():.2f}%")
print(f"   ConvLSTM MAPE: {comparison_df['convlstm_mape'].min():.2f}% - {comparison_df['convlstm_mape'].max():.2f}%")

print(f"\n5. VAR ADVANTAGES:")
print(f"   ✓ Captures spatial spillovers (taxi movements between zones)")
print(f"   ✓ Granger causality reveals predictive relationships")
print(f"   ✓ Economic interpretation: which zones drive others")
print(f"   ⚠ Computational cost: limited to 20 zones (curse of dimensionality)")

print(f"\n✓ Summary saved")

## CELL 8: Visualizations

In [None]:
print("\nCreating visualizations...")

# 2x2 overview figure built from comparison_df
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Model Performance Comparison
ax = axes[0, 0]
models = ['XGBoost', 'VAR', 'ConvLSTM']
means = [
    comparison_df['xgboost_mape'].mean(),
    comparison_df['var_mape'].mean(),
    comparison_df['convlstm_mape'].mean()
]
stds = [
    comparison_df['xgboost_mape'].std(),
    comparison_df['var_mape'].std(),
    comparison_df['convlstm_mape'].std()
]
ax.bar(models, means, yerr=stds, capsize=10, color=['#3498db', '#e74c3c', '#2ecc71'])
ax.set_ylabel('Mean MAPE (%)', fontsize=12)
ax.set_title('Model Performance Comparison (Mean ± Std)', fontsize=13, fontweight='bold')
ax.grid(axis='y', alpha=0.3)

# 2. Best Model Distribution
ax = axes[0, 1]
best_counts = comparison_df['best_model'].value_counts()
colors_dict = {'XGBoost': '#3498db', 'VAR': '#e74c3c', 'ConvLSTM': '#2ecc71'}
colors = [colors_dict.get(idx, '#95a5a6') for idx in best_counts.index]
ax.bar(best_counts.index, best_counts.values, color=colors)
ax.set_ylabel('Number of Clusters', fontsize=12)
ax.set_title('Best Model Distribution (Which Model Wins)', fontsize=13, fontweight='bold')
for i, v in enumerate(best_counts.values):
    ax.text(i, v + 0.5, str(v), ha='center', fontweight='bold')
ax.grid(axis='y', alpha=0.3)

# 3. Performance by Demand Category
ax = axes[1, 0]
categories = ['High', 'Medium', 'Low']
xgb_means = []
var_means = []
lstm_means = []
for cat in categories:
    subset = comparison_df[comparison_df['demand_category'] == cat]
    # Always append one value per category (NaN for an empty category, which
    # draws no bar) so the lists keep the same length as the x positions.
    xgb_means.append(subset['xgboost_mape'].mean() if len(subset) > 0 else np.nan)
    var_means.append(subset['var_mape'].mean() if len(subset) > 0 else np.nan)
    lstm_means.append(subset['convlstm_mape'].mean() if len(subset) > 0 else np.nan)

x = np.arange(len(categories))
width = 0.25
ax.bar(x - width, xgb_means, width, label='XGBoost', color='#3498db')
ax.bar(x, var_means, width, label='VAR', color='#e74c3c')
ax.bar(x + width, lstm_means, width, label='ConvLSTM', color='#2ecc71')
ax.set_ylabel('Mean MAPE (%)', fontsize=12)
ax.set_title('Performance by Demand Category', fontsize=13, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(categories)
ax.legend()
ax.grid(axis='y', alpha=0.3)

# 4. MAPE Distribution Box Plot
ax = axes[1, 1]
data_to_plot = [
    comparison_df['xgboost_mape'].dropna(),
    comparison_df['var_mape'].dropna(),
    comparison_df['convlstm_mape'].dropna()
]
bp = ax.boxplot(data_to_plot, patch_artist=True)
# Label the boxes on the axis instead of through boxplot's `labels` argument,
# which newer matplotlib releases deprecate in favour of `tick_labels`.
ax.set_xticklabels(['XGBoost', 'VAR', 'ConvLSTM'])
colors = ['#3498db', '#e74c3c', '#2ecc71']
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)
ax.set_ylabel('MAPE (%)', fontsize=12)
ax.set_title('MAPE Distribution by Model', fontsize=13, fontweight='bold')
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_PATH, 'model_comparison_visualization.png'), dpi=300, bbox_inches='tight')
print("✓ Visualizations saved")
plt.show()

print(f"\n" + "="*80)
print(f"ALL ANALYSIS COMPLETE - XGBoost + VAR + ConvLSTM Comparison")
print(f"="*80)
print(f"\nOutput files saved to: {OUTPUT_PATH}")
print(f"\nKey files:")
print(f"  - model_comparison_all_clusters.csv (cluster-by-cluster comparison)")
print(f"  - model_performance_summary.csv (overall statistics)")
print(f"  - var_predictions.csv (VAR forecasts for top 20 zones)")
print(f"  - cluster_characteristics.csv (demand analysis)")
print(f"  - model_comparison_visualization.png (charts)")
print(f"\nThesis Insight:")
print(f"  VAR captures spatial spillovers (Granger causality) not seen by univariate models")
print(f"  Better for policy analysis: which zones drive demand elsewhere")