# Stage 10b: Time Series Modeling for Bike Demand Prediction

## Overview
Time series modeling approach with sklearn Pipeline, lag/rolling features, and temporal validation.

## Modeling Approach: Time Series
**Rationale**: Bike demand exhibits strong temporal patterns with hourly seasonality, so a time series approach is well suited to capturing these sequential dependencies.

In [None]:
# Import libraries
import sys
sys.path.append('../src')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
import warnings
# Install a global 'ignore' filter so that no warning is displayed from here on.
# NOTE(review): this hides every warning category, including ones raised by
# the libraries used below — confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

# Apply the 'seaborn-v0_8' matplotlib style sheet to the figures created below.
plt.style.use('seaborn-v0_8')
print("📈 Time Series Libraries Loaded")

In [None]:
# Read the sample CSV and give it a synthetic hourly DatetimeIndex.
# Timestamps are generated from the row index (row i -> 2024-01-01 00:00 + i hours),
# not read from the file.
data = pd.read_csv('../data/sample-data.csv')
start_time = pd.to_datetime('2024-01-01')
hour_offsets = pd.to_timedelta(data.index, unit='h')
data = data.assign(datetime=start_time + hour_offsets).set_index('datetime')

# Report the frame's shape and the first/last generated timestamp.
for line in (
    f"📊 Time Series Dataset: {data.shape}",
    f"📅 Range: {data.index.min()} to {data.index.max()}",
):
    print(line)
data.head()

## 1. Time Series Feature Engineering with sklearn Pipeline

In [None]:
# Custom transformer for lag and rolling features
from sklearn.base import BaseEstimator, TransformerMixin

class TimeSeriesFeatureTransformer(BaseEstimator, TransformerMixin):
    """Add lag, rolling-mean, calendar and differencing columns to a time-indexed frame.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    derives every new column from the frame it is given.

    Every feature derived from the target column uses only rows *before* the
    current one. The target column itself is left in the output, so callers
    that keep "every column except the target" as features do not feed the
    model a function of the value it is asked to predict.

    Parameters
    ----------
    target_col : str, default='demand'
        Name of the column to build lag / rolling / diff features from.
    lags : sequence of int, default=(1, 2, 3, 6)
        Row offsets for the ``{target_col}_lag_{k}`` columns.
    windows : sequence of int, default=(3, 6, 12)
        Window lengths (in rows) for the ``{target_col}_rolling_{w}h`` and
        ``temp_rolling_{w}h`` columns.
    """

    def __init__(self, target_col='demand', lags=(1, 2, 3, 6), windows=(3, 6, 12)):
        # Tuple defaults: a list default would be one shared object across
        # every instance created without an explicit argument.
        self.target_col = target_col
        self.lags = lags
        self.windows = windows

    def fit(self, X, y=None):
        """Do nothing and return ``self``; present for Pipeline compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns appended.

        ``X`` must contain ``target_col`` and a ``'temperature'`` column, and
        its index must expose ``.hour`` and ``.dayofweek`` (a DatetimeIndex).
        Leading rows contain NaN wherever a shift or diff has no earlier row
        to draw from; the input frame is not modified.
        """
        X_new = X.copy()
        target = X_new[self.target_col]
        # Target as of the previous row. Rolling means and the diff are built
        # on this series so the current row's target never enters a feature.
        past_target = target.shift(1)

        # Lag features
        for lag in self.lags:
            X_new[f'{self.target_col}_lag_{lag}'] = target.shift(lag)

        # Rolling features
        for window in self.windows:
            # Mean of the `window` target values preceding the current row.
            X_new[f'{self.target_col}_rolling_{window}h'] = (
                past_target.rolling(window, min_periods=1).mean()
            )
            # Temperature is treated as an input, so its window includes the current row.
            X_new[f'temp_rolling_{window}h'] = (
                X_new['temperature'].rolling(window, min_periods=1).mean()
            )

        # Temporal features; sin/cos place hour 23 next to hour 0.
        X_new['hour'] = X_new.index.hour
        X_new['hour_sin'] = np.sin(2 * np.pi * X_new['hour'] / 24)
        X_new['hour_cos'] = np.cos(2 * np.pi * X_new['hour'] / 24)
        X_new['is_weekend'] = (X_new.index.dayofweek >= 5).astype(int)

        # Differencing. 'demand_diff' is the most recent *past* change
        # (previous row minus the one before it); differencing the raw target
        # would let lag_1 + diff reconstruct the target exactly.
        X_new['demand_diff'] = past_target.diff()
        X_new['temp_diff'] = X_new['temperature'].diff()

        return X_new

# Run the transformer over the raw frame, then keep only the rows in which
# every column (original and engineered) has a value.
ts_transformer = TimeSeriesFeatureTransformer()
data_transformed = ts_transformer.transform(data)
data_clean = data_transformed.dropna(axis=0, how='any')

# Column counts before/after engineering and the shape left after dropping NaN rows.
summary_lines = [
    f"🔧 Feature Engineering Complete:",
    f"   Original: {data.shape[1]} features",
    f"   Transformed: {data_transformed.shape[1]} features",
    f"   After cleaning: {data_clean.shape}",
]
print("\n".join(summary_lines))

## 2. Time Series Train-Test Split

In [None]:
# Separate the target from the predictors: every other column is a feature.
target_col = 'demand'
y = data_clean[target_col]
X = data_clean.drop(columns=[target_col])
feature_cols = X.columns.tolist()

# Chronological 80/20 split: the first 80% of rows train, the remainder tests.
# Rows are never shuffled, so the test block comes after the training block.
split_idx = int(0.8 * len(data_clean))
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

for line in (
    f"📈 Time Series Split:",
    f"   Train: {len(X_train)} samples ({len(X_train)/len(data_clean)*100:.1f}%)",
    f"   Test: {len(X_test)} samples ({len(X_test)/len(data_clean)*100:.1f}%)",
    f"   No temporal overlap: {X_train.index.max() < X_test.index.min()}",
):
    print(line)

## 3. sklearn Pipeline with Multiple Model Variations

In [None]:
# Rank features by absolute correlation with the target, computed on the
# training rows only, and keep the twelve strongest.
signed_corr = X_train.corrwith(y_train)
correlations = signed_corr.abs().sort_values(ascending=False)
top_features = list(correlations.index[:12])

print(f"🏆 Top 12 Features Selected:")
i = 0
for feature in top_features:
    i += 1
    corr = correlations[feature]
    print(f"   {i:2d}. {feature:25s} (r={corr:.3f})")

# Restrict both splits to the selected columns, in ranking order.
X_train_sel = X_train.loc[:, top_features]
X_test_sel = X_test.loc[:, top_features]

In [None]:
# Candidate models. Every pipeline ends in a step named 'model'; the linear
# variants standardise the features first, the random forest is fitted on
# the unscaled features.
def _scaled(estimator):
    """Return a Pipeline that runs a StandardScaler step before *estimator*."""
    return Pipeline([
        ('scaler', StandardScaler()),
        ('model', estimator)
    ])


pipelines = {
    'Linear_TS': _scaled(LinearRegression()),
    'Ridge_TS': _scaled(Ridge(alpha=1.0, random_state=42)),
    'Ridge_Strong': _scaled(Ridge(alpha=10.0, random_state=42)),
    'RandomForest_TS': Pipeline([
        ('model', RandomForestRegressor(n_estimators=50, max_depth=8, random_state=42))
    ]),
}

# Fit each candidate on the training split and score it on both splits.
results = {}
print("🤖 Training Time Series Pipelines:")

for name, pipeline in pipelines.items():
    pipeline.fit(X_train_sel, y_train)

    y_train_pred = pipeline.predict(X_train_sel)
    y_test_pred = pipeline.predict(X_test_sel)

    test_mse = mean_squared_error(y_test, y_test_pred)
    results[name] = dict(
        train_r2=r2_score(y_train, y_train_pred),
        test_r2=r2_score(y_test, y_test_pred),
        test_rmse=np.sqrt(test_mse),
        test_mae=mean_absolute_error(y_test, y_test_pred),
        predictions=y_test_pred,
    )

    print(f"   {name}: R²={results[name]['test_r2']:.3f}, RMSE={results[name]['test_rmse']:.2f}")

# The winner is the candidate with the highest test-split R².
# NOTE(review): the same test split is used both to pick the model and to
# report its score, so the reported figure may be optimistic — confirm.
best_model = max(results, key=lambda k: results[k]['test_r2'])
print(f"\n🏆 Best Model: {best_model} (R²={results[best_model]['test_r2']:.3f})")

## 4. Time Series Cross-Validation

In [None]:
# TimeSeriesSplit cross-validation of the best model over the full
# (train + test) sample, restricted to the selected features.
from sklearn.base import clone

tscv = TimeSeriesSplit(n_splits=4)
# Re-join the two splits in their original chronological order.
X_full = pd.concat([X_train_sel, X_test_sel])
y_full = pd.concat([y_train, y_test])

cv_scores = []
best_pipeline = pipelines[best_model]

print(f"📊 Time Series Cross-Validation ({best_model}):")
for fold, (train_idx, test_idx) in enumerate(tscv.split(X_full), 1):
    X_cv_train = X_full.iloc[train_idx]
    X_cv_test = X_full.iloc[test_idx]
    y_cv_train = y_full.iloc[train_idx]
    y_cv_test = y_full.iloc[test_idx]

    # Fit an unfitted copy with the same hyper-parameters. Building
    # Pipeline(best_pipeline.steps) would reuse the very same estimator
    # objects, so each fold would refit pipelines[best_model] and overwrite
    # the model that was trained on the training split.
    cv_pipeline = clone(best_pipeline)
    cv_pipeline.fit(X_cv_train, y_cv_train)
    y_cv_pred = cv_pipeline.predict(X_cv_test)

    cv_score = r2_score(y_cv_test, y_cv_pred)
    cv_scores.append(cv_score)
    print(f"   Fold {fold}: R² = {cv_score:.3f}")

# Mean and standard deviation of the per-fold R² values.
print(f"\n📈 CV Results: {np.mean(cv_scores):.3f} ± {np.std(cv_scores):.3f}")

## 5. Diagnostic Plots and Analysis

In [None]:
# Diagnostics for the best model on the test split.
# Residuals are actual minus predicted.
best_predictions = results[best_model]['predictions']
residuals = y_test - best_predictions

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
# Name the four panels: top row left/right, then bottom row left/right.
(ax_forecast, ax_resid_time), (ax_resid_hist, ax_scatter) = axes

# Top-left: actual and predicted series on the test timestamps
ax_forecast.plot(y_test.index, y_test.values, label='Actual', linewidth=2)
ax_forecast.plot(y_test.index, best_predictions, label='Predicted', linewidth=2)
ax_forecast.set_title('Time Series Forecast')
ax_forecast.legend()
ax_forecast.grid(True, alpha=0.3)

# Top-right: residuals over time with a zero reference line
ax_resid_time.plot(y_test.index, residuals)
ax_resid_time.axhline(y=0, color='red', linestyle='--')
ax_resid_time.set_title('Residuals Over Time')
ax_resid_time.grid(True, alpha=0.3)

# Bottom-left: residual histogram with the mean residual marked
ax_resid_hist.hist(residuals, bins=12, alpha=0.7, edgecolor='black')
ax_resid_hist.axvline(residuals.mean(), color='red', linestyle='--')
ax_resid_hist.set_title('Residuals Distribution')

# Bottom-right: actual vs predicted with the y = x reference line
identity_span = [y_test.min(), y_test.max()]
ax_scatter.scatter(y_test, best_predictions, alpha=0.7)
ax_scatter.plot(identity_span, identity_span, 'r--')
ax_scatter.set_title('Actual vs Predicted')
ax_scatter.set_xlabel('Actual')
ax_scatter.set_ylabel('Predicted')

plt.tight_layout()
plt.show()

# Summary statistics of the residuals
for line in (
    f"📊 Residual Analysis:",
    f"   Mean: {residuals.mean():.6f}",
    f"   Std: {residuals.std():.3f}",
    f"   Skewness: {residuals.skew():.3f}",
):
    print(line)

## 6. Model Performance Summary and Risk Assessment

In [None]:
# Performance summary: one row per model, highest test R² first.
summary_rows = [
    {
        'Model': m,
        'Test_R2': stats['test_r2'],
        'Test_RMSE': stats['test_rmse'],
        'Test_MAE': stats['test_mae'],
        # Train R² minus test R², reported under the 'Overfitting' column.
        'Overfitting': stats['train_r2'] - stats['test_r2'],
    }
    for m, stats in results.items()
]
performance_df = pd.DataFrame(summary_rows).sort_values('Test_R2', ascending=False)

print("📊 Time Series Model Performance:")
print(performance_df.round(3))

# Headline metrics of the selected model
best_stats = results[best_model]
best_r2 = best_stats['test_r2']
best_rmse = best_stats['test_rmse']
best_mae = best_stats['test_mae']

for line in (
    f"\n🎯 Time Series Model Insights:",
    f"   • Explains {best_r2:.1%} of demand variance",
    f"   • Average forecast error: {best_rmse:.1f} bikes",
    f"   • Mean absolute error: {best_mae:.1f} bikes",
    f"   • Cross-validation stability: {np.std(cv_scores):.3f}",
):
    print(line)

# Risk assessment: worst-case and 95th-percentile absolute residuals, plus
# threshold-based labels (R² > 0.7, CV std < 0.1).
print(f"\n⚠️ Time Series Risk Assessment:")
max_error = residuals.abs().max()
for line in (
    f"   • Maximum forecast error: {max_error:.1f} bikes",
    f"   • 95% of errors within: ±{np.percentile(abs(residuals), 95):.1f} bikes",
    f"   • Temporal dependencies: {'Captured' if best_r2 > 0.7 else 'Partially captured'}",
    f"   • Model stability: {'High' if np.std(cv_scores) < 0.1 else 'Moderate'}",
):
    print(line)

# Business interpretation: RMSE as a share of the observed target range
demand_range = y.max() - y.min()
relative_error = best_rmse / demand_range
for line in (
    f"   • Relative error: {relative_error:.1%} of demand range",
    f"   • Suitable for: {'Real-time forecasting' if relative_error < 0.15 else 'Strategic planning'}",
):
    print(line)

## 7. Feature Importance Analysis

In [None]:
# Feature importance analysis for the selected pipeline's final 'model' step.
trained_pipeline = pipelines[best_model]
final_estimator = trained_pipeline.named_steps['model']

if hasattr(final_estimator, 'coef_'):
    # Models exposing coef_: rank features by absolute coefficient
    coefficients = pd.DataFrame({
        'feature': top_features,
        'coefficient': final_estimator.coef_
    })
    coefficients = coefficients.sort_values('coefficient', key=abs, ascending=False)

    print(f"🔍 {best_model} Feature Coefficients:")
    print(coefficients.head(8))

elif hasattr(final_estimator, 'feature_importances_'):
    # Models exposing feature_importances_: rank by importance
    importance = pd.DataFrame({
        'feature': top_features,
        'importance': final_estimator.feature_importances_
    })
    importance = importance.sort_values('importance', ascending=False)

    print(f"🌳 {best_model} Feature Importance:")
    print(importance.head(8))

# Group the selected features by the substring their name contains.
lag_features = [f for f in top_features if 'lag' in f]
rolling_features = [f for f in top_features if 'rolling' in f]
temporal_features = [f for f in top_features if 'hour' in f or 'weekend' in f]

for line in (
    f"\n📈 Time Series Feature Categories:",
    f"   • Lag features: {len(lag_features)} ({lag_features})",
    f"   • Rolling features: {len(rolling_features)}",
    f"   • Temporal features: {len(temporal_features)}",
):
    print(line)

## 8. Save Results

In [None]:
# Write the test-split predictions and the model comparison table to CSV.
import os

os.makedirs('../data/processed', exist_ok=True)

# One row per test timestamp: observed value, prediction and residual.
prediction_columns = {
    'datetime': y_test.index,
    'actual': y_test.values,
    'predicted': best_predictions,
    'residuals': residuals
}
ts_predictions = pd.DataFrame(prediction_columns)
ts_predictions.to_csv('../data/processed/time_series_predictions.csv', index=False)

# Per-model metrics table
performance_df.to_csv('../data/processed/time_series_performance.csv', index=False)

# Closing report: output locations followed by the headline numbers.
closing_lines = (
    "💾 Time Series Results Saved:",
    "   • Predictions: ../data/processed/time_series_predictions.csv",
    "   • Performance: ../data/processed/time_series_performance.csv",
    f"\n🎯 Time Series Modeling Complete!",
    f"   Best model: {best_model} (R²={best_r2:.3f})",
    f"   Forecast accuracy: {best_rmse:.1f} bikes RMSE",
    f"   Cross-validation: {np.mean(cv_scores):.3f} ± {np.std(cv_scores):.3f}",
)
for line in closing_lines:
    print(line)