# **Objectives and Hypotheses**

## **Primary Hypotheses:**
### Season-wise Hypothesis
Can season-specific models better capture trends in bike demand than a unified model?

## **Experimental Design:**
### Baseline Model:
XGBoost Regressor
### Target Variable:
Rented Bike Count
### Dataset:
Seoul Bike Sharing Demand
### Data Split:
80% training, 20% testing

### Visualization:
PCA for dimensionality reduction and cluster visualization

### Performance Metrics:
RMSE, MAE, and R²

## **Expected Outcomes:**


# **Import Libraries**

In [None]:
pip install ucimlrepo

In [None]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import xgboost as xgb
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
from scipy.spatial.distance import mahalanobis
from ucimlrepo import fetch_ucirepo
from statsmodels.stats.outliers_influence import variance_inflation_factor
from matplotlib.colors import LinearSegmentedColormap
from mpl_toolkits.mplot3d import Axes3D

# Global plot styling: whitegrid theme with dark-grey axis spines and ticks
plt.style.use('seaborn-v0_8-whitegrid')
mpl.rcParams.update({
    'axes.edgecolor': '#333333',
    'axes.linewidth': 0.8,
    'xtick.color': '#333333',
    'ytick.color': '#333333',
})


# **Import Data**

In [None]:
# Fetch Seoul Bike Sharing Demand dataset from UCI ML Repository
# (560 is the dataset id handed to ucimlrepo's fetch_ucirepo)
seoul_bike_sharing_demand = fetch_ucirepo(id=560)

In [None]:
# Data (as pandas dataframes)
# The feature/target roles here are the ones the repository assigns; the
# preprocessing section re-arranges them so 'Rented Bike Count' is the target.
X_original = seoul_bike_sharing_demand.data.features
y_original = seoul_bike_sharing_demand.data.targets

In [None]:
# Show the repository metadata followed by the per-variable description table
print("Dataset Metadata:", seoul_bike_sharing_demand.metadata, sep="\n")
print("\nVariable Information:", seoul_bike_sharing_demand.variables, sep="\n")

In [None]:
# List the column names of the features and target exactly as fetched
print("\nOriginal feature columns:", X_original.columns.tolist(), sep="\n")
print("\nOriginal target variable:", y_original.columns.tolist(), sep="\n")

# **Data Preprocessing**

In [None]:
# Make 'Rented Bike Count' the new target if it exists

if 'Rented Bike Count' in X_original.columns:
    # Double brackets keep the target as a one-column DataFrame
    y = X_original[['Rented Bike Count']]
    # Remove 'Rented Bike Count' from features
    X = X_original.drop('Rented Bike Count', axis=1)
    # Add original target to features
    X = pd.concat([X, y_original], axis=1)

else:
    # If 'Rented Bike Count' is already the target, just confirm
    print("'Rented Bike Count' is already the target variable.")
    # Copy rather than alias: later cells assign columns on X in place, and
    # without a copy those writes would land in the fetched dataset frames.
    y = y_original.copy()
    X = X_original.copy()

In [None]:
# Confirm the feature/target layout after the swap
print("\nNew feature columns:", X.columns.tolist(), sep="\n")
print("\nNew target variable:", y.columns.tolist(), sep="\n")

In [None]:
# Per-column count of missing entries in the features and in the target
print("\nMissing values in features:", X.isnull().sum(), sep="\n")
print("\nMissing values in target:", y.isnull().sum(), sep="\n")

In [None]:
# Work on copies so the exploratory plots cannot modify the modelling data
X_viz, y_viz = X.copy(), y.copy()

# Features and target side by side in one frame for the periodic analysis
seasonal_data = X_viz.assign(Rented_Bike_Count=y_viz['Rented Bike Count'])

In [None]:
# Seasonal Bike Demand Analysis
# Each row of the dataset is one hour of rentals (the next cell sums rows per
# date to obtain daily totals), so every statistic in this cell is an hourly
# figure and is labelled as such.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
fig.suptitle('Seasonal Bike Sharing Demand Analysis', fontsize=16, fontweight='bold')

# Prepare data
seasonal_data = X_viz.copy()
seasonal_data['Rented_Bike_Count'] = y_viz['Rented Bike Count']

# Define season order
season_order = ['Winter', 'Spring', 'Summer', 'Autumn']
season_colors = ['#3498db', '#2ecc71', '#f39c12', '#e74c3c']  # Blue, Green, Orange, Red
# Look colors up by season name so a missing season cannot shift the palette
color_by_season = dict(zip(season_order, season_colors))

# Hourly rental counts per season, filtered once and reused by the box plot
# and the printed summaries; seasons without any rows are left out.
rentals_by_season = {}
for season in season_order:
    season_data = seasonal_data.loc[seasonal_data['Seasons'] == season, 'Rented_Bike_Count']
    if len(season_data) > 0:
        rentals_by_season[season] = season_data

# 1. Average Hourly Demand by Season
seasonal_avg = seasonal_data.groupby('Seasons')['Rented_Bike_Count'].mean().reindex(season_order)

bars = ax1.bar(seasonal_avg.index, seasonal_avg.values,
               color=season_colors, alpha=0.8, edgecolor='#333333', linewidth=1.2)
ax1.set_xlabel('Season', fontweight='bold')
ax1.set_ylabel('Average Hourly Bike Rentals', fontweight='bold')
ax1.set_title('Average Hourly Demand by Season', fontweight='bold')
ax1.grid(True, alpha=0.3, axis='y')

# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    # reindex() yields NaN for a season that has no rows; int(NaN) would raise
    if np.isnan(height):
        continue
    ax1.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
             f'{int(height)}', ha='center', va='bottom', fontweight='bold')

# 2. Box plot showing distribution of demand by season
season_labels = list(rentals_by_season)
seasonal_data_ordered = list(rentals_by_season.values())

box_plot = ax2.boxplot(seasonal_data_ordered, tick_labels=season_labels, patch_artist=True)
for patch, season in zip(box_plot['boxes'], season_labels):
    patch.set_facecolor(color_by_season[season])
    patch.set_alpha(0.7)

ax2.set_xlabel('Season', fontweight='bold')
ax2.set_ylabel('Hourly Bike Rentals', fontweight='bold')
ax2.set_title('Distribution of Hourly Demand by Season', fontweight='bold')
ax2.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

# Print seasonal summary statistics
print("Seasonal Bike Sharing Analysis")

for season, season_data in rentals_by_season.items():
    print(f"\n{season}:")
    print(f"  Average hourly rentals: {season_data.mean():.1f}")
    print(f"  Peak hourly demand: {season_data.max():,}")
    print(f"  Lowest hourly demand: {season_data.min():,}")
    print(f"  Standard deviation: {season_data.std():.1f}")
    print(f"  Hours of data: {len(season_data):,}")

# Compare seasons
print("\nSeasonal Comparisons:")

# Find best and worst seasons
season_averages = {season: rentals.mean() for season, rentals in rentals_by_season.items()}

if season_averages:
    best_season = max(season_averages, key=season_averages.get)
    worst_season = min(season_averages, key=season_averages.get)

    print(f"Highest demand season: {best_season} ({season_averages[best_season]:.1f} avg hourly)")
    print(f"Lowest demand season: {worst_season} ({season_averages[worst_season]:.1f} avg hourly)")

    # Calculate difference
    seasonal_range = season_averages[best_season] - season_averages[worst_season]
    seasonal_range_pct = (seasonal_range / season_averages[worst_season]) * 100

    print(f"Seasonal variation: {seasonal_range:.1f} rentals ({seasonal_range_pct:.1f}% increase from lowest to highest)")

In [None]:
# Daily Bike Demand Over the Year
fig, ax = plt.subplots(figsize=(15, 6))

# Prepare data
daily_data = X_viz.copy()
daily_data['Rented_Bike_Count'] = y_viz['Rented Bike Count']

# Convert Date to datetime if needed. Test for "not yet datetime" rather than
# for the object dtype: text columns can also carry a dedicated string dtype,
# and unparsed DD/MM/YYYY text would group and sort lexicographically.
if not pd.api.types.is_datetime64_any_dtype(daily_data['Date']):
    daily_data['Date'] = pd.to_datetime(daily_data['Date'], format='%d/%m/%Y')

# Group by date and sum hourly data to get daily totals
daily_totals = daily_data.groupby('Date')['Rented_Bike_Count'].sum().reset_index()

# Plot daily demand
ax.plot(daily_totals['Date'], daily_totals['Rented_Bike_Count'],
        color='#1f77b4', linewidth=1.5, alpha=0.8)

# Add a 7-day rolling average to smooth the trend
# (centered window, so the first and last three days have no average)
if len(daily_totals) > 7:
    rolling_avg = daily_totals['Rented_Bike_Count'].rolling(window=7, center=True).mean()
    ax.plot(daily_totals['Date'], rolling_avg,
            color='#ff7f0e', linewidth=2.5, label='7-Day Moving Average')
    ax.legend()

ax.set_xlabel('Date', fontweight='bold', fontsize=12)
ax.set_ylabel('Daily Bike Rentals', fontweight='bold', fontsize=12)
ax.set_title('Daily Bike Demand Over the Year', fontweight='bold', fontsize=16)
ax.grid(True, alpha=0.3)

# Rotate the date tick labels so they do not overlap
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()

## Cyclical Encoding
For ML processes we often want to encode data for more efficient processing. In this case, however, we need to choose an encoding that preserves the periodic patterns that will help us make our demand predictions. We can use a technique called Cyclical Encoding for the features in the dataset that are periodic in nature. This will help us better capture seasonal, monthly, weekly and diurnal trends for our demand prediction.

The technique is borrowed from harmonic analysis and signal processing, where we place our periodic data on a unit circle rather than on a linear scale. This helps us preserve natural relationships, for example between Sunday and Monday, or between December and January, which are now neighbors rather than distant points.

This also helps smooth artificial jumps between time periods, for example between the first day of January and the last day of December, or between the end of Spring and the start of Summer. The elegance here is that Euclidean distance in sin/cos space tracks circular distance: points that are close together on the cycle are also close together in the encoded space.

In [None]:
# Illustrate cyclical encoding for months: linear axis vs. unit circle
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
fig.suptitle('Cyclical Encoding Example: From Linear to Circular Representation', fontsize=16, fontweight='bold')
ax_linear, ax_circle, ax_notes = axes

# Months of the year
months = np.arange(1, 13)  # Jan=1 to Dec=12
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Panel 1: months on a straight line, where Jan and Dec sit at opposite ends
ax_linear.scatter(months, [0]*12, c=range(12), cmap='viridis', s=100)
for month_number, name in zip(months, month_names):
    ax_linear.annotate(name, (month_number, 0), xytext=(0, 20),
                       textcoords='offset points', ha='center')
ax_linear.set_xlabel('Month (Linear)', fontweight='bold')
ax_linear.set_title('Jan and Dec plotted linearly', fontweight='bold')
ax_linear.set_ylim(-0.5, 1)
ax_linear.grid(True, alpha=0.3)

# Cyclical representation using sin/cos
month_angles = 2 * np.pi * months / 12
month_sin = np.sin(month_angles)
month_cos = np.cos(month_angles)

# Panel 2: the same months placed on the unit circle
circle = plt.Circle((0, 0), 1, fill=False, color='gray', linestyle='--', alpha=0.5)
ax_circle.add_patch(circle)
ax_circle.scatter(month_cos, month_sin, c=range(12), cmap='viridis', s=100)
for name, x_pos, y_pos in zip(month_names, month_cos, month_sin):
    ax_circle.annotate(name, (x_pos, y_pos), xytext=(5, 5),
                       textcoords='offset points', fontsize=8)
ax_circle.set_xlabel('Month Cosine', fontweight='bold')
ax_circle.set_ylabel('Month Sine', fontweight='bold')
ax_circle.set_title('Jan and Dec are neighbors on the unit circle', fontweight='bold')
ax_circle.set_aspect('equal')
ax_circle.grid(True, alpha=0.3)

# Panel 3: text-only summary of the formulas; one entry per line as
# (vertical position in axes coordinates, text, text styling)
notes = [
    (0.8, 'Cyclical Encoding Formulas:', {'fontsize': 14, 'fontweight': 'bold'}),
    (0.7, 'For month M (1-12):', {'fontsize': 12}),
    (0.6, 'angle = 2π × M / 12', {'fontsize': 12, 'family': 'monospace'}),
    (0.5, 'month_sin = sin(angle)', {'fontsize': 12, 'family': 'monospace'}),
    (0.4, 'month_cos = cos(angle)', {'fontsize': 12, 'family': 'monospace'}),
    (0.25, 'Benefits:', {'fontsize': 12, 'fontweight': 'bold'}),
    (0.15, '• Preserves cyclical relationships', {'fontsize': 10}),
    (0.1, '• Smooth transitions', {'fontsize': 10}),
    (0.05, '• No arbitrary ordering', {'fontsize': 10}),
]
for y_pos, note, text_style in notes:
    ax_notes.text(0.1, y_pos, note, transform=ax_notes.transAxes, **text_style)
ax_notes.set_xlim(0, 1)
ax_notes.set_ylim(0, 1)
ax_notes.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Function to convert periodic data to cyclical encoding
def create_cyclical_encoding(X_data):
    """
    Add sine/cosine encodings of the periodic features to a copy of X_data.

    Parameters
    ----------
    X_data : pandas.DataFrame
        Must contain 'Date', 'Hour' and 'Seasons'. 'Date' may already be
        datetime or hold 'DD/MM/YYYY' strings (the format used elsewhere in
        this notebook). 'Month' and 'DayOfWeek' are used when present and
        derived from 'Date' otherwise.

    Returns
    -------
    pandas.DataFrame
        Copy of X_data with 'Date' as datetime plus the added columns
        Month_sin/cos, DayOfYear, DayOfYear_sin/cos, Hour_sin/cos,
        DayOfWeek_sin/cos, Season_numeric and Season_sin/cos. Season names
        outside Winter/Spring/Summer/Autumn map to NaN.
    """
    X_cyclical = X_data.copy()

    dates = X_cyclical['Date']
    if not pd.api.types.is_datetime64_any_dtype(dates):
        # Parse day-first explicitly: generic parsing would read an ambiguous
        # value such as '01/12/2017' as 12 January instead of 1 December.
        try:
            dates = pd.to_datetime(dates, format='%d/%m/%Y')
        except (ValueError, TypeError):
            # Not DD/MM/YYYY text; fall back to pandas' own format detection
            dates = pd.to_datetime(dates)
    X_cyclical['Date'] = dates

    # Prefer caller-supplied calendar columns; otherwise read them off the date
    month = X_cyclical['Month'] if 'Month' in X_cyclical.columns else dates.dt.month
    day_of_week = (
        X_cyclical['DayOfWeek'] if 'DayOfWeek' in X_cyclical.columns
        else dates.dt.dayofweek
    )

    X_cyclical['Month_sin'] = np.sin(2 * np.pi * month / 12)
    X_cyclical['Month_cos'] = np.cos(2 * np.pi * month / 12)

    # Use the real year length so that day 366 of a leap year closes the cycle
    # instead of landing on the same angle as 1 January.
    days_in_year = np.where(dates.dt.is_leap_year, 366, 365)
    X_cyclical['DayOfYear'] = dates.dt.dayofyear
    X_cyclical['DayOfYear_sin'] = np.sin(2 * np.pi * X_cyclical['DayOfYear'] / days_in_year)
    X_cyclical['DayOfYear_cos'] = np.cos(2 * np.pi * X_cyclical['DayOfYear'] / days_in_year)

    X_cyclical['Hour_sin'] = np.sin(2 * np.pi * X_cyclical['Hour'] / 24)
    X_cyclical['Hour_cos'] = np.cos(2 * np.pi * X_cyclical['Hour'] / 24)

    X_cyclical['DayOfWeek_sin'] = np.sin(2 * np.pi * day_of_week / 7)
    X_cyclical['DayOfWeek_cos'] = np.cos(2 * np.pi * day_of_week / 7)

    # Map seasons to numbers (0-3) in seasonal order
    season_mapping = {
        'Winter': 0,    # Start of cycle
        'Spring': 1,    # 1/4 through cycle
        'Summer': 2,    # 1/2 through cycle
        'Autumn': 3     # 3/4 through cycle
    }

    # Create numeric season column
    X_cyclical['Season_numeric'] = X_cyclical['Seasons'].map(season_mapping)

    # Create seasonal cyclical encoding
    X_cyclical['Season_sin'] = np.sin(2 * np.pi * X_cyclical['Season_numeric'] / 4)
    X_cyclical['Season_cos'] = np.cos(2 * np.pi * X_cyclical['Season_numeric'] / 4)

    return X_cyclical

In [None]:
# Quick reset to avoid returning to the start of the notebook
seoul_bike_sharing_demand = fetch_ucirepo(id=560)
X_original = seoul_bike_sharing_demand.data.features
y_original = seoul_bike_sharing_demand.data.targets

# Make 'Rented Bike Count' the new target if it exists

if 'Rented Bike Count' in X_original.columns:
    # Double brackets keep the target as a one-column DataFrame
    y = X_original[['Rented Bike Count']]
    # Remove 'Rented Bike Count' from features
    X = X_original.drop('Rented Bike Count', axis=1)
    # Add original target to features
    X = pd.concat([X, y_original], axis=1)

else:
    # If 'Rented Bike Count' is already the target, just confirm
    print("'Rented Bike Count' is already the target variable.")
    # Copy rather than alias: the following cells assign columns on X in
    # place, and without a copy those writes would land in the fetched frames.
    y = y_original.copy()
    X = X_original.copy()

In [None]:
# 1. Parse the DD/MM/YYYY date column and derive calendar components from it
if 'Date' in X.columns:
    parsed_dates = pd.to_datetime(X['Date'], format='%d/%m/%Y')
    X['Date'] = parsed_dates
    # (new column name, datetime accessor attribute), in insertion order
    date_parts = (
        ('Year', 'year'),
        ('Month', 'month'),
        ('Day', 'day'),
        ('DayOfWeek', 'dayofweek'),
    )
    for column_name, accessor in date_parts:
        X[column_name] = getattr(parsed_dates.dt, accessor)

In [None]:
# 2. Add the cyclical encodings before one-hot encoding, then discard the raw
#    Date column, which has served its purpose once the encodings exist
X = create_cyclical_encoding(X).drop(columns='Date')

In [None]:
# 3. Convert categorical features to numeric using one-hot encoding
# drop_first=True leaves out the first level of each categorical, so the
# remaining dummy columns are not perfectly collinear with each other.
X = pd.get_dummies(X, drop_first=True)
# Last expression of the cell: displays summary statistics of the encoded frame
X.describe()

In [None]:
# Split data into training and testing sets (80% / 20%, fixed seed so the
# split is reproducible)
# NOTE(review): no shuffle=False is passed, so the rows are presumably shuffled
# and neither split keeps the hourly records in chronological order — confirm
# this is intended for time-ordered data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Pairwise feature correlations, computed on the training split only
X_train_df = pd.DataFrame(X_train, columns=X_train.columns)
correlation_matrix = X_train_df.corr()

# Draw the matrix as a heatmap (no per-cell annotations)
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=False, fmt='.2f', ax=heatmap_ax)
heatmap_ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Collinear columns are removed by hand rather than by a VIF threshold,
# because the cyclical sin/cos features have intentionally high VIF
definitely_remove = ["DayOfYear", "Month", "Day", "Dew point temperature"]
# errors='ignore' skips any listed column that is not present in the frame
X_train_filtered = X_train.drop(columns=definitely_remove, errors='ignore')
X_test_filtered = X_test.drop(columns=definitely_remove, errors='ignore')

In [None]:
# Standardize the filtered features
# The scaler is fitted on the training split only and then applied unchanged
# to the test split, so no test-set statistics enter the transformation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_filtered)
X_test_scaled = scaler.transform(X_test_filtered)

# **Model Development**
We break development into the following steps:
1. Create a baseline XGBoost model using default parameters
2. Use hyperparameter tuning with regularization to optimize the model
3. Implement early stopping to prevent overfitting
4. Evaluate the results and create visualizations

### Quick Reset
Before proceeding, let's create a quick way to start with fresh preprocessed data defined by our earlier scheme.

In [None]:
def preprocess_data(test_size=0.2, random_state=42):
    """
    Minimal preprocessing pipeline for bike sharing demand data.

    Downloads the dataset, makes 'Rented Bike Count' the target, adds calendar
    and cyclical features, one-hot encodes the categoricals, splits into
    train/test, drops the collinear raw columns and standardizes the rest.

    Parameters
    ----------
    test_size : float, default 0.2
        Fraction of rows held out for testing (passed to train_test_split).
    random_state : int, default 42
        Seed for the train/test split.

    Returns
    -------
    tuple
        (X_train_scaled, X_test_scaled, y_train, y_test, scaler,
        feature_names), where feature_names lists the columns of the scaled
        arrays in order and scaler is the StandardScaler fitted on the
        training split.
    """
    # Load data
    seoul_bike_sharing_demand = fetch_ucirepo(id=560)
    X_original = seoul_bike_sharing_demand.data.features
    y_original = seoul_bike_sharing_demand.data.targets

    # Handle target variable
    if 'Rented Bike Count' in X_original.columns:
        y = X_original[['Rented Bike Count']]
        X = X_original.drop('Rented Bike Count', axis=1)
        # The repository's original target is kept as an ordinary feature
        X = pd.concat([X, y_original], axis=1)
    else:
        # Copy rather than alias: columns are assigned on X in place below,
        # which would otherwise write into the fetched dataset frames.
        y = y_original.copy()
        X = X_original.copy()

    # Process dates and create cyclical features
    if 'Date' in X.columns:
        X['Date'] = pd.to_datetime(X['Date'], format='%d/%m/%Y')
        X['Year'] = X['Date'].dt.year
        X['Month'] = X['Date'].dt.month
        X['Day'] = X['Date'].dt.day
        X['DayOfWeek'] = X['Date'].dt.dayofweek

    # Create cyclical encoding and clean up
    X = create_cyclical_encoding(X)
    X = X.drop('Date', axis=1)
    X = pd.get_dummies(X, drop_first=True)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Remove collinear features (raw columns already covered by the encodings)
    definitely_remove = ["DayOfYear", "Month", "Day", "Dew point temperature"]
    actually_removed = [f for f in definitely_remove if f in X_train.columns]

    if actually_removed:
        X_train = X_train.drop(columns=actually_removed)
        X_test = X_test.drop(columns=actually_removed)

    # Standardize: fit on the training split only, reuse for the test split
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler, X_train.columns.tolist()

# Run preprocessing
X_train_scaled, X_test_scaled, y_train, y_test, scaler, feature_names = preprocess_data()
print(f"Data preprocessed: {len(feature_names)} features, {len(X_train_scaled)} training samples")

## 1. Create a Baseline Model and Check for Overfitting

In [None]:
# Baseline: XGBoost regressor with library-default hyperparameters
baseline_model = xgb.XGBRegressor(random_state=42, n_jobs=-1, verbosity=0)
baseline_model.fit(X_train_scaled, y_train.values.ravel())

# Predictions on both splits, so train and test error can be compared
baseline_train_preds = baseline_model.predict(X_train_scaled)
baseline_test_preds = baseline_model.predict(X_test_scaled)

# RMSE, R² and MAE per split, stored under '<split>_<metric>' keys
baseline_metrics = {}
for split_name, actual, predicted in (
    ('train', y_train, baseline_train_preds),
    ('test', y_test, baseline_test_preds),
):
    baseline_metrics[f'{split_name}_rmse'] = math.sqrt(mean_squared_error(actual, predicted))
    baseline_metrics[f'{split_name}_r2'] = r2_score(actual, predicted)
    baseline_metrics[f'{split_name}_mae'] = mean_absolute_error(actual, predicted)

print("BASELINE MODEL PERFORMANCE (Minimal Config):")
# (line prefix, metric key, number format) for each report line, in order
report_lines = (
    ("Training RMSE:  ", 'train_rmse', '.2f'),
    ("Training R²:    ", 'train_r2', '.4f'),
    ("Training MAE:   ", 'train_mae', '.2f'),
    ("Test RMSE:      ", 'test_rmse', '.2f'),
    ("Test R²:        ", 'test_r2', '.4f'),
    ("Test MAE:       ", 'test_mae', '.2f'),
)
for prefix, metric_key, number_format in report_lines:
    print(f"{prefix}{baseline_metrics[metric_key]:{number_format}}")

# Overfitting check: how much worse the test error is than the training error
baseline_rmse_gap = baseline_metrics['test_rmse'] - baseline_metrics['train_rmse']
baseline_mae_gap = baseline_metrics['test_mae'] - baseline_metrics['train_mae']

print("\nBaseline Model Overfitting Check:")
print(f"RMSE Gap (Test - Train): {baseline_rmse_gap:.2f}")
print(f"MAE Gap (Test - Train):  {baseline_mae_gap:.2f}")

# Severity bands on the RMSE gap: below 30, below 60, otherwise significant
overfit_severity = (
    "Minimal" if baseline_rmse_gap < 30
    else "Moderate" if baseline_rmse_gap < 60
    else "Significant"
)
print(f"{overfit_severity} overfitting detected (RMSE)")


We see that the baseline model overfits our training data, with an RMSE gap of more than 60. We leave that for now and address overfitting for both the baseline and tuned models in step three.

## 2. Perform Hyperparameter Tuning
Here we set up and conduct hyperparameter tuning to identify the optimal values for predicting bike usage. 

A table of the key XGBoost hyperparameters that we will use to control different aspects of model training, is provided:
| Parameter | Function | Impact |
|:-----------|:----------|:--------|
| `n_estimators` | Number of decision trees (rounds) | More trees = better fit |
| `learning_rate` | Step size for each tree's update | Lower = slower, more stable |
| `max_depth` | Maximum depth of each tree | Higher = more complex trees |
| `min_child_weight` | Minimum sum of instance weights required in a child node (equals the sample count for squared-error loss) | Higher = prevents overfitting |
| `reg_alpha` | L1 regularization (Lasso) | Higher = simpler model |
| `reg_lambda` | L2 regularization (Ridge) | Higher = smoother predictions |
| `subsample` | Fraction of samples per tree | Lower = more randomization |
| `colsample_bytree` | Fraction of features per tree | Lower = reduces correlation |

We can use gridsearch to evaluate different combinations of these parameters when we build our XGBoost decision trees.

In [None]:
# Define comprehensive parameter grid for tuned model with regularization focus
# Eight hyperparameters with three candidate values each give 3**8 = 6561
# combinations for an exhaustive grid search.
# NOTE(review): this grid is not referenced again in the visible part of the
# notebook (the search below uses param_grid_seasonal) — confirm it is needed.
param_grid_comprehensive = {
    # Number of boosting rounds
    'n_estimators': [100, 200, 300],
    
    # Learning rate - conservative values to prevent overfitting
    'learning_rate': [0.01, 0.05, 0.1],
    
    # Tree structure - moderate depth to capture patterns without overfitting
    'max_depth': [3, 4, 6],
    
    # Minimum child weight - higher values prevent overfitting
    'min_child_weight': [1, 3, 5],
    
    # L1 regularization - encourages sparsity
    'reg_alpha': [0, 0.1, 0.5],
    
    # L2 regularization - smooths predictions
    'reg_lambda': [1, 1.5, 2],
    
    # Subsampling - reduces overfitting through randomization
    'subsample': [0.7, 0.8, 0.9],
    
    # Feature subsampling - reduces correlation between trees
    'colsample_bytree': [0.7, 0.8, 0.9]
}

## 3. Early Stopping with Cross Validation

### First we address overfitting with the baseline model

We can adjust our baseline model by adding hyperparameters, first we try adjusting the learning rate and tree depth:
| Parameter | Function | Impact |
|:-----------|:----------|:--------|
| `n_estimators` | Number of decision trees (rounds) | More trees = better fit |
| `learning_rate` | Step size for each tree's update | Lower = slower, more stable |
| `max_depth` | Maximum depth of each tree | Higher = more complex trees |

In [None]:
# Baseline variant with explicit tree count, learning rate and depth
baseline_better_model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    n_jobs=-1,
)
baseline_better_model.fit(X_train_scaled, y_train.values.ravel())

# Predict on both splits
baseline_better_train_preds = baseline_better_model.predict(X_train_scaled)
baseline_better_test_preds = baseline_better_model.predict(X_test_scaled)

# (actual, predicted) pairs in train, test order; each metric is unpacked
# into its train and test variable in that same order
better_eval_pairs = (
    (y_train, baseline_better_train_preds),
    (y_test, baseline_better_test_preds),
)
baseline_better_train_rmse, baseline_better_test_rmse = (
    math.sqrt(mean_squared_error(actual, predicted)) for actual, predicted in better_eval_pairs
)
baseline_better_train_r2, baseline_better_test_r2 = (
    r2_score(actual, predicted) for actual, predicted in better_eval_pairs
)
baseline_better_train_mae, baseline_better_test_mae = (
    mean_absolute_error(actual, predicted) for actual, predicted in better_eval_pairs
)

# Test-minus-train gaps, used as the overfitting indicator
better_rmse_gap = baseline_better_test_rmse - baseline_better_train_rmse
better_mae_gap = baseline_better_test_mae - baseline_better_train_mae

print("BASELINE BETTER MODEL PERFORMANCE: \n(Adjusted tree depth and learning rate)\n")
print(f"Train RMSE: {baseline_better_train_rmse:.2f} | Test RMSE: {baseline_better_test_rmse:.2f}")
print(f"Train R²:   {baseline_better_train_r2:.4f} | Test R²:   {baseline_better_test_r2:.4f}")
print(f"Train MAE:  {baseline_better_train_mae:.2f} | Test MAE:  {baseline_better_test_mae:.2f}")
print(f"RMSE Overfitting Gap: {baseline_better_test_rmse:.2f} - {baseline_better_train_rmse:.2f} = {better_rmse_gap:.2f}")
print(f"MAE Overfitting Gap:  {baseline_better_test_mae:.2f} - {baseline_better_train_mae:.2f} = {better_mae_gap:.2f}")


This reduced overfitting, shrinking the RMSE gap from 91.59 to 55.37 in this new model with minimal regularization. The model still shows moderate overfitting, but it is a huge improvement. We can do better still by adding stronger regularization.
| Parameter | Function | Impact |
|:-----------|:----------|:--------|
| `n_estimators` | Number of decision trees (rounds) | More trees = better fit |
| `learning_rate` | Step size for each tree's update | Lower = slower, more stable |
| `max_depth` | Maximum depth of each tree | Higher = more complex trees |
| `min_child_weight` | Minimum sum of instance weights required in a child node (equals the sample count for squared-error loss) | Higher = prevents overfitting |
| `reg_alpha` | L1 regularization (Lasso) | Higher = simpler model |
| `reg_lambda` | L2 regularization (Ridge) | Higher = smoother predictions |
| `subsample` | Fraction of samples per tree | Lower = more randomization |
| `colsample_bytree` | Fraction of features per tree | Lower = reduces correlation |

In [None]:
# Regularized baseline: shallower trees, a slower learning rate, L1/L2
# penalties and row/column subsampling to narrow the train/test gap
baseline_best_model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.08,          # Slightly lower
    max_depth=5,                 # Reduced depth
    min_child_weight=3,          # Higher minimum
    reg_alpha=0.5,               # L1 regularization
    reg_lambda=2,                # L2 regularization
    subsample=0.8,               # Sample 80% of data
    colsample_bytree=0.8,        # Sample 80% of features
    random_state=42,
    n_jobs=-1,
)
baseline_best_model.fit(X_train_scaled, y_train.values.ravel())

# Predict on both splits
baseline_best_train_preds = baseline_best_model.predict(X_train_scaled)
baseline_best_test_preds = baseline_best_model.predict(X_test_scaled)

# (actual, predicted) pairs in train, test order; each metric is unpacked
# into its train and test variable in that same order
best_eval_pairs = (
    (y_train, baseline_best_train_preds),
    (y_test, baseline_best_test_preds),
)
baseline_best_train_rmse, baseline_best_test_rmse = (
    math.sqrt(mean_squared_error(actual, predicted)) for actual, predicted in best_eval_pairs
)
baseline_best_train_r2, baseline_best_test_r2 = (
    r2_score(actual, predicted) for actual, predicted in best_eval_pairs
)
baseline_best_train_mae, baseline_best_test_mae = (
    mean_absolute_error(actual, predicted) for actual, predicted in best_eval_pairs
)

# Test-minus-train gaps, computed once and reused in the report below
baseline_best_rmse_gap = baseline_best_test_rmse - baseline_best_train_rmse
baseline_best_mae_gap = baseline_best_test_mae - baseline_best_train_mae

print("BASELINE BEST MODEL PERFORMANCE: \n(With L1 and L2 Regularization, and adjusted learning rate and tree depth)\n")

print(f"Train RMSE: {baseline_best_train_rmse:.2f} | Test RMSE: {baseline_best_test_rmse:.2f}")
print(f"Train R²:   {baseline_best_train_r2:.4f} | Test R²:   {baseline_best_test_r2:.4f}")
print(f"Train MAE:  {baseline_best_train_mae:.2f} | Test MAE:  {baseline_best_test_mae:.2f}")
print(f"RMSE Overfitting Gap: {baseline_best_rmse_gap:.2f}")
print(f"MAE Overfitting Gap:  {baseline_best_mae_gap:.2f}")

# How much of the first baseline's RMSE gap the regularization removed
print("\nBaseline Best Model Improvement:")
print(f"Gap Reduction vs Baseline: {(baseline_rmse_gap - baseline_best_rmse_gap):.2f}")

We'll use this new baseline model, as it shows a significant improvement in generalization (34.38 is a much lower overfitting gap).

### Building a seasonally-aware model while staying vigilant about overfitting
To capture the complex seasonal interactions, we now turn to Grid Search to try to create a more seasonally-aware model. Our max depth parameter helps us capture complex seasonal interactions, while reg_alpha and reg_lambda balance feature importance so that cyclical features aren't overshadowed. We adjust our learning rate so that we can learn seasonal patterns slowly, and employ early stopping to ensure that the seasonally-aware model does not overfit.

In [None]:
# Anti-overfitting parameter grid specifically designed for seasonal patterns
param_grid_seasonal = {
    # Conservative number of trees
    'n_estimators': [100, 150, 200],
    
    # Lower learning rates for stable seasonal learning
    'learning_rate': [0.01, 0.05, 0.08],
    
    # Moderate tree depths to capture seasonal interactions without overfitting
    'max_depth': [4, 5, 6],
    
    # Higher minimum child weight for generalization
    'min_child_weight': [3, 5, 7],
    
    # Strong L1 regularization for feature selection (helps cyclical features shine)
    'reg_alpha': [0.5, 1.0, 2.0],
    
    # Strong L2 regularization for smooth predictions
    'reg_lambda': [2, 3, 4],
    
    # Aggressive subsampling for randomization (prevents seasonal overfitting)
    'subsample': [0.6, 0.7, 0.8],
    'colsample_bytree': [0.6, 0.7, 0.8]
}

print(f"Parameter combinations: {np.prod([len(v) for v in param_grid_seasonal.values()])}")
print("Focus: Optimize cyclical feature usage while preventing overfitting")

# Time Series Cross-Validation (preserves temporal relationships)
# TimeSeriesSplit trains each fold on earlier rows and validates on later
# ones, so the rows must be in time order. The training rows come out of a
# random train_test_split, so they are re-sorted by their original row index
# first; y_train still carries that index.
# NOTE(review): assumes the fetched rows are stored chronologically, so the
# original row index is a valid proxy for time — confirm.
chronological_order = np.argsort(y_train.index.to_numpy(), kind='stable')
X_train_chrono = np.asarray(X_train_scaled)[chronological_order]
y_train_chrono = y_train.values.ravel()[chronological_order]

tscv = TimeSeriesSplit(n_splits=5, test_size=len(X_train_chrono)//8)
print("Using TimeSeriesSplit to respect temporal order in bike demand data")

# XGBoost configured for seasonal data
# NOTE(review): the estimator asks for all cores (n_jobs=-1) while
# GridSearchCV also runs 7 parallel workers; this may oversubscribe the CPU.
xgb_seasonal = xgb.XGBRegressor(
    random_state=42,
    n_jobs=-1,
    verbosity=0,
    objective='reg:squarederror',
    eval_metric='rmse'
)

# Grid Search with seasonal focus
grid_search_seasonal = GridSearchCV(
    estimator=xgb_seasonal,
    param_grid=param_grid_seasonal,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=7,   # number of cores to use (adjust for your system)
    verbose=1,  # 0-suppress progress, 1-show minimal progress, 2-show progress with timing, 3- detailed progress
    return_train_score=True
)

print("Starting seasonal-aware grid search...")
print("This will find optimal parameters for cyclical feature utilization, and will take some time to finish.")
print("Adjusting n_jobs can speed up the process based on the number of cores available.")

# Fit the grid search on the chronologically ordered training rows
grid_search_seasonal.fit(X_train_chrono, y_train_chrono)

print("Seasonal grid search completed!")

# Extract best results
best_params_seasonal = grid_search_seasonal.best_params_
best_cv_score = grid_search_seasonal.best_score_

print("\nBEST SEASONAL PARAMETERS FROM GRID SEARCH:")
print("-" * 45)
for param, value in best_params_seasonal.items():
    print(f"{param:18s}: {value}")

print("\nGrid Search Results:")
print(f"Best CV Score (Neg MSE): {best_cv_score:.2f}")
# The score is a negated MSE, so flip the sign before taking the root
print(f"Best CV RMSE: {math.sqrt(-best_cv_score):.2f}")

print("\nThese parameters will be used as starting point for manual tuning...")

### Manual Tuning and Training of the Seasonal Model
We used these grid search results as a starting point and manually fine-tuned the regressor's hyperparameters to arrive at a well-tuned seasonal model. This involved iterating between this code block and the following one to evaluate each adjustment.

In [None]:
print("TRAINING SEASONAL MODEL WITH MANUAL TUNING + EARLY STOPPING")

# Hold out 15% of the training data as the validation set that early
# stopping monitors
X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(
    X_train_scaled,
    y_train.values.ravel(),
    test_size=0.15,
    random_state=42,
)

print(f"Training split: {X_train_final.shape}")
print(f"Validation split: {X_val_final.shape}")

print(
    "\nManual tuning notes:",
    "Starting from grid search results, then fine-tuning based on:",
    "- Validation performance",
    "- Overfitting reduction",
    "- Cyclical feature importance",
    sep="\n",
)

# Manually tuned hyperparameters (grid search results were the starting
# point). This one dict feeds both the model and the printed comparison.
seasonal_params = {
    'n_estimators': 280,            # More trees = higher overfitting risk (was 320, tried 250, 200, 150)
    'learning_rate': 0.045,         # Moderate (was 0.05, tried 0.08, 0.03)
    'max_depth': 6,                 # Controls amount of complexity for cyclical features (was 5, tried 4)
    'min_child_weight': 3,          # Controls amount of complexity for cyclical features (was 2.5, tried 7)
    'reg_alpha': 1.0,               # Controls overfitting (was 0.9, tried 0.7, 0.5, 2.5)
    'reg_lambda': 3.2,              # Controls overfitting (was 3.2, tried 2.0, 4.0)
    'subsample': 0.82,              # Less aggressive (was 0.85, tried 0.8, 0.7, 0.6)
    'colsample_bytree': 0.8,        # Helps cyclical features (was 0.85, tried 0.7, 0.6)
    'early_stopping_rounds': 18,    # More patient (was 20, tried 25, 15, 8)
}

seasonal_model = xgb.XGBRegressor(random_state=42, n_jobs=-1, **seasonal_params)

# Show each tuned value next to what the grid search selected
print("\nFinal tuned parameters:")
for param_name, tuned_value in seasonal_params.items():
    searched_value = best_params_seasonal.get(param_name, 'N/A')
    print(f"{param_name:20s}: {tuned_value} (grid search: {searched_value})")

# Train; the validation set is passed as eval_set for early stopping
print("\nTraining seasonal model with early stopping...")
seasonal_model.fit(
    X_train_final,
    y_train_final,
    eval_set=[(X_val_final, y_val_final)],
    verbose=False,
)

print("Seasonal model training completed!")

# Check early stopping results
if hasattr(seasonal_model, 'best_iteration') and seasonal_model.best_iteration:
    print(f"Early stopping triggered at iteration: {seasonal_model.best_iteration}")
    trees_saved = 280 - seasonal_model.best_iteration
    print(f"Trees saved from overfitting: {trees_saved}")
    print(f"Effective learning: Used {seasonal_model.best_iteration}/{280} trees ({(seasonal_model.best_iteration/280)*100:.1f}%)")
else:
    print(f"Used all 280 trees (early stopping didn't trigger)")
    print("Model may benefit from further regularization")

### Seasonal Model Evaluation

In [None]:
# Make predictions on full train/test sets.
# NOTE(review): X_train_scaled appears to include the rows held out for early
# stopping validation, so the "training" metrics cover those rows too - confirm.
seasonal_train_preds = seasonal_model.predict(X_train_scaled)
seasonal_test_preds = seasonal_model.predict(X_test_scaled)

# Calculate comprehensive metrics
seasonal_train_rmse = math.sqrt(mean_squared_error(y_train, seasonal_train_preds))
seasonal_test_rmse = math.sqrt(mean_squared_error(y_test, seasonal_test_preds))
seasonal_train_r2 = r2_score(y_train, seasonal_train_preds)
seasonal_test_r2 = r2_score(y_test, seasonal_test_preds)
seasonal_train_mae = mean_absolute_error(y_train, seasonal_train_preds)
seasonal_test_mae = mean_absolute_error(y_test, seasonal_test_preds)

print("\nSEASONAL MODEL PERFORMANCE:")
print(f"Training RMSE:  {seasonal_train_rmse:.2f}")
print(f"Training R²:    {seasonal_train_r2:.4f}")
print(f"Training MAE:   {seasonal_train_mae:.2f}")
print(f"Test RMSE:      {seasonal_test_rmse:.2f}")
print(f"Test R²:        {seasonal_test_r2:.4f}")
print(f"Test MAE:       {seasonal_test_mae:.2f}")

# Overfitting analysis with MAE and RMSE: a positive gap means the model does
# worse on the test set than on the training set.
seasonal_rmse_gap = seasonal_test_rmse - seasonal_train_rmse
seasonal_mae_gap = seasonal_test_mae - seasonal_train_mae

print("\nOVERFITTING ANALYSIS:")
print(f"RMSE Gap (Test - Train): {seasonal_rmse_gap:.2f}")
print(f"MAE Gap (Test - Train):  {seasonal_mae_gap:.2f}")

# RMSE gap assessment
if seasonal_rmse_gap < 30:
    rmse_status = "Excellent generalization achieved! (RMSE)"
elif seasonal_rmse_gap < 50:
    rmse_status = "Good generalization achieved! (RMSE)"
else:
    rmse_status = "Some overfitting remains - but model still usable (RMSE)"

# MAE gap assessment
if seasonal_mae_gap < 20:
    mae_status = "Excellent generalization achieved! (MAE)"
elif seasonal_mae_gap < 35:
    mae_status = "Good generalization achieved! (MAE)"
else:
    mae_status = "Some overfitting remains - but model still usable (MAE)"

print("\nGeneralization Assessment:")
print(f"RMSE: {rmse_status}")
print(f"MAE:  {mae_status}")

# Feature importance analysis focusing on cyclical features
print("\nCYCLICAL FEATURE IMPORTANCE ANALYSIS:")

feature_importance = pd.DataFrame({
    'feature': feature_names,
    'importance': seasonal_model.feature_importances_
}).sort_values('importance', ascending=False)

# Cyclical features are recognised by their '_sin' / '_cos' name suffixes.
all_cyclical = feature_importance[feature_importance['feature'].str.contains('_sin|_cos')]
# Only the eight most important ones are listed.
cyclical_features = all_cyclical.head(8)

# Default to 0 so the readiness summary below never hits an undefined name
# when no cyclical feature is present.
cyclical_percentage = 0.0

if not cyclical_features.empty:
    print("Top cyclical features in seasonal model:")
    for i, (_, row) in enumerate(cyclical_features.iterrows(), 1):
        print(f"  {i}. {row['feature']:<20s}: {row['importance']:.4f}")

    # The share is taken over ALL cyclical features, not just the listed top
    # eight, so the statement printed below is accurate.
    cyclical_importance_sum = all_cyclical['importance'].sum()
    total_importance = feature_importance['importance'].sum()
    if total_importance > 0:
        cyclical_percentage = (cyclical_importance_sum / total_importance) * 100

    print(f"\nCyclical features account for {cyclical_percentage:.1f}% of total importance")

    if cyclical_percentage > 15:
        print("Seasonal patterns successfully captured!")
    elif cyclical_percentage > 10:
        print("Moderate seasonal pattern usage")
    else:
        print("Low cyclical feature importance - check feature engineering")
else:
    print("No cyclical features found in top importance - check feature engineering")

# Trees not used thanks to early stopping. The configured tree count is read
# from the model instead of being hard-coded; best_iteration is a 0-based
# index, so the number of trees used is index + 1.
configured_trees = seasonal_model.get_params()['n_estimators']
best_iteration = getattr(seasonal_model, 'best_iteration', None)
if best_iteration is None:
    trees_saved_by_early_stopping = 0
else:
    trees_saved_by_early_stopping = max(configured_trees - (best_iteration + 1), 0)

# Model readiness assessment
print("\nMODEL READINESS ASSESSMENT:")
print(f"Seasonal model trained with {len(feature_names)} features")
print("Grid search + manual tuning completed")
print(f"Early stopping applied (saved {trees_saved_by_early_stopping} trees)")
print(f"Overfitting controlled: RMSE gap = {seasonal_rmse_gap:.2f}, MAE gap = {seasonal_mae_gap:.2f}")
print(f"Cyclical features utilized: {cyclical_percentage:.1f}% importance")

# Store predictions for comparison charts
print("\nPrediction arrays available:")
print(f"- seasonal_train_preds: {len(seasonal_train_preds)} training predictions")
print(f"- seasonal_test_preds: {len(seasonal_test_preds)} test predictions")

## 4. Evaluation and Visualizations

In [None]:
# Monthly X-axis with Seasonal Lines and Background Colors
def create_seasonal_plot(train_preds=None, test_preds=None):
    """Plot weekly mean actual vs predicted demand over shaded season bands.

    Training and test targets/predictions are concatenated (train first) and
    stamped positionally with hourly timestamps starting 2017-12-01, then
    averaged per week (weeks ending on Thursday).

    Args:
        train_preds: Predictions for the training rows. Defaults to the
            seasonal model's ``seasonal_train_preds``.
        test_preds: Predictions for the test rows. Defaults to the seasonal
            model's ``seasonal_test_preds``.

    NOTE(review): the positional timestamps assume train rows followed by test
    rows are in chronological order - confirm the split was not shuffled.
    """
    from matplotlib.dates import DateFormatter, MonthLocator

    # Default to the seasonal model, matching the other charts in this
    # section (this function previously read `balanced_*_preds`).
    if train_preds is None:
        train_preds = seasonal_train_preds
    if test_preds is None:
        test_preds = seasonal_test_preds

    # Prepare combined data
    y_actual_all = np.concatenate([y_train.values.ravel(), y_test.values.ravel()])
    y_pred_all = np.concatenate([train_preds, test_preds])

    # Create date range: hourly stamps starting Dec 1, 2017, one per row
    start_date = pd.Timestamp('2017-12-01')
    dates = pd.date_range(start=start_date, periods=len(y_actual_all), freq='h')

    df_combined = pd.DataFrame({
        'Date': dates,
        'Actual': y_actual_all,
        'Predicted': y_pred_all
    })

    # Aggregate to weekly means ('W-THU' ends each week on Thursday)
    df_weekly = df_combined.set_index('Date').resample('W-THU').mean().reset_index()

    # Create the plot
    fig, ax = plt.subplots(figsize=(16, 8))

    # Define season colors
    season_colors = {'Spring': '#90EE90', 'Summer': '#FFD700', 'Fall': '#FFA500', 'Winter': '#87CEEB'}

    # Season bands are clipped to the range covered by the weekly series
    data_start = df_weekly['Date'].min()
    data_end = df_weekly['Date'].max()

    print(f"Actual data range: {data_start.strftime('%m/%d/%Y')} to {data_end.strftime('%m/%d/%Y')}")

    season_patches = {}

    # Seasonal boundaries for Dec 2017 - Nov 2018 as (start, end, name).
    # Each band ends exactly where the next one starts (midnight of the next
    # season's first day) so no unshaded one-day gap is left between bands.
    seasons = [
        # Fall 2017: Dec 1, 2017 - Dec 21, 2017 (data starts in fall)
        (pd.Timestamp('2017-12-01'), pd.Timestamp('2017-12-22'), 'Fall'),
        # Winter 2017-2018: Dec 22, 2017 - Mar 20, 2018
        (pd.Timestamp('2017-12-22'), pd.Timestamp('2018-03-21'), 'Winter'),
        # Spring 2018: Mar 21, 2018 - Jun 21, 2018
        (pd.Timestamp('2018-03-21'), pd.Timestamp('2018-06-22'), 'Spring'),
        # Summer 2018: Jun 22, 2018 - Sep 22, 2018
        (pd.Timestamp('2018-06-22'), pd.Timestamp('2018-09-23'), 'Summer'),
        # Fall 2018: Sep 23, 2018 - Nov 30, 2018
        (pd.Timestamp('2018-09-23'), pd.Timestamp('2018-12-01'), 'Fall')
    ]

    for season_start, season_end, season_name in seasons:
        # Clip to actual data range
        plot_start = max(season_start, data_start)
        plot_end = min(season_end, data_end)

        if plot_start < plot_end:
            patch = ax.axvspan(plot_start, plot_end, alpha=0.25,
                               color=season_colors[season_name], zorder=0)
            # Fall occurs twice; keep only the first patch for the legend
            season_patches.setdefault(season_name, patch)

    # Plot actual and predicted lines
    ax.plot(df_weekly['Date'], df_weekly['Actual'],
            color='#1f77b4', linewidth=2, alpha=0.8, label='Actual', zorder=2)
    ax.plot(df_weekly['Date'], df_weekly['Predicted'],
            color='#ff4444', linewidth=2, alpha=0.8, label='Predictions', zorder=2)

    # Add train/test split line at the timestamp of the first test row
    # (exact, rather than approximated from the train/total ratio in weeks).
    train_size = len(y_train)
    if train_size < len(dates):
        split_date = dates[train_size]
        ax.axvline(x=split_date, color='green', linestyle='--', alpha=0.7, linewidth=2,
                   label=f'Train/Test Split ({split_date.strftime("%m/%d/%Y")})', zorder=1)

    # Formatting
    ax.set_xlabel('Date', fontweight='bold', fontsize=12)
    ax.set_ylabel('Weekly Average Bikes Shared', fontweight='bold', fontsize=12)
    ax.set_title('Weekly Bike Demand: Actual vs Predicted with Seasonal Backgrounds', fontweight='bold', fontsize=14)

    # Create legend: line entries first, then season patches in chronological
    # order (starting with Fall since data begins Dec 1)
    handles, labels = ax.get_legend_handles_labels()
    for season in ['Fall', 'Winter', 'Spring', 'Summer']:
        if season in season_patches:
            handles.append(season_patches[season])
            labels.append(f'{season} Season')

    ax.legend(handles, labels, loc='upper right', frameon=True, fancybox=True, shadow=True, fontsize=11)
    ax.grid(True, alpha=0.3)

    # One x tick per month, labelled in MM/DD/YYYY format
    ax.xaxis.set_major_locator(MonthLocator())
    ax.xaxis.set_major_formatter(DateFormatter('%m/%d/%Y'))
    ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    print("Weekly Time Series Statistics:")
    print(f"Total weeks: {len(df_weekly)}")

# Call the function to create the plot
create_seasonal_plot()

### Visualize model performance by season

In [None]:
def create_seasonal_bar_comparison():
    """Create actual vs predicted bar chart by season using seasonal model.

    Training and test rows are concatenated (train first), stamped positionally
    with hourly timestamps starting 2017-12-01, labelled with a season, and
    averaged per season.

    Returns:
        DataFrame indexed by season (Fall, Winter, Spring, Summer) with the
        mean 'Actual' and 'Predicted' values rounded to one decimal.

    NOTE(review): the positional timestamps assume train rows followed by test
    rows are in chronological order - confirm the split was not shuffled.
    """
    # Prepare combined data using seasonal model predictions
    y_actual_all = np.concatenate([y_train.values.ravel(), y_test.values.ravel()])
    y_pred_all = np.concatenate([seasonal_train_preds, seasonal_test_preds])

    # Hourly timestamps starting Dec 1, 2017, one per row
    start_date = pd.Timestamp('2017-12-01')
    dates = pd.date_range(start=start_date, periods=len(y_actual_all), freq='h')

    df_combined = pd.DataFrame({
        'Date': dates,
        'Actual': y_actual_all,
        'Predicted': y_pred_all
    })

    def get_season(date):
        """Map a timestamp to its season name."""
        # Month-day encoded as MMDD so each boundary is a single comparison:
        # Winter Dec 22 - Mar 20, Spring Mar 21 - Jun 21, Summer Jun 22 - Sep 22.
        month_day = date.month * 100 + date.day
        if month_day >= 1222 or month_day <= 320:
            return 'Winter'
        if month_day <= 621:
            return 'Spring'
        if month_day <= 922:
            return 'Summer'
        return 'Fall'

    df_combined['Season'] = df_combined['Date'].apply(get_season)

    # Group by season and calculate means
    seasonal_summary = df_combined.groupby('Season')[['Actual', 'Predicted']].mean().round(1)

    # Reorder seasons chronologically (starting with Fall since data begins Dec 1)
    season_order = ['Fall', 'Winter', 'Spring', 'Summer']
    seasonal_summary = seasonal_summary.reindex(season_order)

    # Create bar plot with room for legend
    fig, ax = plt.subplots(figsize=(12, 8))

    x = np.arange(len(seasonal_summary))
    width = 0.35

    bars1 = ax.bar(x - width/2, seasonal_summary['Actual'], width,
                   label='Actual', color='#1f77b4', alpha=0.8)
    bars2 = ax.bar(x + width/2, seasonal_summary['Predicted'], width,
                   label='Predicted', color='#ff4444', alpha=0.8)

    # Add value labels on top of every bar (actual and predicted alike)
    for bar in (*bars1, *bars2):
        height = bar.get_height()
        ax.annotate(f'{height:.0f}',
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom', fontweight='bold')

    ax.set_xlabel('Season', fontweight='bold', fontsize=12)
    ax.set_ylabel('Average Bikes Shared', fontweight='bold', fontsize=12)
    ax.set_title('Seasonal Model: Bike Demand by Season (Actual vs Predicted)', fontweight='bold', fontsize=14)
    ax.set_xticks(x)
    ax.set_xticklabels(seasonal_summary.index)

    # Keep at least the original 0-800 range (room for the legend), but grow
    # the axis when a bar is taller so bars and labels are never clipped.
    tallest_bar = float(np.nanmax(seasonal_summary.to_numpy()))
    ax.set_ylim(0, max(800, tallest_bar * 1.15))

    # Place legend in upper right
    ax.legend(loc='upper right', fontsize=12, frameon=True, fancybox=True, shadow=True)
    ax.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()
    plt.show()

    # Print summary statistics
    print("Seasonal Model Performance Summary:")
    print("=" * 45)
    print(seasonal_summary)

    # Signed error per season: positive means the model over-predicts
    seasonal_errors = seasonal_summary['Predicted'] - seasonal_summary['Actual']
    print("\nSeasonal Prediction Errors:")
    for season, error in seasonal_errors.items():
        print(f"{season:8s}: {error:+6.1f} bikes")

    return seasonal_summary

seasonal_ = create_seasonal_bar_comparison()

### Visualize model performance by month

In [None]:
def create_monthly_bar_comparison():
    """Create actual vs predicted bar chart by month using seasonal model.

    Training and test rows are concatenated (train first), stamped positionally
    with hourly timestamps starting 2017-12-01, and averaged per calendar
    month. Months are shown December first, then January through November.

    Returns:
        DataFrame with one row per month and the columns 'Month', 'MonthName',
        'Actual', 'Predicted', 'Order' and 'Error' (Predicted - Actual).

    NOTE(review): the positional timestamps assume train rows followed by test
    rows are in chronological order - confirm the split was not shuffled.
    """
    # Prepare combined data using seasonal model predictions
    y_actual_all = np.concatenate([y_train.values.ravel(), y_test.values.ravel()])
    y_pred_all = np.concatenate([seasonal_train_preds, seasonal_test_preds])

    # Hourly timestamps starting Dec 1, 2017, one per row
    start_date = pd.Timestamp('2017-12-01')
    dates = pd.date_range(start=start_date, periods=len(y_actual_all), freq='h')

    df_combined = pd.DataFrame({
        'Date': dates,
        'Actual': y_actual_all,
        'Predicted': y_pred_all
    })

    # Add month number (for ordering) and abbreviated name (for labels)
    df_combined['Month'] = df_combined['Date'].dt.month
    df_combined['MonthName'] = df_combined['Date'].dt.strftime('%b')

    # Group by month and calculate means
    monthly_summary = df_combined.groupby(['Month', 'MonthName'])[['Actual', 'Predicted']].mean().round(1)
    monthly_summary = monthly_summary.reset_index()

    # Sort by chronological order (starting from December)
    month_order = [12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]  # Dec 2017 through Nov 2018
    monthly_summary['Order'] = monthly_summary['Month'].map({m: i for i, m in enumerate(month_order)})
    monthly_summary = monthly_summary.sort_values('Order')

    # Create bar plot
    fig, ax = plt.subplots(figsize=(14, 8))

    x = np.arange(len(monthly_summary))
    width = 0.35

    bars1 = ax.bar(x - width/2, monthly_summary['Actual'], width,
                   label='Actual', color='#1f77b4', alpha=0.8)
    bars2 = ax.bar(x + width/2, monthly_summary['Predicted'], width,
                   label='Predicted', color='#ff4444', alpha=0.8)

    # Add value labels on top of every bar (actual and predicted alike)
    for bar in (*bars1, *bars2):
        height = bar.get_height()
        ax.annotate(f'{height:.0f}',
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom', fontweight='bold', fontsize=9)

    ax.set_xlabel('Month', fontweight='bold', fontsize=12)
    ax.set_ylabel('Average Bikes Shared', fontweight='bold', fontsize=12)
    ax.set_title('Seasonal Model: Monthly Bike Demand (Dec 2017 - Nov 2018)', fontweight='bold', fontsize=14)
    ax.set_xticks(x)
    ax.set_xticklabels(monthly_summary['MonthName'])

    # Keep at least the original 0-850 range (room for the legend), but grow
    # the axis when a bar is taller so bars and labels are never clipped.
    tallest_bar = float(np.nanmax(monthly_summary[['Actual', 'Predicted']].to_numpy()))
    ax.set_ylim(0, max(850, tallest_bar * 1.15))

    # Place legend in upper right
    ax.legend(loc='upper right', fontsize=12, frameon=True, fancybox=True, shadow=True)
    ax.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()
    plt.show()

    # Print summary statistics
    print("Monthly Performance Summary:")
    print("=" * 50)
    monthly_display = monthly_summary[['MonthName', 'Actual', 'Predicted']].copy()
    monthly_display.columns = ['Month', 'Actual', 'Predicted']
    print(monthly_display.to_string(index=False))

    # Signed error per month: positive means the model over-predicts
    monthly_summary['Error'] = monthly_summary['Predicted'] - monthly_summary['Actual']
    print("\nMonthly Prediction Errors:")
    for _, row in monthly_summary.iterrows():
        print(f"{row['MonthName']:8s}: {row['Error']:+6.1f} bikes")

    return monthly_summary

monthly_bars = create_monthly_bar_comparison()

In [None]:
# Updated model comparison using new naming convention


def _regression_metrics(y_true, y_pred):
    """Return (RMSE, MAE, R²) for one set of predictions."""
    rmse = math.sqrt(mean_squared_error(y_true, y_pred))
    return rmse, mean_absolute_error(y_true, y_pred), r2_score(y_true, y_pred)


def _plot_test_metric_panel(position, labels, values, bar_colors, metric,
                            value_format, higher_is_better):
    """Draw one panel of the 1x3 test-metric comparison figure.

    Args:
        position: 1-based subplot index within the 1x3 grid.
        labels: Model names, one per bar.
        values: Metric value for each model, in the same order as `labels`.
        bar_colors: One colour per bar.
        metric: Metric name used in the title and y label (e.g. 'RMSE').
        value_format: Format spec for the printed numbers (e.g. '.2f').
        higher_is_better: True when the best value is the maximum (R²),
            False when it is the minimum (RMSE, MAE).

    Returns:
        (bars, best): the bar container and the best metric value.
    """
    ax = plt.subplot(1, 3, position)
    bars = ax.bar(labels, values, color=bar_colors,
                  edgecolor='white', linewidth=0.8, width=0.7)

    # Value label above each bar. The higher-is-better panel (R²) uses a fixed
    # offset; the error panels use an offset proportional to the bar height.
    for bar in bars:
        height = bar.get_height()
        offset = 0.005 if higher_is_better else height * 0.01
        ax.text(bar.get_x() + bar.get_width() / 2., height + offset,
                format(height, value_format), ha='center', va='bottom',
                fontweight='bold', fontsize=11)

    direction = 'higher' if higher_is_better else 'lower'
    ax.set_title(f'Test {metric} Comparison\n({direction} is better)',
                 fontsize=14, pad=20, fontweight='bold', color='#333333')
    ax.set_ylabel(f'Test {metric}', fontsize=12, color='#333333')
    ax.grid(axis='y', alpha=0.3)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Dashed reference line at the best value; its caption sits just below the
    # line when the best is a maximum and just above it when it is a minimum.
    best = max(values) if higher_is_better else min(values)
    ax.axhline(y=best, color='#333333', linestyle='--', alpha=0.5)
    if higher_is_better:
        caption_y, caption_va = best - 0.01, 'top'
    else:
        caption_y, caption_va = best + best * 0.05, 'bottom'
    ax.text(len(labels) - 1, caption_y, f'Best: {format(best, value_format)}',
            ha='right', va=caption_va, color='#333333', alpha=0.7, fontweight='bold')

    return bars, best


def _relative_improvement(gain, reference):
    """Return `gain` as a percentage of |reference| (NaN if reference is 0).

    The absolute value keeps the sign of the result equal to the sign of
    `gain` even when the reference itself is negative (possible for R²).
    """
    if reference == 0:
        return float('nan')
    return (gain / abs(reference)) * 100


# Baseline Best model metrics (regularized baseline)
baseline_best_train_rmse, baseline_best_train_mae, baseline_best_train_r2 = _regression_metrics(
    y_train, baseline_best_train_preds)
baseline_best_test_rmse, baseline_best_test_mae, baseline_best_test_r2 = _regression_metrics(
    y_test, baseline_best_test_preds)

# Seasonal model metrics (final tuned model)
seasonal_train_rmse, seasonal_train_mae, seasonal_train_r2 = _regression_metrics(
    y_train, seasonal_train_preds)
seasonal_test_rmse, seasonal_test_mae, seasonal_test_r2 = _regression_metrics(
    y_test, seasonal_test_preds)

# Create the comparison chart
plt.figure(figsize=(16, 6))
fig = plt.gcf()
fig.patch.set_facecolor('#f8f9fa')

# Define colors
colors = ['#39A0ED', '#FF5E5B']  # Blue for Baseline, Coral for Final

# Model names
models = ['Baseline\nBest', 'Seasonal\nModel']

# R² comparison (subplot 1)
r2_values = [baseline_best_test_r2, seasonal_test_r2]
bars1, max_r2 = _plot_test_metric_panel(1, models, r2_values, colors, 'R²', '.4f', True)

# RMSE comparison (subplot 2)
test_rmse_values = [baseline_best_test_rmse, seasonal_test_rmse]
bars2, min_rmse = _plot_test_metric_panel(2, models, test_rmse_values, colors, 'RMSE', '.2f', False)

# MAE comparison (subplot 3)
mae_values = [baseline_best_test_mae, seasonal_test_mae]
bars3, min_mae = _plot_test_metric_panel(3, models, mae_values, colors, 'MAE', '.2f', False)

plt.tight_layout()
plt.show()

# Print summary comparison. Column headers use the same model names as the
# Winner column ('Baseline Best' / 'Seasonal'); ties go to Baseline Best.
print("Model Performance Comparison:")
print("=" * 58)
print(f"{'Metric':<15} {'Baseline Best':<15} {'Seasonal':<12} {'Winner':<10}")
print("-" * 58)
print(f"{'Test R²':<15} {baseline_best_test_r2:<15.4f} {seasonal_test_r2:<12.4f} {'Seasonal' if seasonal_test_r2 > baseline_best_test_r2 else 'Baseline Best':<10}")
print(f"{'Test RMSE':<15} {baseline_best_test_rmse:<15.2f} {seasonal_test_rmse:<12.2f} {'Seasonal' if seasonal_test_rmse < baseline_best_test_rmse else 'Baseline Best':<10}")
print(f"{'Test MAE':<15} {baseline_best_test_mae:<15.2f} {seasonal_test_mae:<12.2f} {'Seasonal' if seasonal_test_mae < baseline_best_test_mae else 'Baseline Best':<10}")

# Calculate improvements: positive always means the seasonal model is better
# (higher R², lower RMSE/MAE) relative to the Baseline Best value.
r2_improvement = _relative_improvement(seasonal_test_r2 - baseline_best_test_r2, baseline_best_test_r2)
rmse_improvement = _relative_improvement(baseline_best_test_rmse - seasonal_test_rmse, baseline_best_test_rmse)
mae_improvement = _relative_improvement(baseline_best_test_mae - seasonal_test_mae, baseline_best_test_mae)

print("\nImprovements (Seasonal vs Baseline Best):")
print("-" * 40)
print(f"R² improvement:   {r2_improvement:+6.2f}%")
print(f"RMSE improvement: {rmse_improvement:+6.2f}%")
print(f"MAE improvement:  {mae_improvement:+6.2f}%")

### Exploring overfitting improvements

In [None]:
# Calculate metrics for all four models using new naming convention

# Model 1: Baseline Model (minimal configuration)
baseline_train_rmse = math.sqrt(mean_squared_error(y_train, baseline_train_preds))
baseline_test_rmse = math.sqrt(mean_squared_error(y_test, baseline_test_preds))
baseline_train_mae = mean_absolute_error(y_train, baseline_train_preds)
baseline_test_mae = mean_absolute_error(y_test, baseline_test_preds)

# Model 2: Baseline Better Model (basic hyperparameters)
baseline_better_train_rmse = math.sqrt(mean_squared_error(y_train, baseline_better_train_preds))
baseline_better_test_rmse = math.sqrt(mean_squared_error(y_test, baseline_better_test_preds))
baseline_better_train_mae = mean_absolute_error(y_train, baseline_better_train_preds)
baseline_better_test_mae = mean_absolute_error(y_test, baseline_better_test_preds)

# Model 3: Baseline Best Model (regularized)
baseline_best_train_rmse = math.sqrt(mean_squared_error(y_train, baseline_best_train_preds))
baseline_best_test_rmse = math.sqrt(mean_squared_error(y_test, baseline_best_test_preds))
baseline_best_train_mae = mean_absolute_error(y_train, baseline_best_train_preds)
baseline_best_test_mae = mean_absolute_error(y_test, baseline_best_test_preds)

# Model 4: Seasonal Model (grid search + manual tuning + early stopping)
seasonal_train_rmse = math.sqrt(mean_squared_error(y_train, seasonal_train_preds))
seasonal_test_rmse = math.sqrt(mean_squared_error(y_test, seasonal_test_preds))
seasonal_train_mae = mean_absolute_error(y_train, seasonal_train_preds)
seasonal_test_mae = mean_absolute_error(y_test, seasonal_test_preds)

# Calculate overfitting gaps
baseline_rmse_gap = baseline_test_rmse - baseline_train_rmse
baseline_better_rmse_gap = baseline_better_test_rmse - baseline_better_train_rmse
baseline_best_rmse_gap = baseline_best_test_rmse - baseline_best_train_rmse
seasonal_rmse_gap = seasonal_test_rmse - seasonal_train_rmse

baseline_mae_gap = baseline_test_mae - baseline_train_mae
baseline_better_mae_gap = baseline_better_test_mae - baseline_better_train_mae
baseline_best_mae_gap = baseline_best_test_mae - baseline_best_train_mae
seasonal_mae_gap = seasonal_test_mae - seasonal_train_mae

# Create the overfitting comparison chart
plt.figure(figsize=(16, 10))
fig = plt.gcf()
fig.patch.set_facecolor('#f8f9fa')

# Define colors for model progression
colors = ['#FF6B6B', '#FFA07A', '#4ECDC4', '#45B7D1']  # Red → Orange → Teal → Blue

# Model names
models = ['Baseline\nModel', 'Baseline\nBetter', 'Baseline\nBest', 'Seasonal\nModel']

# Training vs Test RMSE (subplot 1)
plt.subplot(2, 2, 1)
train_rmse_values = [baseline_train_rmse, baseline_better_train_rmse, baseline_best_train_rmse, seasonal_train_rmse]
test_rmse_values = [baseline_test_rmse, baseline_better_test_rmse, baseline_best_test_rmse, seasonal_test_rmse]

x = np.arange(len(models))
width = 0.35

bars1 = plt.bar(x - width/2, train_rmse_values, width, label='Train RMSE', 
                color=[c for c in colors], alpha=0.7, edgecolor='white', linewidth=0.8)
bars2 = plt.bar(x + width/2, test_rmse_values, width, label='Test RMSE',
                color=[c for c in colors], alpha=1.0, edgecolor='white', linewidth=0.8)

# Add value labels
for i, (train_val, test_val) in enumerate(zip(train_rmse_values, test_rmse_values)):
    plt.text(i - width/2, train_val + train_val*0.01, f'{train_val:.1f}', 
             ha='center', va='bottom', fontweight='bold', fontsize=10)
    plt.text(i + width/2, test_val + test_val*0.01, f'{test_val:.1f}', 
             ha='center', va='bottom', fontweight='bold', fontsize=10)

plt.title('RMSE: Training vs Test', fontsize=14, pad=20, fontweight='bold', color='#333333')
plt.ylabel('RMSE', fontsize=12, color='#333333')
plt.xticks(x, models)
plt.legend()
plt.grid(axis='y', alpha=0.3)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# Training vs Test MAE (subplot 2)
plt.subplot(2, 2, 2)
train_mae_values = [baseline_train_mae, baseline_better_train_mae, baseline_best_train_mae, seasonal_train_mae]
test_mae_values = [baseline_test_mae, baseline_better_test_mae, baseline_best_test_mae, seasonal_test_mae]

bars3 = plt.bar(x - width/2, train_mae_values, width, label='Train MAE', 
                color=[c for c in colors], alpha=0.7, edgecolor='white', linewidth=0.8)
bars4 = plt.bar(x + width/2, test_mae_values, width, label='Test MAE',
                color=[c for c in colors], alpha=1.0, edgecolor='white', linewidth=0.8)

# Add value labels
for i, (train_val, test_val) in enumerate(zip(train_mae_values, test_mae_values)):
    plt.text(i - width/2, train_val + train_val*0.01, f'{train_val:.1f}', 
             ha='center', va='bottom', fontweight='bold', fontsize=10)
    plt.text(i + width/2, test_val + test_val*0.01, f'{test_val:.1f}', 
             ha='center', va='bottom', fontweight='bold', fontsize=10)

plt.title('MAE: Training vs Test', fontsize=14, pad=20, fontweight='bold', color='#333333')
plt.ylabel('MAE', fontsize=12, color='#333333')
plt.xticks(x, models)
plt.legend()
plt.grid(axis='y', alpha=0.3)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# RMSE Overfitting Gap (subplot 3)
plt.subplot(2, 2, 3)
rmse_gap_values = [baseline_rmse_gap, baseline_better_rmse_gap, baseline_best_rmse_gap, seasonal_rmse_gap]
bars5 = plt.bar(models, rmse_gap_values, color=colors,
                edgecolor='white', linewidth=0.8, width=0.7)

# Add value labels and color coding for gap severity
for i, (bar, gap) in enumerate(zip(bars5, rmse_gap_values)):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height + height*0.02,
            f'{height:.2f}', ha='center', va='bottom', fontweight='bold', fontsize=11)
    
    # Color code the gap severity
    if gap > 50:
        bar.set_alpha(1.0)  # High overfitting
    elif gap > 30:
        bar.set_alpha(0.8)  # Moderate overfitting
    else:
        bar.set_alpha(0.6)  # Low overfitting

plt.title('RMSE Overfitting Gap\n(Test - Train)', fontsize=14, pad=20, fontweight='bold', color='#333333')
plt.ylabel('RMSE Gap', fontsize=12, color='#333333')
plt.grid(axis='y', alpha=0.3)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# Add gap severity indicators
if max(rmse_gap_values) > 30:
    plt.axhline(y=30, color='orange', linestyle='--', alpha=0.5, linewidth=1)
    plt.text(len(models)-0.5, 32, 'Moderate', ha='center', va='bottom', color='orange', fontsize=9)
if max(rmse_gap_values) > 50:
    plt.axhline(y=50, color='red', linestyle='--', alpha=0.5, linewidth=1)
    plt.text(len(models)-0.5, 52, 'High', ha='center', va='bottom', color='red', fontsize=9)

# MAE Overfitting Gap (subplot 4)
plt.subplot(2, 2, 4)
mae_gap_values = [baseline_mae_gap, baseline_better_mae_gap, baseline_best_mae_gap, seasonal_mae_gap]
bars6 = plt.bar(models, mae_gap_values, color=colors,
                edgecolor='white', linewidth=0.8, width=0.7)

# Add value labels and color coding for gap severity
for i, (bar, gap) in enumerate(zip(bars6, mae_gap_values)):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height + height*0.02,
            f'{height:.2f}', ha='center', va='bottom', fontweight='bold', fontsize=11)
    
    # Color code the gap severity (MAE thresholds are typically lower)
    if gap > 35:
        bar.set_alpha(1.0)  # High overfitting
    elif gap > 20:
        bar.set_alpha(0.8)  # Moderate overfitting
    else:
        bar.set_alpha(0.6)  # Low overfitting

plt.title('MAE Overfitting Gap\n(Test - Train)', fontsize=14, pad=20, fontweight='bold', color='#333333')
plt.ylabel('MAE Gap', fontsize=12, color='#333333')
plt.grid(axis='y', alpha=0.3)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# Add gap severity indicators
if max(mae_gap_values) > 20:
    plt.axhline(y=20, color='orange', linestyle='--', alpha=0.5, linewidth=1)
    plt.text(len(models)-0.5, 22, 'Moderate', ha='center', va='bottom', color='orange', fontsize=9)
if max(mae_gap_values) > 35:
    plt.axhline(y=35, color='red', linestyle='--', alpha=0.5, linewidth=1)
    plt.text(len(models)-0.5, 37, 'High', ha='center', va='bottom', color='red', fontsize=9)

plt.tight_layout()
plt.show()

# Print comprehensive overfitting analysis
print("Model Progression: Overfitting Analysis Summary")
print("=" * 70)
print(f"{'Model':<15} {'RMSE Gap':<12} {'MAE Gap':<12} {'RMSE Status':<15} {'MAE Status':<15}")
print("-" * 70)

# Define status function
def get_overfitting_status(gap, high_threshold, moderate_threshold):
    """Classify a gap value as "High", "Moderate" or "Low".

    Args:
        gap: Value to classify.
        high_threshold: Gaps strictly greater than this are "High".
        moderate_threshold: Gaps strictly greater than this (but not above
            ``high_threshold``) are "Moderate".

    Returns:
        str: "High", "Moderate" or "Low". A gap equal to a threshold falls
        into the lower category.
    """
    if gap > high_threshold:
        return "High"
    return "Moderate" if gap > moderate_threshold else "Low"

model_names = ['Baseline', 'Baseline Better', 'Baseline Best', 'Seasonal']

# One table row per model: both gaps plus their severity labels
# (RMSE thresholds 50/30, MAE thresholds 35/20).
for idx, model in enumerate(model_names):
    rmse_gap, mae_gap = rmse_gap_values[idx], mae_gap_values[idx]
    rmse_status = get_overfitting_status(rmse_gap, 50, 30)
    mae_status = get_overfitting_status(mae_gap, 35, 20)

    print(f"{model:<15} {rmse_gap:<12.2f} {mae_gap:<12.2f} {rmse_status:<15} {mae_status:<15}")

# Calculate improvements in overfitting (vs baseline model): percentage by
# which each later model shrinks the baseline gap (positive = smaller gap).
print("\nOverfitting Reduction Progress (vs Baseline Model):")
print("-" * 55)
for idx, model in enumerate(model_names[1:], start=1):
    rmse_reduction = ((baseline_rmse_gap - rmse_gap_values[idx]) / baseline_rmse_gap) * 100
    mae_reduction = ((baseline_mae_gap - mae_gap_values[idx]) / baseline_mae_gap) * 100
    print(f"{model:<15} RMSE: {rmse_reduction:+6.1f}% | MAE: {mae_reduction:+6.1f}%")

### Can we train separate seasonal models and use them as an ensemble to achieve a better result?

In [None]:
def load_and_split_seasonal_data():
    """Fetch the Seoul bike data and cut it into per-season train/test sets.

    The dataset (UCI id 560) is downloaded, ``Date`` is parsed day-first,
    ``Month``/``Day``/``DayOfWeek`` columns are derived from it, and the frame
    is passed through ``create_cyclical_encoding``. Each date window listed
    below is then split 80/20 by row position (first 80% of rows -> train).
    A summary of every split is printed.

    Returns:
        dict: season label -> ``{'train': DataFrame, 'test': DataFrame}``.
        Labels are 'Fall', 'Winter', 'Spring', 'Summer' and 'Fall_2'.
    """
    dataset = fetch_ucirepo(id=560)
    frame = dataset.data.features.copy()
    frame['Date'] = pd.to_datetime(frame['Date'], dayfirst=True)

    # Calendar parts derived from the date, added before cyclical encoding
    date_parts = frame['Date'].dt
    frame['Month'] = date_parts.month
    frame['Day'] = date_parts.day
    frame['DayOfWeek'] = date_parts.dayofweek
    encoded = create_cyclical_encoding(frame)

    # Inclusive (start, end) date windows. Fall is split in two because the
    # data runs from December 2017 through November 2018.
    season_windows = {
        'Fall': ('2017-12-01', '2017-12-21'),
        'Winter': ('2017-12-22', '2018-03-20'),
        'Spring': ('2018-03-21', '2018-06-21'),
        'Summer': ('2018-06-22', '2018-09-22'),
        'Fall_2': ('2018-09-23', '2018-11-30'),
    }

    def date_span(part):
        """Format the earliest and latest ``Date`` of *part* as 'MM/DD - MM/DD'."""
        first, last = part['Date'].min(), part['Date'].max()
        return f"{first.strftime('%m/%d')} - {last.strftime('%m/%d')}"

    splits = {}
    print("SEASONAL DATA SPLITS:")
    print("-" * 40)

    for season, (start, end) in season_windows.items():
        in_window = (encoded['Date'] >= start) & (encoded['Date'] <= end)
        season_rows = encoded[in_window].copy()

        # 80/20 split by position; rows keep their original order
        cut = int(len(season_rows) * 0.8)
        train_part, test_part = season_rows.iloc[:cut], season_rows.iloc[cut:]
        splits[season] = {'train': train_part, 'test': test_part}

        print(f"{season:>8}: {cut:4d} train, {len(test_part):3d} test | "
              f"Train: {date_span(train_part)} | "
              f"Test: {date_span(test_part)}")

    # Totals across every window, 'Fall_2' included
    total_train = sum(len(parts['train']) for parts in splits.values())
    total_test = sum(len(parts['test']) for parts in splits.values())
    print(f"{'TOTAL':>8}: {total_train:4d} train, {total_test:3d} test")

    return splits

splits = load_and_split_seasonal_data()

In [None]:
def train_seasonal_ensemble(splits):
    """Fit and score one XGBoost regressor per season.

    Args:
        splits: Mapping of season name to a dict holding 'train' and 'test'
            DataFrames. It must contain a 'Winter' entry, whose training
            columns define the feature set. The 'Fall_2' entry is skipped.

    Returns:
        tuple: ``(models, results)``, both dicts keyed by season name.
        ``models`` holds the fitted regressors (trained on standardized
        features; the scaler itself is not returned). ``results`` holds
        'test_mae', 'test_rmse' and 'test_r2' for each season's test part.
    """
    target = 'Rented Bike Count'

    # Conservative parameters (limited data per season)
    xgb_params = dict(
        max_depth=3,
        learning_rate=0.1,
        n_estimators=75,
        min_child_weight=15,
        reg_alpha=3.0,
        reg_lambda=5.0,
        subsample=0.6,
        colsample_bytree=0.6,
        random_state=42,
    )

    # Everything except the date, the target and the two categorical columns
    non_features = ('Date', target, 'Seasons', 'Holiday')
    feature_cols = [
        name for name in splits['Winter']['train'].columns
        if name not in non_features
    ]

    def numeric_features(frame):
        """Select the feature columns of *frame* and keep only numeric dtypes."""
        return frame[feature_cols].select_dtypes(include=[np.number])

    models, results = {}, {}

    for season in splits:
        if season == 'Fall_2':  # Skip duplicate fall
            continue

        print(f"\nTraining {season} model...")
        season_split = splits[season]

        X_train = numeric_features(season_split['train'])
        y_train = season_split['train'][target]
        X_test = numeric_features(season_split['test'])
        y_test = season_split['test'][target]

        # Standardize with statistics from the training part only
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        model = xgb.XGBRegressor(**xgb_params)
        model.fit(X_train_scaled, y_train)

        # Score on the held-out part of the same season
        predictions = model.predict(X_test_scaled)
        scores = {
            'test_mae': mean_absolute_error(y_test, predictions),
            'test_rmse': math.sqrt(mean_squared_error(y_test, predictions)),
            'test_r2': r2_score(y_test, predictions),
        }

        models[season] = model
        results[season] = scores
        print(f"  {season}: MAE={scores['test_mae']:.1f}, RMSE={scores['test_rmse']:.1f}, R²={scores['test_r2']:.3f}")

    return models, results

seasonal_models, seasonal_results = train_seasonal_ensemble(splits)

In [None]:
def train_seasonal_ensemble(splits):
    """Train one XGBoost regressor per season and report hold-out metrics.

    Note: this cell re-declares the function already defined in the preceding
    cell with the same logic; running it simply rebinds the name and retrains.

    Args:
        splits: season name -> ``{'train': DataFrame, 'test': DataFrame}``.
            A 'Winter' entry is required (its training columns determine the
            features); the 'Fall_2' entry is not modelled.

    Returns:
        tuple: ``(models, results)``. ``models`` maps season -> fitted
        regressor; ``results`` maps season -> dict with 'test_mae',
        'test_rmse' and 'test_r2'.
    """
    # Conservative parameters (limited data per season)
    params = {
        'max_depth': 3,
        'learning_rate': 0.1,
        'n_estimators': 75,
        'min_child_weight': 15,
        'reg_alpha': 3.0,
        'reg_lambda': 5.0,
        'subsample': 0.6,
        'colsample_bytree': 0.6,
        'random_state': 42,
    }

    # Feature columns: drop the date, the target and the categorical columns
    excluded = {'Date', 'Rented Bike Count', 'Seasons', 'Holiday'}
    feature_cols = [col for col in splits['Winter']['train'].columns if col not in excluded]

    def fit_and_score(season_split):
        """Fit on the season's train part; return (model, test-part metrics)."""
        train_df = season_split['train']
        X_train = train_df[feature_cols].select_dtypes(include=[np.number])
        y_train = train_df['Rented Bike Count']
        test_df = season_split['test']
        X_test = test_df[feature_cols].select_dtypes(include=[np.number])
        y_test = test_df['Rented Bike Count']

        # The scaler is fitted on training rows only, then reused for test rows
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        regressor = xgb.XGBRegressor(**params)
        regressor.fit(X_train_scaled, y_train)

        preds = regressor.predict(X_test_scaled)
        metrics = {
            'test_mae': mean_absolute_error(y_test, preds),
            'test_rmse': math.sqrt(mean_squared_error(y_test, preds)),
            'test_r2': r2_score(y_test, preds),
        }
        return regressor, metrics

    models = {}
    results = {}

    for season, season_split in splits.items():
        if season != 'Fall_2':  # Skip duplicate fall
            print(f"\nTraining {season} model...")
            fitted, metrics = fit_and_score(season_split)
            models[season] = fitted
            results[season] = metrics
            print(f"  {season}: MAE={metrics['test_mae']:.1f}, RMSE={metrics['test_rmse']:.1f}, R²={metrics['test_r2']:.3f}")

    return models, results

seasonal_models, seasonal_results = train_seasonal_ensemble(splits)

In [None]:
def compare_with_simple_baseline(splits, results):
    """Print each seasonal model's test metrics next to a mean-prediction baseline.

    For every season except 'Fall_2', the baseline predicts the mean of that
    season's training target for every test row. Its MAE, RMSE and R² are
    printed alongside the model's stored metrics and their differences.

    Args:
        splits: season name -> ``{'train': DataFrame, 'test': DataFrame}``.
        results: season name -> dict with 'test_mae', 'test_rmse', 'test_r2'.

    Returns:
        None. All output is printed.
    """
    target = 'Rented Bike Count'

    print("BASELINE COMPARISON (predict training mean):")
    print("-" * 50)

    for season in splits:
        if season == 'Fall_2':  # Skip duplicate fall
            continue

        season_split = splits[season]
        train_mean = season_split['train'][target].mean()
        actual = season_split['test'][target]

        # Constant prediction: the training mean repeated for every test row
        constant_preds = np.full(len(actual), train_mean)
        base_mae = mean_absolute_error(actual, constant_preds)
        base_rmse = math.sqrt(mean_squared_error(actual, constant_preds))
        base_r2 = r2_score(actual, constant_preds)

        # Metrics already recorded for this season's model
        scores = results[season]
        model_mae = scores['test_mae']
        model_rmse = scores['test_rmse']
        model_r2 = scores['test_r2']

        # Positive "Improvement" values mean the model beats the baseline
        print(f"{season:} Mean Baseline  MAE={base_mae:.1f}, RMSE={base_rmse:.1f}, R²={base_r2:.3f}")
        print(f"         {season} Model MAE={model_mae:.1f}, RMSE={model_rmse:.1f}, R²={model_r2:.3f}")
        print(f"         Improvement:   {base_mae - model_mae:+6.1f} MAE, {base_rmse - model_rmse:+6.1f} RMSE, {model_r2 - base_r2:+.3f} R²")
        print()

compare_with_simple_baseline(splits, seasonal_results)

### Comparing an ensemble of individual season models with a single seasonal model


In [None]:
# Calculate ensemble average performance.
# NOTE(review): these are unweighted means of per-season test metrics; they
# are presumably not directly comparable with the single model's metrics if
# those were computed on one pooled test set — confirm before relying on them.
avg_seasonal_mae = np.mean([r['test_mae'] for r in seasonal_results.values()])
avg_seasonal_rmse = np.mean([r['test_rmse'] for r in seasonal_results.values()])
avg_seasonal_r2 = np.mean([r['test_r2'] for r in seasonal_results.values()])

# Training-set sizes of the per-season models. 'Fall_2' is excluded by season
# *name*: testing membership on the split dicts themselves (whose keys are
# 'train' and 'test') never excludes anything.
seasonal_train_sizes = [
    len(parts['train']) for name, parts in splits.items() if name != 'Fall_2'
]
min_samples = min(seasonal_train_sizes)
max_samples = max(seasonal_train_sizes)

# Show the comparison
print(f"{'Approach':<25} {'MAE':<8} {'RMSE':<8} {'R²':<8} {'Data Used'}")
print("-" * 60)
print(f"{'Single Seasonal Model':<25} {seasonal_test_mae:<8.1f} {seasonal_test_rmse:<8.1f} {seasonal_test_r2:<8.3f} Full dataset (8,760 hrs)")
print(f"{'Seasonal Ensemble Avg':<25} {avg_seasonal_mae:<8.1f} {avg_seasonal_rmse:<8.1f} {avg_seasonal_r2:<8.3f} Split by season")

# Individual seasonal model performance for context.
# The loop variable is deliberately not called `results`, so it cannot rebind
# a module-level name of the same name.
print("\nIndividual Seasonal Model Performance:")
print("-" * 45)
for season, season_metrics in seasonal_results.items():
    train_samples = len(splits[season]['train'])
    print(f"{season:>8}: MAE={season_metrics['test_mae']:6.1f}, R²={season_metrics['test_r2']:6.3f} "
          f"({train_samples:,} train samples)")

# Determine winner: a positive difference means the ensemble's average MAE is
# higher (worse) than the single model's.
mae_difference = avg_seasonal_mae - seasonal_test_mae
single_model_wins = mae_difference > 0

if single_model_wins:
    print(f"\nSINGLE SEASONAL MODEL WINS by {mae_difference:.1f} MAE")
    print("\nWhy the single model is superior:")
    print(f"  • Uses full dataset: 8,760 hours vs {min_samples:,}-{max_samples:,} per season")
    print("  • Learns temporal patterns across seasons")
    print("  • Benefits from seasonal transitions and cross-seasonal features")
    print("  • Grid search + early stopping + manual tuning")
    print(f"  • Better generalization with {seasonal_test_rmse:.1f} RMSE vs {avg_seasonal_rmse:.1f} average")
else:
    print(f"\nSEASONAL ENSEMBLE WINS by {abs(mae_difference):.1f} MAE")
    print("Seasonal specialization overcomes data limitations")

# Fundamental problem with seasonal approach: each model sees only a slice of
# the data.
# NOTE(review): y_train comes from an earlier cell; presumably it is the single
# model's training target — confirm it still refers to that split here.
total_samples = len(y_train)

print("\nFUNDAMENTAL LIMITATION:")
print("-" * 30)
print(f"Smallest seasonal training set: {min_samples:,} samples ({min_samples/total_samples*100:.1f}% of full dataset)")
print(f"Single model training set:      {total_samples:,} samples (100% of full dataset)")
print(f"Data efficiency ratio:          {total_samples/min_samples:.1f}x more data for single model")

# The recommendation follows the MAE comparison above instead of being printed
# unconditionally, so it can never contradict the reported winner.
print("\nCONCLUSION:")
print("-" * 15)
if single_model_wins:
    print("Use SINGLE SEASONAL MODEL for production")
    print("   - Superior performance across all metrics")
    print("   - Leverages full dataset for better learning")
    print("   - Captures cross-seasonal patterns and transitions")
    print("   - More robust with comprehensive tuning approach")
else:
    print("Use SEASONAL ENSEMBLE for production")
    print("   - Lower average test MAE than the single seasonal model")