# Model Setup using the LOGO-CV Approach - With Feature Selection and Hyperparameter Tuning

## Import the Required Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import LeaveOneGroupOut, GridSearchCV, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV, SequentialFeatureSelector, SelectKBest, f_regression
from sklearn.decomposition import PCA

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

## Baseline Model Set-up

In [5]:
# Let's load the data
df = pd.read_csv("processed_data_berlin_08042025.csv")

# And, drop stations with ≤ 2 years of data
station_years = df.groupby('station_name')['year'].nunique()
valid_stations = station_years[station_years > 2].index
df_filtered = df[df['station_name'].isin(valid_stations)].copy()

In [6]:
# SMAPE function
def smape(y_true, y_pred):
    return 100 / len(y_true) * np.sum(
        2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred))
    )

# Let's set up LOGO-CV
logo = LeaveOneGroupOut()
groups = df_filtered['station_name']

In [7]:
# Store all predictions
all_actuals, all_predictions = [], []
aadb_actual_list, aadb_pred_list, aadb_station_list = [], [], []

# Run LOGO-CV
for train_idx, test_idx in logo.split(df_filtered, groups=groups):
    train_df = df_filtered.iloc[train_idx]
    test_df = df_filtered.iloc[test_idx]
    
    # Compute mean log volume per station (baseline)
    mean_log_volume = train_df['log_cycling_volume'].mean()
    pred_log = np.full(len(test_df), mean_log_volume)
    pred = np.expm1(pred_log)

    y_true = test_df['cycling_volume'].values

    # Daily predictions
    all_actuals.extend(y_true)
    all_predictions.extend(pred)

    # AADB-level for this test station
    aadb_actual = y_true.mean()
    aadb_pred = pred.mean()
    aadb_actual_list.append(aadb_actual)
    aadb_pred_list.append(aadb_pred)
    aadb_station_list.append(test_df['station_name'].iloc[0])

In [8]:
# Convert to arrays
y_true_all = np.array(all_actuals)
y_pred_all = np.array(all_predictions)

# Daily metrics
mae_daily = mean_absolute_error(y_true_all, y_pred_all)
rmse_daily = np.sqrt(mean_squared_error(y_true_all, y_pred_all))
smape_daily = smape(y_true_all, y_pred_all)

In [9]:
# AADB-level metrics
aadb_df = pd.DataFrame({
    'station': aadb_station_list,
    'actual': aadb_actual_list,
    'predicted': aadb_pred_list
})

mae_aadb = mean_absolute_error(aadb_df['actual'], aadb_df['predicted'])
rmse_aadb = np.sqrt(mean_squared_error(aadb_df['actual'], aadb_df['predicted']))
smape_aadb = smape(aadb_df['actual'], aadb_df['predicted'])

In [10]:
# Print results
print("\nLOGO Baseline Model Evaluation:")
print(f"Daily  → MAE: {mae_daily:.2f}, RMSE: {rmse_daily:.2f}, SMAPE: {smape_daily:.2f}%")
print(f"AADB   → MAE: {mae_aadb:.2f}, RMSE: {rmse_aadb:.2f}, SMAPE: {smape_aadb:.2f}%")


LOGO Baseline Model Evaluation:
Daily  → MAE: 2193.79, RMSE: 3252.34, SMAPE: 72.64%
AADB   → MAE: 1974.91, RMSE: 2573.91, SMAPE: 65.24%


## Linear Regression

In [11]:
# Define full feature list
non_features = ['station_name', 'date', 'year', 'cycling_volume', 'log_cycling_volume', 'num_motor_sources']
features = [col for col in df_filtered.columns if col not in non_features]

X = df_filtered[features]
y = df_filtered['log_cycling_volume']

In [12]:
# RFE with Linear Regression
lr = LinearRegression()
scaler = StandardScaler()

pipeline = Pipeline([
    ('scaler', scaler),
    ('feature_selection', RFECV(
        estimator=lr,
        step=1,
        cv=KFold(n_splits=5),
        scoring='neg_mean_absolute_error',
        n_jobs=-1
    )),
    ('regression', lr)
])

pipeline.fit(X, y)

# Extract selected features
selected_features = X.columns[pipeline.named_steps['feature_selection'].support_]
print("\nSelected features by RFE for Linear Regression:")
print(selected_features.tolist())


Selected features by RFE for Linear Regression:
['private_gardening', 'city_total_motor_vehicles', 'city_total_cars']


In [13]:
# Define LOGO grouping
groups = df_filtered['station_name']

# Prep for storing results
all_true, all_preds = [], []
aadb_actuals, aadb_preds, aadb_stations = [], [], []

logo = LeaveOneGroupOut()

for train_idx, test_idx in logo.split(df_filtered, groups=groups):
    train_df = df_filtered.iloc[train_idx]
    test_df = df_filtered.iloc[test_idx]

    X_train = train_df[selected_features]
    y_train = train_df['log_cycling_volume']
    X_test = test_df[selected_features]
    y_test_true = test_df['cycling_volume']

    # Scale inputs
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train and predict
    model = LinearRegression()
    model.fit(X_train_scaled, y_train)
    y_pred_log = model.predict(X_test_scaled)
    y_pred = np.expm1(y_pred_log)

    all_preds.extend(y_pred)
    all_true.extend(y_test_true.values)

    aadb_preds.append(np.mean(y_pred))
    aadb_actuals.append(np.mean(y_test_true.values))
    aadb_stations.append(test_df['station_name'].iloc[0])

In [14]:
# SMAPE function
def smape(y_true, y_pred):
    return 100 / len(y_true) * np.sum(
        2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred))
    )

# Daily Evaluation
mae_daily = mean_absolute_error(all_true, all_preds)
rmse_daily = np.sqrt(mean_squared_error(all_true, all_preds))
smape_daily = smape(np.array(all_true), np.array(all_preds))

In [15]:
# AADB Evaluation
aadb_df = pd.DataFrame({
    'station': aadb_stations,
    'actual': aadb_actuals,
    'predicted': aadb_preds
})

mae_aadb = mean_absolute_error(aadb_df['actual'], aadb_df['predicted'])
rmse_aadb = np.sqrt(mean_squared_error(aadb_df['actual'], aadb_df['predicted']))
smape_aadb = smape(aadb_df['actual'], aadb_df['predicted'])

In [16]:
# Output
print("\nLOGO Linear Regression Evaluation:")
print(f"Daily  → MAE: {mae_daily:.2f}, RMSE: {rmse_daily:.2f}, SMAPE: {smape_daily:.2f}%")
print(f"AADB   → MAE: {mae_aadb:.2f}, RMSE: {rmse_aadb:.2f}, SMAPE: {smape_aadb:.2f}%")


LOGO Linear Regression Evaluation:
Daily  → MAE: 1893.86, RMSE: 2899.07, SMAPE: 62.04%
AADB   → MAE: 1635.84, RMSE: 2269.95, SMAPE: 51.67%


## Decision Tree

In [17]:
# Use full feature set
non_features = ['station_name', 'date', 'year', 'cycling_volume', 'log_cycling_volume', 'num_motor_sources']
features = [col for col in df_filtered.columns if col not in non_features]

X = df_filtered[features]
y = df_filtered['log_cycling_volume']

In [18]:
# Base model for feature selection
base_tree = DecisionTreeRegressor(random_state=42)

# Sequential forward selection
sfs = SequentialFeatureSelector(
    base_tree,
    n_features_to_select='auto',
    direction='forward',
    scoring='neg_mean_absolute_error',
    cv=KFold(n_splits=5),
    n_jobs=-1
)

sfs.fit(X, y)

# Get selected features
selected_features = X.columns[sfs.get_support()]
print("\nSelected features by SFS for Decision Tree:")
print(selected_features.tolist())


Selected features by SFS for Decision Tree:
['longitude', 'shops_within_5km', 'education_within_2km', 'farming', 'forests', 'horticulture', 'waterways', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wspd', 'wpgt', 'pres', 'tsun', 'wdir', 'public_holiday', 'avg_truck_speed', 'total_motor_trucks', 'city_avg_motor_speed', 'city_avg_car_speed', 'city_avg_truck_speed', 'city_total_trucks', 'day_of_week', 'is_weekend', 'month', 'bicycle_lane_type_separate', 'bicycle_lane_type_sidepath', 'bicycle_lane_type_track']


In [19]:
# Use selected features from SFS
X_selected = df_filtered[selected_features]
y = df_filtered['log_cycling_volume']

# Parameter grid for Decision Tree
param_grid = {
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'criterion': ['squared_error', 'friedman_mse']
}

# Grid search with cross-validation
tree = DecisionTreeRegressor(random_state=42)
grid_search = GridSearchCV(
    tree,
    param_grid,
    scoring='neg_mean_absolute_error',
    cv=KFold(n_splits=5),
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_selected, y)

print("\nBest hyperparameters:")
print(grid_search.best_params_)

# Save best model for reuse
best_tree_model = grid_search.best_estimator_

Fitting 5 folds for each of 72 candidates, totalling 360 fits

Best hyperparameters:
{'criterion': 'friedman_mse', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [20]:
# SMAPE function
def smape(y_true, y_pred):
    return 100 / len(y_true) * np.sum(
        2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred))
    )

# Setup LOGO
groups = df_filtered['station_name']
logo = LeaveOneGroupOut()

# Storage
all_preds, all_true = [], []
aadb_preds, aadb_actuals, aadb_stations = [], [], []

In [21]:
# Loop over stations
for train_idx, test_idx in logo.split(df_filtered, groups=groups):
    train_df = df_filtered.iloc[train_idx]
    test_df = df_filtered.iloc[test_idx]

    X_train = train_df[selected_features]
    y_train = train_df['log_cycling_volume']
    X_test = test_df[selected_features]
    y_test_true = test_df['cycling_volume']

    # Train with best hyperparameters
    tree_model = DecisionTreeRegressor(**grid_search.best_params_, random_state=42)
    tree_model.fit(X_train, y_train)

    y_pred_log = tree_model.predict(X_test)
    y_pred = np.expm1(y_pred_log)

    all_preds.extend(y_pred)
    all_true.extend(y_test_true.values)

    aadb_preds.append(np.mean(y_pred))
    aadb_actuals.append(np.mean(y_test_true.values))
    aadb_stations.append(test_df['station_name'].iloc[0])

In [22]:
# Daily Evaluation
mae_daily = mean_absolute_error(all_true, all_preds)
rmse_daily = np.sqrt(mean_squared_error(all_true, all_preds))
smape_daily = smape(np.array(all_true), np.array(all_preds))

In [23]:
# AADB Evaluation
aadb_df = pd.DataFrame({
    'station': aadb_stations,
    'actual': aadb_actuals,
    'predicted': aadb_preds
})

mae_aadb = mean_absolute_error(aadb_df['actual'], aadb_df['predicted'])
rmse_aadb = np.sqrt(mean_squared_error(aadb_df['actual'], aadb_df['predicted']))
smape_aadb = smape(aadb_df['actual'], aadb_df['predicted'])

In [24]:
# Print results
print("\nLOGO Decision Tree Evaluation:")
print(f"Daily  → MAE: {mae_daily:.2f}, RMSE: {rmse_daily:.2f}, SMAPE: {smape_daily:.2f}%")
print(f"AADB   → MAE: {mae_aadb:.2f}, RMSE: {rmse_aadb:.2f}, SMAPE: {smape_aadb:.2f}%")


LOGO Decision Tree Evaluation:
Daily  → MAE: 1908.66, RMSE: 2864.92, SMAPE: 53.04%
AADB   → MAE: 1480.34, RMSE: 1880.08, SMAPE: 42.02%


## Random Forest

In [25]:
# Use full feature list
non_features = ['station_name', 'date', 'year', 'cycling_volume', 'log_cycling_volume', 'num_motor_sources']
features = [col for col in df_filtered.columns if col not in non_features]

X = df_filtered[features]
y = df_filtered['log_cycling_volume']

In [26]:
# RFECV with Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

rfecv = RFECV(
    estimator=rf_model,
    step=1,
    cv=KFold(n_splits=5),
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1
)

rfecv.fit(X, y)

# Selected features
selected_features = X.columns[rfecv.support_]
print("\nSelected features by RFE for Random Forest:")
print(selected_features.tolist())


Selected features by RFE for Random Forest:
['latitude', 'longitude', 'maxspeed_near_station', 'shops_within_0km', 'shops_within_1km', 'shops_within_2km', 'shops_within_5km', 'hotels_within_0km', 'hotels_within_1km', 'hotels_within_2km', 'hotels_within_5km', 'education_within_0km', 'education_within_1km', 'education_within_2km', 'education_within_5km', 'hospitals_within_0km', 'hospitals_within_1km', 'hospitals_within_2km', 'hospitals_within_5km', 'cemeteries', 'farming', 'forests', 'horticulture', 'industry', 'parks', 'private_gardening', 'residential', 'traffic', 'waterways', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wspd', 'wpgt', 'pres', 'tsun', 'wdir', 'public_holiday', 'avg_motor_volume', 'avg_motor_speed', 'avg_car_speed', 'avg_truck_speed', 'total_motor_cars', 'total_motor_trucks', 'city_avg_motor_speed', 'city_avg_car_speed', 'city_avg_truck_speed', 'city_total_motor_vehicles', 'city_total_cars', 'city_total_trucks', 'day_of_week', 'is_weekend', 'month', 'bicycle_lane_type_none

In [27]:
# Use selected features
X_selected = df_filtered[selected_features]
y = df_filtered['log_cycling_volume']

# Parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'bootstrap': [True, False]
}

# Grid search with cross-validation
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42, n_jobs=-1),
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=KFold(n_splits=5),
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_selected, y)

print("\nBest hyperparameters for Random Forest:")
print(grid_search.best_params_)

# Store best model
best_rf_model = grid_search.best_estimator_

Fitting 5 folds for each of 24 candidates, totalling 120 fits

Best hyperparameters for Random Forest:
{'bootstrap': True, 'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}


In [28]:
# Define groups
groups = df_filtered['station_name']
logo = LeaveOneGroupOut()

# Storage
all_preds, all_true = [], []
aadb_preds, aadb_actuals, aadb_stations = [], [], []

In [29]:
# LOGO loop
for train_idx, test_idx in logo.split(df_filtered, groups=groups):
    train_df = df_filtered.iloc[train_idx]
    test_df = df_filtered.iloc[test_idx]

    X_train = train_df[selected_features]
    y_train = train_df['log_cycling_volume']
    X_test = test_df[selected_features]
    y_test_true = test_df['cycling_volume']

    model = RandomForestRegressor(**grid_search.best_params_, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)

    y_pred_log = model.predict(X_test)
    y_pred = np.expm1(y_pred_log)

    all_preds.extend(y_pred)
    all_true.extend(y_test_true.values)

    aadb_preds.append(np.mean(y_pred))
    aadb_actuals.append(np.mean(y_test_true.values))
    aadb_stations.append(test_df['station_name'].iloc[0])

In [30]:
# SMAPE function
def smape(y_true, y_pred):
    return 100 / len(y_true) * np.sum(
        2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred))
    )

# Daily metrics
mae_daily = mean_absolute_error(all_true, all_preds)
rmse_daily = np.sqrt(mean_squared_error(all_true, all_preds))
smape_daily = smape(np.array(all_true), np.array(all_preds))

In [31]:
# AADB metrics
aadb_df = pd.DataFrame({
    'station': aadb_stations,
    'actual': aadb_actuals,
    'predicted': aadb_preds
})

mae_aadb = mean_absolute_error(aadb_df['actual'], aadb_df['predicted'])
rmse_aadb = np.sqrt(mean_squared_error(aadb_df['actual'], aadb_df['predicted']))
smape_aadb = smape(aadb_df['actual'], aadb_df['predicted'])

In [32]:
# Output
print("\nLOGO Random Forest Evaluation:")
print(f"Daily  → MAE: {mae_daily:.2f}, RMSE: {rmse_daily:.2f}, SMAPE: {smape_daily:.2f}%")
print(f"AADB   → MAE: {mae_aadb:.2f}, RMSE: {rmse_aadb:.2f}, SMAPE: {smape_aadb:.2f}%")


LOGO Random Forest Evaluation:
Daily  → MAE: 1955.11, RMSE: 2857.10, SMAPE: 54.97%
AADB   → MAE: 1747.95, RMSE: 2277.25, SMAPE: 48.37%


## Gradient Boosting

In [33]:
# Full feature list
non_features = ['station_name', 'date', 'year', 'cycling_volume', 'log_cycling_volume', 'num_motor_sources']
features = [col for col in df_filtered.columns if col not in non_features]

X = df_filtered[features]
y = df_filtered['log_cycling_volume']

In [34]:
# RFECV with Gradient Boosting
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)

rfecv = RFECV(
    estimator=gb_model,
    step=1,
    cv=KFold(n_splits=5),
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1
)

rfecv.fit(X, y)

selected_features = X.columns[rfecv.support_]
print("\nSelected features by RFE for Gradient Boosting:")
print(selected_features.tolist())

Fitting estimator with 61 features.
Fitting estimator with 60 features.
Fitting estimator with 59 features.
Fitting estimator with 58 features.
Fitting estimator with 57 features.
Fitting estimator with 56 features.
Fitting estimator with 55 features.
Fitting estimator with 54 features.
Fitting estimator with 53 features.
Fitting estimator with 52 features.
Fitting estimator with 51 features.
Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 fe

In [35]:
# Use selected features
X_selected = df_filtered[selected_features]
y = df_filtered['log_cycling_volume']

# Parameter grid for Gradient Boosting
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 3]
}

# Grid search with cross-validation
grid_search = GridSearchCV(
    GradientBoostingRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=KFold(n_splits=5),
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_selected, y)

print("\nBest hyperparameters for Gradient Boosting:")
print(grid_search.best_params_)

# Save best model
best_gb_model = grid_search.best_estimator_

Fitting 5 folds for each of 48 candidates, totalling 240 fits

Best hyperparameters for Gradient Boosting:
{'learning_rate': 0.05, 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}


In [36]:
# Define groups
groups = df_filtered['station_name']
logo = LeaveOneGroupOut()

In [37]:
# Storage
all_preds, all_true = [], []
aadb_preds, aadb_actuals, aadb_stations = [], [], []

# LOGO Loop
for train_idx, test_idx in logo.split(df_filtered, groups=groups):
    train_df = df_filtered.iloc[train_idx]
    test_df = df_filtered.iloc[test_idx]

    X_train = train_df[selected_features]
    y_train = train_df['log_cycling_volume']
    X_test = test_df[selected_features]
    y_test_true = test_df['cycling_volume']

    gb_model = GradientBoostingRegressor(**grid_search.best_params_, random_state=42)
    gb_model.fit(X_train, y_train)
    y_pred_log = gb_model.predict(X_test)
    y_pred = np.expm1(y_pred_log)

    all_preds.extend(y_pred)
    all_true.extend(y_test_true.values)

    aadb_preds.append(np.mean(y_pred))
    aadb_actuals.append(np.mean(y_test_true.values))
    aadb_stations.append(test_df['station_name'].iloc[0])

In [38]:
# SMAPE
def smape(y_true, y_pred):
    return 100 / len(y_true) * np.sum(
        2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred))
    )

# Daily Metrics
mae_daily = mean_absolute_error(all_true, all_preds)
rmse_daily = np.sqrt(mean_squared_error(all_true, all_preds))
smape_daily = smape(np.array(all_true), np.array(all_preds))

In [39]:
# AADB Metrics
aadb_df = pd.DataFrame({
    'station': aadb_stations,
    'actual': aadb_actuals,
    'predicted': aadb_preds
})

mae_aadb = mean_absolute_error(aadb_df['actual'], aadb_df['predicted'])
rmse_aadb = np.sqrt(mean_squared_error(aadb_df['actual'], aadb_df['predicted']))
smape_aadb = smape(aadb_df['actual'], aadb_df['predicted'])

In [40]:
# Output
print("\nLOGO Gradient Boosting Evaluation:")
print(f"Daily  → MAE: {mae_daily:.2f}, RMSE: {rmse_daily:.2f}, SMAPE: {smape_daily:.2f}%")
print(f"AADB   → MAE: {mae_aadb:.2f}, RMSE: {rmse_aadb:.2f}, SMAPE: {smape_aadb:.2f}%")


LOGO Gradient Boosting Evaluation:
Daily  → MAE: 1496.72, RMSE: 2294.93, SMAPE: 49.86%
AADB   → MAE: 1294.61, RMSE: 1796.31, SMAPE: 43.21%


## XGBoost

In [41]:
# Define full feature list
non_features = ['station_name', 'date', 'year', 'cycling_volume', 'log_cycling_volume', 'num_motor_sources']
features = [col for col in df_filtered.columns if col not in non_features]

X = df_filtered[features]
y = df_filtered['log_cycling_volume']

In [42]:
# Base model
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=-1)

# Sequential Forward Selection
sfs = SequentialFeatureSelector(
    xgb_model,
    n_features_to_select='auto',
    direction='forward',
    scoring='neg_mean_absolute_error',
    cv=KFold(n_splits=5),
    n_jobs=-1
)

sfs.fit(X, y)

# Selected features
selected_features = X.columns[sfs.get_support()]
print("\nSelected features by SFS for XGBoost:")
print(selected_features.tolist())


Selected features by SFS for XGBoost:
['shops_within_0km', 'hotels_within_0km', 'education_within_2km', 'hospitals_within_0km', 'farming', 'horticulture', 'industry', 'parks', 'tavg', 'tmin', 'prcp', 'snow', 'wspd', 'wpgt', 'pres', 'tsun', 'wdir', 'avg_motor_volume', 'avg_truck_speed', 'total_motor_cars', 'total_motor_trucks', 'city_avg_truck_speed', 'city_total_motor_vehicles', 'city_total_cars', 'day_of_week', 'is_weekend', 'month', 'bicycle_lane_type_separate', 'bicycle_lane_type_sidepath', 'bicycle_lane_type_track']


In [43]:
# Use selected features
X_selected = df_filtered[selected_features]
y = df_filtered['log_cycling_volume']

# Parameter grid for XGBoost
param_grid = {
    'learning_rate': [0.01, 0.1],
    'max_depth': [5, 10],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'min_child_weight': [1, 5],
    'gamma': [0, 1]
}

# Grid search with cross-validation
grid_search = GridSearchCV(
    XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=-1),
    param_grid,
    scoring='neg_mean_absolute_error',
    cv=KFold(n_splits=5),
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_selected, y)

print("\nBest hyperparameters for XGBoost:")
print(grid_search.best_params_)

# Store best model
best_xgb_model = grid_search.best_estimator_

Fitting 5 folds for each of 64 candidates, totalling 320 fits

Best hyperparameters for XGBoost:
{'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 1.0}


In [44]:
# Define groups
groups = df_filtered['station_name']
logo = LeaveOneGroupOut()

In [45]:
# Storage
all_preds, all_true = [], []
aadb_preds, aadb_actuals, aadb_stations = [], [], []

# LOGO Loop
for train_idx, test_idx in logo.split(df_filtered, groups=groups):
    train_df = df_filtered.iloc[train_idx]
    test_df = df_filtered.iloc[test_idx]

    X_train = train_df[selected_features]
    y_train = train_df['log_cycling_volume']
    X_test = test_df[selected_features]
    y_test_true = test_df['cycling_volume']

    xgb_model = XGBRegressor(**grid_search.best_params_, objective='reg:squarederror', random_state=42, n_jobs=-1)
    xgb_model.fit(X_train, y_train)
    y_pred_log = xgb_model.predict(X_test)
    y_pred = np.expm1(y_pred_log)

    all_preds.extend(y_pred)
    all_true.extend(y_test_true.values)

    aadb_preds.append(np.mean(y_pred))
    aadb_actuals.append(np.mean(y_test_true.values))
    aadb_stations.append(test_df['station_name'].iloc[0])

In [46]:
# SMAPE
def smape(y_true, y_pred):
    return 100 / len(y_true) * np.sum(
        2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred))
    )

# Daily metrics
mae_daily = mean_absolute_error(all_true, all_preds)
rmse_daily = np.sqrt(mean_squared_error(all_true, all_preds))
smape_daily = smape(np.array(all_true), np.array(all_preds))

In [47]:
# AADB metrics
aadb_df = pd.DataFrame({
    'station': aadb_stations,
    'actual': aadb_actuals,
    'predicted': aadb_preds
})

mae_aadb = mean_absolute_error(aadb_df['actual'], aadb_df['predicted'])
rmse_aadb = np.sqrt(mean_squared_error(aadb_df['actual'], aadb_df['predicted']))
smape_aadb = smape(aadb_df['actual'], aadb_df['predicted'])

In [48]:
# Output
print("\nLOGO XGBoost Evaluation:")
print(f"Daily  → MAE: {mae_daily:.2f}, RMSE: {rmse_daily:.2f}, SMAPE: {smape_daily:.2f}%")
print(f"AADB   → MAE: {mae_aadb:.2f}, RMSE: {rmse_aadb:.2f}, SMAPE: {smape_aadb:.2f}%")


LOGO XGBoost Evaluation:
Daily  → MAE: 1366.49, RMSE: 2048.08, SMAPE: 45.01%
AADB   → MAE: 1152.88, RMSE: 1465.96, SMAPE: 38.69%


## Shallow Neural Network

In [11]:
# Define full feature list
non_features = ['station_name', 'date', 'year', 'cycling_volume', 'log_cycling_volume', 'num_motor_sources']
features = [col for col in df_filtered.columns if col not in non_features]

X = df_filtered[features]
y = df_filtered['log_cycling_volume']

In [12]:
# Pipeline: scale → SelectKBest
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('select_k_best', SelectKBest(score_func=f_regression, k=31))  # You can adjust k here
])

pipeline.fit(X, y)

# Get selected features
mask = pipeline.named_steps['select_k_best'].get_support()
selected_features = X.columns[mask]

print("\nSelected features by SelectKBest for Shallow Neural Network:")
print(selected_features.tolist())


Selected features by SelectKBest for Shallow Neural Network:
['latitude', 'maxspeed_near_station', 'shops_within_0km', 'shops_within_1km', 'shops_within_2km', 'shops_within_5km', 'hotels_within_1km', 'hotels_within_2km', 'hotels_within_5km', 'education_within_0km', 'education_within_1km', 'education_within_2km', 'education_within_5km', 'forests', 'parks', 'private_gardening', 'tavg', 'tmin', 'tmax', 'tsun', 'avg_motor_volume', 'avg_motor_speed', 'avg_car_speed', 'total_motor_cars', 'total_motor_trucks', 'city_avg_motor_speed', 'city_avg_truck_speed', 'city_total_trucks', 'day_of_week', 'is_weekend', 'distance_to_center_km']


In [13]:
# Use the selected features
X_selected = df_filtered[selected_features]
y = df_filtered['log_cycling_volume']

# Scale inputs
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_selected)

# Parameter grid for SNN
param_grid = {
    'hidden_layer_sizes': [(10,), (50,), (100,)],
    'activation': ['relu', 'tanh'],
    'learning_rate_init': [0.001, 0.01]
}

mlp = MLPRegressor(max_iter=500, random_state=42)

# Grid search with cross-validation
grid_search = GridSearchCV(
    mlp,
    param_grid,
    scoring='neg_mean_absolute_error',
    cv=KFold(n_splits=5),
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_scaled, y)

print("\nBest hyperparameters for Shallow Neural Network:")
print(grid_search.best_params_)

# Store config
best_snn_params = grid_search.best_params_

Fitting 5 folds for each of 12 candidates, totalling 60 fits

Best hyperparameters for Shallow Neural Network:
{'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate_init': 0.001}


In [14]:
# Define Groups
groups = df_filtered['station_name']
logo = LeaveOneGroupOut()

In [15]:
# Storage
all_preds, all_true = [], []
aadb_preds, aadb_actuals, aadb_stations = [], [], []

# LOGO Loop
for train_idx, test_idx in logo.split(df_filtered, groups=groups):
    train_df = df_filtered.iloc[train_idx]
    test_df = df_filtered.iloc[test_idx]

    X_train = train_df[selected_features]
    y_train = train_df['log_cycling_volume']
    X_test = test_df[selected_features]
    y_test_true = test_df['cycling_volume']

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    mlp_model = MLPRegressor(**best_snn_params, max_iter=500, random_state=42)
    mlp_model.fit(X_train_scaled, y_train)
    y_pred_log = mlp_model.predict(X_test_scaled)
    y_pred = np.expm1(y_pred_log)

    all_preds.extend(y_pred)
    all_true.extend(y_test_true.values)

    aadb_preds.append(np.mean(y_pred))
    aadb_actuals.append(np.mean(y_test_true.values))
    aadb_stations.append(test_df['station_name'].iloc[0])

In [16]:
# SMAPE
def smape(y_true, y_pred):
    return 100 / len(y_true) * np.sum(
        2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred))
    )

# Daily Metrics
mae_daily = mean_absolute_error(all_true, all_preds)
rmse_daily = np.sqrt(mean_squared_error(all_true, all_preds))
smape_daily = smape(np.array(all_true), np.array(all_preds))

In [17]:
# AADB Metrics
aadb_df = pd.DataFrame({
    'station': aadb_stations,
    'actual': aadb_actuals,
    'predicted': aadb_preds
})

mae_aadb = mean_absolute_error(aadb_df['actual'], aadb_df['predicted'])
rmse_aadb = np.sqrt(mean_squared_error(aadb_df['actual'], aadb_df['predicted']))
smape_aadb = smape(aadb_df['actual'], aadb_df['predicted'])

In [18]:
# Output
print("\nLOGO Shallow Neural Network Evaluation:")
print(f"Daily  → MAE: {mae_daily:.2f}, RMSE: {rmse_daily:.2f}, SMAPE: {smape_daily:.2f}%")
print(f"AADB   → MAE: {mae_aadb:.2f}, RMSE: {rmse_aadb:.2f}, SMAPE: {smape_aadb:.2f}%")


LOGO Shallow Neural Network Evaluation:
Daily  → MAE: 11670.52, RMSE: 39299.67, SMAPE: 112.44%
AADB   → MAE: 11562.97, RMSE: 24072.61, SMAPE: 106.15%


## Deep Neural Network

In [11]:
# Define features
non_features = ['station_name', 'date', 'year', 'cycling_volume', 'log_cycling_volume', 'num_motor_sources']
features = [col for col in df_filtered.columns if col not in non_features]

X = df_filtered[features]
y = df_filtered['log_cycling_volume']

In [12]:
# Build pipeline for scaling + PCA + MLP
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=20)),
    ('mlp', MLPRegressor(max_iter=500, random_state=42))
])

# Parameter grid for SNN
param_grid = {
    'mlp__hidden_layer_sizes': [(50, 50), (100, 50), (100, 100)],
    'mlp__activation': ['relu', 'tanh'],
    'mlp__learning_rate_init': [0.001, 0.01]
}

# Grid search with cross-validation
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=KFold(n_splits=5),
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X, y)

print("\nBest hyperparameters for Deep Neural Network:")
print(grid_search.best_params_)

# Store best model
best_dnn_pipeline = grid_search.best_estimator_

Fitting 5 folds for each of 12 candidates, totalling 60 fits

Best hyperparameters for Deep Neural Network:
{'mlp__activation': 'tanh', 'mlp__hidden_layer_sizes': (100, 100), 'mlp__learning_rate_init': 0.01}


In [13]:
# Define Groups
groups = df_filtered['station_name']
logo = LeaveOneGroupOut()

In [14]:
# Storage
all_preds, all_true = [], []
aadb_preds, aadb_actuals, aadb_stations = [], [], []

# LOGO Loop
for train_idx, test_idx in logo.split(df_filtered, groups=groups):
    train_df = df_filtered.iloc[train_idx]
    test_df = df_filtered.iloc[test_idx]

    X_train = train_df[features]
    y_train = train_df['log_cycling_volume']
    X_test = test_df[features]
    y_test_true = test_df['cycling_volume']

    # Rebuild same pipeline with best params
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=20)),
        ('mlp', MLPRegressor(
            hidden_layer_sizes=grid_search.best_params_['mlp__hidden_layer_sizes'],
            activation=grid_search.best_params_['mlp__activation'],
            learning_rate_init=grid_search.best_params_['mlp__learning_rate_init'],
            max_iter=500,
            random_state=42
        ))
    ])

    pipeline.fit(X_train, y_train)
    y_pred_log = pipeline.predict(X_test)
    y_pred = np.expm1(y_pred_log)

    all_preds.extend(y_pred)
    all_true.extend(y_test_true.values)

    aadb_preds.append(np.mean(y_pred))
    aadb_actuals.append(np.mean(y_test_true.values))
    aadb_stations.append(test_df['station_name'].iloc[0])

In [15]:
# SMAPE
def smape(y_true, y_pred):
    return 100 / len(y_true) * np.sum(
        2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred))
    )

# Daily Metrics
mae_daily = mean_absolute_error(all_true, all_preds)
rmse_daily = np.sqrt(mean_squared_error(all_true, all_preds))
smape_daily = smape(np.array(all_true), np.array(all_preds))

In [16]:
# AADB Metrics
aadb_df = pd.DataFrame({
    'station': aadb_stations,
    'actual': aadb_actuals,
    'predicted': aadb_preds
})

mae_aadb = mean_absolute_error(aadb_df['actual'], aadb_df['predicted'])
rmse_aadb = np.sqrt(mean_squared_error(aadb_df['actual'], aadb_df['predicted']))
smape_aadb = smape(aadb_df['actual'], aadb_df['predicted'])

In [17]:
# Output
print("\nLOGO Deep Neural Network Evaluation:")
print(f"Daily  → MAE: {mae_daily:.2f}, RMSE: {rmse_daily:.2f}, SMAPE: {smape_daily:.2f}%")
print(f"AADB   → MAE: {mae_aadb:.2f}, RMSE: {rmse_aadb:.2f}, SMAPE: {smape_aadb:.2f}%")


LOGO Deep Neural Network Evaluation:
Daily  → MAE: 2520.83, RMSE: 3598.29, SMAPE: 83.98%
AADB   → MAE: 2342.05, RMSE: 2952.58, SMAPE: 76.10%


## LSTM Model

In [18]:
# Config
n_steps = 30
epochs = 30
batch_size = 32
lstm_units = 64
dropout_rate = 0.2

In [19]:
# Features
non_features = ['station_name', 'date', 'year', 'cycling_volume', 'log_cycling_volume', 'num_motor_sources']
features = [col for col in df_filtered.columns if col not in non_features]
groups = df_filtered['station_name']

# SMAPE
def smape(y_true, y_pred):
    return 100 / len(y_true) * np.sum(
        2 * np.abs(y_true - y_pred) / (np.abs(y_true) + np.abs(y_pred))
    )

In [None]:
# Storage
all_preds, all_actuals = [], []
aadb_preds, aadb_actuals, aadb_stations = [], [], []

logo = LeaveOneGroupOut()

for train_idx, test_idx in logo.split(df_filtered, groups=groups):
    train_df = df_filtered.iloc[train_idx].sort_values('date')
    test_df = df_filtered.iloc[test_idx].sort_values('date')

    # Scale
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(train_df[features])
    X_test_scaled = scaler.transform(test_df[features])

    y_train = train_df['log_cycling_volume'].values
    y_test_true = test_df['cycling_volume'].values
    y_test_log = test_df['log_cycling_volume'].values

    # Build sequences
    def create_sequences(X, y, n_steps):
        X_seq, y_seq = [], []
        for i in range(n_steps, len(X)):
            X_seq.append(X[i - n_steps:i])
            y_seq.append(y[i])
        return np.array(X_seq), np.array(y_seq)

    X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train, n_steps)
    X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test_log, n_steps)
    y_test_true_seq = y_test_true[n_steps:]

    if len(X_test_seq) == 0:
        continue

    # Build model
    tf.keras.backend.clear_session()
    model = Sequential()
    model.add(LSTM(lstm_units, input_shape=(n_steps, len(features))))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1))
    model.compile(optimizer=Adam(), loss='mse')

    model.fit(X_train_seq, y_train_seq, epochs=epochs, batch_size=batch_size, verbose=0)

    # Predict
    y_pred_log = model.predict(X_test_seq).flatten()
    y_pred = np.expm1(y_pred_log)

    all_preds.extend(y_pred)
    all_actuals.extend(y_test_true_seq)

    aadb_preds.append(np.mean(y_pred))
    aadb_actuals.append(np.mean(y_test_true_seq))
    aadb_stations.append(test_df['station_name'].iloc[0])

In [20]:
# Daily Metrics
mae_daily = mean_absolute_error(all_actuals, all_preds)
rmse_daily = np.sqrt(mean_squared_error(all_actuals, all_preds))
r2_daily = r2_score(all_actuals, all_preds)
smape_daily = smape(np.array(all_actuals), np.array(all_preds))

In [21]:
# AADB Metrics
aadb_df = pd.DataFrame({
    'station': aadb_stations,
    'actual': aadb_actuals,
    'predicted': aadb_preds
})

mae_aadb = mean_absolute_error(aadb_df['actual'], aadb_df['predicted'])
rmse_aadb = np.sqrt(mean_squared_error(aadb_df['actual'], aadb_df['predicted']))
r2_aadb = r2_score(aadb_df['actual'], aadb_df['predicted'])
smape_aadb = smape(aadb_df['actual'], aadb_df['predicted'])

In [22]:
# Output
print("\nLOGO LSTM Evaluation:")
print(f"Daily  → MAE: {mae_daily:.2f}, RMSE: {rmse_daily:.2f}, SMAPE: {smape_daily:.2f}%")
print(f"AADB   → MAE: {mae_aadb:.2f}, RMSE: {rmse_aadb:.2f}, SMAPE: {smape_aadb:.2f}%")


LOGO LSTM Evaluation:
Daily  → MAE: 2927.27, RMSE: 4131.95, R²: -0.96, SMAPE: 146.65%
AADB   → MAE: 2712.44, RMSE: 3509.52, R²: -1.61, SMAPE: 131.54%
