In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

In [2]:
path = r"C:\Users\yanzh\Desktop\code_and_data\4. Deep learning part\处理数据\2015-2019 total trips.csv"
data = pd.read_csv(path)
data ['date'] = pd.to_datetime(data['date'])
data = data['total_trips']

num_windows = 5
window_size = math.ceil(len(data) // (num_windows - 1.2))
step_size = math.ceil(window_size * 0.7)

windows = []
train_dataset = []
validation_dataset = []
start = 0

while start + step_size <= len(data):
    end = start + window_size
    window_data = data[start:end]
    train_val_split = int(len(window_data) * 0.9)
    train_data = window_data[:train_val_split]
    validation_data = window_data[train_val_split:]

    windows.append(window_data,)
    train_dataset.append(train_data)
    validation_dataset.append(validation_data)

    start += step_size

In [3]:
total_mae_losses = []
total_mape_losses = []
total_rmse_losses = []
total_r2_scores = []

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [3, 6, 9, 12],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
}

best_params = None
best_score = float('inf')

# Prepare feature and target data for grid search
X_train_all = []
y_train_all = []
for train_data in train_dataset:
    X_train_all.append(np.arange(len(train_data)).reshape(-1, 1))
    y_train_all.append(train_data.values)

X_train_all = np.vstack(X_train_all)
y_train_all = np.concatenate(y_train_all)

xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='neg_mean_absolute_error')
grid_search.fit(X_train_all, y_train_all)

best_params = grid_search.best_params_

print(f'Best parameters found: {best_params}')

# Fit the XGBoost model with best parameters
for i in range(5):
    # Prepare training data
    X_train = np.arange(len(train_dataset[i])).reshape(-1, 1)
    y_train = train_dataset[i].values

    # Prepare validation data
    X_valid = np.arange(len(train_dataset[i]), len(train_dataset[i]) + len(validation_dataset[i])).reshape(-1, 1)
    y_valid = validation_dataset[i].values

    # Train the model with best parameters
    model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, **best_params)
    model.fit(X_train, y_train)

    # Make predictions
    forecast = model.predict(X_valid)
    forecast = np.ceil(forecast).astype(int)

    mae_losses = []
    mape_losses = []
    rmse_losses = []
    all_predicted = []
    all_actual = []

    for idx, forecast_value in enumerate(forecast):
        actual_value = y_valid[idx]
        all_actual.append(actual_value)
        all_predicted.append(forecast_value)

        # Calculate MAE
        mae_loss = np.abs(actual_value - forecast_value)
        mae_losses.append(mae_loss)

        # Calculate MAPE
        mape_loss = np.abs((actual_value - forecast_value) / actual_value)
        mape_losses.append(mape_loss)

        # Calculate RMSE
        rmse_loss = np.square(actual_value - forecast_value)
        rmse_losses.append(rmse_loss)
    
    # Convert lists to numpy arrays for R-squared calculation
    all_predicted = np.array(all_predicted)
    all_actual = np.array(all_actual)

    # Calculate R-squared
    ss_res = np.sum((all_actual - all_predicted) ** 2)
    ss_tot = np.sum((all_actual - np.mean(all_actual)) ** 2)
    r2 = 1 - (ss_res / ss_tot)

    total_mae_losses.append(np.mean(mae_losses))
    total_mape_losses.append(np.mean(mape_losses) * 100)
    total_rmse_losses.append(np.sqrt(np.mean(rmse_losses)))
    total_r2_scores.append(r2)

# Calculate average of metrics across all windows
average_mae_loss = np.mean(total_mae_losses)
average_mape_loss = np.mean(total_mape_losses)
average_rmse_loss = np.mean(total_rmse_losses)
average_r2_score = np.mean(total_r2_scores)

print(f'Average MAE on Test Set: {average_mae_loss:.4f}')
print(f'Average MAPE on Test Set: {average_mape_loss:.4f}')
print(f'Average RMSE on Test Set: {average_rmse_loss:.4f}')
print(f'R^2 Score on Test Set: {average_r2_score:.4f}')

Best parameters found: {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1.0}
Average MAE on Test Set: 4577.8698
Average MAPE on Test Set: 169.4144
Average RMSE on Test Set: 5276.3183
R^2 Score on Test Set: -0.0358
