## XGBoost regressor for 1-h wind-speed forecast

In [1]:
import numpy as np
import pandas as pd


In [2]:
def make_lags(df, cols, lags):
    """Build a frame of lagged copies of selected columns.

    Each output column is ``df[col].shift(lag)`` and is named
    ``f"{col}_L{lag}"``. Columns are ordered by ``cols`` first, then by
    ``lags``. The result shares ``df``'s index, so it can be joined back
    onto ``df`` directly.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame containing every name listed in ``cols``.
    cols : iterable of str
        Column names to lag.
    lags : iterable of int
        Row offsets passed to ``Series.shift``.

    Returns
    -------
    pandas.DataFrame
        One column per (col, lag) pair; an empty frame with ``df``'s index
        when there are no pairs.
    """
    # Materialize both iterables: a one-shot iterator for `lags` would
    # otherwise be exhausted after the first column of the nested loop.
    cols = list(cols)
    lags = list(lags)

    lagged = [
        df[col].shift(lag).rename(f"{col}_L{lag}")
        for col in cols
        for lag in lags
    ]
    if not lagged:
        # pd.concat raises ValueError on an empty list; return an empty,
        # index-aligned frame so a subsequent join is a no-op instead.
        return pd.DataFrame(index=df.index)
    return pd.concat(lagged, axis=1)

In [3]:
# Load the cleaned hourly series, order rows by the 'datetime' column
# (still whatever dtype read_csv inferred at this point), renumber the index
# from 0 and make sure the target column is float.
df = (
    pd.read_csv("../../../data/cleaned/wind_hourly_cleaned.csv")
    .sort_values('datetime')
    .reset_index(drop=True)
    .astype({'wind_speed': float})
)

In [4]:
# Row offsets used for the lag features (the data is hourly, so rows ~ hours).
LAGS = [1, 2, 3, 23, 24, 25, 48]

# Build the lagged wind-speed columns, attach them to the frame, then drop
# every row that contains a missing value (this includes the leading rows
# whose lag history is incomplete).
lag_features = make_lags(df[['wind_speed']], ['wind_speed'], LAGS)
df = df.join(lag_features).dropna()

In [5]:
# Preview the first five rows to check the lag columns were attached.
df.head(n=5)

Unnamed: 0,datetime,wind_speed,wind_speed_L1,wind_speed_L2,wind_speed_L3,wind_speed_L23,wind_speed_L24,wind_speed_L25,wind_speed_L48
48,2024-01-03 00:00:00,0.1,0.8,1.0,0.216667,0.933333,1.05,0.716667,0.1
49,2024-01-03 01:00:00,0.616667,0.1,0.8,1.0,0.7,0.933333,1.05,0.283333
50,2024-01-03 02:00:00,0.5,0.616667,0.1,0.8,1.45,0.7,0.933333,0.2
51,2024-01-03 03:00:00,0.033333,0.5,0.616667,0.1,1.75,1.45,0.7,0.5
52,2024-01-03 04:00:00,0.116667,0.033333,0.5,0.616667,0.083333,1.75,1.45,0.966667


In [6]:
# Parse the 'datetime' column into real timestamps so the .dt accessor works.
df['date'] = pd.to_datetime(df['datetime'])

# Hour of day, plus a sin/cos encoding on a 24-hour circle so that hour 23
# and hour 0 end up next to each other.
df['hour'] = df['date'].dt.hour
hour_angle = 2 * np.pi * df['hour'] / 24
df['sin_h'] = np.sin(hour_angle)
df['cos_h'] = np.cos(hour_angle)

In [7]:
# Hold out the final TEST_ROWS rows as the test block; everything before
# position `train_end` is available for training.
TEST_ROWS = 720
train_end = len(df) - TEST_ROWS
test_df = df.iloc[train_end:].copy()  # copy so prediction columns can be added safely

# Model inputs: every lag column (name contains '_L') plus the cyclical hour terms.
lag_cols = [name for name in df.columns if '_L' in name]
FEATS = lag_cols + ['sin_h', 'cos_h']


In [8]:
from xgboost import XGBRegressor
# Hyper-parameter grid for the search: 2 x 2 x 2 x 2 = 16 combinations.
param = dict(
    n_estimators=[200, 400],
    max_depth=[3, 6],
    learning_rate=[0.05, 0.1],
    subsample=[0.8, 1.0],
)

In [11]:
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

In [14]:
# --- Hyper-parameter search on everything before the hold-out block ---

# Training matrix / target taken from the rows preceding `train_end`.
train_df = df.iloc[:train_end]
X_train_main = train_df[FEATS].to_numpy()
y_train_main = train_df['wind_speed'].to_numpy()

# Time-ordered 5-fold CV, scored by (negated) mean absolute error to match
# the model's absolute-error objective.
tscv = TimeSeriesSplit(n_splits=5)
base_estimator = XGBRegressor(objective='reg:absoluteerror', random_state=42, n_jobs=-1)
gs = GridSearchCV(
    estimator=base_estimator,
    param_grid=param,
    cv=tscv,
    scoring='neg_mean_absolute_error',
    verbose=3,
)
gs.fit(X_train_main, y_train_main)

# Keep the single winning configuration for the walk-forward evaluation.
best_params = gs.best_params_
print(f"Best parameters found: {best_params}")

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END learning_rate=0.05, max_depth=3, n_estimators=200, subsample=0.8;, score=-0.827 total time=   1.0s
[CV 2/5] END learning_rate=0.05, max_depth=3, n_estimators=200, subsample=0.8;, score=-0.732 total time=   1.4s
[CV 3/5] END learning_rate=0.05, max_depth=3, n_estimators=200, subsample=0.8;, score=-0.567 total time=   1.3s
[CV 4/5] END learning_rate=0.05, max_depth=3, n_estimators=200, subsample=0.8;, score=-0.619 total time=   1.9s
[CV 5/5] END learning_rate=0.05, max_depth=3, n_estimators=200, subsample=0.8;, score=-0.706 total time=   2.5s
[CV 1/5] END learning_rate=0.05, max_depth=3, n_estimators=200, subsample=1.0;, score=-0.821 total time=   1.6s
[CV 2/5] END learning_rate=0.05, max_depth=3, n_estimators=200, subsample=1.0;, score=-0.737 total time=   3.6s
[CV 3/5] END learning_rate=0.05, max_depth=3, n_estimators=200, subsample=1.0;, score=-0.571 total time=   2.7s
[CV 4/5] END learning_rate=0.05, max_depth=

In [15]:
# 4. Walk-forward evaluation with a sliding training window: for every
#    hold-out row, refit on the WINDOW_ROWS rows just before it and predict
#    that single row.
WINDOW_ROWS = 168  # one week of hourly rows

final_model = XGBRegressor(objective='reg:absoluteerror', random_state=42, n_jobs=-1, **best_params)

# Extract features and target once as float arrays. Slicing these is cheaper
# than re-slicing the DataFrame on every step, and avoids `df.iloc[t]`, which
# turns a mixed-dtype row into an object-dtype Series.
X_all = df[FEATS].to_numpy(dtype=float)
y_all = df['wind_speed'].to_numpy(dtype=float)

preds = []
for t in range(train_end, len(df)):
    # Clamp at 0: a negative slice start would count from the end of the
    # array and silently select the wrong rows.
    window_start = max(t - WINDOW_ROWS, 0)

    # Refit from scratch on the most recent window (rows strictly before t).
    final_model.fit(X_all[window_start:t], y_all[window_start:t])

    # Predict row t; the [t:t + 1] slice keeps the 2-D shape predict() expects.
    preds.append(final_model.predict(X_all[t:t + 1])[0])

test_df['pred_xgb'] = preds
# The MAE is reported in a later cell, once sklearn.metrics is imported as `m`.

NameError: name 'm' is not defined

In [16]:
from sklearn import metrics as m

In [17]:
# Attach the walk-forward predictions to the hold-out frame and report
# the mean absolute error against the observed wind speed.
test_df['pred_xgb'] = preds
xgb_mae = m.mean_absolute_error(test_df['wind_speed'], test_df['pred_xgb'])
print('XGB MAE :', xgb_mae)

XGB MAE : 0.729171165451666


In [18]:
# 1. Training data: every row before the hold-out block.
train_df = df.iloc[:train_end]
X_train_main = train_df[FEATS].to_numpy()
y_train_main = train_df['wind_speed'].to_numpy()

# 2. Grid search with time-ordered CV.
# NOTE(review): this repeats the earlier search with the same rows, grid and
# seed, so it should pick the same parameters; consider reusing that result
# instead of paying for the fits twice.
tscv = TimeSeriesSplit(n_splits=5)
search_estimator = XGBRegressor(objective='reg:absoluteerror', random_state=42, n_jobs=-1)
gs = GridSearchCV(estimator=search_estimator, param_grid=param, cv=tscv,
                  scoring='neg_mean_absolute_error', verbose=3)
gs.fit(X_train_main, y_train_main)

best_params = gs.best_params_
print(f"Best parameters: {best_params}")

# 3. Walk-forward evaluation with a periodic refit: every RETRAIN_EVERY rows
#    the model is refit on all rows seen so far, then used unchanged to
#    predict the next block of hold-out rows.
RETRAIN_EVERY = 168  # rows between refits (one week of hourly rows)

# A single estimator is refit from scratch at each retrain point. `model` is
# kept as an alias so both names refer to the most recently fitted model.
final_model = XGBRegressor(objective='reg:absoluteerror', random_state=42,
                           n_jobs=-1, **best_params)
model = final_model

# Extract features and target once as float arrays; this avoids building an
# object-dtype row with `df.iloc[t]` for every single prediction.
X_all = df[FEATS].to_numpy(dtype=float)
y_all = df['wind_speed'].to_numpy(dtype=float)
n_rows = len(df)

preds = []
for block_start in range(train_end, n_rows, RETRAIN_EVERY):
    # Refit on all history strictly before this block.
    model.fit(X_all[:block_start], y_all[:block_start])

    # The model does not change until the next refit, so the whole block can
    # be predicted in one call. Each row's features are still its own
    # observed lags, exactly as in a row-by-row loop.
    block_stop = min(block_start + RETRAIN_EVERY, n_rows)
    preds.extend(model.predict(X_all[block_start:block_stop]))

test_df['pred_xgb'] = preds
print("XGB MAE:", m.mean_absolute_error(test_df['wind_speed'], test_df['pred_xgb']))


Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END learning_rate=0.05, max_depth=3, n_estimators=200, subsample=0.8;, score=-0.827 total time=   0.9s
[CV 2/5] END learning_rate=0.05, max_depth=3, n_estimators=200, subsample=0.8;, score=-0.732 total time=   1.2s
[CV 3/5] END learning_rate=0.05, max_depth=3, n_estimators=200, subsample=0.8;, score=-0.567 total time=   1.3s
[CV 4/5] END learning_rate=0.05, max_depth=3, n_estimators=200, subsample=0.8;, score=-0.619 total time=   1.3s
[CV 5/5] END learning_rate=0.05, max_depth=3, n_estimators=200, subsample=0.8;, score=-0.706 total time=   1.6s
[CV 1/5] END learning_rate=0.05, max_depth=3, n_estimators=200, subsample=1.0;, score=-0.821 total time=   1.0s
[CV 2/5] END learning_rate=0.05, max_depth=3, n_estimators=200, subsample=1.0;, score=-0.737 total time=   1.3s
[CV 3/5] END learning_rate=0.05, max_depth=3, n_estimators=200, subsample=1.0;, score=-0.571 total time=   1.1s
[CV 4/5] END learning_rate=0.05, max_depth=