In [1]:
import numpy as np
import pandas as pd
import optuna
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error

# Supress Optuna's huge log output to keep your console clean
optuna.logging.set_verbosity(optuna.logging.WARNING)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def make_lags(df, cols, lags):
    return pd.concat([df[col].shift(lag).rename(f"{col}_L{lag}") for col in cols for lag in lags], axis=1)

In [3]:
df = pd.read_csv("../../../data/cleaned/pincher_station_hourly_wind_cleaned.csv")


In [4]:
df.head()

Unnamed: 0,timestamp,temp_c,rel_humidity,wind_speed_kmh,pressure_kpa,wind_dir_deg,timestamp_rounded,u,v,pressure_24h
0,2011-06-27 14:00:00,7.9,85.0,9.0,88.14,320.0,2011-06-27 14:00:00,5.785088,-6.8944,88.14
1,2011-06-27 15:00:00,11.5,74.0,8.0,88.11,320.0,2011-06-27 15:00:00,5.142301,-6.128356,88.125
2,2011-06-27 16:00:00,15.4,61.0,5.0,88.04,320.0,2011-06-27 16:00:00,3.213938,-3.830222,88.096667
3,2011-06-27 17:00:00,17.3,48.0,8.0,87.99,40.0,2011-06-27 17:00:00,-5.142301,-6.128356,88.07
4,2011-06-27 18:00:00,18.4,47.0,15.0,87.93,120.0,2011-06-27 18:00:00,-12.990381,7.5,88.042


In [5]:
df.columns

Index(['timestamp', 'temp_c', 'rel_humidity', 'wind_speed_kmh', 'pressure_kpa',
       'wind_dir_deg', 'timestamp_rounded', 'u', 'v', 'pressure_24h'],
      dtype='object')

In [6]:
# Rename only the target columns
rename_map = {
    'timestamp':'datetime',
    'temp_c': 'temperature',
    'rel_humidity': 'humidity',
    'wind_speed_kmh': 'wind_speed',
    'pressure_kpa': 'pressure',
    'wind_dir_deg': 'wind_direction'
}

df = df.rename(columns=rename_map)



In [7]:
df = df.sort_values('datetime').reset_index(drop=True)
df['wind_speed'] = df['wind_speed'].astype(float)

In [9]:
df = df[['datetime', 'pressure', 'temperature', 'humidity', 'wind_speed', 'u', 'v']]

In [10]:
df.head()

Unnamed: 0,datetime,pressure,temperature,humidity,wind_speed,u,v
0,2011-06-27 14:00:00,88.14,7.9,85.0,9.0,5.785088,-6.8944
1,2011-06-27 15:00:00,88.11,11.5,74.0,8.0,5.142301,-6.128356
2,2011-06-27 16:00:00,88.04,15.4,61.0,5.0,3.213938,-3.830222
3,2011-06-27 17:00:00,87.99,17.3,48.0,8.0,-5.142301,-6.128356
4,2011-06-27 18:00:00,87.93,18.4,47.0,15.0,-12.990381,7.5


In [11]:
# ensure the column is datetime *type*
df['datetime'] = pd.to_datetime(df['datetime'])   # or df['datetime'] if that's the name

# now extract hour
df['hour'] = df['datetime'].dt.hour
df['sin_h'] = np.sin(2 * np.pi * df['hour'] / 24)
df['cos_h'] = np.cos(2 * np.pi * df['hour'] / 24)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['datetime'] = pd.to_datetime(df['datetime'])   # or df['datetime'] if that's the name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hour'] = df['datetime'].dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sin_h'] = np.sin(2 * np.pi * df['hour'] / 24)


In [12]:
# --- 1. Define the Objective Function ---
def objective(trial, X, y):
    """
    Optuna will run this function multiple times with different hyperparams 
    to minimize the return value (MAE).
    """
    
    # Define the search space (Bayesian distributions)
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 800),
        'max_depth': trial.suggest_int('max_depth', 3, 8), # Keep low for small data
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        # Fixed params
        'objective': 'reg:absoluteerror',
        'n_jobs': -1,
        'random_state': 42
    }

    # Inner Cross-Validation (TimeSeriesSplit)
    # We use this to ensure the params don't overfit
    tscv = TimeSeriesSplit(n_splits=3)
    scores = []

    for train_index, val_index in tscv.split(X):
        X_train_cv, X_val_cv = X[train_index], X[val_index]
        y_train_cv, y_val_cv = y[train_index], y[val_index]

        model = xgb.XGBRegressor(**params)
        model.fit(X_train_cv, y_train_cv)
        
        preds = model.predict(X_val_cv)
        mae = mean_absolute_error(y_val_cv, preds)
        scores.append(mae)

    # Return the average MAE of the folds
    return np.mean(scores)


In [13]:
# --- 2. Integration into Your Rolling Loop ---

# 1. REVISED LAGS (Option 2)
# Keep recent history (1,2,3) and the exact time yesterday (24). 
# Drop the noisy lags (23, 25, 48).
LAGS = [1, 2, 3, 24] 
cols_to_lag = ['pressure', 'temperature', 'wind_speed', 'u', 'v']

# 2. Apply Lags
df_lags = make_lags(df, cols_to_lag, LAGS)
df_final = pd.concat([df, df_lags], axis=1)

# 3. Add Rolling Features (Trend & Volatility)
# Calculated on the CURRENT available data (at time t)
# This captures the "average weather of the last day" and "gustiness"
df_final['wind_rolling_24_mean'] = df_final['wind_speed'].rolling(window=24).mean()
df_final['wind_rolling_24_std'] = df_final['wind_speed'].rolling(window=24).std()

# 4. Target Creation & Cleanup
df_final['target_wind_speed'] = df_final['wind_speed'].shift(-1)
df_final.dropna(inplace=True)
X = df_final.drop(['datetime', 'target_wind_speed'], axis=1)
y = df_final['target_wind_speed']


# 3. train/test split (last 720 rows locked)
train_end = len(df_final) - 720
test_df = df_final.iloc[train_end:].copy()

current_obs = ['pressure', 'temperature', 'u', 'v', 'wind_speed']
FEATS = [c for c in df_final.columns if '_L' in c] + ['sin_h', 'cos_h'] + current_obs



In [14]:
RETRAIN_INTERVAL = 168 # hours (weekly)
TUNING_WINDOW_HOURS = 2160 # Approx 3 months for tuning
N_ITER_RANDOM_SEARCH = 15 # Number of param combinations to try (faster)
N_SPLITS_INNER_CV = 3 # Fewer splits for speed during tuning


In [15]:
print("Starting rolling forecast with Bayesian Optimization...")

preds = []
model = None
current_best_params = {}

# Number of Bayesian trials to run per re-tuning (Faster than Random Search)
N_TRIALS = 20 

for t in range(train_end, len(df_final)):

    # Check if it's time to retrain and re-tune
    if (t - train_end) % RETRAIN_INTERVAL == 0:
        print(f"\n--- [Step {t}] Retraining & Bayesian Tuning ---")
        
        # --- A. RE-TUNING (Bayesian Optimization) ---
        tune_start_idx = max(0, t - TUNING_WINDOW_HOURS)
        tune_df = df_final.iloc[tune_start_idx:t]
        
        X_tune = tune_df[FEATS].values
        y_tune = tune_df['target_wind_speed'].values
        
        # Create the Study
        # direction='minimize' because we want to lower MAE
        study = optuna.create_study(direction='minimize')
        
        # Run optimization
        # We use a lambda function to pass X_tune and y_tune into the objective
        study.optimize(lambda trial: objective(trial, X_tune, y_tune), n_trials=N_TRIALS)
        
        current_best_params = study.best_params
        
        # Add fixed params back in for the final model
        current_best_params['objective'] = 'reg:absoluteerror'
        current_best_params['n_jobs'] = -1
        current_best_params['random_state'] = 42
        
        print(f"Best MAE found: {study.best_value:.4f}")
        print(f"Params: {study.best_params}")

        # --- B. RE-TRAINING (Sliding Window) ---
        # Note: Ensure you updated this to use your SLIDING logic from previous steps
        train_start_idx = max(0, t - 4300) # Example 6-month slide
        train_window_df = df_final.iloc[train_start_idx:t]
        
        X_train = train_window_df[FEATS].values
        y_train = train_window_df['target_wind_speed'].values
        
        model = xgb.XGBRegressor(**current_best_params)
        model.fit(X_train, y_train)

    # --- C. PREDICTION ---
    if model is None:
        preds.append(np.nan)
        continue

    x_next = df_final.iloc[[t]][FEATS].values
    preds.append(model.predict(x_next)[0])



Starting rolling forecast with Bayesian Optimization...

--- [Step 117729] Retraining & Bayesian Tuning ---
Best MAE found: 6.3343
Params: {'n_estimators': 628, 'max_depth': 5, 'learning_rate': 0.08392858757104368, 'subsample': 0.6382003206606918, 'colsample_bytree': 0.8334024206761556, 'gamma': 4.820448675961428, 'reg_alpha': 7.655686484853524, 'reg_lambda': 0.003657209584907274}

--- [Step 117897] Retraining & Bayesian Tuning ---
Best MAE found: 6.4183
Params: {'n_estimators': 537, 'max_depth': 6, 'learning_rate': 0.018767939408746583, 'subsample': 0.6522341546509764, 'colsample_bytree': 0.9422999473992473, 'gamma': 4.330391591461765, 'reg_alpha': 0.014228114915342609, 'reg_lambda': 4.756506359571315}

--- [Step 118065] Retraining & Bayesian Tuning ---
Best MAE found: 5.9892
Params: {'n_estimators': 725, 'max_depth': 3, 'learning_rate': 0.01140476468648282, 'subsample': 0.7525120981276476, 'colsample_bytree': 0.9736508102160373, 'gamma': 4.54523560699685, 'reg_alpha': 1.2125093624894

In [17]:
import sklearn.metrics as m

In [18]:

# --- 4. Evaluation ---
test_df['pred_xgb_dynamic'] = preds
# Drop any initial NaNs if we had them
test_df = test_df.dropna(subset=['pred_xgb_dynamic'])

print("\n--- Final Results ---")
print("XGB MAE with Dynamic Re-Tuning:", 
      m.mean_absolute_error(test_df['target_wind_speed'], test_df['pred_xgb_dynamic']))


--- Final Results ---
XGB MAE with Dynamic Re-Tuning: 6.346468536721336
