In [1]:
import numpy as np
import pandas as pd


In [2]:
def make_lags(df, cols, lags):
    """Build lagged copies of the selected columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. ``Series.shift`` moves values by row position, so the
        rows should already be sorted chronologically.
    cols : iterable of str
        Names of the columns to lag.
    lags : iterable of int
        Row offsets handed to ``Series.shift`` (positive values look back).

    Returns
    -------
    pandas.DataFrame
        One column per (col, lag) pair named ``"{col}_L{lag}"``, ordered by
        column first and lag second, on the same index as ``df``. When
        ``cols`` or ``lags`` is empty, a frame with no columns is returned.

    Raises
    ------
    KeyError
        If a name in ``cols`` is not a column of ``df``.
    """
    # Materialise once so a generator can be re-used for every column.
    lags = list(lags)
    lagged = [
        df[col].shift(lag).rename(f"{col}_L{lag}")
        for col in cols
        for lag in lags
    ]
    if not lagged:
        # pd.concat raises ValueError on an empty list; hand back an empty
        # frame on the caller's index instead so downstream concat still works.
        return pd.DataFrame(index=df.index)
    return pd.concat(lagged, axis=1)

In [3]:
# Read the cleaned 3-hourly observations, order them chronologically with a
# fresh 0..n-1 index, and make sure wind_speed is stored as float.
df = (
    pd.read_csv("../../../data/cleaned/bandarAbas_multi_var_wind_3hourly_cleaned.csv")
    .sort_values('datetime')
    .reset_index(drop=True)
    .astype({'wind_speed': float})
)

In [4]:
# Preview the first rows of the freshly loaded frame (all original columns).
df.head()

Unnamed: 0,datetime,temp,pressure,humidity,wind_direction,wind_speed,u,v,pressure_24h
0,2023-03-21 00:00:00,20.300001,1004.2,84.0,50.0,3.0,-2.298133,-1.928363,1004.2
1,2023-03-21 03:00:00,20.9,1005.2,86.0,360.0,2.0,1.224647e-16,-2.0,1004.7
2,2023-03-21 06:00:00,25.800001,1006.4,63.0,40.0,4.0,-2.57115,-3.064178,1005.266667
3,2023-03-21 09:00:00,27.300001,1005.8,62.0,120.0,3.0,-2.598076,1.5,1005.4
4,2023-03-21 12:00:00,27.800001,1003.5,47.0,250.0,5.0,4.698463,1.710101,1005.02


In [5]:
# Keep the timestamp, the raw observations and the wind components;
# wind_direction and pressure_24h are dropped here.
keep_cols = ['datetime', 'pressure', 'temp', 'humidity', 'wind_speed', 'u', 'v']
df = df[keep_cols]

In [6]:
# Preview the frame after the column selection.
df.head()

Unnamed: 0,datetime,pressure,temp,humidity,wind_speed,u,v
0,2023-03-21 00:00:00,1004.2,20.300001,84.0,3.0,-2.298133,-1.928363
1,2023-03-21 03:00:00,1005.2,20.9,86.0,2.0,1.224647e-16,-2.0
2,2023-03-21 06:00:00,1006.4,25.800001,63.0,4.0,-2.57115,-3.064178
3,2023-03-21 09:00:00,1005.8,27.300001,62.0,3.0,-2.598076,1.5
4,2023-03-21 12:00:00,1003.5,27.800001,47.0,5.0,4.698463,1.710101


In [8]:
# Parse the timestamp strings into real datetimes so the .dt accessor works.
df['datetime'] = pd.to_datetime(df['datetime'])

# Hour of day, plus its position on the unit circle so that late-evening and
# midnight rows end up next to each other instead of 0 vs. 21 apart.
df['hour'] = df['datetime'].dt.hour
hour_angle = 2 * np.pi * df['hour'] / 24
df['sin_h'] = np.sin(hour_angle)
df['cos_h'] = np.cos(hour_angle)

In [9]:
# 1. Lag configuration
# Lags are ROW offsets. The input is sampled every 3 hours, so lags 1/2/3
# look 3/6/9 h back and lag 24 looks 72 h back (same time of day, three days
# earlier). Lags 23, 25 and 48 were judged noisy and are left out on purpose.
# NOTE(review): this was previously described as "the exact time yesterday";
# on a 3-hourly grid that would be lag 8 - confirm which one is intended.
LAGS = [1, 2, 3, 24]
cols_to_lag = ['pressure', 'temp', 'wind_speed', 'u', 'v']

# 2. Lagged copies of the selected columns
df_lags = make_lags(df, cols_to_lag, LAGS)

# 3. Rolling wind statistics over the 24 most recent rows (including row t).
# NOTE(review): 24 rows span 72 h at this sampling rate, not one day.
wind = df['wind_speed']
wind_window = wind.rolling(window=24)

# 4. Assemble features + target, then drop every row with a missing value
# (the warm-up rows of the lags/rolling window and the last row, whose
# next-step target does not exist). The target is wind_speed one row ahead.
df_final = (
    pd.concat([df, df_lags], axis=1)
    .assign(
        wind_rolling_24_mean=wind_window.mean(),
        wind_rolling_24_std=wind_window.std(),
        target_wind_speed=wind.shift(-1),
    )
    .dropna()
)
X = df_final.drop(columns=['datetime', 'target_wind_speed'])
y = df_final['target_wind_speed']

In [10]:
# Preview the engineered frame; the first surviving row has index 24 because
# the lag-24 / 24-row rolling warm-up rows were dropped.
df_final.head()

Unnamed: 0,datetime,pressure,temp,humidity,wind_speed,u,v,hour,sin_h,cos_h,...,u_L2,u_L3,u_L24,v_L1,v_L2,v_L3,v_L24,wind_rolling_24_mean,wind_rolling_24_std,target_wind_speed
24,2023-03-24 00:00:00,1010.4,17.2,61.0,3.0,1.5,-2.598076,0,0.0,1.0,...,4.698463,5.142301,-2.298133,-1.928363,1.710101,6.128356,-1.928363,5.583333,2.991534,3.0
25,2023-03-24 03:00:00,1012.1,17.6,69.0,3.0,1.5,-2.598076,3,0.7071068,0.7071068,...,-2.298133,4.698463,1.224647e-16,-2.598076,-1.928363,1.710101,-2.0,5.625,2.946073,3.0
26,2023-03-24 06:00:00,1013.9,25.0,35.0,3.0,-2.819078,-1.02606,6,1.0,6.123234000000001e-17,...,1.5,-2.298133,-2.57115,-2.598076,-2.598076,-1.928363,-3.064178,5.583333,2.976965,5.0
27,2023-03-24 09:00:00,1012.2,26.7,27.0,5.0,0.868241,4.924039,9,0.7071068,-0.7071068,...,1.5,1.5,-2.598076,-1.02606,-2.598076,-2.598076,1.5,5.666667,2.929114,6.0
28,2023-03-24 12:00:00,1009.1,27.0,23.0,6.0,2.052121,5.638156,12,1.224647e-16,-1.0,...,-2.819078,1.5,4.698463,4.924039,-1.02606,-2.598076,1.710101,5.708333,2.92633,4.0


In [14]:
# 3. Train/test split: the final 720 rows are held out for walk-forward testing.
# train_end is a positional index into df_final (used with .iloc below).
train_end = len(df_final) - 720
test_df = df_final.iloc[train_end:].copy()

# Model inputs: every lagged column, the cyclical hour encoding and the
# observations at time t. The rolling statistics and humidity are not included.
current_obs = ['pressure', 'temp', 'u', 'v', 'wind_speed']
lag_cols = [name for name in df_final.columns if '_L' in name]
FEATS = [*lag_cols, 'sin_h', 'cos_h', *current_obs]



In [15]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
import scipy.stats as stats # Used for parameter distributions

# --- Assumed Setup (from your original code) ---
# df = ... (your DataFrame)
# FEATS = [...] (your list of feature names)
# train_end = ... (the index where initial training ends)
# test_df = df.iloc[train_end:].copy()
# m = ... (your metrics module, e.g., sklearn.metrics)
# -------------------------------------------------

### 1. Search space for RandomizedSearchCV
# Each entry is a scipy.stats distribution that the search samples from,
# instead of a fixed grid of values.
param_dist = dict(
    n_estimators=stats.randint(100, 1000),      # integers, upper bound exclusive
    max_depth=stats.randint(3, 10),             # integers, upper bound exclusive
    learning_rate=stats.loguniform(0.01, 0.3),
    subsample=stats.uniform(0.6, 0.4),          # (loc, scale) -> [0.6, 1.0]
    colsample_bytree=stats.uniform(0.6, 0.4),   # (loc, scale) -> [0.6, 1.0]
    gamma=stats.uniform(0, 0.5),                # (loc, scale) -> [0.0, 0.5]
    reg_alpha=stats.loguniform(1e-3, 1.0),
    reg_lambda=stats.loguniform(1e-3, 1.0),
)

### 2. Tuning and retraining schedule
# Both window sizes are used as ROW counts by the rolling loop.
# NOTE(review): the input used in this notebook is sampled every 3 hours, so
# 168 rows span 21 days and 2160 rows span 270 days - not one week / three
# months as the "hours" naming suggests. Confirm the intended durations.
RETRAIN_INTERVAL = 168       # rows between two re-tune/re-fit steps
TUNING_WINDOW_HOURS = 2160   # rows of recent history used for the search
N_ITER_RANDOM_SEARCH = 15    # parameter combinations sampled per search
N_SPLITS_INNER_CV = 3        # TimeSeriesSplit folds inside the search


In [16]:

### 3. Rolling Retraining and Re-Tuning Loop
# Walk forward over the held-out rows. Every RETRAIN_INTERVAL rows the
# hyper-parameters are re-tuned on a recent sliding window and the model is
# refitted on all rows before t; every row then gets a one-step-ahead
# forecast from the most recently fitted model.
print("Starting rolling forecast with periodic re-tuning...")

# Dedicated, seeded RNG for the hyper-parameter search. Without it
# RandomizedSearchCV samples candidates from NumPy's global state, so every
# run of this cell picks different parameters. Sharing one seeded stream keeps
# the run repeatable while still giving each re-tune fresh candidates.
search_rng = np.random.RandomState(42)

preds = []
model = None  # fitted on the first iteration, where (t - train_end) == 0
current_best_params = {}

for t in range(train_end, len(df_final)):

    # Re-tune and refit on the first test row and every RETRAIN_INTERVAL rows after it.
    if (t - train_end) % RETRAIN_INTERVAL == 0:
        print(f"\n--- Retraining and Re-Tuning at index {t} ---")

        # --- A. RE-TUNING on a SLIDING window ---
        # Only rows strictly before t are used, so the search never sees the
        # row being predicted.
        tune_start_idx = max(0, t - TUNING_WINDOW_HOURS)
        tune_window_df = df_final.iloc[tune_start_idx:t]

        X_tune = tune_window_df[FEATS].values
        y_tune = tune_window_df['target_wind_speed'].values

        print(f"Running RandomizedSearch on window {tune_start_idx} to {t}...")

        # Ordered folds: each validation fold comes after its training fold.
        tscv_inner = TimeSeriesSplit(n_splits=N_SPLITS_INNER_CV)

        rs = RandomizedSearchCV(
            estimator=xgb.XGBRegressor(objective='reg:absoluteerror',
                                       random_state=42, n_jobs=-1),
            param_distributions=param_dist,
            n_iter=N_ITER_RANDOM_SEARCH,
            cv=tscv_inner,
            scoring='neg_mean_absolute_error',
            verbose=0,  # Set to 1 or more for details
            n_jobs=-1,
            random_state=search_rng,
        )

        rs.fit(X_tune, y_tune)

        # Parameters used for the model until the next re-tune.
        current_best_params = rs.best_params_
        print(f"New best parameters found: {current_best_params}")

        # --- B. RE-TRAINING on an EXPANDING window ---
        # Fit the forecasting model on every row before t with the new parameters.
        train_window_df = df_final.iloc[:t]
        X_train_window = train_window_df[FEATS].values
        y_train_window = train_window_df['target_wind_speed'].values

        print(f"Retraining model on window 0 to {t}...")
        model = xgb.XGBRegressor(objective='reg:absoluteerror',
                                 random_state=42, n_jobs=-1,
                                 **current_best_params)

        model.fit(X_train_window, y_train_window)
        print("Model retrained successfully.")

    # --- C. PREDICTION ---
    # One forecast per test row. iloc[[t]] keeps a 1-row DataFrame so the
    # feature values stay numeric; iloc[t] would give an object-dtype Series
    # because the row mixes datetime and float columns.
    x_next = df_final.iloc[[t]][FEATS].values
    preds.append(model.predict(x_next)[0])


Starting rolling forecast with periodic re-tuning...

--- Retraining and Re-Tuning at index 2175 ---
Running RandomizedSearch on window 15 to 2175...
New best parameters found: {'colsample_bytree': np.float64(0.9724374201613102), 'gamma': np.float64(0.3671576186217574), 'learning_rate': np.float64(0.025967636782366812), 'max_depth': 4, 'n_estimators': 644, 'reg_alpha': np.float64(0.021964041710396436), 'reg_lambda': np.float64(0.002402666030685783), 'subsample': np.float64(0.7922539387069559)}
Retraining model on window 0 to 2175...
Model retrained successfully.

--- Retraining and Re-Tuning at index 2343 ---
Running RandomizedSearch on window 183 to 2343...
New best parameters found: {'colsample_bytree': np.float64(0.7647644257540013), 'gamma': np.float64(0.2822998129172624), 'learning_rate': np.float64(0.017961065851537016), 'max_depth': 3, 'n_estimators': 741, 'reg_alpha': np.float64(0.3663617136386352), 'reg_lambda': np.float64(0.02503938321805517), 'subsample': np.float64(0.953016

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error

# ==========================================
# 1. CONFIGURATION & STRATEGY SELECTION
# ==========================================

# Training-window strategy for the periodic refit:
#   "SLIDING"   - refit on the most recent SLIDING_WINDOW_SIZE rows only
#   "EXPANDING" - refit on every row seen so far
WINDOW_STRATEGY = "SLIDING"

# All sizes are row counts; the synthetic data in this cell is hourly, so
# one row equals one hour.
HOURS_PER_DAY = 24
SLIDING_WINDOW_SIZE = 4300                 # ~6 months, SLIDING strategy only
RETRAIN_INTERVAL = 7 * HOURS_PER_DAY       # 168: re-tune and refit weekly
TUNING_WINDOW_HOURS = 90 * HOURS_PER_DAY   # 2160: ~3 months of history for tuning
TEST_SIZE = 30 * HOURS_PER_DAY             # 720: last 30 days held out

# ==========================================
# 2. DATA LOADING & FEATURE ENGINEERING
# ==========================================

# --- [A] MOCK DATA GENERATOR (Replace this with your pd.read_csv) ---
# Generates 10,000 hours of synthetic, independently drawn weather data.
# NOTE(review): this rebinds `df`, overwriting the frame loaded from the CSV
# earlier in the notebook; cells executed afterwards see the synthetic data.
np.random.seed(42)
N_TOTAL = 10000
# Lower-case 'h' is the hourly alias; upper-case 'H' is deprecated in recent
# pandas and emitted a FutureWarning when this cell ran.
dates = pd.date_range(start='2023-01-01', periods=N_TOTAL, freq='h')
df = pd.DataFrame({
    'datetime': dates,
    'pressure': np.random.normal(1013, 5, N_TOTAL),
    'temperature': np.random.normal(15, 10, N_TOTAL),
    'humidity': np.random.uniform(30, 90, N_TOTAL),
    'wind_speed': np.abs(np.random.normal(5, 3, N_TOTAL)),  # Current speed, forced non-negative
    'wind_direction': np.random.uniform(0, 360, N_TOTAL)    # Meteorological degrees
})
# --------------------------------------------------------------------

print("Processing Data and Engineering Features...")

# 1. Cyclical time features: hour of day mapped onto the unit circle.
hour_angle = 2 * np.pi * df['datetime'].dt.hour / 24
df['sin_h'] = np.sin(hour_angle)
df['cos_h'] = np.cos(hour_angle)

# 2. Wind vector components
# Convert meteorological degrees (0 = North) to a math angle (0 = East).
# Since cos(270 - d) = -sin(d) and sin(270 - d) = -cos(d), this yields
# u = -speed * sin(d) and v = -speed * cos(d).
wd_rad = np.deg2rad(270 - df['wind_direction'])
speed = df['wind_speed']
df['u'] = speed * np.cos(wd_rad)
df['v'] = speed * np.sin(wd_rad)

# 3. Lag Features (Option 2: Clean Lags)
def make_lags(df, cols, lags):
    """Build lagged copies of the selected columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. ``Series.shift`` moves values by row position, so the
        rows should already be sorted chronologically.
    cols : iterable of str
        Names of the columns to lag.
    lags : iterable of int
        Row offsets handed to ``Series.shift`` (positive values look back).

    Returns
    -------
    pandas.DataFrame
        One column per (col, lag) pair named ``"{col}_L{lag}"``, ordered by
        column first and lag second, on the same index as ``df``. When
        ``cols`` or ``lags`` is empty, a frame with no columns is returned.

    Raises
    ------
    KeyError
        If a name in ``cols`` is not a column of ``df``.
    """
    # Materialise once so a generator can be re-used for every column.
    lags = list(lags)
    lagged = [
        df[col].shift(lag).rename(f"{col}_L{lag}")
        for col in cols
        for lag in lags
    ]
    if not lagged:
        # pd.concat raises ValueError on an empty list; hand back an empty
        # frame on the caller's index instead so downstream concat still works.
        return pd.DataFrame(index=df.index)
    return pd.concat(lagged, axis=1)

COLS_TO_LAG = ['pressure', 'temperature', 'wind_speed', 'u', 'v']
LAGS = [1, 2, 3, 24]  # previous 3 rows + 24 rows back (same hour yesterday on hourly data)

df_lags = make_lags(df, COLS_TO_LAG, LAGS)

# 4. Rolling statistics over the last 24 rows (mean level and spread of wind speed)
wind_speed = df['wind_speed']
wind_24 = wind_speed.rolling(window=24)

# 5./6. Add the next-row target, then drop rows with any missing value
# (lag/rolling warm-up rows and the final row without a target) and renumber.
df_final = (
    pd.concat([df, df_lags], axis=1)
    .assign(
        wind_rolling_24_mean=wind_24.mean(),
        wind_rolling_24_std=wind_24.std(),
        target_wind_speed=wind_speed.shift(-1),
    )
    .dropna()
    .reset_index(drop=True)
)

# 7. Feature list (FEATS): lags, cyclical hour, rolling stats and the
# observations at time t.
lag_feats = [name for name in df_final.columns if '_L' in name]
time_and_rolling = ['sin_h', 'cos_h', 'wind_rolling_24_mean', 'wind_rolling_24_std']
current_state = ['pressure', 'temperature', 'u', 'v', 'wind_speed']
FEATS = lag_feats + time_and_rolling + current_state

print(f"Data Ready. Shape: {df_final.shape}")
print(f"Features selected: {len(FEATS)}")

# ==========================================
# 3. WALK-FORWARD VALIDATION LOOP
# ==========================================

# Positional split point: rows [0, train_end) are history, the rest is the test period.
n_rows = len(df_final)
train_end = n_rows - TEST_SIZE
print(f"\nStarting Forecast Loop. Strategy: {WINDOW_STRATEGY}")
print(f"Training Range: 0 to {train_end}")
print(f"Testing Range: {train_end} to {n_rows}")

# Sampling distributions for RandomizedSearchCV.
param_dist = dict(
    n_estimators=stats.randint(100, 500),
    max_depth=stats.randint(3, 8),               # shallow trees to limit overfitting
    learning_rate=stats.loguniform(0.01, 0.2),
    subsample=stats.uniform(0.7, 0.3),           # (loc, scale) -> [0.7, 1.0]
    colsample_bytree=stats.uniform(0.7, 0.3),    # (loc, scale) -> [0.7, 1.0]
    reg_alpha=stats.loguniform(1e-3, 10.0),
    reg_lambda=stats.loguniform(1e-3, 10.0),
)

# State carried through the walk-forward loop.
preds, model, current_best_params = [], None, {}

# Fail fast on a misspelled strategy. Otherwise the first (expensive) search
# would run and the loop would then hit undefined train_start_idx / desc.
if WINDOW_STRATEGY not in ("EXPANDING", "SLIDING"):
    raise ValueError(
        f"Unknown WINDOW_STRATEGY {WINDOW_STRATEGY!r}; expected 'EXPANDING' or 'SLIDING'"
    )

for t in range(train_end, len(df_final)):

    # --- RETRAINING CHECK ---
    # True on the first test row and then every RETRAIN_INTERVAL rows, so a
    # fitted model always exists by the time the prediction step runs.
    if (t - train_end) % RETRAIN_INTERVAL == 0:
        print(f"\n--- [Step {t}] Periodic Retraining & Tuning ---")

        # [A] RE-TUNING (always on the recent sliding window, for speed).
        # Only rows strictly before t are used.
        tune_start_idx = max(0, t - TUNING_WINDOW_HOURS)
        tune_df = df_final.iloc[tune_start_idx:t]

        X_tune = tune_df[FEATS].values
        y_tune = tune_df['target_wind_speed'].values

        print(f"Tuning on recent window: {tune_start_idx} -> {t}")

        # Ordered folds: validation rows always follow their training rows.
        # No random_state is passed to the search, so candidate sampling
        # depends on NumPy's global RNG state (seeded earlier in this cell).
        tscv = TimeSeriesSplit(n_splits=3)
        rs = RandomizedSearchCV(
            estimator=xgb.XGBRegressor(objective='reg:absoluteerror', n_jobs=-1, random_state=42),
            param_distributions=param_dist,
            n_iter=10,  # Low iter for speed in demo
            cv=tscv,
            scoring='neg_mean_absolute_error',
            verbose=0,
            n_jobs=-1
        )
        rs.fit(X_tune, y_tune)
        current_best_params = rs.best_params_
        print(f"Best Params: {current_best_params}")

        # [B] RE-TRAINING (apply the selected window strategy)
        if WINDOW_STRATEGY == "EXPANDING":
            train_start_idx = 0
            desc = "EXPANDING (All History)"
        else:  # "SLIDING" - the only other value accepted by the check above
            train_start_idx = max(0, t - SLIDING_WINDOW_SIZE)
            desc = f"SLIDING (Last {SLIDING_WINDOW_SIZE} hrs)"

        train_df = df_final.iloc[train_start_idx:t]
        X_train = train_df[FEATS].values
        y_train = train_df['target_wind_speed'].values

        print(f"Retraining Strategy: {desc}")
        print(f"Training Data: {train_start_idx} -> {t} (Size: {len(train_df)})")

        model = xgb.XGBRegressor(objective='reg:absoluteerror',
                                 n_jobs=-1, random_state=42,
                                 **current_best_params)
        model.fit(X_train, y_train)

    # --- PREDICTION ---
    # Features observed at row t give the forecast for row t + 1.
    x_current = df_final.iloc[[t]][FEATS].values
    pred = model.predict(x_current)[0]
    preds.append(pred)

# ==========================================
# 4. EVALUATION & PLOTTING
# ==========================================

# Hold-out rows with their walk-forward forecasts attached; rows without a
# forecast (NaN) are excluded from scoring.
test_df = df_final.iloc[train_end:].copy()
test_df['pred_xgb'] = preds
test_df = test_df.dropna(subset=['pred_xgb'])

# Mean absolute error between the next-row wind speed and its forecast.
mae = mean_absolute_error(test_df['target_wind_speed'], test_df['pred_xgb'])
banner = "=" * 30
print("\n" + banner)
print(f"FINAL RESULTS ({WINDOW_STRATEGY})")
print(f"MAE: {mae:.5f} m/s")
print(banner)

# Plot only the last `limit` test rows so the two curves stay readable.
limit = 150
actual_tail = test_df['target_wind_speed'].values[-limit:]
predicted_tail = test_df['pred_xgb'].values[-limit:]

fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(actual_tail, label='Actual (Next Hour)', color='black', linewidth=2)
ax.plot(predicted_tail, label='Predicted (XGB)', color='red', linestyle='--', linewidth=2)
ax.set_title(f"Wind Speed Forecast - Strategy: {WINDOW_STRATEGY} | MAE: {mae:.3f}")
ax.set_ylabel("Wind Speed (m/s)")
ax.set_xlabel("Hours (Test Set)")
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

Processing Data and Engineering Features...
Data Ready. Shape: (9975, 33)
Features selected: 29

Starting Forecast Loop. Strategy: SLIDING
Training Range: 0 to 9255
Testing Range: 9255 to 9975

--- [Step 9255] Periodic Retraining & Tuning ---
Tuning on recent window: 7095 -> 9255


  dates = pd.date_range(start='2023-01-01', periods=N_TOTAL, freq='H')


Best Params: {'colsample_bytree': np.float64(0.7162512867627729), 'learning_rate': np.float64(0.012283449922914913), 'max_depth': 4, 'n_estimators': 245, 'reg_alpha': np.float64(0.005070465502989713), 'reg_lambda': np.float64(2.5495784614073704), 'subsample': np.float64(0.8789415459374241)}
Retraining Strategy: SLIDING (Last 4300 hrs)
Training Data: 4955 -> 9255 (Size: 4300)

--- [Step 9423] Periodic Retraining & Tuning ---
Tuning on recent window: 7263 -> 9423


In [17]:
import sklearn.metrics as m

In [18]:

# --- 4. Evaluation ---
# Attach the walk-forward forecasts to the hold-out rows; `preds` must hold
# exactly one value per row of `test_df`. Rows without a forecast are dropped.
test_df['pred_xgb_dynamic'] = preds
test_df = test_df.dropna(subset=['pred_xgb_dynamic'])

print("\n--- Final Results ---")
mae_dynamic = m.mean_absolute_error(
    y_true=test_df['target_wind_speed'],
    y_pred=test_df['pred_xgb_dynamic'],
)
print("XGB MAE with Dynamic Re-Tuning:", mae_dynamic)


--- Final Results ---
XGB MAE with Dynamic Re-Tuning: 0.8446380645036697
