In [22]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
path = "../../../data/cleaned/multi_var_wind_hourly_cleaned.csv"
required_columns = ['datetime', 'pressure', 'temperature', 'humidity', 'wind_speed', 'u', 'v']
dtype_cast = {'wind_speed': float}


In [3]:
import os
import sys

# Get the absolute path of the directory containing the notebook (ensemble)
# Then get the parent directory's parent (which is the project root: wind-forecast-benchmark)
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))

# Add the project root to the system path
if project_root not in sys.path:
    sys.path.insert(0, project_root)


In [23]:
from modules.preprocessing import *

In [5]:
pwd

'/home/amir/Desktop/wind-forecast-benchmark/notebooks/non-linear'

In [6]:
df_basic_clean = load_and_basic_clean(path=path,
                                      required_columns=required_columns,
                                      sort_by='datetime')

INFO:modules.preprocessing:Loaded CSV from '../../../data/cleaned/multi_var_wind_hourly_cleaned.csv': 10329 rows, 10 columns
INFO:modules.preprocessing:Selected 7 required columns
INFO:modules.preprocessing:Sorted DataFrame by column 'datetime'
INFO:modules.preprocessing:Data loading and cleaning completed successfully


In [7]:
df_basic_clean = df_basic_clean[0]

In [8]:
df_time_features = add_time_features(df_basic_clean)

In [9]:
df_time_features.head()

Unnamed: 0,datetime,pressure,temperature,humidity,wind_speed,u,v,hour,month,day_of_week,hour_sin,hour_cos
0,2024-01-01 00:00:00,758.466667,-1.016667,62.166667,0.1,-0.022693,-0.04734,0,1,0,0.0,1.0
1,2024-01-01 01:00:00,758.383333,-1.533333,64.0,0.283333,-0.113825,0.011167,1,1,0,0.258819,0.965926
2,2024-01-01 02:00:00,758.383333,-1.15,61.333333,0.2,-0.169286,-0.061781,2,1,0,0.5,0.866025
3,2024-01-01 03:00:00,758.783333,-1.166667,58.666667,0.5,0.142045,-0.39769,3,1,0,0.707107,0.707107
4,2024-01-01 04:00:00,759.0,-1.483333,62.166667,0.966667,-0.51259,-0.609574,4,1,0,0.866025,0.5


In [10]:
df_time_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10329 entries, 0 to 10328
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   datetime     10329 non-null  datetime64[ns]
 1   pressure     10329 non-null  float64       
 2   temperature  10329 non-null  float64       
 3   humidity     10329 non-null  float64       
 4   wind_speed   10329 non-null  float64       
 5   u            10329 non-null  float64       
 6   v            10329 non-null  float64       
 7   hour         10329 non-null  int32         
 8   month        10329 non-null  int32         
 9   day_of_week  10329 non-null  int32         
 10  hour_sin     10329 non-null  float64       
 11  hour_cos     10329 non-null  float64       
dtypes: datetime64[ns](1), float64(8), int32(3)
memory usage: 847.4 KB


In [11]:
physical_featurs = ['temperature', 'pressure', 'u', 'v', 'wind_speed']

In [12]:
df_features = add_lag_and_rolling_features(df_time_features,target_cols=physical_featurs, lags=[1, 2, 3 ,6, 12 ,24], dropna=True)

In [13]:
df_features.head(3)

Unnamed: 0,datetime,pressure,temperature,humidity,wind_speed,u,v,hour,month,day_of_week,...,wind_speed_roll_min_3,wind_speed_roll_max_3,wind_speed_roll_mean_6,wind_speed_roll_std_6,wind_speed_roll_min_6,wind_speed_roll_max_6,wind_speed_roll_mean_12,wind_speed_roll_std_12,wind_speed_roll_min_12,wind_speed_roll_max_12
24,2024-01-02 00:00:00,760.4,-1.933333,89.166667,1.05,-0.871871,-0.322368,0,1,1,...,0.333333,0.9,0.625,0.28978,0.333333,1.0,1.093056,1.098195,0.183333,3.316667
25,2024-01-02 01:00:00,760.216667,-2.183333,89.666667,0.933333,-0.889509,-0.09685,1,1,1,...,0.716667,1.05,0.744444,0.293194,0.333333,1.05,0.904167,0.847221,0.183333,2.933333
26,2024-01-02 02:00:00,760.2,-2.416667,84.666667,0.7,-0.572663,-0.323016,2,1,1,...,0.716667,1.05,0.822222,0.265344,0.333333,1.05,0.7375,0.559677,0.183333,2.216667


In [14]:
df_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10305 entries, 24 to 10328
Columns: 102 entries, datetime to wind_speed_roll_max_12
dtypes: datetime64[ns](1), float64(98), int32(3)
memory usage: 8.0 MB


In [15]:
df_features.columns

Index(['datetime', 'pressure', 'temperature', 'humidity', 'wind_speed', 'u',
       'v', 'hour', 'month', 'day_of_week',
       ...
       'wind_speed_roll_min_3', 'wind_speed_roll_max_3',
       'wind_speed_roll_mean_6', 'wind_speed_roll_std_6',
       'wind_speed_roll_min_6', 'wind_speed_roll_max_6',
       'wind_speed_roll_mean_12', 'wind_speed_roll_std_12',
       'wind_speed_roll_min_12', 'wind_speed_roll_max_12'],
      dtype='object', length=102)

In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# 1. Prepare Data (Drop datetime for correlation check)
# Ensure we have the target ready for correlation checking
if 'target_wind_speed' not in df_features.columns:
    df_features['target_wind_speed'] = df_features['wind_speed'].shift(-1)

df_corr = df_features.dropna().drop(['datetime'], axis=1)

# 2. Calculate Correlation with Target
correlations = df_corr.corrwith(df_corr['target_wind_speed']).abs().sort_values(ascending=False)

# 3. Display Top 20 Features
print("--- Top 20 Features correlated with Target ---")
print(correlations.head(20))

# 4. Display Bottom 10 (Useless features)
print("\n--- Bottom 10 Features (Likely Noise) ---")
print(correlations.tail(10))

--- Top 20 Features correlated with Target ---
target_wind_speed          1.000000
wind_speed                 0.894402
wind_speed_lag_1           0.777917
wind_speed_roll_mean_3     0.716671
wind_speed_roll_max_3      0.703457
wind_speed_roll_min_3      0.693032
wind_speed_lag_2           0.677941
wind_speed_roll_mean_6     0.632196
wind_speed_roll_max_6      0.614972
wind_speed_lag_3           0.588027
wind_speed_roll_min_6      0.580818
wind_speed_roll_mean_12    0.521921
wind_speed_roll_max_12     0.503913
wind_speed_roll_min_12     0.483767
wind_speed_lag_6           0.384799
v_roll_std_6               0.344839
v_roll_std_12              0.338866
wind_speed_roll_std_6      0.318503
wind_speed_roll_std_12     0.306793
v_roll_std_3               0.302952
dtype: float64

--- Bottom 10 Features (Likely Noise) ---
temperature_roll_min_12     0.016166
temperature_roll_max_12     0.015133
temperature_roll_mean_12    0.013944
u_roll_mean_6               0.013025
temperature_roll_std_3     

In [17]:
# Define your reduced feature set manually to ensure high quality inputs
FEATS = [
    # 1. The Physics (Current State) - MANDATORY
    'u', 'v', 'pressure', 'temperature', 'humidity', 'wind_speed',
    
    # 2. Time Cyclicals (Better than 'hour' or 'month' integers)
    'sin_h', 'cos_h', 
    # If you have month_sin/cos, add them. If not, ignore.
    
    # 3. Immediate Inertia (Lags)
    # Check your column names. Assuming standard lag naming:
    'wind_speed_L1', 'wind_speed_L2', 'u_L1', 'v_L1',
    
    # 4. Rolling Stats (Select ONLY ONE window size to avoid redundancy)
    # Window 6 is a good middle ground between 3 (too noisy) and 12 (too smooth)
    'wind_speed_roll_mean_6', 
    'wind_speed_roll_std_6',
    
    # 5. Volatility (Optional)
    # Max - Min tells us if the weather is gusty
    'wind_speed_roll_max_6' 
]

# Verify these columns actually exist in your df
FEATS = [c for c in FEATS if c in df_features.columns]
print(f"Selected {len(FEATS)} features for SVR.")

Selected 9 features for SVR.


In [18]:
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
import scipy.stats as stats

# Config
# Ensure df_features has 'target_wind_speed' created and NaNs dropped
df_svr = df_features.dropna(subset=['target_wind_speed'] + FEATS).copy()

# Split
test_size = 720
train_df = df_svr.iloc[:-test_size]
test_df = df_svr.iloc[-test_size:]

X_train = train_df[FEATS].values
y_train = train_df['target_wind_speed'].values
X_test = test_df[FEATS].values
y_test = test_df['target_wind_speed'].values

# --- SVR PIPELINE ---
# Pipeline ensures scaling happens inside CV correctly
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR())
])

# --- TUNING ---
# SVR is sensitive. We must tune C and Epsilon.
param_dist = {
    'svr__kernel': ['rbf'],             # RBF is best for weather
    'svr__C': stats.loguniform(0.1, 100),    # Penalty (High = less regularization)
    'svr__epsilon': stats.loguniform(0.01, 1.0), # Tube width
    'svr__gamma': ['scale', 'auto', 0.1, 0.01]
}

print("Tuning SVR (This may take a minute)...")
tscv = TimeSeriesSplit(n_splits=3)
rs = RandomizedSearchCV(
    pipeline, 
    param_dist, 
    n_iter=15, # Keep low for speed testing
    cv=tscv, 
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    random_state=42
)

rs.fit(X_train, y_train)

print(f"Best Params: {rs.best_params_}")
print(f"Best CV Score: {-rs.best_score_:.4f}")

# --- FINAL EVALUATION ---
best_model = rs.best_estimator_
preds = best_model.predict(X_test)
mae = mean_absolute_error(y_test, preds)

print("\n" + "="*30)
print(f"FINAL SVR MAE: {mae:.5f}")
print("="*30)

Tuning SVR (This may take a minute)...
Best Params: {'svr__C': np.float64(65.41210527692726), 'svr__epsilon': np.float64(0.010035927878780916), 'svr__gamma': 0.01, 'svr__kernel': 'rbf'}
Best CV Score: 0.7433

FINAL SVR MAE: 0.66597


In [28]:
from modules.preprocessing import save_predictions



In [29]:
save_predictions(preds, "svr_preds_1.npy")

[SUCCESS] Saved svr_preds_1.npy to ../../results/
