In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
path = "../../../data/cleaned/pincher_station_hourly_wind_cleaned.csv"
required_columns = ['timestamp', 'temp_c', 'rel_humidity', 'wind_speed_kmh', 'pressure_kpa','u', 'v']
dtype_cast = {'wind_speed': float}


In [3]:
import os
import sys

# Get the absolute path of the directory containing the notebook (ensemble)
# Then get the parent directory's parent (which is the project root: wind-forecast-benchmark)
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))

# Add the project root to the system path
if project_root not in sys.path:
    sys.path.insert(0, project_root)


In [4]:
from modules.preprocessing import *

INFO:modules.preprocessing:preprocessing module loaded (v1.0.0)


In [5]:
import pandas as pd

In [6]:
test_df = pd.read_csv(path)

In [7]:
test_df.head()

Unnamed: 0,timestamp,temp_c,rel_humidity,wind_speed_kmh,pressure_kpa,wind_dir_deg,timestamp_rounded,u,v,pressure_24h
0,2011-06-27 14:00:00,7.9,85.0,9.0,88.14,320.0,2011-06-27 14:00:00,5.785088,-6.8944,88.14
1,2011-06-27 15:00:00,11.5,74.0,8.0,88.11,320.0,2011-06-27 15:00:00,5.142301,-6.128356,88.125
2,2011-06-27 16:00:00,15.4,61.0,5.0,88.04,320.0,2011-06-27 16:00:00,3.213938,-3.830222,88.096667
3,2011-06-27 17:00:00,17.3,48.0,8.0,87.99,40.0,2011-06-27 17:00:00,-5.142301,-6.128356,88.07
4,2011-06-27 18:00:00,18.4,47.0,15.0,87.93,120.0,2011-06-27 18:00:00,-12.990381,7.5,88.042


In [8]:
test_df.columns

Index(['timestamp', 'temp_c', 'rel_humidity', 'wind_speed_kmh', 'pressure_kpa',
       'wind_dir_deg', 'timestamp_rounded', 'u', 'v', 'pressure_24h'],
      dtype='object')

In [9]:
df_basic_clean = load_and_basic_clean(path=path,
                                      required_columns=required_columns,
                                      sort_by='timestamp')

INFO:modules.preprocessing:Loaded CSV from '../../../data/cleaned/pincher_station_hourly_wind_cleaned.csv': 118474 rows, 10 columns
INFO:modules.preprocessing:Selected 7 required columns
INFO:modules.preprocessing:Sorted DataFrame by column 'timestamp'
INFO:modules.preprocessing:Data loading and cleaning completed successfully


In [10]:
df_basic_clean = df_basic_clean[0]

In [11]:
df_basic_clean.head(2)

Unnamed: 0,timestamp,temp_c,rel_humidity,wind_speed_kmh,pressure_kpa,u,v
0,2011-06-27 14:00:00,7.9,85.0,9.0,88.14,5.785088,-6.8944
1,2011-06-27 15:00:00,11.5,74.0,8.0,88.11,5.142301,-6.128356


In [13]:
clmns = {
    'timestamp':'datetime',
    'temp_c':'temperature',
    'rel_humidity':'humidity',
    'wind_speed_kmh':'wind_speed',
    'pressure_kpa':'pressure',
    
}
df_basic_clean.rename(columns=clmns, inplace=True)

In [14]:
df_basic_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118474 entries, 0 to 118473
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   datetime     118474 non-null  object 
 1   temperature  118474 non-null  float64
 2   humidity     118474 non-null  float64
 3   wind_speed   118474 non-null  float64
 4   pressure     118474 non-null  float64
 5   u            118474 non-null  float64
 6   v            118474 non-null  float64
dtypes: float64(6), object(1)
memory usage: 6.3+ MB


In [16]:
df_time_features = add_time_features(df_basic_clean)

In [17]:
df_time_features.head()

Unnamed: 0,datetime,temperature,humidity,wind_speed,pressure,u,v,hour,month,day_of_week,hour_sin,hour_cos
0,2011-06-27 14:00:00,7.9,85.0,9.0,88.14,5.785088,-6.8944,14,6,0,-0.5,-0.8660254
1,2011-06-27 15:00:00,11.5,74.0,8.0,88.11,5.142301,-6.128356,15,6,0,-0.707107,-0.7071068
2,2011-06-27 16:00:00,15.4,61.0,5.0,88.04,3.213938,-3.830222,16,6,0,-0.866025,-0.5
3,2011-06-27 17:00:00,17.3,48.0,8.0,87.99,-5.142301,-6.128356,17,6,0,-0.965926,-0.258819
4,2011-06-27 18:00:00,18.4,47.0,15.0,87.93,-12.990381,7.5,18,6,0,-1.0,-1.83697e-16


In [18]:
df_time_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118474 entries, 0 to 118473
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   datetime     118474 non-null  datetime64[ns]
 1   temperature  118474 non-null  float64       
 2   humidity     118474 non-null  float64       
 3   wind_speed   118474 non-null  float64       
 4   pressure     118474 non-null  float64       
 5   u            118474 non-null  float64       
 6   v            118474 non-null  float64       
 7   hour         118474 non-null  int32         
 8   month        118474 non-null  int32         
 9   day_of_week  118474 non-null  int32         
 10  hour_sin     118474 non-null  float64       
 11  hour_cos     118474 non-null  float64       
dtypes: datetime64[ns](1), float64(8), int32(3)
memory usage: 9.5 MB


In [19]:
physical_featurs = ['temperature', 'pressure', 'u', 'v', 'wind_speed']

In [20]:
df_features = add_lag_and_rolling_features(df_time_features,target_cols=physical_featurs, lags=[1, 2, 3 ,6, 12 ,24], dropna=True)

In [21]:
df_features.head(3)

Unnamed: 0,datetime,temperature,humidity,wind_speed,pressure,u,v,hour,month,day_of_week,...,wind_speed_roll_min_3,wind_speed_roll_max_3,wind_speed_roll_mean_6,wind_speed_roll_std_6,wind_speed_roll_min_6,wind_speed_roll_max_6,wind_speed_roll_mean_12,wind_speed_roll_std_12,wind_speed_roll_min_12,wind_speed_roll_max_12
24,2011-06-28 14:00:00,12.9,72.0,11.0,87.47,10.832885,1.91013,14,6,1,...,9.0,15.0,14.833333,3.371449,9.0,18.0,14.166667,5.474459,5.0,26.0
25,2011-06-28 15:00:00,15.2,68.0,11.0,87.42,10.832885,-1.91013,15,6,1,...,9.0,15.0,13.666667,3.265986,9.0,17.0,12.916667,4.055486,5.0,18.0
26,2011-06-28 16:00:00,17.7,60.0,9.0,87.36,8.457234,-3.078181,16,6,1,...,11.0,15.0,12.666667,2.94392,9.0,17.0,13.083333,3.918681,5.0,18.0


In [22]:
df_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 118450 entries, 24 to 118473
Columns: 102 entries, datetime to wind_speed_roll_max_12
dtypes: datetime64[ns](1), float64(98), int32(3)
memory usage: 91.7 MB


In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# 1. Prepare Data (Drop datetime for correlation check)
# Ensure we have the target ready for correlation checking
if 'target_wind_speed' not in df_features.columns:
    df_features['target_wind_speed'] = df_features['wind_speed'].shift(-1)

df_corr = df_features.dropna().drop(['datetime'], axis=1)

# 2. Calculate Correlation with Target
correlations = df_corr.corrwith(df_corr['target_wind_speed']).abs().sort_values(ascending=False)

# 3. Display Top 20 Features
print("--- Top 20 Features correlated with Target ---")
print(correlations.head(20))

# 4. Display Bottom 10 (Useless features)
print("\n--- Bottom 10 Features (Likely Noise) ---")
print(correlations.tail(10))

--- Top 20 Features correlated with Target ---
target_wind_speed          1.000000
wind_speed                 0.852327
wind_speed_lag_1           0.782267
wind_speed_roll_mean_3     0.772731
wind_speed_roll_max_3      0.769825
wind_speed_roll_mean_6     0.731093
wind_speed_roll_max_6      0.727913
u                          0.727561
wind_speed_lag_2           0.724987
wind_speed_roll_min_3      0.724766
u_roll_max_3               0.689262
u_lag_1                    0.687413
u_roll_mean_3              0.680875
wind_speed_lag_3           0.674781
u_roll_max_6               0.668240
wind_speed_roll_mean_12    0.658439
u_roll_mean_6              0.657540
u_lag_2                    0.652791
wind_speed_roll_max_12     0.651076
wind_speed_roll_min_6      0.644930
dtype: float64

--- Bottom 10 Features (Likely Noise) ---
temperature_lag_3           0.052726
temperature_lag_24          0.046814
temperature_roll_mean_6     0.042072
temperature_roll_max_12     0.038921
temperature_roll_min_12    

In [24]:
# Define your reduced feature set manually to ensure high quality inputs
FEATS = [
    # 1. The Physics (Current State) - MANDATORY
    'u', 'v', 'pressure', 'temperature', 'humidity', 'wind_speed',
    
    # 2. Time Cyclicals (Better than 'hour' or 'month' integers)
    'sin_h', 'cos_h', 
    # If you have month_sin/cos, add them. If not, ignore.
    
    # 3. Immediate Inertia (Lags)
    # Check your column names. Assuming standard lag naming:
    'wind_speed_L1', 'wind_speed_L2', 'u_L1', 'v_L1',
    
    # 4. Rolling Stats (Select ONLY ONE window size to avoid redundancy)
    # Window 6 is a good middle ground between 3 (too noisy) and 12 (too smooth)
    'wind_speed_roll_mean_6', 
    'wind_speed_roll_std_6',
    
    # 5. Volatility (Optional)
    # Max - Min tells us if the weather is gusty
    'wind_speed_roll_max_6' 
]

# Verify these columns actually exist in your df
FEATS = [c for c in FEATS if c in df_features.columns]
print(f"Selected {len(FEATS)} features for SVR.")

Selected 9 features for SVR.


In [25]:
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
import scipy.stats as stats

# Config
# Ensure df_features has 'target_wind_speed' created and NaNs dropped
df_svr = df_features.dropna(subset=['target_wind_speed'] + FEATS).copy()

# Split
test_size = 720
train_df = df_svr.iloc[:-test_size]
test_df = df_svr.iloc[-test_size:]

X_train = train_df[FEATS].values
y_train = train_df['target_wind_speed'].values
X_test = test_df[FEATS].values
y_test = test_df['target_wind_speed'].values

# --- SVR PIPELINE ---
# Pipeline ensures scaling happens inside CV correctly
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR())
])

# --- TUNING ---
# SVR is sensitive. We must tune C and Epsilon.
param_dist = {
    'svr__kernel': ['rbf'],             # RBF is best for weather
    'svr__C': stats.loguniform(0.1, 100),    # Penalty (High = less regularization)
    'svr__epsilon': stats.loguniform(0.01, 1.0), # Tube width
    'svr__gamma': ['scale', 'auto', 0.1, 0.01]
}

print("Tuning SVR (This may take a minute)...")
tscv = TimeSeriesSplit(n_splits=3)
rs = RandomizedSearchCV(
    pipeline, 
    param_dist, 
    n_iter=15, # Keep low for speed testing
    cv=tscv, 
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    random_state=42
)

rs.fit(X_train, y_train)

print(f"Best Params: {rs.best_params_}")
print(f"Best CV Score: {-rs.best_score_:.4f}")

# --- FINAL EVALUATION ---
best_model = rs.best_estimator_
preds = best_model.predict(X_test)
mae = mean_absolute_error(y_test, preds)

print("\n" + "="*30)
print(f"FINAL SVR MAE: {mae:.5f}")
print("="*30)

Tuning SVR (This may take a minute)...
Best Params: {'svr__C': np.float64(65.41210527692726), 'svr__epsilon': np.float64(0.010035927878780916), 'svr__gamma': 0.01, 'svr__kernel': 'rbf'}
Best CV Score: 4.8835

FINAL SVR MAE: 6.41591


In [27]:
# from modules.preprocessing import save_predictions



In [None]:
preds

In [32]:
result_path = "/home/amir/Desktop/wind-forecast-benchmark/results/"
np.save(result_path + "pincher_svr_preds.npy", preds)

In [33]:
np.save(result_path + "/pincher_y_test.npy", y_test)