In [None]:
# Show the kernel's current working directory; the relative paths used in this
# notebook are resolved against it. An explicit stdlib call is used instead of
# the bare `pwd` automagic so the cell does not depend on IPython automagic
# being enabled (and still works when the notebook is exported to a script).
import os

os.getcwd()

In [1]:
# Input dataset location, written relative to the kernel's working directory.
path = "../../../data/cleaned/multi_var_wind_hourly_cleaned.csv"

# Column names passed to the loader as `required_columns`.
required_columns = [
    'datetime',
    'pressure',
    'temperature',
    'humidity',
    'wind_speed',
    'u',
    'v',
]

# Per-column dtype overrides passed to the loader as `dtype_cast`.
dtype_cast = dict(wind_speed=float)


In [2]:
import os
import sys

# The project root is taken to be three directory levels above the kernel's
# current working directory. This is based on os.getcwd(), so it only lines up
# with the notebook's own folder when the kernel was started there.
project_root = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir)
)

# Put the project root at the front of the import search path, but only when it
# is not already listed, so re-running this cell does not add duplicates.
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)


In [5]:

# -----------------------------------------------------------------
# Project-local preprocessing helpers. The `modules` package is
# expected to resolve through the project root placed on sys.path.
# -----------------------------------------------------------------

from modules.preprocessing import (
    add_lag_and_rolling_features,
    add_time_features,
    load_and_basic_clean,
)


In [6]:
# Read the CSV at `path` through the project loader, passing the required
# column list, the per-column dtype overrides, and 'datetime' as the sort key.
df_clean = load_and_basic_clean(
    path=path,
    required_columns=required_columns,
    sort_by='datetime',
    dtype_cast=dtype_cast,
)

In [7]:
# Preview the first five rows of the freshly loaded frame.
df_clean.head(n=5)

Unnamed: 0,datetime,pressure,temperature,humidity,wind_speed,u,v
0,2024-01-01 00:00:00,758.466667,-1.016667,62.166667,0.1,-0.022693,-0.04734
1,2024-01-01 01:00:00,758.383333,-1.533333,64.0,0.283333,-0.113825,0.011167
2,2024-01-01 02:00:00,758.383333,-1.15,61.333333,0.2,-0.169286,-0.061781
3,2024-01-01 03:00:00,758.783333,-1.166667,58.666667,0.5,0.142045,-0.39769
4,2024-01-01 04:00:00,759.0,-1.483333,62.166667,0.966667,-0.51259,-0.609574


In [8]:
# Add time-derived columns via the project helper; the preview that follows
# shows hour, month, day_of_week, hour_sin and hour_cos appended to the frame.
# NOTE(review): df_clean is rebound here, so re-running this cell feeds the
# already-augmented frame back into add_time_features — confirm that is safe.
df_clean = add_time_features(df_clean)

In [9]:
# Preview the first five rows after the time features have been added.
df_clean.head(n=5)

Unnamed: 0,datetime,pressure,temperature,humidity,wind_speed,u,v,hour,month,day_of_week,hour_sin,hour_cos
0,2024-01-01 00:00:00,758.466667,-1.016667,62.166667,0.1,-0.022693,-0.04734,0,1,0,0.0,1.0
1,2024-01-01 01:00:00,758.383333,-1.533333,64.0,0.283333,-0.113825,0.011167,1,1,0,0.258819,0.965926
2,2024-01-01 02:00:00,758.383333,-1.15,61.333333,0.2,-0.169286,-0.061781,2,1,0,0.5,0.866025
3,2024-01-01 03:00:00,758.783333,-1.166667,58.666667,0.5,0.142045,-0.39769,3,1,0,0.707107,0.707107
4,2024-01-01 04:00:00,759.0,-1.483333,62.166667,0.966667,-0.51259,-0.609574,4,1,0,0.866025,0.5


In [11]:
# Columns passed to the feature helper as `target_cols`
# (note that wind_speed is not in this list).
target_cols = [
    'pressure',
    'temperature',
    'humidity',
    'u',
    'v',
]

# NOTE(review): df_clean is rebound here as well, so re-running this cell
# passes the already-augmented frame back into the helper — confirm that
# repeated application is harmless.
df_clean = add_lag_and_rolling_features(
    df_clean,
    target_cols=target_cols,
)

In [12]:
# Preview the first five rows after feature generation; the earliest rows
# show NaN in the rolling-window columns.
df_clean.head(n=5)

Unnamed: 0,datetime,pressure,temperature,humidity,wind_speed,u,v,hour,month,day_of_week,...,v_roll_min_3,v_roll_max_3,v_roll_mean_6,v_roll_std_6,v_roll_min_6,v_roll_max_6,v_roll_mean_12,v_roll_std_12,v_roll_min_12,v_roll_max_12
0,2024-01-01 00:00:00,758.466667,-1.016667,62.166667,0.1,-0.022693,-0.04734,0,1,0,...,,,,,,,,,,
1,2024-01-01 01:00:00,758.383333,-1.533333,64.0,0.283333,-0.113825,0.011167,1,1,0,...,,,,,,,,,,
2,2024-01-01 02:00:00,758.383333,-1.15,61.333333,0.2,-0.169286,-0.061781,2,1,0,...,-0.061781,0.011167,,,,,,,,
3,2024-01-01 03:00:00,758.783333,-1.166667,58.666667,0.5,0.142045,-0.39769,3,1,0,...,-0.39769,0.011167,,,,,,,,
4,2024-01-01 04:00:00,759.0,-1.483333,62.166667,0.966667,-0.51259,-0.609574,4,1,0,...,-0.609574,-0.061781,,,,,,,,
