In [21]:
# %% [markdown]
# # DeepAR for Ocean‑Wave Forecasting ‑ v2 (namedtuple + time_idx standardised)

# %% 0️⃣ imports & seeds
import warnings, numpy as np, pandas as pd, torch
import matplotlib.pyplot as plt
from pathlib import Path
from collections import namedtuple
import lightning as pl
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint
import pytorch_forecasting as ptf
from pytorch_forecasting import TimeSeriesDataSet, DeepAR
from pytorch_forecasting.metrics import MAE
from sktime.split import temporal_train_test_split

from oceanwave_forecast import data_manager, data_pipeline, forecasting_utils, config, mlflow_utils, training
import importlib
importlib.reload(data_manager)
importlib.reload(data_pipeline)
importlib.reload(forecasting_utils)
importlib.reload(config)
importlib.reload(mlflow_utils)
importlib.reload(training)

from collections import namedtuple

pl.seed_everything(42, workers=True)
warnings.filterwarnings("ignore")

# %% 1️⃣ feature configuration -------------------------------------------------
FeatureConfig = namedtuple(
    "FeatureConfig",
    [
        "target",
        "index_cols",
        "static_categoricals",
        "static_reals",
        "time_varying_known_categoricals",
        "time_varying_known_reals",
        "time_varying_unknown_reals",
        "group_ids",
    ],
)

feat_cfg = FeatureConfig(
    target              = "Hs",                              # <- main forecast target
    index_cols          = ["series", "timestamp"],           # timestamp + series ID
    static_categoricals = ["series"],                        # ocean buoy ID
    static_reals        = [],
    time_varying_known_categoricals = [],                    # e.g. holiday flags
    time_varying_known_reals        = ["time_idx"],          # we always know time
    time_varying_unknown_reals      = [],                    # filled later (lags & exog)
    group_ids           = ["series"],
)

# %% 2️⃣ load & pre‑process raw data ------------------------------------------
raw_path   = config.RAW_DATA_DIR / "Standard meteorological data 2024" / "46088h2024.txt"
df_raw     = data_manager.extract_raw_data(raw_path)
df_clean   = data_pipeline.preprocess_ocean_data(df_raw)
df_clean   = df_clean.loc[config.START_DATE : config.END_DATE]

# split target & features
Y          = df_clean[config.TARGETS]
X          = df_clean.drop(columns=config.TARGETS)

# %% 3️⃣ initial train / test split (test = 3×horizon) -------------------------
y_train, y_test, X_train, X_test = temporal_train_test_split(
    y=Y, X=X, test_size=config.HORIZON * 3
)

# %% 4️⃣ scaling pipelines (removes NaNs, redundant cols) ---------------------
pipe_X, pipe_Y = data_pipeline.get_pipelines(list(X_train.columns))
X_train_transformed  = pipe_X.fit_transform(X_train)
X_test_transformed   = pipe_X.transform(X_test)
y_train_transformed  = pipe_Y.fit_transform(y_train)
y_test_transformed   = pipe_Y.transform(y_test)


[32m2025-07-21 03:40:33.137[0m | [1mINFO    [0m | [36moceanwave_forecast.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: D:\CML\Term 8\ML projects\forecasting_workspace\oceanwave_forecast[0m
Global seed set to 42


DataFrame shape: (52650, 13)

Info:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 52650 entries, 2024-01-01 00:00:00 to 2024-12-31 23:50:00
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   WDIR    52650 non-null  float64
 1   WSPD    52650 non-null  float64
 2   GST     52650 non-null  float64
 3   WVHT    52650 non-null  float64
 4   DPD     52650 non-null  float64
 5   APD     52650 non-null  float64
 6   MWD     52650 non-null  float64
 7   PRES    52650 non-null  float64
 8   ATMP    52650 non-null  float64
 9   WTMP    52650 non-null  float64
 10  DEWP    52650 non-null  float64
 11  VIS     52650 non-null  float64
 12  TIDE    52650 non-null  float64
dtypes: float64(13)
memory usage: 5.6 MB

Descriptive statistics:
               WDIR          WSPD           GST          WVHT           DPD  \
count  52650.000000  52650.000000  52650.000000  52650.000000  52650.000000   
mean     194.421026      4.962283      6.

In [None]:
# Note that none of the dataframes X or y has timeindex cols. it is datatimeindexed but htere is not dedicated column for it.


def tidy_long_df(X, y, buoy_id="buoy_46088"):
    """
    Merge features + targets and give PyTorch‑Forecasting its three required columns:
      • series     – constant 'buoy_46088'
      • time_idx   – 0…N counter per series (int64, contiguous)
      • target     – numeric columns to predict
    """
    # 1) concat horizontally, then pull the datetime index into a column
    df = pd.concat([X, y], axis=1).reset_index()  
    # 2) rename the datetime column
    df.rename(columns={"datetime": "timestamp"}, inplace=True)
    # 3) assign the single-series ID
    df["series"]   = buoy_id
    # 4) zero‑based time index
    df["time_idx"] = np.arange(len(df), dtype=np.int64)
    return df

# build your full_df with vertical concat on X and y, then tidy:
full_df = tidy_long_df(
    pd.concat([X_train_transformed, X_test_transformed]),
    pd.concat([y_train_transformed, y_test_transformed])
)


In [None]:
max_pred_len   = config.HORIZON
enc_len        = config.WINDOW
n_test_steps   = max_pred_len * 3        # same logic you used

cutoff         = full_df.time_idx.max() - n_test_steps
train_cutoff   = cutoff - max_pred_len   # keep 1 horizon for validation

train_df = full_df[full_df.time_idx <= train_cutoff]
val_df   = full_df[(full_df.time_idx > train_cutoff) & (full_df.time_idx <= cutoff)]
test_df  = full_df[full_df.time_idx >  cutoff]

print(train_df.shape, val_df.shape, test_df.shape)


(4080, 16) (72, 16) (216, 16)


In [40]:
print(train_df.tail())

               timestamp      WSPD       GST      PRES      ATMP      WTMP  \
4075 2024-06-18 19:00:00 -1.301136 -1.296310  0.440481  1.223932  2.686325   
4076 2024-06-18 20:00:00 -1.385455 -1.393711  0.409650  1.389521  2.661535   
4077 2024-06-18 21:00:00 -1.016558 -1.097275  0.368541  1.342210  2.364054   
4078 2024-06-18 22:00:00 -0.821569 -0.923648  0.302767  1.300813  2.016993   
4079 2024-06-18 23:00:00 -0.510641 -0.589099  0.247271  1.241674  3.008596   

          DEWP  WDIR_sin  WDIR_cos   MWD_sin   MWD_cos      series      WVHT  \
4075  1.068871  0.826631 -1.428777 -0.693789 -0.624926  buoy_46088  0.059369   
4076  1.082758  0.370505 -1.179541 -0.710523 -0.595855  buoy_46088 -0.421368   
4077  1.179967 -1.034299 -0.436354 -0.706679 -0.595855  buoy_46088 -0.639885   
4078  1.267918 -1.071429 -0.303864 -0.706724 -0.595855  buoy_46088 -0.712724   
4079  1.323466 -1.047234 -0.426575 -0.706769 -0.595855  buoy_46088 -0.756427   

           APD      series  time_idx  
4075  0.979

In [None]:
from pytorch_forecasting import TimeSeriesDataSet
from pytorch_forecasting.data.encoders import GroupNormalizer, MultiNormalizer

# build one GroupNormalizer *per* target
normalizers = [
    GroupNormalizer(
        groups=["series"],
        transformation="softplus"
    )
    for _ in config.TARGETS
]

common = dict(
    time_idx                   = "time_idx",
    target                     = config.TARGETS,
    group_ids                  = ["series"],
    time_varying_known_reals   = ["time_idx"],
    time_varying_unknown_reals = [c for c in X_train_transformed.columns if c not in {"series"}],
    static_categoricals        = ["series"],
    max_encoder_length         = enc_len,
    max_prediction_length      = max_pred_len,
    # <-- use MultiNormalizer here:
    target_normalizer          = MultiNormalizer(normalizers),
    allow_missing_timesteps    = True,
)

train_ds = TimeSeriesDataSet(train_df, **common)
val_ds   = TimeSeriesDataSet.from_dataset(train_ds, val_df)
test_ds  = TimeSeriesDataSet.from_dataset(
    train_ds, test_df,
    predict=True, stop_randomization=True
)


ValueError: The column label 'series' is not unique.

In [39]:
from pytorch_forecasting import TimeSeriesDataSet
from pytorch_forecasting.data.encoders import GroupNormalizer, MultiNormalizer

# build one GroupNormalizer *per* target
normalizers = [
    GroupNormalizer(
        groups=["series"],
        transformation="softplus"
    )
    for _ in config.TARGETS
]

common = dict(
    time_idx                   = "time_idx",
    target                     = config.TARGETS,
    group_ids                  = ["series"],
    time_varying_known_reals   = ["time_idx"],
    time_varying_unknown_reals = [c for c in X_train_transformed.columns if c not in {"series"}],
    static_categoricals        = ["series"],
    max_encoder_length         = enc_len,
    max_prediction_length      = max_pred_len,
    # <-- use MultiNormalizer here:
    target_normalizer          = MultiNormalizer(normalizers),
    allow_missing_timesteps    = True,
)

train_ds = TimeSeriesDataSet(train_df, **common)
val_ds   = TimeSeriesDataSet.from_dataset(train_ds, val_df)
test_ds  = TimeSeriesDataSet.from_dataset(
    train_ds, test_df,
    predict=True, stop_randomization=True
)


ValueError: The column label 'series' is not unique.

In [None]:
batch   = 64

train_loader = train_ds.to_dataloader(train=True,  batch_size=batch, num_workers=4)
val_loader   = val_ds  .to_dataloader(train=False, batch_size=batch)
test_loader  = test_ds .to_dataloader(train=False, batch_size=batch)


In [None]:
net = DeepAR.from_dataset(train_ds, hidden_size=64, rnn_layers=2)
trainer.fit(net, train_loader, val_loader)
