In [12]:

# Ocean Wave Height and Period Forecasting with DeepAR
# Deep Autoregressive Time Series Modeling using PyTorch Forecasting

import warnings, numpy as np, pandas as pd, torch
import matplotlib.pyplot as plt
from pathlib import Path
import lightning as pl
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint
import pytorch_forecasting as ptf
from pytorch_forecasting import TimeSeriesDataSet, DeepAR
from pytorch_forecasting.metrics import MAE, RMSE
from sktime.split import temporal_train_test_split
import importlib

from oceanwave_forecast import data_manager, data_pipeline, forecasting_utils, config, mlflow_utils, training

importlib.reload(data_manager)
importlib.reload(data_pipeline)
importlib.reload(forecasting_utils)
importlib.reload(config)
importlib.reload(mlflow_utils)
importlib.reload(training)

from collections import namedtuple



# Set random seeds for reproducibility
pl.seed_everything(config.RANDOM_STATE)
torch.manual_seed(config.RANDOM_STATE)
np.random.seed(config.RANDOM_STATE)


[32m2025-07-21 02:57:21.447[0m | [1mINFO    [0m | [36moceanwave_forecast.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: D:\CML\Term 8\ML projects\forecasting_workspace\oceanwave_forecast[0m
Global seed set to 42


# 1. DATA PREPARATION AND PREPROCESSING


In [15]:
FeatureConfig = namedtuple(
    "FeatureConfig",
    [
        "target",
        "index_cols",
        "static_categoricals",
        "static_reals",
        "time_varying_known_categoricals",
        "time_varying_known_reals",
        "time_varying_unknown_reals",
        "group_ids",
    ],
)

feat_cfg = FeatureConfig(
    target              = "Hs",                              # <- main forecast target
    index_cols          = ["series", "timestamp"],           # timestamp + series ID
    static_categoricals = ["series"],                        # ocean buoy ID
    static_reals        = [],
    time_varying_known_categoricals = [],                    # e.g. holiday flags
    time_varying_known_reals        = ["time_idx"],          # we always know time
    time_varying_unknown_reals      = [],                    # filled later (lags & exog)
    group_ids           = ["series"],
)


In [18]:
raw_path   = config.RAW_DATA_DIR / "Standard meteorological data 2024" / "46088h2024.txt"
df_raw     = data_manager.extract_raw_data(raw_path)
df_clean   = data_pipeline.preprocess_ocean_data(df_raw)
df_clean   = df_clean.loc[config.START_DATE : config.END_DATE]

# split target & features
Y = df_clean[config.TARGETS]
X = df_clean.drop(columns=config.TARGETS)

y_train, y_test, X_train, X_test = temporal_train_test_split(
    y=Y, X=X, test_size=config.HORIZON * 3
)


DataFrame shape: (52650, 13)

Info:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 52650 entries, 2024-01-01 00:00:00 to 2024-12-31 23:50:00
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   WDIR    52650 non-null  float64
 1   WSPD    52650 non-null  float64
 2   GST     52650 non-null  float64
 3   WVHT    52650 non-null  float64
 4   DPD     52650 non-null  float64
 5   APD     52650 non-null  float64
 6   MWD     52650 non-null  float64
 7   PRES    52650 non-null  float64
 8   ATMP    52650 non-null  float64
 9   WTMP    52650 non-null  float64
 10  DEWP    52650 non-null  float64
 11  VIS     52650 non-null  float64
 12  TIDE    52650 non-null  float64
dtypes: float64(13)
memory usage: 5.6 MB

Descriptive statistics:
               WDIR          WSPD           GST          WVHT           DPD  \
count  52650.000000  52650.000000  52650.000000  52650.000000  52650.000000   
mean     194.421026      4.962283      6.

# 2. FEATURE ENGINEERING FOR DEEPAR


In [22]:
pipe_X, pipe_Y = data_pipeline.get_pipelines(list(X_train.columns))

X_train_transformed = pipe_X.fit_transform(X_train)
X_test_transformed  = pipe_X.transform(X_test)
y_train_transformed = pipe_Y.fit_transform(y_train)
y_test_transformed  = pipe_Y.transform(y_test)


In [27]:
for _df in (X_train_transformed, X_test_transformed, y_train_transformed):
    _df["series"] = "buoy_46088"


In [28]:
def build_time_idx(train_df, test_df, ts_col="timestamp", id_col="series"):
    train_df["train"] = True
    test_df["train"]  = False
    data  = pd.concat([train_df, test_df], sort=False)
    data["time_idx_raw"] = data[ts_col].astype("int64")           # ns since epoch
    diff  = data["time_idx_raw"].iloc[1] - data["time_idx_raw"].iloc[0]
    data["_min_idx"] = data.groupby(id_col)["time_idx_raw"].transform("min")
    data["time_idx"] = ((data["time_idx_raw"] - data["_min_idx"]) / diff).astype(int)
    data.drop(columns=["_min_idx", "time_idx_raw"], inplace=True)
    train_df = data.loc[data.train].drop(columns="train")
    test_df  = data.loc[~data.train].drop(columns="train")
    return train_df, test_df

# Remove the duplicate 'series' column from y before joining
y_train_transformed = y_train_transformed.drop(columns="series")



Xy_train = X_train_transformed.join(y_train_transformed)
Xy_test  = X_test_transformed.join(y_test_transformed)
train_df, base_test_df = build_time_idx(
    Xy_train.reset_index().rename(columns={"index": "timestamp"}),
    Xy_test.reset_index(). rename(columns={"index": "timestamp"})
)

KeyError: 'timestamp'

In [None]:
history_hours = 48 * 2
val_hours     = 48

cutoff_ts   = train_df.timestamp.max() - pd.Timedelta(val_hours/48, "D")
hist_cutoff = train_df.timestamp.max() - pd.Timedelta((history_hours+val_hours)/48, "D")

val_history = train_df[(train_df.timestamp >= hist_cutoff) & (train_df.timestamp <= cutoff_ts)]
val_df      = train_df[train_df.timestamp >  cutoff_ts]
train_df    = train_df[train_df.timestamp <= cutoff_ts]

# 3. TIMESERIESDATASET CONFIGURATION


In [None]:

# Training cutoff
static_cat   = ["series", "tgt_name"]
known_reals  = ["time_idx"]                       # always known into future
unknown_reals = X_train_transformed.columns.tolist()           # your exog lags & calendar feats

max_enc_len  = config.WINDOW             # history
max_pred_len = config.HORIZON                     # horizon

train_ds = TimeSeriesDataSet(
    train_df,
    time_idx="time_idx",
    target="target",
    group_ids=static_cat,
    static_categoricals=static_cat,
    time_varying_known_reals=known_reals,
    time_varying_unknown_reals=unknown_reals,
    max_encoder_length=max_enc_len,
    max_prediction_length=max_pred_len,
    min_encoder_length=24,
    target_normalizer=ptf.GroupNormalizer(groups=static_cat, transformation="softplus"),
    allow_missing_timesteps=True,
)

val_ds = TimeSeriesDataSet.from_dataset(train_ds, test_df, predict=True, stop_randomization=True)

train_loader = train_ds.to_dataloader(train=True,  batch_size=64, num_workers=4)
val_loader   = val_ds.to_dataloader(  train=False, batch_size=64, num_workers=0)

# 4. CREATE TIMESERIESDATASET


In [None]:
# Prepare categorical encoders
categorical_encoders = {}
for cat_col in static_categoricals + time_varying_known_categoricals:
    if cat_col in deepar_data.columns:
        categorical_encoders[cat_col] = NaNLabelEncoder().fit(deepar_data[cat_col].astype(str))

# Create training dataset
training = TimeSeriesDataSet(
    deepar_data[deepar_data.time_idx <= training_cutoff],
    time_idx="time_idx",
    target=config.TARGETS[0],  # Primary target (e.g., wave height)
    group_ids=["series"],
    
    # Categorical features
    static_categoricals=static_categoricals,
    time_varying_known_categoricals=time_varying_known_categoricals,
    categorical_encoders=categorical_encoders,
    
    # Continuous features  
    time_varying_known_reals=time_varying_known_reals,
    time_varying_unknown_reals=time_varying_unknown_reals,
    
    # Sequence lengths
    max_encoder_length=MAX_ENCODER_LENGTH,
    max_prediction_length=MAX_PREDICTION_LENGTH,
    min_encoder_length=MIN_ENCODER_LENGTH,
    min_prediction_length=MIN_PREDICTION_LENGTH,
    
    # Normalization - using GroupNormalizer for better stability
    target_normalizer=GroupNormalizer(
        groups=["series"], 
        transformation="softplus",
        center=True
    ),
    
    # Handle missing values
    allow_missing_timesteps=True,
    
    # Add encoder length as feature (helps model understand sequence position)
    add_encoder_length=True,
    
    # Add relative time index 
    add_relative_time_idx=True,
    
    # Add target scales as features
    add_target_scales=True,
    
    # Randomize length for regularization
    randomize_length=(0.1, 0.2),
)

# Create validation dataset
validation = TimeSeriesDataSet.from_dataset(
    training, 
    deepar_data, 
    min_prediction_idx=training_cutoff + 1,
    stop_randomization=True
)

print(f"Training dataset: {len(training)} samples")
print(f"Validation dataset: {len(validation)} samples")
