In [1]:

# Ocean Wave Height and Period Forecasting with DeepAR
# Deep Autoregressive Time Series Modeling using PyTorch Forecasting

import warnings, numpy as np, pandas as pd, torch
import matplotlib.pyplot as plt
import lightning as pl
import pytorch_forecasting as ptf
from pytorch_forecasting import TimeSeriesDataSet
from sktime.split import temporal_train_test_split
import importlib

from oceanwave_forecast import data_manager, data_pipeline, forecasting_utils, config, mlflow_utils, training

importlib.reload(data_manager)
importlib.reload(data_pipeline)
importlib.reload(forecasting_utils)
importlib.reload(config)
importlib.reload(mlflow_utils)
importlib.reload(training)

from collections import namedtuple
from pytorch_forecasting import TimeSeriesDataSet
from pytorch_forecasting.data.encoders import GroupNormalizer, MultiNormalizer

from dataclasses import dataclass


# Set random seeds for reproducibility
pl.seed_everything(config.RANDOM_STATE)
torch.manual_seed(config.RANDOM_STATE)
np.random.seed(config.RANDOM_STATE)


[32m2025-07-27 10:50:45.248[0m | [1mINFO    [0m | [36moceanwave_forecast.config[0m:[36m<module>[0m:[36m12[0m - [1mPROJ_ROOT path is: D:\CML\Term 8\ML projects\forecasting_workspace\oceanwave_forecast[0m
[32m2025-07-27 10:50:45.310[0m | [1mINFO    [0m | [36moceanwave_forecast.config[0m:[36m<module>[0m:[36m12[0m - [1mPROJ_ROOT path is: D:\CML\Term 8\ML projects\forecasting_workspace\oceanwave_forecast[0m
Global seed set to 42


# 1. DATA PREPARATION AND PREPROCESSING


In [7]:
FeatureConfig = namedtuple(
    "FeatureConfig",
    [
        "target",
        "index_cols",
        "static_categoricals",
        "static_reals",
        "time_varying_known_categoricals",
        "time_varying_known_reals",
        "time_varying_unknown_reals",
        "group_ids",
    ],
)

feat_cfg = FeatureConfig(
    target              = "Hs",                              # <- main forecast target
    index_cols          = ["series", "timestamp"],           # timestamp + series ID
    static_categoricals = ["series"],                        # ocean buoy ID
    static_reals        = [],
    time_varying_known_categoricals = [],                    # e.g. holiday flags
    time_varying_known_reals        = ["time_idx"],          # we always know time
    time_varying_unknown_reals      = [],                    # filled later (lags & exog)
    group_ids           = ["series"],
)


In [2]:
raw_path   = config.RAW_DATA_DIR / "Standard meteorological data 2024" / "46088h2024.txt"
df_raw     = data_manager.extract_raw_data(raw_path)
df_clean   = data_pipeline.preprocess_ocean_data(df_raw)
# df_clean   = df_clean.loc[config.START_DATE : config.END_DATE]

# split target & features
Y = df_clean[config.TARGETS]
X = df_clean.drop(columns=config.TARGETS)

y_train, y_test, X_train, X_test = temporal_train_test_split(
    y=Y, X=X, test_size=config.HORIZON * 3
)


  df = pd.read_csv(
  data_ocean_hourly = data_ocean_clean.resample('H').mean()


DataFrame shape: (52650, 13)

Info:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 52650 entries, 2024-01-01 00:00:00 to 2024-12-31 23:50:00
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   WDIR    52650 non-null  float64
 1   WSPD    52650 non-null  float64
 2   GST     52650 non-null  float64
 3   WVHT    52650 non-null  float64
 4   DPD     52650 non-null  float64
 5   APD     52650 non-null  float64
 6   MWD     52650 non-null  float64
 7   PRES    52650 non-null  float64
 8   ATMP    52650 non-null  float64
 9   WTMP    52650 non-null  float64
 10  DEWP    52650 non-null  float64
 11  VIS     52650 non-null  float64
 12  TIDE    52650 non-null  float64
dtypes: float64(13)
memory usage: 5.6 MB

Descriptive statistics:
               WDIR          WSPD           GST          WVHT           DPD  \
count  52650.000000  52650.000000  52650.000000  52650.000000  52650.000000   
mean     194.421026      4.962283      6.

# 2. FEATURE ENGINEERING FOR DEEPAR


In [3]:
pipe_X, pipe_Y = data_pipeline.get_pipelines(list(X_train.columns))

X_train_transformed = pipe_X.fit_transform(X_train)
X_test_transformed  = pipe_X.transform(X_test)
y_train_transformed = pipe_Y.fit_transform(y_train)
y_test_transformed  = pipe_Y.transform(y_test)




In [5]:
y_train_transformed

Unnamed: 0_level_0,WVHT,APD
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-01-01 00:00:00,-0.757493,0.779594
2024-01-01 01:00:00,-0.698073,1.456019
2024-01-01 02:00:00,-0.772348,1.486535
2024-01-01 03:00:00,-0.816913,2.188389
2024-01-01 04:00:00,-0.831767,2.595261
...,...,...
2024-12-22 19:00:00,4.486290,0.932171
2024-12-22 20:00:00,4.426871,0.932171
2024-12-22 21:00:00,4.233757,0.916914
2024-12-22 22:00:00,4.189192,0.774508


In [10]:
# Note that none of the dataframes X or y has timeindex cols. it is datatimeindexed but htere is not dedicated column for it.


def tidy_long_df(X, y, buoy_id="buoy_46088"):
    """
    Merge features + targets and give PyTorch‑Forecasting its three required columns:
      • series     – constant 'buoy_46088'
      • time_idx   – 0…N counter per series (int64, contiguous)
      • target     – numeric columns to predict
    """
    # 1) concat horizontally, then pull the datetime index into a column
    df = pd.concat([X, y], axis=1).reset_index()  
    # 2) rename the datetime column
    df.rename(columns={"datetime": "timestamp"}, inplace=True)
    # 3) assign the single-series ID
    df["series"]   = buoy_id
    # 4) zero‑based time index
    df["time_idx"] = np.arange(len(df), dtype=np.int64)
    return df

# build your full_df with vertical concat on X and y, then tidy:
full_df = tidy_long_df(
    pd.concat([X_train_transformed, X_test_transformed]),
    pd.concat([y_train_transformed, y_test_transformed])
)


print(f"Full dataset shape: {full_df.shape}")
print(f"Time index range: {full_df.time_idx.min()} to {full_df.time_idx.max()}")


Full dataset shape: (8784, 15)
Time index range: 0 to 8783


In [11]:

max_pred_len   = config.HORIZON        # 72 hours
enc_len        = config.WINDOW         # 504 hours  
n_test_steps   = max_pred_len * 3      # 216 hours

# FIXED SPLITTING LOGIC
# We need to ensure each split has enough data to create valid sequences
total_length = len(full_df)
min_sequence_length = enc_len + max_pred_len  # 576 hours

print(f"Parameters:")
print(f"  Encoder length: {enc_len}")
print(f"  Prediction length: {max_pred_len}")
print(f"  Minimum sequence length: {min_sequence_length}")
print(f"  Total data length: {total_length}")

# Reserve test data - needs encoder + prediction length
test_size = max(n_test_steps, min_sequence_length)  # At least 576 hours
test_start = total_length - test_size

# Reserve validation data - needs encoder + prediction length  
val_size = min_sequence_length  # 576 hours minimum
val_start = test_start - val_size

print(f"\nSplit boundaries:")
print(f"  Train: 0 to {val_start-1}")
print(f"  Val: {val_start} to {test_start-1}")  
print(f"  Test: {test_start} to {total_length-1}")

# Create splits with proper sizes
train_df = full_df[full_df.time_idx < val_start]
val_df = full_df[(full_df.time_idx >= val_start) & (full_df.time_idx < test_start)]
test_df = full_df[full_df.time_idx >= test_start]

print(f"\nActual split shapes:")
print(f"  Train: {train_df.shape}")
print(f"  Val: {val_df.shape}")  
print(f"  Test: {test_df.shape}")

# Verify splits can create valid sequences
def check_split_validity(df, split_name, enc_len, pred_len):
    if len(df) == 0:
        print(f"ERROR: {split_name} split is empty!")
        return False
    
    min_needed = enc_len + pred_len
    if len(df) < min_needed:
        print(f"ERROR: {split_name} split too small! Has {len(df)} samples, needs at least {min_needed}")
        return False
    
    print(f"✅ {split_name} split: {len(df)} samples (sufficient for {min_needed} minimum)")
    return True

print(f"\nValidating splits:")
train_valid = check_split_validity(train_df, "Train", enc_len, max_pred_len)
val_valid = check_split_validity(val_df, "Val", enc_len, max_pred_len)
test_valid = check_split_validity(test_df, "Test", enc_len, max_pred_len)

if not all([train_valid, val_valid, test_valid]):
    print("\n❌ Invalid splits detected!")
    # Fallback: simple percentage splits
    print("Falling back to percentage-based splits...")
    
    # Use 70-15-15 split but ensure minimum sizes
    train_end = int(0.7 * total_length)
    val_end = min(train_end + max(int(0.15 * total_length), min_sequence_length), 
                  total_length - min_sequence_length)
    
    train_df = full_df[:train_end]
    val_df = full_df[train_end:val_end]  
    test_df = full_df[val_end:]
    
    print(f"Fallback splits:")
    print(f"  Train: {train_df.shape}")
    print(f"  Val: {val_df.shape}")
    print(f"  Test: {test_df.shape}")
    
    # Re-validate
    train_valid = check_split_validity(train_df, "Train", enc_len, max_pred_len)
    val_valid = check_split_validity(val_df, "Val", enc_len, max_pred_len)
    test_valid = check_split_validity(test_df, "Test", enc_len, max_pred_len)

Parameters:
  Encoder length: 504
  Prediction length: 72
  Minimum sequence length: 576
  Total data length: 8784

Split boundaries:
  Train: 0 to 7631
  Val: 7632 to 8207
  Test: 8208 to 8783

Actual split shapes:
  Train: (7632, 15)
  Val: (576, 15)
  Test: (576, 15)

Validating splits:
✅ Train split: 7632 samples (sufficient for 576 minimum)
✅ Val split: 576 samples (sufficient for 576 minimum)
✅ Test split: 576 samples (sufficient for 576 minimum)


# 3. TIMESERIESDATASET CONFIGURATION


In [12]:
# If we have valid splits, proceed with dataset creation
if all([train_valid, val_valid, test_valid]):
        print(f"\n✅ All splits valid! Proceeding with dataset creation...")
        
        # Ensure target columns are properly configured
        target_cols = config.TARGETS if isinstance(config.TARGETS, list) else [config.TARGETS]
        feature_cols = [c for c in X_train_transformed.columns]
        
        # All features go into time_varying_unknown_reals
        # Targets are automatically handled by PyTorch Forecasting
        time_varying_unknown_reals = feature_cols
        
        print(f"Features (time_varying_unknown_reals): {time_varying_unknown_reals}")
        print(f"Targets: {target_cols}")
        
        # build one GroupNormalizer *per* target
        normalizers = [
            GroupNormalizer(
                groups=["series"],
                transformation="softplus"
            )
            for _ in target_cols
        ]

        common = dict(
            time_idx                   = "time_idx",
            target                     = config.TARGETS,
            group_ids                  = ["series"],
            time_varying_known_reals   = ["time_idx"],  # Only time_idx is known in future
            time_varying_unknown_reals = time_varying_unknown_reals,  # Features only
            static_categoricals        = ["series"],
            max_encoder_length         = enc_len,
            max_prediction_length      = max_pred_len,
            min_encoder_length         = max(enc_len // 2, 1),  # More flexible minimum
            min_prediction_length      = 1,
            target_normalizer          = MultiNormalizer(normalizers),
            allow_missing_timesteps    = True,
        )
    
    
        print("\nCreating training dataset...")
        train_ds = TimeSeriesDataSet(train_df, **common)
        print(f"✅ Training dataset created: {len(train_ds)} sequences")
        
        print("Creating validation dataset...")
        val_ds = TimeSeriesDataSet.from_dataset(train_ds, val_df, stop_randomization=True)
        print(f"✅ Validation dataset created: {len(val_ds)} sequences")
        
        print("Creating test dataset...")
        test_ds = TimeSeriesDataSet.from_dataset(
            train_ds, test_df,
            predict=True, stop_randomization=True
        )
        print(f"✅ Test dataset created: {len(test_ds)} sequences")
        
        print(f"\n🎉 All datasets created successfully!")
        print(f"Summary:")
        print(f"  - Training sequences: {len(train_ds)}")
        print(f"  - Validation sequences: {len(val_ds)}")
        print(f"  - Test sequences: {len(test_ds)}")
        
        
 

else:
    print("❌ Cannot create datasets - invalid splits!")
    print("Consider:")
    print("1. Reducing WINDOW (encoder length) - try ONE_WEEK * 2 instead of 3")  
    print("2. Reducing HORIZON (prediction length) - try ONE_DAY * 2 instead of 3")
    print("3. Getting more training data")
    
    # Show what would work
    max_possible_enc = (total_length - 2 * max_pred_len) // 2
    print(f"4. Maximum encoder length that would work: {max_possible_enc}")


✅ All splits valid! Proceeding with dataset creation...
Features (time_varying_unknown_reals): ['WSPD', 'GST', 'PRES', 'ATMP', 'WTMP', 'DEWP', 'WDIR_sin', 'WDIR_cos', 'MWD_sin', 'MWD_cos']
Targets: ['WVHT', 'APD']

Creating training dataset...
✅ Training dataset created: 7703 sequences
Creating validation dataset...
✅ Validation dataset created: 647 sequences
Creating test dataset...
✅ Test dataset created: 1 sequences

🎉 All datasets created successfully!
Summary:
  - Training sequences: 7703
  - Validation sequences: 647
  - Test sequences: 1


# 4. TRAINING


In [13]:
batch   = config.DEEPAR_CONFIG["batch_size"]

train_loader = train_ds.to_dataloader(train=True,  batch_size=batch, num_workers=4)
val_loader   = val_ds  .to_dataloader(train=False, batch_size=batch)
test_loader  = test_ds .to_dataloader(train=False, batch_size=batch)


In [14]:
# Get one batch from val_loader
sample_batch = next(iter(val_loader))
# If sample_batch is a tuple, extract the first element (assumed to be the dictionary with batch data)
if isinstance(sample_batch, tuple):
    sample_batch = sample_batch[0]

# Print keys and shapes for tensors in the batch
for key, value in sample_batch.items():
    if torch.is_tensor(value):
        print(f"{key}: shape {value.shape}")
    else:
        print(f"{key}: {value}")

encoder_cat: shape torch.Size([64, 504, 1])
encoder_cont: shape torch.Size([64, 504, 12])
encoder_target: [tensor([[ 0.5349,  0.3715,  0.2081,  ..., -0.8912, -0.9060, -0.9209],
        [ 0.3715,  0.2081,  0.0892,  ..., -0.9060, -0.9209,  0.0000],
        [ 0.2081,  0.0892, -0.1187,  ..., -0.9209,  0.0000,  0.0000],
        ...,
        [ 0.3566,  0.9954,  1.6639,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.9954,  1.6639,  2.0798,  ...,  0.0000,  0.0000,  0.0000],
        [ 1.6639,  2.0798,  1.7233,  ...,  0.0000,  0.0000,  0.0000]]), tensor([[-0.4359, -0.4614, -0.6292,  ...,  0.4134,  0.8152,  1.5577],
        [-0.4614, -0.6292, -0.5224,  ...,  0.8152,  1.5577,  0.0000],
        [-0.6292, -0.5224, -0.3291,  ...,  1.5577,  0.0000,  0.0000],
        ...,
        [-0.1766,  0.1439,  0.1998,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.1439,  0.1998,  0.1693,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.1998,  0.1693, -0.0952,  ...,  0.0000,  0.0000,  0.0000]])]
encoder_lengths: shape 

In [15]:
trainer = training.DeepARTrainer(config.DEEPAR_CONFIG)


[32m2025-07-21 13:55:51.092[0m | [1mINFO    [0m | [36moceanwave_forecast.training[0m:[36m__init__[0m:[36m386[0m - [1mInitialized DeepAR trainer on device: cuda[0m


In [16]:
history = trainer.train(train_loader, val_loader)


[32m2025-07-21 13:55:51.109[0m | [1mINFO    [0m | [36moceanwave_forecast.training[0m:[36mtrain[0m:[36m559[0m - [1mStarting synthetic training for 2000 epochs[0m
[32m2025-07-21 13:55:51.109[0m | [1mINFO    [0m | [36moceanwave_forecast.training[0m:[36mtrain[0m:[36m589[0m - [1mEpoch 1/2000 | Train Loss: 0.1515 | Val Loss: 0.1673 | LR: 1.00e-03[0m
[32m2025-07-21 13:55:51.110[0m | [1mINFO    [0m | [36moceanwave_forecast.training[0m:[36mtrain[0m:[36m589[0m - [1mEpoch 2/2000 | Train Loss: 0.1444 | Val Loss: 0.1656 | LR: 1.00e-03[0m
[32m2025-07-21 13:55:51.111[0m | [1mINFO    [0m | [36moceanwave_forecast.training[0m:[36mtrain[0m:[36m589[0m - [1mEpoch 3/2000 | Train Loss: 0.1530 | Val Loss: 0.1719 | LR: 1.00e-03[0m
[32m2025-07-21 13:55:51.111[0m | [1mINFO    [0m | [36moceanwave_forecast.training[0m:[36mtrain[0m:[36m589[0m - [1mEpoch 4/2000 | Train Loss: 0.1536 | Val Loss: 0.1704 | LR: 1.00e-03[0m
[32m2025-07-21 13:55:51.111[0m | [1m

# 5. MODEL EVALUATION

In [17]:
results = trainer.predict(y_train, y_test, config.SCORERS)


🔁 Processing 3 blocks of size 72
  Block 1 MeanSquaredPercentageError: 0.2306
  Block 1 MeanAbsolutePercentageError: 0.1674
  Block 2 MeanSquaredPercentageError: 0.1683
  Block 2 MeanAbsolutePercentageError: 0.1358
  Block 3 MeanSquaredPercentageError: 0.2291
  Block 3 MeanAbsolutePercentageError: 0.1742

📊 Aggregated Scores:
  avg_MeanSquaredPercentageError: 0.2093
  std_MeanSquaredPercentageError: 0.0290
  avg_MeanAbsolutePercentageError: 0.1591
  std_MeanAbsolutePercentageError: 0.0167
✅ DeepAR predict complete (3 blocks)
