In [1]:
# Ensure project root is on sys.path so `import src...` works
import sys
from pathlib import Path

root = Path.cwd().resolve()
# If running from inside notebooks/ adjust to parent directory containing src
if not (root / 'src').exists():
    candidate = root.parent
    if (candidate / 'src').exists():
        root = candidate
# Prepend if missing
root_str = str(root)
if root_str not in sys.path:
    sys.path.insert(0, root_str)
print('Added to sys.path:', root_str)
print('Current working directory:', Path.cwd())

Added to sys.path: /Users/zak/Repos/E-commerce-Demand-Forecasting
Current working directory: /Users/zak/Repos/E-commerce-Demand-Forecasting/notebooks


# N-BEATS Training Notebook

Train the minimal N-BEATS implementation on the panel parquet subset.

## Objectives
1. Load processed panel data (item_id, date, demand).
2. Create sliding window dataset (input_length -> forecast_length).
3. Train N-BEATS LightningModule for a few epochs.
4. Compute validation metrics (MAE, WAPE).
5. Save checkpoint + metrics artifacts.

If the panel file is missing, a synthetic dataset will be generated so the pipeline can run end-to-end.

In [2]:
# Imports and environment checks (single accelerator definition)
import os, json, math
from pathlib import Path
import pandas as pd
import torch
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from src.models.nbeats_module import NBeatsModule, NBeatsConfig
from src.data.dataset_nbeats import PanelForecastDataset, PanelWindowConfig, split_dataset

# Device & accelerator selection (Apple Silicon, CUDA, CPU)
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
    accelerator = 'mps'
    backend_note = "Using Apple Silicon MPS backend"
elif torch.cuda.is_available():
    device = torch.device("cuda")
    accelerator = 'gpu'
    backend_note = f"Using CUDA GPU: {torch.cuda.get_device_name(0)}"
else:
    device = torch.device("cpu")
    accelerator = 'cpu'
    backend_note = "Falling back to CPU"

print('PyTorch version:', torch.__version__)
print('Lightning version:', pl.__version__)
print('Device:', device)
print('Accelerator:', accelerator)
print('Backend note:', backend_note)


PyTorch version: 2.8.0
Lightning version: 2.5.5
Device: mps
Accelerator: mps
Backend note: Using Apple Silicon MPS backend


In [16]:
# Configuration parameters (aggressive compute - batch 512)
PANEL_PATH = Path('data/processed/m5_panel_subset.parquet')
ARTIFACTS_DIR = Path('artifacts/models')
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
INPUT_LENGTH = 28 * 4  # 112 days lookback
FORECAST_LENGTH = 30
BATCH_SIZE = 512  # increased substantially
EPOCHS = 30
LEARNING_RATE = 1e-3
NUM_STACKS = 3
BLOCKS_PER_STACK = 3
LAYER_WIDTH = 768
N_LAYERS = 4
DROPOUT = 0.05
MAX_ITEMS = 50
MAX_WINDOWS_PER_ITEM = 40
VAL_FRACTION = 0.1
SEED = 42
pl.seed_everything(SEED, workers=True)
print('Config -> stacks:', NUM_STACKS, 'blocks/stack:', BLOCKS_PER_STACK, 'layer_width:', LAYER_WIDTH, 'batch_size:', BATCH_SIZE)

Seed set to 42


Config -> stacks: 3 blocks/stack: 3 layer_width: 768 batch_size: 512


In [17]:
# Load or create synthetic panel
if PANEL_PATH.exists():
    print('Loading panel from', PANEL_PATH)
    panel_df = pd.read_parquet(PANEL_PATH)
else:
    print('Panel not found. Creating synthetic panel for demo...')
    # Synthetic: 20 items, 200 days, simple seasonal pattern + noise
    import numpy as np
    items = [f'ITEM_{i:03d}' for i in range(20)]
    dates = pd.date_range('2024-01-01', periods=200, freq='D')
    rows = []
    for item in items:
        base = np.random.randint(5, 25)
        seasonal = np.sin(np.linspace(0, 12 * math.pi, len(dates))) * np.random.uniform(3, 8)
        noise = np.random.randn(len(dates)) * np.random.uniform(0.5, 2.0)
        demand = (base + seasonal + noise).clip(min=0).round(2)
        for d, val in zip(dates, demand):
            rows.append({'item_id': item, 'date': d, 'demand': float(val)})
    panel_df = pd.DataFrame(rows)
    PANEL_PATH.parent.mkdir(parents=True, exist_ok=True)
    panel_df.to_parquet(PANEL_PATH, index=False)
print(panel_df.head())
print('Panel shape:', panel_df.shape)

Loading panel from data/processed/m5_panel_subset.parquet
    item_id       date  demand
0  ITEM_000 2024-01-01   11.90
1  ITEM_000 2024-01-02   11.30
2  ITEM_000 2024-01-03   11.60
3  ITEM_000 2024-01-04   18.30
4  ITEM_000 2024-01-05   15.64
Panel shape: (4000, 3)


In [18]:
# Build dataset windows with chronological split
# Determine cutoff date for validation based on VAL_FRACTION of unique days
unique_dates = sorted(panel_df['date'].unique())
val_days = max(FORECAST_LENGTH, int(len(unique_dates) * VAL_FRACTION))
# Ensure we have enough history for validation windows
val_history_needed = INPUT_LENGTH + FORECAST_LENGTH
cutoff_index = len(unique_dates) - val_days
cutoff_date = unique_dates[cutoff_index]

# Train: all dates strictly before cutoff_date
train_df = panel_df[panel_df['date'] < cutoff_date]
# Validation: last segment plus required preceding history window
val_start_history_date = unique_dates[max(0, cutoff_index - (val_history_needed - 1))]
val_df = panel_df[panel_df['date'] >= val_start_history_date]

print('Cutoff date for validation segment:', cutoff_date)
print('Train date range:', train_df['date'].min(), '->', train_df['date'].max(), '| rows:', len(train_df))
print('Val+history date range:', val_df['date'].min(), '->', val_df['date'].max(), '| rows:', len(val_df))

# Persist temporary parquet shards (avoids modifying original panel file)
train_path = PANEL_PATH.parent / 'm5_panel_subset_train.parquet'
val_path = PANEL_PATH.parent / 'm5_panel_subset_val.parquet'
train_df.to_parquet(train_path, index=False)
val_df.to_parquet(val_path, index=False)

cfg_ds = PanelWindowConfig(input_length=INPUT_LENGTH, forecast_length=FORECAST_LENGTH, max_items=MAX_ITEMS, max_windows_per_item=MAX_WINDOWS_PER_ITEM)
train_ds = PanelForecastDataset(train_path, cfg_ds)
val_ds = PanelForecastDataset(val_path, cfg_ds)
print('Windows -> train:', len(train_ds), 'val:', len(val_ds))
# Inspect one sample
x0, y0 = train_ds[0]
print('Sample shapes -> x:', x0.shape, 'y:', y0.shape)

Cutoff date for validation segment: 2024-06-19 00:00:00
Train date range: 2024-01-01 00:00:00 -> 2024-06-18 00:00:00 | rows: 3400
Val+history date range: 2024-01-30 00:00:00 -> 2024-07-18 00:00:00 | rows: 3420
Windows -> train: 580 val: 600
Sample shapes -> x: torch.Size([112]) y: torch.Size([30])


In [19]:
# DataLoaders (batch 512)
num_workers = 6  # adjust based on CPU cores
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=num_workers, persistent_workers=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, num_workers=num_workers, persistent_workers=True)
print('Batches -> train:', len(train_loader), 'val:', len(val_loader), '| workers:', num_workers, '| batch_size:', BATCH_SIZE)

Batches -> train: 2 val: 2 | workers: 6 | batch_size: 512


In [20]:
# Initialize model
cfg_model = NBeatsConfig(input_length=INPUT_LENGTH, forecast_length=FORECAST_LENGTH, learning_rate=LEARNING_RATE, num_stacks=NUM_STACKS, num_blocks_per_stack=BLOCKS_PER_STACK, layer_width=LAYER_WIDTH, n_layers=N_LAYERS, dropout=DROPOUT)
model = NBeatsModule(cfg_model).to(device)
print(model)

NBeatsModule(
  (stacks): ModuleList(
    (0-2): 3 x ModuleList(
      (0-2): 3 x NBeatsBlock(
        (fc): Sequential(
          (0): Linear(in_features=112, out_features=768, bias=True)
          (1): ReLU()
          (2): Dropout(p=0.05, inplace=False)
          (3): Linear(in_features=768, out_features=768, bias=True)
          (4): ReLU()
          (5): Dropout(p=0.05, inplace=False)
          (6): Linear(in_features=768, out_features=768, bias=True)
          (7): ReLU()
          (8): Dropout(p=0.05, inplace=False)
          (9): Linear(in_features=768, out_features=768, bias=True)
          (10): ReLU()
          (11): Dropout(p=0.05, inplace=False)
        )
        (backcast_head): Linear(in_features=768, out_features=112, bias=True)
        (forecast_head): Linear(in_features=768, out_features=30, bias=True)
      )
    )
  )
  (loss_fn): MSELoss()
)


In [21]:
# Optional experimental torch.compile step (PyTorch 2.x)
import torch

if hasattr(torch, 'compile'):
    compile_mode = 'reduce-overhead'  # alternatives: 'max-autotune', 'default'
    try:
        # For MPS backend, fullgraph=False tends to be safer; dynamic shapes may break some passes.
        model = torch.compile(model, mode=compile_mode, fullgraph=False)
        print(f'Model compiled successfully with torch.compile (mode={compile_mode}).')
    except Exception as e:
        print('torch.compile failed:', type(e).__name__, str(e)[:300])
        print('Falling back to original (uncompiled) model.')
else:
    print('torch.compile not available in this PyTorch build.')

Model compiled successfully with torch.compile (mode=reduce-overhead).


In [22]:
# Parameter count & quick inference timing benchmark (post torch.compile, batch=512)
import time, inspect

if not hasattr(model, 'parameters') or inspect.isfunction(model):
    print('Model reference invalid; recreating model instance...')
    model = NBeatsModule(cfg_model).to(device)
    if hasattr(torch, 'compile'):
        try:
            model = torch.compile(model, mode='reduce-overhead', fullgraph=False)
            print('Recompiled new model instance.')
        except Exception as e:
            print('Recompile failed:', type(e).__name__, str(e)[:200])

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total params: {total_params:,} | Trainable: {trainable_params:,} (~{trainable_params/1e6:.3f}M)")

try:
    first_batch = next(iter(train_loader))[0]
except StopIteration:
    first_batch = torch.zeros((BATCH_SIZE, INPUT_LENGTH), dtype=torch.float32)
xb = first_batch.to(device)

for _ in range(3):
    _ = model(xb)

n_runs = 5
start = time.perf_counter()
for _ in range(n_runs):
    _ = model(xb)
end = time.perf_counter()

avg_ms = (end - start) / n_runs * 1000
throughput = (xb.shape[0] * FORECAST_LENGTH) / ((end - start) / n_runs)
print(f"Avg forward time (compiled, batch=512): {avg_ms:.2f} ms | Throughput items*horizon/sec: {throughput:.1f}")
print(f"Device: {device} | Accelerator: {accelerator}")

Total params: 17,709,822 | Trainable: 17,709,822 (~17.710M)
Avg forward time (compiled, batch=512): 2.94 ms | Throughput items*horizon/sec: 5231949.5
Device: mps | Accelerator: mps
Avg forward time (compiled, batch=512): 2.94 ms | Throughput items*horizon/sec: 5231949.5
Device: mps | Accelerator: mps


In [None]:
# ðŸ“Š Metric Interpretation & Overfitting Guide (updated for scaled run)
# Refer to earlier explanations. After scaling model, monitor:
#  - train_loss vs val_loss divergence
#  - val_wape flattening
#  - potential instability if precision < 32 on MPS
# If OOM occurs, reduce BATCH_SIZE first, then LAYER_WIDTH.


In [None]:
# Adaptive training cell v2: add CPU fallback if all MPS attempts fail; ensure model in train mode.
import os, torch, pytorch_lightning as pl
from torch.utils.data import DataLoader
from torch._dynamo import reset as dynamo_reset

print('\n=== Adaptive Training Start (v2) ===')
print(f'Current device: {device} | accelerator: {accelerator}')

# Restore original model if compiled
if hasattr(model, '_orig_mod'):
    print('Restoring original (uncompiled) model from compiled wrapper.')
    model = model._orig_mod
else:
    print('Model is already uncompiled.')

model.train()

# Try to suppress dynamo entirely
os.environ['TORCH_DISABLE_TORCHDYNAMO'] = '1'
print('Set TORCH_DISABLE_TORCHDYNAMO=1 for this training scope.')

attempt_batches = []
if BATCH_SIZE not in (512, 256, 128, 64, 32):
    attempt_batches.append(BATCH_SIZE)
attempt_batches.extend([BATCH_SIZE, 256, 128, 64, 32])
seen = set(); ordered_batches = []
for b in attempt_batches:
    if b not in seen and b > 0:
        seen.add(b); ordered_batches.append(b)

successful_batch = None
last_error = None

for bs in ordered_batches:
    print(f'\n--- Attempting MPS training with batch_size={bs} ---')
    dynamo_reset()
    try:
        train_loader = DataLoader(train_ds, batch_size=bs, shuffle=True, num_workers=num_workers, persistent_workers=(num_workers>0))
        val_loader = DataLoader(val_ds, batch_size=bs, shuffle=False, num_workers=num_workers)
        # Recreate trainer with original kwargs
        trainer = pl.Trainer(**trainer_kwargs)
        trainer.fit(model, train_loader, val_loader)
        successful_batch = bs
        print(f'MPS training succeeded with batch_size={bs}')
        break
    except Exception as e:
        err_type = type(e).__name__
        msg = str(e)[:200]
        print(f'MPS batch {bs} failed: {err_type}: {msg}')
        last_error = e
        continue

if successful_batch is None and device.type == 'mps':
    print('\n>>> All MPS attempts failed; switching to CPU fallback.')
    cpu_batch = 64 if 64 in ordered_batches else ordered_batches[-1]
    cpu_device = torch.device('cpu')
    model.to(cpu_device)
    cpu_train_loader = DataLoader(train_ds, batch_size=cpu_batch, shuffle=True, num_workers=0)
    cpu_val_loader = DataLoader(val_ds, batch_size=cpu_batch, shuffle=False, num_workers=0)
    cpu_trainer_kwargs = dict(trainer_kwargs)
    cpu_trainer_kwargs['accelerator'] = 'cpu'
    cpu_trainer_kwargs['devices'] = 1
    print(f'CPU fallback: batch_size={cpu_batch}')
    try:
        trainer = pl.Trainer(**cpu_trainer_kwargs)
        trainer.fit(model, cpu_train_loader, cpu_val_loader)
        successful_batch = cpu_batch
        print('CPU training succeeded.')
        device = cpu_device  # update global
    except Exception as e:
        print('CPU fallback failed:', type(e).__name__, str(e)[:200])

if successful_batch is None:
    print('\nTraining failed on all backends. Suggestions:')
    print('- Reduce LAYER_WIDTH (e.g., 512)')
    print('- Reduce BLOCKS_PER_STACK or NUM_STACKS')
    print('- Upgrade PyTorch (nightly) for MPS Inductor fixes')
    print('- Use CPU for now and rely on smaller model')
else:
    BATCH_SIZE = successful_batch
    print(f'\nEffective batch size used: {BATCH_SIZE} | Final device: {device}')

print('=== Adaptive Training End (v2) ===')


Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/zak/anaconda3/envs/DataCamp/lib/python3.11/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /Users/zak/Repos/E-commerce-Demand-Forecasting/notebooks/artifacts/models exists and is not empty.

  | Name    | Type       | Params | Mode
----------------------------------------------
0 | stacks  | ModuleList | 17.7 M | eval
1 | loss_fn | MSELoss    | 0      | eval
----------------------------------------------
17.7 M    Trainable params
0         Non-trainable params
17.7 M    Total params
70.839    Total estimated model params size (MB)
0       


=== Adaptive Training Start ===
Current device: mps | accelerator: mps
Model is already uncompiled.
Set TORCH_DISABLE_TORCHDYNAMO=1 for this training run (MPS backend).

--- Attempting training with batch_size=256 ---
Running sanity check + fit...


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/zak/anaconda3/envs/DataCamp/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:428: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


Training: |          | 0/? [00:00<?, ?it/s]

W1104 14:01:58.730000 79198 site-packages/torch/_dynamo/convert_frame.py:1016] [4/8] torch._dynamo hit config.recompile_limit (8)
W1104 14:01:58.730000 79198 site-packages/torch/_dynamo/convert_frame.py:1016] [4/8]    function: 'log' (/Users/zak/anaconda3/envs/DataCamp/lib/python3.11/site-packages/pytorch_lightning/core/module.py:384)
W1104 14:01:58.730000 79198 site-packages/torch/_dynamo/convert_frame.py:1016] [4/8]    last reason: 4/7: name == 'train_mae'                                    
W1104 14:01:58.730000 79198 site-packages/torch/_dynamo/convert_frame.py:1016] [4/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
W1104 14:01:58.730000 79198 site-packages/torch/_dynamo/convert_frame.py:1016] [4/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.
C1104 14:02:01.815000 79198 site-packages/torch/_inductor/scheduler.py:1198] [8/1_1] Error in codegen for ComputedBuffer(name='buf55', layout=FixedLayout('mps:0', 

Batch 256 failed: InductorError: TypeError: cannot determine truth value of Relational: 30*s71 <= 1024

Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer cont

--- Attempting training with batch_size=128 ---
Running sanity check + fit...


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/zak/anaconda3/envs/DataCamp/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:428: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


Training: |          | 0/? [00:00<?, ?it/s]

W1104 14:02:16.790000 79198 site-packages/torch/_dynamo/convert_frame.py:1016] [4/8] torch._dynamo hit config.recompile_limit (8)
W1104 14:02:16.790000 79198 site-packages/torch/_dynamo/convert_frame.py:1016] [4/8]    function: 'log' (/Users/zak/anaconda3/envs/DataCamp/lib/python3.11/site-packages/pytorch_lightning/core/module.py:384)
W1104 14:02:16.790000 79198 site-packages/torch/_dynamo/convert_frame.py:1016] [4/8]    last reason: 4/7: name == 'train_mae'                                    
W1104 14:02:16.790000 79198 site-packages/torch/_dynamo/convert_frame.py:1016] [4/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
W1104 14:02:16.790000 79198 site-packages/torch/_dynamo/convert_frame.py:1016] [4/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.
C1104 14:02:19.984000 79198 site-packages/torch/_inductor/scheduler.py:1198] [8/1_1] Error in codegen for ComputedBuffer(name='buf55', layout=FixedLayout('mps:0', 

Batch 128 failed: InductorError: TypeError: cannot determine truth value of Relational: 30*s71 <= 1024

Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer cont

--- Attempting training with batch_size=64 ---
Running sanity check + fit...


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/zak/anaconda3/envs/DataCamp/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:428: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


Training: |          | 0/? [00:00<?, ?it/s]

W1104 14:02:35.931000 79198 site-packages/torch/_dynamo/convert_frame.py:1016] [4/8] torch._dynamo hit config.recompile_limit (8)
W1104 14:02:35.931000 79198 site-packages/torch/_dynamo/convert_frame.py:1016] [4/8]    function: 'log' (/Users/zak/anaconda3/envs/DataCamp/lib/python3.11/site-packages/pytorch_lightning/core/module.py:384)
W1104 14:02:35.931000 79198 site-packages/torch/_dynamo/convert_frame.py:1016] [4/8]    last reason: 4/7: name == 'train_mae'                                    
W1104 14:02:35.931000 79198 site-packages/torch/_dynamo/convert_frame.py:1016] [4/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
W1104 14:02:35.931000 79198 site-packages/torch/_dynamo/convert_frame.py:1016] [4/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.
C1104 14:02:38.626000 79198 site-packages/torch/_inductor/scheduler.py:1198] [8/1_1] Error in codegen for ComputedBuffer(name='buf55', layout=FixedLayout('mps:0', 

Batch 64 failed: InductorError: TypeError: cannot determine truth value of Relational: 30*s71 <= 1024

Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer cont

All attempts failed. Consider:
- Reducing model width/blocks
- Removing dropout
- Switching off MPS (force CPU)
- Upgrading PyTorch (Inductor fixes)
=== Adaptive Training End ===


In [11]:
# Evaluate on validation set (manual pass) with explicit device move
model.eval()
model.to(device)
val_losses = []
val_mae = []
val_wape_num = 0.0
val_wape_den = 0.0
with torch.no_grad():
    for xb, yb in val_loader:
        xb = xb.to(device)
        yb = yb.to(device)
        pred = model(xb)
        loss = torch.mean((pred - yb)**2)
        mae = torch.mean(torch.abs(pred - yb))
        val_losses.append(loss.item())
        val_mae.append(mae.item())
        val_wape_num += torch.sum(torch.abs(pred - yb)).item()
        val_wape_den += torch.sum(torch.abs(yb)).item()
avg_loss = sum(val_losses)/len(val_losses)
avg_mae = sum(val_mae)/len(val_mae)
wape = math.nan if val_wape_den == 0 else 100.0 * val_wape_num/val_wape_den
print(f'Validation MSE: {avg_loss:.4f} | MAE: {avg_mae:.4f} | WAPE: {wape:.2f}%')

Validation MSE: 2.2445 | MAE: 1.1766 | WAPE: 7.38%


In [12]:
# Save artifacts
CKPT_PATH = ARTIFACTS_DIR / 'nbeats_notebook.ckpt'
torch.save(model.state_dict(), CKPT_PATH)
metrics = {
    'validation_mse': avg_loss,
    'validation_mae': avg_mae,
    'validation_wape': wape,
    'config': cfg_model.__dict__,
    'n_train_windows': len(train_ds),
    'n_val_windows': len(val_ds),
}
with open(ARTIFACTS_DIR / 'nbeats_notebook_metrics.json', 'w') as f:
    json.dump(metrics, f, indent=2)
print('Saved checkpoint ->', CKPT_PATH)
print('Saved metrics ->', ARTIFACTS_DIR / 'nbeats_notebook_metrics.json')

Saved checkpoint -> artifacts/models/nbeats_notebook.ckpt
Saved metrics -> artifacts/models/nbeats_notebook_metrics.json


## Next Steps
- Integrate with backtesting harness.
- Add basis (trend/seasonality) blocks for improved decomposition.
- Introduce quantile heads for probabilistic forecasts.
- Add per-item embeddings & categorical covariates.
- Promote best checkpoint to API service for live forecasts.