In [None]:
# %matplotlib widget
%matplotlib inline

In [1]:
from __future__ import annotations               # allow postponed evaluation of annotations

import os                                          # for setting environment variables
# — Silence noisy logs —
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"           # only show warning+errors
os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"] = "false"  # disable XLA memory prealloc
# ensure we load cuDNN9 before any TF bindings
os.environ["LD_LIBRARY_PATH"] = "/usr/local/cuda/lib64:" + os.environ.get("LD_LIBRARY_PATH", "")

import ctypes                                      # to force-load cudnn.so.9
# Force-load cuDNN from /usr/local/cuda/lib64/libcudnn.so (v9.x)
ctypes.CDLL("libcudnn.so", mode=ctypes.RTLD_GLOBAL)

import logging                                     # for controlling TF log level
logging.getLogger("tensorflow").setLevel(logging.ERROR)

import tensorflow as tf                            # core TF
# — Configure TensorFlow —
tf.config.optimizer.set_jit(True)                   # enable XLA JIT compilation
# allow GPU memory to grow instead of pre-allocating
gpus = tf.config.list_physical_devices("GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

# — Keras & TF utilities —
from tensorflow import keras
from tensorflow.keras.mixed_precision import LossScaleOptimizer  # FP16 loss scaling
from tensorflow.keras.optimizers import Adam                    # optimizer
from tensorflow.keras.optimizers.schedules import CosineDecayRestarts  # LR schedule
from tensorflow.keras import layers, models, metrics, regularizers, optimizers, initializers   # NN building blocks
from tensorflow.python.client import device_lib

# — Data & plotting libs —
import numpy as np                                  # numerical arrays
from numpy.lib.stride_tricks import sliding_window_view  # sliding windows
import pandas as pd                                 # dataframes
from sklearn.preprocessing import StandardScaler    # scaling features

import matplotlib.pyplot as plt                      # for plotting training curves
from IPython.display import display, update_display, clear_output  # Jupyter helpers

from tqdm.auto import tqdm                          # progress bars in notebooks

# — Misc utilities —
import time                                        # timing
import math                                        # math functions
import pickle                                      # serialization
import platform                                    # platform info
import sys                                         # system-specific parameters
from datetime import datetime
from pathlib import Path                            # filesystem paths
from typing import Sequence, Tuple, List           # type hints

# — Custom libs —
import importlib
import stockanalibs                                # user utilities
importlib.reload(stockanalibs)


2025-07-02 14:11:23.411162: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9373] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-02 14:11:23.411580: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-07-02 14:11:23.440170: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1534] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


<module 'stockanalibs' from '/workspace/stockanalibs.py'>

In [3]:
# 1) Basic TF info
print(f"Python:    {platform.python_version()}")
print(f"TensorFlow: {tf.__version__}")
build_info = tf.sysconfig.get_build_info()
print(f"Built with CUDA: {build_info['cuda_version']}")
print(f"cuDNN version: {build_info['cudnn_version']}\n")

# 2) GPU summary
phys = tf.config.list_physical_devices("GPU")
logi = tf.config.list_logical_devices("GPU")
print(f"Physical GPUs: {phys}")
print(f"Logical GPUs:  {logi}\n")

# 3) List all local devices via device_lib
print("All devices:")
for d in device_lib.list_local_devices():
    print(f" - {d.name:20s} {d.device_type:6s}  memory_limit={d.memory_limit}")

Python:    3.10.12
TensorFlow: 2.15.0
Built with CUDA: 12.4
cuDNN version: 9

Physical GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Logical GPUs:  [LogicalDevice(name='/device:GPU:0', device_type='GPU')]

All devices:
 - /device:CPU:0        CPU     memory_limit=268435456
 - /device:GPU:0        GPU     memory_limit=13798211584


In [4]:
ticker = stockanalibs.ticker

df = pd.read_csv(f"dfs training/{ticker}_final.csv", index_col=0, parse_dates=True)
df

Unnamed: 0,open,high,low,close,volume,bid,ask,trade_action,StrategyEarning,EarningDiff,signal_smooth
2014-04-03 13:00:00,28.6251,28.6251,28.62100,28.62100,4900.0,28.61240,28.62960,0,0.000,0.000,1.018362
2014-04-03 13:01:00,28.6258,28.6258,28.62182,28.62182,5056.6,28.61324,28.63042,0,0.000,0.000,1.020963
2014-04-03 13:02:00,28.6265,28.6265,28.62264,28.62264,5213.2,28.61408,28.63124,0,0.000,0.000,1.023603
2014-04-03 13:03:00,28.6272,28.6272,28.62346,28.62346,5369.8,28.61492,28.63206,0,0.000,0.000,1.026278
2014-04-03 13:04:00,28.6279,28.6279,28.62428,28.62428,5526.4,28.61576,28.63288,0,0.000,0.000,1.029433
...,...,...,...,...,...,...,...,...,...,...,...
2025-06-18 20:56:00,173.3750,173.6771,173.21500,173.56500,621199.0,173.51290,173.61710,0,-0.981,1.729,0.000000
2025-06-18 20:57:00,173.5650,173.5900,173.24000,173.38000,624198.0,173.32800,173.43200,0,-0.981,1.914,0.000000
2025-06-18 20:58:00,173.3900,173.4100,173.20000,173.31000,454542.0,173.25800,173.36200,0,-0.981,1.984,0.000000
2025-06-18 20:59:00,173.3150,173.4000,173.23000,173.28000,1094746.0,173.22800,173.33200,0,-0.981,2.014,0.000000


In [5]:
###############################################################################
# 0 ·  DATA & PATHS                                                           #
###############################################################################
label_col      = stockanalibs.label_col
feature_cols   = stockanalibs.feature_cols

LOOK_BACK      = stockanalibs.look_back                                
N_FEATS        = len(feature_cols) 

today          = datetime.now().strftime("%Y-%m-%d")

rth_start      = stockanalibs.regular_start

save_dir       = Path("dfs training")
weights_path   = save_dir / f"{ticker}_{today}.weights.h5"   # auto-per‐ticker
model_path     = save_dir / f"{ticker}_{today}_model.keras"

# dataset split proportions
TRAIN_PROP, VAL_PROP = 0.70, 0.15                 # → 0.15 test remainder

In [6]:
###############################################################################
# 1 · MODEL HYPER-PARAMETERS (tuned defaults)
###############################################################################

# ── Architecture Parameters ───────────────────────────────────────────────
SHORT_UNITS         = 64       # LSTM short-term units (32–128 recommended)
LONG_UNITS          = 192      # LSTM long-term units (64–256 recommended)
DROPOUT_SHORT       = 0.25     # Dropout for short LSTM outputs (0.1–0.3)
DROPOUT_LONG        = 0.20     # Dropout for long LSTM outputs (0.1–0.3)
KERNEL_REG          = 1e-4     # L2 regularization factor for LSTM layers
    
# ── Optimizer Settings: Cosine Decay with Restarts ──────────────────────────
INITIAL_LR          = 3e-4     # Initial learning rate (1e-4 to 1e-3)
FIRST_DECAY_EPOCHS  = 5        # Epochs before first decay
T_MUL               = 2.0      # Cycle length multiplier
M_MUL               = 1.0      # Cycle LR scaling factor after restarts
ALPHA               = 0.01     # Minimum LR factor relative to INITIAL_LR
CLIPNORM            = 2.0      # Gradient clipping norm (0.5–5.0)

# ── Training Control Parameters ─────────────────────────────────────────────
TRAIN_BATCH         = 64       # Training batch size (32–128)
VAL_BATCH           = 1        # Validation batch size (usually 1)
MAX_EPOCHS          = 100      # Max training epochs (50–150)
EARLY_STOP_PATIENCE = 10       # Early stopping patience (10–20 epochs)


In [7]:

def build_lstm_tensors(
    df: pd.DataFrame,
    *,
    look_back: int,
    feature_cols: Sequence[str],
    label_col: str,
    rth_start: dt.time
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Converts a minute-bar DataFrame into arrays for a stateful LSTM with fixed
    window length look_back.  Returns:

      X         : shape (n_samples, look_back, n_feats)
      y         : shape (n_samples,)
      raw_close : shape (n_samples,)
      raw_bid   : shape (n_samples,)
      raw_ask   : shape (n_samples,)
    """
    X_windows, y_list, close_list, bid_list, ask_list = [], [], [], [], []

    for date, day_df in df.groupby(df.index.normalize(), sort=False):
        day_df = day_df.sort_index()

        # raw prices before scaling
        raw_close = day_df["close"].to_numpy(dtype=np.float32)
        raw_bid   = day_df["bid"].to_numpy(dtype=np.float32)
        raw_ask   = day_df["ask"].to_numpy(dtype=np.float32)

        # per-day standardization of features
        day_df[feature_cols] = StandardScaler().fit_transform(day_df[feature_cols])
        feats_np = day_df[feature_cols].to_numpy(dtype=np.float32)  # (T, n_feats)
        label_np = day_df[label_col].to_numpy(dtype=np.float32)     # (T,)

        # only windows whose final timestamp ≥ rth_start
        mask_rth = day_df.index.time >= rth_start
        if not mask_rth.any():
            continue

        # build all sliding windows of shape (T - look_back + 1, look_back, n_feats)
        win3d = sliding_window_view(feats_np, (look_back, feats_np.shape[1]), axis=(0,1))
        win3d = win3d[:,0]           # drop extra dim → (n_wins, look_back, n_feats)

        # align targets: drop last window so y has same count
        win3d = win3d[:-1]           # now (T - look_back, look_back, n_feats)
        y_aligned     = label_np[look_back:]     # (T - look_back,)
        close_aligned = raw_close[look_back:]
        bid_aligned   = raw_bid[look_back:]
        ask_aligned   = raw_ask[look_back:]

        # only keep windows within RTH
        rth_win_mask  = mask_rth[look_back:]
        win3d         = win3d[rth_win_mask]
        y_aligned     = y_aligned[rth_win_mask]
        close_aligned = close_aligned[rth_win_mask]
        bid_aligned   = bid_aligned[rth_win_mask]
        ask_aligned   = ask_aligned[rth_win_mask]

        # collect without flattening
        X_windows.append(win3d)  
        y_list   .append(y_aligned)
        close_list.append(close_aligned)
        bid_list .append(bid_aligned)
        ask_list .append(ask_aligned)

    if not X_windows:
        raise ValueError("No RTH windows found; check rth_start or data.")

    # concatenate along sample axis
    X         = np.concatenate(X_windows, axis=0)  # (n_samples, look_back, n_feats)
    y         = np.concatenate(y_list,    axis=0)
    raw_close = np.concatenate(close_list,axis=0)
    raw_bid   = np.concatenate(bid_list,  axis=0)
    raw_ask   = np.concatenate(ask_list,  axis=0)

    return X, y, raw_close, raw_bid, raw_ask


In [8]:
X, y, raw_close, raw_bid, raw_ask = build_lstm_tensors(
    df=df,
    look_back=LOOK_BACK,
    feature_cols=feature_cols,
    label_col=label_col,
    rth_start=rth_start
)

print(X.shape) # we use 'm' features and 'n' previous look back values to predict each 1 label
print(y.shape) 

(1101447, 90, 5)
(1101447,)


In [9]:
def chronological_split(
    X: np.ndarray,             # shape (n_samples, look_back, n_feats)
    y: np.ndarray,             # shape (n_samples,)
    raw_close: np.ndarray,     
    raw_bid: np.ndarray,
    raw_ask: np.ndarray,
    df: pd.DataFrame,           # same minute‐bar DataFrame used to build X, y
    *,
    look_back: int,            # window length in timesteps
    rth_start: dt.time,        # “regular” start time mask
    train_prop: float,         # fraction of days → train
    val_prop: float,           # fraction of days → validation
    TRAIN_BATCH: int           # batch size (must divide # days after rounding)
) -> Tuple[
        Tuple[np.ndarray, np.ndarray],    
        Tuple[np.ndarray, np.ndarray],    
        Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray],
        List[int],
        np.ndarray, np.ndarray, np.ndarray
    ]:
    """
    Splits X, y, and the raw price arrays into chronological train/val/test sets
    by whole calendar days.  Each split is a contiguous block of days, and the
    train portion is rounded up to a multiple of TRAIN_BATCH days to guarantee
    full batches.

    Args:
      X            (n_samples, look_back, n_feats)
      y            (n_samples,)
      raw_*        (n_samples,)
      df           original minute‐bar DataFrame (for day boundaries)
      look_back    window length in timesteps
      rth_start    time-of-day to start counting samples each day
      train_prop   fraction of days to use for training
      val_prop     fraction of days (after train) to use for validation
      TRAIN_BATCH  number of days per “batch” (rounds train days up to multiple)

    Returns:
      (X_tr, y_tr), (X_val, y_val), (X_te, y_te, raw_close_te, raw_bid_te, raw_ask_te),
      samples_per_day, day_id_tr, day_id_val, day_id_te
    """

    # 1) count usable windows per calendar day
    samples_per_day: List[int] = []
    for _, day_df in df.groupby(df.index.normalize(), sort=False):
        T = len(day_df)
        idx = np.arange(T)
        mask_ready = idx >= look_back
        mask_rth   = day_df.index.time >= rth_start
        samples_per_day.append(int((mask_ready & mask_rth).sum()))

    # sanity check
    if sum(samples_per_day) != len(X):
        raise ValueError("X length mismatch: check look_back & rth_start vs. build_lstm_tensors().")

    # 2) tag each sample by its day index
    day_id_vec = np.repeat(np.arange(len(samples_per_day)), samples_per_day)

    # 3) total days
    D = len(samples_per_day)

    # 4) compute training days → round up to full TRAIN_BATCH days
    orig_train_days = int(D * train_prop)
    train_days = int(np.ceil(orig_train_days / TRAIN_BATCH) * TRAIN_BATCH)
    train_days = min(train_days, D)
    cut_train = train_days - 1

    # 5) validation cut
    cut_val = int(D * (train_prop + val_prop))

    # 6) build boolean masks
    mask_tr  = day_id_vec <= cut_train
    mask_val = (day_id_vec > cut_train) & (day_id_vec <= cut_val)
    mask_te  = day_id_vec > cut_val

    # 7) slice arrays (X remains 3-D, masks work on axis-0)
    X_tr,    y_tr    = X[mask_tr],    y[mask_tr]
    X_val,  y_val   = X[mask_val],  y[mask_val]
    X_te,   y_te    = X[mask_te],   y[mask_te]
    raw_close_te     = raw_close[mask_te]
    raw_bid_te       = raw_bid[mask_te]
    raw_ask_te       = raw_ask[mask_te]

    # 8) per-split day-IDs (useful for state reset logic)
    day_id_tr  = day_id_vec[mask_tr]
    day_id_val = day_id_vec[mask_val]
    day_id_te  = day_id_vec[mask_te]

    return (
        (X_tr,    y_tr),
        (X_val,  y_val),
        (X_te,    y_te, raw_close_te, raw_bid_te, raw_ask_te),
        samples_per_day,
        day_id_tr, day_id_val, day_id_te
    )


In [10]:
# The splitter now requires raw_close, raw_bid, and raw_ask arrays as additional arguments.
(X_tr, y_tr), (X_val, y_val), (X_te, y_te, raw_close_te, raw_bid_te, raw_ask_te), \
samples_per_day, day_id_tr, day_id_val, day_id_te = chronological_split(
    X, y, raw_close, raw_bid, raw_ask, df,
    look_back=LOOK_BACK,
    rth_start=rth_start,
    train_prop=TRAIN_PROP,
    val_prop=VAL_PROP,
    TRAIN_BATCH=TRAIN_BATCH
)

print(f"Training: {len(np.unique(day_id_tr))} distinct calendar days retained, organized into batches of {TRAIN_BATCH} days each (no partial batches).")
print(f"Validation: {len(np.unique(day_id_val))} distinct calendar days available for model validation.")
print(f"Test: {len(np.unique(day_id_te))} distinct calendar days available for testing, with raw price signals (close, bid, ask) included.")


Training: 1984 distinct calendar days retained, organized into batches of 64 days each (no partial batches).
Validation: 411 distinct calendar days available for model validation.
Test: 422 distinct calendar days available for testing, with raw price signals (close, bid, ask) included.


In [11]:
# Compute steps and decay‐steps
n_train_samples = X_tr.shape[0]
steps_per_epoch = n_train_samples // TRAIN_BATCH
FIRST_DECAY_STEPS = FIRST_DECAY_EPOCHS * steps_per_epoch
print("Steps/epoch:", steps_per_epoch, "First decay steps:", FIRST_DECAY_STEPS)

Steps/epoch: 12121 First decay steps: 60605


In [12]:
def make_day_dataset(
    X           : np.ndarray,    # (n_samples, look_back, n_feats)
    y           : np.ndarray,    # (n_samples,)
    day_id      : np.ndarray,    # (n_samples,)
    weekday_vec : np.ndarray,    # (n_samples,)
    raw_close   : np.ndarray = None,  # (n_samples,) optional
    raw_bid     : np.ndarray = None,  # (n_samples,) optional
    raw_ask     : np.ndarray = None   # (n_samples,) optional
) -> tf.data.Dataset:
    """
    Creates a tf.data.Dataset where each element is one calendar day’s worth of
    sliding windows.  Yields either:

      • (x_day, y_day, weekday)  
        x_day: (1, n_windows, look_back, n_feats)  
        y_day: (1, n_windows)  
        weekday: scalar  

    or, if raw prices provided:

      • (x_day, y_day, raw_close_day, raw_bid_day, raw_ask_day, weekday)  
        raw_*_day: (1, n_windows)

    where `n_windows` is the number of look_back-length windows for that day.
    """

    # 1) sort by day, group into slices
    idx = np.argsort(day_id, kind="stable")
    X, y, day_id, weekday_vec = [a[idx] for a in (X, y, day_id, weekday_vec)]
    if raw_close is not None:
        raw_close, raw_bid, raw_ask = [a[idx] for a in (raw_close, raw_bid, raw_ask)]

    change = np.where(np.diff(day_id) != 0)[0] + 1
    day_slices = np.split(np.arange(len(day_id)), change)

    # 2) generator that yields one day at a time
    def gen():
        for sl in day_slices:
            x_block = X[sl]        # (n_windows, look_back, n_feats)
            y_block = y[sl]        # (n_windows,)
            weekday = int(weekday_vec[sl[0]])

            xb = np.expand_dims(x_block, 0).astype(np.float32)  # (1, n_windows, look_back, n_feats)
            yb = np.expand_dims(y_block, 0).astype(np.float32)  # (1, n_windows)
            if raw_close is None:
                yield xb, yb, np.int32(weekday)
            else:
                cb = np.expand_dims(raw_close[sl], 0).astype(np.float32)
                bb = np.expand_dims(raw_bid[sl],   0).astype(np.float32)
                ab = np.expand_dims(raw_ask[sl],   0).astype(np.float32)
                yield xb, yb, cb, bb, ab, np.int32(weekday)

    feat_shape = X.shape[1:]  # (look_back, n_feats)
    if raw_close is None:
        output_signature = (
            tf.TensorSpec((1, None, *feat_shape), tf.float32),
            tf.TensorSpec((1, None),          tf.float32),
            tf.TensorSpec((),                 tf.int32),
        )
    else:
        output_signature = (
            tf.TensorSpec((1, None, *feat_shape), tf.float32),
            tf.TensorSpec((1, None),             tf.float32),
            tf.TensorSpec((1, None),             tf.float32),
            tf.TensorSpec((1, None),             tf.float32),
            tf.TensorSpec((1, None),             tf.float32),
            tf.TensorSpec((),                    tf.int32),
        )

    return tf.data.Dataset.from_generator(gen, output_signature=output_signature) \
                         .prefetch(tf.data.AUTOTUNE)


In [13]:
def split_to_day_datasets(
    X_tr, y_tr, day_id_tr,                 # training arrays: (n_train, look_back, n_feats), (n_train,), (n_train,)
    X_val, y_val, day_id_val,              # validation arrays
    X_te, y_te, day_id_te,                 # test arrays
    raw_close_te, raw_bid_te, raw_ask_te,  # test raw price arrays: each (n_test,)
    *,
    df,                                    # original minute‐bar DataFrame
    train_batch: int                       # stateful training batch size
) -> Tuple[tf.data.Dataset, tf.data.Dataset, tf.data.Dataset]:
    """
    Splits train/val/test arrays into day‐level tf.data.Datasets.
    
    - Training & validation datasets yield 3‐tuples:
        (x_day, y_day, weekday)
      where x_day has shape (n_windows, look_back, n_feats).
    
    - Test dataset yields 6‐tuples:
        (x_day, y_day, raw_close_day, raw_bid_day, raw_ask_day, weekday)
    """

    # Build a single weekday vector over all timestamps.
    weekday_all = df.index.dayofweek.to_numpy(dtype=np.int8)
    
    # Compute split lengths.
    n_tr  = len(X_tr)
    n_val = len(X_val)
    n_te  = len(X_te)
    
    # Slice out each split's weekday vector.
    weekday_vec_tr  = weekday_all[:n_tr]
    weekday_vec_val = weekday_all[n_tr:n_tr + n_val]
    weekday_vec_te  = weekday_all[n_tr + n_val:n_tr + n_val + n_te]
    
    # Create day‐level datasets.
    ds_tr  = make_day_dataset(X_tr,  y_tr,  day_id_tr,  weekday_vec_tr)
    ds_val = make_day_dataset(X_val, y_val, day_id_val, weekday_vec_val)
    ds_test= make_day_dataset(
        X_te, y_te, day_id_te, weekday_vec_te,
        raw_close=raw_close_te, raw_bid=raw_bid_te, raw_ask=raw_ask_te
    )
    
    # For training: strip the extra leading batch dimension
    # then padded‐batch by train_batch for stateful training.
    def _strip(x_day, y_day, wd):
        # x_day: (1, n_windows, look_back, n_feats)
        # squeeze removes the outer 1
        return tf.squeeze(x_day, 0), tf.squeeze(y_day, 0), wd

    ds_train_batched = (
        ds_tr
        .map(_strip, num_parallel_calls=tf.data.AUTOTUNE)
        .padded_batch(train_batch, drop_remainder=True)
        .prefetch(tf.data.AUTOTUNE)
    )
    
    # Optionally save the test dataset for later inspection
    ds_test.save(str(save_dir / f"{ticker}_ds_test"), compression="GZIP")
    
    return ds_train_batched, ds_val, ds_test


In [14]:
ds_train_batched, ds_val_unbatched, ds_test_unbatched = split_to_day_datasets(
    X_tr, y_tr, day_id_tr,
    X_val, y_val, day_id_val,
    X_te, y_te, day_id_te, raw_close_te, raw_bid_te, raw_ask_te,
    df=df,
    train_batch=TRAIN_BATCH
)


In [15]:
'''
                       (inference / trading time)

┌──────────────────────────────────────────────────────────────────────────┐
│ ❶  NETWORK  WEIGHTS  θ  – learned across all history, fixed at runtime   │
└──────────────────────────────────────────────────────────────────────────┘
        │
        ▼
┌──────────────────────────────────────────────────────────────────────────┐
│ ❷  CELL STATE  cₜ  – slow integrator covering the *whole* current day    │
│    • retains early-morning context                                       │
│    • reset_states()  at every midnight → zero on next session            │
└──────────────────────────────────────────────────────────────────────────┘
        │
        ▼
┌──────────────────────────────────────────────────────────────────────────┐
│ ❸  HIDDEN STATE  hₜ  – fast dynamics (a few bars)                        │
│    • captures spikes / micro-structure                                   │
│    • reset together with cₜ midnight                                     │
└──────────────────────────────────────────────────────────────────────────┘
        │
        ▼
┌──────────────────────────────────────────────────────────────────────────┐
│ ❹  INPUT WINDOW  xₜ  – last 60 minutes of raw features                   │
│    • first RTH prediction uses 60 *pre-trade* minutes only               │
│    • later predictions mix pre-trade + today’s RTH, never yesterday RTH  │
└──────────────────────────────────────────────────────────────────────────┘
        │
        ▼
                        Predicted signal ŷₜ


Day i                               Day i+1
|────────────┬──────────────┬───…──┬────────┐
08:00        09:30        16:00   08:00    09:30
pre-trade       RTH                pre-trade  RTH
cₜ,hₜ: 0 → accumulate → reset_states() → 0 → accumulate


'''

'\n                       (inference / trading time)\n\n┌──────────────────────────────────────────────────────────────────────────┐\n│ ❶  NETWORK  WEIGHTS  θ  – learned across all history, fixed at runtime   │\n└──────────────────────────────────────────────────────────────────────────┘\n        │\n        ▼\n┌──────────────────────────────────────────────────────────────────────────┐\n│ ❷  CELL STATE  cₜ  – slow integrator covering the *whole* current day    │\n│    • retains early-morning context                                       │\n│    • reset_states()  at every midnight → zero on next session            │\n└──────────────────────────────────────────────────────────────────────────┘\n        │\n        ▼\n┌──────────────────────────────────────────────────────────────────────────┐\n│ ❸  HIDDEN STATE  hₜ  – fast dynamics (a few bars)                        │\n│    • captures spikes / micro-structure                                   │\n│    • reset together with cₜ midnight    

In [16]:
# ─────────────────────────────────────────────────────────────────────────────
# 1) YOUR ORIGINAL STATEFUL DUAL-LSTM BUILDER (pure Keras LSTM)
# ─────────────────────────────────────────────────────────────────────────────
from tensorflow.keras import layers, models, regularizers, optimizers, metrics
from tensorflow.keras.optimizers.schedules import CosineDecayRestarts

def build_stateful_lstm_dual(
    *, time_steps:int, n_feats:int, batch_size:int,
      short_units:int, long_units:int,
      dropout_short:float, dropout_long:float,
      initial_lr:float, first_decay_steps:int,
      t_mul:float, m_mul:float, alpha:float,
      clipnorm:float
) -> models.Model:
    inp = layers.Input(
        batch_shape=(batch_size, time_steps, n_feats),
        dtype="float32", name="inp"
    )

    x = layers.LSTM(
        short_units,
        stateful=True, return_sequences=True,
        dtype="float32",
        kernel_initializer="orthogonal",
        kernel_regularizer=regularizers.l2(1e-4),
        name="short_lstm"
    )(inp)
    x = layers.Dropout(dropout_short, name="dropout_short")(x)
    x = layers.LayerNormalization(name="ln_short")(x)

    x = layers.LSTM(
        long_units,
        stateful=True, return_sequences=True,
        dtype="float32",
        kernel_initializer="orthogonal",
        kernel_regularizer=regularizers.l2(1e-4),
        name="long_lstm"
    )(x)
    x = layers.Dropout(dropout_long, name="dropout_long")(x)
    x = layers.LayerNormalization(name="ln_long")(x)

    out = layers.TimeDistributed(layers.Dense(1), name="pred")(x)

    model = models.Model(inputs=inp, outputs=out, name="dual_mem_lstm")
    lr_schedule = CosineDecayRestarts(
        initial_learning_rate=initial_lr,
        first_decay_steps=first_decay_steps,
        t_mul=t_mul, m_mul=m_mul, alpha=alpha
    )
    opt = optimizers.Adam(learning_rate=lr_schedule, clipnorm=clipnorm)
    model.compile(
        optimizer=opt,
        loss="mse",
        metrics=[metrics.RootMeanSquaredError(name="rmse")]
    )
    return model


In [17]:
###############################################################################
# BUILD THE TWO STATEFUL COPIES                                               #
###############################################################################

# fast mixed-precision (FP16) train graph
model_train = build_stateful_lstm_dual(
    # sequence dimensions
    time_steps        = LOOK_BACK,      # fixed window length
    n_feats           = N_FEATS,        # features per timestep
    batch_size        = TRAIN_BATCH,    # stateful batch size for training

    # architecture
    short_units       = SHORT_UNITS,
    long_units        = LONG_UNITS,
    dropout_short     = DROPOUT_SHORT,
    dropout_long      = DROPOUT_LONG,

    # optimizer & schedule
    initial_lr        = INITIAL_LR,
    first_decay_steps = FIRST_DECAY_STEPS,  # absolute gradient steps before first decay
    t_mul             = T_MUL,
    m_mul             = M_MUL,
    alpha             = ALPHA,

    # misc
    clipnorm          = CLIPNORM
)

# pure FP32 validation/inference graph (day-by-day; batch_size = 1)
model_val = build_stateful_lstm_dual(
    time_steps        = LOOK_BACK,
    n_feats           = N_FEATS,
    batch_size        = VAL_BATCH,       # typically 1 for per-day eval

    short_units       = SHORT_UNITS,
    long_units        = LONG_UNITS,
    dropout_short     = DROPOUT_SHORT,
    dropout_long      = DROPOUT_LONG,

    initial_lr        = INITIAL_LR,
    first_decay_steps = FIRST_DECAY_STEPS,
    t_mul             = T_MUL,
    m_mul             = M_MUL,
    alpha             = ALPHA,

    clipnorm          = CLIPNORM
)

# save the val/inference model to disk
model_val.save(model_path)


In [18]:
###############################################################################
# LiveRMSEPlot  ▸  INLINE- & WIDGET-SAFE, SINGLE FIGURE
###############################################################################
class LiveRMSEPlot:
    """
    LiveRMSEPlot updates a single figure to show training progress without spawning
    a new image for each epoch. It works with different matplotlib backends, e.g.,
    %matplotlib inline, widget, or notebook.

    The plot displays:
      - Blue line and dot: training RMSE history.
      - Orange line and dot: validation RMSE history.

    If the latest validation RMSE is not a number (NaN), the corresponding dot is
    hidden by setting its offsets to an empty 2D array.
    """

    def __init__(self):
        # Retrieve the current matplotlib backend and convert it to lowercase.
        self.backend = matplotlib.get_backend().lower()
        # Build the figure and axes.
        self._build_figure()
        # Display the figure once and keep a reference to the display_id so that we can
        # update the same output cell on subsequent calls instead of spawning a new figure.
        self.disp_id = display(self.fig, display_id=True)
        # Initialize empty lists to store epoch numbers and RMSE metrics.
        self.e, self.tr, self.va = [], [], []      # e = epochs, tr = train RMSE, va = validation RMSE

    # ------------------------------------------------------------------ #
    def _build_figure(self):
        """
        Constructs and configures the matplotlib figure and axes.
        - Creates empty line plots for training (blue) and validation (orange).
        - Creates scatter plot objects (dots) for the latest RMSE values.
        - Sets up grid, labels, title, and legend.
        """
        self.fig, self.ax = plt.subplots(figsize=(6, 4), dpi=110)
        self.ax.set(xlabel="epoch", ylabel="RMSE", title="Training progress")
        self.ax.grid(True)
        
        # Create a blue line for training RMSE.
        (self.tr_line,) = self.ax.plot([], [], c="#1f77b4", lw=1.5)
        # Create an orange line for validation RMSE.
        (self.va_line,) = self.ax.plot([], [], c="#ff7f0e", lw=1.5)
        # Create scatter objects for the latest training and validation points.
        self.tr_dot = self.ax.scatter([], [], c="#1f77b4", s=30)
        self.va_dot = self.ax.scatter([], [], c="#ff7f0e", s=30)
        
        # Add a legend to differentiate between training and validation RMSE.
        self.ax.legend(["train", "val"])

    # ------------------------------------------------------------------ #
    def update(self, train_rmse: float, val_rmse: float):
        """
        Updates the live plot with new training and validation RMSE values.

        Steps:
         1. Append the new epoch and metric values.
         2. Update the line plots with the full RMSE history.
         3. Update the latest dot position for both training and validation.
            - If the validation RMSE is NaN, hide its dot by setting an empty 2D array.
         4. Recalculate and update axis limits.
         5. Redraw the figure using the appropriate method for the backend.
        """
        # 1. Append new data:
        #    - Epochs are automatically numbered starting from 1.
        self.e.append(len(self.e) + 1)
        self.tr.append(train_rmse)
        self.va.append(val_rmse)
        
        # 2. Update line plots:
        #    - For the training line, simply use all available data.
        self.tr_line.set_data(self.e, self.tr)
        
        #    - For the validation line, filter out non-finite values (e.g., NaN).
        finite = np.isfinite(self.va)
        self.va_line.set_data(np.asarray(self.e)[finite],
                              np.asarray(self.va)[finite])
        
        # 3. Update the latest dots:
        #    - Always update the training dot with the most recent training RMSE.
        self.tr_dot.set_offsets([[self.e[-1], self.tr[-1]]])
        
        #    - For the validation dot, only update if the latest value is finite.
        if np.isfinite(self.va[-1]):
            self.va_dot.set_offsets([[self.e[-1], self.va[-1]]])
        else:
            # Instead of an empty list, we pass an empty 2D NumPy array with shape (0,2)
            # to properly hide the dot when the validation RMSE is NaN.
            self.va_dot.set_offsets(np.empty((0, 2)))
        
        # 4. Rescale the axes:
        #    - This ensures all data is visible in the plot.
        self.ax.relim()
        self.ax.autoscale_view()
        
        # 5. Redraw the figure:
        #    - For widget backends, use draw_idle to schedule a redraw.
        #    - For inline / notebook backends, force a redraw and update the output cell.
        if "widget" in self.backend or "ipympl" in self.backend:
            self.fig.canvas.draw_idle()
        else:
            self.fig.canvas.draw()
            self.disp_id.update(self.fig)


In [19]:
# =============================================================================
# 2) STATEFUL TRAINING STEP (_train_step)
# =============================================================================
@tf.function(jit_compile=True)
def _train_step(xb, yb, model, loss_fn, opt):
    """
    One mixed-precision update on (B, time_steps, n_feats) windows.
    - Uses LossScaleOptimizer for FP16 scaling.
    """
    with tf.GradientTape() as tape:
        # 1) forward pass
        y_pred = model(xb, training=True)
        y_pred = tf.cast(y_pred, tf.float32)
        tf.debugging.check_numerics(y_pred, "y_pred has NaN/Inf")

        # 2) compute raw MSE loss
        loss = loss_fn(yb, y_pred)
        tf.debugging.check_numerics(loss, "loss has NaN/Inf")

        # 3) scale the loss for FP16
        scaled_loss = opt.get_scaled_loss(loss)

    # 4) compute scaled gradients
    scaled_grads = tape.gradient(scaled_loss, model.trainable_weights)
    # 4a) check each scaled grad individually
    for g in scaled_grads:
        if g is not None:
            tf.debugging.check_numerics(g, "scaled_grad has NaN/Inf")

    # 5) unscale gradients
    grads = opt.get_unscaled_gradients(scaled_grads)
    # 5a) check each unscaled grad individually
    for g in grads:
        if g is not None:
            tf.debugging.check_numerics(g, "unscaled_grad has NaN/Inf")

    # 6) apply
    opt.apply_gradients(zip(grads, model.trainable_weights))

    # 7) return true RMSE
    return tf.sqrt(loss)




# =============================================================================
# 2. Helper to Extract the Current Learning Rate
# =============================================================================
def current_lr_from(opt: tf.keras.optimizers.Optimizer) -> float:
    """
    Extracts and returns the scalar learning rate from an optimizer.
    
    This function supports various cases:
      • The optimizer might be wrapped in a LossScaleOptimizer (for mixed-precision training).
      • The learning rate can be a static value (or tf.Variable) or be managed dynamically
        by a LearningRateSchedule.
    
    Steps:
      1. If the optimizer is a LossScaleOptimizer, unwrap it to get the inner optimizer.
      2. Retrieve the learning rate from the optimizer.
      3. If the learning rate is part of a LearningRateSchedule, compute its current value based
         on the optimizer's iteration count.
      4. Otherwise, extract and return the numerical value of the learning rate.
    
    Returns:
        A float representing the current learning rate.
    """
    # Unwrap if the optimizer is wrapped by a mixed-precision LossScaleOptimizer.
    if isinstance(opt, LossScaleOptimizer):
        opt = opt.inner_optimizer  # For some TF versions, this might be opt._optimizer
    
    # Retrieve the learning rate property from the optimizer.
    lr = opt.learning_rate
    # If the learning rate is scheduled (i.e., a LearningRateSchedule), determine its value.
    if isinstance(lr, LearningRateSchedule):
        return float(lr(opt.iterations))
    else:
        # Otherwise, extract the value directly (it might be stored in a tf.Variable).
        return float(tf.keras.backend.get_value(lr))


In [20]:

def custom_stateful_training_loop(
    model_train,
    model_val,
    ds_train_batched,    # yields (xb_days, yb_days, wd_days)
    ds_val,              # yields (x_day, y_day, wd)
    *,
    n_train_days: int,
    max_epochs: int,
    early_stop_patience: int,
    baseline_val_rmse: float,
    weights_path
) -> float:
    """
    Executes a stateful LSTM training regime with two internal memories:
      - short_lstm: reset every day
      - long_lstm:  reset once per week (on weekend rollover)

    Training bundles TRAIN_BATCH days for prefetching but still
    processes them day-by-day to preserve chronological resets.
    Validation runs one day at a time (VAL_BATCH=1).
    """

    # ---------------------------------------------------------------------------------
    # 1) Setup: loss, optimizer, visualization, and locate stateful layers
    # ---------------------------------------------------------------------------------
    loss_fn = tf.keras.losses.MeanSquaredError()
    opt     = model_train.optimizer           # your LossScaleOptimizer from step 1
    live_plot = LiveRMSEPlot()                # optional live plotting helper

    # Extract TRAIN_BATCH (days bundled per ds_train_batched element)
    TRAIN_BATCH = model_train.input_shape[0]

    # Find your two stateful LSTM layers by name for resetting
    short_tr = [l for l in model_train.layers if l.name == "short_lstm"]
    long_tr  = [l for l in model_train.layers if l.name == "long_lstm"]
    short_val = [l for l in model_val.layers if l.name == "short_lstm"]
    long_val  = [l for l in model_val.layers if l.name == "long_lstm"]

    best_val_rmse = math.inf
    patience_ctr  = 0

    # ---------------------------------------------------------------------------------
    # 2) Epoch Loop
    # ---------------------------------------------------------------------------------
    for epoch in range(1, max_epochs + 1):
        pbar = tqdm(
            total=n_train_days,
            desc=f"Epoch {epoch:03d}",
            unit="day",
            dynamic_ncols=True,
            leave=False
        )
        batch_rmses = []

        # -----------------------------
        # 2A) TRAINING PHASE
        # -----------------------------
        for xb_days, yb_days, wd_days in ds_train_batched:
            # xb_days: (TRAIN_BATCH, windows_per_day, LOOK_BACK, n_feats)
            # yb_days: (TRAIN_BATCH, windows_per_day)
            prev_wd = None
            collected_x, collected_y = [], []

            # 1) Iterate each of the bundled days in date order
            for day_i in range(xb_days.shape[0]):
                x_day = xb_days[day_i]     # (W_i, LOOK_BACK, n_feats)
                y_day = yb_days[day_i]     # (W_i,)
                wd    = int(wd_days[day_i])

                # -- reset short-term memory each new day
                for layer in short_tr:
                    layer.reset_states()

                # -- reset long-term memory if weekday rolled over
                if prev_wd is not None and wd < prev_wd:
                    for layer in long_tr:
                        layer.reset_states()
                prev_wd = wd

                # collect this day's windows & labels
                collected_x.append(x_day)
                collected_y.append(y_day)

            # 2) Concatenate into one big batch of windows
            xb_flat = tf.concat(collected_x, axis=0)  # (sum W_i, LOOK_BACK, n_feats)
            yb_flat = tf.concat(collected_y, axis=0)  # (sum W_i,)

            # 3) Slice the flat windows into TRAIN_BATCH-sized chunks
            total_windows = xb_flat.shape[0]
            for start in range(0, total_windows, TRAIN_BATCH):
                xb_batch = xb_flat[start:start+TRAIN_BATCH]
                yb_batch = yb_flat[start:start+TRAIN_BATCH]
                if xb_batch.shape[0] < TRAIN_BATCH:
                    break  # drop incomplete batch at end

                # run one training step on this mini-batch
                rmse = _train_step(xb_batch, yb_batch, model_train, loss_fn, opt)
                batch_rmses.append(float(rmse))

            # advance progress bar by the number of days in this bundle
            pbar.update(xb_days.shape[0])

        epoch_train = float(np.mean(batch_rmses))

        # -----------------------------
        # 2B) VALIDATION PHASE
        # -----------------------------
        model_val.set_weights(model_train.get_weights())
        val_rmses = []
        prev_wd_val = None

        for x_day, y_day, wd in ds_val:
            wd = int(wd)
            # reset daily memory
            for layer in short_val:
                layer.reset_states()
            # reset weekly memory on rollover
            if prev_wd_val is not None and wd < prev_wd_val:
                for layer in long_val:
                    layer.reset_states()
            prev_wd_val = wd

            # forward pass on one day
            y_pred = model_val(x_day, training=False)
            y_pred = tf.cast(tf.squeeze(y_pred, (0, 2)), tf.float32)
            y_true = tf.cast(tf.reshape(y_day, [-1]),        tf.float32)
            day_rmse = tf.sqrt(tf.reduce_mean((y_true - y_pred) ** 2))
            val_rmses.append(float(day_rmse))

        epoch_val = float(np.mean(val_rmses))
        impr_pct  = 100.0 * (1.0 - epoch_val / baseline_val_rmse)
        current_lr = current_lr_from(opt)
        grad_norm  = np.mean(batch_rmses) / current_lr

        # Log & plot
        print(
            f"Epoch {epoch:03d} • train={epoch_train:.6f} • val={epoch_val:.6f}"
            f" • impr={impr_pct:5.1f}% • lr={current_lr:.2e} • g≈{grad_norm:.2f}"
        )
        live_plot.update(epoch_train, epoch_val)
        pbar.close()

        # -----------------------------
        # 2C) Early Stopping & Checkpoint
        # -----------------------------
        if epoch_val < best_val_rmse:
            best_val_rmse = epoch_val
            patience_ctr  = 0
            model_train.save_weights(weights_path)
        else:
            patience_ctr += 1
            if patience_ctr >= early_stop_patience:
                print("Early stopping triggered.")
                break

    # End of epoch loop: restore best weights and return RMSE
    model_train.load_weights(weights_path)
    return best_val_rmse


In [21]:
cf = tf.function(lambda x: model_train(x, training=False), jit_compile=False) \
       .get_concrete_function(
         tf.TensorSpec((TRAIN_BATCH, LOOK_BACK, N_FEATS), tf.float32)
       )

fused_ops = [
  (op.name, op.type)
  for op in cf.graph.get_operations()
  if "Fused" in op.type or "RNN" in op.type
]
print("Possible fused RNN ops:", fused_ops)


Possible fused RNN ops: [('dual_mem_lstm/ln_short/FusedBatchNormV3', 'FusedBatchNormV3'), ('dual_mem_lstm/ln_long/FusedBatchNormV3', 'FusedBatchNormV3')]


In [22]:
import time, numpy as np, tensorflow as tf

# 1) Versions
info = tf.sysconfig.get_build_info()
print(f"CUDA:   {info['cuda_version']}")
print(f"cuDNN:  {info['cudnn_version']}")

# 2) Graph inspection
@tf.function
def infer(x):
    return model_train(x, training=False)

# warm up & get concrete function
spec = tf.TensorSpec((TRAIN_BATCH, LOOK_BACK, N_FEATS), tf.float32)
_ = infer(tf.zeros((TRAIN_BATCH, LOOK_BACK, N_FEATS)))
cf = infer.get_concrete_function(spec)

ops = [op.type for op in cf.graph.get_operations()]
found = [o for o in ops if "CudnnRNN" in o]
print("Fused CuDNN ops:", found)

# 3) Micro-benchmark
optimizer = model_train.optimizer

@tf.function
def train_step(x, y):
    with tf.GradientTape() as tape:
        y_pred = model_train(x, training=True)
        loss   = tf.reduce_mean((y - y_pred)**2)
    grads = tape.gradient(loss, model_train.trainable_weights)
    optimizer.apply_gradients(zip(grads, model_train.trainable_weights))
    return loss

xb = tf.random.uniform((TRAIN_BATCH, LOOK_BACK, N_FEATS), dtype=tf.float32)
yb = tf.random.uniform((TRAIN_BATCH, LOOK_BACK, 1),       dtype=tf.float32)

# warm up
_ = train_step(xb, yb)

# GPU timing
tf.config.set_visible_devices(tf.config.list_physical_devices("GPU"), "GPU")
tf.config.optimizer.set_jit(True)
gpu_times = [time.perf_counter() for _ in range(1)]  # single-shot
_ = train_step(xb, yb)
gpu_time = (time.perf_counter() - gpu_times[0]) * 1000
print(f"GPU step time: {gpu_time:.1f} ms")

# CPU timing
tf.config.set_visible_devices([], "GPU")
_ = train_step(xb, yb)  # rebuild on CPU
cpu_times = [time.perf_counter() for _ in range(1)]
_ = train_step(xb, yb)
cpu_time = (time.perf_counter() - cpu_times[0]) * 1000
print(f"CPU step time: {cpu_time:.1f} ms")


CUDA:   12.4
cuDNN:  9


I0000 00:00:1751460084.634446    6279 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
2025-07-02 12:41:24.647483: E external/local_xla/xla/stream_executor/dnn.cc:1140] Sequence lengths for RNN are required from CUDNN 9.0+


InternalError: Graph execution error:

Detected at node CudnnRNN defined at (most recent call last):
<stack traces unavailable>
Failed to call ThenRnnForward with model config: [rnn_mode, rnn_input_mode, rnn_direction_mode]: 2, 0, 0 , [num_layers, input_size, num_units, dir_count, max_seq_length, batch_size, cell_num_units]: [1, 5, 64, 1, 90, 64, 64] 
	 [[{{node CudnnRNN}}]]
	 [[dual_mem_lstm/short_lstm/PartitionedCall]] [Op:__inference_infer_4407]

In [None]:

# ─────────────────────────────────────────────────────────────────────────────
#  ONE-OFF  :  baseline on the untouched validation split
# ─────────────────────────────────────────────────────────────────────────────
def naive_rmse(ds_val):
    """RMSE when the forecast is always zero."""
    mse, n = 0.0, 0
    for _, y_day, _ in ds_val:
        y = y_day.numpy().squeeze()
        mse += np.sum(y ** 2)
        n   += y.size
    return math.sqrt(mse / n)

baseline_val_rmse = naive_rmse(ds_val_unbatched)
print(f"Baseline (predict-zero) RMSE on validation = {baseline_val_rmse:.6f}")

# ─────────────────────────────────────────────────────────────────────────────
#  Training loop (outer bar only → maximum throughput)                     #
# ─────────────────────────────────────────────────────────────────────────────
n_train_days = len(np.unique(day_id_tr))
print(f"Training sees {n_train_days} calendar days per epoch\n")

best_val_rmse = custom_stateful_training_loop(
        model_train         = model_train,
        model_val           = model_val,
        ds_train_batched    = ds_train_batched,
        ds_val              = ds_val_unbatched,
        n_train_days        = n_train_days,
        max_epochs          = MAX_EPOCHS,
        early_stop_patience = EARLY_STOP_PATIENCE,
        baseline_val_rmse   = baseline_val_rmse,
        weights_path        = weights_path)

print(f"\nChampion validation RMSE = {best_val_rmse:.6f}")
print(f"Improvement vs baseline   = {(1 - best_val_rmse/baseline_val_rmse)*100:5.1f} %")

