In [14]:
import datetime as dt
from pathlib import Path
from typing import Sequence, Tuple, List

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

import tensorflow as tf

In [2]:
# Symbol whose merged minute-bar file is loaded below.
ticker = 'AAPL'

# Read the CSV for this ticker; the first column becomes the index and is
# parsed as dates, so `df` ends up indexed by timestamp.
df = pd.read_csv(f"dfs training/merged_{ticker}.csv", index_col=0, parse_dates=True)
df  # last expression of the cell → rich display of the frame

Unnamed: 0,open,high,low,close,volume,bid,ask,trade_action,StrategyEarning,EarningDiff,signal_smooth_norm
2025-01-02 13:30:00,250.5906,250.6435,250.5244,250.5753,2259.0,250.5001,250.6505,0,0.00,0.000,0.0
2025-01-02 13:31:00,250.5806,250.6317,250.5121,250.5606,2351.0,250.4854,250.6358,0,0.00,0.000,0.0
2025-01-02 13:32:00,250.5712,250.6200,250.4938,250.5453,2455.0,250.4701,250.6205,0,0.00,0.000,0.0
2025-01-02 13:33:00,250.5580,250.6094,250.4762,250.5347,2474.0,250.4595,250.6099,0,0.00,0.000,0.0
2025-01-02 13:34:00,250.5491,250.5994,250.4600,250.5168,2792.0,250.4416,250.5919,0,0.00,0.000,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2025-06-03 20:56:00,203.2500,203.3500,203.2450,203.3200,189023.0,203.2590,203.3810,0,1.99,0.942,0.0
2025-06-03 20:57:00,203.3200,203.4200,203.3050,203.3800,222383.0,203.3190,203.4410,0,1.99,0.882,0.0
2025-06-03 20:58:00,203.3800,203.4300,203.3322,203.3750,279702.0,203.3140,203.4360,0,1.99,0.887,0.0
2025-06-03 20:59:00,203.3700,203.4100,203.2500,203.3400,724307.0,203.2790,203.4010,0,1.99,0.922,0.0


In [3]:
# ======================================================================
# Turn a multi-day minute-bar DataFrame into leakage-free tensors that an
# LSTM can digest.  
# Key guarantees
#   • NO window ever crosses a midnight boundary              (⇒ no leak)
#   • Each feature column is standard-scaled *inside the same day*       
#   • The label column is passed through unscaled (only features are scaled)
#   • Final dtype = float32  (GPU-friendly, half the RAM of float64)
# ======================================================================

def build_lstm_tensors(
    df: pd.DataFrame,
    *,                           # ← everything after this must be keyworded
    look_back: int = 60,         # minutes of history the LSTM should see
    feature_cols: Sequence[str] = ("open", "high", "low", "close", "volume"),
    label_col: str = "signal_smooth_norm",
    rth_start: dt.time = dt.time(14, 30),    # NOTE(review): described as 09:30 ET in CET — confirm the index timezone
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Turn a multi-day minute-bar frame into (X, y) arrays of sliding windows.

    Every calendar day is processed on its own, so a window never contains
    rows from two different dates.  Within a day the feature columns are
    standard-scaled with a scaler fitted on *all* rows of that day; the label
    column is left unscaled.

    NOTE(review): because the scaler is fitted on the whole day, the scaled
    values inside a window depend on bars that come later in the same
    session.  Confirm this is acceptable for the intended use.

    Parameters
    ----------
    df : pd.DataFrame
        All sessions stacked vertically, index = DateTimeIndex.  The frame is
        not modified.
    look_back : int
        Window length (in rows/minutes) fed into the LSTM.
    feature_cols : sequence[str]
        Predictor columns (any sequence; converted to a list internally).
    label_col : str
        Column the model must predict; the target is the label of the row
        that immediately follows the window.
    rth_start : datetime.time
        All bars ≥ rth_start count as “Regular Trading Hours”; only those
        become *targets*.  Earlier bars are context only.  An RTH bar with
        fewer than `look_back` earlier rows on the same day is skipped.

    Returns
    -------
    X : np.ndarray  float32, shape (samples, look_back, n_features)
    y : np.ndarray  float32, shape (samples,)
        Both are empty (zero samples) when no bar qualifies as a target.
    """
    # A tuple passed to DataFrame.__getitem__ would be read as a single
    # (multi-index) key, so always select with a real list.
    cols = list(feature_cols)

    # Relative row offsets of a window: t-look_back … t-1 for target row t.
    offsets = np.arange(-look_back, 0)

    X_parts, y_parts = [], []     # one array per calendar day

    # ------------------------------------------------------------------
    # 1)  iterate ONE calendar day at a time —> windows stay inside a day
    # ------------------------------------------------------------------
    for _, day_df in df.groupby(df.index.date):
        # ----- keep rows in chronological order -----
        day_df = day_df.sort_index()

        # ----- per-day scaling, kept in a separate array -----
        # The scaled values are never written back into the frame: that
        # avoids assigning floats into integer columns and mutating a
        # groupby slice.
        feat_np = np.asarray(
            StandardScaler().fit_transform(day_df[cols]), dtype="float32"
        )
        label_np = day_df[label_col].to_numpy(dtype="float32")

        # integer positions of the bars inside the regular session …
        idx_rth = np.flatnonzero(day_df.index.time >= rth_start)
        # … that also have a full look_back of same-day history behind them
        targets = idx_rth[idx_rth >= look_back]

        # --------------------------------------------------------------
        # 2)  gather all windows of this day in one indexing operation
        #     result shape: (n_targets, look_back, n_feat)
        # --------------------------------------------------------------
        X_parts.append(feat_np[targets[:, None] + offsets])
        y_parts.append(label_np[targets])

    # ------------------------------------------------------------------
    # 3)  final stacking → tensors ready for the model
    # ------------------------------------------------------------------
    if not X_parts:
        # No calendar day at all: return correctly shaped empty tensors
        # instead of failing inside the concatenation.
        return (
            np.empty((0, len(offsets), len(cols)), dtype="float32"),
            np.empty((0,), dtype="float32"),
        )

    X = np.concatenate(X_parts, axis=0)     # (samples, look_back, n_feat)
    y = np.concatenate(y_parts, axis=0)     # (samples,)

    return X, y


In [4]:
# Build the tensors with the function's default look_back, feature_cols, label_col and rth_start.
X, y = build_lstm_tensors(df)

print(X.shape) # (samples, look_back, n_features): every label is predicted from the `look_back` preceding rows of each feature
print(y.shape) # (samples,): one label per window — only bars at/after rth_start with `look_back` rows of same-day history become samples

(40664, 60, 5)
(40664,)


In [7]:
def chronological_split(
    X: np.ndarray,
    y: np.ndarray,
    df: pd.DataFrame,
    *,
    look_back: int,
    rth_start: dt.time,
    train_prop: float = 0.70,
    val_prop: float = 0.15,
) -> Tuple[
        Tuple[np.ndarray, np.ndarray],   # train
        Tuple[np.ndarray, np.ndarray],   # val
        Tuple[np.ndarray, np.ndarray],   # test
        List[int]                        # samples per day
    ]:
    """
    Split minute-bar tensors into chronological train / val / test sets,
    always cutting on calendar-day boundaries.

    Parameters
    ----------
    X, y : np.ndarray
        Tensors from build_lstm_tensors (samples ordered day by day).
    df : pd.DataFrame
        The same multi-day DataFrame used to create X, y.
    look_back : int
        Window length used to build X.  An RTH bar only counts as a sample
        when at least `look_back` rows of the same day precede it — the same
        rule build_lstm_tensors applies.
    rth_start : datetime.time
        Session open time; only bars ≥ rth_start are labels.
    train_prop, val_prop : float
        Fractions applied to the largest day number to obtain the cut
        points; the remainder goes to test.

    Returns
    -------
    (X_train, y_train), (X_val, y_val), (X_test, y_test), samples_per_day
        `samples_per_day` holds one plain int per calendar day in `df`.

    Raises
    ------
    ValueError
        If the number of samples derived from `df` differs from len(X).
    """

    # 1️⃣  count windows per calendar day *exactly as build_lstm_tensors did*
    samples_per_day: List[int] = []
    for _, day_df in df.groupby(df.index.date):
        day_df = day_df.sort_index()

        # positions (within the sorted day) of the candidate label rows
        rth_pos = np.flatnonzero(day_df.index.time >= rth_start)

        # keep only those with a full look_back of history behind them
        samples_per_day.append(int(np.count_nonzero(rth_pos >= look_back)))

    # 2️⃣  map every sample in X to its day-number
    # (explicit integer dtype: an empty Python list would otherwise be
    #  turned into a float array, which np.repeat rejects)
    day_id = np.repeat(
        np.arange(len(samples_per_day)),
        np.asarray(samples_per_day, dtype=np.intp),
    )
    if day_id.size != len(X):
        raise ValueError(
            f"Mismatch: computed {day_id.size} samples from df "
            f"but X has {len(X)} rows"
        )

    # Nothing to split: hand back three empty, correctly shaped subsets
    # rather than failing on day_id.max() below.
    if day_id.size == 0:
        nothing = np.zeros(0, dtype=bool)
        return (
            (X[nothing], y[nothing]),
            (X[nothing], y[nothing]),
            (X[nothing], y[nothing]),
            samples_per_day,
        )

    # 3️⃣  calculate day cut-points (chronological)
    last_day = day_id.max()                      # e.g. 103
    train_cut = int(last_day * train_prop)
    val_cut   = int(last_day * (train_prop + val_prop))

    train_mask =  day_id <= train_cut
    val_mask   = (day_id > train_cut) & (day_id <= val_cut)
    test_mask  =  day_id > val_cut

    X_train, y_train = X[train_mask], y[train_mask]
    X_val,   y_val   = X[val_mask],   y[val_mask]
    X_test,  y_test  = X[test_mask],  y[test_mask]

    return (X_train, y_train), (X_val, y_val), (X_test, y_test), samples_per_day


In [17]:
(X_tr, y_tr), (X_val, y_val), (X_te, y_te), samples_pd = chronological_split(
    X, y, df,
    look_back=60,                # must equal the look_back used when X, y were built
    rth_start=dt.time(14, 30),   # must equal the rth_start used when X, y were built; NOTE(review): described as 09:30 ET in CET — confirm the index timezone
    train_prop=0.70,
    val_prop=0.15, # share used for the validation cut; whatever is left after train + val becomes the test set
)

print(f"Per-day windows   : {samples_pd[:5]} …")
print(f"Set shapes        : train {X_tr.shape}, val {X_val.shape}, test {X_te.shape}")


Per-day windows   : [391, 391, 391, 391, 391] …
Set shapes        : train (28543, 60, 5), val (5865, 60, 5), test (6256, 60, 5)


######################################################################

GOAL
----- 

Build a **stateful** LSTM that remembers the whole RTH session,
then flushes (“forgets”) at every midnight so yesterday’s state
never bleeds into today’s open.  

We still keep a 60-bar
look-back, which acts as the “short-term” component.

KEY DESIGN CHANGES vs the stateless baseline

───────────────────────────────────────────────────────────────

1. `stateful=True`  →  hidden–cell state is preserved from one
   training batch to the next.
2. Fixed `batch_size`.  Keras needs the batch dimension to be known up
   front when a layer is stateful; here it is set through
   `Input(batch_shape=...)`.
3. Data must arrive in strict chronological order (`shuffle=False`).
4. We call `model.reset_states()` after finishing every *day*.
5. We train with a custom loop (`model.train_on_batch(...)`, one day after
   another) instead of `model.fit(...)`, so the callbacks (early stopping,
   LR scheduler) have to be driven by hand once per epoch.
   
######################################################################


In [None]:

########################################################################
# 1)  MAKE A DAY-WISE DATASET  (generator that yields one day at a time)
########################################################################
def make_day_dataset(
    X: np.ndarray,
    y: np.ndarray,
    day_id: np.ndarray,
    batch_size: int = 1
) -> tf.data.Dataset:
    """
    Build a dataset that yields windows in *chronological* order, grouped by
    calendar day.

    Each element = (X_day, y_day) where
        X_day shape → (n_samples_in_that_day, look_back, n_feats)
        y_day shape → (n_samples_in_that_day,)

    Parameters
    ----------
    X : np.ndarray, shape (samples, look_back, n_feats)
    y : np.ndarray, shape (samples,)
    day_id : np.ndarray, shape (samples,)
        Day number of every sample; days are emitted in ascending order of
        this value.
    batch_size : int
        Accepted for interface compatibility but not used: days can hold
        different numbers of samples, so they are never batched together.

    Raises
    ------
    ValueError
        If X, y and day_id do not have the same number of rows.
    """
    if not (len(X) == len(y) == len(day_id)):
        raise ValueError(
            f"X, y and day_id must have the same length, "
            f"got {len(X)}, {len(y)} and {len(day_id)}"
        )

    unique_days = np.unique(day_id)     # sorted → chronological day order

    def gen():
        for d in unique_days:
            idx = np.flatnonzero(day_id == d)
            yield X[idx], y[idx]

    # No trailing .batch(...): that would wrap every day in an extra leading
    # axis of size 1 and the elements would no longer have the shapes
    # documented above.
    return tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            tf.TensorSpec(shape=(None, X.shape[1], X.shape[2]), dtype=tf.float32),
            tf.TensorSpec(shape=(None,),                        dtype=tf.float32)
        )
    )




In [None]:
#########################################################
# 2)  BUILD THE STATEFUL  LSTM
#########################################################
look_back  = X_tr.shape[1]       # window length, read from the 2nd axis of the training tensor
n_feats    = X_tr.shape[2]       # number of feature columns, read from the 3rd axis
batch_size = 1                   # fixed batch dimension of the model: windows per step

model = tf.keras.Sequential([
    # the full batch shape (including the batch dimension) is declared here
    # via Input(batch_shape=...) instead of a per-sample shape
    tf.keras.layers.Input(
        batch_shape=(batch_size, look_back, n_feats)
    ),
    tf.keras.layers.LSTM(
        64,
        stateful=True,           # <── remembers across batches
        dropout=0.2,
        recurrent_dropout=0.2
    ),
    # single linear output unit → one regression value per window
    tf.keras.layers.Dense(1, activation="linear")
])

# MSE is the training loss; RMSE is tracked under the name "rmse".
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss="mse",
    metrics=[tf.keras.metrics.RootMeanSquaredError(name="rmse")]
)



In [None]:
#########################################################
# 3)  PREPARE day_id ARRAYS  FOR  TRAIN / VAL / TEST SETS
#########################################################
# chronological_split already built the masks; we can reuse its logic.
# For brevity we recompute here:
def build_day_id(df: pd.DataFrame, rth_start: dt.time, look_back: int) -> np.ndarray:
    """
    Return, for every sample, the number (0, 1, 2, …) of the calendar day it
    belongs to.

    A row counts as a sample when its time is ≥ `rth_start` and at least
    `look_back` rows of the same (chronologically sorted) day precede it.

    Parameters
    ----------
    df : pd.DataFrame
        Multi-day frame with a DateTimeIndex.
    rth_start : datetime.time
        Earliest time of day at which a row can be a sample.
    look_back : int
        Number of same-day rows required before a sample.

    Returns
    -------
    np.ndarray of integers, shape (total_samples,), non-decreasing.
    """
    samples_per_day = []
    for _, day_df in df.groupby(df.index.date):
        day_df = day_df.sort_index()
        rth_pos = np.flatnonzero(day_df.index.time >= rth_start)
        # rows without a full look_back of history produce no sample
        samples_per_day.append(int(np.count_nonzero(rth_pos >= look_back)))

    # explicit integer dtype so that an empty list is accepted by np.repeat
    return np.repeat(
        np.arange(len(samples_per_day)),
        np.asarray(samples_per_day, dtype=np.intp),
    )

day_id_full = build_day_id(df, dt.time(14, 30), look_back=60)

# The three subsets are contiguous, chronological slices of the full sample
# axis, so the day ids can be cut at the same offsets.
n_tr, n_val, n_te = len(X_tr), len(X_val), len(X_te)
if len(day_id_full) != n_tr + n_val + n_te:
    raise ValueError(
        f"day ids cover {len(day_id_full)} samples but the splits hold "
        f"{n_tr + n_val + n_te}"
    )

day_id_tr   = day_id_full[:n_tr]
day_id_val  = day_id_full[n_tr : n_tr + n_val]
# sliced from the front: a negative-length slice ([-0:]) would return the
# whole array when the test set is empty
day_id_te   = day_id_full[n_tr + n_val :]

ds_train = make_day_dataset(X_tr,  y_tr,  day_id_tr)
ds_val   = make_day_dataset(X_val, y_val, day_id_val)



In [None]:
################################################################
# 4)  TRAIN  —  Reset hidden state after every day automatically
################################################################
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor="val_rmse",
        mode="min",              # explicit: lower RMSE is better
        patience=5,
        restore_best_weights=True
    ),
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_rmse",
        mode="min",
        factor=0.5,
        patience=3,
        min_lr=1e-5
    )
]


def _day_windows(X_day, y_day):
    """Return one day's samples as NumPy arrays shaped (n, look_back, n_feats) and (n,).

    Any extra leading axis added by the dataset is flattened away.
    """
    X_np = np.asarray(X_day, dtype="float32")
    y_np = np.asarray(y_day, dtype="float32")
    return X_np.reshape(-1, X_np.shape[-2], X_np.shape[-1]), y_np.reshape(-1)


def _reset_states(net):
    """Zero the carried state of every stateful layer of `net`."""
    for layer in net.layers:
        if getattr(layer, "stateful", False):
            layer.reset_states()


def _train_one_day(net, X_day, y_day):
    """Train on one day, window by window, then flush the recurrent state."""
    X_win, y_win = _day_windows(X_day, y_day)
    # The model was built with a batch dimension of 1, so every step gets
    # exactly one window; the state carries over from window to window.
    for i in range(len(X_win)):
        net.train_on_batch(X_win[i : i + 1], y_win[i : i + 1])
    _reset_states(net)           # flush memory overnight


def _rmse_one_day(net, X_day, y_day):
    """Predict one day window by window and return that day's RMSE."""
    X_win, y_win = _day_windows(X_day, y_day)
    preds = np.array(
        [
            np.asarray(net.predict_on_batch(X_win[i : i + 1])).reshape(-1)[0]
            for i in range(len(X_win))
        ],
        dtype="float32",
    )
    _reset_states(net)
    return float(np.sqrt(np.mean(np.square(preds - y_win))))


# The callbacks are driven by hand, so they must be attached to the model and
# initialised the way model.fit() would do it.
for cb in callbacks:
    cb.set_model(model)
    cb.on_train_begin()
model.stop_training = False

EPOCHS = 100
for epoch in range(EPOCHS):
    # ---- TRAIN on every day sequentially ---------------------------------
    for X_day, y_day in ds_train:
        _train_one_day(model, X_day, y_day)

    # ---- VALIDATE on every validation day -------------------------------
    val_rmses = [_rmse_one_day(model, X_day, y_day) for X_day, y_day in ds_val]
    val_rmse_epoch = np.mean(val_rmses)      # mean of the per-day RMSEs
    print(f"Epoch {epoch:3d}   val_RMSE = {val_rmse_epoch:.5f}")

    # ---- Early-stopping / LR-schedule bookkeeping -----------------------
    logs = {"val_rmse": val_rmse_epoch}
    for cb in callbacks:
        cb.on_epoch_end(epoch, logs=logs)
    if model.stop_training:      # set by EarlyStopping when patience runs out
        break

# Let the callbacks finish (e.g. restoring the best weights where that is
# done at the end of training).
for cb in callbacks:
    cb.on_train_end()




In [None]:
#########################################################
# 5)  FINAL  EVALUATION  ON CHRONOLOGICAL TEST SET
#########################################################
test_rmses = []
for X_day, y_day in make_day_dataset(X_te, y_te, day_id_te):
    # Flatten any extra leading axis so the day is (n, look_back, n_feats) / (n,).
    X_np  = np.asarray(X_day, dtype="float32")
    X_win = X_np.reshape(-1, X_np.shape[-2], X_np.shape[-1])
    y_win = np.asarray(y_day, dtype="float32").reshape(-1)

    # The model has a fixed batch dimension of 1: predict one window at a
    # time, letting the recurrent state carry through the day.
    preds = np.array(
        [
            np.asarray(model.predict_on_batch(X_win[i : i + 1])).reshape(-1)[0]
            for i in range(len(X_win))
        ],
        dtype="float32",
    )
    test_rmses.append(float(np.sqrt(np.mean(np.square(preds - y_win)))))

    # Flush the state of every stateful layer before the next day.
    for layer in model.layers:
        if getattr(layer, "stateful", False):
            layer.reset_states()

print(f"\nFINAL  TEST  RMSE  =  {np.mean(test_rmses):.5f}")