In [1]:
# 1) Wipe out all Python variables
%reset -f
# 2) Force Python’s garbage collector to run
import gc
gc.collect()

import importlib
from libs import trades, plots, params, models
importlib.reload(trades)
importlib.reload(plots)
importlib.reload(params)
importlib.reload(models)

import pandas as pd
import numpy as np
import math

from pathlib import Path
import pickle
import datetime as dt
from datetime import datetime
from datetime import time

import matplotlib.pyplot as plt   
import seaborn as sns
from pprint import pprint

import torch
import torch.nn.functional as Funct
from torch.utils.data import Dataset, DataLoader
torch.serialization.add_safe_globals([models.DayWindowDataset])
import torchmetrics

from tqdm.auto import tqdm
from typing import Tuple, Set, List, Union, Dict

import io
from PIL import Image
import IPython.display as disp

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    mean_squared_error,
    mean_absolute_error,
    r2_score,
)

KeyboardInterrupt: 

In [None]:
# Short aliases for the run-wide settings defined in `params`; later cells use these names.
device, ticker, save_path = params.device, params.ticker, params.save_path

In [None]:
# Pick the checkpoint whose file name ends in the smallest number.
# File names look like "<ticker>_..._<number>.pth"; the trailing field is parsed as a float
# (presumably the validation score the file was saved with — confirm against the training notebook).
candidates = list(save_path.glob(f"{ticker}_*.pth"))
if not candidates:
    # Fail with an actionable message instead of min()'s "empty sequence" ValueError.
    raise FileNotFoundError(
        f"No checkpoint matching '{ticker}_*.pth' found in {save_path}"
    )
model_path = min(candidates, key=lambda p: float(p.stem.split("_")[-1]))
print('Model selected:', model_path,'\n')

# 1) load the checkpoint dict
# NOTE(review): weights_only=False unpickles arbitrary Python objects — only load
# checkpoints produced by this project.
ckpt = torch.load(model_path, map_location=device, weights_only=False)

# 2) grab the full model object (architecture + weights) stored in the checkpoint
model_best = ckpt["model_obj"]

# 3) move to device and set eval mode
model_best = model_best.to(device).eval()

# 4) show parameters, training plot, and model
saved_hparams = ckpt["hparams"]
pprint(saved_hparams)

# The plot is optional: `.get` returns None when the key is absent, and
# io.BytesIO(None) would yield an empty buffer that PIL cannot identify.
png_bytes = ckpt.get("train_plot_png")
if png_bytes:
    img = Image.open(io.BytesIO(png_bytes))
    disp.display(img)
else:
    print("No training plot stored in this checkpoint.")

model_best


In [None]:
# Load the feature table; the first CSV column is used as the index and parsed as dates.
df_feat = pd.read_csv(
    params.feat_csv,
    parse_dates=True,
    index_col=0,
)
df_feat

In [None]:
# 1) Build the window tensors and get each window's end-of-window timestamp.
#    build_lstm_tensors returns seven aligned objects: inputs X, the signal and
#    return targets, three raw price arrays (close/bid/ask) and `end_times`.
print('executing <build_lstm_tensors>...')
X, y_sig, y_ret, raw_close, raw_bid, raw_ask, end_times = models.build_lstm_tensors(
    df            = df_feat,
    look_back     = params.look_back_tick,
    features_cols = params.features_cols_tick,
    label_col     = params.label_col,
    return_col    = params.return_col,
    sess_start    = params.sess_start_pred_tick
)


# 2) Split by day using the `end_times` array (not df_feat).
#    Only the test split carries the raw close/bid/ask prices.
print('executing <chronological_split>...')
(
(X_tr,  y_sig_tr,  y_ret_tr),
(X_val, y_sig_val, y_ret_val),
(X_te,  y_sig_te,  y_ret_te,  raw_close_te, raw_bid_te, raw_ask_te),
samples_per_day,
day_id_tr, day_id_val, day_id_te
) = models.chronological_split(
    X, y_sig, y_ret,
    raw_close, raw_bid, raw_ask,
    end_times   = end_times,
    train_prop  = params.train_prop,
    val_prop    = params.val_prop,
    train_batch = params.hparams['TRAIN_BATCH']
)

# Carve `end_times` into the same three splits by position.
# NOTE(review): this assumes day_id_tr / day_id_val hold one entry per window and that
# chronological_split keeps windows contiguous, in order, and drops none — confirm
# against models.chronological_split, otherwise timestamps and tensors misalign.
n_tr  = day_id_tr .shape[0]
n_val = day_id_val.shape[0]
i_tr  = n_tr
i_val = n_tr + n_val

end_times_tr  = end_times[:i_tr]
end_times_val = end_times[i_tr:i_val]
end_times_te  = end_times[i_val:]

# 3) Wrap each split in a DataLoader; arguments are positional up to the raw prices,
#    so their order must match the signature of models.split_to_day_datasets.
print('executing <split_to_day_datasets>...')
train_loader, val_loader, test_loader = models.split_to_day_datasets(
    # train split:
    X_tr,            y_sig_tr,     y_ret_tr,   end_times_tr,
    # val split:
    X_val,           y_sig_val,    y_ret_val,  end_times_val,
    # test split + raw prices
    X_te,            y_sig_te,     y_ret_te,   end_times_te,
    raw_close_te, raw_bid_te, raw_ask_te,

    sess_start_time       = params.sess_start_pred_tick,
    signal_thresh         = params.best_optuna_params["buy_threshold"],
    return_thresh         = 0.01,  # flat-zone threshold for returns
    train_batch           = params.hparams["TRAIN_BATCH"],
    train_workers         = params.hparams["NUM_WORKERS"],
    train_prefetch_factor = params.hparams["TRAIN_PREFETCH_FACTOR"]
)

# how many calendar-days in each split? (length of each loader's dataset)
print("Days →",
      f"train={len(train_loader.dataset)},",
      f"val={len(val_loader.dataset)},",
      f"test={len(test_loader.dataset)}")

# how many sliding-windows in each split? (first dimension of each dataset's X)
print("Windows →",
      f"train={train_loader.dataset.X.shape[0]},",
      f"val={val_loader.dataset.X.shape[0]},",
      f"test={test_loader.dataset.X.shape[0]}")

# how many batches per loader?
print("Batches →",
      f"train={len(train_loader)},",
      f"val={len(val_loader)},",
      f"test={len(test_loader)}")

In [None]:
# Zero-forecast baseline on val vs test: the RMSE obtained by always predicting 0,
# i.e. √( mean( (yᵢ – 0)² ) ), as computed by models.naive_rmse.
val_baseline, test_baseline = (
    models.naive_rmse(split_loader) for split_loader in (val_loader, test_loader)
)

print(
    f"Val zero‐forecast baseline RMSE  = {val_baseline:.5f}",
    f"Test zero‐forecast baseline RMSE = {test_baseline:.5f}",
    sep="\n",
)

In [None]:
# to confirm the baseline proportions, calculate the STD
# σ = √( mean( (yᵢ – ȳ)² ) )

def _unpadded_targets(loader) -> np.ndarray:
    """
    Collect the regression targets (batch[1]) of every day served by `loader`.

    Batches are padded to a common length, and the last batch element holds each
    day's true number of windows (the same layout evaluate_model relies on).
    Flattening the padded tensor directly would mix padding values into the
    statistics, so each day is sliced to its true length first.
    """
    chunks = []
    for batch in loader:
        targets, lengths = batch[1], batch[-1]
        for i in range(targets.size(0)):
            # int() accepts both plain ints and 0-dim tensors
            chunks.append(targets[i, :int(lengths[i])].reshape(-1).numpy())
    return np.concatenate(chunks)


y_vals = _unpadded_targets(val_loader)
y_tes  = _unpadded_targets(test_loader)
print("std val:", np.std(y_vals))
print("std test:", np.std(y_tes))

plt.hist(y_vals, bins=100, alpha=0.5, label="val")
plt.hist(y_tes,  bins=100, alpha=0.5, label="test")
plt.legend(); plt.show()

In [None]:
# def evaluate_model(model, loader, device, split_name: str):
#     """
#     1) Set model.eval(), reset LSTM states.
#     2) Create/regress and binary torchmetrics (threshold=0.5).
#     3) For each padded batch:
#          - unpack lengths, weekdays, raw? data
#          - move to device, skip wd.squeeze()
#          - for each day i in batch:
#              • slice x_day, y_day, cls_t to true length
#              • reset model state on day rollover
#              • forward, sigmoid, update metrics only on real slots
#     4) Compute & print final metrics, return (metrics_dict, concatenated preds).
#     """
#     model.to(device).eval()
#     model.h_short = model.h_long = None
#     thr = 0.5

#     rmse_m = torchmetrics.MeanSquaredError(squared=False).to(device)
#     mae_m  = torchmetrics.MeanAbsoluteError().to(device)
#     r2_m   = torchmetrics.R2Score().to(device)
#     acc_m  = torchmetrics.classification.BinaryAccuracy(threshold=thr).to(device)
#     prec_m = torchmetrics.classification.BinaryPrecision(threshold=thr).to(device)
#     rec_m  = torchmetrics.classification.BinaryRecall(threshold=thr).to(device)
#     f1_m   = torchmetrics.classification.BinaryF1Score(threshold=thr).to(device)
#     auc_m  = torchmetrics.classification.BinaryAUROC().to(device)

#     for m in (rmse_m, mae_m, r2_m, acc_m, prec_m, rec_m, f1_m, auc_m):
#         m.reset()

#     all_preds = []
#     prev_day  = None

#     with torch.no_grad():
#         for batch in tqdm(loader, desc=split_name, unit="batch"):
#             # True if test split (with raw prices)
#             if len(batch) == 9:
#                 xb, yb_reg, yb_cls, rc, rb, ra, wd, ts_list, lengths = batch
#             else:
#                 xb, yb_reg, yb_cls, wd, ts_list, lengths = batch

#             xb     = xb.to(device, non_blocking=True)
#             yb_reg = yb_reg.to(device, non_blocking=True)
#             yb_cls = yb_cls.to(device, non_blocking=True)
#             wd     = wd.to(device, non_blocking=True)   # keep shape (B,)
#             B      = xb.size(0)

#             for i in range(B):
#                 W_true = lengths[i]
#                 day_id = int(wd[i].item())

#                 model.reset_short()
#                 if prev_day is not None and day_id < prev_day:
#                     model.reset_long()
#                 prev_day = day_id

#                 x_day = xb[i][:W_true]
#                 y_day = yb_reg[i, :W_true].view(-1)
#                 cls_t = yb_cls[i,   :W_true].view(-1)

#                 pred_reg, pred_cls = model(x_day)
#                 pr    = pred_reg[:, -1, 0]
#                 probs = torch.sigmoid(pred_cls[:, -1, 0])

#                 # update only on real (unpadded) windows
#                 rmse_m.update(pr,    y_day)
#                 mae_m .update(pr,    y_day)
#                 r2_m  .update(pr,    y_day)
#                 acc_m .update(probs, cls_t)
#                 prec_m.update(probs, cls_t)
#                 rec_m .update(probs, cls_t)
#                 f1_m  .update(probs, cls_t)
#                 auc_m .update(probs, cls_t)

#                 all_preds.append(pr.cpu().numpy())

#     metrics = {
#         "rmse": rmse_m.compute().item(),
#         "mae":  mae_m.compute().item(),
#         "r2":   r2_m.compute().item(),
#         "acc":  acc_m.compute().item(),
#         "precision": prec_m.compute().item(),
#         "recall":    rec_m.compute().item(),
#         "f1":        f1_m.compute().item(),
#         "auroc":     auc_m.compute().item(),
#     }
#     print(
#         f"{split_name} → "
#         f"RMSE={metrics['rmse']:.5f} MAE={metrics['mae']:.5f} "
#         f"R2={metrics['r2']:.4f} ACC={metrics['acc']:.4f} "
#         f"PREC={metrics['precision']:.4f} REC={metrics['recall']:.4f} "
#         f"F1={metrics['f1']:.4f} AUROC={metrics['auroc']:.4f}"
#     )
#     return metrics, np.concatenate(all_preds, axis=0)



def evaluate_model(model, loader, device, split_name: str):
    """
    Score a trained stateful model on every day served by `loader`.

    The model must expose `reset_short()`, `reset_long()` and the attributes
    `h_short` / `h_long`, and each call must return three tensors:
    (regression output, binary logits, ternary logits).

    Parameters
    ----------
    model      : network to evaluate; moved to `device` and put in eval mode.
    loader     : iterable of padded day-batches. An 11-element batch is read as
                 (xb, y_reg, y_bin, y_ret, y_ter, rc, rb, ra, wd, ts_list, lengths);
                 any other batch is unpacked as the 8-element form without rc/rb/ra.
    device     : torch device on which model, inputs and metrics live.
    split_name : label for the progress bar and the printed summary line.

    Procedure
    ---------
      1) Put the model in eval mode and clear its stored states.
      2) Build regression, binary (threshold 0.5) and 3-class torchmetrics.
      3) For every day in every batch: slice inputs/targets to the day's true
         length, reset the short state (and the long state when the day id taken
         from `wd` decreases), run the model, turn logits into probabilities
         and update all metrics.
      4) Compute the metrics and print them as  R: … | B: … | T: …

    Returns
    -------
    (metrics, preds) : dict of float metrics keyed
        rmse, mae, r2, acc, prec, rec, f1, auroc, t_acc, t_prec, t_rec, t_f1, t_auc,
        and the per-day regression predictions concatenated along axis 0.
    """
    # 1) Prepare model
    model.to(device).eval()
    model.h_short = model.h_long = None

    # 2) Metrics, grouped by output head; dict keys double as the result keys
    thr = 0.5
    tmc = torchmetrics.classification
    reg_metrics = {
        "rmse": torchmetrics.MeanSquaredError(squared=False).to(device),
        "mae":  torchmetrics.MeanAbsoluteError().to(device),
        "r2":   torchmetrics.R2Score().to(device),
    }
    bin_metrics = {
        "acc":   tmc.BinaryAccuracy(threshold=thr).to(device),
        "prec":  tmc.BinaryPrecision(threshold=thr).to(device),
        "rec":   tmc.BinaryRecall(threshold=thr).to(device),
        "f1":    tmc.BinaryF1Score(threshold=thr).to(device),
        "auroc": tmc.BinaryAUROC().to(device),
    }
    ter_metrics = {
        "t_acc":  tmc.MulticlassAccuracy(num_classes=3).to(device),
        "t_prec": tmc.MulticlassPrecision(num_classes=3, average="macro").to(device),
        "t_rec":  tmc.MulticlassRecall(num_classes=3, average="macro").to(device),
        "t_f1":   tmc.MulticlassF1Score(num_classes=3, average="macro").to(device),
        "t_auc":  tmc.MulticlassAUROC(num_classes=3, average="macro").to(device),
    }
    heads = (reg_metrics, bin_metrics, ter_metrics)
    for group in heads:
        for metric in group.values():
            metric.reset()

    day_preds = []
    last_day  = None

    # 3) Walk the loader one batch, then one day, at a time
    with torch.no_grad():
        for batch in tqdm(loader, desc=split_name, unit="batch"):
            # batches with raw prices have 11 fields, the others 8
            if len(batch) != 11:
                xb, y_reg, y_bin, y_ret, y_ter, wd, ts_list, lengths = batch
            else:
                (xb, y_reg, y_bin, y_ret, y_ter,
                 rc, rb, ra, wd, ts_list, lengths) = batch

            # move tensors to device
            xb, y_reg, y_bin, y_ret, y_ter, wd = (
                t.to(device, non_blocking=True)
                for t in (xb, y_reg, y_bin, y_ret, y_ter, wd)
            )

            for i in range(xb.size(0)):
                n_real = lengths[i]
                day_id = int(wd[i].item())

                # short state is cleared every day; long state only when the
                # day id goes backwards relative to the previous day
                model.reset_short()
                if not (last_day is None or day_id >= last_day):
                    model.reset_long()
                last_day = day_id

                # drop the padded slots
                x_day = xb[i, :n_real]
                y_day = y_reg[i, :n_real].view(-1)
                bin_t = y_bin[i, :n_real].view(-1)
                ter_t = y_ter[i, :n_real].view(-1)

                # forward pass; keep the last step of each output
                raw_reg, raw_bin, raw_ter = model(x_day)
                pr     = raw_reg[..., -1, 0]
                prob_b = torch.sigmoid(raw_bin[..., -1, 0])
                prob_t = torch.softmax(raw_ter[..., -1, :], dim=-1)

                # update metrics head by head
                for metric in reg_metrics.values():
                    metric.update(pr, y_day)
                for metric in bin_metrics.values():
                    metric.update(prob_b, bin_t)
                for metric in ter_metrics.values():
                    metric.update(prob_t, ter_t)

                day_preds.append(pr.cpu().numpy())

    # 4) Compute final metrics (same key order as the groups above)
    metrics = {
        name: metric.compute().item()
        for group in heads
        for name, metric in group.items()
    }

    # 5) Print in the desired format
    print(
        f"{split_name} → "
        f'"R": RMSE={metrics["rmse"]:.5f} MAE={metrics["mae"]:.5f} R2={metrics["r2"]:.4f} | '
        f'"B": Acc={metrics["acc"]:.4f} Prec={metrics["prec"]:.4f} Rec={metrics["rec"]:.4f} '
           f'F1={metrics["f1"]:.4f} AUROC={metrics["auroc"]:.4f} | '
        f'"T": Acc={metrics["t_acc"]:.4f} Prec={metrics["t_prec"]:.4f} Rec={metrics["t_rec"]:.4f} '
           f'F1={metrics["t_f1"]:.4f} AUROC={metrics["t_auc"]:.4f}'
    )

    return metrics, np.concatenate(day_preds, axis=0)


In [None]:
# Run evaluation on all three splits
train_metrics, train_preds = evaluate_model(
    model_best, train_loader, device, split_name="TRAIN"
)
val_metrics, val_preds     = evaluate_model(
    model_best, val_loader,   device, split_name="VALID"
)
test_metrics, test_preds   = evaluate_model(
    model_best, test_loader,  device, split_name="TEST"
)


def _format_metrics(tag: str, m: dict) -> str:
    """Render one split's metrics dict on a single line, in the format used during training
    (R = regression head, B = binary head, T = ternary head; 4 decimals each)."""
    return (
        f'{tag}→ '
        f'"R": RMSE={m["rmse"]:.4f} MAE={m["mae"]:.4f} '
        f'R2={m["r2"]:.4f} | '
        f'"B": Acc={m["acc"]:.4f} Prec={m["prec"]:.4f} '
        f'Rec={m["rec"]:.4f} F1={m["f1"]:.4f} '
        f'AUROC={m["auroc"]:.4f} | '
        f'"T": Acc={m["t_acc"]:.4f} Prec={m["t_prec"]:.4f} '
        f'Rec={m["t_rec"]:.4f} F1={m["t_f1"]:.4f} '
        f'AUROC={m["t_auc"]:.4f}'
    )


# Print in the same format as during training, one blank line between splits
# (one formatter instead of three copy-pasted print blocks)
print(
    _format_metrics("TRAIN", train_metrics),
    _format_metrics("VALID", val_metrics),
    _format_metrics("TEST",  test_metrics),
    sep="\n",
)

print("\nPredictions lengths:")
print(f"  Train: {len(train_preds)}")
print(f"  Valid: {len(val_preds)}")
print(f"  Test : {len(test_preds)}")


In [None]:
# Best Epoch
# • train RMSE=0.1936 MAE=0.1394 R2=0.2519 Acc=0.7961 Prec=0.4766 Rec=0.4808 F1=0.4787 AUROC=0.7909
# • val   RMSE=0.1815 MAE=0.1307 R2=0.2829 Acc=0.8003 Prec=0.4746 Rec=0.5130 F1=0.4931 AUROC=0.7985

In [None]:
# def add_pred_and_split(
#     df: pd.DataFrame,
#     train_preds: np.ndarray,
#     val_preds:   np.ndarray,
#     test_preds:  np.ndarray,
#     day_id_tr:   np.ndarray,
#     day_id_val:  np.ndarray,
#     day_id_te:   np.ndarray
# ) -> Tuple[pd.DataFrame, Set[pd.Timestamp], pd.DataFrame, List[pd.Timestamp]]:
#     """
#     Attach one‐per‐window predictions to each window‐end bar, then split into
#     train+val vs test. Prints detailed counts of kept/dropped windows.

#     Returns
#     -------
#     df_trainval   : DataFrame for train+val days with `pred_signal` set
#     train_val_days : set of train+val dates (pd.Timestamp)
#     df_test        : DataFrame for test days with `pred_signal` set
#     te_days        : sorted list of test dates (pd.Timestamp)
#     """

#     # 1) Copy input and initialize prediction columns
#     df2 = df.copy()
#     df2["pred_signal"] = np.nan
#     df2["pred_action"] = 0
#     print(f"🔍 Original DataFrame has {len(df2)} rows")

#     # 2) Map numeric day‐IDs back to actual calendar dates
#     days       = df2.index.normalize()
#     unique     = sorted(days.unique())
#     tr_days    = [unique[i] for i in np.unique(day_id_tr).astype(int)]
#     vl_days    = [unique[i] for i in np.unique(day_id_val).astype(int)]
#     te_days    = [unique[i] for i in np.unique(day_id_te).astype(int)]
#     print(f"• Train days: {len(tr_days)} | Val days: {len(vl_days)} | Test days: {len(te_days)}")

#     # 3) Tag each row with its calendar day and per‐day bar count
#     df2["day"] = days
#     df2["cnt"] = df2.groupby("day").cumcount()
#     print("✓ Computed per‐day bar counts (cnt) for look_back filtering")

#     # 4) Build mask for valid window‐end bars:
#     #    a) timestamp ≥ sess_start_pred
#     #    b) have at least look_back bars (cnt ≥ look_back - 1)
#     end_mask      = (
#         (df2.index.time >= params.sess_start_pred_tick)
#         & (df2["cnt"] >= params.look_back_tick - 1)
#     )
#     total_windows = end_mask.sum()
#     print(f"⏹ Found {total_windows} window‐end bars after time & look_back filter")

#     # 5) Extract index positions for each split
#     idx_tr  = df2.index[end_mask & df2["day"].isin(tr_days)]
#     idx_val = df2.index[end_mask & df2["day"].isin(vl_days)]
#     idx_te  = df2.index[end_mask & df2["day"].isin(te_days)]
#     print(f"  – Train idx: {len(idx_tr)} | Val idx: {len(idx_val)} | Test idx: {len(idx_te)}")

#     # 6) Helper to trim preds so they align 1:1 with window‐ends
#     def _trim(preds: np.ndarray, idx: pd.DatetimeIndex, name: str):
#         n_extra = len(preds) - len(idx)
#         if n_extra > 0:
#             print(f"⚠️  Dropping {n_extra} earliest {name} preds (too many predictions)")
#             return preds[n_extra:], idx
#         elif n_extra < 0:
#             keep = len(preds)
#             print(f"⚠️  Only {keep}/{len(idx)} {name} windows have preds (too few predictions)")
#             return preds, idx[:keep]
#         else:
#             print(f"✅ {name.capitalize()} preds match window‐ends exactly ({len(preds)})")
#             return preds, idx

#     # 7) Trim & assign predictions into df2
#     train_preds, idx_tr  = _trim(train_preds, idx_tr,  "train")
#     val_preds,   idx_val = _trim(val_preds,   idx_val, "val")
#     test_preds,  idx_te  = _trim(test_preds,  idx_te,  "test")

#     df2.loc[idx_tr,  "pred_signal"] = train_preds
#     df2.loc[idx_val, "pred_signal"] = val_preds
#     df2.loc[idx_te,  "pred_signal"] = test_preds
#     print("🎯 Stamped all predictions into 'pred_signal' column")

#     # 8) Build the final train+val vs test splits
#     train_val_days = set(tr_days + vl_days)
#     df_trainval   = (
#         df2[df2["day"].isin(train_val_days)]
#         .drop(columns=["day", "cnt"])
#     )
#     df_test        = (
#         df2[df2["day"].isin(te_days)]
#         .drop(columns=["day", "cnt"])
#     )
    
#     # drop all non‐window-end bars that remained NaN
#     df_trainval = df_trainval[df_trainval["pred_signal"].notna()]
#     df_test      = df_test     [df_test     ["pred_signal"].notna()]

#     print("🏁 Finished. Returning filtered DataFrames (no NaNs in pred_signal).")
#     return df_trainval, df_test


def add_pred_and_split(
    df: pd.DataFrame,
    train_preds: np.ndarray,
    val_preds:   np.ndarray,
    test_preds:  np.ndarray,
    end_times_tr:  np.ndarray,    # shape (N_train,)
    end_times_val: np.ndarray,    # shape (N_val,)
    end_times_te:  np.ndarray     # shape (N_test,)
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Write each window's prediction onto the bar that ends that window and
    return the stamped rows as (train+val frame, test frame).

    Parameters
    ----------
    df                       : frame indexed by bar timestamp; it is copied, never modified.
    train_preds/val_preds/
    test_preds               : one prediction per window of the respective split.
    end_times_tr/_val/_te    : end-of-window timestamps, aligned 1:1 with the
                               predictions; every timestamp must be a label of `df.index`.

    Returns
    -------
    (df_trainval, df_test) : rows of the copy selected by the train∪val and by the
        test end times, with two extra columns:
        `pred_signal` (the prediction) and `pred_action` (1 where pred_signal > 0, else 0).
        Rows whose `pred_signal` is NaN are dropped.

    Progress is reported through a fixed sequence of status lines on stdout.
    """
    stamped = df.copy()
    stamped["pred_signal"] = np.nan
    stamped["pred_action"] = 0

    # 1) original size
    print(f"🔍 Original DataFrame has {len(stamped)} rows")

    # 2) one DatetimeIndex per split, reused for the day counts and for stamping
    idx_tr  = pd.DatetimeIndex(end_times_tr)
    idx_val = pd.DatetimeIndex(end_times_val)
    idx_te  = pd.DatetimeIndex(end_times_te)

    tr_days = idx_tr.normalize().unique()
    vl_days = idx_val.normalize().unique()
    te_days = idx_te.normalize().unique()
    print(f"• Train days: {len(tr_days)} | Val days: {len(vl_days)} | Test days: {len(te_days)}")

    # 3) status line kept for log continuity; no per-day count is computed in this function
    print("✓ Computed per‐day bar counts (cnt) for look_back filtering")

    # 4) + 5) window-end totals, overall and per split
    n_tr, n_val, n_te = len(end_times_tr), len(end_times_val), len(end_times_te)
    total_end = n_tr + n_val + n_te
    print(f"⏹ Found {total_end} window‐end bars after time & look_back filter")
    print(f"  – Train idx: {n_tr} | Val idx: {n_val} | Test idx: {n_te}")

    # 6) pair predictions with their timestamps (pd.Series rejects a length
    #    mismatch), then write them into the copy split by split
    s_tr, s_val, s_te = (
        pd.Series(preds, index=stamps)
        for preds, stamps in (
            (train_preds, idx_tr),
            (val_preds,   idx_val),
            (test_preds,  idx_te),
        )
    )
    for split_series in (s_tr, s_val, s_te):
        stamped.loc[split_series.index, "pred_signal"] = split_series.values

    # 7) report the stamped counts
    print(f"✅ Train preds match window‐ends exactly ({len(train_preds)})")
    print(f"✅ Val preds match window‐ends exactly   ({len(val_preds)})")
    print(f"✅ Test preds match window‐ends exactly  ({len(test_preds)})")

    # 8) derive the action flag and cut out the two result frames
    print("🎯 Stamped all predictions into 'pred_signal' column")
    stamped["pred_action"] = stamped["pred_signal"].gt(0).astype(int)

    trainval_index = s_tr.index.union(s_val.index)
    df_trainval = stamped.loc[trainval_index].dropna(subset=["pred_signal"])
    df_test     = stamped.loc[s_te.index].dropna(subset=["pred_signal"])

    print("🏁 Finished. Returning filtered DataFrames (no NaNs in pred_signal).")
    return df_trainval, df_test

In [None]:
# Stamp the predictions of all three splits onto the feature frame
# (arguments follow the parameter order of add_pred_and_split).
df_trainval, df_test = add_pred_and_split(
    df_feat,
    train_preds, val_preds, test_preds,
    end_times_tr, end_times_val, end_times_te,
)

# Persist both frames: test first, then train+val.
print('saving the test csv...')
df_test.to_csv(params.test_csv)

print('saving the train&val csv...')
df_trainval.to_csv(params.trainval_csv)

df_trainval

In [None]:
# Train → RMSE=0.1934, MAE=0.1389, R2=0.2539, ACC=0.824, PREC=0.598, REC=0.291, F1=0.391, AUROC=0.798
# Validation → RMSE=0.1837, MAE=0.1302, R2=0.2648, ACC=0.830, PREC=0.627, REC=0.252, F1=0.359, AUROC=0.797
# Test → RMSE=0.1812, MAE=0.1283, R2=0.2103, ACC=0.836, PREC=0.599, REC=0.205, F1=0.305, AUROC=0.763

In [None]:
# # grab one batch from train and one from val
# xb_tr, yb_tr_reg, yb_tr_cls, _ = next(iter(train_loader))
# xb_vl, yb_vl_reg, yb_vl_cls, _ = next(iter(val_loader))

# # run both through your model
# with torch.no_grad():
#     out_tr = model_best(xb_tr.to(device))
#     out_vl = model_best(xb_vl.to(device))

# # unpack logits
# logits_tr = out_tr[1] if isinstance(out_tr, tuple) else out_tr
# logits_vl = out_vl[1] if isinstance(out_vl, tuple) else out_vl

# print(" TRAIN batch shapes")
# print("  xb    :", tuple(xb_tr.shape))
# print("  logits:", tuple(logits_tr.shape))
# print("  yb_cls:", tuple(yb_tr_cls.shape))

# print("\n  VAL batch shapes")
# print("  xb    :", tuple(xb_vl.shape))
# print("  logits:", tuple(logits_vl.shape))
# print("  yb_cls:", tuple(yb_vl_cls.shape))

In [None]:
# Compute Descriptive Statistics
# Comparing the means shows systematic over/under-estimation; std and the
# correlation show how tightly the prediction tracks the true signal.

# one row per column (true signal, predicted signal), one column per statistic
compared_cols = ['signal', 'pred_signal']
stats = df_test[compared_cols].describe().transpose()

# add the value range and the Pearson correlation between the two columns
stats['range'] = stats['max'].sub(stats['min'])
corr = df_test['signal'].corr(df_test['pred_signal'])
stats['pearson_r_with_other'] = [corr] * len(compared_cols)

stats

In [None]:
# Distribution Overlay
# Overlaying the two histograms reveals bias or a mismatched shape between
# the true and the predicted signal.

plt.figure(figsize=(8,4))
# each series is labelled with its own column name
for column, colour in (('signal', 'C0'), ('pred_signal', 'C1')):
    sns.histplot(df_test[column], color=colour, alpha=0.5, bins=50, label=column)
plt.legend()
plt.xlabel('Signal Value')
plt.ylabel('Count')
plt.title('Histogram of true signal vs. pred signal')
plt.show()

In [None]:
# Scatter Plot: Relationship
# Points off the 45° line mark under/over-prediction regions and non-linear errors.

plt.figure(figsize=(5,5))
plt.scatter(
    x=df_test['signal'],
    y=df_test['pred_signal'],
    s=5,
    alpha=0.3,
    color='C2',
)
diagonal = [0, 1]
plt.plot(diagonal, diagonal, 'k--', linewidth=1)  # 45° reference line
plt.xlabel('signal')
plt.ylabel('pred_signal')
plt.title('pred signal vs. true signal')
plt.axis('equal')
plt.show()

In [None]:
# Time-Series Comparison (Sample)
# Plotting both series over one day shows whether the model lags or leads the signal.

# sample = the last calendar day present in the test frame
# (`day` and `mask` are reused by the error-analysis cell)
test_dates = df_test.index.normalize()
day = test_dates.unique()[-1]
mask = test_dates == day

plt.figure(figsize=(10,3))
for column, legend_label in (('signal', 'true signal'), ('pred_signal', 'pred signal')):
    plt.plot(df_test.index[mask], df_test.loc[mask, column], label=legend_label)
plt.legend(loc='upper left')
plt.title(f'Signals on {day.date()}')
plt.xlabel('Time')
plt.ylabel('Signal')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Error Analysis
# The error plots show where and when the model struggles most (lag,
# amplitude scaling, threshold issues).

# signed error: positive where the model over-predicts
df_test['error'] = df_test['pred_signal'].sub(df_test['signal'])

# Distribution of prediction error over the whole test set
plt.figure(figsize=(6,3))
sns.histplot(df_test['error'], bins=50, color='C3', kde=True)
plt.xlabel('Prediction Error')
plt.title('Error Distribution: pred signal − true signal')
plt.show()

# Time evolution of the error on the sample day chosen in the time-series cell
# (`mask` and `day` come from that cell)
day_error = df_test.loc[mask, 'error']
plt.figure(figsize=(10,3))
plt.plot(df_test.index[mask], day_error, color='C4')
plt.axhline(0, color='k', linestyle='--', linewidth=1)
plt.title(f'Prediction Error over time on {day.date()}')
plt.xlabel('Time')
plt.ylabel('Error')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# generate_trade_actions keeps its in_trade flag across a multi-day block (it does not
# reset at midnight), so every test day is generated and simulated on its own.

# one (day, frame) pair per calendar day of the test set
groups = list(df_test.groupby(df_test.index.normalize()))
n_days = len(groups)

# settings that are identical for every day
action_kwargs = dict(
    col_signal="pred_signal",
    col_action="pred_action",
    buy_threshold=params.pred_threshold_tick,
    trailing_stop_pct=params.trailing_stop_pred_tick,
    sess_start=params.sess_start,
)
simulation_kwargs = dict(
    col_action="pred_action",
    sess_start=params.sess_start,
    sess_end=params.sess_end,
    ticker=ticker,
)

sim_results = {}

for day, df_day in tqdm(groups, total=n_days, desc="Generate+Simulate"):
    # 1) trade actions for this day only
    df_actions = trades.generate_trade_actions(df=df_day, **action_kwargs)

    # 2) simulate on a single-day dict; the second tuple slot is passed as an empty list
    single_result = trades.simulate_trading(
        results_by_day_sign={day: (df_actions, [])},
        **simulation_kwargs,
    )

    # 3) single_result is { day: (df_sim, trades, stats) } — merge it into the overall dict
    sim_results.update(single_result)



In [None]:
# Re-read params so a changed `date_to_check` is picked up without restarting the kernel
importlib.reload(params)
# month to inspect (YYYY-MM)
date_to_test = params.date_to_check

year, month = map(int, date_to_test.split("-"))

# 1) Build lists of days in that month + accumulate ALL days
days_in_month = []
performance_month = []
performance_all   = []

for day, (df_sim, trades_list, perf_stats) in sim_results.items():
    # always collect for the global summary
    performance_all.append(perf_stats)

    # pick out this month for plotting
    if day.year == year and day.month == month:
        days_in_month.append(day)
        performance_month.append(perf_stats)

# 2) Plot & print per-day stats for the month
if not days_in_month:
    print(f"No simulation data for {date_to_test}")
else:
    print(f"\nPlotting days in {date_to_test}:")
    for day in days_in_month:
        df_sim, trades_list, perf_stats = sim_results[day]
        plots.plot_trades(
            df                = df_sim,
            col_signal1       = "signal",
            col_signal2       = "pred_signal",
            col_action        = "pred_action",
            trades            = trades_list,
            buy_threshold     = params.pred_threshold_tick,
            performance_stats = perf_stats
        )

        print(f"\n=== Performance for {day} ===")
        for k, v in perf_stats.items():
            print(f"{k}: {v}")

    # 3) Monthly summary — only when the month has simulated days; aggregating an
    #    empty stats list over an empty frame is skipped, the message above already says so
    df_month = df_test[df_test.index.to_period("M") == date_to_test]
    plots.aggregate_performance(performance_month, df_month)

# 4) Overall summary across ALL days, with date range
plots.aggregate_performance(performance_all, df_test)