In [58]:
# ============================
# Call 1: ตั้งค่าเริ่มต้น + import + โหลด meta
# ============================

import os
import json
import numpy as np
import pandas as pd
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, Tuple, List

import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print("Device:", device)

# ---- paths ----
DATA_DIR = Path(
    "/Users/thanaporn/Desktop/EURO_H1_AI/prepared_datasets/boosting_dl_residual"
)
NPZ_PATH = DATA_DIR / "eurusd_struct_sequences.npz"
META_PATH = DATA_DIR / "eurusd_struct_meta.json"

with open(META_PATH, "r", encoding="utf-8") as f:
    meta = json.load(f)

print("Loaded meta keys:", meta.keys())
print("Feature cols:", len(meta["feature_cols"]))
print("Targets boosting:", meta["targets_boosting"])
print("Targets DL true:", meta["targets_dl_true"])

Device: mps
Loaded meta keys: dict_keys(['csv_path', 'seq_len', 'horizon', 'train_ratio', 'val_ratio', 'feature_cols', 'columns_required', 'targets_boosting', 'targets_dl_true', 'note'])
Feature cols: 24
Targets boosting: ['gap_next', 'range_next', 'body_next']
Targets DL true: ['gap_next', 'range_next', 'body_next', 'upper_wick_next', 'lower_wick_next']


In [59]:
# ============================
# Call 2: โหลด NPZ dataset
# ============================

npz = np.load(NPZ_PATH, allow_pickle=True)

# ---- Boosting tabular ----
Xb_train = npz["Xb_train"]
Xb_val = npz["Xb_val"]
Xb_test = npz["Xb_test"]

yb_train = npz["yb_train"]  # [N,3] = gap_next, range_next, body_next
yb_val = npz["yb_val"]
yb_test = npz["yb_test"]

idxb_train = npz["idxb_train"]
idxb_val = npz["idxb_val"]
idxb_test = npz["idxb_test"]

# ---- DL sequences ----
Xs_train = npz["Xs_train"]  # [N, seq_len, F]
Xs_val = npz["Xs_val"]
Xs_test = npz["Xs_test"]

yd_train_true = npz["yd_train_true"]  # [N,5] = gap,range,body,uw,lw (true)
yd_val_true = npz["yd_val_true"]
yd_test_true = npz["yd_test_true"]

idxs_train = npz["idxs_train"]
idxs_val = npz["idxs_val"]
idxs_test = npz["idxs_test"]

print("Boosting train:", Xb_train.shape, yb_train.shape)
print("DL train:", Xs_train.shape, yd_train_true.shape)

Boosting train: (2912, 24) (2912, 3)
DL train: (2895, 24, 24) (2895, 5)


In [60]:
# ============================
# Call 3: Train Boosting (LightGBM 3 โมเดล) + ทำ pred ทั้งชุด
# ============================

target_names = meta["targets_boosting"]  # ["gap_next","range_next","body_next"]

params = dict(
    objective="regression",
    n_estimators=3000,
    learning_rate=0.02,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    random_state=42,
)

boost_models = []


def train_one_target(i: int):
    model = lgb.LGBMRegressor(**params)
    model.fit(
        Xb_train,
        yb_train[:, i],
        eval_set=[(Xb_val, yb_val[:, i])],
        eval_metric="l2",
        callbacks=[lgb.early_stopping(200, verbose=False)],
    )
    return model


for i, tname in enumerate(target_names):
    m = train_one_target(i)
    boost_models.append(m)
    print(f"Trained boosting target: {tname}, best_iter={m.best_iteration_}")


# ---- predictions for all splits ----
def predict_boost(X_np):
    X_df = pd.DataFrame(X_np, columns=meta["feature_cols"])
    preds = []
    for m in boost_models:
        preds.append(m.predict(X_df, num_iteration=m.best_iteration_))
    return np.stack(preds, axis=1)  # [N,3]


yb_pred_train = predict_boost(Xb_train)
yb_pred_val = predict_boost(Xb_val)
yb_pred_test = predict_boost(Xb_test)


# ---- evaluation ----
def eval_boost(name, y_true, y_pred):
    maes = mean_absolute_error(y_true, y_pred, multioutput="raw_values")
    rmses = np.sqrt(mean_squared_error(y_true, y_pred, multioutput="raw_values"))
    print(f"[Boost {name}] MAE:", dict(zip(target_names, maes)))
    print(f"[Boost {name}] RMSE:", dict(zip(target_names, rmses)))


eval_boost("train", yb_train, yb_pred_train)
eval_boost("val", yb_val, yb_pred_val)
eval_boost("test", yb_test, yb_pred_test)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000513 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4618
[LightGBM] [Info] Number of data points in the train set: 2912, number of used features: 20
[LightGBM] [Info] Start training from score -0.000018




Trained boosting target: gap_next, best_iter=8
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000224 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4618
[LightGBM] [Info] Number of data points in the train set: 2912, number of used features: 20
[LightGBM] [Info] Start training from score 0.009904




Trained boosting target: range_next, best_iter=137
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000232 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4618
[LightGBM] [Info] Number of data points in the train set: 2912, number of used features: 20
[LightGBM] [Info] Start training from score -0.000058




Trained boosting target: body_next, best_iter=11
[Boost train] MAE: {'gap_next': np.float64(0.00032645472697125614), 'range_next': np.float64(0.0021351013930006545), 'body_next': np.float64(0.004817963623254481)}
[Boost train] RMSE: {'gap_next': np.float64(0.0012012072929406489), 'range_next': np.float64(0.0030708938555879203), 'body_next': np.float64(0.006557385153565503)}
[Boost val] MAE: {'gap_next': np.float64(0.00023072437597563967), 'range_next': np.float64(0.0024945631913204282), 'body_next': np.float64(0.004142512703122564)}
[Boost val] RMSE: {'gap_next': np.float64(0.0006999679107057043), 'range_next': np.float64(0.003344482495706121), 'body_next': np.float64(0.005418909247105769)}
[Boost test] MAE: {'gap_next': np.float64(0.000235636980620905), 'range_next': np.float64(0.0025889435095274175), 'body_next': np.float64(0.0035548009195827666)}
[Boost test] RMSE: {'gap_next': np.float64(0.0007902346938645659), 'range_next': np.float64(0.0034403079437801683), 'body_next': np.float6

In [61]:
# ============================
# Call 4: Align boosting preds -> make residual -> scale DL inputs
# ============================

# ---- 1) รวม tabular ทั้งหมด แล้ว predict boosting ทั้งชุด ----
Xb_all = np.concatenate([Xb_train, Xb_val, Xb_test], axis=0)
yb_pred_all = predict_boost(Xb_all)  # shape [N_tab, 3]

# ---- 2) รวม DL true targets ทั้งหมด ----
yd_true_all = np.concatenate([yd_train_true, yd_val_true, yd_test_true], axis=0)
N_dl = len(yd_true_all)

# ---- 3) คำนวณ offset ของ DL ต่อ tabular ----
# DL เริ่มใช้ target ที่แถว i = seq_len-1 ของ tabular
offset = meta["seq_len"] - 1

# ---- 4) slice boosting preds ให้ตรงกับ DL timeline ----
yb_pred_dl_all = yb_pred_all[offset : offset + N_dl]
assert (
    len(yb_pred_dl_all) == N_dl
), f"Aligned pred length mismatch: {len(yb_pred_dl_all)} vs {N_dl}"

# ---- 5) แยกกลับเป็น train/val/test ตามความยาว DL จริง ----
n_tr = len(yd_train_true)
n_va = len(yd_val_true)
n_te = len(yd_test_true)

yb_pred_train_aligned = yb_pred_dl_all[:n_tr]
yb_pred_val_aligned = yb_pred_dl_all[n_tr : n_tr + n_va]
yb_pred_test_aligned = yb_pred_dl_all[n_tr + n_va :]

assert yb_pred_train_aligned.shape[0] == n_tr
assert yb_pred_val_aligned.shape[0] == n_va
assert yb_pred_test_aligned.shape[0] == n_te


# ---- 6) ทำ residual targets สำหรับ DL ----
def make_residual(yd_true_split, yb_pred_split):
    yd = yd_true_split.copy()
    yd[:, 0:3] = yd[:, 0:3] - yb_pred_split  # res_gap,res_range,res_body
    return yd


yd_train = make_residual(yd_train_true, yb_pred_train_aligned)
yd_val = make_residual(yd_val_true, yb_pred_val_aligned)
yd_test = make_residual(yd_test_true, yb_pred_test_aligned)

print("DL residual y shapes:", yd_train.shape, yd_val.shape, yd_test.shape)

# ---- 7) standardize DL inputs (fit scaler on train only) ----
B, T, F = Xs_train.shape
scaler = StandardScaler()
scaler.fit(Xs_train.reshape(-1, F))


def scale_seq(Xs):
    Xflat = Xs.reshape(-1, F)
    Xflat = scaler.transform(Xflat)
    return Xflat.reshape(Xs.shape)


Xs_train_s = scale_seq(Xs_train)
Xs_val_s = scale_seq(Xs_val)
Xs_test_s = scale_seq(Xs_test)

print("Scaled DL X:", Xs_train_s.shape)
print("Alignment check (train):", yd_train.shape, yb_pred_train_aligned.shape)

DL residual y shapes: (2895, 5) (620, 5) (621, 5)
Scaled DL X: (2895, 24, 24)
Alignment check (train): (2895, 5) (2895, 3)


In [62]:
# ============================
# Call 5: สร้าง DataLoader + โมเดล DL (TCN) สำหรับ residual + wick
# ============================


class SeqDataset(Dataset):
    def __init__(self, X_seq, y):
        self.X = torch.tensor(X_seq, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        return self.X[i], self.y[i]


train_ds = SeqDataset(Xs_train_s, yd_train)
val_ds = SeqDataset(Xs_val_s, yd_val)
test_ds = SeqDataset(Xs_test_s, yd_test)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=128, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=128, shuffle=False)


class TCNBlock(nn.Module):
    def __init__(self, c_in, c_out, k=3, dilation=1, dropout=0.1):
        super().__init__()
        pad = (k - 1) * dilation
        self.net = nn.Sequential(
            nn.Conv1d(c_in, c_out, k, padding=pad, dilation=dilation),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Conv1d(c_out, c_out, k, padding=pad, dilation=dilation),
            nn.ReLU(),
            nn.Dropout(dropout),
        )
        self.down = nn.Conv1d(c_in, c_out, 1) if c_in != c_out else nn.Identity()

    def forward(self, x):
        y = self.net(x)
        y = y[..., : x.size(-1)]  # causal crop
        return y + self.down(x)


class TCN(nn.Module):
    def __init__(self, n_features, n_outputs=5, channels=(64, 64, 64)):
        super().__init__()
        layers = []
        c_in = n_features
        for i, c_out in enumerate(channels):
            layers.append(TCNBlock(c_in, c_out, dilation=2**i))
            c_in = c_out
        self.tcn = nn.Sequential(*layers)
        self.head = nn.Linear(channels[-1], n_outputs)
        self.softplus = nn.Softplus()

    def forward(self, x):
        # x: [B, T, F] -> [B, F, T]
        x = x.transpose(1, 2)
        z = self.tcn(x)
        z_last = z[..., -1]
        raw = self.head(z_last)  # [B,5]

        # non-inplace: สร้างคอลัมน์ใหม่
        res_part = raw[:, 0:3]
        uw = self.softplus(raw[:, 3:4])
        lw = self.softplus(raw[:, 4:5])

        out = torch.cat([res_part, uw, lw], dim=1)
        return out


model = TCN(n_features=F, n_outputs=5).to(device)
opt = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
loss_fn = nn.HuberLoss()

print(model)

TCN(
  (tcn): Sequential(
    (0): TCNBlock(
      (net): Sequential(
        (0): Conv1d(24, 64, kernel_size=(3,), stride=(1,), padding=(2,))
        (1): ReLU()
        (2): Dropout(p=0.1, inplace=False)
        (3): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(2,))
        (4): ReLU()
        (5): Dropout(p=0.1, inplace=False)
      )
      (down): Conv1d(24, 64, kernel_size=(1,), stride=(1,))
    )
    (1): TCNBlock(
      (net): Sequential(
        (0): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(2,))
        (1): ReLU()
        (2): Dropout(p=0.1, inplace=False)
        (3): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(2,))
        (4): ReLU()
        (5): Dropout(p=0.1, inplace=False)
      )
      (down): Identity()
    )
    (2): TCNBlock(
      (net): Sequential(
        (0): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(4,))
        (1): ReLU()
        (2): Dropout(p=0.1, inplace=False)
    

In [63]:
# ============================
# Call 6: Train DL (early stopping) + evaluate residual space
# ============================

best_val = 1e9
best_state = None
patience, wait = 20, 0
max_epochs = 200

for epoch in range(1, max_epochs + 1):
    # ---- train ----
    model.train()
    tr_losses = []
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        pred = model(xb)
        loss = loss_fn(pred, yb)
        opt.zero_grad()
        loss.backward()
        opt.step()
        tr_losses.append(loss.item())

    # ---- val ----
    model.eval()
    va_losses = []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            loss = loss_fn(pred, yb)
            va_losses.append(loss.item())

    tr_loss = float(np.mean(tr_losses))
    va_loss = float(np.mean(va_losses))

    if va_loss < best_val:
        best_val = va_loss
        wait = 0
        best_state = {k: v.cpu() for k, v in model.state_dict().items()}
    else:
        wait += 1
        if wait >= patience:
            print("Early stopping.")
            break

    if epoch % 5 == 0 or epoch == 1:
        print(f"Epoch {epoch:3d} | train {tr_loss:.5f} | val {va_loss:.5f}")

model.load_state_dict(best_state)


# ---- eval residual+wick predictions ----
def predict_dl(loader):
    model.eval()
    preds = []
    trues = []
    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(device)
            pred = model(xb).cpu().numpy()
            preds.append(pred)
            trues.append(yb.numpy())
    return np.concatenate(preds), np.concatenate(trues)


dl_pred_test, dl_true_test = predict_dl(test_loader)

dl_names = ["res_gap", "res_range", "res_body", "upper_wick_next", "lower_wick_next"]
mae_dl = mean_absolute_error(dl_true_test, dl_pred_test, multioutput="raw_values")
rmse_dl = np.sqrt(
    mean_squared_error(dl_true_test, dl_pred_test, multioutput="raw_values")
)
print("[DL test] MAE:", dict(zip(dl_names, mae_dl)))
print("[DL test] RMSE:", dict(zip(dl_names, rmse_dl)))

Epoch   1 | train 0.01780 | val 0.00178
Epoch   5 | train 0.00093 | val 0.00020
Epoch  10 | train 0.00018 | val 0.00005
Epoch  15 | train 0.00005 | val 0.00002
Epoch  20 | train 0.00002 | val 0.00001
Epoch  25 | train 0.00002 | val 0.00001
Epoch  30 | train 0.00001 | val 0.00001
Epoch  35 | train 0.00001 | val 0.00001
Epoch  40 | train 0.00001 | val 0.00001
Epoch  45 | train 0.00001 | val 0.00001
Epoch  50 | train 0.00001 | val 0.00001
Epoch  55 | train 0.00001 | val 0.00001
Epoch  60 | train 0.00001 | val 0.00001
Epoch  65 | train 0.00001 | val 0.00001
Epoch  70 | train 0.00001 | val 0.00001
Early stopping.
[DL test] MAE: {'res_gap': np.float32(0.0011957579), 'res_range': np.float32(0.0028181921), 'res_body': np.float32(0.0039695823), 'upper_wick_next': np.float32(0.0015001134), 'lower_wick_next': np.float32(0.0013984661)}
[DL test] RMSE: {'res_gap': np.float32(0.0016117169), 'res_range': np.float32(0.0037278363), 'res_body': np.float32(0.0053097545), 'upper_wick_next': np.float32(0.0

In [64]:
# ============================
# Call 7 (UPDATED): Align test preds -> Reconstruct OHLC -> Final metrics -> Save models
# ============================

feature_cols = meta["feature_cols"]
close_pos = feature_cols.index("Close")
seq_len = meta["seq_len"]
offset = seq_len - 1

# --- 1) รวม tabular ทั้งหมด + boosting pred ทั้งชุด (เหมือน Call 4) ---
Xb_all = np.concatenate([Xb_train, Xb_val, Xb_test], axis=0)
yb_pred_all = predict_boost(Xb_all)  # [N_tab, 3]

# --- 2) รวม DL true targets ทั้งหมดเพื่อหา N_dl และ slice pred ---
yd_true_all = np.concatenate([yd_train_true, yd_val_true, yd_test_true], axis=0)
N_dl = len(yd_true_all)

yb_pred_dl_all = yb_pred_all[offset : offset + N_dl]
assert len(yb_pred_dl_all) == N_dl

# --- 3) แยก yb_pred ที่ align แล้วกลับเป็น train/val/test ตาม DL lengths ---
n_tr = len(yd_train_true)
n_va = len(yd_val_true)
n_te = len(yd_test_true)

yb_pred_train_aligned = yb_pred_dl_all[:n_tr]
yb_pred_val_aligned = yb_pred_dl_all[n_tr : n_tr + n_va]
yb_pred_test_aligned = yb_pred_dl_all[n_tr + n_va :]

assert yb_pred_test_aligned.shape[0] == n_te == dl_pred_test.shape[0]

# --- 4) close_t สำหรับ DL test ต้อง align ช่วงเดียวกับ DL test ด้วย ---
# tabular rows ที่ตรงกับ DL ทั้งชุดคือ offset : offset+N_dl
close_t_all = Xb_all[:, close_pos]
close_t_dl_all = close_t_all[offset : offset + N_dl]

close_t_test_aligned = close_t_dl_all[n_tr + n_va :]  # ยาวเท่า DL test
assert close_t_test_aligned.shape[0] == n_te

# --- 5) รวม struct preds บน test (boost aligned + dl residual) ---
struct_pred_test = yb_pred_test_aligned + dl_pred_test[:, 0:3]  # [N_te,3]
gap_pred, range_pred, body_pred = (
    struct_pred_test[:, 0],
    struct_pred_test[:, 1],
    struct_pred_test[:, 2],
)
uw_pred, lw_pred = dl_pred_test[:, 3], dl_pred_test[:, 4]

# --- 6) reconstruct OHLC preds ---
open_next_pred = close_t_test_aligned + gap_pred
close_next_pred = open_next_pred + body_pred
high_next_pred = np.maximum(open_next_pred, close_next_pred) + uw_pred
low_next_pred = np.minimum(open_next_pred, close_next_pred) - lw_pred

# --- 7) reconstruct OHLC true (ใช้ DL true + close_t ที่ align แล้ว) ---
gap_true, range_true, body_true = (
    yd_test_true[:, 0],
    yd_test_true[:, 1],
    yd_test_true[:, 2],
)
uw_true, lw_true = yd_test_true[:, 3], yd_test_true[:, 4]

open_next_true = close_t_test_aligned + gap_true
close_next_true = open_next_true + body_true
high_next_true = np.maximum(open_next_true, close_next_true) + uw_true
low_next_true = np.minimum(open_next_true, close_next_true) - lw_true

# --- 8) metric ---
from sklearn.metrics import mean_absolute_error, mean_squared_error


def mae(a, b):
    return float(mean_absolute_error(a, b))


def rmse(a, b):
    return float(np.sqrt(mean_squared_error(a, b)))


print("\n[FINAL OHLC test - ALIGNED]")
print(
    "Open  MAE/RMSE:",
    mae(open_next_true, open_next_pred),
    rmse(open_next_true, open_next_pred),
)
print(
    "High  MAE/RMSE:",
    mae(high_next_true, high_next_pred),
    rmse(high_next_true, high_next_pred),
)
print(
    "Low   MAE/RMSE:",
    mae(low_next_true, low_next_pred),
    rmse(low_next_true, low_next_pred),
)
print(
    "Close MAE/RMSE:",
    mae(close_next_true, close_next_pred),
    rmse(close_next_true, close_next_pred),
)

# --- 9) save models ---
MODEL_DIR = DATA_DIR / "trained_models"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

import joblib

for i, m in enumerate(boost_models):
    joblib.dump(m, MODEL_DIR / f"lgb_{target_names[i]}.pkl")

torch.save(model.state_dict(), MODEL_DIR / "tcn_residual.pth")
joblib.dump(scaler, MODEL_DIR / "dl_scaler.pkl")

print("\n✔ Saved models to:", MODEL_DIR)
print(" - LightGBM x3")
print(" - TCN residual")
print(" - DL scaler")


[FINAL OHLC test - ALIGNED]
Open  MAE/RMSE: 0.001195757478553965 0.0016117184871266151
High  MAE/RMSE: 0.00316229766539879 0.004590333359410505
Low   MAE/RMSE: 0.0028585699776050725 0.004015433682250394
Close MAE/RMSE: 0.003884972620721231 0.005273697946394204

✔ Saved models to: /Users/thanaporn/Desktop/EURO_H1_AI/prepared_datasets/boosting_dl_residual/trained_models
 - LightGBM x3
 - TCN residual
 - DL scaler
