In [1]:
# ============================
# Call 1: ตั้งค่าเริ่มต้น + import library
# ============================

import os
import json
from dataclasses import dataclass
from typing import Tuple, Dict, List, Optional

import numpy as np
import pandas as pd
from pathlib import Path

# Keep rendered DataFrames compact in cell outputs: few rows, many columns, wide lines.
for _option, _value in (
    ("display.max_rows", 10),
    ("display.max_columns", 80),
    ("display.width", 140),
):
    pd.set_option(_option, _value)


@dataclass
class DatasetConfig:
    """
    Central configuration for building the dataset.

    Attributes:
        csv_path: Path of the OHLCV CSV file.
        *_candidates: Accepted header spellings for each logical column
            (time, open, high, low, close, volume, spread).
        seq_len: Number of bars in each input sequence for the DL residual model.
        horizon: How many bars ahead the targets look (default 1).
        train_ratio / val_ratio: Fractions used for the chronological
            train / validation split; whatever remains is the test split.

    Raises:
        ValueError: If ``seq_len`` or ``horizon`` is smaller than 1, if
            ``train_ratio`` is not positive, if ``val_ratio`` is negative, or
            if the two ratios add up to more than 1.
    """

    csv_path: str = "data_csv/EURUSD_D1.csv"

    # Header names tried, in order, for each logical column.
    time_candidates: Tuple[str, ...] = ("Time", "time", "date", "datetime")
    open_candidates: Tuple[str, ...] = ("Open", "open", "open_price")
    high_candidates: Tuple[str, ...] = ("High", "high")
    low_candidates: Tuple[str, ...] = ("Low", "low")
    close_candidates: Tuple[str, ...] = ("Close", "close", "close_price")
    volume_candidates: Tuple[str, ...] = ("Volume", "volume", "vol", "tick_volume")
    spread_candidates: Tuple[str, ...] = ("Spread", "spread", "sprd")

    seq_len: int = 24
    horizon: int = 1

    train_ratio: float = 0.7
    val_ratio: float = 0.15  # the remainder is the test split

    def __post_init__(self) -> None:
        # Fail early: a bad window size or split ratio would otherwise only
        # show up later as empty or oddly sized arrays.
        if self.seq_len < 1:
            raise ValueError(f"seq_len must be >= 1, got {self.seq_len}")
        if self.horizon < 1:
            raise ValueError(f"horizon must be >= 1, got {self.horizon}")
        if self.train_ratio <= 0:
            raise ValueError(f"train_ratio must be > 0, got {self.train_ratio}")
        if self.val_ratio < 0:
            raise ValueError(f"val_ratio must be >= 0, got {self.val_ratio}")
        # Small tolerance so ratios such as 0.7 + 0.3 are not rejected by rounding.
        if self.train_ratio + self.val_ratio > 1.0 + 1e-9:
            raise ValueError(
                "train_ratio + val_ratio must not exceed 1, got "
                f"{self.train_ratio} + {self.val_ratio}"
            )


# Build the configuration with all defaults; the bare name on the last line
# makes the notebook display its repr as the cell output.
config = DatasetConfig()
config

DatasetConfig(csv_path='data_csv/EURUSD_D1.csv', time_candidates=('Time', 'time', 'date', 'datetime'), open_candidates=('Open', 'open', 'open_price'), high_candidates=('High', 'high'), low_candidates=('Low', 'low'), close_candidates=('Close', 'close', 'close_price'), volume_candidates=('Volume', 'volume', 'vol', 'tick_volume'), spread_candidates=('Spread', 'spread', 'sprd'), seq_len=24, horizon=1, train_ratio=0.7, val_ratio=0.15)

In [2]:
# ============================
# Call 2: โหลดข้อมูล EURUSD จาก CSV + เตรียม DataFrame ให้สะอาด
# ============================


def _search_csv_path(cfg: DatasetConfig) -> Path:
    """หาไฟล์ CSV แบบยืดหยุ่น (relative / ancestor)"""
    cwd = Path.cwd()
    for _ in range(6):
        probe = cwd / cfg.csv_path
        if probe.exists():
            return probe
        cwd = cwd.parent

    candidates = [
        Path(cfg.csv_path),
        Path("..") / cfg.csv_path,
        Path.cwd() / cfg.csv_path,
    ]
    for c in candidates:
        if c.exists():
            return c

    return Path(cfg.csv_path)


def _find_col_by_candidates(
    df: pd.DataFrame, candidates: Tuple[str, ...]
) -> Optional[str]:
    """หา column ใน df จาก list candidates (case-insensitive)"""
    col_map_lower = {c.lower(): c for c in df.columns}
    for name in candidates:
        lower = name.lower()
        if lower in col_map_lower:
            return col_map_lower[lower]
    return None


def load_ohlcv(cfg: DatasetConfig) -> pd.DataFrame:
    """
    Load the OHLCV CSV and normalise it to a standard layout.

    Columns are matched through the candidate names in `cfg` and renamed to
    'Time', 'Open', 'High', 'Low', 'Close', 'Volume', 'Spread'. A missing
    Volume or Spread column is created and filled with 0.0.

    Returns:
        DataFrame indexed by 'Time' (parsed to datetime, sorted ascending) with
        columns ['Open', 'High', 'Low', 'Close', 'Volume', 'Spread'] converted
        to numeric; values that cannot be parsed become NaN.

    Raises:
        KeyError: If the time column or any of Open/High/Low/Close is not found.
    """
    source = _search_csv_path(cfg)

    # Let pandas sniff the delimiter first; fall back to tab-separated on any failure.
    try:
        frame = pd.read_csv(source, sep=None, engine="python")
    except Exception:
        frame = pd.read_csv(source, sep="\t", engine="python")

    frame.columns = [header.strip() for header in frame.columns]

    required = ("Open", "High", "Low", "Close")
    optional = ("Volume", "Spread")
    candidates_by_name = {
        "Time": cfg.time_candidates,
        "Open": cfg.open_candidates,
        "High": cfg.high_candidates,
        "Low": cfg.low_candidates,
        "Close": cfg.close_candidates,
        "Volume": cfg.volume_candidates,
        "Spread": cfg.spread_candidates,
    }
    # Standard name -> actual column name in the file (None when absent).
    found = {
        standard: _find_col_by_candidates(frame, names)
        for standard, names in candidates_by_name.items()
    }

    if found["Time"] is None:
        raise KeyError(f"Time column not found. Available: {frame.columns.tolist()}")
    missing = [standard for standard in required if found[standard] is None]
    if missing:
        raise KeyError(
            f"Missing required OHLC columns: {missing}. Available: {frame.columns.tolist()}"
        )

    rename_map = {found[standard]: standard for standard in ("Time", *required)}
    for standard in optional:
        if found[standard] is not None:
            rename_map[found[standard]] = standard
    frame = frame.rename(columns=rename_map)

    # Optional columns default to zeros so downstream code can rely on them.
    for standard in optional:
        if standard not in frame.columns:
            frame[standard] = 0.0

    frame["Time"] = pd.to_datetime(frame["Time"])

    numeric_cols = [*required, *optional]
    frame[numeric_cols] = frame[numeric_cols].apply(pd.to_numeric, errors="coerce")

    frame = frame.sort_values("Time").reset_index(drop=True).set_index("Time")
    return frame[numeric_cols]


# Load the raw price table once; later cells work on copies of `raw_df`.
raw_df = load_ohlcv(config)
raw_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Spread
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2009-11-26,1.51073,1.51381,1.49404,1.49682,70915,9
2009-11-27,1.49653,1.49985,1.48264,1.49853,76695,11
2009-11-30,1.49931,1.50825,1.49702,1.50343,79595,9
2009-12-01,1.50351,1.51194,1.49719,1.50885,79059,9
2009-12-02,1.50881,1.51096,1.5029,1.50635,80300,9


In [3]:
# ============================
# Call 3: ฟีเจอร์พื้นฐาน + ATR + volatility + session + candle
# ============================


def ema(series: pd.Series, span: int) -> pd.Series:
    """Exponential moving average of `series` for the given `span` (recursive form, ``adjust=False``)."""
    smoother = series.ewm(span=span, adjust=False)
    return smoother.mean()


def rsi(series: pd.Series, window: int = 14) -> pd.Series:
    """Relative Strength Index of `series`.

    Gains and losses are smoothed with an exponential mean using
    ``alpha = 1 / window``. The first value is NaN because the first
    difference is undefined.
    """

    def _smooth(values: pd.Series) -> pd.Series:
        return values.ewm(alpha=1 / window, adjust=False).mean()

    change = series.diff()
    mean_gain = _smooth(change.clip(lower=0))
    mean_loss = _smooth(-change.clip(upper=0))

    # A zero average loss is replaced by a tiny value to avoid dividing by zero.
    strength = mean_gain / mean_loss.replace(0, 1e-10)
    return 100 - (100 / (1 + strength))


def atr(
    high: pd.Series, low: pd.Series, close: pd.Series, window: int = 14
) -> pd.Series:
    """Average True Range as a simple rolling mean of the true range.

    The true range is the largest of: high - low, |high - previous close| and
    |low - previous close|. The mean uses ``min_periods=1``, so values are
    produced from the first row onwards.
    """
    previous_close = close.shift(1)
    components = [
        high - low,
        (high - previous_close).abs(),
        (low - previous_close).abs(),
    ]
    true_range = pd.concat(components, axis=1).max(axis=1)
    # Simple moving average; switch to an ewm if a Wilder-style ATR is wanted.
    averaged = true_range.rolling(window=window, min_periods=1)
    return averaged.mean()


def add_basic_features(df: pd.DataFrame, cfg: DatasetConfig) -> pd.DataFrame:
    """Append return, trend, volatility, candle-shape and calendar features.

    New columns are written onto `df` itself, which is also returned. `df`
    needs 'Open', 'High', 'Low', 'Close' and 'Spread' columns and a
    datetime-like index (its `.hour` and `.dayofweek` are read). `cfg` is
    accepted for a uniform signature but is not used here.
    """
    open_px, high_px, low_px, close_px = (
        df[name] for name in ("Open", "High", "Low", "Close")
    )

    # Percentage returns over 1, 4 and 12 bars.
    for lag in (1, 4, 12):
        df[f"ret_{lag}"] = close_px.pct_change(lag) * 100

    # Trend: exponential moving averages of the close.
    for span in (20, 50, 100):
        df[f"ema_{span}"] = ema(close_px, span)

    # Momentum.
    df["rsi_14"] = rsi(close_px, 14)

    # Volatility: ATR plus the rolling std of 1-bar returns (needs 20 full observations).
    df["atr_14"] = atr(high_px, low_px, close_px, window=14)
    df["vol_20"] = df["ret_1"].rolling(window=20, min_periods=20).std()

    # Candle anatomy: a bar with close >= open is treated as bullish, so its
    # upper wick is measured from the close and its lower wick from the open.
    body = close_px - open_px
    bullish = body >= 0
    df["candle_body"] = body
    df["candle_range"] = high_px - low_px
    df["upper_wick"] = np.where(bullish, high_px - close_px, high_px - open_px)
    df["lower_wick"] = np.where(bullish, open_px - low_px, close_px - low_px)

    # Calendar / session flags (on daily data the hour is constant and may be dropped later).
    df["hour"] = df.index.hour
    df["dayofweek"] = df.index.dayofweek

    hour_of_day = df["hour"]
    session_hours = {
        "session_asia": (0, 8),
        "session_london": (8, 16),
        "session_ny": (16, 24),
    }
    for column, (first_hour, end_hour) in session_hours.items():
        df[column] = ((hour_of_day >= first_hour) & (hour_of_day < end_hour)).astype(int)

    df["Spread"] = df["Spread"].astype(float)
    return df


# Work on a copy so `raw_df` stays untouched, then drop every row containing a
# NaN (e.g. the leading rows where the lagged / rolling features are undefined).
df_feat = add_basic_features(raw_df.copy(), config)
df_feat = df_feat.dropna().copy()
df_feat.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Spread,ret_1,ret_4,ret_12,ema_20,ema_50,ema_100,rsi_14,atr_14,vol_20,candle_body,candle_range,upper_wick,lower_wick,hour,dayofweek,session_asia,session_london,session_ny
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
2009-12-24,1.43331,1.4418,1.43278,1.43767,79350,9.0,0.30629,0.281103,-2.138739,1.458305,1.476832,1.485852,33.7811,0.013414,0.558658,0.00436,0.00902,0.00413,0.00053,0,3,1,0,0
2009-12-25,1.43761,1.44365,1.43558,1.43772,75425,9.0,0.003478,0.656706,-2.425583,1.456345,1.475299,1.484899,33.829254,0.012914,0.555922,0.00011,0.00807,0.00593,0.00203,0,4,1,0,0
2009-12-28,1.43726,1.44138,1.43505,1.43719,80135,11.0,-0.036864,0.842701,-2.414531,1.454521,1.473804,1.483954,33.550741,0.011971,0.543394,-7e-05,0.00633,0.00412,0.00214,0,0,1,0,0
2009-12-29,1.43723,1.44572,1.43311,1.43386,80669,11.0,-0.231702,0.040467,-1.865692,1.452553,1.472238,1.482962,31.780362,0.012092,0.525736,-0.00337,0.01261,0.00849,0.00075,0,1,1,0,0
2009-12-30,1.43397,1.43606,1.42694,1.43384,79328,11.0,-0.001395,-0.266403,-2.165695,1.450771,1.470732,1.481989,31.769519,0.012194,0.528453,-0.00013,0.00912,0.00209,0.0069,0,2,1,0,0


In [4]:
# ============================
# Call 4: สร้าง targets โครงสร้างสำหรับวันถัดไป (gap/range/body + wick)
# ============================


def add_struct_targets(df: pd.DataFrame, cfg: DatasetConfig) -> pd.DataFrame:
    """Append structural targets describing the bar `cfg.horizon` rows ahead.

    Adds, in this order: Open_next, High_next, Low_next, Close_next, gap_next,
    range_next, body_next, upper_wick_next, lower_wick_next. The columns are
    written onto `df` itself, which is also returned. The last `horizon` rows
    have no future bar and therefore hold NaN in the new columns.
    """
    prices = ["Open", "High", "Low", "Close"]
    ahead = df[prices].shift(-cfg.horizon)
    for price in prices:
        df[f"{price}_next"] = ahead[price]

    # Structural targets.
    df["gap_next"] = ahead["Open"] - df["Close"]  # current close -> future open
    df["range_next"] = ahead["High"] - ahead["Low"]
    df["body_next"] = ahead["Close"] - ahead["Open"]

    # Wicks of the future bar (learned by the DL model).
    future_body = ahead[["Open", "Close"]]
    df["upper_wick_next"] = ahead["High"] - future_body.max(axis=1)
    df["lower_wick_next"] = future_body.min(axis=1) - ahead["Low"]

    return df


# Add the targets on a copy of the feature table, then drop rows with NaN,
# which removes the trailing rows that have no future bar to look at.
df_all = add_struct_targets(df_feat.copy(), config)
df_all = df_all.dropna().copy()
df_all[
    ["gap_next", "range_next", "body_next", "upper_wick_next", "lower_wick_next"]
].head()

Unnamed: 0_level_0,gap_next,range_next,body_next,upper_wick_next,lower_wick_next
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-12-24,-6e-05,0.00807,0.00011,0.00593,0.00203
2009-12-25,-0.00046,0.00633,-7e-05,0.00412,0.00214
2009-12-28,4e-05,0.01261,-0.00337,0.00849,0.00075
2009-12-29,0.00011,0.00912,-0.00013,0.00209,0.0069
2009-12-30,-7e-05,0.01375,-0.00094,0.01023,0.00258


In [5]:
# ============================
# Call 5: สร้าง dataset
#   A) Boosting (tabular)
#   B) DL Residual (sequence)
# ============================

# Model input columns; this exact list (and order) is also written to the meta JSON.
feature_cols: List[str] = [
    "Open",
    "High",
    "Low",
    "Close",
    "Volume",
    "Spread",
    "ret_1",
    "ret_4",
    "ret_12",
    "ema_20",
    "ema_50",
    "ema_100",
    "rsi_14",
    "atr_14",
    "vol_20",
    "candle_body",
    "candle_range",
    "upper_wick",
    "lower_wick",
    "hour",
    "dayofweek",
    "session_asia",
    "session_london",
    "session_ny",
]

# ----- A) Boosting dataset -----
# One row per bar: features of bar t paired with the first three structural targets.
X_boost = df_all[feature_cols].to_numpy(dtype=np.float32)
y_boost = df_all[["gap_next", "range_next", "body_next"]].to_numpy(dtype=np.float32)
boost_index = df_all.index  # timestamp of the input bar t (targets describe bar t + horizon by column definition)

print("Boosting:", X_boost.shape, y_boost.shape)


# ----- B) DL sequence dataset -----
def build_sequence_dataset_residual(
    df: pd.DataFrame,
    cfg: DatasetConfig,
    feature_cols: List[str],
) -> Tuple[np.ndarray, np.ndarray, pd.DatetimeIndex]:
    """
    Build sliding-window sequences for the DL model.

    Returns:
        X_seq: float32 array [num_samples, seq_len, num_features].
        y_dl:  float32 array [num_samples, 5] holding
            [gap_next, range_next, body_next, upper_wick_next, lower_wick_next].
            These are the TRUE targets; the first three are meant to be turned
            into residuals after the boosting model has been trained.
        target_index: timestamp of the row `horizon` positions after the last
            bar of each window, i.e. the bar being predicted.

    Raises:
        ValueError: If `df` has too few rows for `seq_len` and `horizon`.
    """
    target_cols = [
        "gap_next",
        "range_next",
        "body_next",
        "upper_wick_next",
        "lower_wick_next",
    ]
    features = df[feature_cols].to_numpy(dtype=np.float32)
    targets = df[target_cols].to_numpy(dtype=np.float32)
    timestamps = df.index.to_numpy()

    window, ahead, total = cfg.seq_len, cfg.horizon, len(df)
    num_samples = total - window - ahead + 1
    if num_samples <= 0:
        raise ValueError("Not enough rows for given seq_len and horizon.")

    # Row position of the last bar (time t) of every window. The final `ahead`
    # rows are left out because the timestamp of their target bar is not in `df`.
    last_rows = np.arange(window - 1, total - ahead)
    assert len(last_rows) == num_samples

    # window_rows[s, k] is the row of the k-th bar of sample s (oldest first).
    window_rows = last_rows[:, None] + np.arange(-(window - 1), 1)[None, :]

    X_seq = features[window_rows]
    # The targets stored on row t already describe bar t + horizon.
    y_dl = targets[last_rows]
    target_index = pd.DatetimeIndex(list(timestamps[last_rows + ahead]))
    return X_seq, y_dl, target_index


# Build the sequence dataset from the same table used for boosting;
# `target_index` holds the timestamp of the predicted bar for each sample.
X_seq, y_dl_true, target_index = build_sequence_dataset_residual(
    df_all, config, feature_cols
)
print("DL seq:", X_seq.shape, y_dl_true.shape, target_index[:3])

Boosting: (4160, 24) (4160, 3)
DL seq: (4136, 24, 24) (4136, 5) DatetimeIndex(['2010-01-27', '2010-01-28', '2010-01-29'], dtype='datetime64[ns]', freq=None)


In [6]:
# ============================
# Call 6: time-based split (train / val / test) สำหรับทั้ง Boosting และ DL
# ============================


def time_based_split_array(arr: np.ndarray, cfg: DatasetConfig):
    """Split `arr` along its first axis into (train, val, test) without shuffling.

    The train and validation sizes are `len(arr)` times `cfg.train_ratio` and
    `cfg.val_ratio`, each truncated to an integer; the test part receives
    whatever is left.
    """
    total = len(arr)
    first_cut = int(total * cfg.train_ratio)
    second_cut = first_cut + int(total * cfg.val_ratio)
    parts = (
        slice(None, first_cut),
        slice(first_cut, second_cut),
        slice(second_cut, None),
    )
    return tuple(arr[part] for part in parts)


# ----- Boosting split -----
# Features, targets and timestamps go through the same helper so the three stay row-aligned.
Xb_train, Xb_val, Xb_test = time_based_split_array(X_boost, config)
yb_train, yb_val, yb_test = time_based_split_array(y_boost, config)
idxb_train, idxb_val, idxb_test = time_based_split_array(boost_index, config)

print("Boost Train:", Xb_train.shape, yb_train.shape)
print("Boost Val:  ", Xb_val.shape, yb_val.shape)
print("Boost Test: ", Xb_test.shape, yb_test.shape)

# ----- DL split -----
# NOTE: the DL arrays are split by fraction independently of the boosting
# arrays and have fewer rows, so the two sets of boundaries do not fall on the
# same dates. Also, `idxs_*` hold the timestamp of the predicted bar while
# `idxb_*` hold the timestamp of the input bar.
Xs_train, Xs_val, Xs_test = time_based_split_array(X_seq, config)
yd_train_true, yd_val_true, yd_test_true = time_based_split_array(y_dl_true, config)
idxs_train, idxs_val, idxs_test = time_based_split_array(target_index, config)

print("DL Train:", Xs_train.shape, yd_train_true.shape)
print("DL Val:  ", Xs_val.shape, yd_val_true.shape)
print("DL Test: ", Xs_test.shape, yd_test_true.shape)

# Diagnostic: a single distinct hour means pure daily data, so the
# hour/session features carry no information and can be considered for removal.
print("hour nunique:", df_all["hour"].nunique())

Boost Train: (2912, 24) (2912, 3)
Boost Val:   (624, 24) (624, 3)
Boost Test:  (624, 24) (624, 3)
DL Train: (2895, 24, 24) (2895, 5)
DL Val:   (620, 24, 24) (620, 5)
DL Test:  (621, 24, 24) (621, 5)
hour nunique: 1


In [7]:
# ============================
# Call 7: เซฟ dataset (.npz + meta.json) สำหรับ Boosting + DL Residual
# ============================

# Output folder, resolved relative to the current working directory.
save_dir = Path("../../prepared_datasets/boosting_dl_residual").resolve()
save_dir.mkdir(parents=True, exist_ok=True)


def _index_to_str_array(index: pd.Index) -> np.ndarray:
    """Render `index` as a fixed-width unicode array.

    Converting with ``dtype=str`` avoids an object-dtype array, which NumPy
    would pickle inside the archive and which ``np.load`` refuses to read
    unless ``allow_pickle=True`` is passed.
    """
    return np.asarray(index.astype(str), dtype=str)


npz_path = save_dir / "eurusd_struct_sequences.npz"
np.savez_compressed(
    npz_path,
    # ---- Boosting tabular ----
    Xb_train=Xb_train,
    Xb_val=Xb_val,
    Xb_test=Xb_test,
    yb_train=yb_train,
    yb_val=yb_val,
    yb_test=yb_test,
    idxb_train=_index_to_str_array(idxb_train),
    idxb_val=_index_to_str_array(idxb_val),
    idxb_test=_index_to_str_array(idxb_test),
    # ---- DL sequences (true targets; residuals are derived after the boosting predictions) ----
    Xs_train=Xs_train,
    Xs_val=Xs_val,
    Xs_test=Xs_test,
    yd_train_true=yd_train_true,
    yd_val_true=yd_val_true,
    yd_test_true=yd_test_true,
    idxs_train=_index_to_str_array(idxs_train),
    idxs_val=_index_to_str_array(idxs_val),
    idxs_test=_index_to_str_array(idxs_test),
)

print(f"✔ Saved NPZ dataset to: {npz_path}")

# Everything a consumer needs to interpret the arrays stored in the archive.
meta = {
    "csv_path": config.csv_path,
    "seq_len": config.seq_len,
    "horizon": config.horizon,
    "train_ratio": config.train_ratio,
    "val_ratio": config.val_ratio,
    "feature_cols": feature_cols,
    "columns_required": ["Time", "Open", "High", "Low", "Close", "Volume", "Spread"],
    "targets_boosting": ["gap_next", "range_next", "body_next"],
    "targets_dl_true": [
        "gap_next",
        "range_next",
        "body_next",
        "upper_wick_next",
        "lower_wick_next",
    ],
    "note": "DL targets are TRUE values; convert first 3 to residual after boosting predictions.",
}

meta_path = save_dir / "eurusd_struct_meta.json"
with open(meta_path, "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

print(f"✔ Saved meta JSON to: {meta_path}")

✔ Saved NPZ dataset to: /Users/thanaporn/Desktop/EURO_H1_AI/prepared_datasets/boosting_dl_residual/eurusd_struct_sequences.npz
✔ Saved meta JSON to: /Users/thanaporn/Desktop/EURO_H1_AI/prepared_datasets/boosting_dl_residual/eurusd_struct_meta.json
