In [19]:
# ============================
# Call 1: ตั้งค่าเริ่มต้น + import library
# ============================

import os
import json
from dataclasses import dataclass
from typing import Tuple, Dict, List, Optional

import numpy as np
import pandas as pd
from pathlib import Path

pd.set_option("display.max_rows", 10)
pd.set_option("display.max_columns", 80)
pd.set_option("display.width", 140)


@dataclass
class DatasetConfig:
    """
    Main dataset configuration.

    - csv_path: path of the OHLCV CSV file
    - *_candidates: header spellings tried, in order, for each logical column
    - seq_len: number of bars in one sequence sample
    - horizon: how many bars ahead the targets look (default=1)
    - train_ratio / val_ratio: fractions used by the chronological split;
      whatever is left after train + val becomes the test part

    Raises:
        ValueError: when seq_len or horizon is below 1, or when the split
            ratios are outside their valid range.
    """

    csv_path: str = "data_csv/EURUSD_D1.csv"

    time_candidates: Tuple[str, ...] = ("Time", "time", "date", "datetime")
    open_candidates: Tuple[str, ...] = ("Open", "open", "open_price")
    high_candidates: Tuple[str, ...] = ("High", "high")
    low_candidates: Tuple[str, ...] = ("Low", "low")
    close_candidates: Tuple[str, ...] = ("Close", "close", "close_price")
    volume_candidates: Tuple[str, ...] = ("Volume", "volume", "vol", "tick_volume")
    spread_candidates: Tuple[str, ...] = ("Spread", "spread", "sprd")

    seq_len: int = 51
    horizon: int = 1

    train_ratio: float = 0.7
    val_ratio: float = 0.15

    def __post_init__(self) -> None:
        # Fail fast: a bad value here would otherwise only show up later as
        # empty splits or degenerate targets.
        if self.seq_len < 1:
            raise ValueError(f"seq_len must be >= 1, got {self.seq_len}")
        if self.horizon < 1:
            raise ValueError(f"horizon must be >= 1, got {self.horizon}")
        if not 0.0 < self.train_ratio <= 1.0:
            raise ValueError(f"train_ratio must be in (0, 1], got {self.train_ratio}")
        if self.val_ratio < 0.0:
            raise ValueError(f"val_ratio must be >= 0, got {self.val_ratio}")
        # Small tolerance so float pairs that should sum to 1 are accepted.
        if self.train_ratio + self.val_ratio > 1.0 + 1e-9:
            raise ValueError(
                "train_ratio + val_ratio must not exceed 1, got "
                f"{self.train_ratio} + {self.val_ratio}"
            )


# Build the default configuration; the bare name on the last line echoes its
# repr as the notebook cell output.
config = DatasetConfig()
config

DatasetConfig(csv_path='data_csv/EURUSD_D1.csv', time_candidates=('Time', 'time', 'date', 'datetime'), open_candidates=('Open', 'open', 'open_price'), high_candidates=('High', 'high'), low_candidates=('Low', 'low'), close_candidates=('Close', 'close', 'close_price'), volume_candidates=('Volume', 'volume', 'vol', 'tick_volume'), spread_candidates=('Spread', 'spread', 'sprd'), seq_len=51, horizon=1, train_ratio=0.7, val_ratio=0.15)

In [20]:
# ============================
# Call 2: โหลดข้อมูล EURUSD จาก CSV + เตรียม DataFrame ให้สะอาด
# ============================


def _search_csv_path(cfg: DatasetConfig, max_levels: int = 6) -> Path:
    """Locate the CSV by probing the working directory and its ancestors.

    ``cfg.csv_path`` is joined onto the current working directory, then onto
    each parent directory in turn, until an existing file is found or
    ``max_levels`` directories have been tried. An absolute ``cfg.csv_path``
    is unaffected by the join, so it is checked as-is.

    Args:
        cfg: configuration object providing ``csv_path``.
        max_levels: how many directories to probe, counting the working
            directory itself.

    Returns:
        The first existing candidate path, or ``Path(cfg.csv_path)`` unchanged
        when nothing was found (the caller's read then reports the failure).
    """
    wanted = Path(cfg.csv_path)
    here = Path.cwd()

    # The former extra probes (".", "..", cwd) were already covered by the
    # first two steps of this walk, so they could never match.
    for base in (here, *here.parents)[:max_levels]:
        probe = base / wanted
        if probe.exists():
            return probe

    return wanted


def _find_col_by_candidates(
    df: pd.DataFrame, candidates: Tuple[str, ...]
) -> Optional[str]:
    """Return the real column name matching the first candidate that exists.

    Comparison ignores case. Candidates are tried in the order given; the
    result is the column name exactly as it appears in ``df``, or ``None``
    when no candidate matches.
    """
    # Lower-cased name -> original name (a later duplicate overwrites an earlier one).
    by_lowercase = {}
    for column in df.columns:
        by_lowercase[column.lower()] = column

    return next(
        (
            by_lowercase[wanted.lower()]
            for wanted in candidates
            if wanted.lower() in by_lowercase
        ),
        None,
    )


def load_ohlcv(cfg: DatasetConfig) -> pd.DataFrame:
    """
    Read the OHLCV CSV and normalise it to the standard layout:
      index 'Time', columns ['Open','High','Low','Close','Volume','Spread']

    Volume / Spread are created as 0.0 when the file has no such column.
    Price and volume columns are converted with ``pd.to_numeric`` using
    ``errors="coerce"``, so unparsable cells become NaN. Rows are sorted by
    time.

    Raises:
        KeyError: when the time column or any of the OHLC columns is missing.
    """
    source = _search_csv_path(cfg)

    # First let pandas sniff the delimiter; if that fails, retry as tab-separated.
    try:
        table = pd.read_csv(source, sep=None, engine="python")
    except Exception:
        table = pd.read_csv(source, sep="\t", engine="python")

    table.columns = [header.strip() for header in table.columns]

    # Standard name -> accepted spellings; the order also fixes rename priority.
    spellings = {
        "Time": cfg.time_candidates,
        "Open": cfg.open_candidates,
        "High": cfg.high_candidates,
        "Low": cfg.low_candidates,
        "Close": cfg.close_candidates,
        "Volume": cfg.volume_candidates,
        "Spread": cfg.spread_candidates,
    }
    resolved = {
        standard: _find_col_by_candidates(table, names)
        for standard, names in spellings.items()
    }

    if resolved["Time"] is None:
        raise KeyError(f"Time column not found. Available: {table.columns.tolist()}")

    absent = [
        standard
        for standard in ("Open", "High", "Low", "Close")
        if resolved[standard] is None
    ]
    if absent:
        raise KeyError(
            f"Missing required OHLC columns: {absent}. Available: {table.columns.tolist()}"
        )

    # Volume / Spread are optional, so unresolved entries are simply skipped.
    rename_map = {
        found: standard for standard, found in resolved.items() if found is not None
    }
    table = table.rename(columns=rename_map)

    for optional in ("Volume", "Spread"):
        if optional not in table.columns:
            table[optional] = 0.0

    table["Time"] = pd.to_datetime(table["Time"])

    value_cols = ["Open", "High", "Low", "Close", "Volume", "Spread"]
    table[value_cols] = table[value_cols].apply(pd.to_numeric, errors="coerce")

    table = table.sort_values("Time").reset_index(drop=True).set_index("Time")

    return table[value_cols]


# Load the price history with the default config and preview the first rows.
raw_df = load_ohlcv(config)
raw_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Spread
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2009-11-26,1.51073,1.51381,1.49404,1.49682,70915,9
2009-11-27,1.49653,1.49985,1.48264,1.49853,76695,11
2009-11-30,1.49931,1.50825,1.49702,1.50343,79595,9
2009-12-01,1.50351,1.51194,1.49719,1.50885,79059,9
2009-12-02,1.50881,1.51096,1.5029,1.50635,80300,9


In [21]:
# ============================
# Call 3: build technical-indicator features (note: no time/session features are created in this cell)
# ============================


def add_all_indicators_with_time(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute the technical-indicator feature set on an OHLCV frame.

    Despite the name, no time/session columns are produced; the function only
    makes sure the frame is indexed by time and sorted before calculating.

    Args:
        df: frame with 'Open', 'High', 'Low', 'Close', 'Volume' and 'Spread'
            columns, indexed by a DatetimeIndex or carrying a 'Time' column.

    Returns:
        A new frame with the raw columns plus the indicator columns listed in
        ``cols_to_keep``. Rows where any of these is NaN or +/-inf are removed,
        which includes the warm-up rows of the rolling windows.
    """
    df = df.copy()

    # In case the index is not a DatetimeIndex yet, fall back to a "Time" column.
    if not isinstance(df.index, pd.DatetimeIndex):
        if "Time" in df.columns:
            df["Time"] = pd.to_datetime(df["Time"])
            df = df.set_index("Time")

    df = df.sort_index()

    # Raw series
    open_ = df["Open"].astype(float)
    high = df["High"].astype(float)
    low = df["Low"].astype(float)
    close = df["Close"].astype(float)
    volume = df["Volume"].astype(float)

    # ==========================================
    # 1. Momentum & Trend
    # ==========================================
    # RSI 14 (exponential smoothing with alpha = 1/14)
    delta = close.diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)
    avg_gain = gain.ewm(alpha=1 / 14, adjust=False).mean()
    avg_loss = loss.ewm(alpha=1 / 14, adjust=False).mean()
    # A tiny denominator instead of 0 keeps RSI at ~100 when there were no losses.
    rs = avg_gain / avg_loss.replace(0, 1e-10)
    df["rsi_14"] = 100 - (100 / (1 + rs))

    # MACD (12,26,9)
    ema12 = close.ewm(span=12, adjust=False).mean()
    ema26 = close.ewm(span=26, adjust=False).mean()
    df["macd_line"] = ema12 - ema26
    df["macd_signal"] = df["macd_line"].ewm(span=9, adjust=False).mean()
    df["macd_hist"] = df["macd_line"] - df["macd_signal"]

    # ROC 12
    df["roc_12"] = close.pct_change(periods=12) * 100

    # CCI 20
    tp = (high + low + close) / 3
    sma_tp = tp.rolling(20).mean()
    # raw=True hands each window over as a plain ndarray, which avoids
    # building a Series per window.
    mean_dev = tp.rolling(20).apply(
        lambda window: np.abs(window - window.mean()).mean(), raw=True
    )
    # Guard the denominator like the other ratios below: a flat window gives
    # NaN (row dropped at the end) instead of +/-inf.
    df["cci_20"] = (tp - sma_tp) / (0.015 * mean_dev.replace(0, np.nan))

    # OBV
    df["obv"] = (np.sign(close.diff()) * volume).fillna(0).cumsum()

    # ==========================================
    # 2. Volatility
    # ==========================================
    # ATR 14 (simple 14-bar mean of the true range)
    prev_close = close.shift(1)
    tr1 = high - low
    tr2 = (high - prev_close).abs()
    tr3 = (low - prev_close).abs()
    tr = pd.concat([tr1, tr2, tr3], axis=1).max(axis=1)
    df["atr_14"] = tr.rolling(14).mean()

    # Bollinger Bandwidth 20
    sma_20 = close.rolling(20).mean()
    std_20 = close.rolling(20).std()
    upper_bb = sma_20 + 2 * std_20
    lower_bb = sma_20 - 2 * std_20
    df["bb_width"] = ((upper_bb - lower_bb) / sma_20) * 100

    # ==========================================
    # 3. Oscillator
    # ==========================================
    # Stochastic 14
    stoch_low = low.rolling(14).min()
    stoch_high = high.rolling(14).max()
    stoch_range = (stoch_high - stoch_low).replace(0, np.nan)
    df["stoch_k"] = 100 * (close - stoch_low) / stoch_range
    df["stoch_d"] = df["stoch_k"].rolling(3).mean()

    # ==========================================
    # 4. Moving Average Analysis
    # ==========================================
    ema_50 = close.ewm(span=50, adjust=False).mean()
    df["dist_ema_50"] = ((close - ema_50) / ema_50) * 100
    df["slope_ema_50"] = ema_50.diff()

    # ==========================================
    # 5. Trend Strength (ADX 14)
    # ==========================================
    up_move = high.diff()
    down_move = -low.diff()

    plus_dm = np.where((up_move > down_move) & (up_move > 0), up_move, 0.0)
    minus_dm = np.where((down_move > up_move) & (down_move > 0), down_move, 0.0)

    plus_dm_series = pd.Series(plus_dm, index=df.index)
    minus_dm_series = pd.Series(minus_dm, index=df.index)

    # Rolling sums / means are used here rather than recursive smoothing.
    tr_smooth = tr.rolling(14).sum()
    plus_di = 100 * (plus_dm_series.rolling(14).sum() / tr_smooth)
    minus_di = 100 * (minus_dm_series.rolling(14).sum() / tr_smooth)

    dx = 100 * (plus_di - minus_di).abs() / (plus_di + minus_di).replace(0, np.nan)
    df["adx_14"] = dx.rolling(14).mean()

    # ==========================================
    # 6. Candlestick Features
    # ==========================================
    df["candle_body_size"] = (close - open_).abs()
    df["upper_wick"] = high - df[["Open", "Close"]].max(axis=1)
    df["lower_wick"] = df[["Open", "Close"]].min(axis=1) - low

    # Close location value; a zero-range bar gives 0/0 -> NaN -> 0.
    df["clv"] = ((close - low) - (high - close)) / (high - low)
    df["clv"] = df["clv"].fillna(0)

    # ==========================================
    # Final Selection
    # ==========================================
    cols_to_keep = [
        # Raw
        "Open",
        "High",
        "Low",
        "Close",
        "Volume",
        "Spread",
        # Indicators
        "rsi_14",
        "macd_line",
        "macd_signal",
        "macd_hist",
        "roc_12",
        "atr_14",
        "bb_width",
        "stoch_k",
        "stoch_d",
        "cci_20",
        "obv",
        "dist_ema_50",
        "slope_ema_50",
        "adx_14",
        "candle_body_size",
        "upper_wick",
        "lower_wick",
        "clv",
    ]

    # dropna() alone keeps +/-inf (e.g. roc_12 after a zero close), so infinite
    # values are turned into NaN first and removed together with the warm-up rows.
    features = df[cols_to_keep].replace([np.inf, -np.inf], np.nan)
    return features.dropna()


# Derive the indicator features from the raw prices and preview the first rows.
feature_df = add_all_indicators_with_time(raw_df)
feature_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Spread,rsi_14,macd_line,macd_signal,macd_hist,roc_12,atr_14,bb_width,stoch_k,stoch_d,cci_20,obv,dist_ema_50,slope_ema_50,adx_14,candle_body_size,upper_wick,lower_wick,clv
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
2010-01-01,1.43283,1.43356,1.43181,1.43335,76792,11,31.878833,-0.014209,-0.014727,0.000518,-1.297351,0.011359,4.982645,26.759928,24.303636,-62.060461,-637831.0,-2.349574,-0.001408,67.833384,0.00052,0.00021,0.00102,0.76
2010-01-04,1.43143,1.44556,1.42559,1.44244,80039,9,42.670806,-0.012884,-0.014358,0.001474,0.569628,0.011664,4.368248,55.807139,35.56662,-36.511571,-557792.0,-1.66357,-0.000996,63.414669,0.01101,0.00312,0.00584,0.687531
2010-01-05,1.44238,1.44834,1.43445,1.43634,79910,9,38.287277,-0.012186,-0.013924,0.001738,0.188332,0.012027,4.093083,47.038328,43.201798,-22.568894,-637702.0,-1.999514,-0.001196,59.018391,0.00604,0.00596,0.00189,-0.727862
2010-01-06,1.43638,1.44342,1.42807,1.44005,80966,9,42.177871,-0.011204,-0.01338,0.002176,0.819833,0.011501,3.581171,69.124767,57.323411,-31.80108,-556736.0,-1.679047,-0.001004,56.022409,0.00367,0.00337,0.00831,0.560912
2010-01-07,1.44008,1.44432,1.42976,1.43155,80828,9,36.500246,-0.010986,-0.012901,0.001916,0.446961,0.011454,2.978735,37.467412,51.210169,-41.819399,-637564.0,-2.172715,-0.001298,52.565589,0.00853,0.00424,0.00179,-0.754121


In [22]:
# ============================
# Call 3.1: สร้าง Target การ Breakout ของ High/Low
# ============================


def add_range_breakout_targets(df: pd.DataFrame, cfg: DatasetConfig) -> pd.DataFrame:
    """
    Attach High/Low breakout targets to a copy of the frame.

    Target A (col: tgt_high_break):
        1 when the High at t + horizon is above the High at t, else 0

    Target B (col: tgt_low_break):
        1 when the Low at t + horizon is below the Low at t, else 0

    The helper columns 'future_high' / 'future_low' are kept in the result.
    The trailing rows that have no bar ``cfg.horizon`` steps ahead are removed.
    """
    out = df.copy()

    steps_ahead = cfg.horizon

    # A negative shift pulls the value of the later bar onto the current row.
    out["future_high"] = out["High"].shift(-steps_ahead)
    out["future_low"] = out["Low"].shift(-steps_ahead)

    # Comparisons against NaN are False, so the tail rows get 0 here and are
    # removed right below.
    out["tgt_high_break"] = out["future_high"].gt(out["High"]).astype(int)
    out["tgt_low_break"] = out["future_low"].lt(out["Low"]).astype(int)

    return out.dropna(subset=["future_high", "future_low"])


# Add Target A / B (High / Low breakout) used to train the breakout models,
# then preview the first rows.
dataset_df = add_range_breakout_targets(feature_df, config)
dataset_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Spread,rsi_14,macd_line,macd_signal,macd_hist,roc_12,atr_14,bb_width,stoch_k,stoch_d,cci_20,obv,dist_ema_50,slope_ema_50,adx_14,candle_body_size,upper_wick,lower_wick,clv,future_high,future_low,tgt_high_break,tgt_low_break
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
2010-01-01,1.43283,1.43356,1.43181,1.43335,76792,11,31.878833,-0.014209,-0.014727,0.000518,-1.297351,0.011359,4.982645,26.759928,24.303636,-62.060461,-637831.0,-2.349574,-0.001408,67.833384,0.00052,0.00021,0.00102,0.76,1.44556,1.42559,1,1
2010-01-04,1.43143,1.44556,1.42559,1.44244,80039,9,42.670806,-0.012884,-0.014358,0.001474,0.569628,0.011664,4.368248,55.807139,35.56662,-36.511571,-557792.0,-1.66357,-0.000996,63.414669,0.01101,0.00312,0.00584,0.687531,1.44834,1.43445,1,0
2010-01-05,1.44238,1.44834,1.43445,1.43634,79910,9,38.287277,-0.012186,-0.013924,0.001738,0.188332,0.012027,4.093083,47.038328,43.201798,-22.568894,-637702.0,-1.999514,-0.001196,59.018391,0.00604,0.00596,0.00189,-0.727862,1.44342,1.42807,0,1
2010-01-06,1.43638,1.44342,1.42807,1.44005,80966,9,42.177871,-0.011204,-0.01338,0.002176,0.819833,0.011501,3.581171,69.124767,57.323411,-31.80108,-556736.0,-1.679047,-0.001004,56.022409,0.00367,0.00337,0.00831,0.560912,1.44432,1.42976,1,0
2010-01-07,1.44008,1.44432,1.42976,1.43155,80828,9,36.500246,-0.010986,-0.012901,0.001916,0.446961,0.011454,2.978735,37.467412,51.210169,-41.819399,-637564.0,-2.172715,-0.001298,52.565589,0.00853,0.00424,0.00179,-0.754121,1.44382,1.42616,0,1


In [23]:
# ============================
# Call 3.2: สร้าง Target ทำนายความผันผวน (Volatility Class)
# ============================


def add_volatility_class_targets(
    df: pd.DataFrame,
    cfg: DatasetConfig,
    high_vol_quantile: float = 0.7,
) -> pd.DataFrame:
    """
    Build a binary volatility target (big move / no big move).

    Class 0: low volatility  - the bar ``cfg.horizon`` steps ahead scores below the cut
    Class 1: high volatility - it scores at or above the cut

    The score is the average of the percentile ranks of the future 'atr_14'
    and the future 'bb_width'; the cut is the ``high_vol_quantile`` quantile
    of that score.

    NOTE(review): ranks and the quantile are computed over every row passed
    in, so the threshold also reflects rows that later land in the
    validation/test parts - confirm this is intended.

    Raises:
        KeyError: when 'atr_14' or 'bb_width' is missing from ``df``.
    """
    # Both indicator columns must already exist.
    if not {"atr_14", "bb_width"}.issubset(df.columns):
        raise KeyError("ต้องมีคอลัมน์ 'atr_14' และ 'bb_width' ก่อน (จาก Call 3)")

    out = df.copy()
    steps_ahead = cfg.horizon

    # Volatility readings of the future bar, pulled back onto the current row.
    out["future_atr_14"] = out["atr_14"].shift(-steps_ahead)
    out["future_bb_width"] = out["bb_width"].shift(-steps_ahead)

    # Percentile rank of each reading, averaged into one score. The explicit
    # sum keeps the score NaN whenever either rank is NaN.
    pct_ranks = out[["future_atr_14", "future_bb_width"]].rank(pct=True)
    out["vol_score"] = (pct_ranks["future_atr_14"] + pct_ranks["future_bb_width"]) / 2.0

    cutoff = out["vol_score"].quantile(high_vol_quantile)

    # 1 = high volatility, 0 = low volatility
    out["tgt_vol_high"] = out["vol_score"].ge(cutoff).astype(int)

    # Remove the trailing rows that have no future reading.
    return out.dropna(subset=["future_atr_14", "future_bb_width", "vol_score"])

# Add the volatility-class target on top of the breakout targets and preview it.
vol_dataset_df = add_volatility_class_targets(dataset_df, config)
vol_dataset_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Spread,rsi_14,macd_line,macd_signal,macd_hist,roc_12,atr_14,bb_width,stoch_k,stoch_d,cci_20,obv,dist_ema_50,slope_ema_50,adx_14,candle_body_size,upper_wick,lower_wick,clv,future_high,future_low,tgt_high_break,tgt_low_break,future_atr_14,future_bb_width,vol_score,tgt_vol_high
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
2010-01-01,1.43283,1.43356,1.43181,1.43335,76792,11,31.878833,-0.014209,-0.014727,0.000518,-1.297351,0.011359,4.982645,26.759928,24.303636,-62.060461,-637831.0,-2.349574,-0.001408,67.833384,0.00052,0.00021,0.00102,0.76,1.44556,1.42559,1,1,0.011664,4.368248,0.78122,1
2010-01-04,1.43143,1.44556,1.42559,1.44244,80039,9,42.670806,-0.012884,-0.014358,0.001474,0.569628,0.011664,4.368248,55.807139,35.56662,-36.511571,-557792.0,-1.66357,-0.000996,63.414669,0.01101,0.00312,0.00584,0.687531,1.44834,1.43445,1,0,0.012027,4.093083,0.76937,1
2010-01-05,1.44238,1.44834,1.43445,1.43634,79910,9,38.287277,-0.012186,-0.013924,0.001738,0.188332,0.012027,4.093083,47.038328,43.201798,-22.568894,-637702.0,-1.999514,-0.001196,59.018391,0.00604,0.00596,0.00189,-0.727862,1.44342,1.42807,0,1,0.011501,3.581171,0.708373,1
2010-01-06,1.43638,1.44342,1.42807,1.44005,80966,9,42.177871,-0.011204,-0.01338,0.002176,0.819833,0.011501,3.581171,69.124767,57.323411,-31.80108,-556736.0,-1.679047,-0.001004,56.022409,0.00367,0.00337,0.00831,0.560912,1.44432,1.42976,1,0,0.011454,2.978735,0.636429,0
2010-01-07,1.44008,1.44432,1.42976,1.43155,80828,9,36.500246,-0.010986,-0.012901,0.001916,0.446961,0.011454,2.978735,37.467412,51.210169,-41.819399,-637564.0,-2.172715,-0.001298,52.565589,0.00853,0.00424,0.00179,-0.754121,1.44382,1.42616,0,1,0.011936,2.618863,0.604909,0


In [24]:
# ============================
# Call 3.3: สร้าง Target ทำนายรูปทรงแท่งเทียนถัดไป
# ============================


def add_next_candle_pattern_targets(
    df: pd.DataFrame,
    cfg: DatasetConfig,
    body_ratio_long: float = 0.5,
    body_ratio_doji: float = 0.25,
) -> pd.DataFrame:
    """
    Label each row with the shape of the candle ``cfg.horizon`` bars ahead.

    Class:
      0 = Doji / Spinning Top (small body)
      1 = Bullish Long Candle
      2 = Bearish Long Candle

    Rule, with body = |Close - Open|, range = High - Low,
    body_ratio = body / range of the future candle:

      - body_ratio >= body_ratio_long and close > open  -> Bullish Long
      - body_ratio >= body_ratio_long and close < open  -> Bearish Long
      - body_ratio <= body_ratio_doji                   -> Doji / Spinning Top
        (this rule wins if it overlaps with the two above)
      - anything else stays unclassified and the row is dropped

    Adds 'future_open', 'future_high', 'future_low', 'future_close',
    'tgt_candle_class' (int) and 'tgt_candle_label' (string name).

    NOTE(review): dropping unclassified rows leaves gaps in the time index of
    the returned frame - confirm that consumers relying on consecutive bars
    account for this.
    """
    out = df.copy()

    steps_ahead = cfg.horizon

    # Prices of the future candle, pulled back onto the current row.
    for source_col, future_col in (
        ("Open", "future_open"),
        ("High", "future_high"),
        ("Low", "future_low"),
        ("Close", "future_close"),
    ):
        out[future_col] = out[source_col].shift(-steps_ahead)

    nxt_open = out["future_open"]
    nxt_close = out["future_close"]

    # A zero range is turned into NaN so the ratio is NaN instead of a division by zero.
    nxt_range = (out["future_high"] - out["future_low"]).replace(0, np.nan)
    ratio = (nxt_close - nxt_open).abs() / nxt_range

    has_long_body = ratio >= body_ratio_long
    has_small_body = ratio <= body_ratio_doji

    # np.select takes the first matching condition, so the small-body rule is
    # listed first to keep its priority. Rows matching nothing stay NaN.
    codes = np.select(
        [
            has_small_body.to_numpy(),
            (has_long_body & (nxt_close > nxt_open)).to_numpy(),
            (has_long_body & (nxt_close < nxt_open)).to_numpy(),
        ],
        [0, 1, 2],
        default=np.nan,
    )
    out["tgt_candle_class"] = pd.Series(codes, index=out.index, dtype="float")

    # String label, handy for debugging / visualisation.
    label_map = {
        0: "doji_spinning",
        1: "bull_long",
        2: "bear_long",
    }
    out["tgt_candle_label"] = out["tgt_candle_class"].map(label_map)

    # Remove rows without a future candle and rows that matched no class.
    kept = out.dropna(
        subset=[
            "future_open",
            "future_high",
            "future_low",
            "future_close",
            "tgt_candle_class",
        ]
    )

    # Store the class as an integer.
    return kept.assign(tgt_candle_class=kept["tgt_candle_class"].astype(int))


# Add the next-candle pattern target; rows whose future candle matches no class
# are dropped by the function. Preview the first rows.
candle_dataset_df = add_next_candle_pattern_targets(vol_dataset_df, config)
candle_dataset_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Spread,rsi_14,macd_line,macd_signal,macd_hist,roc_12,atr_14,bb_width,stoch_k,stoch_d,cci_20,obv,dist_ema_50,slope_ema_50,adx_14,candle_body_size,upper_wick,lower_wick,clv,future_high,future_low,tgt_high_break,tgt_low_break,future_atr_14,future_bb_width,vol_score,tgt_vol_high,future_open,future_close,tgt_candle_class,tgt_candle_label
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
2010-01-01,1.43283,1.43356,1.43181,1.43335,76792,11,31.878833,-0.014209,-0.014727,0.000518,-1.297351,0.011359,4.982645,26.759928,24.303636,-62.060461,-637831.0,-2.349574,-0.001408,67.833384,0.00052,0.00021,0.00102,0.76,1.44556,1.42559,1,1,0.011664,4.368248,0.78122,1,1.43143,1.44244,1,bull_long
2010-01-05,1.44238,1.44834,1.43445,1.43634,79910,9,38.287277,-0.012186,-0.013924,0.001738,0.188332,0.012027,4.093083,47.038328,43.201798,-22.568894,-637702.0,-1.999514,-0.001196,59.018391,0.00604,0.00596,0.00189,-0.727862,1.44342,1.42807,0,1,0.011501,3.581171,0.708373,1,1.43638,1.44005,0,doji_spinning
2010-01-06,1.43638,1.44342,1.42807,1.44005,80966,9,42.177871,-0.011204,-0.01338,0.002176,0.819833,0.011501,3.581171,69.124767,57.323411,-31.80108,-556736.0,-1.679047,-0.001004,56.022409,0.00367,0.00337,0.00831,0.560912,1.44432,1.42976,1,0,0.011454,2.978735,0.636429,0,1.44008,1.43155,2,bear_long
2010-01-07,1.44008,1.44432,1.42976,1.43155,80828,9,36.500246,-0.010986,-0.012901,0.001916,0.446961,0.011454,2.978735,37.467412,51.210169,-41.819399,-637564.0,-2.172715,-0.001298,52.565589,0.00853,0.00424,0.00179,-0.754121,1.44382,1.42616,0,1,0.011936,2.618863,0.604909,0,1.43164,1.44083,1,bull_long
2010-01-13,1.44751,1.45797,1.4455,1.45215,80530,9,53.515422,-0.005681,-0.009624,0.003942,1.04092,0.012129,1.923839,82.025942,80.261721,213.852184,-480196.0,-0.610172,-0.000364,39.06069,0.00464,0.00582,0.00201,0.06656,1.45546,1.4445,0,1,0.012336,2.066423,0.532844,0,1.45209,1.44982,0,doji_spinning


In [25]:
# ============================
# Call 4: สร้าง dataset
#   A) Boosting (tabular)
#   B) DL (sequence multi-task)
# ============================

# Features come from current-bar fields only (no future_* / tgt_* / label columns).
# The order of this list is the feature order of the arrays built below.
feature_cols: List[str] = [
    "Open",
    "High",
    "Low",
    "Close",
    "Volume",
    "Spread",
    "rsi_14",
    "macd_line",
    "macd_signal",
    "macd_hist",
    "roc_12",
    "atr_14",
    "bb_width",
    "stoch_k",
    "stoch_d",
    "cci_20",
    "obv",
    "dist_ema_50",
    "slope_ema_50",
    "adx_14",
    "candle_body_size",
    "upper_wick",
    "lower_wick",
    "clv",
]

# The order of this list is the column order of the target arrays.
target_cols: List[str] = [
    "tgt_high_break",  # break high
    "tgt_low_break",  # break low
    "tgt_vol_high",  # high volatility
    "tgt_candle_class",  # candle pattern class
]

# ----- A) Boosting dataset (tabular) -----
# One row per bar: features of that bar, targets describing the future bar.
df_boost = candle_dataset_df.dropna(subset=target_cols).copy()

X_boost = df_boost[feature_cols].to_numpy(dtype=np.float32)
y_boost = df_boost[target_cols].to_numpy(dtype=np.int64)
boost_index = df_boost.index  # time of the input bar (the targets describe the bar ahead)

print("Boosting:", X_boost.shape, y_boost.shape)


# ----- B) DL sequence dataset (multi-task classification) -----
def build_sequence_dataset_multitask(
    df: pd.DataFrame,
    cfg: DatasetConfig,
    feature_cols: List[str],
    target_cols: List[str],
) -> Tuple[np.ndarray, np.ndarray, pd.DatetimeIndex]:
    """
    Turn a feature/target frame into sliding-window samples.

    X_seq: [num_samples, seq_len, num_features] (float32)
    y_seq: [num_samples, num_targets] (int64), columns in ``target_cols`` order

    Each sample is the ``cfg.seq_len`` rows ending at row t; its targets are
    the target values stored on row t. ``target_index`` holds the index value
    of the row ``cfg.horizon`` positions after t.

    NOTE(review): windows and ``target_index`` are taken by row position, so
    they only correspond to consecutive bars if ``df`` has no missing rows -
    confirm against the frame passed in.

    Raises:
        ValueError: when there are too few rows for one sample.
    """
    clean = df.dropna(subset=target_cols).copy()

    feat_matrix = clean[feature_cols].to_numpy(dtype=np.float32)
    label_matrix = clean[target_cols].to_numpy(dtype=np.int64)
    stamps = clean.index.to_numpy()

    window = cfg.seq_len
    ahead = cfg.horizon

    # The last `ahead` rows cannot end a window: no row exists to time-stamp their target.
    num_samples = len(clean) - window - ahead + 1
    if num_samples <= 0:
        raise ValueError("Not enough rows for given seq_len and horizon.")

    # Row position of the last bar (time t) of every window.
    last_rows = np.arange(window - 1, window - 1 + num_samples)

    # Row positions covered by each window: shape [num_samples, window].
    gather = last_rows[:, None] + np.arange(-(window - 1), 1)[None, :]

    # Integer-array indexing returns fresh arrays, not views of the inputs.
    X_seq = feat_matrix[gather]
    y_seq = label_matrix[last_rows]
    target_index = pd.DatetimeIndex(stamps[last_rows + ahead])

    return X_seq, y_seq, target_index


# Build the sequence samples from the fully labelled frame and print their
# shapes together with the first three target timestamps.
X_seq, y_seq, target_index = build_sequence_dataset_multitask(
    candle_dataset_df, config, feature_cols, target_cols
)
print("DL seq:", X_seq.shape, y_seq.shape, target_index[:3])

Boosting: (2960, 24) (2960, 4)
DL seq: (2909, 51, 24) (2909, 4) DatetimeIndex(['2010-04-16', '2010-04-19', '2010-04-21'], dtype='datetime64[ns]', freq=None)


In [26]:
# ============================
# Call 5: time-based split (train / val / test) สำหรับทั้ง Boosting และ DL
# ============================


def time_based_split_array(arr: np.ndarray, cfg: DatasetConfig):
    """Cut ``arr`` into consecutive train / val / test parts, in order.

    The train and val lengths are ``len(arr)`` times ``cfg.train_ratio`` and
    ``cfg.val_ratio``, each truncated to an integer; the test part is
    whatever remains. No shuffling is done.
    """
    total = len(arr)
    n_train = int(total * cfg.train_ratio)
    n_val = int(total * cfg.val_ratio)
    val_stop = n_train + n_val

    train_part = arr[:n_train]
    val_part = arr[n_train:val_stop]
    test_part = arr[val_stop:]
    return train_part, val_part, test_part


# ----- Boosting split -----
# Features, targets and timestamps have the same length, so the same helper
# cuts all three at identical positions.
Xb_train, Xb_val, Xb_test = time_based_split_array(X_boost, config)
yb_train, yb_val, yb_test = time_based_split_array(y_boost, config)
idxb_train, idxb_val, idxb_test = time_based_split_array(boost_index, config)

for tag, X_part, y_part in (
    ("Boost Train:", Xb_train, yb_train),
    ("Boost Val:  ", Xb_val, yb_val),
    ("Boost Test: ", Xb_test, yb_test),
):
    print(tag, X_part.shape, y_part.shape)

# ----- DL split (X_seq, y_seq, target_index come from Call 4) -----
Xs_train, Xs_val, Xs_test = time_based_split_array(X_seq, config)
yd_train, yd_val, yd_test = time_based_split_array(y_seq, config)
idxs_train, idxs_val, idxs_test = time_based_split_array(target_index, config)

for tag, X_part, y_part in (
    ("DL Train:", Xs_train, yd_train),
    ("DL Val:  ", Xs_val, yd_val),
    ("DL Test: ", Xs_test, yd_test),
):
    print(tag, X_part.shape, y_part.shape)

Boost Train: (2072, 24) (2072, 4)
Boost Val:   (444, 24) (444, 4)
Boost Test:  (444, 24) (444, 4)
DL Train: (2036, 51, 24) (2036, 4)
DL Val:   (436, 51, 24) (436, 4)
DL Test:  (437, 51, 24) (437, 4)


In [27]:
# ============================
# Call 6: เซฟ dataset (.npz + meta.json + ตรวจสอบเป็น .csv)
# ============================

# Output folder, resolved against the current working directory.
save_dir = Path("../../prepared_datasets/boosting_dl_multitask").resolve()
save_dir.mkdir(parents=True, exist_ok=True)

# Timestamps are stored as strings so the archive holds only plain arrays.
npz_path = save_dir / "eurusd_multitask_sequences.npz"
np.savez_compressed(
    npz_path,
    # ---- Boosting tabular ----
    Xb_train=Xb_train,
    Xb_val=Xb_val,
    Xb_test=Xb_test,
    yb_train=yb_train,
    yb_val=yb_val,
    yb_test=yb_test,
    idxb_train=np.array(idxb_train.astype(str)),
    idxb_val=np.array(idxb_val.astype(str)),
    idxb_test=np.array(idxb_test.astype(str)),
    # ---- DL sequences (multi-task targets) ----
    Xs_train=Xs_train,
    Xs_val=Xs_val,
    Xs_test=Xs_test,
    yd_train=yd_train,
    yd_val=yd_val,
    yd_test=yd_test,
    idxs_train=np.array(idxs_train.astype(str)),
    idxs_val=np.array(idxs_val.astype(str)),
    idxs_test=np.array(idxs_test.astype(str)),
)

print(f"✔ Saved NPZ dataset to: {npz_path}")

# Both target lists are taken from target_cols, the list that fixed the column
# order of y_boost / y_seq, so the metadata cannot drift from the saved arrays.
meta = {
    "csv_path": config.csv_path,
    "seq_len": config.seq_len,
    "horizon": config.horizon,
    "train_ratio": config.train_ratio,
    "val_ratio": config.val_ratio,
    "feature_cols": feature_cols,
    "columns_required": ["Time", "Open", "High", "Low", "Close", "Volume", "Spread"],
    "targets_boosting": list(target_cols),
    "targets_dl": list(target_cols),
    "note": "Multi-task classification: breakout, volatility, next-candle pattern.",
}

meta_path = save_dir / "eurusd_multitask_meta.json"
with open(meta_path, "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

print(f"✔ Saved meta JSON to: {meta_path}")

# ---------------- CSV files for manual inspection ----------------

# 1) The complete labelled frame
csv_all_path = save_dir / "eurusd_df_all_full.csv"
candle_dataset_df.to_csv(csv_all_path, index=True)
print(f"✔ Saved full df_all CSV to: {csv_all_path}")

# 2) Boosting train/val/test rows selected by index (features + targets)
df_boost_train = candle_dataset_df.loc[idxb_train].copy()
df_boost_val = candle_dataset_df.loc[idxb_val].copy()
df_boost_test = candle_dataset_df.loc[idxb_test].copy()

df_boost_train.to_csv(save_dir / "eurusd_boost_train.csv")
df_boost_val.to_csv(save_dir / "eurusd_boost_val.csv")
df_boost_test.to_csv(save_dir / "eurusd_boost_test.csv")

print("✔ Saved boosting CSV splits (train/val/test)")

# 3) DL train/val/test rows selected by target_index, i.e. the rows located at
#    each sample's target timestamp (not the rows of its input window)
df_seq_train = candle_dataset_df.loc[idxs_train].copy()
df_seq_val = candle_dataset_df.loc[idxs_val].copy()
df_seq_test = candle_dataset_df.loc[idxs_test].copy()

df_seq_train.to_csv(save_dir / "eurusd_seq_train.csv")
df_seq_val.to_csv(save_dir / "eurusd_seq_val.csv")
df_seq_test.to_csv(save_dir / "eurusd_seq_test.csv")

print("✔ Saved sequence CSV splits (train/val/test)")

✔ Saved NPZ dataset to: /Users/thanaporn/Desktop/EURO_H1_AI/prepared_datasets/boosting_dl_multitask/eurusd_multitask_sequences.npz
✔ Saved meta JSON to: /Users/thanaporn/Desktop/EURO_H1_AI/prepared_datasets/boosting_dl_multitask/eurusd_multitask_meta.json
✔ Saved full df_all CSV to: /Users/thanaporn/Desktop/EURO_H1_AI/prepared_datasets/boosting_dl_multitask/eurusd_df_all_full.csv
✔ Saved boosting CSV splits (train/val/test)
✔ Saved sequence CSV splits (train/val/test)
