# Feature + Target Pipeline

Transform hourly ETH/BTC OHLCV data into an ML-ready feature matrix with a directional label.

In [1]:
import sys
from pathlib import Path


def locate_project_root() -> Path:
    """Find the closest directory, at or above the working directory, that contains ``src``.

    The search starts at the resolved current working directory and then walks
    up through its parents; the first directory with an entry named ``src``
    is returned.

    Returns:
        The matching directory as an absolute ``Path``.

    Raises:
        RuntimeError: if neither the working directory nor any of its parents
            contains an entry named ``src``.
    """
    start = Path.cwd().resolve()
    search_order = [start, *start.parents]
    match = next(
        (folder for folder in search_order if (folder / "src").exists()),
        None,
    )
    if match is None:
        raise RuntimeError("Folder 'src' tidak ditemukan dari jalur kerja sekarang.")
    return match


# Resolve the project root once and put it at the front of sys.path so the
# `src` package can be imported regardless of where the notebook was started.
PROJECT_ROOT = locate_project_root()
_project_root_entry = str(PROJECT_ROOT)
if _project_root_entry not in sys.path:
    sys.path.insert(0, _project_root_entry)

import numpy as np
import pandas as pd

from src.features import (
    build_technical_features,
    export_feature_dataset,
    load_ethbtc_1h,
    make_forward_return,
    make_forward_return_sign,
)


## Load OHLCV data

In [2]:
# Load the hourly OHLCV frame through the project's loader from src.features.
# NOTE(review): the preview below shows close prices around 3,200, which looks
# like a USD-quoted price rather than an ETH/BTC ratio — confirm which pair the
# loader actually returns.
raw_df = load_ethbtc_1h()
# Show the most recent rows as a quick sanity check of the loaded data.
raw_df.tail()

Unnamed: 0_level_0,open,high,low,close,volume,EMA,Oversold HWO Up,Overbought HWO Down,HWO Up,HWO Down,ATR,MACD,Signal
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2025-11-13 22:00:00+00:00,3177.61,3234.68,3172.51,3227.38,445520000.0,3451.319686,5.498791,,,,54.971898,-72.490706,-45.211668
2025-11-13 23:00:00+00:00,3227.37,3271.09,3213.59,3229.61,597739400.0,3447.395621,,,,,55.152477,-71.985401,-50.566414
2025-11-14 00:00:00+00:00,3229.62,3238.9,3185.22,3199.21,795357100.0,3443.002955,,,,,55.0473,-73.194231,-55.091978
2025-11-14 01:00:00+00:00,3199.21,3220.19,3183.64,3205.8,517267400.0,3438.804673,,,,,53.726064,-72.7815,-58.629882
2025-11-14 02:00:00+00:00,3205.8,3249.92,3203.66,3236.59,437101800.0,3435.225652,,,,,53.192774,-69.172534,-60.738412


## Engineer technical features

In [3]:
# Derive the model inputs from the raw frame with the project's feature builder.
# In the displayed result the columns are returns, momentum, volatility and
# volume-change measures over several hour windows.
features = build_technical_features(raw_df)
# Show the most recent feature rows.
features.tail()

Unnamed: 0_level_0,ret_1h,ret_4h,ret_12h,ret_24h,momentum_6h,momentum_12h,momentum_24h,momentum_48h,volatility_6h,volatility_24h,volatility_72h,volume_change_6h,volume_change_24h
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2025-11-13 22:00:00+00:00,0.015544,0.004895,-0.082172,-0.058002,-0.02968,-0.078886,-0.056352,-0.058551,0.012964,0.009858,0.008033,-0.660328,1.000978
2025-11-13 23:00:00+00:00,0.000691,0.01059,-0.080404,-0.055306,-0.007358,-0.077257,-0.053805,-0.054622,0.01003,0.009877,0.008036,-0.613012,2.591584
2025-11-14 00:00:00+00:00,-0.009457,0.009393,-0.089859,-0.06404,-0.003864,-0.08594,-0.062033,-0.063584,0.009292,0.009978,0.008056,-0.447631,2.048906
2025-11-14 01:00:00+00:00,0.002058,0.008835,-0.069489,-0.065293,0.003195,-0.06713,-0.063207,-0.06958,0.00907,0.009948,0.008067,-0.464277,-0.218336
2025-11-14 02:00:00+00:00,0.009559,0.00285,-0.061498,-0.058704,0.021232,-0.059645,-0.057015,-0.05509,0.008518,0.010201,0.008068,-0.602337,0.696016


## Create 4h forward return sign target

In [4]:
# Prediction horizon in hours, shared by the label and the realised-return column.
HORIZON_HOURS = 4
target = make_forward_return_sign(raw_df, horizon=HORIZON_HOURS)
asset_returns = make_forward_return(
    raw_df, horizon=HORIZON_HOURS, return_type="simple"
).rename(f"asset_return_{HORIZON_HOURS}h")

# Index-align the features with both label series, then drop every row that
# still has a missing value in any column.
dataset = features.join([target, asset_returns]).dropna()
if dataset.empty:
    # Without this guard the fallback split below fails with an opaque
    # IndexError when it indexes into an empty index.
    raise ValueError(
        "Dataset kosong setelah fitur dan target digabung; tidak ada baris untuk dibagi."
    )

# Preferred train/test boundary. It is expressed in the index's own timezone
# (or made naive when the index carries no timezone) so the comparison below
# is between like-for-like timestamps.
default_cutoff = pd.Timestamp("2023-01-01 00:00:00+00:00")
if isinstance(dataset.index, pd.DatetimeIndex) and dataset.index.tz is not None:
    cutoff = default_cutoff.tz_convert(dataset.index.tz)
else:
    cutoff = default_cutoff.tz_localize(None)

# Fallback when no row lies after the preferred cutoff: split by position so
# that roughly the first 80% of rows become "train". The split below is
# inclusive (`<= cutoff`), so the cutoff must be the LAST train row; this also
# guarantees at least one test row whenever the dataset has two or more rows.
# NOTE(review): this assumes the index is sorted in ascending order — confirm.
# NOTE(review): the opposite case (no row at or before the cutoff, i.e. an
# empty train split) is not handled here.
if not (dataset.index > cutoff).any():
    fallback_train_rows = max(min(int(len(dataset) * 0.8), len(dataset) - 1), 1)
    fallback_idx = fallback_train_rows - 1
    cutoff = dataset.index[fallback_idx]

# NOTE(review): rows just before the cutoff carry labels that look
# HORIZON_HOURS ahead, so they presumably use prices from inside the test
# window — consider dropping the last HORIZON_HOURS train rows before fitting.
split_labels = np.where(dataset.index <= cutoff, "train", "test")
dataset = dataset.assign(split=split_labels)

# Record the cutoff in UTC when it is timezone-aware; keep it as-is otherwise.
cutoff_utc = cutoff.tz_convert("UTC") if getattr(cutoff, "tzinfo", None) else cutoff
dataset_metadata = pd.DataFrame(
    [
        ("horizon_hours", HORIZON_HOURS),
        ("target_column", target.name),
        ("asset_return_column", asset_returns.name),
        ("split_cutoff_utc", cutoff_utc),
        ("train_rows", int((dataset["split"] == "train").sum())),
        ("test_rows", int((dataset["split"] == "test").sum())),
    ],
    columns=["key", "value"],
).set_index("key")
dataset.tail()


Unnamed: 0_level_0,ret_1h,ret_4h,ret_12h,ret_24h,momentum_6h,momentum_12h,momentum_24h,momentum_48h,volatility_6h,volatility_24h,volatility_72h,volume_change_6h,volume_change_24h,target_sign_return_4h,asset_return_4h,split
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2025-11-13 18:00:00+00:00,-0.012971,-0.069242,-0.096905,-0.06354,-0.082394,-0.092358,-0.061564,-0.073615,0.008241,0.009077,0.007951,2.736664,2.106805,1,0.004907,test
2025-11-13 19:00:00+00:00,-0.005004,-0.057615,-0.101125,-0.063834,-0.070101,-0.09618,-0.06184,-0.075117,0.008718,0.00908,0.007896,-0.141439,0.40057,1,0.010646,test
2025-11-13 20:00:00+00:00,-0.008261,-0.04829,-0.105655,-0.074722,-0.079195,-0.100265,-0.071999,-0.077858,0.006229,0.009078,0.007911,-0.291347,2.525096,1,0.009437,test
2025-11-13 21:00:00+00:00,0.002615,-0.023621,-0.092342,-0.074055,-0.061301,-0.088207,-0.07138,-0.069789,0.008824,0.009095,0.007886,-0.724592,1.123931,1,0.008875,test
2025-11-13 22:00:00+00:00,0.015544,0.004895,-0.082172,-0.058002,-0.02968,-0.078886,-0.056352,-0.058551,0.012964,0.009858,0.008033,-0.660328,1.000978,1,0.002854,test


## Persist dataset for ML pipelines

In [5]:
# Make sure <project root>/data/processed exists, then hand the labelled
# dataset and a .parquet target path to the project's export helper.
output_dir = PROJECT_ROOT.joinpath("data", "processed")
output_dir.mkdir(parents=True, exist_ok=True)
parquet_path = output_dir.joinpath("ethbtc_1h_features_targets.parquet")
saved_path = export_feature_dataset(dataset, parquet_path)
# Display whatever the export helper returned.
saved_path


WindowsPath('C:/Users/jefri/backtest/data/processed/ethbtc_1h_features_targets.parquet')

In [6]:
from pathlib import Path
import importlib.util
import pandas as pd
from datetime import datetime, timezone
# Notebook convenience: let this cell run on its own by falling back to the
# working directory when PROJECT_ROOT has not been defined in this kernel.
if "PROJECT_ROOT" not in globals():  # pragma: no cover - notebook convenience
    PROJECT_ROOT = Path.cwd()



def export_tables_to_excel(tables, path: "Path | str") -> Path:
    """Write several tables into one Excel workbook, one sheet per table.

    Timezone-aware timestamps in the index, the column labels or the cells are
    converted to UTC and made timezone-naive before writing, because pandas
    refuses to write timezone-aware datetimes to Excel.

    Args:
        tables: Mapping of sheet name to table. A value may be a DataFrame, a
            Series (written as a one-column frame), a dict (written as a single
            row), anything else accepted by ``pd.DataFrame(...)``, or ``None``
            (skipped).
        path: Destination workbook; a ``Path`` or a plain string. Missing
            parent directories are created.

    Returns:
        The destination as a ``Path``.

    Raises:
        ValueError: if no table is left to export after skipping ``None``.
        ModuleNotFoundError: if neither ``openpyxl`` nor ``xlsxwriter`` can be
            found.
    """
    # Accept plain strings too; `.parent` below needs a Path object.
    path = Path(path)

    def strip_timezone_from_value(value):
        """Return a timezone-aware timestamp as naive UTC; anything else unchanged."""
        # NaT is checked first because it has no usable timezone methods.
        if value is pd.NaT:
            return value
        if isinstance(value, pd.Timestamp):
            if value.tz is not None:
                return value.tz_convert("UTC").tz_localize(None)
            return value
        if isinstance(value, datetime):
            if value.tzinfo is not None:
                return value.astimezone(timezone.utc).replace(tzinfo=None)
            return value
        return value

    def strip_timezone_from_axis(axis):
        """Return an index (row or column axis) with timezone information removed."""
        if isinstance(axis, pd.MultiIndex):
            # Clean each level independently and rebuild the MultiIndex.
            new_levels = [strip_timezone_from_axis(level) for level in axis.levels]
            return axis.set_levels(new_levels)
        if isinstance(axis, pd.DatetimeIndex) and axis.tz is not None:
            return axis.tz_convert("UTC").tz_localize(None)
        if getattr(axis, "dtype", None) == object:
            # Object indexes may hold individual timestamp objects.
            return pd.Index(
                [strip_timezone_from_value(val) for val in axis], name=axis.name
            )
        return axis

    def make_excel_safe(frame: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``frame`` whose axes and cells carry no timezone."""
        frame = frame.copy()
        frame.index = strip_timezone_from_axis(frame.index)
        frame.columns = strip_timezone_from_axis(frame.columns)
        for column in frame.columns:
            series = frame[column]
            if isinstance(series.dtype, pd.DatetimeTZDtype):
                frame[column] = series.dt.tz_convert("UTC").dt.tz_localize(None)
            elif series.dtype == object:
                frame[column] = series.map(strip_timezone_from_value)
        return frame

    # Turn every supported input into a DataFrame that is safe to write.
    serialisable = []
    for sheet_name, table in tables.items():
        if table is None:
            continue
        if isinstance(table, pd.Series):
            frame = table.to_frame()
        elif isinstance(table, pd.DataFrame):
            # No copy needed here: make_excel_safe works on its own copy.
            frame = table
        elif isinstance(table, dict):
            frame = pd.DataFrame([table])
        else:
            frame = pd.DataFrame(table)
        serialisable.append((sheet_name, make_excel_safe(frame)))

    if not serialisable:
        raise ValueError("Tidak ada tabel yang bisa diekspor.")

    path.parent.mkdir(parents=True, exist_ok=True)

    def pick_engine() -> str:
        """Return the first available Excel writer engine, preferring openpyxl."""
        for candidate in ("openpyxl", "xlsxwriter"):
            if importlib.util.find_spec(candidate):
                return candidate
        raise ModuleNotFoundError(
            "Untuk ekspor Excel diperlukan paket 'openpyxl' atau 'xlsxwriter'."
        )

    # Characters Excel does not allow in a worksheet name; each becomes "_".
    forbidden_chars = str.maketrans({char: "_" for char in "[]:*?/\\"})

    def normalise_sheet_name(name: str, existing) -> str:
        """Return a valid sheet name that is not yet in ``existing``.

        Forbidden characters are replaced, surrounding whitespace and
        apostrophes are removed, the name is limited to 31 characters, and a
        numeric suffix is added on a clash. ``existing`` holds casefolded
        names because Excel compares sheet names case-insensitively; the
        chosen name is added to it.
        """
        cleaned = (name or "").translate(forbidden_chars).strip().strip("'").strip()
        # Truncation may expose a trailing apostrophe again, so trim it once more.
        safe = (cleaned or "Sheet")[:31].rstrip("'") or "Sheet"
        counter = 1
        candidate = safe
        while candidate.casefold() in existing:
            suffix = f"_{counter}"
            # Leave room for the suffix inside the 31-character limit.
            trimmed = safe[: 31 - len(suffix)] or "Sheet"
            candidate = f"{trimmed}{suffix}"
            counter += 1
        existing.add(candidate.casefold())
        return candidate

    engine = pick_engine()
    used_names = set()
    with pd.ExcelWriter(path, engine=engine) as writer:
        for sheet_name, frame in serialisable:
            name = normalise_sheet_name(str(sheet_name), used_names)
            frame.to_excel(writer, sheet_name=name, index=True)

    print(
        f"Berhasil mengekspor {len(serialisable)} sheet ke {path} (engine: {engine})"
    )
    return path
# Destination workbook under <project root>/outputs/result-test.
export_dir = PROJECT_ROOT.joinpath("outputs", "result-test")
export_path = export_dir / "features_target_pipeline.xlsx"

# One sheet per table, written in this order.
tables_to_export = dict(
    features=features,
    target=target,
    dataset=dataset,
    metadata=dataset_metadata,
)
export_tables_to_excel(tables_to_export, export_path)


Berhasil mengekspor 4 sheet ke C:\Users\jefri\backtest\outputs\result-test\features_target_pipeline.xlsx (engine: openpyxl)


WindowsPath('C:/Users/jefri/backtest/outputs/result-test/features_target_pipeline.xlsx')