# ML Baseline Models
Notebook ini membangun baseline machine learning berbasis fitur OHLCV harian ETHUSDT.
Dataset processed akan disimpan ulang agar dapat dipakai ulang oleh pipeline berikutnya,
kemudian model linear (Lasso/ElasticNet) serta LightGBM dievaluasi menggunakan skema TimeSeriesSplit
dan window out-of-sample tahun 2023.

In [1]:
from __future__ import annotations
from pathlib import Path
import sys
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)
import lightgbm as lgb
import joblib
from IPython.display import display


def locate_project_root() -> Path:
    """Cari folder proyek yang menyimpan data dan notebook."""
    current = Path.cwd().resolve()
    for candidate in (current, *current.parents):
        if (candidate / 'data').exists() and (candidate / 'notebooks').exists():
            return candidate
    return current


PROJECT_ROOT = locate_project_root()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

DATA_PATH = PROJECT_ROOT / 'data' / 'BINANCE_ETHUSDT.P, 60.csv'
PROCESSED_PATH = PROJECT_ROOT / 'data' / 'processed' / 'ethusdt_hourly_features.csv'
MODEL_DIR = PROJECT_ROOT / 'outputs' / 'models'
PREDICTION_DIR = PROJECT_ROOT / 'outputs' / 'predictions'

from src.performance.metrics import DEFAULT_BARS_PER_YEAR, summarise_fold_performance

BARS_PER_DAY = 24.0
BARS_PER_YEAR = float(BARS_PER_DAY * 365.0)
HORIZON_HOURS = 5
RETURN_TYPE = 'simple'
WALKFORWARD_SPLITS = 5

PROCESSED_PATH.parent.mkdir(parents=True, exist_ok=True)
MODEL_DIR.mkdir(parents=True, exist_ok=True)
PREDICTION_DIR.mkdir(parents=True, exist_ok=True)


def root_mean_squared_error(y_true: pd.Series, y_pred: pd.Series) -> float:
    """Hitung RMSE yang kompatibel dengan berbagai versi scikit-learn."""
    try:
        return float(mean_squared_error(y_true, y_pred, squared=False))
    except TypeError:
        return float(mean_squared_error(y_true, y_pred) ** 0.5)



In [2]:
def load_ohlcv(path: Path) -> pd.DataFrame:
    """Load OHLCV CSV dan bersihkan kolomnya."""

    path = Path(path).expanduser()
    if not path.exists():
        raise FileNotFoundError(
            f"Dataset {path} tidak tersedia. Download data terlebih dahulu atau ubah DATA_PATH."
        )

    raw = pd.read_csv(path)
    raw.columns = [col.strip().lower().replace(' ', '_') for col in raw.columns]
    keep = ['time', 'open', 'high', 'low', 'close', 'volume']
    missing = [col for col in keep if col not in raw.columns]
    if missing:
        raise KeyError(f"Missing columns {missing} in {path}")
    ohlcv = raw[keep].copy()
    ohlcv['time'] = pd.to_datetime(ohlcv['time'], utc=True)
    numeric_cols = [col for col in keep if col != 'time']
    ohlcv[numeric_cols] = ohlcv[numeric_cols].apply(pd.to_numeric, errors='coerce')
    ohlcv = ohlcv.dropna().set_index('time').sort_index()
    return ohlcv


def engineer_features(ohlcv: pd.DataFrame) -> pd.DataFrame:
    close = ohlcv['close'].astype(float)
    volume = ohlcv['volume'].astype(float)
    features = pd.DataFrame(index=ohlcv.index)
    features['ret_1d'] = close.pct_change(1)
    features['ret_5d'] = close.pct_change(5)
    features['ret_20d'] = close.pct_change(20)
    features['momentum_20d'] = features['ret_1d'].rolling(20).mean()
    features['momentum_60d'] = features['ret_1d'].rolling(60).mean()
    features['volatility_20d'] = features['ret_1d'].rolling(20).std()
    features['volatility_60d'] = features['ret_1d'].rolling(60).std()
    features['volume_change'] = volume.pct_change(1)
    features['volume_zscore_20d'] = (volume - volume.rolling(20).mean()) / volume.rolling(20).std()
    ema_fast = close.ewm(span=20, adjust=False).mean()
    ema_slow = close.ewm(span=60, adjust=False).mean()
    features['price_ema_spread'] = (ema_fast - ema_slow) / ema_slow
    features['high_low_range'] = (ohlcv['high'] - ohlcv['low']) / close
    return features


def build_dataset(
    ohlcv: pd.DataFrame,
    horizon: int = HORIZON_HOURS,
    return_type: str = RETURN_TYPE,
) -> pd.DataFrame:
    features = engineer_features(ohlcv)
    if return_type == 'log':
        forward_returns = (np.log(ohlcv['close'].shift(-horizon)) - np.log(ohlcv['close']))
    elif return_type == 'simple':
        forward_returns = ohlcv['close'].pct_change(horizon).shift(-horizon)
    else:
        raise ValueError("return_type harus 'simple' atau 'log'.")
    labels = (forward_returns > 0).astype(int).rename('target')
    dataset = features.join(labels).join(forward_returns.rename('future_return'))
    dataset = dataset.replace([np.inf, -np.inf], np.nan).dropna()
    return dataset


ohlcv = load_ohlcv(DATA_PATH)
dataset = build_dataset(ohlcv, horizon=HORIZON_HOURS, return_type=RETURN_TYPE)
print(f"Dataset memiliki {len(dataset)} baris dengan {dataset.shape[1]-2} fitur.")
dataset.to_csv(PROCESSED_PATH)
print(f"Dataset tersimpan ke {PROCESSED_PATH}")

dataset_metadata = pd.DataFrame(
    [
        ("horizon_hours", HORIZON_HOURS),
        ("return_type", RETURN_TYPE),
        ("target_definition", "1 jika forward_return > 0 else 0"),
        ("rows", len(dataset)),
        ("start_time", dataset.index.min()),
        ("end_time", dataset.index.max()),
    ],
    columns=["key", "value"],
).set_index("key")


Dataset memiliki 25383 baris dengan 11 fitur.
Dataset tersimpan ke C:\Users\jefri\backtest-indicator\data\processed\ethusdt_hourly_features.csv


In [3]:
feature_columns = [col for col in dataset.columns if col not in ['target', 'future_return']]

splitter = TimeSeriesSplit(n_splits=WALKFORWARD_SPLITS)
train_folds: list[tuple[int, pd.DataFrame]] = []
test_folds: list[tuple[int, pd.DataFrame]] = []
split_records: list[dict] = []

for fold_id, (train_idx, test_idx) in enumerate(splitter.split(dataset)):
    fold_train = dataset.iloc[train_idx].copy()
    fold_test = dataset.iloc[test_idx].copy()
    train_folds.append((fold_id, fold_train))
    test_folds.append((fold_id, fold_test))
    split_records.append(
        {
            'fold': fold_id,
            'train_start_time': fold_train.index.min(),
            'train_end_time': fold_train.index.max(),
            'test_start_time': fold_test.index.min(),
            'test_end_time': fold_test.index.max(),
            'n_train': len(fold_train),
            'n_test': len(fold_test),
        }
    )

if not train_folds or not test_folds:
    raise ValueError('TimeSeriesSplit gagal menghasilkan fold untuk dataset.')

train = train_folds[-1][1]
test = test_folds[-1][1]

train_split = pd.concat({fold: frame for fold, frame in train_folds}, names=['fold', 'time'])
test_split = pd.concat({fold: frame for fold, frame in test_folds}, names=['fold', 'time'])
cv_split_summary = pd.DataFrame(split_records).set_index('fold')

train_fold_performance = summarise_fold_performance(
    train_folds, return_column='future_return', bars_per_year=BARS_PER_YEAR
)
test_fold_performance = summarise_fold_performance(
    test_folds, return_column='future_return', bars_per_year=BARS_PER_YEAR
)
print('Performa walk-forward (train folds):')
display(train_fold_performance)
print('Performa walk-forward (test folds):')
display(test_fold_performance)

scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(train[feature_columns]),
    index=train.index,
    columns=feature_columns,
)
X_test = pd.DataFrame(
    scaler.transform(test[feature_columns]),
    index=test.index,
    columns=feature_columns,
)

def drop_low_variance_features(
    X_tr: pd.DataFrame, X_te: pd.DataFrame, tol: float = 1e-9
) -> tuple[pd.DataFrame, pd.DataFrame, list[str]]:
    variances = X_tr.var(axis=0)
    keep_cols = variances[variances > tol].index.tolist()
    dropped = sorted(set(X_tr.columns) - set(keep_cols))
    if dropped:
        print('Menghapus fitur dengan varians sangat kecil: ' + ', '.join(dropped))
    return X_tr[keep_cols], X_te[keep_cols], keep_cols

X_train, X_test, feature_columns = drop_low_variance_features(X_train, X_test)

y_train = train['target']
y_test = test['target']
y_test_returns = test['future_return']

cv_splits = min(WALKFORWARD_SPLITS, len(train) - 1)
if cv_splits < 2:
    raise ValueError('Dataset train terlalu pendek untuk membuat CV splits.')
tscv = TimeSeriesSplit(n_splits=cv_splits)

def get_probabilities(model, X: pd.DataFrame) -> np.ndarray:
    if hasattr(model, 'predict_proba'):
        return model.predict_proba(X)[:, 1]

    if hasattr(model, 'decision_function'):
        decision = model.decision_function(X)
        return 1.0 / (1.0 + np.exp(-decision))

    preds = model.predict(X)
    return preds.astype(float)

def sharpe_ratio(signal: pd.Series, realized_returns: pd.Series, periods: int = 252) -> float:
    pnl = signal * realized_returns
    std = pnl.std(ddof=0)
    if std == 0 or np.isnan(std):
        return 0.0
    return pnl.mean() / std * np.sqrt(periods)

def root_mean_squared_error(y_true: pd.Series, y_pred: pd.Series) -> float:
    """Hitung RMSE yang kompatibel dengan berbagai versi scikit-learn."""
    try:
        return float(mean_squared_error(y_true, y_pred, squared=False))
    except TypeError:
        return float(mean_squared_error(y_true, y_pred) ** 0.5)

def rolling_cv_metrics(model, X: pd.DataFrame, y: pd.Series, splitter: TimeSeriesSplit):
    preds = pd.Series(index=y.index, dtype=float)
    accs, aucs = [], []
    for train_idx, val_idx in splitter.split(X):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
        estimator = clone(model)
        estimator.fit(X_tr, y_tr)
        probs = get_probabilities(estimator, X_val)
        preds.iloc[val_idx] = probs
        accs.append(accuracy_score(y_val, (probs >= 0.5).astype(int)))
        try:
            aucs.append(roc_auc_score(y_val, probs))
        except ValueError:
            aucs.append(np.nan)
    return preds, {'cv_accuracy': float(np.nanmean(accs)), 'cv_auc': float(np.nanmean(aucs))}

def fit_and_evaluate(model, X_tr, y_tr, X_te, y_te, realized_returns):
    estimator = clone(model)
    estimator.fit(X_tr, y_tr)
    probs = get_probabilities(estimator, X_te)
    predictions = (probs >= 0.5).astype(int)
    accuracy = accuracy_score(y_te, predictions)
    auc = roc_auc_score(y_te, probs)
    signal = pd.Series(probs, index=y_te.index)
    signal = 2 * signal - 1
    sharpe = sharpe_ratio(signal, realized_returns.loc[signal.index], periods=24 * 365)
    metrics = {
        'accuracy': float(accuracy),
        'roc_auc': float(auc),
        'signal_sharpe': float(sharpe),
    }
    return estimator, probs, signal, metrics


Performa walk-forward (train folds):


Unnamed: 0_level_0,start_time,end_time,n_bars,mean_return,volatility,annualised_vol,sharpe_ratio,hit_rate
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,2023-01-03 12:00:00+00:00,2023-06-28 20:00:00+00:00,4233.0,0.000564,0.012423,1.162749,4.248437,0.50437
1,2023-01-03 12:00:00+00:00,2023-12-22 02:00:00+00:00,8463.0,0.000429,0.010998,1.029359,3.654064,0.505731
2,2023-01-03 12:00:00+00:00,2024-06-15 08:00:00+00:00,12693.0,0.000501,0.012484,1.168447,3.754545,0.509415
3,2023-01-03 12:00:00+00:00,2024-12-08 14:00:00+00:00,16923.0,0.000437,0.013078,1.224007,3.130384,0.509011
4,2023-01-03 12:00:00+00:00,2025-06-02 20:00:00+00:00,21153.0,0.000287,0.014568,1.36349,1.844227,0.507209


Performa walk-forward (test folds):


Unnamed: 0_level_0,start_time,end_time,n_bars,mean_return,volatility,annualised_vol,sharpe_ratio,hit_rate
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,2023-06-28 21:00:00+00:00,2023-12-22 02:00:00+00:00,4230.0,0.000295,0.009355,0.875586,2.948867,0.507092
1,2023-12-22 03:00:00+00:00,2024-06-15 08:00:00+00:00,4230.0,0.000644,0.015021,1.405913,4.010713,0.516785
2,2024-06-15 09:00:00+00:00,2024-12-08 14:00:00+00:00,4230.0,0.000247,0.014714,1.377184,1.572112,0.507801
3,2024-12-08 15:00:00+00:00,2025-06-02 20:00:00+00:00,4230.0,-0.000314,0.019406,1.81633,-1.516491,0.5
4,2025-06-02 21:00:00+00:00,2025-11-26 02:00:00+00:00,4230.0,0.000272,0.01609,1.505957,1.584231,0.516312


In [4]:

linear_models = {
    "lasso": LogisticRegression(penalty="l1", solver="liblinear", max_iter=5000, random_state=42),
    "elasticnet": LogisticRegression(
        penalty="elasticnet",
        solver="saga",
        l1_ratio=0.5,
        max_iter=5000,
        random_state=42,
    ),
}

linear_results = {}
logreg_prediction_frame = pd.DataFrame()
for name, model in linear_models.items():
    cv_preds, cv_metrics = rolling_cv_metrics(model, X_train, y_train, tscv)
    estimator, probs, signal, test_metrics = fit_and_evaluate(
        model, X_train, y_train, X_test, y_test, y_test_returns
    )
    linear_results[name] = {
        "model": estimator,
        "cv_metrics": cv_metrics,
        "test_metrics": test_metrics,
        "probabilities": pd.Series(probs, index=X_test.index),
        "signals": pd.Series(signal, index=X_test.index),
    }
    print(f"[{name}] CV metrics: {cv_metrics}")
    print(f"[{name}] Test metrics: {test_metrics}")
linear_results

linear_metrics = pd.DataFrame.from_dict(
    {
        name: {
            "cv_accuracy": result['cv_metrics'].get('cv_accuracy'),
            "cv_auc": result['cv_metrics'].get('cv_auc'),
            "test_accuracy": result['test_metrics'].get('accuracy'),
            "test_roc_auc": result['test_metrics'].get('roc_auc'),
            "test_signal_sharpe": result['test_metrics'].get('signal_sharpe'),
        }
        for name, result in linear_results.items()
    },
    orient="index",
).rename_axis("model")
linear_metrics["deployment_decision"] = np.where(
    linear_metrics["test_signal_sharpe"] > 0,
    "candidate",
    "reject_negative_sharpe",
)
linear_metrics

if linear_results:
    best_logreg = max(
        linear_results.items(),
        key=lambda item: item[1]["test_metrics"].get("signal_sharpe", float('-inf')),
    )
    best_name, best_result = best_logreg
    logreg_prediction_frame = pd.DataFrame(
        {
            "probability": best_result["probabilities"],
            "signal": 2 * best_result["probabilities"] - 1,
            "future_return": y_test_returns.loc[X_test.index],
        },
        index=X_test.index,
    )
    logreg_prediction_frame["position"] = np.sign(logreg_prediction_frame["signal"])
    logreg_prediction_frame["pnl"] = logreg_prediction_frame["position"] * logreg_prediction_frame["future_return"]
    logreg_prediction_path = PREDICTION_DIR / "ml_logreg_baseline_predictions.csv"
    logreg_prediction_frame.to_csv(logreg_prediction_path, index_label="time")
    print(f"[logreg] Prediksi disimpan ke {logreg_prediction_path}")
else:
    logreg_prediction_frame = pd.DataFrame()



[lasso] CV metrics: {'cv_accuracy': 0.5403687943262411, 'cv_auc': 0.5534735600647074}
[lasso] Test metrics: {'accuracy': 0.516548463356974, 'roc_auc': 0.5218614271033626, 'signal_sharpe': -0.5026925535188294}
[elasticnet] CV metrics: {'cv_accuracy': 0.5404822695035462, 'cv_auc': 0.5535003242056233}
[elasticnet] Test metrics: {'accuracy': 0.5177304964539007, 'roc_auc': 0.5217108160656547, 'signal_sharpe': -0.538620184693603}
[logreg] Prediksi disimpan ke C:\Users\jefri\backtest-indicator\outputs\predictions\ml_logreg_baseline_predictions.csv


In [5]:
regression_models = {
    "linear": LinearRegression(),
    "ridge": Ridge(alpha=1.0, random_state=42),
    "lasso": Lasso(alpha=1e-3, max_iter=5000, random_state=42),
}

y_train_reg = train["future_return"]
y_test_reg = y_test_returns

regression_results = {}
linreg_prediction_frame = pd.DataFrame()
for name, model in regression_models.items():
    estimator = clone(model)
    estimator.fit(X_train, y_train_reg)
    preds = pd.Series(estimator.predict(X_test), index=y_test_reg.index)
    mae = mean_absolute_error(y_test_reg, preds)
    rmse = root_mean_squared_error(y_test_reg, preds)
    r2 = r2_score(y_test_reg, preds)
    strength_bins = pd.qcut(preds.abs(), q=5, labels=False, duplicates="drop")
    max_bin = strength_bins.max()
    strength = (strength_bins + 1) / max_bin if pd.notna(max_bin) and max_bin > 0 else strength_bins.fillna(0)
    signal = np.sign(preds) * strength.fillna(0.0)
    sharpe = sharpe_ratio(signal, y_test_reg, periods=int(24 * 365))
    regression_results[name] = {
        "model": estimator,
        "predictions": preds,
        "strength": strength,
        "signal": signal,
        "metrics": {
            "mae": float(mae),
            "rmse": float(rmse),
            "r2": float(r2),
            "signal_sharpe": float(sharpe),
        },
    }
    print(f"[{name}] MAE={mae:.6f}, RMSE={rmse:.6f}, R2={r2:.4f}, Sharpe={sharpe:.4f}")

regression_metrics = pd.DataFrame.from_dict(
    {name: result["metrics"] for name, result in regression_results.items()},
    orient="index",
).rename_axis("model")
regression_metrics

if regression_results:
    best_linreg = max(
        regression_results.items(),
        key=lambda item: item[1]["metrics"].get("signal_sharpe", float('-inf')),
    )
    best_name, best_result = best_linreg
    linreg_prediction_frame = pd.DataFrame(
        {
            "predicted_return": best_result["predictions"],
            "signal_strength": best_result["strength"],
            "signal": best_result["signal"],
            "future_return": y_test_reg,
        },
        index=y_test_reg.index,
    )
    linreg_prediction_frame["position"] = np.sign(linreg_prediction_frame["signal"])
    linreg_prediction_frame["pnl"] = linreg_prediction_frame["position"] * linreg_prediction_frame["future_return"]
    linreg_prediction_path = PREDICTION_DIR / "ml_linreg_baseline_predictions.csv"
    linreg_prediction_frame.to_csv(linreg_prediction_path, index_label="time")
    print(f"[linreg] Prediksi disimpan ke {linreg_prediction_path}")
else:
    linreg_prediction_frame = pd.DataFrame()


[linear] MAE=0.011194, RMSE=0.016164, R2=-0.0092, Sharpe=-2.5138
[ridge] MAE=0.011194, RMSE=0.016164, R2=-0.0092, Sharpe=-2.5487
[lasso] MAE=0.011129, RMSE=0.016090, R2=-0.0000, Sharpe=0.0000
[linreg] Prediksi disimpan ke C:\Users\jefri\backtest-indicator\outputs\predictions\ml_linreg_baseline_predictions.csv


In [6]:

X_train_lgb = X_train.astype(np.float32)
X_test_lgb = X_test.astype(np.float32)

lgb_model = lgb.LGBMClassifier(
    objective="binary",
    boosting_type="gbdt",
    n_estimators=800,
    learning_rate=0.03,
    num_leaves=31,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_samples=5,
    min_child_weight=1e-3,
    feature_pre_filter=False,
    random_state=42,
)

_, lgb_cv_metrics = rolling_cv_metrics(lgb_model, X_train_lgb, y_train, tscv)
lgb_fitted, lgb_probs, lgb_signal, lgb_test_metrics = fit_and_evaluate(
    lgb_model, X_train_lgb, y_train, X_test_lgb, y_test, y_test_returns
)

print("[lightgbm] CV metrics:", lgb_cv_metrics)
print("[lightgbm] Test metrics:", lgb_test_metrics)

joblib.dump(lgb_fitted, MODEL_DIR / "lightgbm_ml_baseline.pkl")
lgb_prediction_frame = pd.DataFrame(
    {
        "probability": lgb_probs,
        "signal": lgb_signal,
        "future_return": y_test_returns.loc[X_test.index],
    },
    index=X_test.index,
)
lgb_prediction_frame["position"] = np.sign(lgb_prediction_frame["signal"])
lgb_prediction_frame["pnl"] = lgb_prediction_frame["position"] * lgb_prediction_frame["future_return"]
lgb_prediction_path = PREDICTION_DIR / "lightgbm_ml_baseline_predictions.csv"
lgb_prediction_frame.to_csv(lgb_prediction_path, index_label="time")
lgb_prediction_frame.head()

try:
    probability_bins = pd.qcut(
        lgb_prediction_frame["probability"], q=10, duplicates="drop"
    )
except ValueError:
    probability_bins = pd.cut(lgb_prediction_frame["probability"], bins=5)
probability_calibration = (
    lgb_prediction_frame.assign(prob_bucket=probability_bins)
    .groupby("prob_bucket", observed=False)
    .agg(
        sample_size=("future_return", "size"),
        avg_future_return=("future_return", "mean"),
        avg_signal=("signal", "mean"),
        avg_position=("position", "mean"),
    )
)

lgb_metrics = pd.DataFrame(
    {
        "cv_accuracy": [lgb_cv_metrics.get('cv_accuracy')],
        "cv_auc": [lgb_cv_metrics.get('cv_auc')],
        "test_accuracy": [lgb_test_metrics.get('accuracy')],
        "test_roc_auc": [lgb_test_metrics.get('roc_auc')],
        "test_signal_sharpe": [lgb_test_metrics.get('signal_sharpe')],
    },
    index=pd.Index(["lightgbm"], name="model"),
)
lgb_metrics



[LightGBM] [Info] Number of positive: 1780, number of negative: 1748
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000726 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2805
[LightGBM] [Info] Number of data points in the train set: 3528, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.504535 -> initscore=0.018141
[LightGBM] [Info] Start training from score 0.018141
[LightGBM] [Info] Number of positive: 3528, number of negative: 3525
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000485 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2805
[LightGBM] [Info] Number of data points in the train set: 7053, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500213 -> initscore=0.000851
[LightGBM] [Info] Start training from score 0.000851
[LightGBM] [Info] Numb

Unnamed: 0_level_0,cv_accuracy,cv_auc,test_accuracy,test_roc_auc,test_signal_sharpe
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
lightgbm,0.519262,0.530657,0.527896,0.531046,0.718839


# Evaluasi metrik regresi
Fungsi `root_mean_squared_error` didefinisikan di sel sebelumnya untuk menjaga kompatibilitas versi scikit-learn ketika menghitung RMSE. Gunakan fungsi tersebut pada sel-sel evaluasi di bawah.


(Sel ini sengaja dikosongkan; logika evaluasi regresi ada pada sel berikutnya.)


In [7]:
y_train_reg = train['future_return']
y_test_reg = y_test_returns

regression_models = {
    'linear': LinearRegression(),
    'ridge': Ridge(alpha=1.0, random_state=42),
    'lasso': Lasso(alpha=1e-3, max_iter=5000, random_state=42),
}

regression_rows = []
for name, model in regression_models.items():
    estimator = clone(model)
    estimator.fit(X_train, y_train_reg)
    preds = estimator.predict(X_test)
    mae = mean_absolute_error(y_test_reg, preds)
    rmse = root_mean_squared_error(y_test_reg, preds)
    r2 = r2_score(y_test_reg, preds)
    regression_rows.append(
        {
            'model': name,
            'mae': mae,
            'rmse': rmse,
            'r2': r2,
        }
    )
    print(f"[{name}] MAE={mae:.6f}, RMSE={rmse:.6f}, R2={r2:.4f}")

regression_model_metrics = pd.DataFrame(regression_rows).set_index('model')
regression_model_metrics


[linear] MAE=0.011194, RMSE=0.016164, R2=-0.0092
[ridge] MAE=0.011194, RMSE=0.016164, R2=-0.0092
[lasso] MAE=0.011129, RMSE=0.016090, R2=-0.0000


Unnamed: 0_level_0,mae,rmse,r2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
linear,0.011194,0.016164,-0.009188266
ridge,0.011194,0.016164,-0.009198508
lasso,0.011129,0.01609,-8.350624e-07


In [8]:
# Evaluasi metrik regresi untuk sinyal probabilitas LightGBM
y_test_reg = y_test_returns.loc[lgb_prediction_frame.index]
regression_preds = lgb_prediction_frame['signal']

mae = mean_absolute_error(y_test_reg, regression_preds)
rmse = root_mean_squared_error(y_test_reg, regression_preds)
r2 = r2_score(y_test_reg, regression_preds)

signal_regression_metrics = pd.DataFrame(
    {
        'mae': [mae],
        'rmse': [rmse],
        'r2': [r2],
    },
    index=pd.Index(['lightgbm_signal'], name='model'),
)
signal_regression_metrics


Unnamed: 0_level_0,mae,rmse,r2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lightgbm_signal,0.177815,0.229775,-202.930659


In [9]:
from pathlib import Path
import importlib.util
import pandas as pd
from datetime import datetime, timezone

try:
    PROJECT_ROOT
except NameError:  # pragma: no cover - notebook convenience
    PROJECT_ROOT = Path.cwd()


def export_tables_to_excel(tables, path: Path) -> Path:
    def strip_timezone_from_value(value):
        if value is pd.NaT:
            return value
        if isinstance(value, pd.Timestamp):
            if value.tz is not None:
                return value.tz_convert('UTC').tz_localize(None)
            return value
        if isinstance(value, datetime):
            if value.tzinfo is not None:
                return value.astimezone(timezone.utc).replace(tzinfo=None)
            return value
        return value

    def strip_timezone_from_axis(axis):
        if isinstance(axis, pd.MultiIndex):
            new_levels = [strip_timezone_from_axis(level) for level in axis.levels]
            return axis.set_levels(new_levels)
        if isinstance(axis, pd.DatetimeIndex) and axis.tz is not None:
            return axis.tz_convert('UTC').tz_localize(None)
        if getattr(axis, 'dtype', None) == object:
            return pd.Index([strip_timezone_from_value(val) for val in axis], name=axis.name)
        return axis

    def make_excel_safe(frame: pd.DataFrame) -> pd.DataFrame:
        frame = frame.copy()
        frame.index = strip_timezone_from_axis(frame.index)
        frame.columns = strip_timezone_from_axis(frame.columns)
        for column in frame.columns:
            series = frame[column]
            if isinstance(series.dtype, pd.DatetimeTZDtype):
                frame[column] = series.dt.tz_convert('UTC').dt.tz_localize(None)
            elif series.dtype == object:
                frame[column] = series.map(strip_timezone_from_value)
        return frame

    serialisable = []
    for sheet_name, table in tables.items():
        if table is None:
            continue
        if isinstance(table, pd.Series):
            frame = table.to_frame()
        elif isinstance(table, pd.DataFrame):
            frame = table.copy()
        elif isinstance(table, dict):
            frame = pd.DataFrame([table])
        else:
            frame = pd.DataFrame(table)
        frame = make_excel_safe(frame)
        serialisable.append((sheet_name, frame))

    if not serialisable:
        raise ValueError('Tidak ada tabel yang bisa diekspor.')

    path.parent.mkdir(parents=True, exist_ok=True)

    def pick_engine() -> str:
        for candidate in ('openpyxl', 'xlsxwriter'):
            if importlib.util.find_spec(candidate):
                return candidate
        raise ModuleNotFoundError(
            "Untuk ekspor Excel diperlukan paket 'openpyxl' atau 'xlsxwriter'."
        )

    def normalise_sheet_name(name: str, existing) -> str:
        safe = (name or 'Sheet').strip() or 'Sheet'
        safe = safe[:31]
        counter = 1
        candidate = safe
        while candidate in existing:
            suffix = f'_{counter}'
            trimmed = safe[: 31 - len(suffix)] or 'Sheet'
            candidate = f"{trimmed}{suffix}"
            counter += 1
        existing.add(candidate)
        return candidate

    engine = pick_engine()
    used_names = set()
    with pd.ExcelWriter(path, engine=engine) as writer:
        for sheet_name, frame in serialisable:
            name = normalise_sheet_name(str(sheet_name), used_names)
            frame.to_excel(writer, sheet_name=name, index=True)
    print(
        f"Berhasil mengekspor {len(serialisable)} sheet ke {path} (engine: {engine})"
    )
    return path


export_dir = PROJECT_ROOT / 'outputs' / 'result-test'
export_path = export_dir / 'ml_baseline.xlsx'
export_tables_to_excel(
    {
        'dataset': dataset,
        'dataset_metadata': dataset_metadata,
        'train_split': train_split,
        'test_split': test_split,
        'cv_split_summary': cv_split_summary,
        'train_fold_performance': train_fold_performance,
        'test_fold_performance': test_fold_performance,
        'final_train_window': train,
        'final_test_window': test,
        'linear_model_metrics': linear_metrics,
        'lightgbm_metrics': lgb_metrics,
        'probability_calibration': probability_calibration,
        'predictions': lgb_prediction_frame,
        'signal_regression_metrics': signal_regression_metrics,
        'regression_model_metrics': regression_model_metrics,
    },
    export_path,
)


Berhasil mengekspor 15 sheet ke C:\Users\jefri\backtest-indicator\outputs\result-test\ml_baseline.xlsx (engine: openpyxl)


WindowsPath('C:/Users/jefri/backtest-indicator/outputs/result-test/ml_baseline.xlsx')