In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

import sys
from pathlib import Path

project_root = Path.cwd().parent.parent
sys.path.append(str(project_root))
from ingest.ohlcv.queries import load_ohlcv
from ingest.ohlcv.utils import get_ibex_tickers, get_macro_tickers

In [1]:
# Per-stock ("micro") tickers and their OHLCV history.
micro = get_ibex_tickers()
micro_df = load_ohlcv(micro)
# Some days have 0 volume, e.g. on Christmas (API errors) -> drop those rows.
micro_df = micro_df[micro_df["volume"] > 0]
print(micro_df.head(1))

# Index-level ("macro") tickers; later cells filter this frame for
# "^IBEX", "^GSPC" and "^VIX".
macro = get_macro_tickers()
macro_df = load_ohlcv(macro)
# Show the macro frame here (the original printed micro_df a second time).
print(macro_df.head(1))


NameError: name 'get_ibex_tickers' is not defined

In [19]:
def rolling_slope(series, window):
    """Rolling least-squares slope of ``series`` against 0..window-1.

    For every full window of ``window`` consecutive values a degree-1
    polynomial is fitted with ``np.polyfit`` and its leading coefficient
    (the slope per step) is returned. Positions without a full window are NaN.
    """
    positions = np.arange(window)

    def _fit_slope(values):
        # polyfit returns coefficients highest degree first: (slope, intercept)
        slope, _intercept = np.polyfit(positions, values, 1)
        return slope

    windows = series.rolling(window)
    return windows.apply(_fit_slope, raw=True)

def rsi(series, window):
    """Relative Strength Index built from simple rolling means.

    Gains and losses are the positive and (negated) negative parts of the
    one-step difference of ``series``; each is averaged with a plain
    ``rolling(window).mean()``. The result is ``100 - 100 / (1 + RS)`` with
    ``RS = mean gain / mean loss``. Windows where both means are zero
    yield NaN (0 / 0).
    """
    def _window_mean(values):
        return values.rolling(window).mean()

    change = series.diff()
    ups = change.clip(lower=0)
    downs = -change.clip(upper=0)

    relative_strength = _window_mean(ups) / _window_mean(downs)
    return 100 - (100 / (1 + relative_strength))

In [None]:
def build_micro_features(df:pd.DataFrame):
    """
    Intra-ticker (single stock) technical features.

    Reads the ``open``, ``high``, ``low``, ``close`` and ``volume`` columns and
    returns a copy of ``df`` with return, volatility, trend, oscillator,
    volume and candle-shape columns appended. All shifts and rolling windows
    are positional, so the rows are assumed to belong to one ticker and to be
    in chronological order (the calling cell sorts by date before calling).
    """
    feats = df.copy()
    close = feats["close"]
    prev_close = close.shift(1)

    # --- Returns -----------------------------------------------------------
    feats["log_ret_1"] = np.log(close / prev_close)
    for lag in (3, 5, 10, 20):
        feats[f"log_ret_{lag}"] = np.log(close / close.shift(lag))
    # Only the 5-day mean return is kept; a mean per window was found to be
    # too correlated with the matching log_ret_{w}.
    daily_ret = feats["log_ret_1"]
    feats["ret_mean_5"] = daily_ret.rolling(5).mean()

    # --- Volatility --------------------------------------------------------
    for span in (5, 10, 20):
        feats[f"vol_{span}"] = daily_ret.rolling(span).std()

    # Short/long volatility ratio (regime indicator)
    feats["vol_ratio_5_20"] = feats["vol_5"] / feats["vol_20"]

    # --- ATR (Average True Range) -----------------------------------------
    # True range = largest of: high-low, |high - prev close|, |low - prev close|
    range_candidates = pd.concat(
        [
            feats["high"] - feats["low"],
            (feats["high"] - prev_close).abs(),
            (feats["low"] - prev_close).abs(),
        ],
        axis=1,
    )
    true_range = range_candidates.max(axis=1)
    feats["atr_14"] = true_range.rolling(14).mean()
    feats["atr_pct"] = feats["atr_14"] / close

    # --- Trend & momentum strength ----------------------------------------
    # Simple and exponential moving averages
    for span in (5, 10, 20, 50):
        feats[f"sma_{span}"] = close.rolling(span).mean()
        feats[f"ema_{span}"] = close.ewm(span=span, adjust=False).mean()

    # Fast/slow moving-average ratios, centred on 0
    for kind, fast, slow in (("sma", 5, 20), ("sma", 10, 50), ("ema", 5, 20)):
        feats[f"{kind}_ratio_{fast}_{slow}"] = (
            feats[f"{kind}_{fast}"] / feats[f"{kind}_{slow}"] - 1
        )

    # Rolling linear-regression slope of the close
    for span in (10, 20):
        feats[f"slope_{span}"] = rolling_slope(close, span)

    # Distance of the close to the rolling high / low extremes
    for span in (10, 20):
        rolling_top = feats["high"].rolling(span).max()
        feats[f"dist_high_{span}"] = close / rolling_top - 1
        rolling_bottom = feats["low"].rolling(span).min()
        feats[f"dist_low_{span}"] = close / rolling_bottom - 1

    # --- Oscillators -------------------------------------------------------
    # RSI (only the 14-period version is used; rsi_7 was dropped)
    feats["rsi_14"] = rsi(close, 14)

    # Stochastic %K over 14 periods (stoch_d and Williams %R were dropped)
    floor_14 = feats["low"].rolling(14).min()
    ceiling_14 = feats["high"].rolling(14).max()
    feats["stoch_k"] = 100 * (close - floor_14) / (ceiling_14 - floor_14)

    # --- Volume / liquidity -----------------------------------------------
    # (log volume and on-balance volume were dropped; OBV is non-stationary)
    volume = feats["volume"]
    for span in (5, 20):
        feats[f"volu_mean_{span}"] = volume.rolling(span).mean()
        feats[f"volu_ratio_{span}"] = volume / feats[f"volu_mean_{span}"]

    # Volume-return interaction
    feats["volu_ret_1"] = daily_ret * feats["volu_ratio_5"]

    # --- Candle structure (price action); less useful for horizon 7-9 ------
    open_ = feats["open"]
    feats["body"] = (close - open_).abs() / open_

    candle_ends = feats[["close", "open"]]
    feats["upper_wick"] = (feats["high"] - candle_ends.max(axis=1)) / open_
    feats["lower_wick"] = (candle_ends.min(axis=1) - feats["low"]) / open_

    feats["true_range_pct"] = true_range / close
    feats["gap"] = (open_ - prev_close) / prev_close

    return feats


In [None]:
def assert_columns(df: pd.DataFrame, required, name="DataFrame"):
    """Raise ``AssertionError`` if any column in ``required`` is absent from ``df``.

    ``name`` is only used to label the frame in the error message, which
    lists the missing columns in sorted order. Returns ``None`` when every
    required column is present.
    """
    absent = set(required).difference(df.columns)
    if not absent:
        return
    raise AssertionError(
        f"{name} is missing required columns: {sorted(absent)}"
    )

def build_macro_features(df: pd.DataFrame, df_macro: pd.DataFrame):
    """Input contract for the macro feature step (validation only).

    Checks that the per-ticker frame and the macro frame carry the columns
    the macro features read, raising ``AssertionError`` via ``assert_columns``
    otherwise. Returns ``None``.

    NOTE(review): a later cell defines ``build_macro_features`` again with the
    real implementation; when the notebook runs top to bottom that definition
    replaces this one, so these checks are not executed by it.
    """
    micro_required = ["date", "log_ret_5", "log_ret_20", "vol_20"]
    macro_required = ["date", "ticker", "close"]

    assert_columns(df, micro_required, name="df (micro)")
    assert_columns(df_macro, macro_required, name="df_macro")

def build_target_feature(df: pd.DataFrame, horizon):
    """Input contract for the target step (validation only).

    Requires a ``close`` column and a strictly positive ``horizon``.
    Returns ``None``.

    NOTE(review): a later cell defines ``build_target_feature`` again with the
    real implementation, which replaces this one when the notebook runs top
    to bottom.
    """
    required = ["close"]
    assert_columns(df, required, name="df (target)")

    assert horizon > 0, "horizon must be positive"

# before final model matrix
"""
assert "target" in micro_features.columns
assert micro_features["target"].isin([0, 1]).all()
"""

In [None]:
def build_macro_features(df: pd.DataFrame, df_macro: pd.DataFrame):
    """
    Macro ticker features

    Joins the closes of "^IBEX", "^GSPC" and "^VIX" from ``df_macro`` onto the
    per-ticker frame ``df`` by ``date`` and derives index returns, index
    volatility, VIX shock/regime measures and ticker-vs-index relative
    features. ``df`` is not modified; a new frame is returned.

    Parameters
    ----------
    df : per-ticker frame; must contain ``date``, ``log_ret_5``,
        ``log_ret_20`` and ``vol_20``.
    df_macro : long-format macro frame; must contain ``date``, ``ticker``
        and ``close``.

    Raises
    ------
    AssertionError
        If a required column is missing from either frame.
    pandas.errors.MergeError
        If a macro ticker has more than one row for the same date (the join
        would otherwise silently duplicate rows of ``df``).

    Notes
    -----
    Shifts and rolling windows run on the rows of ``df`` after the join, so
    they follow the ticker's own calendar; dates missing from a macro series
    show up as NaN closes.
    NOTE(review): macro closes are joined on the same calendar date. If the
    "^GSPC"/"^VIX" sessions close after the micro tickers' market, same-day
    values are not yet known at the micro close - confirm, and lag them by
    one day if so.
    """
    # Fail fast with a readable message instead of a KeyError deep in the body.
    for frame, required, label in (
        (df, ("date", "log_ret_5", "log_ret_20", "vol_20"), "df (micro)"),
        (df_macro, ("date", "ticker", "close"), "df_macro"),
    ):
        missing = sorted(set(required) - set(frame.columns))
        if missing:
            raise AssertionError(
                f"{label} is missing required columns: {missing}"
            )

    df = df.copy()

    # Align each macro close series to the ticker's dates.
    for symbol, prefix in (("^IBEX", "ibx"), ("^GSPC", "sp"), ("^VIX", "vix")):
        closes = (
            df_macro.loc[df_macro["ticker"] == symbol, ["date", "close"]]
            .rename(columns={"close": f"{prefix}_close"})
        )
        # many_to_one: each date may appear at most once on the macro side.
        df = df.merge(closes, on="date", how="left", validate="many_to_one")

    # One-day index returns: only inputs to the volatility features below
    # (drop them before modelling).
    df["ibx_log_ret_1"] = np.log(df["ibx_close"] / df["ibx_close"].shift(1))
    df["sp_log_ret_1"] = np.log(df["sp_close"] / df["sp_close"].shift(1))

    # Returns
    for w in [5, 10, 20]:
        df[f"ibx_log_ret_{w}"] = np.log(df["ibx_close"] / df["ibx_close"].shift(w))
    for w in [20, 50]:
        df[f"sp_log_ret_{w}"] = np.log(df["sp_close"] / df["sp_close"].shift(w))

    # Volatility
    for w in [10, 20, 60]:
        df[f"ibx_vol_{w}"] = df["ibx_log_ret_1"].rolling(w).std()
    for w in [20, 60, 100]:
        df[f"sp_vol_{w}"] = df["sp_log_ret_1"].rolling(w).std()

    # Volatility ratios (short window / long window)
    df["ibx_vol_ratio_10_60"] = df["ibx_vol_10"] / df["ibx_vol_60"]
    df["sp_vol_ratio_20_100"] = df["sp_vol_20"] / df["sp_vol_100"]

    df["vix_chg_1"] = df["vix_close"].pct_change()
    # Shock detector: today's VIX change scaled by its 5-row rolling std
    df["vix_chg_z_5"] = df["vix_chg_1"] / df["vix_chg_1"].rolling(5).std()

    # Medium-term stress regime: VIX level minus its 20-row mean
    df["vix_dev_20"] = (
        df["vix_close"] - df["vix_close"].rolling(20).mean()
    )

    # Long-term volatility regime (stable): percentile rank of the latest
    # VIX close within the trailing 250 rows. raw=True hands the window over
    # as an ndarray, avoiding a Series construction by pandas per window.
    df["vix_pctile_250"] = (
        df["vix_close"]
        .rolling(250)
        .apply(lambda x: pd.Series(x).rank(pct=True).iloc[-1], raw=True)
    )

    # Relative to market
    df["rel_ret_5"] = df["log_ret_5"] - df["ibx_log_ret_5"]
    df["rel_ret_20"] = df["log_ret_20"] - df["ibx_log_ret_20"]
    df["rel_vol_20"] = df["vol_20"] / df["ibx_vol_20"]

    return df
  

In [None]:
def build_target_feature(df: pd.DataFrame, horizon):
    """Add the forward log return and its binary up/down label.

    Returns a copy of ``df`` with two extra columns:

    - ``future_log_ret``: ``log(close[t + horizon] / close[t])`` using a
      positional shift, so rows are assumed to be one ticker in chronological
      order (the calling cell sorts by date before calling).
    - ``target``: 1 where ``future_log_ret > 0``, else 0 (int).

    The last ``horizon`` rows have no future close: ``future_log_ret`` is NaN
    there and ``target`` is therefore 0. Those labels are not real
    observations; filter on ``future_log_ret.notna()`` before training.

    Raises
    ------
    AssertionError
        If ``close`` is missing or ``horizon`` is not strictly positive
        (0 would label every row 0; a negative value would look backwards).
    """
    if "close" not in df.columns:
        raise AssertionError("df (target) is missing required columns: ['close']")
    if not horizon > 0:
        raise AssertionError("horizon must be positive")

    # Work on a copy so the caller's frame is left untouched, like the other
    # feature builders in this notebook.
    df = df.copy()

    df["future_log_ret"] = np.log(df["close"].shift(-horizon) / df["close"])
    df["target"] = (df["future_log_ret"] > 0).astype(int)

    return df

In [None]:
HORIZON = 7  # forward-return horizon (in rows/trading days) for the target

# Build features ticker by ticker so shifts and rolling windows never cross
# from one ticker into another.
ticker_frames = []

for ticker, g in micro_df.groupby("ticker"):
    g = g.sort_values("date").reset_index(drop=True)

    g = build_micro_features(g)
    g = build_macro_features(g, macro_df)
    g = build_target_feature(g, HORIZON)
    g["ticker"] = ticker

    ticker_frames.append(g)

micro_features = pd.concat(ticker_frames, ignore_index=True)

# Add breadth: share of tickers with a positive one-day return on each date.
breadth = (
    micro_features.groupby("date")["log_ret_1"]
        .apply(lambda x: (x > 0).mean())
        .rename("ibx_breadth")
        .sort_index()
        .to_frame()
)
# Smooth breadth over 10 *dates*. This has to happen on the per-date series:
# rolling over the stacked frame would average 10 consecutive rows, and those
# windows run across ticker boundaries.
breadth["ibx_breadth_10d"] = breadth["ibx_breadth"].rolling(10).mean()

micro_features = micro_features.merge(
    breadth.reset_index(),
    on="date",
    how="left"
)
""" 
KEEP
log_ret_1
log_ret_3
log_ret_5
log_ret_10
ret_mean_5
slope_10
slope_20
sma_ratio_5_20
ema_ratio_5_20
vol_5
vol_20
vol_ratio_5_20
atr_pct
true_range_pct
dist_high_10
dist_low_10
dist_high_20
dist_low_20
rsi_14
stoch_k
volu_ratio_5
volu_ratio_20
volu_ret_1
body
upper_wick
lower_wick
gap
"""
""" 
DROP
atr_14
sma_5, sma_10, sma_20, sma_50
ema_5, ema_10, ema_20, ema_50
log_volume
obv
williams_r
-------------
ret_mean_3 (if log_ret_3 kept)
vol_10 (if vol_5 & vol_20 kept)
stoch_d (k already enough)

"""
""" 
KEEP MACRO
ibx_vol_10
ibx_vol_60
ibx_vol_ratio_10_60

sp_vol_20
sp_vol_100
sp_vol_ratio_20_100

vix_chg_z_5
vix_pctile_250

rel_ret_5
rel_ret_20
rel_vol_20

"""
"""
DROP MACRO
ibx_log_ret_1
ibx_log_ret_5
ibx_log_ret_10
ibx_log_ret_20
sp_log_ret_1
sp_log_ret_20
sp_log_ret_50
vix_dev_20

raw macro returns = noisy + leak-prone
VIX level differences are not scale invariant
"""
# Handle Nan / inf -> divisions  (especially from early rows)
"""  
Cross-sectional fill
X = (
    X
    .groupby(micro_features["date"])
    .transform(lambda x: x.fillna(x.median()))
)

"""
# Feature matrix: everything except identifiers and the label columns.
# +/-inf (from divisions by zero) is turned into NaN *before* the row mask is
# computed; doing it afterwards would leave NaN rows in X.
X = (
    micro_features
    .drop(columns=["ticker", "date", "target", "future_log_ret"])
    .replace([np.inf, -np.inf], np.nan)
)

# Keep rows with complete features AND a known future return. The last
# HORIZON rows of each ticker have future_log_ret = NaN, which the target
# step encodes as target 0; they are not real labels.
mask = X.notna().all(axis=1) & micro_features["future_log_ret"].notna()
X = X[mask]

y = micro_features["target"]
y = y[mask]

Where RF struggles

Averages many deep, independent trees → high bias toward the mean

Weak at learning small edge signals

No notion of “fix previous mistakes”

Needs lots of trees to compete ‚Üí slow

RF is great when:

Signals are strong

Features are low-noise

You don’t care about tiny improvements

That is not market data 😅

Why boosting shines

Boosting:

Fits trees sequentially

Each tree focuses on what previous trees missed

Naturally captures:

non-linear interactions

regime effects (vol ↑ → feature relevance changes)

asymmetric responses (down ≠ up)

That’s why boosting dominates:

quant equity signals

factor models

medium-horizon classification like yours (7d)

0.5 * LightGBM
0.3 * CatBoost
0.2 * XGBoost


In [None]:
SEED = 42  # fixed seed so the forest (and the grid-search ranking) is reproducible

# Inner splitter: used by GridSearchCV for hyper-parameter selection.
tscv = TimeSeriesSplit(n_splits=5)


param_grid = {
    "n_estimators": [300, 600],
    "max_depth": [5, 7, 10],
    "max_features": ["sqrt", 0.5],
    "min_samples_leaf": [1, 5, 10],

}

model = RandomForestClassifier(random_state=SEED)

search = GridSearchCV(
    model,
    param_grid,
    cv=tscv,
    scoring="accuracy",
)

# Outer walk-forward loop. `outer_forward_roll` was not defined anywhere in
# the notebook; it is built here as a forward-chaining split over the rows.
# NOTE(review): X is stacked ticker by ticker, so a row-order split is only a
# true time split if the rows are in chronological order - confirm or split
# by date instead.
outer_forward_roll = TimeSeriesSplit(n_splits=5).split(X)

fold_scores = []
for train_index, test_index in outer_forward_roll:
    # The splitter yields positional indices, so .iloc is required:
    # X[train_index] would select *columns*, and y[train_index] would look
    # the positions up as index labels.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    preds = best_model.predict(X_test)
    # Out-of-sample accuracy of this fold's best model.
    fold_scores.append((preds == y_test.to_numpy()).mean())

In [None]:
## Proper implementation: split by date, with a purge, to avoid time leakage
# (TimeSeriesSplit is already imported in the imports cell at the top.)

H = HORIZON  # purge length = label look-ahead, kept in sync with the target

# `df_feat` was not defined anywhere in the notebook. Use the rows of
# micro_features that survived the X/y filtering so that the boolean masks
# below share X's and y's index.
df_feat = micro_features.loc[X.index]
dates = df_feat["date"].sort_values().unique()

tscv = TimeSeriesSplit(n_splits=5)

for fold, (train_idx, test_idx) in enumerate(tscv.split(dates)):
    train_dates = dates[train_idx]
    test_dates  = dates[test_idx]

    # PURGE to avoid horizon leakage: training labels look H dates ahead, so
    # the first H test dates overlap with what the training labels have seen.
    test_dates = test_dates[H:]
    if len(test_dates) == 0:
        # Test window no longer than the purge: nothing left to evaluate.
        print(f"Fold {fold}: skipped (test window has at most {H} dates)")
        continue

    train_mask = df_feat["date"].isin(train_dates)
    test_mask  = df_feat["date"].isin(test_dates)

    X_train = X[train_mask]
    y_train = y[train_mask]
    X_test  = X[test_mask]
    y_test  = y[test_mask]

    print(f"Fold {fold}:",
          train_dates[0], "→", train_dates[-1],
          "| test:", test_dates[0], "→", test_dates[-1])
