In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

import sys
from pathlib import Path

# Put the directory two levels above the current working directory on sys.path.
# NOTE(review): presumably this is the repository root that holds the `ingest`
# and `models` packages imported below; it depends on where the kernel was
# started - confirm.
project_root = Path.cwd().parent.parent
sys.path.append(str(project_root))
from ingest.ohlcv.queries import load_ohlcv
from ingest.ohlcv.utils import get_ibex_tickers, get_macro_tickers
from models.trees.features import build_features

In [2]:
# --- Per-stock OHLCV -------------------------------------------------------
micro = get_ibex_tickers()
df_micro = load_ohlcv(micro)
# Rows with zero volume (e.g. on Christmas, caused by API errors) are discarded;
# only strictly positive volume is kept.
df_micro = df_micro.loc[df_micro["volume"].gt(0)]
print(df_micro.head(1))

# --- Macro OHLCV -----------------------------------------------------------
macro = get_macro_tickers()
df_macro = load_ohlcv(macro)
# Same clean-up, but restricted to the two index tickers: any other ticker
# keeps its zero-volume rows (VIX has no volume).
df_macro = df_macro.loc[
    ~(df_macro["ticker"].isin(["^IBEX", "^GSPC"]) & df_macro["volume"].eq(0))
]
print(df_macro.head(1))

   ticker       date      open      high       low     close    volume
0  ACS.MC 2006-01-02  8.884718  9.038241  8.835722  8.966379  772525.0
  ticker       date         open         high         low        close  \
0  ^GSPC 2006-01-03  1248.290039  1270.219971  1245.73999  1268.800049   

         volume  
0  2.554570e+09  


In [3]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() appended a stray "None" line
# after each report.
df_micro.info()
df_macro.info()

<class 'pandas.core.frame.DataFrame'>
Index: 126939 entries, 0 to 127045
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   ticker  126939 non-null  object        
 1   date    126939 non-null  datetime64[ns]
 2   open    126939 non-null  float64       
 3   high    126939 non-null  float64       
 4   low     126939 non-null  float64       
 5   close   126939 non-null  float64       
 6   volume  126939 non-null  float64       
dtypes: datetime64[ns](1), float64(5), object(1)
memory usage: 7.7+ MB
None
<class 'pandas.core.frame.DataFrame'>
Index: 15183 entries, 0 to 15235
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   ticker  15183 non-null  object        
 1   date    15183 non-null  datetime64[ns]
 2   open    15183 non-null  float64       
 3   high    15183 non-null  float64       
 4   low     15183 non-null  float64       
 5 

In [4]:
# Summary statistics for the per-stock frame, then for the macro frame.
for _frame in (df_micro, df_macro):
    print(_frame.describe())

                                date           open           high  \
count                         126939  126939.000000  126939.000000   
mean   2016-10-15 09:20:35.279937536      17.017770      17.246200   
min              2006-01-02 00:00:00       0.194999       0.202737   
25%              2012-01-10 00:00:00       3.252324       3.292067   
50%              2017-02-10 00:00:00       6.675954       6.754276   
75%              2021-09-22 00:00:00      15.910677      16.062241   
max              2026-02-03 00:00:00    1394.518603    1419.652436   
std                              NaN      57.963333      58.971171   

                 low          close        volume  
count  126939.000000  126939.000000  1.269390e+05  
mean       16.758923      17.001881  1.091891e+07  
min         0.193452       0.197011  1.730000e+02  
25%         3.207860       3.252095  7.029240e+05  
50%         6.595791       6.673171  1.935579e+06  
75%        15.737675      15.904120  8.505541e+06  
max  

In [5]:
# Build the modelling table from the per-stock and macro frames.
# Later cells read the `target` and `date` columns plus the feature columns
# from `df_final`.
# NOTE(review): `horizon=1` presumably sets how far ahead the label looks -
# confirm in models.trees.features.build_features.
df_final = build_features(df_micro, df_macro, horizon=1)

In [None]:
# Build the feature matrix X and the label vector y from df_final.
#
# NaN / inf values come from divisions in the features (especially on early
# rows). Current policy: drop every row that has a missing or non-finite
# feature value.
#
# Alternative imputation ideas that used to live here as inert string blocks
# (kept as notes only, never executed):
#   * cross-sectional fill: group X by date and fill each column's NaNs with
#     that group's median
#     (groupby(date).transform(lambda x: x.fillna(x.median())));
#   * fill only ["ibx_breadth", "ibx_breadth_10d"] with the per-date median,
#     drop NaNs from the remaining columns per ticker, then apply the same
#     "all columns present" row mask to both X and y.

# Identifier, raw price/volume and label columns that must not be used as
# model inputs.
remove_cols = ["ticker","date","open","high","low","close","ibx_close", "sp_close", "vix_close", "volume","target","future_log_ret"]
X = df_final.drop(columns=remove_cols)
cols = X.columns
print(X.shape)
print(X.head(20))
print(len(cols), cols)

# Turn +/-inf into NaN so that a single mask removes both kinds of bad value.
X = X.replace([np.inf, -np.inf], np.nan)
mask = X.notna().all(axis=1)
X = X[mask]
print(X.shape)

# Apply the same row mask to the label so X and y stay row-aligned.
# NOTE(review): this does not check `target` itself for missing values - confirm
# how build_features labels rows whose future return is unknown.
y = df_final["target"][mask]
assert X.index.equals(y.index), "X and y must cover exactly the same rows"

(126939, 39)
    log_ret_1  log_ret_3  log_ret_5  log_ret_10  log_ret_20  ret_mean_5  \
0         NaN        NaN        NaN         NaN         NaN         NaN   
1    0.005087        NaN        NaN         NaN         NaN         NaN   
2    0.000000        NaN        NaN         NaN         NaN         NaN   
3   -0.003267   0.001820        NaN         NaN         NaN         NaN   
4    0.002179  -0.001089        NaN         NaN         NaN         NaN   
5   -0.002907  -0.003995   0.001092         NaN         NaN    0.000218   
6    0.016243   0.015515   0.012247         NaN         NaN    0.002449   
7    0.002860   0.016196   0.015108         NaN         NaN    0.003022   
8   -0.007526   0.011577   0.010850         NaN         NaN    0.002170   
9   -0.001366  -0.006031   0.007304         NaN         NaN    0.001461   
10  -0.013148  -0.022040  -0.002937   -0.001845         NaN   -0.000587   
11  -0.008863  -0.023377  -0.028042   -0.015795         NaN   -0.005608   
12   0.01143

In [17]:
# Display the feature matrix X as this cell's output.
X

Unnamed: 0,log_ret_1,log_ret_3,log_ret_5,log_ret_10,log_ret_20,ret_mean_5,vol_5,vol_ratio_5_20,atr_pct,sma_ratio_5_20,...,sp_vol_20,ibx_vol_ratio_10_60,sp_vol_ratio_20_100,vix_chg_z_5,vix_pctile_250,rel_ret_5,rel_ret_20,rel_vol_20,ibx_breadth,ibx_breadth_10d


In [13]:
# Sanity checks on the feature matrix before modelling:
# 1) no cell is missing, 2) every value is finite (no +/-inf left).
assert not X.isna().to_numpy().any()
assert np.all(np.isfinite(X.to_numpy()))

Proposed ensemble weighting:

- 0.5 × LightGBM
- 0.3 × CatBoost
- 0.2 × XGBoost


In [None]:
# Nested walk-forward evaluation of a random forest.
#   outer loop: expanding-window folds over unique dates -> out-of-sample accuracy
#   inner loop: GridSearchCV with TimeSeriesSplit on the training rows -> hyper-parameters
RANDOM_STATE = 42  # fixed so the forest fits are repeatable between runs

# One date per row of X. X keeps df_final's index labels after the NaN filter,
# so those labels are used to look the dates up.
row_dates = df_final.loc[X.index, "date"].to_numpy()
assert len(row_dates) == len(X) == len(y), "X, y and dates must be row-aligned"
unique_dates = np.unique(row_dates)  # np.unique returns the values sorted

tscv = TimeSeriesSplit(n_splits=5)

param_grid = {
    "n_estimators": [300, 600],
    "max_depth": [5, 7, 10],
    "max_features": ["sqrt", 0.5],
    "min_samples_leaf": [1, 5, 10],
}

model = RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1)

search = GridSearchCV(
    model,
    param_grid,
    cv=tscv,
    scoring="accuracy",
)

# Outer folds are defined on dates, not on row positions: the frame holds
# several tickers, so consecutive rows are not guaranteed to be consecutive
# in time.
outer_forward_roll = TimeSeriesSplit(n_splits=5).split(unique_dates)

fold_scores = []
for fold, (train_index, test_index) in enumerate(outer_forward_roll):
    # train_index / test_index are positions into unique_dates; map them to
    # the positions of the rows that fall on those dates.
    train_pos = np.flatnonzero(np.isin(row_dates, unique_dates[train_index]))
    test_pos = np.flatnonzero(np.isin(row_dates, unique_dates[test_index]))
    # The inner TimeSeriesSplit cuts by row position, so the training rows are
    # put in date order first.
    train_pos = train_pos[np.argsort(row_dates[train_pos], kind="stable")]

    # Positional selection: plain X[...] / y[...] would treat the integers as
    # column / index labels.
    X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
    y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    preds = best_model.predict(X_test)
    fold_accuracy = float(np.mean(preds == y_test.to_numpy()))
    fold_scores.append(fold_accuracy)
    print(f"Fold {fold}: accuracy={fold_accuracy:.4f} | best params={search.best_params_}")

print(f"Mean out-of-sample accuracy: {np.mean(fold_scores):.4f}")

In [None]:
## Proper implementation: date-based walk-forward split with purging (no time leakage)
# TimeSeriesSplit is already imported in the notebook's first cell.

# Number of leading test dates dropped from every fold.
# NOTE(review): df_final was built with horizon=1 while H is 7 - confirm that
# the wider purge is intended.
H = 7

# Dates are taken for exactly the rows that survived the NaN filter, so the
# masks below line up with X and y row for row. (`df_feat` is not defined in
# this notebook; `df_final` is the frame X and y were derived from.)
row_dates = df_final.loc[X.index, "date"]
assert len(row_dates) == len(X) == len(y), "X, y and dates must be row-aligned"
dates = row_dates.sort_values().unique()

tscv = TimeSeriesSplit(n_splits=5)

for fold, (train_idx, test_idx) in enumerate(tscv.split(dates)):
    train_dates = dates[train_idx]

    # PURGE to avoid horizon leakage: skip the first H dates of the test window.
    test_dates = dates[test_idx][H:]
    if len(test_dates) == 0:
        # The test window was not longer than H, so nothing is left to evaluate.
        print(f"Fold {fold}: skipped (test window not longer than H={H})")
        continue

    # Plain boolean arrays: selection is positional and needs no index alignment.
    train_mask = row_dates.isin(train_dates).to_numpy()
    test_mask = row_dates.isin(test_dates).to_numpy()

    X_train = X[train_mask]
    y_train = y[train_mask]
    X_test = X[test_mask]
    y_test = y[test_mask]

    print(f"Fold {fold}:",
          train_dates[0], "→", train_dates[-1],
          "| test:", test_dates[0], "→", test_dates[-1])
