In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

import sys
from pathlib import Path

project_root = Path.cwd().parent.parent
sys.path.append(str(project_root))
from ingest.ohlcv.queries import load_ohlcv
from ingest.ohlcv.utils import get_ibex_tickers, get_macro_tickers
from models.trees.features import build_features

In [None]:
micro = get_ibex_tickers()
df_micro = load_ohlcv(micro)
# Some days have 0 volume e.g on Christmas (API errors)
df_micro = df_micro[df_micro["volume"] > 0]
print(df_micro.head(1))

macro = get_macro_tickers()
df_macro = load_ohlcv(macro)
# Same here (although VIX has no volume) #FIX IBEX TOO DOESNT HAVE VOLUME MOST OF  TIME
df_macro = df_macro[~(
    ((#df_macro["ticker"] == "^IBEX") & (df_macro["volume"] == 0)) |
    ((df_macro["ticker"] == "^GSPC") & (df_macro["volume"] == 0))
)]
print(df_macro.head(1))

   ticker       date      open      high       low     close    volume
0  ACS.MC 2006-01-02  8.884718  9.038241  8.835722  8.966379  772525.0
  ticker       date         open         high         low        close  \
0  ^GSPC 2006-01-03  1248.290039  1270.219971  1245.73999  1268.800049   

         volume  
0  2.554570e+09  


In [3]:
print(df_micro.info())
print(df_macro.info())

<class 'pandas.core.frame.DataFrame'>
Index: 126939 entries, 0 to 127045
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   ticker  126939 non-null  object        
 1   date    126939 non-null  datetime64[ns]
 2   open    126939 non-null  float64       
 3   high    126939 non-null  float64       
 4   low     126939 non-null  float64       
 5   close   126939 non-null  float64       
 6   volume  126939 non-null  float64       
dtypes: datetime64[ns](1), float64(5), object(1)
memory usage: 7.7+ MB
None
<class 'pandas.core.frame.DataFrame'>
Index: 15183 entries, 0 to 15235
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   ticker  15183 non-null  object        
 1   date    15183 non-null  datetime64[ns]
 2   open    15183 non-null  float64       
 3   high    15183 non-null  float64       
 4   low     15183 non-null  float64       
 5 

In [None]:
from models.trees.features import macro_features

df = macro_features(df_macro)
df.shape
df.tail(150)

  macro["vix_chg_1"] = macro["vix_close"].pct_change()


ticker,date,sp_close,ibx_close,vix_close,ibx_log_ret_1,ibx_vol_10,ibx_vol_60,ibx_vol_ratio_10_60,sp_log_ret_1,sp_vol_20,sp_vol_100,sp_vol_ratio_20_100,vix_chg_1,vix_chg_z_5,vix_pctile_250
5032,2025-07-03,6279.350098,14182.900391,16.379999,0.009799,0.008515,,,0.008304,,,,-0.015625,-0.875484,
5033,2025-07-04,,13973.000000,,-0.014910,0.010040,,,,,,,0.000000,0.000000,
5034,2025-07-07,6229.979980,14074.799805,17.790001,0.007259,0.010212,,,,,,,0.086081,2.062372,
5035,2025-07-08,6225.520020,14079.500000,16.809999,0.000334,0.009243,,,-0.000716,,,,-0.055087,-1.059529,
5036,2025-07-09,6263.259766,14254.400391,15.940000,0.012346,0.007928,,,0.006044,,,,-0.051755,-0.904286,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5177,2026-01-28,6978.029785,,16.350000,,,,,-0.000082,0.006755,0.007173,0.941760,0.000000,0.000000,
5178,2026-01-29,6969.009766,,16.879999,,,,,-0.001293,0.006754,0.007165,0.942617,0.032416,2.222649,
5179,2026-01-30,6939.029785,,17.440001,,,,,-0.004311,0.006598,0.007182,0.918724,0.033175,2.115675,
5180,2026-02-02,6976.439941,,16.340000,,,,,0.005377,0.006677,0.007194,0.928120,-0.063073,-1.597102,


In [4]:
print(df_micro.describe())
print(df_macro.describe())

                                date           open           high  \
count                         126939  126939.000000  126939.000000   
mean   2016-10-15 09:20:35.279937536      17.017770      17.246200   
min              2006-01-02 00:00:00       0.194999       0.202737   
25%              2012-01-10 00:00:00       3.252324       3.292067   
50%              2017-02-10 00:00:00       6.675954       6.754276   
75%              2021-09-22 00:00:00      15.910677      16.062241   
max              2026-02-03 00:00:00    1394.518603    1419.652436   
std                              NaN      57.963333      58.971171   

                 low          close        volume  
count  126939.000000  126939.000000  1.269390e+05  
mean       16.758923      17.001881  1.091891e+07  
min         0.193452       0.197011  1.730000e+02  
25%         3.207860       3.252095  7.029240e+05  
50%         6.595791       6.673171  1.935579e+06  
75%        15.737675      15.904120  8.505541e+06  
max  

In [5]:
df_final = build_features(df_micro, df_macro, horizon=1)

  macro["vix_chg_1"] = macro["vix_close"].pct_change()


In [None]:
from models.trees.features import safe_build_features

df_final = safe_build_features(df_micro, horizon=1)

In [7]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126939 entries, 0 to 126938
Data columns (total 50 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   ticker           126939 non-null  object        
 1   date             126939 non-null  datetime64[ns]
 2   open             126939 non-null  float64       
 3   high             126939 non-null  float64       
 4   low              126939 non-null  float64       
 5   close            126939 non-null  float64       
 6   volume           126939 non-null  float64       
 7   log_ret_1        126909 non-null  float64       
 8   log_ret_3        126849 non-null  float64       
 9   log_ret_5        126789 non-null  float64       
 10  log_ret_10       126639 non-null  float64       
 11  log_ret_20       126342 non-null  float64       
 12  ret_mean_5       126789 non-null  float64       
 13  vol_5            126789 non-null  float64       
 14  vol_10           126

In [9]:
# Handle Nan / inf -> divisions  (especially from early rows)
"""  
Cross-sectional fill
X = (
    X
    .groupby(micro_features["date"])
    .transform(lambda x: x.fillna(x.median()))
)

"""
remove_cols = ["ticker","date","open","high","low","close","ibx_close", "sp_close", "vix_close", "volume","target","future_log_ret"]
#X = df_final.drop(columns=remove_cols)
X = df_final
cols = X.columns
print(X.shape)
print(X.head(20)) 
print(len(cols), cols)
X = X.replace([np.inf, -np.inf], np.nan)
"""
cross_cols = ["ibx_breadth","ibx_breadth_10d"]
X[cross_cols] = X.groupby(df_final["date"])[cross_cols].transform(lambda x: x.fillna(x.median()))
micro_cols = X.columns.difference(cross_cols)
X[micro_cols] = X[micro_cols].groupby(df_final["ticker"]).apply(lambda x: x.dropna())
mask = X.notna().all(axis=1)
X = X[mask]
y = y[mask]
"""
mask = X.notna().all(axis=1)
X = X[mask]
print(X.shape) 

y = df_final["target"]
y = y[mask]

(126939, 50)
    ticker       date      open      high       low     close     volume  \
0   ACS.MC 2006-01-02  8.884718  9.038241  8.835722  8.966379   772525.0   
1   ACS.MC 2006-01-03  8.982714  9.038243  8.966381  9.012112  1205927.0   
2   ACS.MC 2006-01-04  9.080707  9.080707  8.963115  9.012112  1324707.0   
3   ACS.MC 2006-01-05  9.038243  9.038243  8.963115  8.982714  1448718.0   
4   ACS.MC 2006-01-09  8.982709  9.005574  8.943511  9.002307  7665279.0   
5   ACS.MC 2006-01-10  8.999044  9.012109  8.936981  8.976178  8219878.0   
6   ACS.MC 2006-01-11  8.976175  9.123165  8.976175  9.123165  1187792.0   
7   ACS.MC 2006-01-12  9.070904  9.159098  9.070904  9.149299  1926726.0   
8   ACS.MC 2006-01-13  9.162365  9.175431  9.064372  9.080704  3773883.0   
9   ACS.MC 2006-01-16  9.081467  9.081467  9.028821  9.068305  1675277.0   
10  ACS.MC 2006-01-17  8.982759  9.032114  8.933403  8.949855  2760942.0   
11  ACS.MC 2006-01-18  8.887338  8.887338  8.759013  8.870886  3211432.0   

In [11]:
print(df_final.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126939 entries, 0 to 126938
Data columns (total 66 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   ticker               126939 non-null  object        
 1   date                 126939 non-null  datetime64[ns]
 2   open                 126939 non-null  float64       
 3   high                 126939 non-null  float64       
 4   low                  126939 non-null  float64       
 5   close                126939 non-null  float64       
 6   volume               126939 non-null  float64       
 7   log_ret_1            126909 non-null  float64       
 8   log_ret_3            126849 non-null  float64       
 9   log_ret_5            126789 non-null  float64       
 10  log_ret_10           126639 non-null  float64       
 11  log_ret_20           126342 non-null  float64       
 12  ret_mean_5           126789 non-null  float64       
 13  vol_5         

In [9]:
X

Unnamed: 0,log_ret_1,log_ret_3,log_ret_5,log_ret_10,log_ret_20,ret_mean_5,vol_5,vol_10,vol_20,vol_ratio_5_20,...,ibx_vol_ratio_10_60,sp_log_ret_1,sp_vol_20,sp_vol_100,sp_vol_ratio_20_100,vix_chg_1,vix_chg_z_5,vix_pctile_250,ibx_breadth,ibx_breadth_10d


In [13]:
assert not X.isna().any().any()
assert np.isfinite(X.to_numpy()).all()

0.5 * LightGBM
0.3 * CatBoost
0.2 * XGBoost


In [None]:
tscv = TimeSeriesSplit(n_splits=5)


param_grid = {
    "n_estimators": [300, 600],
    "max_depth": [5, 7, 10],
    "max_features": ["sqrt", 0.5],
    "min_samples_leaf": [1, 5, 10],

}

model = RandomForestClassifier()

search = GridSearchCV(
    model,
    param_grid,
    cv=tscv,
    scoring="accuracy",
)

for train_index, test_index in outer_forward_roll:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    search.fit(X_train, y_train) 
    best_model = search.best_estimator_

    preds = best_model.predict(X_test)

In [None]:
## PROPER IMPLEEMNTION NO TIME DATA LEAKAGE
from sklearn.model_selection import TimeSeriesSplit

H = 7
dates = df_feat["date"].sort_values().unique()

tscv = TimeSeriesSplit(n_splits=5)

for fold, (train_idx, test_idx) in enumerate(tscv.split(dates)):
    train_dates = dates[train_idx]
    test_dates  = dates[test_idx]

    # PURGE to avoid horizon leakage
    test_dates = test_dates[H:]

    train_mask = df_feat["date"].isin(train_dates)
    test_mask  = df_feat["date"].isin(test_dates)

    X_train = X[train_mask]
    y_train = y[train_mask]
    X_test  = X[test_mask]
    y_test  = y[test_mask]

    print(f"Fold {fold}:",
          train_dates[0], "→", train_dates[-1],
          "| test:", test_dates[0], "→", test_dates[-1])
