In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score

import numpy as np
import polars as pl
import numpy as np
from datetime import date
from tqdm.notebook import tqdm
import itertools

import warnings
warnings.filterwarnings("ignore")

In [2]:
def add_features(df: pl.DataFrame, columnas: list) -> pl.DataFrame:        
    """... (código original sin cambios) ..."""
    # Lags para features de mercado
    for column in columnas:
        if column in ['date', 'price']:
            continue
        df = df.with_columns(pl.col(column).shift(1).alias(f'{column}_lag_1'))

    # **CALCULAR _change_1d
    for column in columnas:
        if column in ['date', 'total_volume']:
            continue
        df = df.with_columns(((pl.col(column) - pl.col(column).shift(1)) / pl.col(column).shift(1)).alias(f"{column}_change_1d")  )     

    # Ratios intermercado
    for column in columnas:
        if column in ['date', 'price', 'total_volume']:
            continue
        df = df.with_columns((pl.col("price") / pl.col(column)).alias(f"btc_{column}_ratio"))

    df = df.with_columns([
        pl.when(pl.col("price_change_1d") > 0)
          .then(1)
          .when(pl.col("price_change_1d") < 0)
          .then(-1)
          .otherwise(0)
          .alias("price_direction_1d"),
    ])
    # Features temporales básicas
    df = df.with_columns([
        pl.col("date").dt.year().alias("year"),
        pl.col("date").dt.month().alias("month"),
        pl.col("date").dt.day().alias("day"),
        pl.col("date").dt.weekday().alias("weekday"),
    ])
    # Lags de precio Variados
    lags = [15, 30, 45, 60, 75, 90]
    for lag in lags:
        df = df.with_columns([
            # Cambios en lags propuestos
            pl.col("price").shift(lag).alias(f"btc_lag_{lag}"),
        ])
    # Rolling statistics de precio
    windows = [3, 7, 14, 21, 30]
    for window in windows:
        df = df.with_columns([
            pl.col("price").rolling_std(window).alias(f"btc_std_{window}"),
        ])
    windows = [30, 60, 90]
    for window in windows:
        df = df.with_columns([
            # Medias de los Instrumentos en Ciertas Ventanas
            pl.col("price").rolling_mean(window).alias(f"price_ma_{window}"),
            # Momentum adicional basado en precio (no confundir con price_change_1d)
            ((pl.col("price") / pl.col(f"btc_lag_{window}")) - 1).alias(f"btc_momentum_{window}d"),
        ])
    # **TARGET: precio de mañana (SIEMPRE AL FINAL)**
    df = df.with_columns(pl.col("price").shift(-1).alias("price_tomorrow"))
    df = df.with_columns(
        pl.when((pl.col('price_tomorrow') - pl.col('price')) > 0)
        .then(1)
        .when((pl.col('price_tomorrow') - pl.col('price')) < 0)
        .then(-1)
        .otherwise(0)
        .alias('target_direction')
    )
    # Eliminar NaNs (ÚLTIMO PASO)
    max_offset = max(max(lags, default=0), max(windows, default=0), 1)
    return (df.slice(max_offset, df.shape[0] - max_offset))

In [3]:
def temporal_split(df: pl.DataFrame, target: str, test_size: float = 0.4, fecha_corte: date = date(2024, 12, 31)) \
        -> tuple[pl.DataFrame, pl.DataFrame, np.array, np.array, np.array, np.array, pl.DataFrame, pl.DataFrame, np.array, np.array]:
    """"""
    # División
    df_trainval = df.filter(pl.col("date") <= fecha_corte)
    df_future = df.filter(pl.col("date") > fecha_corte)

    # Ordenar por fecha
    df_trainval = df_trainval.sort("date")
    df_future = df_future.sort("date")
    # Split temporal (no aleatorio)
    split_idx = int(len(df_trainval) * (1 - test_size))

    # Obtengo los DataFrames de Train y Test (dentro de df_trainval que es la única data que podrá tener contacto con el Modelo)
    df_train = df_trainval.slice(0, split_idx)
    df_test = df_trainval.slice(split_idx, len(df_trainval) - split_idx)

    # Columnas predictoras (excluimos 'date', 'price_tomorrow', 'price_direction')
    feature_cols = [col for col in df_train.columns if col not in ["date", target, 'price_tomorrow']]

    # Convertir a numpy
    X_train = df_train.select(feature_cols).to_numpy()
    y_train = df_train.select(target).to_numpy().flatten()

    X_test = df_test.select(feature_cols).to_numpy()
    y_test = df_test.select(target).to_numpy().flatten()

    X_test_future = df_future.select(feature_cols).to_numpy()
    y_test_future = df_future.select(target).to_numpy().flatten()

    return df_trainval, df_future, X_train, y_train, X_test, y_test, df_test, df_train, X_test_future, y_test_future

In [4]:
def evaluar_modelo_clasificacion(preds, y_true, df_test, transaction_cost_pct=0.001) -> dict:
    df_test = df_test.sort('date')
    prices_hoy = df_test['price'].to_numpy()
    prices_tomorrow = df_test['price_tomorrow'].to_numpy()

    aciertos_direccion = 0
    retornos_porcentuales = []
    tasas_libre_riesgos = []

    for i in range(len(preds)):
        retorno = abs(prices_tomorrow[i] - prices_hoy[i])
        tasa_libre = prices_hoy[i] * transaction_cost_pct
        tasas_libre_riesgos.append(tasa_libre)

        if preds[i] == y_true[i]:
            aciertos_direccion += 1
            retornos_porcentuales.append((retorno - tasa_libre) / prices_hoy[i])
        else:
            retornos_porcentuales.append(-(retorno + tasa_libre) / prices_hoy[i])

    exce_retorno = np.array(retornos_porcentuales) - np.array(tasas_libre_riesgos)
    sharpe_ratio = np.mean(exce_retorno) / np.std(exce_retorno)
    directional_accuracy = aciertos_direccion / len(preds)
    cumulative_return = np.prod(1 + np.array(retornos_porcentuales)) - 1
    cagr = (1 + cumulative_return)**(365 / len(preds)) - 1

    return {
        "directional_accuracy": directional_accuracy,
        "sharpe_ratio": sharpe_ratio,
        "cumulative_return": cumulative_return,
        "Compound_Annual_Growth_Rate": cagr,
        "n_evaluated_days": len(preds),
    }

In [5]:
def entrena_evalua_lightgbm(params: dict,
                            X_train: np.array,
                            y_train: np.array,
                            X_test: np.array,
                            y_test: np.array,
                            df_test: pl.DataFrame,
                            X_test_future: np.array,
                            y_test_future: np.array,
                            df_future: pl.DataFrame,
                            transaction_cost_pct: float = 0.001,
                            iteracion: int = 0,
                            return_model: bool = False) -> tuple[dict, dict]:

    model = LGBMClassifier(random_state=42, **params)
    model.fit(X_train, y_train)

    if return_model:
        return model

    preds_test = model.predict(X_test)
    preds_future = model.predict(X_test_future)

    metricas_test = evaluar_modelo_clasificacion(preds_test, y_test, df_test, transaction_cost_pct)
    metricas_future = evaluar_modelo_clasificacion(preds_future, y_test_future, df_future, transaction_cost_pct)

    iteraciones_dict = {"iteracion": iteracion}

    return metricas_test | params | iteraciones_dict, metricas_future | params | iteraciones_dict, model



***
***
## Empieza el Flujo

In [6]:
df = pl.read_parquet("db/db.parquet")
target = 'target_direction'

# ['date', 'price', 'total_volume', 'market_cap', 'price_gold', 'stock_index_dowjones', 'stock_index_sp500', 'rate_US10Y', 'stock_index_ni225']
columns = ['date', 'price', 'total_volume', 'stock_index_sp500', 'price_gold', 'rate_US10Y', 'stock_index_ni225']
df = df.select(columns)

# AÑADO MULTIPLES FEATURES CREADAS A PARTIR DE LOS PRECIOS DE LOS DISTINTOS INSTRUMENTOS Y SUS RELACIONES
df = df.pipe(add_features, columns)

# DESCOMPOSICION DE LOS DATOS EN SET DE DATOS PARA ENTRENAR, TESTEAR Y LUEGO PROBRA CON OTRO SET (PARA PROBAR CON DATOS QUE EL MODELO NUNCA HAYA VISTO)
df_trainval, df_future, X_train, y_train, X_test, y_test, df_test, df_train, X_test_future, y_test_future = df.pipe(temporal_split, target)

In [7]:
n_estimators = [200, 300, 400, 500, 600]
max_depth = [2, 3, 4]
learning_rates = [0.1, 0.2]
subsamples = [0.8, 0.9]
colsamples_bytree = [0.8, 0.9]


# Calcular total de combinaciones
total_combinaciones = len(n_estimators) * len(max_depth) * len(learning_rates) * len(subsamples) * len(colsamples_bytree)

print(f"Probando {total_combinaciones} combinaciones...")

data_test = []
data_real_2025 = []
modelos = []
combinaciones = list(itertools.product(n_estimators, max_depth, learning_rates, subsamples, colsamples_bytree))

iteracion = 0
for estimator, depth, lr, subsample, colsample in tqdm(combinaciones, desc="Entrenando modelos"):
    params = {
        "n_estimators": estimator,
        "max_depth": depth,
        "learning_rate": lr,
        "subsample": subsample,
        "colsample_bytree": colsample,
        "verbose": -1,           # Silencia logs
        "verbosity": -1,         # Silencia logs (redundante pero seguro)
        "log_level": "fatal"     # Solo muestra errores críticos
    }

    datos_dict_test, datos_dict_real, modelo = entrena_evalua_lightgbm(
        params=params,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        df_test=df_test,
        X_test_future=X_test_future,
        y_test_future=y_test_future,
        df_future=df_future,
        iteracion=iteracion,
    )

    data_test.append(datos_dict_test)
    data_real_2025.append(datos_dict_real)
    modelos.append(modelo)
    iteracion += 1

df_parametros_data_test = pl.DataFrame(data_test)
df_parametros_data_2025 = pl.DataFrame(data_real_2025)


Probando 120 combinaciones...


Entrenando modelos:   0%|          | 0/120 [00:00<?, ?it/s]

  File "d:\01_practica\pro-bitcoin\modelos\venv\lib\site-packages\joblib\externals\loky\backend\context.py", line 247, in _count_physical_cores
    cpu_count_physical = _count_physical_cores_win32()
  File "d:\01_practica\pro-bitcoin\modelos\venv\lib\site-packages\joblib\externals\loky\backend\context.py", line 299, in _count_physical_cores_win32
    cpu_info = subprocess.run(
  File "C:\Users\alisk\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 503, in run
    with Popen(*popenargs, **kwargs) as process:
  File "C:\Users\alisk\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 971, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\alisk\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 1456, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


In [None]:
'''

directional_accuracy   0.525547   con [price, total_volume, stock_index_sp500, price_gold, rate_US10Y, stock_index_ni225]   

directional_accuracy   0.543796   con [price, rate_US10Y, price_gold, total_volume]

directional_accuracy   0.547445   con [price, rate_US10Y, price_gold]

directional_accuracy   0.529197   con [price, total_volume, stock_index_sp500]

directional_accuracy   0.50365   con [price, stock_index_sp500]

directional_accuracy   0.532847   con [price, total_volume]

directional_accuracy   0.521898   con [price]

'''

'\ndirectional_accuracy    ########   y   ########   con [price, total_volume, stock_index_sp500, price_gold, rate_US10Y, stock_index_dowjones, stock_index_ni225]\n\ndirectional_accuracy    ########   y   ########   con [price, total_volume, stock_index_sp500, price_gold, rate_US10Y, stock_index_ni225]      *** SELECCIONADOS\ndirectional_accuracy    ########   y   ########   con [price, total_volume, stock_index_sp500, price_gold, rate_US10Y, stock_index_dowjones]\n\ndirectional_accuracy    ########   y   ########   con [price, total_volume, stock_index_sp500, price_gold, rate_US10Y]\n\ndirectional_accuracy    ########   y   0.543796   con [price, rate_US10Y, price_gold, total_volume]\n\ndirectional_accuracy    ########   y   0.547445   con [price, rate_US10Y, price_gold]\n\ndirectional_accuracy    ########   y   0.529197   con [price, total_volume, stock_index_sp500]\n\ndirectional_accuracy    ########   y   0.50365   con [price, stock_index_sp500]\n\ndirectional_accuracy    ######## 

In [9]:
df_parametros_data_2025.sort("directional_accuracy", descending=True).head(1)

directional_accuracy,sharpe_ratio,cumulative_return,Compound_Annual_Growth_Rate,n_evaluated_days,n_estimators,max_depth,learning_rate,subsample,colsample_bytree,verbose,verbosity,log_level,iteracion
f64,f64,f64,f64,i64,i64,i64,f64,f64,f64,i64,i64,str,i64
0.525547,,,,274,300,4,0.2,0.8,0.9,-1,-1,"""fatal""",45


### Recuperando el Modelo con el mejor `directional_accuracy`

In [10]:
parametros_data_2025 = df_parametros_data_2025.sort("directional_accuracy", descending=True).head(1).to_dicts()[0]
modelo_comprobacion = modelos[parametros_data_2025['iteracion']]

***
***
### Comprobación

In [11]:
import copy

def comprobacion(model):

    # Columnas Features
    features_pytorch = [col for col in df_future.columns if col not in ["date", 'target_direction', 'price_tomorrow']]

    # Modelo tal cual sera utilizado en Produccion
    xgboost_model = model
 
    X_input_xgboost = X_test_future
    
    # Inferencia 
    preds = model.predict(X_input_xgboost)
    y_test = copy.deepcopy(y_test_future)

    aciertos_direccion = 0
    for i in range(len(preds)):
        if preds[i] == y_test[i]:
            aciertos_direccion += 1
  
    directional_accuracy = aciertos_direccion / len(preds)
    return directional_accuracy


directional_accuracy = comprobacion(modelo_comprobacion)
print(f"directional_accuracy --> {directional_accuracy}")

directional_accuracy --> 0.5255474452554745


***
***
### 💾 Guardando los Modelos

In [None]:
import joblib

joblib.dump(modelo_comprobacion, "models/lightgbm_model_data_2025.pkl")