In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import make_scorer
from scipy.stats import spearmanr
import pickle
from pathlib import Path

# ==================== 1. CARGAR DATOS ====================
# Usamos el dataset V6 (Strict Mode) que generamos antes
INPUT_FILE = '../data/processed/f1_features_complete.csv'
df = pd.read_csv(INPUT_FILE)

# CR√çTICO: Ordenar por tiempo para que TimeSeriesSplit funcione
df = df.sort_values(['year', 'round']).reset_index(drop=True)

# Features del V6
feature_cols = [
    'pct_puntos_actual', 'pct_linear_points', 'posicion_media', # Ojo: usar nombre correcto de tu V6
    'tendencia', 'diff_con_lider_normalizada', 'progreso_temporada',
    'driver_quality_3y', 'team_avg_pos_3y', 'team_trend', 'fiabilidad_driver'
]
# Nota: Si en tu CSV las columnas se llaman diferente (ej: 'posicion_media' en vez de 'posicion_media_limpia'), 
# aj√∫stalo aqu√≠. Estoy asumiendo los nombres del V6.

target_col = 'pct_puntos_final'

# Validar columnas
available_cols = [c for c in feature_cols if c in df.columns]
if len(available_cols) < len(feature_cols):
    print(f"‚ö†Ô∏è Aviso: Faltan columnas. Usando: {available_cols}")
    feature_cols = available_cols

# ==================== 2. DEFINIR LAS 3 FASES (DATASETS SEPARADOS) ====================
# Separamos los datos pero MANTENEMOS el orden temporal dentro de cada fase
fases_data = {
    'EARLY': df[df['round'] <= 5],
    'MID':   df[(df['round'] > 5) & (df['round'] <= 12)],
    'LATE':  df[df['round'] > 12]
}

# ==================== 3. CONFIGURACI√ìN DE LA B√öSQUEDA ====================
tscv = TimeSeriesSplit(n_splits=5)

# M√©trica: Spearman (Queremos el mejor ranking, no el menor error num√©rico)
def spearman_scorer(y_true, y_pred):
    return spearmanr(y_true, y_pred)[0]

custom_scorer = make_scorer(spearman_scorer, greater_is_better=True)

# Grid de Hiperpar√°metros a explorar
# Incluimos rangos amplios para que cada fase "elija" su personalidad
param_grid = {
    'n_estimators': [50, 100, 150, 200, 300],
    'max_depth': [2, 3, 4, 5, 6],          # Early preferir√° bajos, Late altos
    'learning_rate': [0.01, 0.05, 0.08, 0.1, 0.15],
    'subsample': [0.7, 0.8, 0.9, 0.95],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8],
    'reg_alpha': [0, 0.1, 0.5, 1.0, 5.0],  # Regularizaci√≥n L1 (Clave para Early)
    'reg_lambda': [0, 0.1, 1.0, 5.0]       # Regularizaci√≥n L2
}

best_params_per_phase = {}

print("üèéÔ∏è INICIANDO OPTIMIZACI√ìN ESPECIALIZADA POR FASE (Time Series CV)...\n")

for fase_name, df_fase in fases_data.items():
    print(f"üîé Optimizando fase: {fase_name} ({len(df_fase)} filas)...")
    
    X = df_fase[feature_cols]
    y = df_fase[target_col]
    
    # Modelo base
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_jobs=-1, random_state=42)
    
    # B√∫squeda Aleatoria (M√°s eficiente que Grid total)
    search = RandomizedSearchCV(
        estimator=xgb_model,
        param_distributions=param_grid,
        n_iter=30,             # 30 combinaciones aleatorias por fase
        cv=tscv,               # Validaci√≥n Cruzada Temporal (ROBUSTEZ)
        scoring=custom_scorer, # Maximizamos Spearman
        verbose=1,
        n_jobs=-1,
        random_state=42
    )
    
    search.fit(X, y)
    
    best_params_per_phase[fase_name] = search.best_params_
    
    print(f"   ‚úÖ Mejor Spearman Promedio: {search.best_score_:.4f}")
    print(f"   ‚öôÔ∏è Config ganadora: {search.best_params_}\n")

# ==================== 4. RESUMEN FINAL ====================
print("="*80)
print("RESUMEN DE MOTORES OPTIMIZADOS PARA EL ENTRENAMIENTO FINAL (NB 5)")
print("="*80)

for fase, params in best_params_per_phase.items():
    print(f"\nparams_{fase.lower()} = {params}")

‚ö†Ô∏è Aviso: Faltan columnas. Usando: ['pct_puntos_actual', 'pct_linear_points', 'posicion_media', 'diff_con_lider_normalizada', 'progreso_temporada', 'driver_quality_3y', 'team_avg_pos_3y', 'team_trend']
üèéÔ∏è INICIANDO OPTIMIZACI√ìN ESPECIALIZADA POR FASE (Time Series CV)...

üîé Optimizando fase: EARLY (1891 filas)...
Fitting 5 folds for each of 30 candidates, totalling 150 fits


        nan 0.86320098 0.84770366 0.84865659 0.86611351 0.86329075
 0.83117546 0.86557011        nan 0.85588564 0.81154051 0.86112116
        nan 0.84864729 0.86820223 0.86207365 0.83500797 0.83539784
 0.87104286        nan 0.82377388 0.86296093        nan 0.85749054]


   ‚úÖ Mejor Spearman Promedio: 0.8710
   ‚öôÔ∏è Config ganadora: {'subsample': 0.8, 'reg_lambda': 5.0, 'reg_alpha': 0.1, 'n_estimators': 150, 'max_depth': 2, 'learning_rate': 0.01, 'colsample_bytree': 0.5}

üîé Optimizando fase: MID (2645 filas)...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
   ‚úÖ Mejor Spearman Promedio: 0.9550
   ‚öôÔ∏è Config ganadora: {'subsample': 0.95, 'reg_lambda': 0.1, 'reg_alpha': 0.1, 'n_estimators': 50, 'max_depth': 2, 'learning_rate': 0.1, 'colsample_bytree': 0.6}

üîé Optimizando fase: LATE (3043 filas)...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
   ‚úÖ Mejor Spearman Promedio: 0.9846
   ‚öôÔ∏è Config ganadora: {'subsample': 0.8, 'reg_lambda': 5.0, 'reg_alpha': 0, 'n_estimators': 150, 'max_depth': 6, 'learning_rate': 0.08, 'colsample_bytree': 0.8}

RESUMEN DE MOTORES OPTIMIZADOS PARA EL ENTRENAMIENTO FINAL (NB 5)

params_early = {'subsample': 0.8, 'reg_lambda': 5.0, 'reg_alpha': 0.1, 'n_estimators': 150, 'max_dept