In [None]:
# ============================================================================
# MODELO HÍBRIDO FINAL
# ============================================================================

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_absolute_error
from scipy.stats import spearmanr
import pickle
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# ==================== CONFIGURATION ====================

# Input feature table (CSV) and the directory where trained artifacts are written.
INPUT_FILE = "../data/processed/f1_features_complete.csv"
OUTPUT_DIR = "../models"

# Name of the regression target column.
target_col = "pct_puntos_final"

# Feature columns fed to every phase model, in this order.
feature_cols = [
    "pct_puntos_actual",
    "pct_linear_points",
    "tendencia_ultimas_3",
    "diff_con_lider_normalizada",
    "progreso_temporada",
    "driver_quality_3y",
    "team_avg_pos_3y",
    "team_trend",
]

# Opening banner: rule, title, rule (one line each).
print(
    "=" * 80,
    "üéØ OPTIMIZACI√ìN CORREGIDA - SIN DATA LEAKAGE",
    "=" * 80,
    sep="\n",
)

# ==================== LOAD AND TEMPORAL SPLIT ====================

# Read the feature table and order it chronologically (season, then round),
# discarding the original index.
df = (
    pd.read_csv(INPUT_FILE)
    .sort_values(['year', 'round'])
    .reset_index(drop=True)
)

# CRITICAL: split by season before anything else. Seasons up to 2022 are used
# for training and search; 2023 onwards is held out for the final evaluation.
df_train = df.loc[df['year'] <= 2022].copy()
df_test = df.loc[df['year'] >= 2023].copy()

print("\n‚úÖ Split temporal correcto:")
print(f"   Train: {len(df_train)} filas (2008-2022)")
print(f"   Test:  {len(df_test)} filas (2023-2024)")
print("\n‚ö†Ô∏è  Test set NUNCA se usa en entrenamiento u optimizaci√≥n")

# ==================== BUILD PHASES FROM TRAIN ONLY ====================

# Partition the training rows by round number:
# EARLY is round <= 5, MID is 5 < round <= 12, LATE is round > 12.
fases_data_train = {
    'EARLY': df_train.loc[df_train['round'].le(5)],
    'MID': df_train.loc[df_train['round'].gt(5) & df_train['round'].le(12)],
    'LATE': df_train.loc[df_train['round'].gt(12)],
}

# Report row count and the span of seasons present in each phase.
print("\nüìä Distribuci√≥n CORRECTA (solo train):")
for fase_name, df_fase in fases_data_train.items():
    print(
        f"   {fase_name:>5}: {len(df_fase):>5} filas | A√±os {df_fase['year'].min()}-{df_fase['year'].max()}"
    )

# ==================== HYPERPARAMETER GRIDS ====================
# One candidate grid per season phase. Compared with the later phases, the
# EARLY grid offers shallower trees, lower learning rates and larger
# regularisation values; the LATE grid offers the deepest trees.

param_grid_early = dict(
    n_estimators=[100, 150, 200, 300],
    max_depth=[2, 3, 4, 5],
    learning_rate=[0.01, 0.03, 0.05, 0.08],
    subsample=[0.7, 0.8, 0.9],
    colsample_bytree=[0.6, 0.7, 0.8],
    reg_alpha=[0.1, 0.5, 1.0, 2.0],
    reg_lambda=[1.0, 3.0, 5.0, 10.0],
)

param_grid_mid = dict(
    n_estimators=[100, 150, 200, 250],
    max_depth=[3, 4, 5, 6],
    learning_rate=[0.05, 0.08, 0.1, 0.12],
    subsample=[0.8, 0.9, 0.95],
    colsample_bytree=[0.7, 0.8, 0.9],
    reg_alpha=[0, 0.1, 0.5, 1.0],
    reg_lambda=[0.1, 1.0, 3.0, 5.0],
)

param_grid_late = dict(
    n_estimators=[100, 150, 200, 250],
    max_depth=[5, 6, 7, 8],
    learning_rate=[0.08, 0.1, 0.12, 0.15],
    subsample=[0.9, 0.95, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
    reg_alpha=[0, 0.1, 0.5],
    reg_lambda=[0, 0.1, 1.0, 3.0],
)

# Lookup from phase name to its grid.
grids = dict(EARLY=param_grid_early, MID=param_grid_mid, LATE=param_grid_late)

# ==================== OPTIMIZACIÓN ====================

def spearman_scorer(y_true, y_pred):
    """Return the Spearman rank correlation between targets and predictions.

    ``spearmanr`` returns NaN when either input is constant, because the rank
    correlation is undefined there. A NaN would reach the hyperparameter
    search as a non-finite score, so it is mapped to ``0.0`` (no rank
    agreement) to keep every candidate comparable.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        The correlation coefficient, or ``0.0`` when it is undefined.
    """
    corr, _ = spearmanr(y_true, y_pred)
    # An undefined correlation is treated as "no ranking skill".
    return 0.0 if np.isnan(corr) else corr

# Wrap the Spearman function as a scorer to be maximised, and build the
# five-fold time-ordered splitter shared by every phase search.
spearman_score = make_scorer(spearman_scorer, greater_is_better=True)
tscv = TimeSeriesSplit(n_splits=5)

print("\n" + "=" * 80)
print("üöÄ INICIANDO OPTIMIZACI√ìN (SOLO con 2008-2022)")
print("=" * 80)

# Search outcome per phase, keyed by phase name.
resultados_optimizacion = {}

for fase_name, df_fase in fases_data_train.items():
    print("\n" + "=" * 80)
    print(f"üîç OPTIMIZANDO FASE: {fase_name}")
    print("=" * 80)
    print(f"Datos: {len(df_fase)} filas (2008-2022)")

    X_fase, y_fase = df_fase[feature_cols], df_fase[target_col]

    xgb_model = xgb.XGBRegressor(
        objective='reg:squarederror', random_state=42, n_jobs=-1
    )

    # 40 random candidates from this phase's grid, each scored on the 5 folds.
    search = RandomizedSearchCV(
        xgb_model,
        grids[fase_name],
        n_iter=40,
        scoring=spearman_score,
        cv=tscv,
        return_train_score=True,
        random_state=42,
        n_jobs=-1,
        verbose=1,
    )

    print("‚è≥ Entrenando 40 √ó 5 = 200 modelos...")
    search.fit(X_fase, y_fase)

    best_params = search.best_params_
    best_spearman = search.best_score_

    # Mean train/validation score of the winning candidate; their gap is the
    # overfitting indicator that gets reported and stored.
    cv_results = search.cv_results_
    best_idx = search.best_index_
    train_score = cv_results['mean_train_score'][best_idx]
    test_score = cv_results['mean_test_score'][best_idx]
    overfitting = train_score - test_score

    resultados_optimizacion[fase_name] = dict(
        params=best_params,
        spearman_cv=best_spearman,
        spearman_train=train_score,
        overfitting=overfitting,
        modelo=search.best_estimator_,
    )

    print("\n‚úÖ MEJOR CONFIGURACI√ìN:")
    print(f"   Spearman CV:  {best_spearman:.4f}")
    print(f"   Spearman Train: {train_score:.4f}")
    print(f"   Overfitting: {overfitting:.4f}")
    print("\n   Hiperpar√°metros:")
    for param, value in sorted(best_params.items()):
        print(f"      {param:20}: {value}")

# ==================== TRAIN FINAL MODELS ====================

print("\n" + "=" * 80)
print("üèóÔ∏è  ENTRENANDO MODELOS FINALES (2008-2022)")
print("=" * 80)

# One regressor per phase, fitted on that phase's full training slice with
# the hyperparameters chosen by the search.
modelos_finales = {}

for fase_name, df_fase in fases_data_train.items():
    print(f"\nüîß Entrenando modelo final {fase_name}...")

    best_params = resultados_optimizacion[fase_name]['params']
    X_fase, y_fase = df_fase[feature_cols], df_fase[target_col]

    modelo_final = xgb.XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        n_jobs=-1,
        **best_params
    )
    modelo_final.fit(X_fase, y_fase)
    modelos_finales[fase_name] = modelo_final

    print(f"   ‚úÖ Modelo {fase_name} entrenado")

# ==================== TEST-SET EVALUATION ====================

print("\n" + "=" * 80)
print("üìä EVALUACI√ìN EN TEST SET (2023-2024) - NUNCA VISTO")
print("=" * 80)

def predecir_hibrido(df_test_input, modelos, features=None):
    """Predict with the phase-specific model that matches each row's round.

    Rows are routed by their ``round`` value: ``<= 5`` uses
    ``modelos['EARLY']``, ``<= 12`` uses ``modelos['MID']`` and every other row
    (including a NaN round) uses ``modelos['LATE']``. Each model is called once
    on all rows assigned to it rather than once per row. This avoids the
    per-row ``predict`` overhead and the dtype upcasting that ``iterrows``
    performs when the frame mixes numeric and non-numeric columns. Models
    receive a DataFrame slice, so column names are preserved.

    Args:
        df_test_input: DataFrame with a ``round`` column and the feature columns.
        modelos: Mapping with keys ``'EARLY'``, ``'MID'`` and ``'LATE'`` whose
            values expose ``predict(X)``. A key is only looked up when at
            least one row belongs to that phase.
        features: Feature column names passed to the models. Defaults to the
            module-level ``feature_cols``.

    Returns:
        1-D float64 NumPy array of predictions, in the row order of
        ``df_test_input``.
    """
    if features is None:
        features = feature_cols

    rondas = df_test_input['round'].to_numpy()

    # Masks mirror the original if/elif/else chain: anything that is neither
    # EARLY nor MID (NaN included) falls through to LATE.
    es_early = rondas <= 5
    es_mid = ~es_early & (rondas <= 12)
    es_late = ~(es_early | es_mid)

    predicciones = np.empty(len(df_test_input), dtype=float)
    for fase, mascara in (('EARLY', es_early), ('MID', es_mid), ('LATE', es_late)):
        if mascara.any():
            # Positional boolean mask: works for any index labels.
            X_fase = df_test_input.loc[mascara, features]
            predicciones[mascara] = modelos[fase].predict(X_fase)

    return predicciones

# Extra regression metrics used only by this evaluation step.
from sklearn.metrics import mean_squared_error, r2_score

# Held-out targets and the hybrid (phase-routed) predictions for them.
y_test = df_test[target_col]
y_test_pred = predecir_hibrido(df_test, modelos_finales)

test_spearman = spearmanr(y_test, y_test_pred)[0]
test_mae = mean_absolute_error(y_test, y_test_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)

print("\n" + "=" * 80)
print("üéØ M√âTRICAS FINALES (SIN DATA LEAKAGE)")
print("=" * 80)
print(f"\n   Spearman:  {test_spearman:.4f}  ‚≠ê")
print(f"   MAE:       {test_mae:.4f}  ({test_mae*100:.2f}%)")
print(f"   RMSE:      {test_rmse:.4f}")
print(f"   R¬≤:        {test_r2:.4f}  ({test_r2*100:.1f}%)")

# Reference Spearman of the earlier "4C" baseline; the difference is scaled
# by 100 before being reported.
baseline_spearman = 0.8955
mejora = (test_spearman - baseline_spearman) * 100

print("\n" + "=" * 80)
print("üìà COMPARACI√ìN CON BASELINE 4C")
print("=" * 80)
print(f"\n   Baseline 4C:     {baseline_spearman:.4f}")
print(f"   Este modelo:     {test_spearman:.4f}")
print(f"   Mejora REAL:     {mejora:+.2f}%")

# Verdict: no gain, a gain above one point, or a smaller gain.
if not (test_spearman > baseline_spearman):
    print("\n   ‚ùå No hay mejora")
elif mejora > 1.0:
    print("\n   ‚úÖ MEJORA SIGNIFICATIVA!")
else:
    print("\n   ‚ö†Ô∏è  Mejora marginal")

# ==================== FEATURE IMPORTANCE ====================

print("\n" + "=" * 80)
print("üìä FEATURE IMPORTANCE (MODELO LATE)")
print("=" * 80)

# Importances of the LATE model and their positions from highest to lowest.
importances = modelos_finales['LATE'].feature_importances_
indices = np.flip(np.argsort(importances))

print("\nFeatures ordenadas por importancia:\n")
for i, idx in enumerate(indices, start=1):
    feat, imp = feature_cols[idx], importances[idx]
    print(f"   {i}. {feat:30} {imp:.4f} ({imp*100:.1f}%)")

# ==================== SAVE ====================

print("\n" + "=" * 80)
print("üíæ GUARDANDO MODELOS")
print("=" * 80)

Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# One pickle per phase: fitted model, feature list, chosen hyperparameters
# and the cross-validation metrics of the search.
for fase_name, modelo in modelos_finales.items():
    filename = f'{OUTPUT_DIR}/xgboost_{fase_name.lower()}_v5.pkl'
    paquete = {
        'modelo': modelo,
        'features': feature_cols,
        'params': resultados_optimizacion[fase_name]['params'],
        'metricas': {
            'spearman_cv': resultados_optimizacion[fase_name]['spearman_cv'],
            'overfitting': resultados_optimizacion[fase_name]['overfitting'],
        },
    }
    with open(filename, 'wb') as f:
        pickle.dump(paquete, f)
    print(f"‚úÖ Guardado: {filename}")

# Summary of the hybrid setup: per-phase settings, held-out metrics, the
# baseline comparison and run metadata.
config_hibrido = {
    'modelos': {
        fase.lower(): {
            'params': resultados_optimizacion[fase]['params'],
            'features': feature_cols,
            'metricas': {
                'spearman': resultados_optimizacion[fase]['spearman_cv'],
                'overfitting': resultados_optimizacion[fase]['overfitting'],
            },
        }
        for fase in ('EARLY', 'MID', 'LATE')
    },
    'metricas_hibrido': {
        'spearman': test_spearman,
        'mae': test_mae,
        'rmse': test_rmse,
        'r2': test_r2,
    },
    'comparacion_baseline': {
        'baseline_4c': baseline_spearman,
        'modelo_optimizado': test_spearman,
        'mejora_porcentual': mejora,
    },
    'features_utilizadas': feature_cols,
    'features_eliminadas': ['posicion_media'],
    'train_years': '2008-2022',
    'test_years': '2023-2024',
    'fecha_entrenamiento': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
}

config_file = f'{OUTPUT_DIR}/config_hibrido_v5.pkl'
with open(config_file, 'wb') as f:
    pickle.dump(config_hibrido, f)

print(f"‚úÖ Guardado: {config_file}")

print("\n" + "=" * 80)
print("‚úÖ OPTIMIZACI√ìN COMPLETADA (SIN DATA LEAKAGE)")
print("=" * 80 + "\n")

üéØ OPTIMIZACI√ìN CORREGIDA - SIN DATA LEAKAGE

‚úÖ Split temporal correcto:
   Train: 6221 filas (2008-2022)
   Test:  1358 filas (2023-2024)

‚ö†Ô∏è  Test set NUNCA se usa en entrenamiento u optimizaci√≥n

üìä Distribuci√≥n CORRECTA (solo train):
   EARLY:  1592 filas | A√±os 2008-2022
     MID:  2226 filas | A√±os 2008-2022
    LATE:  2403 filas | A√±os 2008-2022

üöÄ INICIANDO OPTIMIZACI√ìN (SOLO con 2008-2022)

üîç OPTIMIZANDO FASE: EARLY
Datos: 1592 filas (2008-2022)
‚è≥ Entrenando 40 √ó 5 = 200 modelos...
Fitting 5 folds for each of 40 candidates, totalling 200 fits

‚úÖ MEJOR CONFIGURACI√ìN:
   Spearman CV:  0.8685
   Spearman Train: 0.8617
   Overfitting: -0.0068

   Hiperpar√°metros:
      colsample_bytree    : 0.8
      learning_rate       : 0.03
      max_depth           : 2
      n_estimators        : 100
      reg_alpha           : 2.0
      reg_lambda          : 1.0
      subsample           : 0.8

üîç OPTIMIZANDO FASE: MID
Datos: 2226 filas (2008-2022)
‚è≥ Entrenan