In [2]:
# ============================================================================
# PASO 4C: 3 MODELOS POR FASE (SIN POSICION_MEDIA)
# ============================================================================

import pandas as pd
import numpy as np
import pickle
from pathlib import Path
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import spearmanr
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Step banner on stdout.
print("\n" + "="*70)
print("PASO 4C: 3 MODELOS POR FASE (SIN POSICION_MEDIA)")
print("="*70)

# ==================== CONFIGURATION ====================

# Both paths are relative, so they resolve against the current working
# directory of the process running this cell.
INPUT_FILE = '../data/processed/f1_features_complete.csv'
OUTPUT_DIR = '../models'

# Make sure the output directory exists before anything is written to it.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# ==================== LOAD DATA ====================

print(f"\nCargando dataset: {INPUT_FILE}")
df = pd.read_csv(INPUT_FILE)
print(f"‚úì Cargado: {len(df):,} filas")

# ==================== FEATURES (WITHOUT POSICION_MEDIA) ====================

print("\n" + "="*70)
print("CONFIGURACI√ìN DE FEATURES")
print("="*70)

# Original feature set MINUS posicion_media
feature_cols = [
    'pct_puntos_actual',
    'pct_linear_points',
    'tendencia_ultimas_3',           # KEPT (last 3 only, not the whole season)
    'diff_con_lider_normalizada',
    'progreso_temporada',
    'driver_quality_3y',
    'team_avg_pos_3y',
    'team_trend'
]

# Column the models are trained to predict.
target_col = 'pct_puntos_final'

print(f"\nFeatures usadas: {len(feature_cols)}")
for i, col in enumerate(feature_cols, 1):
    print(f"  {i}. {col}")

print(f"\n‚ùå Feature ELIMINADA: posicion_media (80% importance)")
print(f"‚úÖ Feature MANTENIDA: tendencia_ultimas_3 (solo √∫ltimas 3)")

# ==================== TRAIN/TEST SPLIT ====================

print("\n" + "="*70)
print("DIVISI√ìN TRAIN/TEST")
print("="*70)

# Split by season: everything up to and including 2022 is training data,
# 2023 and 2024 are held out. Rows from any other year land in neither frame.
# .copy() detaches both frames from `df` so they can be modified safely later.
df_train = df[df['year'] <= 2022].copy()
df_test = df[df['year'].isin([2023, 2024])].copy()

# NOTE: the year ranges in these messages are fixed text, not derived from
# the data.
print(f"\nTRAIN: {len(df_train):,} filas (2008-2022)")
print(f"TEST:  {len(df_test):,} filas (2023-2024)")

# ==================== SPLIT BY PHASE ====================

print("\n" + "="*70)
print("SEGMENTACI√ìN POR FASE")
print("="*70)

def segmentar_por_fase(df):
    """Split a dataset into early / mid / late season slices by ``round``.

    Args:
        df: DataFrame with a ``round`` column.

    Returns:
        Tuple ``(early, mid, late)`` of independent copies holding the rows
        with ``round <= 5``, ``5 < round <= 12`` and ``round > 12``
        respectively. Row order and index labels are kept.
    """
    ronda = df['round']
    mascaras = (
        ronda <= 5,
        (ronda > 5) & (ronda <= 12),
        ronda > 12,
    )
    # Copy each slice so callers can modify it without touching `df`.
    temprana, media, tardia = (df[mascara].copy() for mascara in mascaras)
    return temprana, media, tardia

# Apply the same phase split to the training and the hold-out data.
train_early, train_mid, train_late = segmentar_por_fase(df_train)
test_early, test_mid, test_late = segmentar_por_fase(df_test)

# Row-count table: one line per phase plus a total.
separador = "-"*70
print("\nDIVISI√ìN POR FASE:")
print(separador)
print(f"{'Fase':<20s} {'Train':<10s} {'Test':<10s}")
print(separador)
for etiqueta, parte_train, parte_test in (
    ('EARLY (R1-R5)', train_early, test_early),
    ('MID (R6-R12)', train_mid, test_mid),
    ('LATE (R13+)', train_late, test_late),
):
    print(f"{etiqueta:<20s} {len(parte_train):<10,} {len(parte_test):<10,}")
print(separador)
print(f"{'TOTAL':<20s} {len(df_train):<10,} {len(df_test):<10,}")

# ==================== TRAIN 3 MODELS ====================

print("\n" + "="*70)
print("ENTRENAR 3 MODELOS (EARLY, MID, LATE)")
print("="*70)

# Filled below with one result dict per phase ('early', 'mid', 'late').
modelos = {}

def entrenar_modelo_fase(df_train_fase, df_test_fase, features, fase_nombre, params_custom=None):
    """Fit and evaluate one ``xgb.XGBRegressor`` for a single season phase.

    Progress, metrics, an overfitting verdict and the five most important
    features are printed to stdout along the way.

    Args:
        df_train_fase: Training rows for the phase. Must contain every column
            in ``features`` plus the module-level ``target_col``.
        df_test_fase: Hold-out rows for the phase, with the same columns.
        features: Names of the predictor columns.
        fase_nombre: Phase label. It is printed as the section header and,
            when ``params_custom`` is None, selects the hyperparameter
            preset: "EARLY (R1-R5)" and "MID (R6-R12)" have their own
            presets, any other label gets the LATE preset.
        params_custom: Optional dict of ``xgb.XGBRegressor`` keyword
            arguments used instead of the preset.

    Returns:
        dict with keys ``'modelo'`` (fitted regressor), ``'features'``,
        ``'params'``, ``'metricas_train'`` (mae, spearman),
        ``'metricas_test'`` (mae, rmse, r2, spearman) and
        ``'feature_importance'`` (DataFrame sorted by descending importance).
    """
    print(f"\n{fase_nombre}:")
    print("-"*70)

    # Design matrices and targets for both splits.
    X_train, y_train = df_train_fase[features], df_train_fase[target_col]
    X_test, y_test = df_test_fase[features], df_test_fase[target_col]

    print(f"  Train: {len(X_train):,} filas")
    print(f"  Test:  {len(X_test):,} filas")
    print(f"  Features: {len(features)}")

    # Hyperparameters (tuned per phase unless the caller supplies its own)
    if params_custom is not None:
        params = params_custom
    else:
        ajustes_por_fase = (
            ("EARLY (R1-R5)", {
                'n_estimators': 300,        # more trees (less data)
                'max_depth': 5,             # shallower (avoid overfitting)
                'learning_rate': 0.05,      # more conservative
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'reg_alpha': 0.1,           # L1 regularization
                'reg_lambda': 1.0,          # L2 regularization
            }),
            ("MID (R6-R12)", {
                'n_estimators': 250,
                'max_depth': 6,
                'learning_rate': 0.08,
                'subsample': 0.8,
                'colsample_bytree': 0.8,
                'reg_alpha': 0.05,
                'reg_lambda': 0.5,
            }),
        )
        ajustes_late = {
            'n_estimators': 200,
            'max_depth': 7,                 # deeper (more reliable data)
            'learning_rate': 0.1,
            'subsample': 0.9,
            'colsample_bytree': 0.9,
            'reg_alpha': 0.01,
            'reg_lambda': 0.1,
        }
        # First preset whose label equals the phase name; LATE otherwise.
        ajustes = next(
            (valores for etiqueta, valores in ajustes_por_fase if fase_nombre == etiqueta),
            ajustes_late,
        )
        params = {
            'objective': 'reg:squarederror',
            **ajustes,
            'random_state': 42,
            'n_jobs': -1,
        }

    # Fit
    print(f"\n  Entrenando...")
    modelo = xgb.XGBRegressor(**params)
    modelo.fit(X_train, y_train)
    print(f"  ‚úì Modelo entrenado")

    # Predictions on both splits
    pred_train = modelo.predict(X_train)
    pred_test = modelo.predict(X_test)

    # TRAIN metrics
    mae_train = mean_absolute_error(y_train, pred_train)
    spearman_train = spearmanr(y_train, pred_train)[0]

    # TEST metrics
    mae_test = mean_absolute_error(y_test, pred_test)
    rmse_test = np.sqrt(mean_squared_error(y_test, pred_test))
    r2_test = r2_score(y_test, pred_test)
    spearman_test = spearmanr(y_test, pred_test)[0]

    print(f"\n  M√©tricas TRAIN:")
    print(f"    MAE:      {mae_train:.4f}")
    print(f"    Spearman: {spearman_train:.4f}")

    print(f"\n  M√©tricas TEST:")
    print(f"    MAE:      {mae_test:.4f} ({mae_test*100:.2f}%)")
    print(f"    RMSE:     {rmse_test:.4f}")
    print(f"    R¬≤:       {r2_test:.4f}")
    print(f"    Spearman: {spearman_test:.4f} ‚≠ê")

    # Overfitting verdict from the train-vs-test Spearman gap.
    brecha = spearman_train - spearman_test
    if brecha < 0.05:
        veredicto = "‚úì Overfitting: Ninguno/Bajo"
    elif brecha < 0.15:
        veredicto = "‚ö†Ô∏è Overfitting: Moderado"
    else:
        veredicto = "‚ùå Overfitting: Alto"
    print(f"\n  {veredicto}")

    # Feature importance, most important first
    feature_importance_df = pd.DataFrame({
        'feature': features,
        'importance': modelo.feature_importances_,
    })
    feature_importance_df = feature_importance_df.sort_values('importance', ascending=False)

    print(f"\n  Top 5 Features:")
    for _, fila in feature_importance_df.head(5).iterrows():
        # Text bar: 50 characters would correspond to an importance of 1.0.
        barra = "‚ñà" * int(fila['importance'] * 50)
        print(f"    {fila['feature']:30s} {fila['importance']:.3f} {barra}")

    metricas_train = {
        'mae': mae_train,
        'spearman': spearman_train,
    }
    metricas_test = {
        'mae': mae_test,
        'rmse': rmse_test,
        'r2': r2_test,
        'spearman': spearman_test,
    }
    return {
        'modelo': modelo,
        'features': features,
        'params': params,
        'metricas_train': metricas_train,
        'metricas_test': metricas_test,
        'feature_importance': feature_importance_df,
    }

# Train the 3 models. The phase labels passed here are also what
# entrenar_modelo_fase compares against to pick its hyperparameter preset, so
# they must stay spelled exactly like this.
modelos['early'] = entrenar_modelo_fase(train_early, test_early, feature_cols, "EARLY (R1-R5)")
modelos['mid'] = entrenar_modelo_fase(train_mid, test_mid, feature_cols, "MID (R6-R12)")
modelos['late'] = entrenar_modelo_fase(train_late, test_late, feature_cols, "LATE (R13+)")

# ==================== COMBINED EVALUATION ====================

print("\n" + "="*70)
print("EVALUACI√ìN MODELO H√çBRIDO (TODO EL TEST SET)")
print("="*70)

def predecir_con_modelo_apropiado(df_test, modelos):
    """Predict every row of ``df_test`` with the model that owns its round.

    Rows are routed by ``round``: ``<= 5`` to ``modelos['early']``,
    ``<= 12`` to ``modelos['mid']`` and everything else to
    ``modelos['late']``.

    Each phase is predicted with a single batched ``predict`` call on a
    DataFrame slice, so the model receives named, numeric columns. Calling
    ``predict`` once per row on an object-dtype array extracted from
    ``iterrows()`` is far slower and drops the column names.

    Args:
        df_test: DataFrame with a ``round`` column and every feature column
            the models need.
        modelos: Mapping with keys ``'early'``, ``'mid'`` and ``'late'``;
            each value is a dict holding ``'modelo'`` (an object with
            ``predict``) and ``'features'`` (list of column names). A key is
            only looked up when at least one row is routed to it.

    Returns:
        1-D numpy array of predictions, positionally aligned with the rows
        of ``df_test``. Empty input yields an empty array.
    """
    rondas = df_test['round'].to_numpy()

    es_early = rondas <= 5
    es_mid = ~es_early & (rondas <= 12)
    # Everything else goes to the late model. That includes a missing round
    # (NaN), for which both comparisons above are False.
    es_late = ~(es_early | es_mid)

    lotes = []
    for clave, mascara in (('early', es_early), ('mid', es_mid), ('late', es_late)):
        if not mascara.any():
            continue
        info = modelos[clave]
        X = df_test.loc[mascara, info['features']]
        lotes.append((mascara, np.asarray(info['modelo'].predict(X))))

    if not lotes:
        return np.array([])

    # Keep whatever dtype the models emit instead of forcing float64.
    dtype = np.result_type(*(lote.dtype for _, lote in lotes))
    predicciones = np.empty(len(df_test), dtype=dtype)
    for mascara, lote in lotes:
        predicciones[mascara] = lote

    return predicciones

# Predict the whole hold-out set, routing each row to its phase model.
print("\nGenerando predicciones con modelo h√≠brido...")
y_test_pred_hibrido = predecir_con_modelo_apropiado(df_test, modelos)
y_test_real = df_test[target_col].values

# Overall metrics across all phases combined
mae_hibrido = mean_absolute_error(y_test_real, y_test_pred_hibrido)
rmse_hibrido = np.sqrt(mean_squared_error(y_test_real, y_test_pred_hibrido))
r2_hibrido = r2_score(y_test_real, y_test_pred_hibrido)
spearman_hibrido, _ = spearmanr(y_test_real, y_test_pred_hibrido)

print("\n" + "="*70)
print("M√âTRICAS MODELO H√çBRIDO (3 MODELOS COMBINADOS)")
print("="*70)
print(f"\n  MAE:      {mae_hibrido:.4f} ({mae_hibrido*100:.2f}%)")
print(f"  RMSE:     {rmse_hibrido:.4f}")
print(f"  R¬≤:       {r2_hibrido:.4f}")
print(f"  Spearman: {spearman_hibrido:.4f} ‚≠ê")

# Compare against the baseline.
# NOTE(review): 0.8955 is a hard-coded Spearman labelled "Baseline (1 modelo)";
# it is not computed here — confirm it still matches the reference run.
print("\n" + "-"*70)
print("COMPARACI√ìN vs BASELINE:")
print("-"*70)
print(f"  Baseline (1 modelo):         Spearman = 0.8955")
print(f"  H√≠brido (3 modelos):         Spearman = {spearman_hibrido:.4f}")
print(f"  Mejora absoluta:             {spearman_hibrido - 0.8955:+.4f}")
print(f"  Mejora relativa:             {(spearman_hibrido - 0.8955)/0.8955*100:+.1f}%")

# Verdict: better than the baseline, within a tolerated loss (> 0.88), or worse.
if spearman_hibrido > 0.8955:
    print(f"\n  ‚úÖ MEJORA CONSEGUIDA")
elif spearman_hibrido > 0.88:
    print(f"\n  ‚ö†Ô∏è Ligera p√©rdida pero a√∫n excelente")
else:
    print(f"\n  ‚ùå P√©rdida significativa")

# ==================== DETAILED ANALYSIS ====================

print("\n" + "="*70)
print("AN√ÅLISIS POR A√ëO Y FASE")
print("="*70)

# Keep the hybrid predictions next to the true values for the breakdowns below.
df_test['pred_hibrido'] = y_test_pred_hibrido

for year in [2023, 2024]:
    print(f"\n{year}:")
    print("-"*70)

    # Rows of this season only. Without this assignment `df_year` does not
    # exist, and the phase loop must sit inside the year loop to report each
    # season separately.
    df_year = df_test[df_test['year'] == year]

    for fase, (min_round, max_round) in [('EARLY', (1, 5)), ('MID', (6, 12)), ('LATE', (13, 99))]:
        df_fase = df_year[(df_year['round'] >= min_round) & (df_year['round'] <= max_round)]

        # Nothing to score for this phase in this season.
        if len(df_fase) == 0:
            continue

        mae_fase = mean_absolute_error(df_fase[target_col], df_fase['pred_hibrido'])
        spearman_fase, _ = spearmanr(df_fase[target_col], df_fase['pred_hibrido'])

        # Format the round range; 99 is the open-ended upper bound of LATE.
        if max_round >= 99:
            rango_str = f"R{min_round:2d}-fin"
        else:
            rango_str = f"R{min_round:2d}-R{max_round:2d}"

        print(f"  {fase:6s} ({rango_str}): "
              f"MAE={mae_fase:.4f} | Spearman={spearman_fase:.4f} | n={len(df_fase):3d}")

# ==================== FINAL CLASSIFICATION ====================

print("\n" + "="*70)
print("CLASIFICACI√ìN FINAL (√∫ltimas rondas)")
print("="*70)

for year in [2023, 2024]:
    # Rows of this season that belong to its last available round.
    es_del_anio = df_test['year'] == year
    ultima_ronda = df_test.loc[es_del_anio, 'round'].max()
    df_year_final = df_test[es_del_anio & (df_test['round'] == ultima_ronda)]

    if not len(df_year_final):
        continue

    print(f"\n{year} - Ronda {df_year_final['round'].iloc[0]}:")
    print("-"*70)

    # Rank drivers by predicted final share, best first.
    df_year_final_sorted = df_year_final.sort_values('pred_hibrido', ascending=False)

    print(f"{'Pos':>3s} {'Driver':6s} {'Real':>8s} {'Pred':>8s} {'Diff':>8s}")
    print("-"*70)

    # Top 10 by prediction, with the signed error (real - predicted).
    for posicion, (_, fila) in enumerate(df_year_final_sorted.head(10).iterrows(), 1):
        real = fila[target_col]
        pred = fila['pred_hibrido']
        print(f"{posicion:3d} {fila['driver']:6s} "
              f"{real:8.4f} {pred:8.4f} "
              f"{real - pred:+8.4f}")

# ==================== SAVE MODELS ====================

print("\n" + "="*70)
print("GUARDAR MODELOS")
print("="*70)

# Round range covered by each phase model, in the order they are saved.
rondas_por_fase = {'early': '1-5', 'mid': '6-12', 'late': '13+'}

# One pickle per phase, holding the whole result dict produced by training.
for fase in rondas_por_fase:
    modelo_file = f'{OUTPUT_DIR}/xgboost_{fase}_v5.pkl'
    with open(modelo_file, 'wb') as f:
        pickle.dump(modelos[fase], f)
    print(f"‚úì {modelo_file}")

# Summary config: features, per-phase test metrics and the baseline comparison.
baseline_spearman = 0.8955
config_hibrido = {
    'feature_cols': feature_cols,
    'target_col': target_col,
    'modelos': {
        clave: {
            'rounds': rondas,
            'metricas': modelos[clave]['metricas_test'],
            'feature_importance': modelos[clave]['feature_importance'].to_dict('records'),
        }
        for clave, rondas in rondas_por_fase.items()
    },
    'metricas_hibrido': {
        'mae': mae_hibrido,
        'rmse': rmse_hibrido,
        'r2': r2_hibrido,
        'spearman': spearman_hibrido,
    },
    'comparacion_baseline': {
        'baseline_spearman': baseline_spearman,
        'hibrido_spearman': spearman_hibrido,
        'mejora': spearman_hibrido - baseline_spearman,
    },
}

config_file = f'{OUTPUT_DIR}/config_hibrido_v5.pkl'
with open(config_file, 'wb') as f:
    pickle.dump(config_hibrido, f)
print(f"‚úì {config_file}")

# ==================== FINAL SUMMARY ====================

print("\n" + "="*70)
print("‚úÖ PASO 4C COMPLETADO")
print("="*70)

print(f"\nüìä RESUMEN:")
print(f"  Features usadas: {len(feature_cols)} (sin posicion_media)")
print(f"  Modelos entrenados: 3 (EARLY, MID, LATE)")

# Test-set Spearman of each phase model, taken from its own phase's hold-out rows.
print(f"\nüìà M√âTRICAS POR MODELO:")
print(f"  EARLY (R1-R5):   Spearman = {modelos['early']['metricas_test']['spearman']:.4f}")
print(f"  MID (R6-R12):    Spearman = {modelos['mid']['metricas_test']['spearman']:.4f}")
print(f"  LATE (R13+):     Spearman = {modelos['late']['metricas_test']['spearman']:.4f}")

# Metrics of the three models combined over the whole hold-out set.
print(f"\nüéØ MODELO H√çBRIDO:")
print(f"  MAE:      {mae_hibrido:.4f} ({mae_hibrido*100:.2f}%)")
print(f"  Spearman: {spearman_hibrido:.4f} ‚≠ê")

# Absolute and relative change against the hard-coded baseline Spearman (0.8955).
print(f"\nüìä vs BASELINE:")
print(f"  Mejora: {spearman_hibrido - 0.8955:+.4f} ({(spearman_hibrido - 0.8955)/0.8955*100:+.1f}%)")

print("\n" + "="*70)


PASO 4C: 3 MODELOS POR FASE (SIN POSICION_MEDIA)

Cargando dataset: ../data/processed/f1_features_complete.csv
✓ Cargado: 7,579 filas

CONFIGURACIÓN DE FEATURES

Features usadas: 8
  1. pct_puntos_actual
  2. pct_linear_points
  3. tendencia_ultimas_3
  4. diff_con_lider_normalizada
  5. progreso_temporada
  6. driver_quality_3y
  7. team_avg_pos_3y
  8. team_trend

❌ Feature ELIMINADA: posicion_media (80% importance)
✅ Feature MANTENIDA: tendencia_ultimas_3 (solo últimas 3)

DIVISIÓN TRAIN/TEST

TRAIN: 6,221 filas (2008-2022)
TEST:  919 filas (2023-2024)

SEGMENTACIÓN POR FASE

DIVISIÓN POR FASE:
----------------------------------------------------------------------
Fase                 Train      Test      
----------------------------------------------------------------------
EARLY (R1-R5)        1,592      199       
MID (R6-R12)         2,226      280       
LATE (R13+)          2,403      440       
---------------------------------------------------------------------