In [7]:
import pandas as pd
import numpy as np
import pickle
from pathlib import Path
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import spearmanr
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
# Locations of the input dataset and of the generated artefacts
# (relative paths, resolved against the current working directory).
INPUT_FILE = '../data/processed/f1_features_complete.csv'
OUTPUT_DIR = '../models'
MODEL_FILE = '/'.join((OUTPUT_DIR, 'xgboost_baseline.pkl'))
CONFIG_FILE = '/'.join((OUTPUT_DIR, 'config.pkl'))

# Create the output directory (and any missing parents) up front so that
# later writes into OUTPUT_DIR do not fail; a pre-existing directory is fine.
Path(OUTPUT_DIR).mkdir(exist_ok=True, parents=True)

In [9]:
# Load the complete feature table from the CSV configured in INPUT_FILE.
df = pd.read_csv(filepath_or_buffer=INPUT_FILE)

In [10]:
# Para entrenar el modelo, la división que voy a hacer es: las temporadas 2008 a 2022 serán de entrenamiento y las temporadas 2023 y 2024 serán de test.

In [11]:
# Temporal split: every row with year <= 2022 is training data, rows from
# 2023 and 2024 are held out for testing. Copies are taken so that later
# column additions to a split do not touch `df`.
es_train = df['year'].le(2022)
es_test = df['year'].isin([2023, 2024])

df_train = df.loc[es_train].copy()
df_test = df.loc[es_test].copy()

In [12]:
# No puedo usar train_test_split() porque, si lo usase, habría fuga de datos.
# Si usase un split aleatorio, el modelo podría aprender de 2024 y predecir 2022, lo cual es data leakage temporal: usar información del futuro para predecir el pasado.
# En producción esto no sería posible: en 2022 no puedo saber qué pasará en 2024.
# Por eso uso un split temporal: entreno con 2008-2022 y evalúo con 2023-2024, simulando exactamente cómo funcionará el modelo en la vida real al predecir 2025.

In [13]:
# Select the features the model will be trained on

# Column the model learns to predict.
target_col = 'pct_puntos_final'

# Input columns used for training; the list order is the column order of
# any frame selected with `feature_cols`.
feature_cols = [
    'pct_puntos_actual', 'pct_linear_points', 'posicion_media',
    'tendencia_ultimas_3', 'diff_con_lider_normalizada', 'progreso_temporada',
    'driver_quality_3y', 'team_avg_pos_3y', 'team_trend',
]

In [14]:
# Build the feature matrix and target vector for each split.
X_train, y_train = df_train[feature_cols], df_train[target_col]
X_test, y_test = df_test[feature_cols], df_test[target_col]

In [15]:
# Baseline hyper-parameters passed verbatim to xgb.XGBRegressor.
params = dict(
    objective='reg:squarederror',
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)

# Echo the configuration, one "name: value" pair per line.
print("\n".join(f"  {nombre}: {valor}" for nombre, valor in params.items()))

print("\nEntrenando...")
modelo = xgb.XGBRegressor(**params)
# Kept as the last expression so the notebook displays the fitted estimator.
modelo.fit(X_train, y_train)

  objective: reg:squarederror
  n_estimators: 200
  max_depth: 6
  learning_rate: 0.1
  subsample: 0.8
  colsample_bytree: 0.8
  random_state: 42
  n_jobs: -1

Entrenando...


0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [16]:
# ==================== PREDICTIONS ====================

print(f"\n{'=' * 70}")
print("PREDICCIONES")
print("=" * 70)

# Score both splits with the fitted model (train first, then test).
y_train_pred, y_test_pred = (
    modelo.predict(matriz) for matriz in (X_train, X_test)
)

print("✓ Predicciones generadas")

# ==================== MÉTRICAS ====================

def calcular_metricas(y_true, y_pred, dataset_name):
    """Compute, print and return the regression metrics for one dataset.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values, aligned with ``y_true``.
        dataset_name: Label printed as the heading of the report.

    Returns:
        dict with the keys ``'mae'``, ``'rmse'``, ``'r2'`` and ``'spearman'``.
    """
    resultados = {
        'mae': mean_absolute_error(y_true, y_pred),
        # RMSE is obtained as the square root of the mean squared error.
        'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
        'r2': r2_score(y_true, y_pred),
    }

    # Spearman rank correlation (flagged in the report as the KEY metric);
    # the accompanying p-value is not used.
    resultados['spearman'], _ = spearmanr(y_true, y_pred)

    mae = resultados['mae']
    print(f"\n{dataset_name}:")
    print(f"  MAE:              {mae:.4f} ({mae*100:.2f}% de puntos)")
    print(f"  RMSE:             {resultados['rmse']:.4f}")
    print(f"  R² Score:         {resultados['r2']:.4f}")
    print(f"  Spearman Corr:    {resultados['spearman']:.4f} ⭐")

    return resultados

print(f"\n{'=' * 70}")
print("MÉTRICAS DE RENDIMIENTO")
print("=" * 70)

# Evaluate the training split first and the held-out split second; each call
# prints its own report and returns the metrics dict.
metricas_train, metricas_test = (
    calcular_metricas(reales, predichos, etiqueta)
    for reales, predichos, etiqueta in (
        (y_train, y_train_pred, "TRAIN (2008-2022)"),
        (y_test, y_test_pred, "TEST (2023-2024)"),
    )
)

# Overfitting diagnosis
print("\n" + "-"*70)
print("DIAGNÓSTICO:")
print("-"*70)

# Reported differences keep the train-minus-test convention used in the
# printed labels below.
diff_mae = metricas_train['mae'] - metricas_test['mae']
diff_r2 = metricas_train['r2'] - metricas_test['r2']

# MAE is an error (lower is better), so overfitting shows up as a test MAE
# *above* the train MAE, i.e. a NEGATIVE train-minus-test difference. Checking
# `diff_mae < threshold` was therefore always true for an overfit model and the
# MAE criterion never contributed. Compare the test-minus-train degradation
# instead. R² is a score (higher is better), so train-minus-test is already
# the degradation.
gap_mae = metricas_test['mae'] - metricas_train['mae']
gap_r2 = diff_r2

if gap_mae < 0.01 and gap_r2 < 0.1:
    print("✓ No hay overfitting significativo")
elif gap_mae < 0.02 and gap_r2 < 0.2:
    print("⚠️ Overfitting leve (aceptable)")
else:
    print("❌ Overfitting moderado/alto")

print(f"\nDiferencia MAE (train-test): {diff_mae:+.4f}")
print(f"Diferencia R² (train-test): {diff_r2:+.4f}")

# ==================== FEATURE IMPORTANCE ====================

print(f"\n{'=' * 70}")
print("FEATURE IMPORTANCE")
print("=" * 70)

# One row per feature, most important first. The index labels still hold each
# feature's position in `feature_cols`, because sorting does not reset them.
importances = modelo.feature_importances_
feature_importance_df = (
    pd.DataFrame({'feature': feature_cols, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

print("\nImportancia de features:")
for _, fila in feature_importance_df.iterrows():
    peso = fila['importance']
    # Text bar: one block character per whole percentage point of importance.
    barra = "█" * int(peso * 100)
    print(f"  {fila['feature']:30s} {peso:.3f} {barra}")

# ==================== TEST SET ANALYSIS ====================

print(f"\n{'=' * 70}")
print("ANÁLISIS DETALLADO DEL TEST SET")
print("=" * 70)

# Attach the predictions to the test frame as a new column.
df_test['pct_puntos_pred'] = y_test_pred

# Per-year breakdown of the test metrics.
print("\nRendimiento por año:")
for year in (2023, 2024):
    df_year = df_test.loc[df_test['year'] == year]

    # Nothing to report for a season with no rows.
    if not len(df_year):
        continue

    reales_year = df_year[target_col]
    pred_year = df_year['pct_puntos_pred']
    mae_year = mean_absolute_error(reales_year, pred_year)
    # Only the correlation coefficient is needed; the p-value is dropped.
    spearman_year = spearmanr(reales_year, pred_year)[0]

    print(
        f"\n  {year}:",
        f"    Filas: {len(df_year)}",
        f"    MAE: {mae_year:.4f}",
        f"    Spearman: {spearman_year:.4f}",
        sep="\n",
    )

# Best and worst predictions
def _imprimir_predicciones(filas):
    """Print year, round, driver, real value, prediction and absolute error for each row."""
    for _, fila in filas.iterrows():
        print(f"  {fila['year']} R{fila['round']:2d} {fila['driver']:3s}: "
              f"Real={fila[target_col]:.3f} Pred={fila['pct_puntos_pred']:.3f} "
              f"Error={fila['error_abs']:.4f}")


print("\n" + "-" * 70)
print("TOP 5 MEJORES PREDICCIONES (menor error absoluto):")
print("-" * 70)

# Absolute error per row, stored on the test frame so it can be ranked.
df_test['error_abs'] = (df_test[target_col] - df_test['pct_puntos_pred']).abs()
mejores = df_test.nsmallest(5, 'error_abs')
_imprimir_predicciones(mejores)

print("\n" + "-" * 70)
print("TOP 5 PEORES PREDICCIONES (mayor error absoluto):")
print("-" * 70)

peores = df_test.nlargest(5, 'error_abs')
_imprimir_predicciones(peores)

# ==================== FINAL STANDINGS ANALYSIS ====================

print(f"\n{'=' * 70}")
print("ANÁLISIS DE CLASIFICACIÓN FINAL (2023-2024)")
print("=" * 70)

for year in (2023, 2024):
    # Keep only the rows of the highest round number present for this year.
    es_year = df_test['year'] == year
    ultima_ronda = df_test.loc[es_year, 'round'].max()
    df_year_final = df_test[es_year & (df_test['round'] == ultima_ronda)]

    if not len(df_year_final):
        continue

    print(f"\n{year} - Última ronda:")
    print("-" * 70)

    # Rank by predicted value, highest first.
    df_year_final = df_year_final.sort_values('pct_puntos_pred', ascending=False)

    print(f"{'Pos':3s} {'Driver':6s} {'Real':8s} {'Pred':8s} {'Diff':8s}")
    print("-" * 70)

    # Show the ten highest predictions with the signed real-minus-predicted gap.
    for pos, (_, fila) in enumerate(df_year_final.iloc[:10].iterrows(), start=1):
        real = fila[target_col]
        pred = fila['pct_puntos_pred']
        print(f"{pos:3d} {fila['driver']:6s} "
              f"{real:8.4f} {pred:8.4f} "
              f"{real - pred:+8.4f}")

# ==================== SAVE MODEL ====================

def _volcar_pickle(objeto, ruta):
    """Serialise `objeto` with pickle into the file at `ruta` (overwriting it)."""
    with open(ruta, 'wb') as destino:
        pickle.dump(objeto, destino)


print(f"\n{'=' * 70}")
print("GUARDAR MODELO")
print("=" * 70)

# Persist the fitted model.
_volcar_pickle(modelo, MODEL_FILE)
print(f"✓ Modelo guardado: {MODEL_FILE}")

# Persist the run configuration: features, target, hyper-parameters, metrics,
# feature importances and the (hard-coded) train/test season lists.
config = dict(
    feature_cols=feature_cols,
    target_col=target_col,
    params=params,
    metricas_train=metricas_train,
    metricas_test=metricas_test,
    feature_importance=feature_importance_df.to_dict('records'),
    train_years=list(range(2008, 2023)),
    test_years=[2023, 2024],
)

_volcar_pickle(config, CONFIG_FILE)
print(f"✓ Config guardada: {CONFIG_FILE}")

# ==================== VISUALISATIONS ====================

def _guardar_figura(nombre_archivo):
    """Tighten, save into OUTPUT_DIR, report and close the current figure."""
    ruta = f'{OUTPUT_DIR}/{nombre_archivo}'
    plt.tight_layout()
    plt.savefig(ruta, dpi=150, bbox_inches='tight')
    print(f"✓ Guardado: {ruta}")
    plt.close()


print(f"\n{'=' * 70}")
print("GENERANDO VISUALIZACIONES")
print("=" * 70)

# 1. Feature importance (sorted ascending so the largest bar is drawn last)
plt.figure(figsize=(10, 6))
feature_importance_df_plot = feature_importance_df.sort_values('importance')
plt.barh(
    feature_importance_df_plot['feature'],
    feature_importance_df_plot['importance'],
)
plt.xlabel('Importancia')
plt.title('Feature Importance - XGBoost Baseline')
_guardar_figura('feature_importance.png')

# 2. Predicted vs. real values on the test set
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_test_pred, alpha=0.5, s=10)
# Dashed red identity line spanning the range of the real values.
extremos = [y_test.min(), y_test.max()]
plt.plot(extremos, extremos, 'r--', lw=2)
plt.xlabel('Real (pct_puntos_final)')
plt.ylabel('Predicción')
plt.title(f'Predicciones vs Real - Test Set\nSpearman: {metricas_test["spearman"]:.3f}')
plt.grid(True, alpha=0.3)
_guardar_figura('pred_vs_real_test.png')

# 3. Distribution of the signed errors (real minus predicted)
plt.figure(figsize=(10, 6))
errors = y_test - y_test_pred
plt.hist(errors, bins=50, edgecolor='black', alpha=0.7)
plt.axvline(x=0, color='r', linestyle='--', linewidth=2)
plt.xlabel('Error (Real - Predicción)')
plt.ylabel('Frecuencia')
plt.title(f'Distribución de Errores - Test Set\nMAE: {metricas_test["mae"]:.4f}')
plt.grid(True, alpha=0.3)
_guardar_figura('error_distribution.png')

# ==================== FINAL SUMMARY ====================

print("\n" + "="*70)
print("✅ PASO 4A COMPLETADO")
print("="*70)

print("\nModelo: XGBoost Baseline")
print(f"Features: {len(feature_cols)}")
print(f"Train: {len(df_train):,} filas (2008-2022)")
print(f"Test: {len(df_test):,} filas (2023-2024)")

print("\nMÉTRICAS TEST:")
print(f"  MAE:      {metricas_test['mae']:.4f} ({metricas_test['mae']*100:.2f}% puntos)")
print(f"  Spearman: {metricas_test['spearman']:.4f} ⭐")
print(f"  R²:       {metricas_test['r2']:.4f}")

print("\nTOP 3 FEATURES MÁS IMPORTANTES:")
# feature_importance_df is sorted by importance but keeps its original index
# labels (each feature's position in feature_cols). iterrows() yields those
# labels, so numbering with `label + 1` printed the wrong rank. Number the rows
# by their position in the sorted frame instead.
for rank, (_, row) in enumerate(feature_importance_df.head(3).iterrows(), start=1):
    print(f"  {rank}. {row['feature']:30s} ({row['importance']:.3f})")

print("\nARCHIVOS GENERADOS:")
print(f"  - {MODEL_FILE}")
print(f"  - {CONFIG_FILE}")
print(f"  - {OUTPUT_DIR}/feature_importance.png")
print(f"  - {OUTPUT_DIR}/pred_vs_real_test.png")
print(f"  - {OUTPUT_DIR}/error_distribution.png")

print(f"\n{'='*70}")
print("SIGUIENTE PASO:")
print("  - Si Spearman > 0.85: Excelente, optimizar hiperparámetros")
print("  - Si Spearman 0.70-0.85: Bueno, probar otros algoritmos")
print("  - Si Spearman < 0.70: Revisar features o modelo múltiple")
print("="*70)


PREDICCIONES
✓ Predicciones generadas

MÉTRICAS DE RENDIMIENTO

TRAIN (2008-2022):
  MAE:              0.0018 (0.18% de puntos)
  RMSE:             0.0027
  R² Score:         0.9974
  Spearman Corr:    0.9944 ⭐

TEST (2023-2024):
  MAE:              0.0122 (1.22% de puntos)
  RMSE:             0.0204
  R² Score:         0.8647
  Spearman Corr:    0.8955 ⭐

----------------------------------------------------------------------
DIAGNÓSTICO:
----------------------------------------------------------------------
⚠️ Overfitting leve (aceptable)

Diferencia MAE (train-test): -0.0104
Diferencia R² (train-test): +0.1327

FEATURE IMPORTANCE

Importancia de features:
  posicion_media                 0.803 ████████████████████████████████████████████████████████████████████████████████
  tendencia_ultimas_3            0.057 █████
  team_avg_pos_3y                0.038 ███
  diff_con_lider_normalizada     0.030 ██
  driver_quality_3y              0.020 ██
  pct_puntos_actual              0.018 █
