In [23]:
# Además de las variables contextuales, voy a añadir variables dinámicas, ya que a lo largo de la temporada las predicciones pueden cambiar de sentido y de magnitud.
# Es decir, si un piloto va bien en las primeras carreras pero tiene una mala racha a mitad de temporada, el modelo debe saber si eso ya ha ocurrido en el pasado y predecir cómo va a quedar al final de la temporada.

In [24]:
# Las features dinámicas que voy a añadir son las siguientes:

'''

1. pct_puntos_actual - % de puntos acumulados

pct_puntos_actual = puntos_acumulados / puntos_totales_temporada

Verstappen en R10 de 24 carreras:
- Puntos acumulados: 250
- Puntos totales temporada: 24 × 101 = 2424
→ pct_puntos_actual = 250 / 2424 = 0.103 (10.3%)

2. pct_linear_points - Sistema lineal para diferenciar fuera del Top 10

linear_points_carrera = max(0, 20 - position + 1)

P1 → 20 puntos
P2 → 19 puntos
...
P10 → 11 puntos
P11 → 10 puntos  ← Diferencia con P10 (el sistema real da 0 a ambos)
P20 → 1 punto
P21+ → 0 puntos

Piloto con [P3, P5, P1, P12, P8]:
- Linear: [18, 16, 20, 9, 13] = 76 puntos
- pct_linear_points = 76 / (24 × 20) = 0.158

3. posicion_media - Posición promedio (solo carreras terminadas)

Piloto: [P3, DNF, P5, P4, P2, DNF, P3]
Terminadas: [P3, P5, P4, P2, P3]
→ posicion_media = (3+5+4+2+3) / 5 = 3.4


4. tendencia_ultimas_3 - Momentum reciente

Piloto: [P8, P7, DNF, P6, P5, DNF, P3, P2]
Terminadas: [P8, P7, P6, P5, P3, P2]
Últimas 3: [P5, P3, P2]
→ tendencia_ultimas_3 = (5+3+2) / 3 = 3.33

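5. diff_con_lider_normalizada - Distancia con el líder

diff_con_lider_normalizada = max(pct_puntos_actual en la ronda) - pct_puntos_actual del piloto

Líder tras R10: pct_puntos_actual = 0.103
Piloto tras R10: pct_puntos_actual = 0.080
→ diff_con_lider_normalizada = 0.103 - 0.080 = 0.023 (para el líder vale 0.0)

6. progreso_temporada - Fracción de la temporada ya disputada

progreso_temporada = round_actual / total_carreras

R10 de 24: 10/24 = 0.417 (41.7%)
R22 de 22: 22/22 = 1.0 (100%)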

'''

'\n\n1. pct_puntos_actual - % de puntos acumulados\n\npct_puntos_actual = puntos_acumulados / puntos_totales_temporada\n\nVerstappen en R10 de 24 carreras:\n- Puntos acumulados: 250\n- Puntos totales temporada: 24 × 101 = 2424\n→ pct_puntos_actual = 250 / 2424 = 0.103 (10.3%)\n\n2.pct_linear_points - Sistema lineal para diferenciar fuera del Top 10\n\nlinear_points_carrera = max(0, 20 - position + 1)\n\nP1 → 20 puntos\nP2 → 19 puntos\n...\nP10 → 11 puntos\nP11 → 10 puntos  ← Diferencia con P10 (el sistema real da 0 a ambos)\nP20 → 1 punto\nP21+ → 0 puntos\n\nPiloto con [P3, P5, P1, P12, P8]:\n- Linear: [18, 16, 20, 9, 13] = 76 puntos\n- pct_linear_points = 76 / (24 × 20) = 0.158\n\n3. posicion_media - Posición promedio (solo carreras terminadas)\n\nPiloto: [P3, DNF, P5, P4, P2, DNF, P3]\nTerminadas: [P3, P5, P4, P2, P3]\n→ posicion_media = (3+5+4+2+3) / 5 = 3.4\n\n\n4. tendencia_ultimas_3 - Momentum reciente\n\nPiloto: [P8, P7, DNF, P6, P5, DNF, P3, P2]\nTerminadas: [P8, P7, P6, P5, P3

In [25]:
# El target que he escogido es el porcentaje de puntos que consigue cada piloto sobre el total de puntos de la temporada.
# Lo he hecho de esta forma, y no sobre el total de puntos, ya que las distintas temporadas tienen distinto número de carreras y, por tanto, el total de puntos no es el mismo, por lo que el modelo no se entrenaría bien.
# Tampoco he utilizado como target las posiciones, ya que en ese caso el modelo podría predecir la misma posición para dos pilotos, lo que no sería correcto para mi objetivo.

In [26]:

import pandas as pd
import numpy as np
import pickle
from pathlib import Path


# ==================== PATHS ====================

# Input and output files used by this cell.
INPUT_CLEAN = '../data/processed/f1_clean.csv'
INPUT_CONTEXT = '../data/processed/context_stats_rolling.pkl'
OUTPUT_FILE = '../data/processed/f1_features_complete.csv'

# ==================== TEAM MAPPING ====================

# Canonical team name -> every raw team name that is folded into it.
_TEAM_ALIASES = {
    'Aston Martin': (
        'Aston Martin',
        'Aston Martin F1 Team',
        'Racing Point',
        'Racing Point F1 Team',
        'Force India',
        'Spyker',
        'Spyker F1',
        'Jordan',
        'Midland',
    ),
    'Alpine': ('Alpine', 'Alpine F1 Team', 'Renault', 'Lotus F1', 'Lotus'),
    'Red Bull Racing': ('Red Bull Racing', 'Red Bull'),
    'AlphaTauri': ('Racing Bulls', 'RB', 'AlphaTauri', 'Toro Rosso'),
    'Sauber': ('Kick Sauber', 'Alfa Romeo', 'Sauber', 'Alfa Romeo Sauber', 'BMW Sauber'),
    'Mercedes': ('Mercedes', 'Mercedes-Benz', 'Brawn'),
    'Haas': ('Haas F1 Team', 'Haas'),
}

# Teams that map to themselves (no aliases).
_STANDALONE_TEAMS = (
    'Ferrari',
    'McLaren',
    'Williams',
    'Toyota',
    'Honda',
    'BAR',
    'Jaguar',
    'Minardi',
    'Arrows',
    'Prost',
    'Benetton',
    'Virgin',
    'Lotus Racing',
    'HRT',
    'Marussia',
    'Caterham',
    'Manor',
)

# Flat lookup: raw team name -> canonical team name.
TEAM_MAPPING = {
    alias: canonical
    for canonical, aliases in _TEAM_ALIASES.items()
    for alias in aliases
}
TEAM_MAPPING.update({team: team for team in _STANDALONE_TEAMS})

def normalizar_team(team):
    """Return the canonical name for `team`.

    Names that have no entry in TEAM_MAPPING are returned unchanged.
    """
    try:
        return TEAM_MAPPING[team]
    except KeyError:
        return team

# ==================== CALCULAR TARGET ====================

def calcular_target(df, puntos_por_carrera=101):
    """
    Compute the target `pct_puntos_final` for every (year, driver) pair.

    The target is the driver's summed `puntos_carrera` for the season divided
    by the points available in that season, i.e.
    `total_carreras * puntos_por_carrera`.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per driver and race. Must contain the columns `year`,
        `driver`, `round` and `puntos_carrera`.
    puntos_por_carrera : int or float, default 101
        Points available per race, used as the normalising constant.
        NOTE(review): 101 is presumably the sum of the top-10 scale
        (25+18+15+12+10+8+6+4+2+1) -- confirm against how `puntos_carrera`
        was built.

    Returns
    -------
    pandas.DataFrame
        Columns `year`, `driver`, `pct_puntos_final`.
    """
    df_final = (
        df.groupby(['year', 'driver'])['puntos_carrera']
        .sum()
        .reset_index(name='puntos_final')
    )

    # The season length is taken as the highest round number present in the
    # data for that year; for a season that is still in progress this is the
    # latest round loaded, not the length of the calendar.
    total_carreras_por_año = df.groupby('year')['round'].max().to_dict()

    df_final['total_carreras'] = df_final['year'].map(total_carreras_por_año)
    df_final['puntos_totales_temporada'] = df_final['total_carreras'] * puntos_por_carrera
    df_final['pct_puntos_final'] = df_final['puntos_final'] / df_final['puntos_totales_temporada']

    return df_final[['year', 'driver', 'pct_puntos_final']]

# ==================== FEATURES DINÁMICAS (ACUMULATIVAS) ====================

def calcular_features_hasta_ronda(df_hasta_ronda):
    """
    Compute the cumulative (dynamic) features of one driver up to a round.

    Parameters
    ----------
    df_hasta_ronda : pandas.DataFrame
        Rows of a single driver and season up to (and including) the round of
        interest. Must contain `round`, `position`, `es_abandono` (boolean),
        `puntos_carrera`, `linear_points` and `total_carreras_temporada`.
        Rows may come in any order.

    Returns
    -------
    dict or None
        None when the frame is empty; otherwise a dict with the keys
        `pct_puntos_actual`, `pct_linear_points`, `posicion_media`,
        `tendencia_ultimas_3`, `diff_con_lider_normalizada` (always 0.0 here;
        it is a placeholder that `calcular_diff_lider` overwrites later) and
        `progreso_temporada`.
    """
    if len(df_hasta_ronda) == 0:
        return None

    puntos_por_carrera = 101        # normaliser for real points
    max_linear_por_carrera = 20     # normaliser for linear points
    posicion_sin_terminadas = 20.0  # fallback when no race was finished

    # Season length is read from the first row (same value on every row of a season).
    total_carreras = df_hasta_ronda['total_carreras_temporada'].iloc[0]
    puntos_totales_temporada = total_carreras * puntos_por_carrera

    round_actual = df_hasta_ronda['round'].max()

    # --- FEATURE 1: pct_puntos_actual ---
    puntos_acumulados = df_hasta_ronda['puntos_carrera'].sum()
    pct_puntos_actual = puntos_acumulados / puntos_totales_temporada

    # --- FEATURE 2: pct_linear_points ---
    linear_points_acumulados = df_hasta_ronda['linear_points'].sum()
    max_linear_points = total_carreras * max_linear_por_carrera
    pct_linear_points = linear_points_acumulados / max_linear_points

    # --- FEATURE 3: posicion_media (finished races only) ---
    terminadas = df_hasta_ronda[~df_hasta_ronda['es_abandono']]
    if len(terminadas) > 0:
        posicion_media = terminadas['position'].mean()
    else:
        posicion_media = posicion_sin_terminadas

    # --- FEATURE 4: tendencia_ultimas_3 ---
    # Sort by round here so "last 3" really means the three most recent
    # finished races, regardless of the row order the caller passed in.
    ultimas_3_terminadas = terminadas.sort_values('round', kind='stable').tail(3)
    if len(ultimas_3_terminadas) > 0:
        tendencia_ultimas_3 = ultimas_3_terminadas['position'].mean()
    else:
        tendencia_ultimas_3 = posicion_media

    # --- FEATURE 5: diff_con_lider_normalizada ---
    # Needs every driver of the round, so it is filled in afterwards.
    diff_con_lider_normalizada = 0.0  # Placeholder

    # --- FEATURE 6: progreso_temporada ---
    progreso_temporada = round_actual / total_carreras

    return {
        'pct_puntos_actual': pct_puntos_actual,
        'pct_linear_points': pct_linear_points,
        'posicion_media': posicion_media,
        'tendencia_ultimas_3': tendencia_ultimas_3,
        'diff_con_lider_normalizada': diff_con_lider_normalizada,
        'progreso_temporada': progreso_temporada
    }

# ==================== CALCULAR DIFF CON LÍDER ====================

def calcular_diff_lider(df_features):
    """
    Fill `diff_con_lider_normalizada` for every row.

    For each (year, round) the leader is the row with the highest
    `pct_puntos_actual`; every row of that round gets
    `leader's pct_puntos_actual - own pct_puntos_actual`, so the leader has 0.

    Parameters
    ----------
    df_features : pandas.DataFrame
        Must contain `year`, `round` and `pct_puntos_actual`. The column
        `diff_con_lider_normalizada` is written in place.

    Returns
    -------
    pandas.DataFrame
        The same `df_features` object, for call chaining.
    """
    # One grouped transform instead of rebuilding a full-frame mask per round.
    pct_lider = df_features.groupby(['year', 'round'])['pct_puntos_actual'].transform('max')
    df_features['diff_con_lider_normalizada'] = pct_lider - df_features['pct_puntos_actual']

    return df_features

# ==================== AÑADIR CONTEXT STATS ====================

def añadir_context_stats(df_features, context_stats):
    """
    Add the columns `driver_quality_3y`, `team_avg_pos_3y` and `team_trend`.

    Parameters
    ----------
    df_features : pandas.DataFrame
        Must contain `year`, `driver` and `team_normalized`. The three new
        columns are written in place.
    context_stats : dict
        Keyed by year. Each value is read as
        `stats['drivers'][driver]`, `stats['teams']['avg_pos_3y'][team]` and
        `stats['teams']['trend'][team]`.

    Returns
    -------
    pandas.DataFrame
        The same `df_features` object.

    Notes
    -----
    Rows whose year, driver or team is missing from `context_stats` (or whose
    stored value is NaN) get the defaults 50.0, 10.0 and 0.0 respectively.
    """
    default_driver_quality = 50.0
    default_team_avg_pos = 10.0
    default_team_trend = 0.0

    driver_quality = []
    team_avg_pos = []
    team_trend = []

    # Build the three columns positionally; this avoids one `.loc` write per
    # cell and does not depend on the index labels being unique.
    filas = zip(
        df_features['year'],
        df_features['driver'],
        df_features['team_normalized'],
    )
    for year, driver, team in filas:
        if year in context_stats:
            stats_year = context_stats[year]
            stats_teams = stats_year['teams']
            driver_quality.append(stats_year['drivers'].get(driver, default_driver_quality))
            team_avg_pos.append(stats_teams['avg_pos_3y'].get(team, default_team_avg_pos))
            team_trend.append(stats_teams['trend'].get(team, default_team_trend))
        else:
            driver_quality.append(np.nan)
            team_avg_pos.append(np.nan)
            team_trend.append(np.nan)

    columnas = (
        ('driver_quality_3y', driver_quality, default_driver_quality),
        ('team_avg_pos_3y', team_avg_pos, default_team_avg_pos),
        ('team_trend', team_trend, default_team_trend),
    )
    for nombre, valores, default in columnas:
        df_features[nombre] = np.asarray(valores, dtype=float)
        # Fill years without stats and NaN values stored in the stats themselves.
        df_features[nombre] = df_features[nombre].fillna(default)

    return df_features

# ==================== PIPELINE COMPLETO ====================

def crear_features_completas(df, context_stats, year_min=2008, year_max=2025):
    """
    Build the full feature table: one row per (year, driver, round).

    Steps: normalise team names, derive linear points and season length,
    compute the season target, compute the cumulative features of every
    driver after each round, then add the gap to the leader, the context
    statistics and the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Race results with at least `year`, `round`, `driver`, `team`,
        `position`, `puntos_carrera` and `es_abandono`. The columns
        `team_normalized`, `linear_points` and `total_carreras_temporada`
        are added to it in place.
    context_stats : dict
        Per-year context statistics, passed on to `añadir_context_stats`.
    year_min, year_max : int, default 2008 and 2025
        Inclusive range of seasons to process; seasons outside it are skipped.

    Returns
    -------
    pandas.DataFrame
        Dynamic features, context features and `pct_puntos_final`. Rows
        without a target are dropped.
    """
    # Normalise teams
    df['team_normalized'] = df['team'].apply(normalizar_team)

    # Linear points: P1 -> 20 ... P20 -> 1, anything else (P21+, NaN) -> 0
    df['linear_points'] = np.where(df['position'] <= 20, 20 - df['position'] + 1, 0)

    # Season length (highest round seen for the year) on every row
    df['total_carreras_temporada'] = df.groupby('year')['round'].transform('max')

    # Target
    df_target = calcular_target(df)

    features_list = []
    years_presentes = set(df['year'].unique())

    for year in range(year_min, year_max + 1):
        if year not in years_presentes:
            continue

        df_year = df[df['year'] == year]

        for driver in df_year['driver'].unique():
            df_driver = df_year[df_year['driver'] == driver].sort_values('round')

            # One row per round, using every race up to and including it
            for round_num in df_driver['round'].unique():
                df_hasta_ronda = df_driver[df_driver['round'] <= round_num]

                features = calcular_features_hasta_ronda(df_hasta_ronda)
                if features is None:
                    continue

                # Identification columns; the team is the one of the latest round
                features['year'] = year
                features['round'] = round_num
                features['driver'] = driver
                features['team'] = df_hasta_ronda['team'].iloc[-1]
                features['team_normalized'] = df_hasta_ronda['team_normalized'].iloc[-1]

                features_list.append(features)

        if year % 5 == 0:
            print(f"   Procesado año {year}: {len(features_list)} filas...")

    df_features = pd.DataFrame(features_list)

    # Gap to the leader needs all drivers of a round, so it is computed here
    df_features = calcular_diff_lider(df_features)

    # Context stats
    df_features = añadir_context_stats(df_features, context_stats)

    # Attach the season target to every round of the driver
    df_features = df_features.merge(
        df_target,
        on=['year', 'driver'],
        how='left'
    )

    nulos_target = df_features['pct_puntos_final'].isna().sum()
    if nulos_target > 0:
        print(f"{nulos_target} filas sin target (se eliminarán)")
        df_features = df_features.dropna(subset=['pct_puntos_final'])

    return df_features


# ==================== EJECUCIÓN ====================

# Load the cleaned race results.
df = pd.read_csv(INPUT_CLEAN)

# Load the pre-computed context statistics.
# NOTE: unpickling can execute arbitrary code; only load files this project produced.
with Path(INPUT_CONTEXT).open('rb') as f:
    context_stats = pickle.load(f)

# Build the feature table.
df_features = crear_features_completas(df, context_stats)

# Write the result, creating the output directory if it does not exist yet.
output_path = Path(OUTPUT_FILE)
output_path.parent.mkdir(parents=True, exist_ok=True)
df_features.to_csv(output_path, index=False)



   Procesado año 2010: 1164 filas...
   Procesado año 2015: 3299 filas...
   Procesado año 2020: 5341 filas...
   Procesado año 2025: 7579 filas...


In [30]:
# Preview the generated feature table (the notebook display shows the first and last rows).
df_features

Unnamed: 0,pct_puntos_actual,pct_linear_points,posicion_media,tendencia_ultimas_3,diff_con_lider_normalizada,progreso_temporada,year,round,driver,team,team_normalized,driver_quality_3y,team_avg_pos_3y,team_trend,pct_puntos_final
0,0.013751,0.055556,1.000000,1.000000,0.000000,0.055556,2008,1,HAM,McLaren,McLaren,63.269231,3.428563,-1.509511,0.133663
1,0.019252,0.100000,3.000000,3.000000,0.000000,0.111111,2008,2,HAM,McLaren,McLaren,63.269231,3.428563,-1.509511,0.133663
2,0.019252,0.122222,6.333333,6.333333,0.006601,0.166667,2008,3,HAM,McLaren,McLaren,63.269231,3.428563,-1.509511,0.133663
3,0.027503,0.172222,5.500000,7.000000,0.012101,0.222222,2008,4,HAM,McLaren,McLaren,63.269231,3.428563,-1.509511,0.133663
4,0.037404,0.225000,4.800000,6.000000,0.010451,0.277778,2008,5,HAM,McLaren,McLaren,63.269231,3.428563,-1.509511,0.133663
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7574,0.000000,0.134091,15.636364,17.333333,0.141764,0.818182,2025,18,COL,Alpine,Alpine,64.375000,11.378571,2.590734,0.000000
7575,0.000000,0.143182,15.750000,17.333333,0.146265,0.863636,2025,19,COL,Alpine,Alpine,64.375000,11.378571,2.590734,0.000000
7576,0.000000,0.154545,15.769231,16.333333,0.153915,0.909091,2025,20,COL,Alpine,Alpine,64.375000,11.378571,2.590734,0.000000
7577,0.000000,0.168182,15.714286,16.000000,0.165167,0.954545,2025,21,COL,Alpine,Alpine,64.375000,11.378571,2.590734,0.000000


In [None]:
# Con estas features ya podemos entrenar al modelo