In [43]:
import pandas as pd
import numpy as np

# Load data
df_results = pd.read_csv('../data/race_results_2025.csv')
df_qualifying = pd.read_csv('../data/qualifying_2025.csv')
df_pitstops = pd.read_csv('../data/pitstops_2025.csv')

print("Data loaded!")
print(f"Race results: {df_results.shape}")
print(f"Qualifying: {df_qualifying.shape}")
print(f"Pit stops: {df_pitstops.shape}")

Data loaded!
Race results: (479, 14)
Qualifying: (479, 10)
Pit stops: (629, 7)


In [44]:
# Who won the most races in 2025?
print("=== 2025 RACE WINNERS ===")
winners = df_results[df_results['finish_position'] == 1]
print(winners.groupby('driver_name')['finish_position'].count().sort_values(ascending=False))

print("\n=== 2025 CONSTRUCTORS (Total Points) ===")
print(df_results.groupby('team')['points'].sum().sort_values(ascending=False))

print("\n=== POLE POSITIONS ===")
poles = df_qualifying[df_qualifying['quali_position'] == 1]
print(poles.groupby('driver_name')['quali_position'].count().sort_values(ascending=False))

=== 2025 RACE WINNERS ===
driver_name
Max Verstappen    8
Lando Norris      7
Oscar Piastri     7
George Russell    2
Name: finish_position, dtype: int64

=== 2025 CONSTRUCTORS (Total Points) ===
team
McLaren           775.0
Mercedes          424.0
Red Bull          410.0
Ferrari           360.0
Williams          124.0
RB F1 Team         88.0
Aston Martin       80.0
Haas F1 Team       73.0
Sauber             70.0
Alpine F1 Team     20.0
Name: points, dtype: float64

=== POLE POSITIONS ===
driver_name
Max Verstappen     8
Lando Norris       7
Oscar Piastri      6
George Russell     2
Charles Leclerc    1
Name: quali_position, dtype: int64


In [45]:
# Create driver mapping from full name to code
driver_mapping = (
    df_results[['driver', 'driver_name']]
    .drop_duplicates()
    .assign(driver_lastname=lambda x: x['driver_name'].str.split(' ').str[-1].str.lower())
)

# Fix special character
driver_mapping['driver_lastname'] = driver_mapping['driver_lastname'].str.replace('ü', 'u')

print("Driver mapping created!")
print(driver_mapping[['driver', 'driver_name', 'driver_lastname']].to_string())

Driver mapping created!
    driver            driver_name driver_lastname
0      NOR           Lando Norris          norris
1      VER         Max Verstappen      verstappen
2      RUS         George Russell         russell
3      ANT  Andrea Kimi Antonelli       antonelli
4      ALB        Alexander Albon           albon
5      STR           Lance Stroll          stroll
6      HUL        Nico Hülkenberg      hulkenberg
7      LEC        Charles Leclerc         leclerc
8      PIA          Oscar Piastri         piastri
9      HAM         Lewis Hamilton        hamilton
10     GAS           Pierre Gasly           gasly
11     TSU           Yuki Tsunoda         tsunoda
12     OCO           Esteban Ocon            ocon
13     BEA         Oliver Bearman         bearman
14     LAW            Liam Lawson          lawson
15     BOR      Gabriel Bortoleto       bortoleto
16     ALO        Fernando Alonso          alonso
17     SAI           Carlos Sainz           sainz
18     DOO            Jack

In [46]:
# Merge pit stops with driver codes
pitstops_fixed = df_pitstops.merge(
    driver_mapping[['driver', 'driver_lastname']],
    left_on='driver',
    right_on='driver_lastname',
    how='left'
)

pitstops_fixed = pitstops_fixed.rename(columns={'driver_x': 'driver_raw', 'driver_y': 'driver_code'})

# Fix max_verstappen edge case
pitstops_fixed['driver_code'] = pitstops_fixed['driver_code'].fillna(
    pitstops_fixed['driver_raw'].map({'max_verstappen': 'VER'})
)

# Convert duration to float
pitstops_fixed['duration_seconds'] = pd.to_numeric(pitstops_fixed['duration'], errors='coerce')

matched = pitstops_fixed['driver_code'].notna().sum()
print(f"Matched: {matched}/{len(pitstops_fixed)} pit stops")
print(pitstops_fixed[['driver_raw', 'driver_code', 'round', 'duration_seconds']].head(5))

Matched: 629/629 pit stops
       driver_raw driver_code  round  duration_seconds
0          norris         NOR      1            13.341
1  max_verstappen         VER      1            13.416
2         piastri         PIA      1            13.078
3         russell         RUS      1            13.672
4         leclerc         LEC      1            14.182


In [47]:
# Feature 1: Outperformance (remove positions_gained and expected_finish)
avg_finish_by_grid = df_results.groupby('grid_position')['finish_position'].mean()
df_results['expected_finish'] = df_results['grid_position'].map(avg_finish_by_grid)
df_results['outperformance'] = df_results['expected_finish'] - df_results['finish_position']

# Drop redundant columns
df_results = df_results.drop(columns=['expected_finish'])

print("=== DRIVER OUTPERFORMANCE ===")
print(df_results.groupby('driver_name')['outperformance'].mean().sort_values(ascending=False).round(2))

=== DRIVER OUTPERFORMANCE ===
driver_name
Lewis Hamilton           1.67
Alexander Albon          1.36
Oliver Bearman           1.30
Esteban Ocon             1.03
George Russell           1.01
Max Verstappen           0.92
Nico Hülkenberg          0.80
Charles Leclerc          0.26
Oscar Piastri            0.20
Lando Norris             0.05
Lance Stroll            -0.03
Andrea Kimi Antonelli   -0.26
Yuki Tsunoda            -0.37
Liam Lawson             -0.47
Carlos Sainz            -0.56
Fernando Alonso         -0.69
Isack Hadjar            -0.84
Gabriel Bortoleto       -1.17
Franco Colapinto        -2.01
Pierre Gasly            -2.05
Jack Doohan             -2.51
Name: outperformance, dtype: float64


In [48]:
# Feature 2: Teammate qualifying gap per round
# Find teammate pairs per round per team
quali_team = df_qualifying[['round', 'driver', 'driver_name', 'team', 'quali_position']].copy()

# Get best and second best qualifier per team per round
teammate_quali = quali_team.sort_values(['round', 'team', 'quali_position'])
teammate_quali['teammate_quali_rank'] = teammate_quali.groupby(['round', 'team']).cumcount() + 1

# Gap vs teammate (positive = you qualified better than teammate)
best = teammate_quali[teammate_quali['teammate_quali_rank'] == 1][['round', 'team', 'quali_position']].rename(columns={'quali_position': 'best_quali'})
teammate_quali = teammate_quali.merge(best, on=['round', 'team'], how='left')
teammate_quali['teammate_quali_gap'] = teammate_quali['best_quali'] - teammate_quali['quali_position']

# Feature 3: Teammate finish gap per round
race_team = df_results[['round', 'driver', 'team', 'finish_position']].copy()
race_team_sorted = race_team.sort_values(['round', 'team', 'finish_position'])
race_team_sorted['teammate_finish_rank'] = race_team_sorted.groupby(['round', 'team']).cumcount() + 1
best_finish = race_team_sorted[race_team_sorted['teammate_finish_rank'] == 1][['round', 'team', 'finish_position']].rename(columns={'finish_position': 'best_finish'})
race_team_sorted = race_team_sorted.merge(best_finish, on=['round', 'team'], how='left')
race_team_sorted['teammate_finish_gap'] = race_team_sorted['best_finish'] - race_team_sorted['finish_position']

# Merge teammate gaps back to qualifying and results
df_qualifying = df_qualifying.merge(
    teammate_quali[['round', 'driver', 'teammate_quali_gap']],
    on=['round', 'driver'],
    how='left'
)

df_results = df_results.merge(
    race_team_sorted[['round', 'driver', 'teammate_finish_gap']],
    on=['round', 'driver'],
    how='left'
)

# Feature 4: Constructor rolling points (last 5 races)
team_points = df_results.groupby(['round', 'team'])['points'].sum().reset_index()
team_points = team_points.sort_values(['team', 'round'])
team_points['constructor_rolling_points_5'] = (
    team_points.groupby('team')['points']
    .transform(lambda x: x.rolling(window=5, min_periods=1).mean())
)

df_results = df_results.merge(
    team_points[['round', 'team', 'constructor_rolling_points_5']],
    on=['round', 'team'],
    how='left'
)

# Feature 5: Season stage
def get_season_stage(round_num):
    if round_num <= 8:
        return 'early'
    elif round_num <= 16:
        return 'mid'
    else:
        return 'late'

df_results['season_stage'] = df_results['round'].apply(get_season_stage)

# Feature 6: Sprint weekend flag
# 2025 sprint rounds
sprint_rounds = [2, 6, 12, 18, 21, 23]
df_results['is_sprint_weekend'] = df_results['round'].isin(sprint_rounds).astype(int)

print(f"Sample teammate quali gap:\n{df_qualifying[['round', 'driver', 'team', 'quali_position', 'teammate_quali_gap']].head(10)}")

Sample teammate quali gap:
   round driver            team  quali_position  teammate_quali_gap
0      1    NOR         McLaren               1                   0
1      1    PIA         McLaren               2                  -1
2      1    VER        Red Bull               3                   0
3      1    RUS        Mercedes               4                   0
4      1    TSU      RB F1 Team               5                   0
5      1    ALB        Williams               6                   0
6      1    LEC         Ferrari               7                   0
7      1    HAM         Ferrari               8                  -1
8      1    GAS  Alpine F1 Team               9                   0
9      1    SAI        Williams              10                  -4


In [49]:
# Feature 7: Circuit type
circuit_types = {
    'Albert Park Grand Prix Circuit': 'street',
    'Shanghai International Circuit': 'permanent',
    'Suzuka Circuit': 'permanent',
    'Bahrain International Circuit': 'permanent',
    'Jeddah Corniche Circuit': 'street',
    'Miami International Autodrome': 'street',
    'Autodromo Enzo e Dino Ferrari': 'permanent',
    'Circuit de Monaco': 'street',
    'Circuit de Barcelona-Catalunya': 'permanent',
    'Circuit Gilles Villeneuve': 'permanent',
    'Red Bull Ring': 'permanent',
    'Silverstone Circuit': 'permanent',
    'Circuit de Spa-Francorchamps': 'permanent',
    'Hungaroring': 'permanent',
    'Circuit Zandvoort': 'permanent',
    'Autodromo Nazionale di Monza': 'permanent',
    'Baku City Circuit': 'street',
    'Marina Bay Street Circuit': 'street',
    'Circuit of the Americas': 'permanent',
    'Autodromo Hermanos Rodriguez': 'permanent',
    'Autodromo Jose Carlos Pace': 'permanent',
    'Las Vegas Strip Street Circuit': 'street',
    'Lusail International Circuit': 'permanent',
    'Yas Marina Circuit': 'permanent'
}

df_results['circuit_type'] = df_results['circuit'].map(circuit_types)
df_results['circuit_type'] = df_results['circuit_type'].fillna('permanent')

# Feature 8: Circuit overtaking difficulty (1=easy, 5=very hard)
overtaking_difficulty = {
    'Albert Park Grand Prix Circuit': 2,
    'Shanghai International Circuit': 2,
    'Suzuka Circuit': 3,
    'Bahrain International Circuit': 2,
    'Jeddah Corniche Circuit': 2,
    'Miami International Autodrome': 3,
    'Autodromo Enzo e Dino Ferrari': 3,
    'Circuit de Monaco': 5,
    'Circuit de Barcelona-Catalunya': 3,
    'Circuit Gilles Villeneuve': 3,
    'Red Bull Ring': 2,
    'Silverstone Circuit': 2,
    'Circuit de Spa-Francorchamps': 1,
    'Hungaroring': 4,
    'Circuit Zandvoort': 4,
    'Autodromo Nazionale di Monza': 1,
    'Baku City Circuit': 2,
    'Marina Bay Street Circuit': 4,
    'Circuit of the Americas': 2,
    'Autodromo Hermanos Rodriguez': 3,
    'Autodromo Jose Carlos Pace': 3,
    'Las Vegas Strip Street Circuit': 2,
    'Lusail International Circuit': 2,
    'Yas Marina Circuit': 2
}

df_results['overtaking_difficulty'] = df_results['circuit'].map(overtaking_difficulty)
df_results['overtaking_difficulty'] = df_results['overtaking_difficulty'].fillna(3)

# Feature 9: Historical safety car probability per circuit (0-1)
safety_car_prob = {
    'Albert Park Grand Prix Circuit': 0.7,
    'Shanghai International Circuit': 0.5,
    'Suzuka Circuit': 0.4,
    'Bahrain International Circuit': 0.3,
    'Jeddah Corniche Circuit': 0.8,
    'Miami International Autodrome': 0.6,
    'Autodromo Enzo e Dino Ferrari': 0.4,
    'Circuit de Monaco': 0.9,
    'Circuit de Barcelona-Catalunya': 0.3,
    'Circuit Gilles Villeneuve': 0.7,
    'Red Bull Ring': 0.5,
    'Silverstone Circuit': 0.5,
    'Circuit de Spa-Francorchamps': 0.6,
    'Hungaroring': 0.3,
    'Circuit Zandvoort': 0.5,
    'Autodromo Nazionale di Monza': 0.6,
    'Baku City Circuit': 0.8,
    'Marina Bay Street Circuit': 0.8,
    'Circuit of the Americas': 0.6,
    'Autodromo Hermanos Rodriguez': 0.4,
    'Autodromo Jose Carlos Pace': 0.7,
    'Las Vegas Strip Street Circuit': 0.6,
    'Lusail International Circuit': 0.4,
    'Yas Marina Circuit': 0.3
}

df_results['safety_car_probability'] = df_results['circuit'].map(safety_car_prob)
df_results['safety_car_probability'] = df_results['safety_car_probability'].fillna(0.5)

# Feature 10: Home race flag
home_circuits = {
    'VER': 'Circuit Zandvoort',
    'NOR': 'Silverstone Circuit',
    'PIA': 'Albert Park Grand Prix Circuit',
    'RUS': 'Silverstone Circuit',
    'HAM': 'Silverstone Circuit',
    'LEC': 'Circuit de Monaco',
    'SAI': 'Circuit de Barcelona-Catalunya',
    'ALO': 'Circuit de Barcelona-Catalunya',
    'GAS': 'Circuit de Monaco',
    'OCO': 'Circuit de Monaco',
    'TSU': 'Suzuka Circuit',
    'ANT': 'Autodromo Nazionale di Monza',
    'BOR': 'Autodromo Jose Carlos Pace',
    'HAD': 'Circuit de Monaco',
    'STR': 'Circuit Gilles Villeneuve',
    'ALB': 'Silverstone Circuit',
    'HUL': 'Circuit de Spa-Francorchamps',
    'BEA': 'Silverstone Circuit',
    'LAW': 'Albert Park Grand Prix Circuit',
    'DOO': 'Albert Park Grand Prix Circuit',
    'COL': 'Autodromo Jose Carlos Pace'
}

df_results['home_race'] = df_results.apply(
    lambda row: 1 if home_circuits.get(row['driver']) == row['circuit'] else 0,
    axis=1
)

print("✅ Circuit features added!")
print(df_results[['circuit', 'circuit_type', 'overtaking_difficulty', 'safety_car_probability']].drop_duplicates().to_string())

✅ Circuit features added!
                            circuit circuit_type  overtaking_difficulty  safety_car_probability
0    Albert Park Grand Prix Circuit       street                    2.0                     0.7
20   Shanghai International Circuit    permanent                    2.0                     0.5
40                   Suzuka Circuit    permanent                    3.0                     0.4
60    Bahrain International Circuit    permanent                    2.0                     0.3
80          Jeddah Corniche Circuit       street                    2.0                     0.8
100   Miami International Autodrome       street                    3.0                     0.6
120   Autodromo Enzo e Dino Ferrari    permanent                    3.0                     0.4
140               Circuit de Monaco       street                    5.0                     0.9
160  Circuit de Barcelona-Catalunya    permanent                    3.0                     0.3
179       Circ

In [50]:
# Feature 11: Championship gap to leader after each round
cumulative_points = df_results.sort_values(['driver_name', 'round'])
cumulative_points['cumulative_points'] = (
    cumulative_points.groupby('driver_name')['points'].cumsum()
)

# Get leader points after each round
leader_points = cumulative_points.groupby('round')['cumulative_points'].max().reset_index()
leader_points.columns = ['round', 'leader_points']

cumulative_points = cumulative_points.merge(leader_points, on='round', how='left')
cumulative_points['championship_gap'] = cumulative_points['cumulative_points'] - cumulative_points['leader_points']

df_results = df_results.merge(
    cumulative_points[['round', 'driver', 'championship_gap']],
    on=['round', 'driver'],
    how='left'
)

# Feature 12: Driver experience (total 2025 starts as proxy)
driver_experience = {
    'VER': 181, 'HAM': 350, 'ALO': 400, 'SAI': 173, 'LEC': 142,
    'NOR': 122, 'RUS': 110, 'PIA': 50,  'STR': 168, 'ALB': 100,
    'GAS': 180, 'OCO': 120, 'TSU': 90,  'HUL': 200, 'BOT': 220,
    'ANT': 1,   'BOR': 1,   'HAD': 1,   'DOO': 1,   'LAW': 20,
    'BEA': 15,  'COL': 10
}

df_results['driver_experience'] = df_results['driver'].map(driver_experience)
df_results['driver_experience'] = df_results['driver_experience'].fillna(50)

# Feature 13: Wet race flag (known wet races in 2025)
wet_races = [3, 21]  # Japan and Sao Paulo had significant rain
df_results['wet_race'] = df_results['round'].isin(wet_races).astype(int)

print("\n=== CHAMPIONSHIP GAP SAMPLE ===")
sample = df_results[df_results['driver'].isin(['VER', 'NOR', 'PIA'])][
    ['round', 'driver', 'points', 'championship_gap']
].sort_values(['round', 'driver'])
print(sample.head(15).to_string())


=== CHAMPIONSHIP GAP SAMPLE ===
    round driver  points  championship_gap
0       1    NOR    25.0               0.0
8       1    PIA     2.0             -23.0
1       1    VER    18.0              -7.0
21      2    NOR    18.0               0.0
20      2    PIA    25.0             -16.0
23      2    VER    12.0             -13.0
41      3    NOR    18.0               0.0
42      3    PIA    15.0             -19.0
40      3    VER    25.0              -6.0
62      4    NOR    15.0               0.0
60      4    PIA    25.0              -9.0
65      4    VER     8.0             -13.0
83      5    NOR    12.0              -4.0
80      5    PIA    25.0               0.0
81      5    VER    18.0             -11.0


In [51]:
# Sort data first
df_results = df_results.sort_values(['driver_name', 'round']).reset_index(drop=True)
df_qualifying = df_qualifying.sort_values(['driver_name', 'round']).reset_index(drop=True)

# Rolling average finish position (last 5 races)
df_results['rolling_avg_finish_5'] = (
    df_results.groupby('driver_name')['finish_position']
    .transform(lambda x: x.rolling(window=5, min_periods=1).mean())
)

# Rolling average points (last 5 races)
df_results['rolling_avg_points_5'] = (
    df_results.groupby('driver_name')['points']
    .transform(lambda x: x.rolling(window=5, min_periods=1).mean())
)

# Rolling average qualifying position (last 5 races)
df_qualifying['rolling_avg_quali_5'] = (
    df_qualifying.groupby('driver_name')['quali_position']
    .transform(lambda x: x.rolling(window=5, min_periods=1).mean())
)

print(df_results[['driver', 'round', 'finish_position', 'rolling_avg_finish_5', 'rolling_avg_points_5']].head(10))

  driver  round  finish_position  rolling_avg_finish_5  rolling_avg_points_5
0    ALB      1                5                  5.00                  10.0
1    ALB      2                7                  6.00                   8.0
2    ALB      3                9                  7.00                   6.0
3    ALB      4               12                  8.25                   4.5
4    ALB      5                9                  8.40                   4.0
5    ALB      6                5                  8.40                   4.0
6    ALB      7                5                  8.00                   4.8
7    ALB      8                9                  8.00                   4.8
8    ALB      9               19                  9.40                   4.8
9    ALB     10               20                 11.60                   4.4


In [52]:
# Average pit stop time per team
team_pit_performance = (
    df_results[['round', 'driver', 'team']]
    .merge(
        pitstops_fixed[['round', 'driver_code', 'duration_seconds']],
        left_on=['round', 'driver'],
        right_on=['round', 'driver_code'],
        how='left'
    )
)

avg_pit_by_team = (
    team_pit_performance.groupby('team')['duration_seconds']
    .mean()
    .sort_values()
    .round(3)
)

print("=== AVERAGE PIT STOP TIME BY TEAM (seconds) ===")
print(avg_pit_by_team)

=== AVERAGE PIT STOP TIME BY TEAM (seconds) ===
team
McLaren           23.284
Mercedes          23.337
Ferrari           23.476
RB F1 Team        23.826
Sauber            23.892
Aston Martin      23.998
Alpine F1 Team    23.999
Red Bull          24.035
Williams          24.535
Haas F1 Team      25.354
Name: duration_seconds, dtype: float64


In [53]:
# DNF rate per driver (reliability)
df_results['dnf'] = df_results['status'].apply(
    lambda x: 0 if x == 'Finished' or '+' in str(x) else 1
)

dnf_rate = df_results.groupby('driver_name')['dnf'].mean().sort_values(ascending=False).round(3)
print("=== DNF RATE PER DRIVER ===")
print(dnf_rate)

=== DNF RATE PER DRIVER ===
driver_name
Franco Colapinto         0.722
Pierre Gasly             0.542
Jack Doohan              0.500
Gabriel Bortoleto        0.500
Lance Stroll             0.478
Nico Hülkenberg          0.458
Isack Hadjar             0.458
Carlos Sainz             0.375
Liam Lawson              0.375
Yuki Tsunoda             0.333
Alexander Albon          0.333
Esteban Ocon             0.333
Fernando Alonso          0.292
Oliver Bearman           0.250
Andrea Kimi Antonelli    0.250
Lewis Hamilton           0.167
Charles Leclerc          0.125
Lando Norris             0.125
Oscar Piastri            0.083
George Russell           0.042
Max Verstappen           0.042
Name: dnf, dtype: float64


In [54]:
# Merge everything into master dataframe
df_master = df_results.merge(
    df_qualifying[['round', 'driver', 'quali_position', 'rolling_avg_quali_5', 'teammate_quali_gap']],
    on=['round', 'driver'],
    how='left'
)

# Add team pit stop average
df_master = df_master.merge(
    avg_pit_by_team.rename('avg_team_pit_seconds'),
    on='team',
    how='left'
)

# Add DNF rate
df_master = df_master.merge(
    dnf_rate.rename('dnf_rate'),
    on='driver_name',
    how='left'
)

# Drop columns we no longer need
df_master = df_master.drop(columns=['positions_gained'], errors='ignore')

print(f"Master dataframe shape: {df_master.shape}")
print(f"All features:")
for col in df_master.columns.tolist():
    print(f"  - {col}")

Master dataframe shape: (479, 34)
All features:
  - year
  - round
  - race_name
  - circuit
  - date
  - driver
  - driver_name
  - team
  - grid_position
  - finish_position
  - points
  - status
  - laps_completed
  - fastest_lap_rank
  - outperformance
  - teammate_finish_gap
  - constructor_rolling_points_5
  - season_stage
  - is_sprint_weekend
  - circuit_type
  - overtaking_difficulty
  - safety_car_probability
  - home_race
  - championship_gap
  - driver_experience
  - wet_race
  - rolling_avg_finish_5
  - rolling_avg_points_5
  - dnf
  - quali_position
  - rolling_avg_quali_5
  - teammate_quali_gap
  - avg_team_pit_seconds
  - dnf_rate


In [55]:
df_master.to_csv('../data/master_features_2025.csv', index=False)

import os
size = os.path.getsize('../data/master_features_2025.csv')
print(f"Size: {size/1024:.1f} KB")
print(f"Shape: {df_master.shape}")
print(f"Total features: {df_master.shape[1]}")

Size: 101.5 KB
Shape: (479, 34)
Total features: 34


In [56]:
# Check which row has missing qualifying data
missing_quali = df_master[df_master['quali_position'].isna()]
print("Missing qualifying data:")
print(missing_quali[['round', 'driver', 'race_name', 'grid_position']].to_string())

Missing qualifying data:
     round driver             race_name  grid_position
182     21    BOR  São Paulo Grand Prix             18


In [57]:
# Fill with Bortoleto's average qualifying position for the season
bor_avg_quali = df_master[
    (df_master['driver'] == 'BOR') & 
    (df_master['round'] != 21)
]['quali_position'].mean()

bor_avg_rolling = df_master[
    (df_master['driver'] == 'BOR') & 
    (df_master['round'] < 21)
]['rolling_avg_quali_5'].iloc[-1]

df_master.loc[(df_master['driver'] == 'BOR') & (df_master['round'] == 21), 'quali_position'] = bor_avg_quali
df_master.loc[(df_master['driver'] == 'BOR') & (df_master['round'] == 21), 'rolling_avg_quali_5'] = bor_avg_rolling
df_master.loc[(df_master['driver'] == 'BOR') & (df_master['round'] == 21), 'teammate_quali_gap'] = 0

print(f"Filled with Bortoleto's season average quali position: {bor_avg_quali:.1f}")

# Verify
missing = df_master.isnull().sum()
print(missing[missing > 0] if missing[missing > 0].any() else "✅ No missing values!")

df_master.to_csv('../data/master_features_2025.csv', index=False)

Filled with Bortoleto's season average quali position: 14.0
fastest_lap_rank    22
dtype: int64


In [59]:
# Use 21 for missing fastest_lap_rank
# (21 means "outside top 20")
df_master['fastest_lap_rank'] = df_master['fastest_lap_rank'].fillna(21)

# Verify
missing = df_master.isnull().sum()
print(missing[missing > 0] if missing[missing > 0].any() else "No missing values!")

df_master.to_csv('../data/master_features_2025.csv', index=False)

print(f"Final shape: {df_master.shape}")

No missing values!
Final shape: (479, 34)
