In [1]:
import pandas as pd
import matplotlib
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier

In [2]:
df_constructors = pd.read_csv('f1data/constructors.csv')
df_drivers = pd.read_csv('f1data/drivers.csv')
raceID = pd.read_csv('f1data/races.csv')
df_results = pd.read_csv('f1data/results.csv')

In [3]:
print(df_results.columns)

# Extract only relevant information about the race for training purposes
df_race = raceID[["raceId", "year", "round", "circuitId"]].copy()

Index(['resultId', 'raceId', 'driverId', 'constructorId', 'number', 'grid',
       'position', 'positionText', 'positionOrder', 'points', 'laps', 'time',
       'milliseconds', 'fastestLap', 'rank', 'fastestLapTime',
       'fastestLapSpeed', 'statusId'],
      dtype='object')


Reasons to analyze data from 2014 and onwards:
    1. Modern era relevance: 2014 marked the beginning of the turbo-hybrid era in Formula 1, which brought significant changes to car design, power units, and overall performance. This era is most relevant to current racing conditions.
    2. Technological advancements: The sport has seen rapid technological progress in recent years. Data from before 2014 may not accurately reflect current car capabilities, aerodynamics, or energy recovery systems.
    3. Rule changes: Major regulation changes were implemented in 2014, affecting aerodynamics, fuel consumption, and other aspects of car design. These rules have shaped the current competitive landscape.
    4. Team dynamics: The performance of teams and their hierarchies have shifted significantly since 2014. Using data from this point onwards would provide a more accurate picture of current team strengths and weaknesses.
    5. Driver line-ups: Many current drivers entered F1 around or after 2014. This data set would include most relevant information about current driver performances.
    6. Data quality and quantity: More recent data is likely to be more detailed and comprehensive, offering better insights for prediction modeling.
    7. Sufficient historical context: A decade of data (2014-2024) provides enough historical context to identify trends and patterns without being overly influenced by outdated information.

In [4]:
df_race = df_race.sort_values(by=['year', 'round'])
df_race = df_race[df_race["year"] >= 2014]
print(df_race)

      raceId  year  round  circuitId
897      900  2014      1          1
898      901  2014      2          2
899      902  2014      3          3
900      903  2014      4         17
901      904  2014      5          4
...      ...   ...    ...        ...
1120    1140  2024     20         32
1121    1141  2024     21         18
1122    1142  2024     22         80
1123    1143  2024     23         78
1124    1144  2024     24         24

[228 rows x 4 columns]


In [5]:
df_res = df_results[['raceId', 'driverId', 'constructorId', 'grid', 'positionOrder']].copy()
print(df_res)
duplicates = df_race.duplicated()
num_duplicates = duplicates.sum()
print(f"Number of duplicate rows: {num_duplicates}")
print(df_race)

       raceId  driverId  constructorId  grid  positionOrder
0          18         1              1     1              1
1          18         2              2     5              2
2          18         3              3     7              3
3          18         4              4    11              4
4          18         5              1     3              5
...       ...       ...            ...   ...            ...
26514    1132       839            214    18             16
26515    1132       815              9     0             17
26516    1132       855             15    14             18
26517    1132       847            131     1             19
26518    1132       842            214    19             20

[26519 rows x 5 columns]
Number of duplicate rows: 0
      raceId  year  round  circuitId
897      900  2014      1          1
898      901  2014      2          2
899      902  2014      3          3
900      903  2014      4         17
901      904  2014      5          4
... 

In [6]:
def calculate_rolling_average_points(df_results, n_races=244, start_race_id=900):
    # Ensure the dataframe is sorted by raceId
    df_results = df_results.sort_values('raceId')
    
    # Filter for races from 2014 onwards
    df_results = df_results[df_results['raceId'] >= start_race_id]
    
    # Group by driver and sort by raceId
    grouped = df_results.groupby('driverId')
    
    # Function to calculate rolling average for a group
    def rolling_avg(group):
        return group['points'].rolling(window=n_races, min_periods=1).mean()
    
    # Apply rolling average calculation
    df_results['rolling_avg_points'] = grouped.apply(rolling_avg).reset_index(level=0, drop=True)
    
    return df_results


# Calculate rolling average for last 5 races
results_with_rolling_avg = calculate_rolling_average_points(df_results, n_races=5)

# Display the results
print(results_with_rolling_avg[['raceId', 'driverId', 'points', 'rolling_avg_points']])

       raceId  driverId  points  rolling_avg_points
22142     900       813     0.0                 0.0
22148     900       817     0.0                 0.0
22147     900       155     0.0                 0.0
22146     900        13     0.0                 0.0
22145     900         1     0.0                 0.0
...       ...       ...     ...                 ...
26500    1132       830    18.0                17.2
26499    1132         1    25.0                14.4
26517    1132       847     0.0                12.4
26507    1132       848     2.0                 0.8
26518    1132       842     0.0                 1.2

[4386 rows x 4 columns]


  df_results['rolling_avg_points'] = grouped.apply(rolling_avg).reset_index(level=0, drop=True)


In [7]:
import pandas as pd
import numpy as np

def calculate_relative_position_improvement(df, driver_id_col, race_id_col, start_position_col, finish_position_col, n_races=None):
    # Create a copy of the DataFrame to avoid modifying the original
    df = df.copy()
    
    # Sort the DataFrame by driver and race ID
    df = df.sort_values([driver_id_col, race_id_col])
    
    # Ensure start and finish positions are numeric
    df[start_position_col] = pd.to_numeric(df[start_position_col], errors='coerce')
    df[finish_position_col] = pd.to_numeric(df[finish_position_col], errors='coerce')
    
    # Calculate position improvement for each race
    df['position_improvement'] = df[start_position_col] - df[finish_position_col]
    
    # Function to calculate average improvement
    def avg_improvement(group):
        if n_races is None:
            return group.mean()
        else:
            return group.tail(n_races).mean()
    
    # Calculate average improvement for each driver
    avg_improvements = df.groupby(driver_id_col)['position_improvement'].transform(avg_improvement)
    
    # Add the result to the DataFrame
    df['avg_position_improvement'] = avg_improvements
    
    return df

n_races = 5  # Calculate average over the last 5 races, or use None for all races

df_results = calculate_relative_position_improvement(df_results, 'driverId', 'raceId', 'grid', 'position', n_races)
print(df_results.head())

      resultId  raceId  driverId  constructorId number  grid  position  \
7572      7573       1         1              1      1    18       NaN   
7579      7580       2         1              1      1    12       7.0   
7598      7599       3         1              1      1     9       6.0   
7616      7617       4         1              1      1     5       4.0   
7641      7642       5         1              1      1    14       9.0   

     positionText  positionOrder  points  laps       time milliseconds  \
7572            D             20     0.0    58         \N           \N   
7579            7              7     1.0    31  +1:00.733      4312825   
7598            6              6     3.0    56  +1:11.866      7135351   
7616            4              4     5.0    57    +22.096      5530278   
7641            9              9     0.0    65         \N           \N   

     fastestLap rank fastestLapTime fastestLapSpeed  statusId  \
7572         39   13       1:29.020         2

In [9]:
import pandas as pd
import numpy as np

def calculate_driver_form(df, driver_id_col, race_id_col, finish_position_col, 
                          qualifying_position_col, points_col, n_races=5, 
                          race_weight=0.6, quali_weight=0.4):
    """
    Calculate a driver's form based on recent race results and qualifying performances.
    
    Parameters:
    df (pd.DataFrame): DataFrame containing race and qualifying results
    driver_id_col (str): Name of the column containing driver IDs
    race_id_col (str): Name of the column containing race IDs
    finish_position_col (str): Name of the column containing race finishing positions
    qualifying_position_col (str): Name of the column containing qualifying positions
    points_col (str): Name of the column containing points scored
    n_races (int): Number of recent races to consider (default: 5)
    race_weight (float): Weight given to race performance (default: 0.6)
    quali_weight (float): Weight given to qualifying performance (default: 0.4)
    
    Returns:
    pd.DataFrame: Original DataFrame with an additional column for the form metric
    """
    
    # Create a copy of the DataFrame to avoid modifying the original
    df = df.copy()
    
    # Sort the DataFrame by driver and race ID
    df = df.sort_values([driver_id_col, race_id_col])
    
    # Ensure relevant columns are numeric
    for col in [finish_position_col, qualifying_position_col, points_col]:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    
    # Calculate the maximum points scored in a single race
    max_points = df[points_col].max()
    
    # Function to calculate form for a group of races
    def calculate_form(group):
        # Use only the last n_races
        recent_races = group.tail(n_races)
        
        # Calculate race performance (inverse of finishing position, normalized)
        race_performance = (21 - recent_races[finish_position_col]) / 20
        
        # Calculate qualifying performance (inverse of qualifying position, normalized)
        quali_performance = (21 - recent_races[qualifying_position_col]) / 20
        
        # Calculate points performance (normalized by max points)
        points_performance = recent_races[points_col] / max_points
        
        # Combine race and qualifying performance
        performance = (race_performance * race_weight + 
                       quali_performance * quali_weight + 
                       points_performance) / 3
        
        # Calculate form as the mean performance over recent races
        return performance.mean()
    
    # Calculate form for each driver
    df['form'] = df.groupby(driver_id_col).apply(calculate_form).reset_index(level=0, drop=True)
    
    return df

# Example usage:
# Assuming you have a DataFrame 'race_results' with columns 'driverId', 'raceId', 'position', 'qualifyingPosition', and 'points'
n_races = 5  # Calculate form based on the last 5 races

df_results = calculate_driver_form(df_results, 'driverId', 'raceId', 'position', 'grid', 'points', n_races)
print(df_results.head())

      resultId  raceId  driverId  constructorId number  grid  position  \
7572      7573       1         1              1      1    18       NaN   
7579      7580       2         1              1      1    12       7.0   
7598      7599       3         1              1      1     9       6.0   
7616      7617       4         1              1      1     5       4.0   
7641      7642       5         1              1      1    14       9.0   

     positionText  positionOrder  points  ...       time milliseconds  \
7572            D             20     0.0  ...         \N           \N   
7579            7              7     1.0  ...  +1:00.733      4312825   
7598            6              6     3.0  ...  +1:11.866      7135351   
7616            4              4     5.0  ...    +22.096      5530278   
7641            9              9     0.0  ...         \N           \N   

     fastestLap rank fastestLapTime fastestLapSpeed statusId  \
7572         39   13       1:29.020         214.455 

  df['form'] = df.groupby(driver_id_col).apply(calculate_form).reset_index(level=0, drop=True)


In [17]:
valid_form_results = df_results[df_results['form'].notna()].sort_values('form', ascending=False)
race_results = pd.read_csv('f1data/results.csv')
drivers = pd.read_csv('f1data/drivers.csv')
races = pd.read_csv('f1data/races.csv')


race_results = calculate_driver_form(race_results, 'driverId', 'raceId', 'position', 'grid', 'points', n_races=5)

# Merge race results with driver information
race_results = race_results.merge(drivers[['driverId', 'driverRef']], on='driverId', how='left')

# Merge with races information
race_results = race_results.merge(races[['raceId', 'name', 'year']], on='raceId', how='left')

# Find the most recent race
most_recent_race_id = race_results['raceId'].max()

# Filter for only the most recent race and sort by finishing position
most_recent_results = race_results[race_results['raceId'] == most_recent_race_id].sort_values('position')
race_details = most_recent_results.iloc[0]

print(f"\nMost Recent Race: {race_details['name']} {race_details['year']}")
print(f"Race ID: {most_recent_race_id}")
print(f"Number of Drivers: {len(most_recent_results)}")
print("\n==========================================")

# Print results
print(f"Race Results for the Most Recent Grand Prix (Race ID: {most_recent_race_id}):")
print("==========================================")

for _, race_result in most_recent_results.iterrows():
    print(f"Driver: {race_result['driverRef']} (ID: {race_result['driverId']})")
    print(f"Form: {race_result['form']:.4f}")
    print(f"Finishing Position: {race_result['position']}")
    print(f"Qualifying Position: {race_result['grid']}")
    print(f"Points Scored: {race_result['points']}")
    print("------------------------------------------")

# Optional: Print summary statistics for the most recent race
print("\nSummary Statistics for Form in the Most Recent Race:")
print(most_recent_results['form'].describe())

# Optional: Print race details
race_details = most_recent_results.iloc[0]
print(f"\nRace Details:")
print(f"Name: {race_details['name']}")
print(f"Year: {race_details['year']}")
print(f"Number of Drivers: {len(most_recent_results)}")


Most Recent Race: British Grand Prix 2024
Race ID: 1132
Number of Drivers: 20

Race Results for the Most Recent Grand Prix (Race ID: 1132):
Driver: hamilton (ID: 1)
Form: nan
Finishing Position: 1.0
Qualifying Position: 2
Points Scored: 25.0
------------------------------------------
Driver: max_verstappen (ID: 830)
Form: nan
Finishing Position: 2.0
Qualifying Position: 4
Points Scored: 18.0
------------------------------------------
Driver: norris (ID: 846)
Form: nan
Finishing Position: 3.0
Qualifying Position: 3
Points Scored: 15.0
------------------------------------------
Driver: piastri (ID: 857)
Form: nan
Finishing Position: 4.0
Qualifying Position: 5
Points Scored: 12.0
------------------------------------------
Driver: sainz (ID: 832)
Form: nan
Finishing Position: 5.0
Qualifying Position: 7
Points Scored: 11.0
------------------------------------------
Driver: hulkenberg (ID: 807)
Form: nan
Finishing Position: 6.0
Qualifying Position: 6
Points Scored: 8.0
---------------------

  df['form'] = df.groupby(driver_id_col).apply(calculate_form).reset_index(level=0, drop=True)
