In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# F1 Pit Stop Data Preprocessing

This notebook preprocesses Formula 1 data for pit stop analysis and prediction modeling. We'll clean, encode, and engineer features from the merged F1 dataset to prepare it for machine learning models.

## Table of Contents
1. [Data Loading and Initial Exploration](#data-loading)
2. [Feature Selection](#feature-selection)
3. [Data Cleaning](#data-cleaning)
4. [Categorical Data Encoding](#categorical-encoding)
5. [Feature Engineering](#feature-engineering)
6. [Data Analysis and Validation](#data-analysis)

---

## 1. Data Loading and Initial Exploration

In [2]:
data = pd.read_csv('f1_merged_data.csv')
data.head()

Unnamed: 0,Time,Driver,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,...,Throttle,Brake,DRS,Source,Distance,RelativeDistance,Status,X,Y,Z
0,0 days 00:35:09.122000,VET,5,0 days 00:01:37.542000,1.0,1.0,NaT,NaT,NaT,0 days 00:00:41.422000,...,,,,,,,,,,
1,0 days 00:35:10.524000,BOT,77,0 days 00:01:38.944000,1.0,1.0,NaT,NaT,NaT,0 days 00:00:42.430000,...,,,,,,,,,,
2,0 days 00:35:11.046000,LEC,16,0 days 00:01:39.466000,1.0,1.0,NaT,NaT,NaT,0 days 00:00:42.805000,...,,,,,,,,,,
3,0 days 00:35:11.521000,HAM,44,0 days 00:01:39.941000,1.0,1.0,NaT,NaT,NaT,0 days 00:00:42.993000,...,,,,,,,,,,
4,0 days 00:35:12.750000,VER,33,0 days 00:01:41.170000,1.0,1.0,NaT,NaT,NaT,0 days 00:00:43.258000,...,,,,,,,,,,


In [3]:
data.columns

Index(['Time', 'Driver', 'DriverNumber', 'LapTime', 'LapNumber', 'Stint',
       'PitOutTime', 'PitInTime', 'Sector1Time', 'Sector2Time', 'Sector3Time',
       'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime',
       'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST', 'IsPersonalBest',
       'Compound', 'TyreLife', 'FreshTyre', 'Team', 'LapStartTime',
       'LapStartDate', 'TrackStatus', 'Position', 'Deleted', 'DeletedReason',
       'FastF1Generated', 'IsAccurate', 'Year_x', 'RaceName_x',
       'SessionType_x', 'AirTemp', 'Humidity', 'Pressure', 'Rainfall',
       'TrackTemp', 'WindDirection', 'WindSpeed', 'Year_y', 'RaceName_y',
       'SessionType_y', 'Date', 'SessionTime', 'DriverAhead',
       'DistanceToDriverAhead', 'RPM', 'Speed', 'nGear', 'Throttle', 'Brake',
       'DRS', 'Source', 'Distance', 'RelativeDistance', 'Status', 'X', 'Y',
       'Z'],
      dtype='object')

---

## 2. Feature Selection

We select relevant features for pit stop analysis from the merged F1 dataset. These features include:

- **Driver Information**: Driver, LapNumber, RaceName_x, Year_x
- **Tire Data**: TyreLife, Compound, FreshTyre  
- **Performance Metrics**: LapTime, Sector1Time, Sector2Time, Sector3Time
- **Pit Stop Data**: PitInTime, PitOutTime
- **Environmental Conditions**: AirTemp, TrackTemp, Humidity, Rainfall
- **Race Position**: Position

In [None]:
selected_features = [
    'Driver','LapNumber', 'RaceName_x','Year_x',
    'TyreLife', 'Compound', 'FreshTyre',
    'LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time','PitInTime','PitOutTime',
    'AirTemp', 'TrackTemp', 'Humidity', 'Rainfall',
    'Position', 
]
df = data[selected_features]
df.head()


Unnamed: 0,Driver,LapNumber,RaceName_x,Year_x,LapTime,TyreLife,Compound,FreshTyre,LapTime.1,Sector1Time,Sector2Time,Sector3Time,PitInTime,PitOutTime,AirTemp,TrackTemp,Humidity,Rainfall,Position
0,VET,1.0,Bahrain,2019,0 days 00:01:37.542000,4.0,SOFT,False,0 days 00:01:37.542000,NaT,0 days 00:00:41.422000,0 days 00:00:23.828000,NaT,NaT,26.1,29.2,54.1,False,1.0
1,BOT,1.0,Bahrain,2019,0 days 00:01:38.944000,4.0,SOFT,False,0 days 00:01:38.944000,NaT,0 days 00:00:42.430000,0 days 00:00:23.703000,NaT,NaT,26.1,29.2,54.1,False,2.0
2,LEC,1.0,Bahrain,2019,0 days 00:01:39.466000,4.0,SOFT,False,0 days 00:01:39.466000,NaT,0 days 00:00:42.805000,0 days 00:00:23.773000,NaT,NaT,26.1,29.2,54.1,False,3.0
3,HAM,1.0,Bahrain,2019,0 days 00:01:39.941000,4.0,SOFT,False,0 days 00:01:39.941000,NaT,0 days 00:00:42.993000,0 days 00:00:23.801000,NaT,NaT,26.1,29.2,54.1,False,4.0
4,VER,1.0,Bahrain,2019,0 days 00:01:41.170000,4.0,SOFT,False,0 days 00:01:41.170000,NaT,0 days 00:00:43.258000,0 days 00:00:24.244000,NaT,NaT,26.1,29.2,54.1,False,5.0


In [5]:

df.isna().sum()

Driver          0
LapNumber       0
RaceName_x      0
Year_x          0
LapTime         0
TyreLife        0
Compound        0
FreshTyre       0
LapTime         0
Sector1Time     0
Sector2Time     0
Sector3Time     0
PitInTime       0
PitOutTime      0
AirTemp         0
TrackTemp       0
Humidity        0
Rainfall        0
Position       68
dtype: int64

In [6]:
df = df.dropna()

  df['LapTime'] = df['LapTime'].apply(lambda x: str(x)[7:])  # Removes '0 days '


---

## 3. Data Cleaning

We handle missing values by removing rows with any null values. This ensures we have complete data for all features across all observations.

**Note**: In production, you might want to use more sophisticated imputation methods depending on the missing data patterns.

In [None]:
# Check data types and identify categorical columns
print("Data types:")
print(df.dtypes)
print("\nUnique values in categorical columns:")

# Identify categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
print(f"\nCategorical columns found: {categorical_columns}")

# Check unique values for each categorical column
for col in categorical_columns:
    print(f"\n{col}: {df[col].unique()}")
    print(f"Number of unique values: {df[col].nunique()}")

Data types:
Driver          object
LapNumber      float64
RaceName_x      object
Year_x           int64
LapTime         object
TyreLife       float64
Compound        object
FreshTyre         bool
LapTime         object
Sector1Time     object
Sector2Time     object
Sector3Time     object
PitInTime       object
PitOutTime      object
AirTemp        float64
TrackTemp      float64
Humidity       float64
Rainfall          bool
Position       float64
dtype: object

Unique values in categorical columns:

Categorical columns found: ['Driver', 'RaceName_x', 'LapTime', 'Compound', 'LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time', 'PitInTime', 'PitOutTime']

Driver: ['VET' 'BOT' 'LEC' 'HAM' 'VER' 'SAI' 'MAG' 'RAI' 'RIC' 'PER' 'HUL' 'GAS'
 'ALB' 'NOR' 'GIO' 'KVY' 'KUB' 'RUS' 'STR' 'GRO' 'OCO' 'LAT' 'ALO' 'TSU'
 'MSC' 'MAZ' 'ZHO' 'DEV']
Number of unique values: 28

RaceName_x: ['Bahrain' 'Styria' '70th Anniversary' 'Italy' 'Monaco' 'Saudi Arabian'
 'Canada' 'Spain' 'Emilia Romagna' 'Azerbaija

AttributeError: 'DataFrame' object has no attribute 'unique'

In [None]:
categorical_columns.remove('Driver')  # 'Driver' is not a feature for analysis
categorical_columns.remove('RaceName_x')  # 'RaceName_x' is not a feature for analysis      

---

## 4. Categorical Data Encoding

Before machine learning algorithms can process our data, we need to convert categorical variables into numerical format. We exclude `Driver` and `RaceName_x` from encoding as they are identifiers rather than features for analysis.

### Encoding Strategy:
- **Label Encoding**: For categorical variables with inherent order
- **Binary Encoding**: For boolean variables like `FreshTyre` and `Rainfall`

The encoding process creates new numerical columns while preserving the original categorical information for reference.

In [None]:
# Import encoding libraries
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pandas as pd

# Create a copy of the dataframe for encoding
df_encoded = df.copy()

print("=== ENCODING CATEGORICAL DATA ===\n")

# Method 1: Label Encoding (for ordinal data or when you want simple numeric mapping)
print("1. LABEL ENCODING:")
print("-" * 20)

label_encoders = {}
for col in categorical_columns:
    if col in df_encoded.columns:
        le = LabelEncoder()
        df_encoded[f'{col}_label'] = le.fit_transform(df_encoded[col])
        label_encoders[col] = le
        
        print(f"{col} -> {col}_label:")
        print(f"  Original: {df_encoded[col].unique()}")
        print(f"  Encoded:  {df_encoded[f'{col}_label'].unique()}")
        print()

# Display the mapping for reference
print("Label encoding mappings:")
for col, encoder in label_encoders.items():
    mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
    print(f"{col}: {mapping}")
    print()



=== ENCODING CATEGORICAL DATA ===

1. LABEL ENCODING:
--------------------
Compound -> Compound_label:
  Original: ['SOFT' 'MEDIUM' 'HARD' 'WET' 'INTERMEDIATE']
  Encoded:  [3 2 0 4 1]

LapTime -> LapTime_label:
  Original: ['0 days 00:01:37.542000' '0 days 00:01:38.944000'
 '0 days 00:01:39.466000' ... '0 days 00:02:01.697000'
 '0 days 00:02:00.319000' '0 days 00:01:57.565000']
  Encoded:  [17057 17763 17980 ... 21570 21484 21284]

Sector1Time -> Sector1Time_label:
  Original: ['NaT' '0 days 00:00:30.628000' '0 days 00:00:30.754000' ...
 '0 days 00:00:38.920000' '0 days 00:00:37.152000'
 '0 days 00:00:34.610000']
  Encoded:  [14560  9188  9311 ... 12481 11545 10992]

Sector2Time -> Sector2Time_label:
  Original: ['0 days 00:00:41.422000' '0 days 00:00:42.430000'
 '0 days 00:00:42.805000' ... '0 days 00:00:44.380000'
 '0 days 00:00:43.830000' '0 days 00:00:43.761000']
  Encoded:  [11192 12005 12334 ... 13540 13185 13133]

Sector1Time -> Sector1Time_label:
  Original: ['NaT' '0 days 00:

In [None]:
categorical_columns.remove('PitInTime')
df_encoded= df_encoded.drop(columns=categorical_columns)
df_encoded.head()

Unnamed: 0,Driver,LapNumber,RaceName_x,Year_x,TyreLife,FreshTyre,PitInTime,AirTemp,TrackTemp,Humidity,Rainfall,Position,Compound_label,LapTime_label,Sector1Time_label,Sector2Time_label,Sector3Time_label,PitInTime_label,PitOutTime_label
0,VET,1.0,Bahrain,2019,4.0,False,NaT,26.1,29.2,54.1,False,1.0,3,17057,14560,11192,3216,1250,1214
1,BOT,1.0,Bahrain,2019,4.0,False,NaT,26.1,29.2,54.1,False,2.0,3,17763,14560,12005,3092,1250,1214
2,LEC,1.0,Bahrain,2019,4.0,False,NaT,26.1,29.2,54.1,False,3.0,3,17980,14560,12334,3161,1250,1214
3,HAM,1.0,Bahrain,2019,4.0,False,NaT,26.1,29.2,54.1,False,4.0,3,18145,14560,12497,3189,1250,1214
4,VER,1.0,Bahrain,2019,4.0,False,NaT,26.1,29.2,54.1,False,5.0,3,18492,14560,12720,3631,1250,1214


In [None]:
if 'FreshTyre' in df_encoded.columns:
    df_encoded['FreshTyre_encoded'] = df_encoded['FreshTyre'].astype(int)
    print("FreshTyre encoding:")
    print("True -> 1, False -> 0")
    print()

if 'Rainfall' in df_encoded.columns:
    df_encoded['Rainfall_encoded'] = df_encoded['Rainfall'].astype(int)
    print("FreshTyre encoding:")
    print("True -> 1, False -> 0")
    print()

FreshTyre encoding:
True -> 1, False -> 0

FreshTyre encoding:
True -> 1, False -> 0



In [None]:
df_encoded= df_encoded.drop(columns=['FreshTyre', 'Rainfall'])
df_encoded.head()

Unnamed: 0,Driver,LapNumber,RaceName_x,Year_x,TyreLife,PitInTime,AirTemp,TrackTemp,Humidity,Position,Compound_label,LapTime_label,Sector1Time_label,Sector2Time_label,Sector3Time_label,PitInTime_label,PitOutTime_label,FreshTyre_encoded,Rainfall_encoded
0,VET,1.0,Bahrain,2019,4.0,NaT,26.1,29.2,54.1,1.0,3,17057,14560,11192,3216,1250,1214,0,0
1,BOT,1.0,Bahrain,2019,4.0,NaT,26.1,29.2,54.1,2.0,3,17763,14560,12005,3092,1250,1214,0,0
2,LEC,1.0,Bahrain,2019,4.0,NaT,26.1,29.2,54.1,3.0,3,17980,14560,12334,3161,1250,1214,0,0
3,HAM,1.0,Bahrain,2019,4.0,NaT,26.1,29.2,54.1,4.0,3,18145,14560,12497,3189,1250,1214,0,0
4,VER,1.0,Bahrain,2019,4.0,NaT,26.1,29.2,54.1,5.0,3,18492,14560,12720,3631,1250,1214,0,0


### Boolean Variables Encoding

We convert boolean variables (`FreshTyre`, `Rainfall`) to binary numerical format:
- `True` → `1` (Fresh tire / Raining)
- `False` → `0` (Used tire / Dry conditions)

After encoding, we remove the original categorical columns to avoid redundancy.

---

## 5. Data Analysis and Validation

### Value Ranges Analysis

We examine the intervals and ranges of all numerical columns to:
- **Understand data distribution**: Identify the scale and spread of each feature
- **Detect outliers**: Spot unusual values that might need attention
- **Plan feature scaling**: Determine if normalization/standardization is needed
- **Validate encoding**: Ensure categorical encoding worked correctly

In [None]:

print("="*60)
print("VALUE INTERVALS/RANGES FOR EACH COLUMN")
print("="*60)

for column in df_encoded.columns:
   
   
    if df_encoded[column].dtype in ['int64', 'float64', 'int32', 'float32']:
        min_val = df_encoded[column].min()
        max_val = df_encoded[column].max()
        
        
        print(f"  {column} Range: [{min_val:.4f} to {max_val:.4f}]")
   

VALUE INTERVALS/RANGES FOR EACH COLUMN
  LapNumber Range: [1.0000 to 78.0000]
  Year_x Range: [2019.0000 to 2022.0000]
  TyreLife Range: [1.0000 to 68.0000]
  AirTemp Range: [9.4000 to 35.3000]
  TrackTemp Range: [13.8000 to 55.1000]
  Humidity Range: [12.9000 to 89.6000]
  Position Range: [1.0000 to 20.0000]
  Compound_label Range: [0.0000 to 4.0000]
  LapTime_label Range: [0.0000 to 22746.0000]
  Sector1Time_label Range: [0.0000 to 14560.0000]
  Sector2Time_label Range: [0.0000 to 15647.0000]
  Sector3Time_label Range: [0.0000 to 13720.0000]
  PitInTime_label Range: [0.0000 to 1250.0000]
  PitOutTime_label Range: [0.0000 to 1214.0000]
  FreshTyre_encoded Range: [0.0000 to 1.0000]
  Rainfall_encoded Range: [0.0000 to 1.0000]


---

## 6. Feature Engineering

### NextPit Column Creation

We create a crucial feature for pit stop prediction: `NextPit` - the number of laps until the next pit stop.

#### Key Features:
- **Multi-dimensional Grouping**: Groups by Year → Race → Driver → Lap for accurate calculations
- **Race Boundary Respect**: Ensures no cross-race or cross-year lap calculations
- **Pit Stop Detection**: Identifies actual pit stops using `PitInTime` data
- **Countdown Logic**: Calculates remaining laps until each driver's next pit stop


In [None]:
def add_next_pit_column(df):
   
    
    # Create a copy to avoid modifying the original dataframe
    df_with_nextpit = df.copy()
    
    # Sort by Year, Race, Driver, and LapNumber to ensure proper order
    df_with_nextpit = df_with_nextpit.sort_values(['Year_x', 'RaceName_x', 'Driver', 'LapNumber'])
    
    # Initialize NextPit column
    df_with_nextpit['NextPit'] = None
    
    print("🔧 Calculating laps until next pit stop...")
    
    # Group by year, race AND driver to calculate next pit for each driver in each race of each year
    race_count = 0
    total_pit_stops = 0
    
    for year in df_with_nextpit['Year_x'].unique():
        year_data = df_with_nextpit[df_with_nextpit['Year_x'] == year]
        
        for race in year_data['RaceName_x'].unique():
            race_count += 1
            print(f"   Processing race {race_count}: {year} - {race}")
            
            race_data = year_data[year_data['RaceName_x'] == race]
            
            for driver in race_data['Driver'].unique():
                # Get data for this specific driver in this specific race and year
                mask = (df_with_nextpit['Year_x'] == year) & (df_with_nextpit['RaceName_x'] == race) & (df_with_nextpit['Driver'] == driver)
                driver_race_data = df_with_nextpit[mask].copy().sort_values('LapNumber')


                
                if len(driver_race_data) > 0:
                    
                    # Identify pit stop laps (where PitInTime or PitOutTime is not null/zero)
                    pit_stop_laps = []
                    
                    for idx, row in driver_race_data.iterrows():
                        
                        has_pit_in = row['PitInTime'] != 'NaT'
                        
                        
                        if has_pit_in :
                            pit_stop_laps.append(row['LapNumber'])
                    
                    total_pit_stops += len(pit_stop_laps)
                    
                    # For each lap, calculate laps until next pit stop
                    nextpit_values = []
                    
                    for idx, row in driver_race_data.iterrows():
                        current_lap = row['LapNumber']
                        
                        # Find the next pit stop lap after current lap
                        next_pit_lap = None
                        for pit_lap in pit_stop_laps:
                            if pit_lap > current_lap:
                                next_pit_lap = pit_lap
                                break
                        
                        if next_pit_lap is not None:
                            # Calculate laps until next pit
                            laps_to_pit = next_pit_lap - current_lap
                            nextpit_values.append(laps_to_pit)
                        else:
                            # No more pit stops in this race for this driver
                            nextpit_values.append(None)
                    
                    # Update the main dataframe
                    df_with_nextpit.loc[mask, 'NextPit'] = nextpit_values
    
    print(f"   Found {total_pit_stops} pit stops across all years, races and drivers")
    return df_with_nextpit

# Apply the function to our encoded dataframe
print("="*60)
print("ADDING NEXTPIT COLUMN")
print("="*60)

df_encoded_with_nextpit = add_next_pit_column(df_encoded)

print("✅ NextPit column added successfully!")
print(f"Original shape: {df_encoded.shape}")
print(f"New shape: {df_encoded_with_nextpit.shape}")

# Update main dataframe
df_encoded = df_encoded_with_nextpit.copy()

ADDING NEXTPIT COLUMN
🔧 Calculating laps until next pit stop...
   Processing race 1: 2019 - 70th Anniversary
   Processing race 2: 2019 - Azerbaijan
   Processing race 2: 2019 - Azerbaijan
   Processing race 3: 2019 - Bahrain
   Processing race 3: 2019 - Bahrain
   Processing race 4: 2019 - Canada
   Processing race 4: 2019 - Canada
   Processing race 5: 2019 - Emilia Romagna
   Processing race 5: 2019 - Emilia Romagna
   Processing race 6: 2019 - Italy
   Processing race 6: 2019 - Italy
   Processing race 7: 2019 - Monaco
   Processing race 7: 2019 - Monaco
   Processing race 8: 2019 - Saudi Arabian
   Processing race 8: 2019 - Saudi Arabian
   Processing race 9: 2019 - Spain
   Processing race 9: 2019 - Spain
   Processing race 10: 2019 - Styria
   Processing race 10: 2019 - Styria
   Processing race 11: 2020 - 70th Anniversary
   Processing race 11: 2020 - 70th Anniversary
   Processing race 12: 2020 - Azerbaijan
   Processing race 12: 2020 - Azerbaijan
   Processing race 13: 2020 

In [None]:
df_encoded_with_nextpit['NextPit'].value_counts().sort_index()

NextPit
1.0     1290
2.0     1202
3.0     1148
4.0     1128
5.0     1086
        ... 
59.0       2
60.0       1
61.0       1
62.0       1
63.0       1
Name: count, Length: 63, dtype: int64

In [None]:
df_encoded_with_nextpit.head()

Unnamed: 0,Driver,LapNumber,RaceName_x,Year_x,TyreLife,PitInTime,AirTemp,TrackTemp,Humidity,Position,Compound_label,LapTime_label,Sector1Time_label,Sector2Time_label,Sector3Time_label,PitInTime_label,PitOutTime_label,FreshTyre_encoded,Rainfall_encoded,NextPit
2496,ALB,1.0,70th Anniversary,2019,1.0,NaT,25.9,43.2,36.1,13.0,2,16178,14560,7318,4174,1250,1214,1,0,27.0
2516,ALB,2.0,70th Anniversary,2019,2.0,NaT,26.0,42.9,34.9,13.0,2,11046,9173,5758,3506,1250,1214,1,0,26.0
2536,ALB,3.0,70th Anniversary,2019,3.0,NaT,26.0,41.8,34.5,13.0,2,10220,8952,5122,3396,1250,1214,1,0,25.0
2556,ALB,4.0,70th Anniversary,2019,4.0,NaT,25.8,41.3,35.2,13.0,2,9845,8386,5348,3313,1250,1214,1,0,24.0
2576,ALB,5.0,70th Anniversary,2019,5.0,NaT,25.5,40.9,35.8,13.0,2,9603,8425,4959,3381,1250,1214,1,0,23.0


In [None]:
import os


os.makedirs('processed_data', exist_ok=True)

# Save the dataframe as CSV
df_encoded.to_csv('processed_data/df_encoded.csv', index=False)
print("df_encoded saved to 'processed_data/df_encoded.csv'")

df_encoded saved to 'processed_data/df_encoded.csv'


---

## 7. Summary and Conclusion

### Data Preprocessing Complete ✅

We have successfully preprocessed the F1 pit stop dataset with the following transformations:

#### Key Accomplishments:
1. **✅ Data Loading**: Loaded and explored the merged F1 dataset
2. **✅ Feature Selection**: Selected relevant features for pit stop analysis
3. **✅ Data Cleaning**: Handled missing values through complete case analysis
4. **✅ Categorical Encoding**: Converted categorical variables to numerical format
5. **✅ Feature Engineering**: Created the crucial `NextPit` countdown feature
6. **✅ Data Validation**: Verified data quality and feature engineering accuracy

#### Final Dataset Features:
- **Driver Information**: Driver, LapNumber, RaceName_x, Year_x
- **Performance Metrics**: LapTime, Sector times, Position
- **Tire Data**: TyreLife, Compound (encoded), FreshTyre (binary)
- **Environmental**: AirTemp, TrackTemp, Humidity, Rainfall (binary)
- **Pit Stop Data**: PitInTime, NextPit (engineered feature)


### Next Steps:
- **Exploratory Data Analysis (EDA)**: Visualize relationships and patterns
- **Feature Selection**: Identify most important predictive features
- **Model Development**: Build pit stop prediction models
- **Model Evaluation**: Assess performance and validate results

