In [43]:
import pandas as pd
df = pd.read_csv("lap_times_2025_round_15.csv")
# Report what was loaded: the (rows, columns) tuple, then the header names.
print(f"Initial dataset shape: {df.shape}")
print("Columns present:", list(df.columns))

Initial dataset shape: (1364, 8)
Columns present: ['Driver', 'Lap Number', 'Lap Time', 'Position', 'Time', 'Sector 1', 'Sector 2', 'Sector 3']


In [44]:
# Columns holding timing values; a lap is kept only when all of them are present.
time_columns = ['Lap Time', 'Sector 1', 'Sector 2', 'Sector 3']

# Discard every row that is missing at least one of the timing values
# (axis/how are the dropna defaults, spelled out for the reader).
df = df.dropna(axis=0, how='any', subset=time_columns)
print(f"Rows with blank times deleted. New shape: {df.shape}")

Rows with blank times deleted. New shape: (1342, 8)


In [45]:
# Drop the 'Position' column, reporting instead of failing when it is absent.
if 'Position' not in df.columns:
    print("'Position' column not found in dataset.")
else:
    df = df.drop('Position', axis=1)
    print("'Position' column deleted.")

'Position' column deleted.


In [46]:
# Drop the 'Time' column, reporting instead of failing when it is absent.
if 'Time' not in df.columns:
    print("'Time' column not found in dataset.")
else:
    df = df.drop('Time', axis=1)
    print("'Time' column deleted.")

'Time' column deleted.


In [47]:
def clean_lap_time(time_str):
    """
    Convert a raw timedelta-style string (e.g., '0 days 00:01:19.680000')
    to a minute-based 'MM:SS.mmm' string (e.g., '01:19.680').

    Only the last whitespace-separated token ('HH:MM:SS[.ffffff]') is used;
    any leading 'N days' part is ignored. Hours are folded into the minutes
    field so a time of an hour or more is not silently truncated
    ('01:01:19.5' -> '61:19.500'). The fractional part is truncated (not
    rounded) to three digits and zero-padded, so a value with no fraction
    (e.g., '0 days 00:01:19') yields '01:19.000' instead of being rejected.

    Parameters
    ----------
    time_str : str
        Raw time value. Non-string values (NaN, None, numbers) are rejected.

    Returns
    -------
    str or None
        The formatted time, or None when the input cannot be parsed.
    """
    # Non-string cells (NaN, None, numbers) cannot be parsed.
    if not isinstance(time_str, str):
        return None

    tokens = time_str.split()
    if not tokens:
        return None

    # The clock portion must be exactly hours:minutes:seconds.
    clock_parts = tokens[-1].split(':')
    if len(clock_parts) != 3:
        return None
    hours, minutes, seconds = clock_parts

    # partition() tolerates a missing '.', leaving the fraction empty.
    whole_seconds, _, fraction = seconds.partition('.')

    # Reject signs, blanks and non-numeric junk explicitly rather than
    # relying on a catch-all exception handler.
    if not all(field.isdecimal() for field in (hours, minutes, whole_seconds)):
        return None
    if fraction and not fraction.isdecimal():
        return None

    total_minutes = int(hours) * 60 + int(minutes)
    milliseconds = fraction[:3].ljust(3, '0')
    return f"{total_minutes:02d}:{int(whole_seconds):02d}.{milliseconds}"

# Rewrite every 'Lap Time' value through clean_lap_time; values it cannot
# parse come back as None.
df['Lap Time'] = df['Lap Time'].apply(clean_lap_time)

# Discard the laps whose time could not be reformatted.
df = df.dropna(axis=0, how='any', subset=['Lap Time'])
print(f"Time formatted to minute-based. Shape: {df.shape}")

Time formatted to minute-based. Shape: (1339, 6)


In [48]:
def convert_to_seconds(time_str):
    """Return the total number of seconds in a 'minutes:seconds' string (e.g., '01:19.68')."""
    # Exactly one ':' is expected; anything else fails the unpacking below.
    minute_part, second_part = time_str.split(':')
    seconds_from_minutes = 60 * float(minute_part)
    return seconds_from_minutes + float(second_part)

# Numeric helper column so lap times can be compared as floats.
df['lap_seconds'] = df['Lap Time'].apply(convert_to_seconds)

# Index label of the row holding each driver's smallest lap_seconds value.
fastest_laps = df.groupby('Driver')['lap_seconds'].idxmin()

# Keep only those rows and discard the helper column in the same step.
df = df.loc[fastest_laps].drop(columns=['lap_seconds'])
print(f"Fastest lap per driver retained. Final shape: {df.shape}")

Fastest lap per driver retained. Final shape: (20, 6)


In [49]:
# Write the remaining rows to disk, leaving the DataFrame index out of the file.
output_file = "cleaned_lap_times_2025_round_15.csv"
df.to_csv(path_or_buf=output_file, index=False)
print(f"Cleaned dataset saved as: {output_file}")

Cleaned dataset saved as: cleaned_lap_times_2025_round_15.csv
