In [29]:
import numpy as np
import pandas as pd


In [30]:
# Load the raw pit-stop records from a CSV file in the working directory.
pit_stops = pd.read_csv("pit_stops.csv")

In [31]:
# Preview the first rows to sanity-check the columns that were loaded.
pit_stops.head()

Unnamed: 0,raceId,driverId,stop,lap,time,duration,milliseconds
0,841,153,1,1,17:05:23,26.898,26898
1,841,30,1,1,17:05:52,25.021,25021
2,841,17,1,11,17:20:48,23.426,23426
3,841,4,1,12,17:22:34,23.251,23251
4,841,13,1,13,17:24:10,23.842,23842


In [32]:
# (rows, columns) of the frame as loaded.
pit_stops.shape

(11371, 7)

In [33]:
# Inspect column dtypes; `time` and `duration` come back as object (text)
# and therefore need explicit conversion before numeric comparison.
pit_stops.dtypes

raceId           int64
driverId         int64
stop             int64
lap              int64
time            object
duration        object
milliseconds     int64
dtype: object

Converting `time` to datetime and extracting hour, minute and second columns to make comparisons easier (`duration` is converted separately below)

In [34]:
# Parse the 'HH:MM:SS' clock strings once, then fan the parsed values out
# into one integer column per component.
time_series = pit_stops['time']
datetime_converted = pd.to_datetime(time_series, format='%H:%M:%S')

_clock = datetime_converted.dt
for _column, _values in (
    ('time_hours', _clock.hour),
    ('time_mins', _clock.minute),
    ('time_secs', _clock.second),
):
    pit_stops[_column] = _values

pit_stops

Unnamed: 0,raceId,driverId,stop,lap,time,duration,milliseconds,time_hours,time_mins,time_secs
0,841,153,1,1,17:05:23,26.898,26898,17,5,23
1,841,30,1,1,17:05:52,25.021,25021,17,5,52
2,841,17,1,11,17:20:48,23.426,23426,17,20,48
3,841,4,1,12,17:22:34,23.251,23251,17,22,34
4,841,13,1,13,17:24:10,23.842,23842,17,24,10
...,...,...,...,...,...,...,...,...,...,...
11366,1144,840,2,32,17:52:48,22.053,22053,17,52,48
11367,1144,1,1,34,17:55:17,21.694,21694,17,55,17
11368,1144,4,2,37,18:00:10,22.437,22437,18,0,10
11369,1144,855,2,39,18:03:21,28.765,28765,18,3,21


Converting `duration` to numeric seconds

In [35]:
def convert_duration_to_seconds(value):
    """Convert a pit-stop duration to a float number of seconds.

    Accepted forms (surrounding whitespace is ignored):

    * plain seconds, e.g. ``'23.426'``
    * ``MM:SS.mmm``, e.g. ``'16:44.718'``
    * ``H:MM:SS.mmm``, e.g. ``'1:02:03.500'``

    Parameters
    ----------
    value : object
        Raw duration; it is passed through ``str()`` before parsing.

    Returns
    -------
    float or None
        Total seconds, or ``None`` when the text cannot be parsed.
    """
    value = str(value).strip()

    try:
        if ':' in value:
            # pd.to_timedelta wants a full H:MM:SS string. Pad the hours field
            # only when it is missing: padding unconditionally would turn an
            # 'H:MM:SS' value into an invalid four-field string, which would
            # then be silently discarded as None.
            if value.count(':') == 1:
                value = '00:' + value
            return pd.to_timedelta(value).total_seconds()
        # already seconds (2.345)
        return float(value)
    except (ValueError, OverflowError):
        # Unparseable text (for example a missing-value marker) maps to None
        # so one bad row does not abort the whole column conversion.
        return None

# Parse every raw duration into total seconds.
pit_stops['duration_seconds'] = pit_stops['duration'].apply(convert_duration_to_seconds)

# Break the total into whole minutes and the seconds left over.
_whole_minutes, _leftover_seconds = divmod(pit_stops['duration_seconds'], 60)
pit_stops['duration_minutes'] = _whole_minutes
pit_stops['duration_seconds_remainder'] = _leftover_seconds

pit_stops

Unnamed: 0,raceId,driverId,stop,lap,time,duration,milliseconds,time_hours,time_mins,time_secs,duration_seconds,duration_minutes,duration_seconds_remainder
0,841,153,1,1,17:05:23,26.898,26898,17,5,23,26.898,0.0,26.898
1,841,30,1,1,17:05:52,25.021,25021,17,5,52,25.021,0.0,25.021
2,841,17,1,11,17:20:48,23.426,23426,17,20,48,23.426,0.0,23.426
3,841,4,1,12,17:22:34,23.251,23251,17,22,34,23.251,0.0,23.251
4,841,13,1,13,17:24:10,23.842,23842,17,24,10,23.842,0.0,23.842
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11366,1144,840,2,32,17:52:48,22.053,22053,17,52,48,22.053,0.0,22.053
11367,1144,1,1,34,17:55:17,21.694,21694,17,55,17,21.694,0.0,21.694
11368,1144,4,2,37,18:00:10,22.437,22437,18,0,10,22.437,0.0,22.437
11369,1144,855,2,39,18:03:21,28.765,28765,18,3,21,28.765,0.0,28.765


Detecting outliers in pit-stop duration using the 1.5 × IQR rule

In [36]:
# Tukey fences: anything further than 1.5 * IQR outside the quartiles is
# treated as an outlier.
duration_s = pit_stops['duration_seconds']

Q1, Q3 = duration_s.quantile(0.25), duration_s.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Two explicit comparisons (rather than a negated range test) so that rows
# with a missing duration are not flagged.
outlier_mask = duration_s.lt(lower_bound) | duration_s.gt(upper_bound)
outliers = pit_stops[outlier_mask]

print(f"Number of outliers: {len(outliers)}")
display(outliers[['raceId', 'driverId', 'duration', 'duration_seconds']])


Number of outliers: 990


Unnamed: 0,raceId,driverId,duration,duration_seconds
24,841,22,37.856,37.856
102,842,24,38.823,38.823
105,843,24,14.160,14.160
154,843,10,33.827,33.827
160,843,815,13.900,13.900
...,...,...,...,...
11336,1143,807,37.692,37.692
11339,1143,859,34.655,34.655
11340,1143,846,36.069,36.069
11342,1143,848,39.687,39.687


In [37]:
# Treat the literal text \N as a missing value, then count the missing
# entries in each column.
pit_stops = pit_stops.replace(to_replace=r'\N', value=np.nan)
null_vals = pit_stops.isna().sum()
print(null_vals)

raceId                        0
driverId                      0
stop                          0
lap                           0
time                          0
duration                      0
milliseconds                  0
time_hours                    0
time_mins                     0
time_secs                     0
duration_seconds              0
duration_minutes              0
duration_seconds_remainder    0
dtype: int64


In [38]:
# Boolean mask marking rows that repeat an earlier row in every column;
# the per-column count of the selected rows is zero when nothing repeats.
duplicates = pit_stops.duplicated()
pit_stops.loc[duplicates].count()

raceId                        0
driverId                      0
stop                          0
lap                           0
time                          0
duration                      0
milliseconds                  0
time_hours                    0
time_mins                     0
time_secs                     0
duration_seconds              0
duration_minutes              0
duration_seconds_remainder    0
dtype: int64

In [39]:
# How many distinct values does the `milliseconds` column hold?
millisecond_values = pit_stops['milliseconds']
unique_cat = millisecond_values.nunique()
print(f"number of unique values in this column: {unique_cat}")


number of unique values in this column: 7604


In [40]:
# Drop the `milliseconds` column from the working frame.
# DataFrame.drop(..., inplace=True) returns None, so assigning its result left
# `final_df` as None. Dropping out-of-place and rebinding keeps both names
# pointing at a real frame. errors='ignore' lets this cell be re-run after the
# column is already gone instead of raising KeyError.
pit_stops = pit_stops.drop(columns=['milliseconds'], errors='ignore')
final_df = pit_stops
pit_stops.head()


Unnamed: 0,raceId,driverId,stop,lap,time,duration,time_hours,time_mins,time_secs,duration_seconds,duration_minutes,duration_seconds_remainder
0,841,153,1,1,17:05:23,26.898,17,5,23,26.898,0.0,26.898
1,841,30,1,1,17:05:52,25.021,17,5,52,25.021,0.0,25.021
2,841,17,1,11,17:20:48,23.426,17,20,48,23.426,0.0,23.426
3,841,4,1,12,17:22:34,23.251,17,22,34,23.251,0.0,23.251
4,841,13,1,13,17:24:10,23.842,17,24,10,23.842,0.0,23.842


In [41]:
# Keep only rows whose raceId is 989 or higher.
# NOTE(review): 989 is presumably the first race of the period of interest —
# TODO confirm and move it to a named constant.
pit_stops = pit_stops.loc[pit_stops['raceId'].ge(989)]
pit_stops

Unnamed: 0,raceId,driverId,stop,lap,time,duration,time_hours,time_mins,time_secs,duration_seconds,duration_minutes,duration_seconds_remainder
6251,989,843,1,1,16:15:04,22.213,16,15,4,22.213,0.0,22.213
6252,989,8,1,18,16:40:07,21.421,16,40,7,21.421,0.0,21.421
6253,989,1,1,19,16:41:30,21.821,16,41,30,21.821,0.0,21.821
6254,989,844,1,20,16:44:03,22.242,16,44,3,22.242,0.0,22.242
6255,989,830,1,21,16:45:06,20.953,16,45,6,20.953,0.0,20.953
...,...,...,...,...,...,...,...,...,...,...,...,...
11366,1144,840,2,32,17:52:48,22.053,17,52,48,22.053,0.0,22.053
11367,1144,1,1,34,17:55:17,21.694,17,55,17,21.694,0.0,21.694
11368,1144,4,2,37,18:00:10,22.437,18,0,10,22.437,0.0,22.437
11369,1144,855,2,39,18:03:21,28.765,18,3,21,28.765,0.0,28.765


In [42]:
# Write the cleaned, filtered frame to disk; index=False leaves the row index
# out of the file.
pit_stops.to_csv("pit_stops_cleaned.csv", index=False)
