---
title: "Exploratory Data Analysis"
---

In [6]:
## import necessary libraries 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import numpy as np

In [7]:
## read dataset needed for this project
## reason the following datasets been choosen is that the main purpose of this project is to find the relationship between pit stop time and 
## drivers with constructors. Therefore, dataset like qualifying and sprint race are deserted since they have no obvious relation with the 
## purpose of this project.
drivers = pd.read_csv('Data/drivers.csv')
pitStops = pd.read_csv('Data/pit_stops.csv')
constructors = pd.read_csv('Data/constructors.csv')
races = pd.read_csv('Data/races.csv')
circuits = pd.read_csv('Data/circuits.csv')
results = pd.read_csv('Data/results.csv')

# Check the drivers dataset:

In [8]:
drivers.head()

Unnamed: 0,driverId,driverRef,number,code,forename,surname,dob,nationality,url
0,1,hamilton,44,HAM,Lewis,Hamilton,1985-01-07,British,http://en.wikipedia.org/wiki/Lewis_Hamilton
1,2,heidfeld,\N,HEI,Nick,Heidfeld,1977-05-10,German,http://en.wikipedia.org/wiki/Nick_Heidfeld
2,3,rosberg,6,ROS,Nico,Rosberg,1985-06-27,German,http://en.wikipedia.org/wiki/Nico_Rosberg
3,4,alonso,14,ALO,Fernando,Alonso,1981-07-29,Spanish,http://en.wikipedia.org/wiki/Fernando_Alonso
4,5,kovalainen,\N,KOV,Heikki,Kovalainen,1981-10-19,Finnish,http://en.wikipedia.org/wiki/Heikki_Kovalainen


In [9]:
## check how many \N are there in the column number since not all drivers had a number in the race
num_null = (drivers['number'] == r'\N').sum()
num_null

803

In [10]:
drivers['driverName'] = drivers['forename'] + ' ' + drivers['surname']
drivers = drivers.rename(columns = {'nationality' : 'driverNationality'})

In [11]:
## we can drop unrelated variables since they obviously will not influence the data analysis
drivers = drivers.drop(labels = ['driverRef', 'number', 'code','forename', 'surname', 'dob', 'url'], axis = 1)

# Check the pitStops dataset:

In [12]:
pitStops.head()

Unnamed: 0,raceId,driverId,stop,lap,time,duration,milliseconds
0,841,153,1,1,17:05:23,26.898,26898
1,841,30,1,1,17:05:52,25.021,25021
2,841,17,1,11,17:20:48,23.426,23426
3,841,4,1,12,17:22:34,23.251,23251
4,841,13,1,13,17:24:10,23.842,23842


In [13]:
## here we can get the statistical information about the pitStops dataset
pitStops.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
raceId,10089.0,962.774011,81.144375,841.0,888.0,958.0,1035.0,1110.0
driverId,10089.0,523.99891,389.698555,1.0,18.0,815.0,831.0,858.0
stop,10089.0,1.759738,0.916282,1.0,1.0,2.0,2.0,6.0
lap,10089.0,25.312023,14.729775,1.0,13.0,25.0,36.0,78.0
milliseconds,10089.0,75348.633363,278858.845817,12897.0,21914.0,23570.0,26202.0,3069017.0


In [14]:
pitStops = pitStops.rename(columns = {'time' : 'pitTime'})
pitStops['seconds'] = pitStops['milliseconds'].apply(lambda x: x/1000)

In [15]:
pitStops = pitStops.drop(labels = ['milliseconds'], axis = 1)

# Checking the constructors dataset:

In [16]:
constructors.head()

Unnamed: 0,constructorId,constructorRef,name,nationality,url
0,1,mclaren,McLaren,British,http://en.wikipedia.org/wiki/McLaren
1,2,bmw_sauber,BMW Sauber,German,http://en.wikipedia.org/wiki/BMW_Sauber
2,3,williams,Williams,British,http://en.wikipedia.org/wiki/Williams_Grand_Pr...
3,4,renault,Renault,French,http://en.wikipedia.org/wiki/Renault_in_Formul...
4,5,toro_rosso,Toro Rosso,Italian,http://en.wikipedia.org/wiki/Scuderia_Toro_Rosso


In [17]:
## we also need to drop an unnecessary column here which is the url column
constructors = constructors.drop(labels = ['url'], axis = 1)

In [18]:
constructors = constructors.rename(columns={'name' : 'constructorName', 'nationality' : 'constructorNationality'})

# Checking the races dataset:

In [19]:
races.head()

Unnamed: 0,raceId,year,round,circuitId,name,date,time,url,fp1_date,fp1_time,fp2_date,fp2_time,fp3_date,fp3_time,quali_date,quali_time,sprint_date,sprint_time
0,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,http://en.wikipedia.org/wiki/2009_Australian_G...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
1,2,2009,2,2,Malaysian Grand Prix,2009-04-05,09:00:00,http://en.wikipedia.org/wiki/2009_Malaysian_Gr...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
2,3,2009,3,17,Chinese Grand Prix,2009-04-19,07:00:00,http://en.wikipedia.org/wiki/2009_Chinese_Gran...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
3,4,2009,4,3,Bahrain Grand Prix,2009-04-26,12:00:00,http://en.wikipedia.org/wiki/2009_Bahrain_Gran...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N
4,5,2009,5,4,Spanish Grand Prix,2009-05-10,12:00:00,http://en.wikipedia.org/wiki/2009_Spanish_Gran...,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N


In [20]:
## check the ratio of \N in the fp and quali columns since we can observe huge amount of \N  
valueCheck = '\\N'
total = len(races)
value = races[races[['fp1_date', 'fp1_time', 'fp2_date', 'fp2_time', 'fp3_date', 'fp3_time', 'quali_date', 'quali_time', 'sprint_date', 'sprint_time']].isin([valueCheck]).any(axis=1)].shape[0]
ratio = value / total
print('Ratio is:', ratio)

Ratio is: 1.0


In [21]:
# since the ratio of \N in those columns is 1, therefore, we can drop those columns snice they will not affect the analysis
races = races.drop(labels = ['fp1_date', 'fp1_time', 'fp2_date', 'fp2_time', 'fp3_date', 'fp3_time', 'quali_date', 'quali_time', 'sprint_date', 'sprint_time', 'url'], axis = 1)

In [22]:
races = races.rename(columns = {'name' : 'raceName'})

# Check the circuits dataset

In [23]:
circuits.head()

Unnamed: 0,circuitId,circuitRef,name,location,country,lat,lng,alt,url
0,1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,http://en.wikipedia.org/wiki/Melbourne_Grand_P...
1,2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,http://en.wikipedia.org/wiki/Sepang_Internatio...
2,3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,http://en.wikipedia.org/wiki/Bahrain_Internati...
3,4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,http://en.wikipedia.org/wiki/Circuit_de_Barcel...
4,5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,http://en.wikipedia.org/wiki/Istanbul_Park


In [24]:
circuits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   circuitId   77 non-null     int64  
 1   circuitRef  77 non-null     object 
 2   name        77 non-null     object 
 3   location    77 non-null     object 
 4   country     77 non-null     object 
 5   lat         77 non-null     float64
 6   lng         77 non-null     float64
 7   alt         77 non-null     object 
 8   url         77 non-null     object 
dtypes: float64(2), int64(1), object(6)
memory usage: 5.5+ KB


In [25]:
## we still gonna drop the column url since it will not affect our analysis
circuits = circuits.drop(labels = ['url'], axis = 1)

In [26]:
circuits = circuits.rename(columns = {'name' : 'circuitName', 'location' : 'circuitLocation', 'country' : 'circuitCountry'})

In [27]:
## infomation provided by the circuits dataset that can be used in the analysis is the circuitid and name which we will compare the relationship 
## between pit stop time and each circuit.
## however, we can see that the dataset also provided us with the latitude, longtitude, and altitude of each circuits, we can plot the circuits map
## to make people who does not familiar with F1 race have a better understanding of it.
import folium
coordinates = []
for lat, lng in zip(circuits['lat'], circuits['lng']):
    coordinates.append([lat, lng])
maps = folium.Map(zoom_start = 2, title = 'Stamen Watercolor')
for i, j in zip(coordinates, circuits.circuitName):
    marker = folium.Marker(
        location = i,
        icon = folium.Icon(icon = "car", color = 'green', prefix = 'fa'),
        popup = "<strong>{0}</strong>".format(j))
    marker.add_to(maps)
maps

# Check the results dataset

In [28]:
results.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,1,18,1,1,22,1,1,1,1,10.0,58,1:34:50.616,5690616,39,2,1:27.452,218.3,1
1,2,18,2,2,3,5,2,2,2,8.0,58,+5.478,5696094,41,3,1:27.739,217.586,1
2,3,18,3,3,7,7,3,3,3,6.0,58,+8.163,5698779,41,5,1:28.090,216.719,1
3,4,18,4,4,5,11,4,4,4,5.0,58,+17.181,5707797,58,7,1:28.603,215.464,1
4,5,18,5,1,23,3,5,5,5,4.0,58,+18.014,5708630,43,1,1:27.418,218.385,1


In [29]:
results.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
resultId,26080.0,13041.372661,7530.008377,1.0,6520.75,13040.5,19560.25,26085.0
raceId,26080.0,536.695667,303.034639,1.0,294.75,519.0,791.0,1110.0
driverId,26080.0,266.277569,272.581622,1.0,57.0,163.0,364.0,858.0
constructorId,26080.0,49.059663,60.221056,1.0,6.0,25.0,58.25,214.0
grid,26080.0,11.167561,7.232797,0.0,5.0,11.0,17.0,34.0
positionOrder,26080.0,12.854141,7.700068,1.0,6.0,12.0,18.0,39.0
points,26080.0,1.906635,4.219715,0.0,0.0,0.0,2.0,50.0
laps,26080.0,46.076687,29.726058,0.0,22.0,53.0,66.0,200.0
statusId,26080.0,17.476074,26.129965,1.0,1.0,10.0,14.0,141.0


In [30]:
results = results.drop(labels = ['milliseconds'], axis = 1)

In [31]:
results['position'] = results['position'].replace(r'\\N', np.nan, regex = True)
results = results.dropna(subset = ['position'])

In [32]:
def convert_to_seconds(time_str):
    try:
        minutes, rest = time_str.split(":")
        seconds, milliseconds = rest.split(".")
        total_seconds = int(minutes)*60 + int(seconds) + int(milliseconds)/1000
        return total_seconds
    except ValueError:
        return None

# Apply the function to the column
results['fastestLapTime_seconds'] = results['fastestLapTime'].apply(convert_to_seconds)
results = results.dropna(subset = ['fastestLapSpeed'])
results

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId,fastestLapTime_seconds
0,1,18,1,1,22,1,1,1,1,10.0,58,1:34:50.616,39,2,1:27.452,218.300,1,87.452
1,2,18,2,2,3,5,2,2,2,8.0,58,+5.478,41,3,1:27.739,217.586,1,87.739
2,3,18,3,3,7,7,3,3,3,6.0,58,+8.163,41,5,1:28.090,216.719,1,88.090
3,4,18,4,4,5,11,4,4,4,5.0,58,+17.181,58,7,1:28.603,215.464,1,88.603
4,5,18,5,1,23,3,5,5,5,4.0,58,+18.014,43,1,1:27.418,218.385,1,87.418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26073,26079,1110,848,3,23,15,14,14,14,0.0,44,+1:36.184,35,3,1:49.841,229.553,1,109.841
26074,26080,1110,825,210,20,16,15,15,15,0.0,44,+1:41.754,27,14,1:50.993,227.171,1,110.993
26075,26081,1110,817,213,3,19,16,16,16,0.0,44,+1:43.071,25,15,1:50.994,227.169,1,110.994
26076,26082,1110,858,3,2,18,17,17,17,0.0,44,+1:44.476,37,9,1:50.486,228.213,1,110.486


In [33]:
results = results.dropna(subset=['fastestLapTime_seconds'])

In [34]:
results

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId,fastestLapTime_seconds
0,1,18,1,1,22,1,1,1,1,10.0,58,1:34:50.616,39,2,1:27.452,218.300,1,87.452
1,2,18,2,2,3,5,2,2,2,8.0,58,+5.478,41,3,1:27.739,217.586,1,87.739
2,3,18,3,3,7,7,3,3,3,6.0,58,+8.163,41,5,1:28.090,216.719,1,88.090
3,4,18,4,4,5,11,4,4,4,5.0,58,+17.181,58,7,1:28.603,215.464,1,88.603
4,5,18,5,1,23,3,5,5,5,4.0,58,+18.014,43,1,1:27.418,218.385,1,87.418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26073,26079,1110,848,3,23,15,14,14,14,0.0,44,+1:36.184,35,3,1:49.841,229.553,1,109.841
26074,26080,1110,825,210,20,16,15,15,15,0.0,44,+1:41.754,27,14,1:50.993,227.171,1,110.993
26075,26081,1110,817,213,3,19,16,16,16,0.0,44,+1:43.071,25,15,1:50.994,227.169,1,110.994
26076,26082,1110,858,3,2,18,17,17,17,0.0,44,+1:44.476,37,9,1:50.486,228.213,1,110.486


In [35]:
results.to_csv('cleanedResults.csv', index = False)


# Correlation between dataset

In [36]:
## we will start our preliminary analysis by finding the correlation bewteen dataset
## the first step is to join certain dataset by their primary key to get a new result dataset
mergedResults = pd.merge(results, races, left_on = 'raceId', right_index = True, how = 'left')
mergedResults = pd.merge(mergedResults, circuits, left_on = 'circuitId', right_index = True, how = 'left')
mergedResults = pd.merge(mergedResults, constructors, left_on = 'constructorId', right_index = True, how = 'left')
mergedResults = pd.merge(mergedResults, drivers, left_on = 'driverId', right_index = True, how = 'left')
mergedResults

Unnamed: 0,resultId,raceId_x,driverId_x,constructorId_x,number,grid,position,positionText,positionOrder,points,...,lat,lng,alt,constructorId_y,constructorRef,constructorName,constructorNationality,driverId_y,driverNationality,driverName
0,1,18,1,1,22,1,1,1,1,10.0,...,26.0325,50.5106,7,2.0,bmw_sauber,BMW Sauber,German,2.0,German,Nick Heidfeld
1,2,18,2,2,3,5,2,2,2,8.0,...,26.0325,50.5106,7,3.0,williams,Williams,British,3.0,German,Nico Rosberg
2,3,18,3,3,7,7,3,3,3,6.0,...,26.0325,50.5106,7,4.0,renault,Renault,French,4.0,Spanish,Fernando Alonso
3,4,18,4,4,5,11,4,4,4,5.0,...,26.0325,50.5106,7,5.0,toro_rosso,Toro Rosso,Italian,5.0,Finnish,Heikki Kovalainen
4,5,18,5,1,23,3,5,5,5,4.0,...,26.0325,50.5106,7,2.0,bmw_sauber,BMW Sauber,German,6.0,Japanese,Kazuki Nakajima
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26073,26079,1110,848,3,23,15,14,14,14,0.0,...,,,,4.0,renault,Renault,French,850.0,Brazilian,Pietro Fittipaldi
26074,26080,1110,825,210,20,16,15,15,15,0.0,...,,,,214.0,alpine,Alpine F1 Team,French,826.0,Russian,Daniil Kvyat
26075,26081,1110,817,213,3,19,16,16,16,0.0,...,,,,,,,,818.0,French,Jean-Éric Vergne
26076,26082,1110,858,3,2,18,17,17,17,0.0,...,,,,4.0,renault,Renault,French,,,


In [37]:
mergedPitStops = pd.merge(pitStops,races, left_on = 'raceId', right_index = True, how = 'left')
mergedPitStops = pd.merge(mergedPitStops,circuits, left_on = 'circuitId', right_index = True, how = 'left')
mergedPitStops = pd.merge(mergedPitStops, mergedResults[['raceId_x', 'driverId_x', 'driverName', 'constructorId_x', 'constructorName']], left_on = ['raceId_x', 'driverId'], right_on = ['raceId_x', 'driverId_x'])
mergedPitStops

Unnamed: 0,raceId_x,driverId,stop,lap,pitTime,duration,seconds,raceId_y,year,round,...,circuitName,circuitLocation,circuitCountry,lat,lng,alt,driverId_x,driverName,constructorId_x,constructorName
0,841,153,1,1,17:05:23,26.898,26.898,843.0,2011.0,3.0,...,Autódromo José Carlos Pace,São Paulo,Brazil,-23.7036,-46.6997,785,153,Romain Grosjean,5,Ferrari
1,841,153,2,17,17:31:06,24.463,24.463,843.0,2011.0,3.0,...,Autódromo José Carlos Pace,São Paulo,Brazil,-23.7036,-46.6997,785,153,Romain Grosjean,5,Ferrari
2,841,153,3,35,17:59:45,26.348,26.348,843.0,2011.0,3.0,...,Autódromo José Carlos Pace,São Paulo,Brazil,-23.7036,-46.6997,785,153,Romain Grosjean,5,Ferrari
3,841,17,1,11,17:20:48,23.426,23.426,843.0,2011.0,3.0,...,Autódromo José Carlos Pace,São Paulo,Brazil,-23.7036,-46.6997,785,17,Jenson Button,9,Force India
4,841,17,2,26,17:44:29,22.520,22.520,843.0,2011.0,3.0,...,Autódromo José Carlos Pace,São Paulo,Brazil,-23.7036,-46.6997,785,17,Jenson Button,9,Force India
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9218,1110,830,1,14,15:30:04,22.887,22.887,,,,...,,,,,,,830,Felipe Nasr,9,Force India
9219,1110,830,2,30,16:00:16,23.012,23.012,,,,...,,,,,,,830,Felipe Nasr,9,Force India
9220,1110,840,1,20,15:42:12,25.397,25.397,,,,...,,,,,,,840,Antonio Giovinazzi,117,Moore
9221,1110,847,1,22,15:46:00,23.837,23.837,,,,...,,,,,,,847,Nicholas Latifi,131,HWM


In [38]:
raceResults = pd.merge(mergedResults, mergedPitStops.groupby(by = ['raceId_x', 'raceName', 'constructorName', 'driverId', 'driverName']).sum(), left_on = ['raceId_x', 'driverId_x'], right_on = ['raceId_x', 'driverId'], how = 'left')
raceResults

  raceResults = pd.merge(mergedResults, mergedPitStops.groupby(by = ['raceId_x', 'raceName', 'constructorName', 'driverId', 'driverName']).sum(), left_on = ['raceId_x', 'driverId_x'], right_on = ['raceId_x', 'driverId'], how = 'left')


Unnamed: 0,resultId,raceId_x,driverId_x_x,constructorId_x_x,number,grid,position,positionText,positionOrder,points,...,seconds,raceId_y_y,year_y,round_y,circuitId_x_y,circuitId_y_y,lat_y,lng_y,driverId_x_y,constructorId_x_y
0,1,18,1,1,22,1,1,1,1,10.0,...,,,,,,,,,,
1,2,18,2,2,3,5,2,2,2,8.0,...,,,,,,,,,,
2,3,18,3,3,7,7,3,3,3,6.0,...,,,,,,,,,,
3,4,18,4,4,5,11,4,4,4,5.0,...,,,,,,,,,,
4,5,18,5,1,23,3,5,5,5,4.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6433,26079,1110,848,3,23,15,14,14,14,0.0,...,,,,,,,,,,
6434,26080,1110,825,210,20,16,15,15,15,0.0,...,,,,,,,,,,
6435,26081,1110,817,213,3,19,16,16,16,0.0,...,,,,,,,,,,
6436,26082,1110,858,3,2,18,17,17,17,0.0,...,,,,,,,,,,


#### We have some variables naming issue that lots of variables are sharing the same variable name in different dataset when we try to merge datasets in the first time. Therefore, renaming variables in each dataset is necessary. The above code is actually the second time mergeing and since code of merging dataset are the same, only the second time merging process are displayed.

In [39]:
fig = px.line(mergedPitStops[mergedPitStops['seconds'] < 50].groupby(by = ['year', 'constructorName']).mean(numeric_only=True).reset_index(),
                 x = 'year',
                 y = 'seconds',
                 color = 'constructorName'
                )
fig.update_layout(
    title_text='Average Pit Stop Times by Constructor',
)
fig.show()

In [40]:
fig = px.box(mergedPitStops[mergedPitStops['seconds'] < 50].groupby(by = ['raceId_x', 'raceName', 'date', 'constructorName']).mean(numeric_only=True).reset_index().sort_values(by = 'seconds', ascending = True),
                 x = 'constructorName',
                 y = 'seconds',
                 color = 'constructorName'
                )
fig.update_layout(
    title_text='Pit Stop Durations by Constructor from 2011 to date',
)
fig.show()

# Outlier 

In [41]:
Q1 = mergedResults.quantile(0.25)
Q3 = mergedResults.quantile(0.25)
IQR = Q3 - Q1
mergedResults[~((mergedResults < (Q1 - 1.5*IQR))| (mergedResults > (Q3 + 1.5*IQR))).any(axis = 1)]
mergedResults






Automatic reindexing on DataFrame vs Series comparisons is deprecated and will raise ValueError in a future version. Do `left, right = left.align(right, axis=1, copy=False)` before e.g. `left == right`



Unnamed: 0,resultId,raceId_x,driverId_x,constructorId_x,number,grid,position,positionText,positionOrder,points,...,lat,lng,alt,constructorId_y,constructorRef,constructorName,constructorNationality,driverId_y,driverNationality,driverName
0,1,18,1,1,22,1,1,1,1,10.0,...,26.0325,50.5106,7,2.0,bmw_sauber,BMW Sauber,German,2.0,German,Nick Heidfeld
1,2,18,2,2,3,5,2,2,2,8.0,...,26.0325,50.5106,7,3.0,williams,Williams,British,3.0,German,Nico Rosberg
2,3,18,3,3,7,7,3,3,3,6.0,...,26.0325,50.5106,7,4.0,renault,Renault,French,4.0,Spanish,Fernando Alonso
3,4,18,4,4,5,11,4,4,4,5.0,...,26.0325,50.5106,7,5.0,toro_rosso,Toro Rosso,Italian,5.0,Finnish,Heikki Kovalainen
4,5,18,5,1,23,3,5,5,5,4.0,...,26.0325,50.5106,7,2.0,bmw_sauber,BMW Sauber,German,6.0,Japanese,Kazuki Nakajima
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26073,26079,1110,848,3,23,15,14,14,14,0.0,...,,,,4.0,renault,Renault,French,850.0,Brazilian,Pietro Fittipaldi
26074,26080,1110,825,210,20,16,15,15,15,0.0,...,,,,214.0,alpine,Alpine F1 Team,French,826.0,Russian,Daniil Kvyat
26075,26081,1110,817,213,3,19,16,16,16,0.0,...,,,,,,,,818.0,French,Jean-Éric Vergne
26076,26082,1110,858,3,2,18,17,17,17,0.0,...,,,,4.0,renault,Renault,French,,,


#### there are no obvious outlier detected since no rows was removed

# Hypothesis Refination

We car observe that there are some differences between every constructor's pit stop time, however, those differences are within the range of 1 to 2 seconds which is quite insignificant even in the formula 1 race since there are many uncertainty in the race and any of those uncertainty could cause error more than 2 seconds

Therefore, hypothesis of this project should divert to will the pit stop time differences have the effect of determine the winner of one race in the following project.