In [1]:
import tabula as tb
import pandas as pd
import numpy as np
import re

## Race

In [31]:
df = tb.read_pdf("../Data/Race/2013-AUS-RAC.pdf", area = (120, 0, 500, 222), columns=[72, 78, 90, 110], pages = "1")[0]
df

Got stderr: Feb 21, 2023 2:29:56 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Feb 21, 2023 2:29:56 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Feb 21, 2023 2:29:56 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Feb 21, 2023 2:29:56 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>



Unnamed: 0.1,Pos,Unnamed: 0,Unnamed: 1,Unnamed: 2,Rider N
0,1,,25,99,Jorge LORENZO
1,2,,20,26,Dani PEDROSA
2,3,,16,46,Valentino ROSSI
3,4,,13,35,Cal CRUTCHLOW
4,5,,11,19,Alvaro BAUTISTA
5,6,,10,38,Bradley SMITH
6,7,,9,69,Nicky HAYDEN
7,8,,8,29,Andrea IANNONE
8,9,,7,4,Andrea DOVIZIOSO
9,10,,6,14,Randy DE PUNIET


In [32]:
# remove all instances of 'Pos' that are not an integer
df2 = df[pd.to_numeric(df['Pos'], errors='coerce').notnull()]
df2.tail()

Unnamed: 0.1,Pos,Unnamed: 0,Unnamed: 1,Unnamed: 2,Rider N
16,17,,,71,Claudio CORTI
17,18,,,70,Michael LAVERTY
18,19,,,52,Lukas PESEK
19,20,,,7,Hiroshi AOYAMA
20,21,,,50,Damian CUDLIN


In [34]:
# remove unnecessary columns (which are the 2nd and 3rd columns)
race = df2[['Pos', 'Unnamed: 2', 'Rider N']]
race.rename(columns={'Unnamed: 2': 'Number', 'Rider N': 'Name'}, inplace=True)
race.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  race.rename(columns={'Unnamed: 2': 'Number', 'Rider N': 'Name'}, inplace=True)


Unnamed: 0,Pos,Number,Name
0,1,99,Jorge LORENZO
1,2,26,Dani PEDROSA
2,3,46,Valentino ROSSI
3,4,35,Cal CRUTCHLOW
4,5,19,Alvaro BAUTISTA


## Free Practice

In [5]:
filename = "../Data/FP/2013-AUS-FP4.pdf"

In [38]:
# getting the data for the free practice pdf is problematic because the table is split into two sides in one page
dfl = tb.read_pdf(filename, area = (20, 0, 730, 133), columns = [79], pandas_options = {'header': False}, pages = 'all')  # left side
dfr = tb.read_pdf(filename, area = (20, 318, 730, 399), columns = [340], pages = 'all')  # right side

dfl[0].head(20)

Got stderr: Feb 21, 2023 2:31:23 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Feb 21, 2023 2:31:23 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Feb 21, 2023 2:31:23 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Feb 21, 2023 2:31:23 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>

Got stderr: Feb 21, 2023 2:31:25 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Feb 21, 2023 2:31:25 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Feb 21, 2023 2:31:25 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Feb 21, 2023 2:31:25 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>



Unnamed: 0,P,hillip Island
0,,4448 m.
1,P Cro,ssing the finis
2,Lap,Lap Time
3,1st,26 Da
4,1,2'10.732
5,2,1'32.029
6,3,1'30.337
7,4,1'29.577
8,5,1'29.615
9,6,1'32.348


In [39]:
# next step is to combine all those data together, from a list of dataframes to one huge dataframe
# the combination goes like this: on the first page, we take the data from the left side first, then to the right side, then to the next page
# therefore, [dfl[0], dfr[0], dfl[1], dfr[1], dfl[2], ...]
combine_df = []
for i in range(len(dfl)):
    combine_df.append(dfl[i])
    combine_df.append(dfr[i])
df = pd.DataFrame(np.concatenate(combine_df, axis=0), columns=dfl[0].columns)
df

Unnamed: 0,P,hillip Island
0,,4448 m.
1,P Cro,ssing the finis
2,Lap,Lap Time
3,1st,26 Da
4,1,2'10.732
...,...,...
342,9,1'39.498
343,10,1'38.057
344,11,1'44.020
345,12,1'34.628


In [40]:
# next step is to remove unnecessary values and clean the data
df2 = df.copy()
df2.rename(columns={df2.columns[0]: 'Lap Number', df2.columns[1]: 'Lap Time'}, inplace=True)

# remove instances that doesn't contain any digit from 'Lap Number'
df2['Lap Number'].replace('^([^0-9]*)$', '', regex=True, inplace=True) 
df2['Lap Number'].replace('', np.nan, inplace=True)  # dunno if this two-step replacement is necessary, lazy to check

df2.dropna(inplace=True)
df2.reset_index(drop=True, inplace=True)

#[^\'.\d]
#\d{0,2}'\d\d\.\d\d\d.*
df2

Unnamed: 0,Lap Number,Lap Time
0,1st,26 Da
1,1,2'10.732
2,2,1'32.029
3,3,1'30.337
4,4,1'29.577
...,...,...
321,8,1'52.739
322,9,1'39.498
323,10,1'38.057
324,11,1'44.020


In [41]:
# get rider number for identification purposes
_temp = pd.to_numeric(df2['Lap Number'], errors='coerce')
riders = df2[_temp.isna()].copy()

riders['Lap Time'].replace('\D+', '', regex=True, inplace=True) 
riders['Lap Time'].replace('', np.nan, inplace=True)
riders.dropna(inplace=True)

riders.head()
#df2['Lap Number'].str.findall(r'\d{0,2}[a-z][a-z]')

Unnamed: 0,Lap Number,Lap Time
0,1st,26
16,2nd,99
29,3rd,93
43,4th,46
61,5th,35


In [42]:
rider_number = riders['Lap Time']  # this will be the rider's identity, make this as a column

In [43]:
# now we need to get all the lap times for each rider, this is done by iterating
# through df2['Lap Time'] and using rider_index as a way to separate which lap time belongs to which rider
rider_index = riders.index
laps = df2['Lap Time']

k = 0  # this is a pointer to the index of rider_index
"""
[0, 17, 33, 48, 68, 84, 101, 117, 134, 154, 171, 188, 206, 222, 240, 260, 279, 293, 310, 327, 346, 364, 380, 394]
for example, when i reaches 17, k points to the 1st index,
and when i reaches 33, k points to the 2nd index, etc
"""
laps_list = [[] for i in range(len(rider_index))]
for i in range(1, len(laps)):  # we skip i=0 because we know i=0 doesn't contain any laptime
    if i in rider_index:
        k += 1
    elif re.search(r"\d{0,2}'\d\d\.\d\d\d.*", laps[i]):
        laps_list[k].append(laps[i])

In [44]:
df3 = pd.DataFrame(laps_list).transpose()
df3.columns = rider_number
# obviously, the first lap never counts, so we can easily remove them
df3.drop(0, inplace=True)
df3.head()

Lap Time,26,99,93,46,35,38,69,19,41,5,...,71,7,68,9,70,23,8,67,52,50
1,1'32.029,1'29.191,1'32.285,1'31.822,1'33.353,1'33.160,1'32.612,1'32.278,1'31.485,1'35.868,...,1'32.242,1'32.589,1'31.688,1'32.041,1'33.459,1'33.079,1'33.130,1'32.690,1'33.257,1'38.502
2,1'30.337,1'29.122,1'30.540,1'30.030,1'30.433,1'31.225,1'30.859,1'30.692,1'31.092,1'31.828,...,1'31.493,1'30.991,1'31.827,1'31.390,1'32.120,1'31.612,5'43.879 P,1'32.150,1'35.082,1'35.091
3,1'29.577,1'29.478,5'36.658 P,1'29.661,1'41.558,1'30.812,1'34.949,1'30.306,6'43.880 P,1'30.937,...,1'31.608,1'31.728,1'31.349,1'32.614,1'31.954,1'32.272,1'44.336,1'32.312,1'33.755,1'34.297
4,1'29.615,1'29.269,1'40.382,1'29.845,1'29.981,1'30.532,1'30.452,1'30.586,1'41.689,1'32.056,...,1'56.769,1'31.134,7'54.042 P,1'31.351,1'42.107,7'50.323 P,1'32.216,1'32.500,1'32.981,1'34.211
5,1'32.348,14'20.999 P,1'29.448,1'29.907,1'30.185,1'30.812,1'30.645,1'30.760,1'32.244,1'30.836,...,1'30.974,1'31.270,,5'42.856 P,1'31.741,1'38.939,1'55.004,1'32.355,7'30.468 P,1'34.009


In [45]:
# these are functions to convert laptime format
def lap_to_sec(lap):
    # converts laptime format: from --'---.--- string type to seconds float type
    minsec = lap.split("'")
    sec = round(int(minsec[0]) * 60 + float(minsec[1]), 3)
    return sec

def sec_to_lap(sec):
    # converts laptime format: from seconds float type to --'---.--- string type
    min = 0
    while sec >= 60:
        sec -= 60
        min += 1
    sec = format(round(sec, 3), ".3f")
    lap = str(min) + "'" + str(sec).zfill(6)
    return lap

In [46]:
# next, we remove laptimes that are the first laps out of pits, 
# the laps where the rider enters the pit (marked with 'P'),
# and the invalid laps (marked with '*').
row_len = df3.shape[0]
col_len = df3.shape[1]

pit = False
for i in range(col_len):
    for j in range(row_len):
        #.iat accesses dataframe by [row, column]
        laptime = df3.iat[j, i]

        # invalid laptimes converted to None
        if laptime == None:
            break
        if "P" in laptime:
            pit = True
            df3.iat[j, i] = None
        elif pit:
            df3.iat[j, i] = None
            pit = False
        elif "*" in laptime:
            df3.iat[j, i] = None

        # valid laptimes converted to float
        else:
            df3.iat[j, i] = lap_to_sec(laptime)
df3.head()

Lap Time,26,99,93,46,35,38,69,19,41,5,...,71,7,68,9,70,23,8,67,52,50
1,92.029,89.191,92.285,91.822,93.353,93.16,92.612,92.278,91.485,95.868,...,92.242,92.589,91.688,,93.459,93.079,93.13,92.69,93.257,98.502
2,90.337,89.122,90.54,90.03,90.433,91.225,90.859,90.692,91.092,91.828,...,91.493,90.991,91.827,91.39,92.12,91.612,,92.15,95.082,95.091
3,89.577,89.478,,89.661,101.558,90.812,94.949,90.306,,90.937,...,91.608,91.728,91.349,92.614,91.954,92.272,,92.312,93.755,94.297
4,89.615,89.269,,89.845,89.981,90.532,90.452,90.586,,92.056,...,116.769,91.134,,91.351,102.107,,92.216,92.5,92.981,94.211
5,92.348,,89.448,89.907,90.185,90.812,90.645,90.76,92.244,90.836,...,90.974,91.27,,,91.741,,115.004,92.355,,94.009


### Laptime outlier detection
Unlike races where riders push from start to finish, riders sometimes slow down in free practice, causing them to record laps much slower than what they're capable of. This is obviously not reflective of what they do in races, so we need to remove the slow laps. However, the laptimes in free practice are not normally distributed (since riders slow down, skewing the average laptime much more to the slower side), so we can't use the usual 1.5 * IQR approximation.

Naturally, riders might slow down towards the end of the session due to tyre degradation -- it is a race simulation practice after all -- and sometimes riders make slight mistakes when they're pushing. So, taking all external factors into account, I will use a 102% threshold to detect outliers. Any laptime that is above 102% of the rider's best time will be removed.

In [50]:
minimum_lap = df3.min(skipna=True)
threshold = minimum_lap * 1.02
df4 = df3[df3 <= threshold]  # remove all values above the threshold
df4.head()

Lap Time,26,99,93,46,35,38,69,19,41,5,...,71,7,68,9,70,23,8,67,52,50
1,,89.191,,,,,,,91.485,,...,92.242,92.589,91.688,,,93.079,93.13,92.69,93.257,
2,90.337,89.122,90.54,90.03,90.433,91.225,90.859,90.692,91.092,91.828,...,91.493,90.991,91.827,91.39,92.12,91.612,,92.15,,95.091
3,89.577,89.478,,89.661,,90.812,,90.306,,90.937,...,91.608,91.728,91.349,92.614,91.954,92.272,,92.312,93.755,94.297
4,89.615,89.269,,89.845,89.981,90.532,90.452,90.586,,92.056,...,,91.134,,91.351,,,92.216,92.5,92.981,94.211
5,,,89.448,89.907,90.185,90.812,90.645,90.76,,90.836,...,90.974,91.27,,,91.741,,,92.355,,94.009


### Final Step
Find average laptime for each rider, then sort from fastest to slowest

In [84]:
df5 = df4.mean().sort_values().reset_index()
df5.head()

Unnamed: 0,Lap Time,0
0,99,89.334625
1,26,89.515556
2,93,89.768
3,46,89.8279
4,35,90.1439


### Merge free practice with race data
Note: there is a high chance that crash(es) occur(s) during the race, leaving the number of riders finishing the race lesser than the number of riders completing the free practice. For the sake of simplicity, we'll just remove them from the data.

In [119]:
# remove DNF and DNS riders
df6 = df5[df5["Lap Time"].isin(race["Number"])]    # turn them to NaN
fp = df6[df6["Lap Time"].notna()].reset_index()    # remove NaN

final = pd.DataFrame({"fp": fp["Lap Time"], "race": race["Number"]})
final

Unnamed: 0,fp,race
0,99,99
1,26,26
2,46,46
3,35,35
4,19,19
5,38,38
6,69,69
7,14,29
8,41,4
9,29,14
