In [290]:
import tabula as tb
import pandas as pd
import numpy as np
import re

## Race

In [66]:
df = tb.read_pdf("../Data/Race/2014-FRA-RAC.pdf", area = (120, 0, 500, 222), columns=[72, 78, 90, 110], pages = "1")[0]
df

Got stderr: Feb 09, 2023 9:35:54 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Feb 09, 2023 9:35:54 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Feb 09, 2023 9:35:54 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Feb 09, 2023 9:35:54 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>



Unnamed: 0.1,Pos,Unnamed: 0,Unnamed: 1,Unnamed: 2,Rider N
0,1,,25,93,Marc MARQUEZ
1,2,,20,46,Valentino ROSSI
2,3,,16,19,Alvaro BAUTISTA
3,4,,13,44,Pol ESPARGARO
4,5,,11,26,Dani PEDROSA
5,6,,10,99,Jorge LORENZO
6,7,,9,6,Stefan BRADL
7,8,,8,4,Andrea DOVIZIOSO
8,9,,7,41,Aleix ESPARGARO
9,10,,6,38,Bradley SMITH


In [67]:
# remove all instances of 'Pos' that are not an integer
df2 = df[pd.to_numeric(df['Pos'], errors='coerce').notnull()]
df2.tail()

Unnamed: 0.1,Pos,Unnamed: 0,Unnamed: 1,Unnamed: 2,Rider N
14,15,,1.0,17,Karel ABRAHAM
15,16,,,70,Michael LAVERTY
16,17,,,5,Colin EDWARDS
17,18,,,23,Broc PARKES
18,19,,,63,Mike DI MEGLIO


In [68]:
# remove unnecessary columns (which are the 2nd and 3rd columns)
df3 = df2[['Pos', 'Unnamed: 2', 'Rider N']]
df3.rename(columns={'Unnamed: 2': 'Number', 'Rider N': 'Name'}, inplace=True)
df3.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3.rename(columns={'Unnamed: 2': 'Number', 'Rider N': 'Name'}, inplace=True)


Unnamed: 0,Pos,Number,Name
0,1,93,Marc MARQUEZ
1,2,46,Valentino ROSSI
2,3,19,Alvaro BAUTISTA
3,4,44,Pol ESPARGARO
4,5,26,Dani PEDROSA


## Free Practice

In [291]:
filename = "../Data/FP/2022-INA-FP4.pdf"

In [292]:
# getting the data for the free practice pdf is problematic because the table is split into two sides in one page
dfl = tb.read_pdf(filename, area = (20, 0, 730, 133), columns = [79], pandas_options = {'header': False}, pages = 'all')  # left side
dfr = tb.read_pdf(filename, area = (20, 318, 730, 399), columns = [340], pages = 'all')  # right side

dfl[0].head(20)

Got stderr: Feb 11, 2023 4:07:54 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Feb 11, 2023 4:07:54 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Feb 11, 2023 4:07:54 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>

Got stderr: Feb 11, 2023 4:07:56 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Feb 11, 2023 4:07:56 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Feb 11, 2023 4:07:56 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>



Unnamed: 0,rtamin,a Mandalika
0,,4301 m.
1,* La,p / Sector time
2,P Cro,ssing the finis
3,** Ty,re data subject
4,Lap,Lap Time
5,1st,21 FraMon
6,Run,# 1 Fro
7,1,2'35.052
8,2,1'35.145
9,3,1'38.322


In [293]:
# next step is to combine all those data together, from a list of dataframes to one huge dataframe
# the combination goes like this: on the first page, we take the data from the left side first, then to the right side, then to the next page
# therefore, [dfl[0], dfr[0], dfl[1], dfr[1], dfl[2], ...]
combine_df = []
for i in range(len(dfl)):
    combine_df.append(dfl[i])
    combine_df.append(dfr[i])
df = pd.DataFrame(np.concatenate(combine_df, axis=0), columns=dfl[0].columns)
df

Unnamed: 0,rtamin,a Mandalika
0,,4301 m.
1,* La,p / Sector time
2,P Cro,ssing the finis
3,** Ty,re data subject
4,Lap,Lap Time
...,...,...
485,10,1'35.013
486,11,1'40.916 *
487,12,1'34.667
488,13,1'35.203


In [294]:
# next step is to remove unnecessary values and clean the data
df2 = df.copy()
df2.rename(columns={df2.columns[0]: 'Lap Number', df2.columns[1]: 'Lap Time'}, inplace=True)

# remove instances that doesn't contain any digit from 'Lap Number'
df2['Lap Number'].replace('^([^0-9]*)$', '', regex=True, inplace=True) 
df2['Lap Number'].replace('', np.nan, inplace=True)  # dunno if this two-step replacement is necessary, lazy to check

df2.dropna(inplace=True)
df2.reset_index(drop=True, inplace=True)

#[^\'.\d]
#\d{0,2}'\d\d\.\d\d\d.*
df2

Unnamed: 0,Lap Number,Lap Time
0,1st,21 FraMon
1,1,2'35.052
2,2,1'35.145
3,3,1'38.322
4,4,1'39.004 P
...,...,...
404,10,1'35.013
405,11,1'40.916 *
406,12,1'34.667
407,13,1'35.203


In [295]:
# get rider number for identification purposes
_temp = pd.to_numeric(df2['Lap Number'], errors='coerce')
riders = df2[_temp.isna()].copy()

riders['Lap Time'].replace('\D+', '', regex=True, inplace=True) 
riders['Lap Time'].replace('', np.nan, inplace=True)
riders.dropna(inplace=True)

riders.head()
#df2['Lap Number'].str.findall(r'\d{0,2}[a-z][a-z]')

Unnamed: 0,Lap Number,Lap Time
0,1st,21
17,2nd,72
33,3rd,5
48,4th,20
68,5th,42


In [296]:
rider_number = riders['Lap Time']  # this will be the rider's identity, make this as a column

In [306]:
# now we need to get all the lap times for each rider, this is done by iterating
# through df2['Lap Time'] and using rider_index as a way to separate which lap time belongs to which rider
rider_index = riders.index
laps = df2['Lap Time']

k = 0  # this is a pointer to the index of rider_index
"""
[0, 17, 33, 48, 68, 84, 101, 117, 134, 154, 171, 188, 206, 222, 240, 260, 279, 293, 310, 327, 346, 364, 380, 394]
for example, when i reaches 17, k points to the 1st index,
and when i reaches 33, k points to the 2nd index, etc
"""
laps_list = [[] for i in range(len(rider_index))]
for i in range(1, len(laps)):  # we skip i=0 because we know i=0 doesn't contain any laptime
    if i in rider_index:
        k += 1
    elif re.search(r"\d{0,2}'\d\d\.\d\d\d.*", laps[i]):
        laps_list[k].append(laps[i])

In [308]:
df3 = pd.DataFrame(laps_list).transpose()
df3.columns = rider_number
# obviously, the first lap out of pits never counts, so we can easily remove them
df3.drop(0, inplace=True)
df3.head()

Lap Time,21,72,5,20,42,89,23,43,88,10,...,33,30,36,12,73,4,87,25,49,40
1,1'35.145,1'36.986,1'34.027,1'34.523,1'34.640,1'34.586,1'34.338,2'12.877,1'34.373,1'36.717,...,1'38.162,1'35.390,1'35.121,1'35.329,1'36.966,1'34.657,1'37.975,1'34.315,1'35.185,1'35.744
2,1'38.322,1'36.138,1'43.864,1'33.994,1'37.364,1'34.409,1'38.094,1'41.049 P,1'33.705,1'35.055,...,1'34.213,1'34.985,1'33.767,1'34.816,1'34.925,1'34.063,1'34.062,1'46.972 P,1'34.853,1'35.312
3,1'39.004 P,1'37.750 P,1'34.736,1'44.739 P,1'34.270,1'40.697 P,1'37.407 P,21'22.815,1'34.887 P,1'36.813 P,...,1'38.633 P,1'38.495 P,1'37.687 P,1'37.968 P,1'37.687 P,1'38.826 *,1'41.277 P,21'24.665,1'36.473 P,1'36.777 P
4,20'03.694 P,20'01.670 P,1'33.427,20'02.127 P,1'34.031,20'24.785 P,19'49.976 P,1'49.994 P,20'14.286 P,20'13.014 P,...,20'10.558 P,20'03.104 P,20'41.602 P,20'54.100 P,20'02.201 P,1'41.034 P,19'50.439 P,1'34.719,20'20.051 P,19'58.294 P
5,1'33.996,1'32.774,1'33.450,1'33.378,1'33.331,1'33.755 *,1'40.708,4'41.840,1'33.874,1'34.528,...,1'35.179 *,1'40.596,1'54.054 P,1'39.031,1'33.512,19'08.185 P,1'34.285,1'33.761,1'33.956,1'34.558 *


In [309]:
# next, we remove laptimes that are the first laps out of pits, 
# the laps where the rider enters the pit (marked with 'P'),
# and the invalid laps (marked with '*').