In [1]:
import pandas as pd
from functions import trim_and_lower, extract_string_in_parentheses, time_parts
import re
from datetime import timedelta
import yaml

In [2]:
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except:
    print("The configuration file was not found!")

In [3]:
path = config['data']['raw']['raw_1']
df = pd.read_csv(path)

In [4]:
df = trim_and_lower(df)

Problems:
- time wasn't recorded or was lost for years 1905 to 1912
- over 2000 rows include values where the finishing time was only minutes or seconds behind the winner
- finding the winning time was difficult for years where the winner was dsq, because their ranking was no longer "1"

In [5]:
df["time_parts"] = df.time.apply(lambda x: pd.Series([time_parts(x)]))

In [6]:
w = df.drop_duplicates(subset="year", keep='first')

cols_to_drop = [
    "rank",
    "rider",
    "team",
    "time"
]

w = w.drop(columns=cols_to_drop).set_index("year")
w.rename(columns={'time_parts': 'winning_time_parts'}, inplace=True)

In [7]:
df = df.join(w, on="year", how='left', lsuffix="_x")
df.head()

Unnamed: 0,year,rank,rider,time,team,time_parts,winning_time_parts
0,1903,1,maurice garin (fra),"94h 33' 14""",la française,"[94, 33, 14]","[94, 33, 14]"
1,1903,2,lucien pothier (fra),"+ 2h 59' 21""",la française,"[2, 59, 21]","[94, 33, 14]"
2,1903,3,fernand augereau (fra),"+ 4h 29' 24""",la française,"[4, 29, 24]","[94, 33, 14]"
3,1903,4,rodolfo muller[27] (ita),"+ 4h 39' 30""",la française,"[4, 39, 30]","[94, 33, 14]"
4,1903,5,jean fischer (fra),"+ 4h 58' 44""",la française,"[4, 58, 44]","[94, 33, 14]"


In [8]:
df['finishing_time_parts'] = df.apply(lambda row: row['time_parts'] if row['time_parts'] == row['winning_time_parts'] else [a + b for a, b in zip(row['time_parts'], row['winning_time_parts'])], axis=1)
df.rename(columns={"time_parts": "margin"}, inplace=True)
df['margin'] = df.apply(lambda row: [0, 0, 0] if row['margin'] == row['winning_time_parts'] else row["margin"], axis=1)
df

Unnamed: 0,year,rank,rider,time,team,margin,winning_time_parts,finishing_time_parts
0,1903,1,maurice garin (fra),"94h 33' 14""",la française,"[0, 0, 0]","[94, 33, 14]","[94, 33, 14]"
1,1903,2,lucien pothier (fra),"+ 2h 59' 21""",la française,"[2, 59, 21]","[94, 33, 14]","[96, 92, 35]"
2,1903,3,fernand augereau (fra),"+ 4h 29' 24""",la française,"[4, 29, 24]","[94, 33, 14]","[98, 62, 38]"
3,1903,4,rodolfo muller[27] (ita),"+ 4h 39' 30""",la française,"[4, 39, 30]","[94, 33, 14]","[98, 72, 44]"
4,1903,5,jean fischer (fra),"+ 4h 58' 44""",la française,"[4, 58, 44]","[94, 33, 14]","[98, 91, 58]"
...,...,...,...,...,...,...,...,...
9890,2022,130,frederik frison (bel),"+ 5h 30' 19""",lotto–soudal,"[5, 30, 19]","[79, 33, 20]","[84, 63, 39]"
9891,2022,131,reinardt janse van rensburg (rsa),"+ 5h 31' 25""",lotto–soudal,"[5, 31, 25]","[79, 33, 20]","[84, 64, 45]"
9892,2022,132,amund grøndahl jansen (nor),"+ 5h 31' 27""",team bikeexchange–jayco,"[5, 31, 27]","[79, 33, 20]","[84, 64, 47]"
9893,2022,133,albert torres (esp),"+ 5h 36' 33""",movistar team,"[5, 36, 33]","[79, 33, 20]","[84, 69, 53]"


In [9]:
#df["time_parts"] = df.time.apply(lambda x: pd.Series([time_parts(x)]))

df["country"] = df.rider.apply(lambda x: pd.Series([extract_string_in_parentheses(x)]))
df

Unnamed: 0,year,rank,rider,time,team,margin,winning_time_parts,finishing_time_parts,country
0,1903,1,maurice garin (fra),"94h 33' 14""",la française,"[0, 0, 0]","[94, 33, 14]","[94, 33, 14]",fra
1,1903,2,lucien pothier (fra),"+ 2h 59' 21""",la française,"[2, 59, 21]","[94, 33, 14]","[96, 92, 35]",fra
2,1903,3,fernand augereau (fra),"+ 4h 29' 24""",la française,"[4, 29, 24]","[94, 33, 14]","[98, 62, 38]",fra
3,1903,4,rodolfo muller[27] (ita),"+ 4h 39' 30""",la française,"[4, 39, 30]","[94, 33, 14]","[98, 72, 44]",ita
4,1903,5,jean fischer (fra),"+ 4h 58' 44""",la française,"[4, 58, 44]","[94, 33, 14]","[98, 91, 58]",fra
...,...,...,...,...,...,...,...,...,...
9890,2022,130,frederik frison (bel),"+ 5h 30' 19""",lotto–soudal,"[5, 30, 19]","[79, 33, 20]","[84, 63, 39]",bel
9891,2022,131,reinardt janse van rensburg (rsa),"+ 5h 31' 25""",lotto–soudal,"[5, 31, 25]","[79, 33, 20]","[84, 64, 45]",rsa
9892,2022,132,amund grøndahl jansen (nor),"+ 5h 31' 27""",team bikeexchange–jayco,"[5, 31, 27]","[79, 33, 20]","[84, 64, 47]",nor
9893,2022,133,albert torres (esp),"+ 5h 36' 33""",movistar team,"[5, 36, 33]","[79, 33, 20]","[84, 69, 53]",esp


In [10]:
#display(df.loc[df.time.isna()])
#display(df.loc[df.time_parts.isna()])
#display(df.loc[df.winning_time_parts.isna()])

In [11]:
# Save the DataFrame to a new CSV file with 'utf-8' encoding
output_file_path = config['data']['clean']['clean_1']
df.to_csv(output_file_path, encoding='utf-8', index=False)