In [1]:
import pandas as pd
from functions import trim_and_lower, extract_string_in_parentheses, drop_non_numbers, calculate_decade, remove_country
import re
import yaml

In [2]:
# Load the project configuration that holds the input/output data paths.
# Only a missing file is reported with the friendly message; any other problem
# (e.g. malformed YAML) propagates with its own, more accurate error.
try:
    with open("../config.yaml", "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    print("The configuration file was not found!")
    # Re-raise: later cells need `config`, so continuing would only defer the
    # failure to a less informative NameError.
    raise

The following cell defines the custom functions that apply only to this notebook and therefore should not be added to the external custom functions module (`functions.py`).

In [3]:
def split_course(course):
    """Split a course description into its start and finish locations.

    The string is split on the literal separator " to ".

    Args:
        course: Course text such as "paris to lyon".

    Returns:
        A ``(start, finish)`` tuple of strings. If the separator is absent,
        the whole string is used for both values. If it occurs more than
        once, the first segment is the start and the last is the finish.
    """
    parts = course.split(" to ")
    # With a single segment parts[-1] is parts[0], so start == finish.
    return parts[0], parts[-1]


In [4]:
# Look up the raw dataset location in the loaded configuration and read it.
path = config["data"]["raw"]["raw_2"]
df = pd.read_csv(filepath_or_buffer=path)

The following cell applies an external function to the entire dataframe to achieve the following:
- remove leading and trailing spaces from column names
- replace spaces in column names with "_"
- transform column names to lowercase
- transform dataset strings to lowercase
- remove leading and trailing spaces from strings in the dataset

In [5]:
# Clean the column names and string values with the shared helper imported
# from functions.py, then preview the first rows of the result.
df = trim_and_lower(df)
df.head()

Unnamed: 0,year,date,stage,course,distance,type,winner
0,1903,1903-07-01,1,paris to lyon,467 km (290 mi),plain stage,maurice garin (fra)
1,1903,1903-07-05,2,lyon to marseille,374 km (232 mi),stage with mountain(s),hippolyte aucouturier (fra)
2,1903,1903-07-08,3,marseille to toulouse,423 km (263 mi),plain stage,hippolyte aucouturier (fra)
3,1903,1903-07-12,4,toulouse to bordeaux,268 km (167 mi),plain stage,charles laeser (sui)
4,1903,1903-07-13,5,bordeaux to nantes,425 km (264 mi),plain stage,maurice garin (fra)


In [6]:
# Split the free-text distance (e.g. "467 km (290 mi)") into two columns:
# the parenthesised part feeds "miles", the text before "(" feeds "km".
df["miles"] = df["distance"].apply(
    lambda text: drop_non_numbers(extract_string_in_parentheses(text))
)
df["km"] = df["distance"].apply(
    lambda text: drop_non_numbers(text.split("(")[0])
)
df.head()

Unnamed: 0,year,date,stage,course,distance,type,winner,miles,km
0,1903,1903-07-01,1,paris to lyon,467 km (290 mi),plain stage,maurice garin (fra),290.0,467.0
1,1903,1903-07-05,2,lyon to marseille,374 km (232 mi),stage with mountain(s),hippolyte aucouturier (fra),232.0,374.0
2,1903,1903-07-08,3,marseille to toulouse,423 km (263 mi),plain stage,hippolyte aucouturier (fra),263.0,423.0
3,1903,1903-07-12,4,toulouse to bordeaux,268 km (167 mi),plain stage,charles laeser (sui),167.0,268.0
4,1903,1903-07-13,5,bordeaux to nantes,425 km (264 mi),plain stage,maurice garin (fra),264.0,425.0


In [7]:
# Convert the date strings into datetime values.
df["date"] = pd.to_datetime(df["date"])

# Split every course into its start and finish. Plain lists are used rather
# than building a pd.Series per row inside apply, which is slow for no benefit.
endpoints = [split_course(course) for course in df["course"]]
df["start"] = [start for start, _ in endpoints]
df["finish"] = [finish for _, finish in endpoints]

# Collapse the stage descriptions into three categories. Order matters:
# anything mentioning "mountain" or "hill" wins, then "trial", else plain.
df["type"] = [
    "mountain stage" if ("mountain" in item or "hill" in item)
    else "time trial" if "trial" in item
    else "plain stage"
    for item in df["type"]
]

# The parenthesised suffix of the winner (e.g. "(fra)") becomes its own column.
df["winning_country"] = df["winner"].apply(extract_string_in_parentheses)

# Remove the characters after and including the first "[" or "(" from the "winner" column.
df["winner"] = df["winner"].apply(remove_country)

df["decade"] = df["year"].apply(calculate_decade)

# The raw text columns have been fully decomposed above and are no longer needed.
df = df.drop(columns=["course", "distance"])

df.head()

Unnamed: 0,year,date,stage,type,winner,miles,km,start,finish,winning_country,decade
0,1903,1903-07-01,1,plain stage,maurice garin,290.0,467.0,paris,lyon,fra,1900
1,1903,1903-07-05,2,mountain stage,hippolyte aucouturier,232.0,374.0,lyon,marseille,fra,1900
2,1903,1903-07-08,3,plain stage,hippolyte aucouturier,263.0,423.0,marseille,toulouse,fra,1900
3,1903,1903-07-12,4,plain stage,charles laeser,167.0,268.0,toulouse,bordeaux,sui,1900
4,1903,1903-07-13,5,plain stage,maurice garin,264.0,425.0,bordeaux,nantes,fra,1900


In [8]:
# "decade" is already derived from "year" in the preceding transformation cell,
# so it is not recomputed here; just preview the cleaned frame before saving.
df.head()

Unnamed: 0,year,date,stage,type,winner,miles,km,start,finish,winning_country,decade
0,1903,1903-07-01,1,plain stage,maurice garin,290.0,467.0,paris,lyon,fra,1900
1,1903,1903-07-05,2,mountain stage,hippolyte aucouturier,232.0,374.0,lyon,marseille,fra,1900
2,1903,1903-07-08,3,plain stage,hippolyte aucouturier,263.0,423.0,marseille,toulouse,fra,1900
3,1903,1903-07-12,4,plain stage,charles laeser,167.0,268.0,toulouse,bordeaux,sui,1900
4,1903,1903-07-13,5,plain stage,maurice garin,264.0,425.0,bordeaux,nantes,fra,1900


In [9]:
# Save the DataFrame to a new CSV file with 'utf-8' encoding
output_file_path = config['data']['clean']['clean_2']
df.to_csv(output_file_path, encoding='utf-8', index=False)