In [21]:
import pandas as pd
from functions import trim_and_lower, drop_non_numbers, time_parts
import datetime

In [22]:
# Load the raw Tour de France winners table from a path relative to the notebook.
# NOTE(review): outputs further down show "?" inside a rider name ("poga?ar"), so
# either the raw file or this encoding choice is losing characters — confirm the
# file's real encoding.
path = "../data/raw/tdf_winners.csv"
df = pd.read_csv(path, encoding='cp1252')

In [23]:
# Normalize the frame with the project helper. Presumably it strips whitespace and
# lower-cases text (the displayed values are all lower-case) — confirm in functions.py.
df = trim_and_lower(df)
# Show (rows, columns) as the cell result.
df.shape

(102, 13)

In [9]:
# Columns that are not needed for the cleaned dataset.
# "time" is intentionally NOT in this list: it is parsed in a later cell.
cols_to_drop = ["died", "margin", "avg_speed"]

df = df.drop(cols_to_drop, axis="columns")
df.head()

Unnamed: 0,year,country,rider,team,time,stages_won,stages_led,height,weight,born
0,1903,france,maurice garin,la française,"94h 33' 14""",3,6.0,1.62m,60kg,1871-03-03
1,1904,france,henri cornet,conte,"96h 05' 55""",1,3.0,,,1884-08-04
2,1905,france,louis trousselier,peugeot–wolber,,5,10.0,,,1881-06-29
3,1906,france,rené pottier,peugeot–wolber,,5,12.0,,,1879-06-05
4,1907,france,lucien petit-breton,peugeot–wolber,,2,5.0,,,1882-10-18


In [10]:
# Parse every raw time string with the project helper and store the result in the
# "time" column. The displayed output shows three-element lists, with [0, 0, 0]
# for rows whose time is missing.
#
# time_parts is passed straight to apply: wrapping each result in a one-element
# pd.Series made apply build a whole DataFrame that was then assigned to a single
# column, which is slower and only works because that DataFrame has one column.
#
# NOTE(review): this cell overwrites its own input, so running it twice feeds the
# already-parsed lists back into time_parts — restart and run all rather than
# re-running this cell on its own.
df["time"] = df["time"].apply(time_parts)
df.head()

Unnamed: 0,year,country,rider,team,time,stages_won,stages_led,height,weight,born
0,1903,france,maurice garin,la française,"[94, 33, 14]",3,6.0,1.62m,60kg,1871-03-03
1,1904,france,henri cornet,conte,"[96, 5, 55]",1,3.0,,,1884-08-04
2,1905,france,louis trousselier,peugeot–wolber,"[0, 0, 0]",5,10.0,,,1881-06-29
3,1906,france,rené pottier,peugeot–wolber,"[0, 0, 0]",5,12.0,,,1879-06-05
4,1907,france,lucien petit-breton,peugeot–wolber,"[0, 0, 0]",2,5.0,,,1882-10-18


In [11]:
# Stages-led counts used to fill the NaN entries of the stages_led column,
# keyed by race year.
missing_stages_led = {
    2020: 2,
    2021: 14,
    2022: 11
}

# Look up each row's year in the dictionary and use the result only where
# stages_led is NaN, then cast the float column to integers in one vectorized
# step. astype raises if any NaN is still left, so a year missing from the
# dictionary is reported instead of being written out silently.
df["stages_led"] = (
    df["stages_led"]
    .fillna(df["year"].map(missing_stages_led))
    .astype("int64")
)
df

Unnamed: 0,year,country,rider,team,time,stages_won,stages_led,height,weight,born
0,1903,france,maurice garin,la française,"[94, 33, 14]",3,6,1.62m,60kg,1871-03-03
1,1904,france,henri cornet,conte,"[96, 5, 55]",1,3,,,1884-08-04
2,1905,france,louis trousselier,peugeot–wolber,"[0, 0, 0]",5,10,,,1881-06-29
3,1906,france,rené pottier,peugeot–wolber,"[0, 0, 0]",5,12,,,1879-06-05
4,1907,france,lucien petit-breton,peugeot–wolber,"[0, 0, 0]",2,5,,,1882-10-18
...,...,...,...,...,...,...,...,...,...,...
97,2018,great britain,geraint thomas,team sky,"[83, 17, 13]",2,11,1.83m,71kg,1986-05-25
98,2019,colombia,egan bernal,team ineos,"[82, 57, 0]",0,2,1.75m,60kg,1997-01-13
99,2020,slovenia,tadej poga?ar,uae team emirates,"[87, 20, 13]",3,2,1.77m,66kg,1998-09-21
100,2021,slovenia,tadej poga?ar,uae team emirates,"[82, 56, 36]",3,14,1.77m,66kg,1998-09-21


In [12]:
# Weight: fill the gaps with the most frequent value, then let the project helper
# turn strings such as "60kg" into plain numbers.
weight_mode = df["weight"].mode()[0]
df["weight"] = df["weight"].fillna(weight_mode).apply(drop_non_numbers)

# Height: same treatment (e.g. "1.62m" becomes 1.62).
height_mode = df["height"].mode()[0]
df["height"] = df["height"].fillna(height_mode).apply(drop_non_numbers)

In [13]:
# Inspect the frame after the height/weight cleanup.
df

Unnamed: 0,year,country,rider,team,time,stages_won,stages_led,height,weight,born
0,1903,france,maurice garin,la française,"[94, 33, 14]",3,6,1.62,60.0,1871-03-03
1,1904,france,henri cornet,conte,"[96, 5, 55]",1,3,1.86,62.0,1884-08-04
2,1905,france,louis trousselier,peugeot–wolber,"[0, 0, 0]",5,10,1.86,62.0,1881-06-29
3,1906,france,rené pottier,peugeot–wolber,"[0, 0, 0]",5,12,1.86,62.0,1879-06-05
4,1907,france,lucien petit-breton,peugeot–wolber,"[0, 0, 0]",2,5,1.86,62.0,1882-10-18
...,...,...,...,...,...,...,...,...,...,...
97,2018,great britain,geraint thomas,team sky,"[83, 17, 13]",2,11,1.83,71.0,1986-05-25
98,2019,colombia,egan bernal,team ineos,"[82, 57, 0]",0,2,1.75,60.0,1997-01-13
99,2020,slovenia,tadej poga?ar,uae team emirates,"[87, 20, 13]",3,2,1.77,66.0,1998-09-21
100,2021,slovenia,tadej poga?ar,uae team emirates,"[82, 56, 36]",3,14,1.77,66.0,1998-09-21


In [14]:
# Sanity check: count the weights that are still missing after imputation.
df.weight.isna().sum()

0

In [15]:
# Mean weight over all rows; note that this includes the mode-imputed values
# filled in earlier, not only the weights present in the raw data.
df.weight.mean()

65.93137254901961

In [16]:
# (rows, columns) after dropping the unused columns.
df.shape

(102, 10)

In [17]:
# Reduce the birth date to a birth year.
# The dtype guard keeps this cell safe to re-run: once "born" already holds
# integer years, sending it through pd.to_datetime again would interpret those
# integers as offsets from the Unix epoch and silently turn every year into 1970.
if not pd.api.types.is_integer_dtype(df["born"]):
    df["born"] = pd.to_datetime(df["born"]).dt.year

# Age is the difference between the race year and the birth year, i.e. the age
# reached during that calendar year; it does not check whether the birthday had
# already passed when the race took place.
df["age"] = df["year"] - df["born"]
df

Unnamed: 0,year,country,rider,team,time,stages_won,stages_led,height,weight,born,age
0,1903,france,maurice garin,la française,"[94, 33, 14]",3,6,1.62,60.0,1871,32
1,1904,france,henri cornet,conte,"[96, 5, 55]",1,3,1.86,62.0,1884,20
2,1905,france,louis trousselier,peugeot–wolber,"[0, 0, 0]",5,10,1.86,62.0,1881,24
3,1906,france,rené pottier,peugeot–wolber,"[0, 0, 0]",5,12,1.86,62.0,1879,27
4,1907,france,lucien petit-breton,peugeot–wolber,"[0, 0, 0]",2,5,1.86,62.0,1882,25
...,...,...,...,...,...,...,...,...,...,...,...
97,2018,great britain,geraint thomas,team sky,"[83, 17, 13]",2,11,1.83,71.0,1986,32
98,2019,colombia,egan bernal,team ineos,"[82, 57, 0]",0,2,1.75,60.0,1997,22
99,2020,slovenia,tadej poga?ar,uae team emirates,"[87, 20, 13]",3,2,1.77,66.0,1998,22
100,2021,slovenia,tadej poga?ar,uae team emirates,"[82, 56, 36]",3,14,1.77,66.0,1998,23


In [18]:
# Check the column dtypes after all conversions.
df.dtypes

year            int64
country        object
rider          object
team           object
time           object
stages_won      int64
stages_led      int64
height        float64
weight        float64
born            int32
age             int64
dtype: object

In [19]:
# (rows, columns) of the final frame, including the new "age" column.
df.shape

(102, 11)

In [20]:
# Save the cleaned DataFrame to a new CSV file with 'utf-8' encoding, without the
# index column.
# The path is relative to the notebook, like the input path, so the notebook runs
# on any machine instead of only where one user's home directory exists.
# NOTE(review): the previous absolute path ended in project/data/clean/winners.csv;
# confirm that "../data" resolves to that same project data folder.
# NOTE: the "time" column holds Python lists, which are written to the CSV as
# text such as "[94, 33, 14]".
output_file_path = "../data/clean/winners.csv"
df.to_csv(output_file_path, encoding='utf-8', index=False)