In [1]:
import pandas as pd
import numpy as np
from functions import duration_in_hours
import plotly.express as px
from dash import Dash, dcc, html
import webbrowser
from threading import Timer
import yaml

In [2]:
# Load the project configuration; the data paths used by later cells are read from it.
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    # Only a missing file is reported with this message. Other failures (for
    # example a YAML parse error) propagate with their own traceback instead of
    # being mislabelled as "not found".
    print("The configuration file was not found!")
    # Later cells index into `config`, so stop here rather than continuing
    # with `config` undefined.
    raise

## Section 1: Tour and Winners
In this section we do the following:
- create the "tours" and "winners" dataframes and join them together as "df".
- convert the "time" column from its clean format "[hh, mm, ss]" into a more usable float column, "time_hours".
- use the new "time_hours" format to calculate "avg_speed".
- infer the avg speed and winning times for rows representing 1905-1912, where official finishing times were not available.

In [3]:
# Build a single year-indexed dataframe from the clean "tours" and "winners" files.
clean_paths = config["data"]["clean"]

tours_path = clean_paths["clean_3"]
tours = pd.read_csv(tours_path).set_index("year", drop=True)

winners_path = clean_paths["clean_4"]
winners = pd.read_csv(winners_path).set_index("year", drop=True)

# Left join on the shared "year" index: every tour row is kept, and any column
# name present in both frames gets an "_x" suffix on the "winners" side.
df = tours.join(winners, how="left", rsuffix="_x")

In [4]:
def _time_to_hours(time_value):
    """Convert one entry of the "time" column with ``duration_in_hours``."""
    # NOTE(review): the pd.Series wrapper looks unnecessary if duration_in_hours
    # returns a plain number -- confirm against the helper before removing it.
    return pd.Series(duration_in_hours(time_value))


# Turn the "time" lists into a more usable total hours value.
df["time_hours"] = df["time"].apply(_time_to_hours)

# Average winning speed for each year: distance over winning time, to 2 decimals.
df["avg_speed"] = (df["km"] / df["time_hours"]).round(2)

In [5]:
# Replace the unusable (infinite) values in the average speed column with the minimum value found in that column.
# The minimum value was chosen because the years requiring the replacement value were 1905-1912, when speeds were low.
# Only "avg_speed" is rewritten; a frame-wide replace would also overwrite an
# infinity in any other column with a speed value.
df = df.assign(avg_speed=df["avg_speed"].replace(np.inf, df["avg_speed"].min()))

In [6]:
# Now that the average speed has been inferred, we can infer the winning times for the same years.
# Rows whose winning time is 0 get distance / inferred average speed; all other
# rows keep their recorded time. Done column-wise instead of with a per-row apply.
df["time_hours"] = df["time_hours"].mask(df["time_hours"].eq(0), df["km"] / df["avg_speed"])

In [7]:
# Summary statistics of the numeric columns, rounded to 2 decimals for display.
df.describe().round(2)

Unnamed: 0,stages,starters,finishers,miles,km,stages_won,stages_led,height,weight,born,age,decade,time_hours,avg_speed
count,109.0,109.0,109.0,109.0,109.0,109.0,109.0,109.0,109.0,109.0,109.0,109.0,109.0,109.0
mean,20.08,144.1,90.78,2602.29,4188.02,2.73,10.72,1.78,71.17,1937.3,28.14,1960.92,129.32,34.13
std,3.37,40.61,46.4,440.3,708.88,1.82,5.28,0.04,5.98,34.75,3.37,35.11,44.22,5.69
min,6.0,60.0,10.0,1509.0,2428.0,0.0,1.0,1.61,52.0,1871.0,20.0,1900.0,79.54,23.97
25%,20.0,120.0,51.0,2259.0,3635.0,1.0,6.0,1.76,67.0,1907.0,26.0,1930.0,92.55,30.36
50%,21.0,132.0,86.0,2558.0,4117.0,2.0,11.0,1.77,75.0,1942.0,28.0,1960.0,116.27,35.52
75%,22.0,184.0,138.0,2884.0,4642.0,4.0,14.0,1.78,75.0,1964.0,30.0,1990.0,147.86,39.19
max,25.0,210.0,174.0,3570.0,5745.0,8.0,22.0,1.9,88.0,1998.0,36.0,2020.0,238.74,41.84


#### Charts
In this section we build the following charts:
- *A line chart plotting the number of riders starting the tour vs the number finishing the tour.*
- *A line chart plotting the change in the winning average speed over time.*
- *A line chart plotting the change in winning times over time.*
- *A line chart plotting the change in total distance covered by the tour over time.*
- *Histograms showing the age, weight and height distribution of tour winners.*
- *Histograms showing which riders and countries achieved the most tour wins.*

In [8]:
# Exclude Lance Armstrong's rows: he was disqualified and stripped of his wins from 1999-2005.
df = df.loc[df["rider"].ne("lance armstrong")]

In [9]:
def _line_by_year(y, y_labels):
    """Line chart of one or more "df" columns plotted against the year index."""
    return px.line(df, x=df.index, y=y, labels={**y_labels, "year": "Year"})


# Riders starting the tour vs riders finishing it.
starters_finishers = _line_by_year(["starters", "finishers"], {"value": "No. of riders"})

# Winning average speed over time.
avg_speed = _line_by_year("avg_speed", {"avg_speed": "Avg Winning Speed (km/h)"})

# Winning time over time.
winning_time = _line_by_year("time_hours", {"time_hours": "Winning Time"})

# Total distance of the tour over time.
total_distance = _line_by_year("km", {"km": "Total Distance (km)"})

def _winner_histogram(column, axis_label, image_path, nbins=None):
    """Percentage histogram of one winner attribute, also written to image_path."""
    figure = px.histogram(df, x=column, nbins=nbins, histnorm="percent", labels={column: axis_label})
    figure = figure.update_yaxes(title_text="Percentage")
    figure.write_image(image_path)
    return figure


# Age, weight and height distributions of the tour winners.
winning_age = _winner_histogram("age", "Winning Age", "../images/age.png")

winning_weight = _winner_histogram("weight", "Winning Rider Weight", "../images/weight.png", nbins=8)

winning_height = _winner_histogram("height", "Winning Rider Height", "../images/height.png", nbins=5)

# Count rows per rider (non-null "time" entries), most wins first.
tour_winners_cols = df.loc[:, ["rider", "time"]]
tour_winners_grouped = (
    tour_winners_cols
    .groupby(by="rider")
    .count()
    .sort_values(by="time", ascending=False)
    .reset_index()
)

# Keep only riders at or above the 90th percentile of win counts.
win_cutoff = tour_winners_grouped["time"].quantile(0.9)
top_rider_counts = tour_winners_grouped[tour_winners_grouped["time"] >= win_cutoff]

top_riders = px.histogram(top_rider_counts, x="rider", y="time", labels={"rider": "Rider"})
top_riders = top_riders.update_yaxes(title_text="No. of Tour Wins")

# Share of tours won per country, restricted to tours after 1970.
filtered = df.loc[df.index > 1970]

# Name the two columns explicitly instead of relying on the defaults produced
# by value_counts().reset_index().
winning_country = filtered["country"].value_counts().reset_index()
winning_country.columns = ["country", "count"]
winning_country_sorted = winning_country.sort_values(by="count", ascending=False)

top_country = px.histogram(
    winning_country_sorted,
    x="country",
    y="count",
    histnorm="percent",
    labels={"country": "Country"},
)
top_country = top_country.update_yaxes(title_text="Percentage of tours won")

## Section 2: Stages and Riders
In this section we do the following:
- *Create the "stages" and "finishers" dataframes.*
- *Convert "margin", "winning_time_parts" and "finishing_time_parts" from "[hh, mm, ss]" to float hours.*

In [10]:
# Load the clean "stages" and "finishers" data into two separate dataframes.
clean_files = config["data"]["clean"]

stages_path = clean_files["clean_2"]
stages = pd.read_csv(stages_path)

finishers_path = clean_files["clean_1"]
finishers = pd.read_csv(finishers_path)

In [11]:
# Convert each time-parts column of "finishers" to a total-hours value, in place.
# NOTE(review): the pd.Series wrapper looks unnecessary if duration_in_hours
# returns a plain number -- confirm against the helper before removing it.
for time_column in ("margin", "winning_time_parts", "finishing_time_parts"):
    finishers[time_column] = finishers[time_column].apply(
        lambda parts: pd.Series(duration_in_hours(parts))
    )

In [12]:
# Box plot of the total tour distance, saved as a static image.
km = px.box(df["km"], x="km", labels=dict(km="Total Distance"))
km.write_image("../images/km.png")

# Box plot of the winning times (in hours), saved as a static image.
time = px.box(df["time_hours"], x="time_hours", labels=dict(time_hours="Winning Times"))
time.write_image("../images/time.png")

# Number of stages of each type per decade, one column per stage type.
stages_pivot1 = (
    stages
    .pivot_table(index="decade", columns="type", aggfunc="size", fill_value=0)
    .reset_index()
)

# Bars are normalised so each decade shows the percentage split between stage types.
stage_kinds = ['mountain stage', 'plain stage', 'time trial']
stage_type1 = px.histogram(
    stages_pivot1,
    x="decade",
    y=stage_kinds,
    barnorm="percent",
    labels={"decade": "Decade"},
)
stage_type1 = stage_type1.update_yaxes(title_text="%")
stage_type1.write_image("../images/stage_type.png")

#### Charts
In this section we build the following charts from the "stages" and "finishers" dataframes:
- *A bar chart showing the number of stages in each tour broken down by stage type - i.e. mountain, plain or time-trial.*
- *A box plot showing the spread of finishing times each year.*
- *Histograms showing the riders and countries with the most stage wins.*

In [13]:
# Number of stages of each type per year, for tours from 2000 onwards.
stages_pivot = (
    stages
    .pivot_table(index="year", columns="type", aggfunc="size", fill_value=0)
    .reset_index()
    .loc[lambda per_year: per_year["year"] >= 2000]
)

stage_type = px.bar(
    stages_pivot,
    x="year",
    y=['mountain stage', 'plain stage', 'time trial'],
    labels={"year": "Year"},
)
stage_type = stage_type.update_yaxes(title_text="Count")

# Spread of finishing times per year, for tours from 2000 onwards.
finishers_since_2000 = finishers.loc[finishers["year"] >= 2000]
finishing_time = px.box(
    finishers_since_2000,
    x="year",
    y="finishing_time_parts",
    labels={"year": "Year", "finishing_time_parts": "Finishing Time"},
)

def _stage_wins_per(winner_column):
    """Count "stages" rows per distinct value of winner_column, most wins first.

    The count lands in the "year" column, which is the only other column selected.
    """
    return (
        stages[[winner_column, "year"]]
        .groupby(by=winner_column)
        .count()
        .sort_values(by="year", ascending=False)
        .reset_index()
    )


def _top_stage_wins_chart(win_counts, winner_column, quantile, axis_label):
    """Chart the entries whose win count is at or above the given quantile."""
    cutoff = win_counts["year"].quantile(quantile)
    leaders = win_counts[win_counts["year"] >= cutoff]
    chart = px.histogram(leaders, x=winner_column, y="year", labels={winner_column: axis_label})
    return chart.update_yaxes(title_text="No. of Stage Wins")


# Countries in the top 20% by number of stage wins.
stage_winning_country_grouped = _stage_wins_per("winning_country")
stage_wins_by_country = _top_stage_wins_chart(stage_winning_country_grouped, "winning_country", 0.80, "Country")

# Riders in the top 1% by number of stage wins.
stage_winning_rider_grouped = _stage_wins_per("winner")
stage_wins_by_rider = _top_stage_wins_chart(stage_winning_rider_grouped, "winner", 0.99, "Rider")

## Section 3: Chart App
In this section we create the web app using Dash and present the charts created above.

In [14]:
# Create the Dash application that presents the figures built in the cells above.
app = Dash(__name__)

# Intro text shown under each heading (whitespace inside the literals is kept as-is).
tours_intro = '''
        How has the tour changed over time and what attributes are common among the winning riders?
    '''
success_intro = '''
        Which riders and countries have had the most success in the tour?
    '''

# (component id, figure) pairs for each page section, in display order.
tours_figures = [
    ('total_distance-line-chart', total_distance),
    ("stage_type-bar", stage_type),
    ('starters_finishers-line-chart', starters_finishers),
    ('avg_speed-line-chart', avg_speed),
    ('winning_time-line_chart', winning_time),
    ('winning_age-histogram', winning_age),
    ("winning_weight-hist", winning_weight),
    ("winning_height-hist", winning_height),
]
success_figures = [
    ("top_riders-hist", top_riders),
    ("top_country-hist", top_country),
    ("finishing_time-box", finishing_time),
    ("stage_wins_by_country-hist", stage_wins_by_country),
    ("stage_wins_by_rider-hist", stage_wins_by_rider),
]


def _as_graphs(id_figure_pairs):
    """Wrap each (id, figure) pair in a dcc.Graph component, preserving order."""
    return [dcc.Graph(id=graph_id, figure=figure) for graph_id, figure in id_figure_pairs]


# Define the layout of the app: a heading, an intro and the graphs for each section.
app.layout = html.Div(children=[
    html.H1(children='Section 1: Tours and Winners'),
    html.Div(children=tours_intro),
    *_as_graphs(tours_figures),
    html.H1(children='Section 2: Successful Riders and Countries'),
    html.Div(children=success_intro),
    *_as_graphs(success_figures),
])

def open_browser():
    """Open the locally served app (port 8050 on 127.0.0.1) in a new browser window."""
    app_url = "http://127.0.0.1:8050/"
    webbrowser.open_new(app_url)

# Run the app and open it in a new browser tab
if __name__ == '__main__':
    # Schedule the browser to open 1 second from now, before the server is started.
    Timer(1, open_browser).start()
    # Newer Dash releases replace ``run_server`` with ``run``; prefer ``run`` and
    # fall back to ``run_server`` only when ``run`` is not available.
    start_server = app.run if hasattr(app, "run") else app.run_server
    start_server(debug=False)
