In [None]:
# Imports
from functools import partial
import pandas as pd
import numpy as np
import re
import collections
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pyplot import show
import ipywidgets as widgets
from IPython.display import display, clear_output
import plotly
plotly.offline.init_notebook_mode()
import sys
sys.path.append('..')
%load_ext autoreload
%autoreload 1
%aimport study_utils
%aimport lausanne_1999_2016_utils

In [None]:
# Constants
PATH_TO_DATA = '../../Scraping/DataSport/Data/'

# Values offered by the dropdown widgets used throughout the notebook
ALL_YEARS = list(range(1999, 2017))
ALL_AGE_CATEGORIES = [
    'All',
    '10-25 years',
    '26-30 years',
    '31-35 years',
    '36-40 years',
    '41-45 years',
    '46-50 years',
    '51-55 years',
    '56-60 years',
    '61-65 years',
    '65+ years',
]
ALL_SEX_CATEGORIES = ['All', 'Female', 'Male']
ALL_PERFORMANCE_CRITERIA = ['Time', 'Speed (m/s)']

# Labels displayed above the dropdown widgets
YEAR_LABEL, AGE_LABEL, SEX_LABEL, PERFORMANCE_LABEL = (
    widgets.Label(value=label_text)
    for label_text in ('Year', 'Age category', 'Sex category', 'Performance')
)

**NOTE:**
This Notebook contains plots that use widgets. Please make sure to run this Notebook on Jupyter in order to manipulate them correctly.

# Data wrangling

First, we retrieve all the data corresponding to Lausanne Marathons, from 1999 to 2016.

In [None]:
# Load the raw data of all editions from the files located under PATH_TO_DATA
lausanne_all_marathons = study_utils.get_data(PATH_TO_DATA)

In [None]:
# Preview the first rows of the raw data
lausanne_all_marathons.head()

We then apply different computations in order to clean the DataFrame and format its columns for further analysis.

In [None]:
# The result is stored under a new name: the raw DataFrame is still needed for the outlier statistics below
lausanne_all_marathons_cleaned = study_utils.apply_computations(lausanne_all_marathons)

In [None]:
# Row-wise mask computed by study_utils.filter_participants; only the rows it selects are kept
is_kept_participant = lausanne_all_marathons_cleaned.apply(study_utils.filter_participants, axis=1)
lausanne_all_marathons_cleaned = lausanne_all_marathons_cleaned[is_kept_participant]

In [None]:
# Replace the working DataFrame by the result of lausanne_1999_2016_utils.remove_outliers
lausanne_all_marathons_cleaned = lausanne_1999_2016_utils.remove_outliers(lausanne_all_marathons_cleaned)

We also compute the overall rank for each year.

In [None]:
# Replace the working DataFrame by the result of lausanne_1999_2016_utils.compute_overall_rank
lausanne_all_marathons_cleaned = lausanne_1999_2016_utils.compute_overall_rank(lausanne_all_marathons_cleaned)

In [None]:
# Drop the original 'rank' column (the overall rank was computed in the previous step).
# The result is reassigned instead of mutating in place, and errors='ignore' keeps the cell
# re-runnable: the in-place drop raised a KeyError when the cell was executed a second time.
lausanne_all_marathons_cleaned = lausanne_all_marathons_cleaned.drop('rank', axis=1, errors='ignore')

We finally obtain the final DataFrame, ready to be used for analysis.

In [None]:
# Preview the first rows of the cleaned DataFrame used in the rest of the notebook
lausanne_all_marathons_cleaned.head()

In [None]:
# One DataFrame per value of the 'distance (km)' column (10, 21 and 42 km)
lausanne_all_marathons_10km, lausanne_all_marathons_21km, lausanne_all_marathons_42km = (
    lausanne_all_marathons_cleaned[lausanne_all_marathons_cleaned['distance (km)'] == distance_km]
    for distance_km in (10, 21, 42)
)

In [None]:
# Copy of the cleaned DataFrame whose 'time' column holds datetimes (parsed as %H:%M:%S)
# instead of the values stored in the cleaned DataFrame
formatted_times = [
    study_utils.convert_seconds_to_time(seconds)
    for seconds in lausanne_all_marathons_cleaned['time']
]
lausanne_all_marathons_cleaned_with_datetime = lausanne_all_marathons_cleaned.copy()
lausanne_all_marathons_cleaned_with_datetime['time'] = pd.to_datetime(formatted_times, format='%H:%M:%S')

## Statistics about outliers

Let's display the number of runners for whom one or more fundamental attributes are missing.

In [None]:
# 'sex' is one of the checked attributes, so the column is first added to the original (raw) DataFrame
lausanne_all_marathons['sex'] = lausanne_all_marathons.apply(study_utils.get_sex_of_runner, axis=1)

# Attributes inspected by the outlier statistics
columns_to_check = ['sex', 'time', 'rank']
study_utils.get_statistics_outliers(lausanne_all_marathons, columns_to_check)

Finally, let's display the results of the cleaning process.

In [None]:
# Size of the raw and cleaned DataFrames, and the number of rows removed by the cleaning
all_runners = len(lausanne_all_marathons)
considered_runners = len(lausanne_all_marathons_cleaned)
summary_lines = (
    ('Runners in raw DataFrame: ', all_runners),
    ('Runners to be considered for analysis: ', considered_runners),
    ('Unconsidered runners: ', all_runners - considered_runners),
)
for label, count in summary_lines:
    print(label + str(count))

# Analysis of data

## Part 1: General analysis

### Global evolution of number of participants

In [None]:
# Column order used for the resulting DataFrame
columns = ['Marathon', '10 km', 'Semi-marathon']
distribution_of_runners = lausanne_1999_2016_utils.generate_distributions(
    lausanne_all_marathons_10km,
    lausanne_all_marathons_21km,
    lausanne_all_marathons_42km,
    lausanne_1999_2016_utils.filter_by_years,
)[columns]

In [None]:
# Preview the first rows of the distribution computed above
distribution_of_runners.head()

In [None]:
# The returned value is kept because it is part of the data exported to JSON at the end of the notebook
distribution_runners_over_years = lausanne_1999_2016_utils.plot_distribution_over_years(distribution_of_runners)

We easily see some changes over the years thanks to the graph. Among other things, we can identify the new direction taken by Lausanne Marathon: from 2009 onwards, the number of participants started to increase again. We also notice that the number of marathon runners is decreasing over the years, as RTSSport mentioned <a href="https://www.rts.ch/sport/athletisme/1258113-le-marathon-de-lausanne-change-dorientation.html">in an article dedicated to Lausanne Marathon</a>, while, in the meantime, the number of runners in the 10 km race is increasing (and is now higher than the one for the semi-marathon).

#### In-depth: evolution by age categories and gender

First, we create all the figures.

In [None]:
# Creation of all figures: the result is indexed by age category, then by sex category (see plot_evolution)
all_evolution_figures = lausanne_1999_2016_utils.generate_all_evolution_figures(
    lausanne_all_marathons_cleaned,
    ALL_AGE_CATEGORIES,
    ALL_SEX_CATEGORIES,
)

Then, we create handlers.

In [None]:
def plot_evolution():
    """Clear the current output and draw the evolution figure of the current selection.

    The figure is looked up in ``all_evolution_figures`` using the currently
    selected age category and sex category.
    """
    clear_output()
    figures_for_age = all_evolution_figures[selected_age_category_evolution]
    plotly.offline.iplot(figures_for_age[selected_sex_category_evolution])

def update_age_category_evolution(change):
    """Observer callback of the age-category dropdown of the evolution plot.

    ``change['new']`` becomes the selected age category and the plot is
    redrawn, but only when it is one of ``ALL_AGE_CATEGORIES``; any other
    notification is ignored.
    """
    global selected_age_category_evolution
    candidate = change['new']
    if candidate in ALL_AGE_CATEGORIES:
        selected_age_category_evolution = candidate
        plot_evolution()

def update_sex_category_evolution(change):
    """Observer callback of the sex-category dropdown of the evolution plot.

    ``change['new']`` becomes the selected sex category and the plot is
    redrawn, but only when it is one of ``ALL_SEX_CATEGORIES``; any other
    notification is ignored.
    """
    global selected_sex_category_evolution
    candidate = change['new']
    if candidate in ALL_SEX_CATEGORIES:
        selected_sex_category_evolution = candidate
        plot_evolution()

# Default selections
selected_age_category_evolution = 'All'
selected_sex_category_evolution = 'All'

# Widget for age categories
# names='value' restricts the observer to changes of the selected value. Without it, the handler is
# notified of every trait change of the dropdown, which can redraw the plot several times per selection.
age_categories_evolution_widget = widgets.Dropdown(options=ALL_AGE_CATEGORIES, value=selected_age_category_evolution)
age_categories_evolution_widget.observe(update_age_category_evolution, names='value')

# Widget for sex categories
sex_categories_evolution_widget = widgets.Dropdown(options=ALL_SEX_CATEGORIES, value=selected_sex_category_evolution)
sex_categories_evolution_widget.observe(update_sex_category_evolution, names='value')

Finally, we display widgets and graph.

In [None]:
# Show each label followed by its dropdown, then draw the figure for the default selection
display(
    AGE_LABEL, age_categories_evolution_widget,
    SEX_LABEL, sex_categories_evolution_widget,
)
plot_evolution()

### Focus on gender

First, we retrieve gender distribution over the years into a unique DataFrame.

In [None]:
# Same call as for the global distribution, but with the sex-and-years filter
gender_distributions = lausanne_1999_2016_utils.generate_distributions(
    lausanne_all_marathons_10km,
    lausanne_all_marathons_21km,
    lausanne_all_marathons_42km,
    lausanne_1999_2016_utils.filter_by_sex_and_years,
)

In [None]:
# Preview the first six rows of the gender distribution
gender_distributions.head(6)

To plot the results, we separate the data according to the gender of participants.

In [None]:
# Column order used for both genders
columns = ['10 km', 'Semi-marathon', 'Marathon']
# First index level of gender_distributions holds the gender ('female' / 'male')
gender_of_row = gender_distributions.index.get_level_values(0)

gender_distributions_ordered = collections.OrderedDict()
for gender in ('female', 'male'):
    runners = gender_distributions[gender_of_row == gender]
    # The gender level is constant within the selection, so it is dropped from the index
    runners.index = runners.index.droplevel(0)
    gender_distributions_ordered[gender] = runners[columns]

female_runners = gender_distributions_ordered['female']
male_runners = gender_distributions_ordered['male']

In [None]:
# The returned value is kept because it is part of the data exported to JSON at the end of the notebook
gender_distributions_over_years = lausanne_1999_2016_utils.plot_gender_distributions_over_years(gender_distributions_ordered)

What we have seen for Lausanne Marathon 2016 can be generalized here. First, we see that women tend to prefer the 10 km race, while for men, the participants are more evenly spread across the different races. Second, we see that globally, for the 10 km race and the marathon, the evolution is similar for both genders. However, we observe much more variation for men than for women when we consider the semi-marathon, even when taking into account the new direction of Lausanne Marathon established in 2009.

## Zoom on evolution of age of participants

In [None]:
# The result is accessed below through its 'global' and 'detailed' entries
statistics = lausanne_1999_2016_utils.generate_median_age_statistics(lausanne_all_marathons_cleaned)

In [None]:
# Global statistics, rounded to one decimal and transposed for display
statistics['global'].round(1).T

In [None]:
# Name the first two index levels 'Gender' and 'Year'.
# The names are set on the MultiIndex itself: renaming the objects returned by `.index.levels`
# in place does not update the names of the index in recent pandas versions.
statistics['detailed'].index = statistics['detailed'].index.set_names(['Gender', 'Year'], level=[0, 1])
statistics['detailed'].round(1)

In [None]:
# Swap the two index levels, then move the level now in second position into a regular column
median_age_by_gender = statistics['detailed'].swaplevel().reset_index(level=1)

# Display name and colour associated with each gender
gender_traces = {
    'female': {'name': 'Female runners', 'color': '#f442e8'},
    'male': {'name': 'Male runners', 'color': '#4286f4'},
}
median_age_evolution = lausanne_1999_2016_utils.plot_median_age_evolution(median_age_by_gender, groupby_attributes=gender_traces)

The evolution of the age is not very significant, even if a linear trend is possible between 2007 and 2015. Also, we observe that in 2016 the average age seems to increase and break the previous downward trend.

TODO: Add a boxplot to show that the distribution becomes wider over the years while keeping the same mean.

## Part 2: Analysis of performance of Lausanne Marathon editions

### Display of evolution of performance over the years

First, we initialize visualisation parameters and we create Plotly figure.

In [None]:
# Visualisation parameters (initial dropdown selections)
selected_age_category, selected_sex_category, selected_performance_criterion = 'All', 'All', 'Time'

# Creation of all figures: the result is indexed by age category, sex category and
# performance criterion (see update_boxplots_figure)
all_performance_figures = lausanne_1999_2016_utils.generate_all_performance_figures(
    lausanne_all_marathons_cleaned_with_datetime,
    ALL_AGE_CATEGORIES,
    ALL_SEX_CATEGORIES,
    ALL_PERFORMANCE_CRITERIA,
)

We then create the dropdowns and their respective handlers.

In [None]:
# ----------------------------------------------------------------------------------------
# HANDLERS

def update_boxplots_figure():
    """Clear the current output and draw the performance figure of the current selection.

    The figure is looked up in ``all_performance_figures`` using the currently
    selected age category, sex category and performance criterion.
    """
    clear_output()
    figures_for_age = all_performance_figures[selected_age_category]
    figures_for_sex = figures_for_age[selected_sex_category]
    plotly.offline.iplot(figures_for_sex[selected_performance_criterion])

def update_age_category(change, update_function):
    """Store a newly selected age category and refresh the plot.

    ``change['new']`` is stored in ``selected_age_category`` and
    ``update_function`` is called without arguments, but only when the new
    value is one of ``ALL_AGE_CATEGORIES``; any other notification is ignored.
    """
    global selected_age_category
    candidate = change['new']
    if candidate in ALL_AGE_CATEGORIES:
        selected_age_category = candidate
        update_function()

def update_sex_category(change, update_function):
    """Store a newly selected sex category and refresh the plot.

    ``change['new']`` is stored in ``selected_sex_category`` and
    ``update_function`` is called without arguments, but only when the new
    value is one of ``ALL_SEX_CATEGORIES``; any other notification is ignored.
    """
    global selected_sex_category
    candidate = change['new']
    if candidate in ALL_SEX_CATEGORIES:
        selected_sex_category = candidate
        update_function()
    
def update_performance_criterion(change, update_function):
    """Store a newly selected performance criterion and refresh the plot.

    ``change['new']`` is stored in ``selected_performance_criterion`` and
    ``update_function`` is called without arguments, but only when the new
    value is one of ``ALL_PERFORMANCE_CRITERIA``; any other notification is ignored.
    """
    global selected_performance_criterion
    candidate = change['new']
    if candidate in ALL_PERFORMANCE_CRITERIA:
        selected_performance_criterion = candidate
        update_function()

# Bind the redraw function so that each handler has the single-argument signature expected by observe()
update_age_category_with_args = partial(update_age_category, update_function=update_boxplots_figure)
update_sex_category_with_args = partial(update_sex_category, update_function=update_boxplots_figure)
update_performance_criterion_with_args = partial(update_performance_criterion, update_function=update_boxplots_figure)

# HANDLERS (END)
# ----------------------------------------------------------------------------------------

# Widget for age categories
# names='value' restricts the observer to changes of the selected value. Without it, the handler is
# notified of every trait change of the dropdown, which can redraw the plot several times per selection.
age_widget = widgets.Dropdown(options=ALL_AGE_CATEGORIES, value=selected_age_category)
age_widget.observe(update_age_category_with_args, names='value')

# Widget for sex categories
sex_widget = widgets.Dropdown(options=ALL_SEX_CATEGORIES, value=selected_sex_category)
sex_widget.observe(update_sex_category_with_args, names='value')

# Widget for performance criteria
performance_widget = widgets.Dropdown(options=ALL_PERFORMANCE_CRITERIA, value=selected_performance_criterion)
performance_widget.observe(update_performance_criterion_with_args, names='value')

Finally, we display the graph.

In [None]:
# Show each label followed by its dropdown, then draw the figure for the default selection
display(
    AGE_LABEL, age_widget,
    SEX_LABEL, sex_widget,
    PERFORMANCE_LABEL, performance_widget,
)
default_performance_figure = all_performance_figures[selected_age_category][selected_sex_category][selected_performance_criterion]
plotly.offline.iplot(default_performance_figure)

If we consider all the runners, there seem to be some oscillations over the years, but overall the differences do not seem significant.

#### Display of evolution of number of participants and performance

First, we define the default values of the dropdowns, and we generate all the figures by reusing the figures created for the separate plots (see above).

In [None]:
# Initial dropdown selections of the combined evolution/performance plot
(selected_age_category_evolution_performance,
 selected_sex_category_evolution_performance,
 selected_performance_evolution_performance) = ('All', 'All', 'Time')

In [None]:
# IMPORTANT: These two statements must be executed together (permanent modifications of dictionary is done when generating figures)
all_evolution_performance_data = lausanne_1999_2016_utils.join_evolution_and_performance_data(
    all_evolution_figures,
    all_performance_figures,
    ALL_AGE_CATEGORIES,
    ALL_SEX_CATEGORIES,
    ALL_PERFORMANCE_CRITERIA,
)
all_evolution_performance_figures = lausanne_1999_2016_utils.generate_evolution_and_performance_figures(
    all_evolution_performance_data,
    ALL_AGE_CATEGORIES,
    ALL_SEX_CATEGORIES,
    ALL_PERFORMANCE_CRITERIA,
)

In [None]:
# ----------------------------------------------------------------------------------------
# HANDLERS

def plot_evolution_and_performance():
    """Clear the current output and draw the combined figure of the current selection.

    The figure is looked up in ``all_evolution_performance_figures`` using the
    currently selected age category, sex category and performance criterion.
    """
    clear_output()
    figures_for_age = all_evolution_performance_figures[selected_age_category_evolution_performance]
    figures_for_sex = figures_for_age[selected_sex_category_evolution_performance]
    plotly.offline.iplot(figures_for_sex[selected_performance_evolution_performance])
    
def update_age_category_evolution_performance(change):
    """Observer callback of the age-category dropdown of the combined plot.

    ``change['new']`` becomes the selected age category and the plot is
    redrawn, but only when it is one of ``ALL_AGE_CATEGORIES``.
    """
    global selected_age_category_evolution_performance
    candidate = change['new']
    if candidate in ALL_AGE_CATEGORIES:
        selected_age_category_evolution_performance = candidate
        plot_evolution_and_performance()
    
def update_sex_category_evolution_performance(change):
    """Observer callback of the sex-category dropdown of the combined plot.

    ``change['new']`` becomes the selected sex category and the plot is
    redrawn, but only when it is one of ``ALL_SEX_CATEGORIES``.
    """
    global selected_sex_category_evolution_performance
    candidate = change['new']
    if candidate in ALL_SEX_CATEGORIES:
        selected_sex_category_evolution_performance = candidate
        plot_evolution_and_performance()

def update_performance_evolution_performance(change):
    """Observer callback of the performance-criterion dropdown of the combined plot.

    ``change['new']`` becomes the selected criterion and the plot is
    redrawn, but only when it is one of ``ALL_PERFORMANCE_CRITERIA``.
    """
    global selected_performance_evolution_performance
    candidate = change['new']
    if candidate in ALL_PERFORMANCE_CRITERIA:
        selected_performance_evolution_performance = candidate
        plot_evolution_and_performance()

# HANDLERS (END)
# ----------------------------------------------------------------------------------------

# Widget for age categories
# names='value' restricts the observer to changes of the selected value. Without it, the handler is
# notified of every trait change of the dropdown, which can redraw the plot several times per selection.
age_categories_evolution_performance_widget = widgets.Dropdown(options=ALL_AGE_CATEGORIES, value=selected_age_category_evolution_performance)
age_categories_evolution_performance_widget.observe(update_age_category_evolution_performance, names='value')

# Widget for sex categories
sex_categories_evolution_performance_widget = widgets.Dropdown(options=ALL_SEX_CATEGORIES, value=selected_sex_category_evolution_performance)
sex_categories_evolution_performance_widget.observe(update_sex_category_evolution_performance, names='value')

# Widget for performance criteria
performance_evolution_performance_widget = widgets.Dropdown(options=ALL_PERFORMANCE_CRITERIA, value=selected_performance_evolution_performance)
performance_evolution_performance_widget.observe(update_performance_evolution_performance, names='value')

Finally, we display graph.

In [None]:
# Show each label followed by its dropdown, then draw the figure for the default selection
display(
    AGE_LABEL, age_categories_evolution_performance_widget,
    SEX_LABEL, sex_categories_evolution_performance_widget,
    PERFORMANCE_LABEL, performance_evolution_performance_widget,
)
plot_evolution_and_performance()

### Focus on performance distribution

First, we define the default values of the dropdowns and we generate all the figures from the cleaned data.

In [None]:
# Initial dropdown selections of the performance distribution plot
(selected_age_category_performance_distribution,
 selected_sex_category_performance_distribution,
 selected_performance_performance_distribution) = ('All', 'All', 'Time')

In [None]:
# The result is indexed by age category, sex category and performance criterion
# (see plot_performance_distribution)
all_performance_distribution_figures = lausanne_1999_2016_utils.generate_performance_distribution_figures(
    lausanne_all_marathons_cleaned_with_datetime,
    ALL_AGE_CATEGORIES,
    ALL_SEX_CATEGORIES,
    ALL_PERFORMANCE_CRITERIA,
)

Then, we define handlers and widgets.

In [None]:
# ----------------------------------------------------------------------------------------
# HANDLERS

def plot_performance_distribution():
    """Clear the current output and draw the distribution figure of the current selection.

    The figure is looked up in ``all_performance_distribution_figures`` using the
    currently selected age category, sex category and performance criterion.
    """
    clear_output()
    figures_for_age = all_performance_distribution_figures[selected_age_category_performance_distribution]
    figures_for_sex = figures_for_age[selected_sex_category_performance_distribution]
    plotly.offline.iplot(figures_for_sex[selected_performance_performance_distribution])
    
def update_age_category_performance_distribution(change):
    """Observer callback of the age-category dropdown of the distribution plot.

    ``change['new']`` becomes the selected age category and the plot is
    redrawn, but only when it is one of ``ALL_AGE_CATEGORIES``.
    """
    global selected_age_category_performance_distribution
    candidate = change['new']
    if candidate in ALL_AGE_CATEGORIES:
        selected_age_category_performance_distribution = candidate
        plot_performance_distribution()
    
def update_sex_category_performance_distribution(change):
    """Observer callback of the sex-category dropdown of the distribution plot.

    ``change['new']`` becomes the selected sex category and the plot is
    redrawn, but only when it is one of ``ALL_SEX_CATEGORIES``.
    """
    global selected_sex_category_performance_distribution
    candidate = change['new']
    if candidate in ALL_SEX_CATEGORIES:
        selected_sex_category_performance_distribution = candidate
        plot_performance_distribution()

def update_performance_performance_distribution(change):
    """Observer callback of the performance-criterion dropdown of the distribution plot.

    ``change['new']`` becomes the selected criterion and the plot is
    redrawn, but only when it is one of ``ALL_PERFORMANCE_CRITERIA``.
    """
    global selected_performance_performance_distribution
    candidate = change['new']
    if candidate in ALL_PERFORMANCE_CRITERIA:
        selected_performance_performance_distribution = candidate
        plot_performance_distribution()

# HANDLERS (END)
# ----------------------------------------------------------------------------------------

# Widget for age categories
# names='value' restricts the observer to changes of the selected value. Without it, the handler is
# notified of every trait change of the dropdown, which can redraw the plot several times per selection.
age_categories_performance_distribution_widget = widgets.Dropdown(options=ALL_AGE_CATEGORIES, value=selected_age_category_performance_distribution)
age_categories_performance_distribution_widget.observe(update_age_category_performance_distribution, names='value')

# Widget for sex categories
sex_categories_performance_distribution_widget = widgets.Dropdown(options=ALL_SEX_CATEGORIES, value=selected_sex_category_performance_distribution)
sex_categories_performance_distribution_widget.observe(update_sex_category_performance_distribution, names='value')

# Widget for performance criteria
performance_performance_distribution_widget = widgets.Dropdown(options=ALL_PERFORMANCE_CRITERIA, value=selected_performance_performance_distribution)
performance_performance_distribution_widget.observe(update_performance_performance_distribution, names='value')

In [None]:
# Show each label followed by its dropdown, then draw the figure for the default selection
display(
    AGE_LABEL, age_categories_performance_distribution_widget,
    SEX_LABEL, sex_categories_performance_distribution_widget,
    PERFORMANCE_LABEL, performance_performance_distribution_widget,
)
plot_performance_distribution()

### Comparison between single edition versus all editions of Lausanne Marathon

First, we generate all the boxplots used for the comparison.

In [None]:
# Initial dropdown selections of the comparison plot
selected_age_category_comparison, selected_year_comparison, selected_performance_comparison = 'All', 2016, 'Time'

# The result is indexed by age category, then by year (see plot_comparison)
all_performance_comparison_boxplots = lausanne_1999_2016_utils.generate_performance_comparison(
    lausanne_all_marathons_cleaned_with_datetime,
    ALL_AGE_CATEGORIES,
    ALL_PERFORMANCE_CRITERIA,
)

We also define the handlers and the widgets used to select the age category, the year and the performance criterion.

In [None]:
# ----------------------------------------------------------------------------------------
# HANDLERS

def plot_comparison():
    """Clear the current output and draw the comparison plot of the current selection.

    The boxplot data is looked up in ``all_performance_comparison_boxplots`` by
    age category and year, then handed to
    ``lausanne_1999_2016_utils.plot_performance_comparison`` together with the
    selected age category, year and performance criterion.
    """
    clear_output()
    boxplots_for_age = all_performance_comparison_boxplots[selected_age_category_comparison]
    lausanne_1999_2016_utils.plot_performance_comparison(
        boxplots_for_age[selected_year_comparison],
        selected_age_category_comparison,
        selected_year_comparison,
        selected_performance_comparison,
    )

def update_age_category_comparison(change):
    """Observer callback of the age-category dropdown of the comparison plot.

    ``change['new']`` becomes the selected age category and the plot is
    redrawn, but only when it is one of ``ALL_AGE_CATEGORIES``.
    """
    global selected_age_category_comparison
    candidate = change['new']
    if candidate in ALL_AGE_CATEGORIES:
        selected_age_category_comparison = candidate
        plot_comparison()
    
def update_year_comparison(change):
    """Observer callback of the year dropdown of the comparison plot.

    ``change['new']`` becomes the selected year and the plot is redrawn,
    but only when it is one of ``ALL_YEARS``.
    """
    global selected_year_comparison
    candidate = change['new']
    if candidate in ALL_YEARS:
        selected_year_comparison = candidate
        plot_comparison()

def update_performance_comparison(change):
    """Observer callback of the performance-criterion dropdown of the comparison plot.

    ``change['new']`` becomes the selected criterion and the plot is
    redrawn, but only when it is one of ``ALL_PERFORMANCE_CRITERIA``.
    """
    global selected_performance_comparison
    candidate = change['new']
    if candidate in ALL_PERFORMANCE_CRITERIA:
        selected_performance_comparison = candidate
        plot_comparison()

# HANDLERS (END)
# ----------------------------------------------------------------------------------------

# Widget for age categories
# names='value' restricts the observer to changes of the selected value. Without it, the handler is
# notified of every trait change of the dropdown, which can redraw the plot several times per selection.
age_categories_comparison_widget = widgets.Dropdown(options=ALL_AGE_CATEGORIES, value=selected_age_category_comparison)
age_categories_comparison_widget.observe(update_age_category_comparison, names='value')

# Widget for years
years_comparison_widget = widgets.Dropdown(options=ALL_YEARS, value=selected_year_comparison)
years_comparison_widget.observe(update_year_comparison, names='value')

# Widget for performance criteria
performance_comparison_widget = widgets.Dropdown(options=ALL_PERFORMANCE_CRITERIA, value=selected_performance_comparison)
performance_comparison_widget.observe(update_performance_comparison, names='value')

Finally, we display the complete graph.

In [None]:
# Show each label followed by its dropdown, then draw the plot for the default selection
display(
    AGE_LABEL, age_categories_comparison_widget,
    YEAR_LABEL, years_comparison_widget,
    PERFORMANCE_LABEL, performance_comparison_widget,
)
plot_comparison()

## Back in time: Evolution of performance of teams

In [None]:
# The result maps keys to figures: each one is plotted in the next cell and the whole mapping is exported at the end
teams_performance_figures = lausanne_1999_2016_utils.generate_teams_evolution_figures(data=lausanne_all_marathons_cleaned_with_datetime)

In [None]:
# Draw every figure of the mapping, in its iteration order
for teams_figure in teams_performance_figures.values():
    plotly.offline.iplot(teams_figure)

## Focus on BIB numbers (for curiosity)

For pure curiosity, we can display the performance according to BIB numbers for all editions of Lausanne Marathon.

*Note: The following figure is not exported to JSON by default and does not appear on the website.*

In [None]:
# The result is kept in bib_distribution but is not added to the data exported to JSON below
bib_distribution = lausanne_1999_2016_utils.generate_all_bib_performance_figure(lausanne_all_marathons_cleaned_with_datetime)

# Save of (dictionaries containing) Plotly figures to JSON

As the comparison plot is generated on-the-fly, we need to build a dictionary containing the figures to export.

In [None]:
# One comparison figure per year, keyed by year.
# Only 'Time' is used as criterion and only 'All' as age category
# (minified JSON of 145 MB instead of 543 MB).
all_comparison_figures = {}
age_category = 'All'
for year in ALL_YEARS:
    data = all_performance_comparison_boxplots[age_category][year]
    # Note: We use silent option to only retrieve each figure.
    # The entry is assigned once: the former empty-dict placeholder was overwritten immediately
    # and would only have survived (as a bogus empty figure) if this call had failed.
    all_comparison_figures[year] = lausanne_1999_2016_utils.plot_performance_comparison(data=data, age_category=age_category, year=year, performance_criterion='Time', silent=True)

We define dictionary containing all data to export.

In [None]:
# Every figure (or dictionary of figures) to export, keyed by the name later passed as file name
data_to_export = dict(
    distribution_runners_over_years=distribution_runners_over_years,
    all_evolution_figures=all_evolution_figures,
    gender_distributions_over_years=gender_distributions_over_years,
    median_age_evolution=median_age_evolution,
    all_performance_figures=all_performance_figures,
    all_evolution_performance_figures=all_evolution_performance_figures,
    all_performance_distribution_figures=all_performance_distribution_figures,
    all_comparison_figures=all_comparison_figures,
    teams_performance_figures=teams_performance_figures,
)

Finally, we export all data.

**IMPORTANT: Generating the JSON of all data takes some time and requires significant resources.<br>By default, the code is commented out and existing files are not overwritten.**

In [None]:
# Export is disabled by default (see the note above); uncomment the following lines to regenerate the JSON files.
# for name, data in data_to_export.items():
#     # Note: We set indent to None in order to minify JSON
#     study_utils.convert_to_JSON(object=data, file_name=name, path='./Data', encoder=plotly.utils.PlotlyJSONEncoder, indent=None)