In [None]:
# Imports
import pandas as pd
import numpy as np
import re
import collections
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pyplot import show
import ipywidgets as widgets
from IPython.display import display, clear_output
import plotly
plotly.offline.init_notebook_mode()
import sys
sys.path.append('..')

# our code (marked for autoreload at every cell execution - useful in development mode)
%load_ext autoreload
%autoreload 1
%aimport study_utils
%aimport lausanne_1999_2016_utils

In [None]:
# Directory (relative to this notebook) that is passed to study_utils.get_data to load the results
PATH_TO_DATA = '../../Scraping/DataSport/Data/'

# Data wrangling

First, we retrieve all the data corresponding to Lausanne Marathons, from 1999 to 2016.

In [None]:
# Load the raw results from PATH_TO_DATA through the project helper (study_utils.get_data)
lausanne_all_marathons = study_utils.get_data(PATH_TO_DATA)

In [None]:
# Preview the first rows of the raw data
lausanne_all_marathons.head()

We then apply different computations in order to clean the DataFrame and format its columns for further analysis.

In [None]:
# Build the working DataFrame from the raw one; the raw frame keeps its own name
# because it is needed again later for the outlier statistics.
lausanne_all_marathons_cleaned = study_utils.apply_computations(lausanne_all_marathons)

In [None]:
# Evaluate the project filter once per row, then keep only the rows it selects
is_kept_participant = lausanne_all_marathons_cleaned.apply(study_utils.filter_participants, axis=1)
lausanne_all_marathons_cleaned = lausanne_all_marathons_cleaned[is_kept_participant]

We then remove outliers, compute the overall rank for each year, and drop the `rank` column.

In [None]:
# Delegate outlier removal to the project helper (its criteria are defined in lausanne_1999_2016_utils)
lausanne_all_marathons_cleaned = lausanne_1999_2016_utils.remove_outliers(lausanne_all_marathons_cleaned)

In [None]:
# Compute the overall rank through the project helper; done after outlier removal,
# so presumably ranks only account for the remaining runners - TODO confirm in the helper.
lausanne_all_marathons_cleaned = lausanne_1999_2016_utils.compute_overall_rank(lausanne_all_marathons_cleaned)

In [None]:
# Drop the 'rank' column by reassignment instead of in place: with errors='ignore',
# re-running this cell is a no-op rather than a KeyError once the column is gone.
lausanne_all_marathons_cleaned = lausanne_all_marathons_cleaned.drop('rank', axis=1, errors='ignore')

We finally obtain the final DataFrame, ready to be used for analysis.

In [None]:
# Preview the first rows of the cleaned data
lausanne_all_marathons_cleaned.head()

In [None]:
# One DataFrame per running, selected on the 'distance (km)' column (10, 21 and 42 km)
(lausanne_all_marathons_10km,
 lausanne_all_marathons_21km,
 lausanne_all_marathons_42km) = (
    lausanne_all_marathons_cleaned[lausanne_all_marathons_cleaned['distance (km)'] == distance]
    for distance in (10, 21, 42)
)

## Statistics about outliers

Let's display the number of runners for whom one or more fundamental attributes are missing.

In [None]:
# Attributes passed to the statistics helper (presumably those checked for missing values)
columns_to_check = ['sex', 'time', 'rank']
# As sex is used to find outliers, we add the column into the original DataFrame
# NOTE: this mutates the raw frame; 'sex' is computed row by row with get_sex_of_runner.
lausanne_all_marathons['sex'] = lausanne_all_marathons.apply(study_utils.get_sex_of_runner, axis=1)
study_utils.get_statistics_outliers(lausanne_all_marathons, columns_to_check)

Finally, let's display the results of the cleaning process.

In [None]:
# Compare the size of the raw DataFrame with the size of the cleaned one
all_runners, considered_runners = len(lausanne_all_marathons), len(lausanne_all_marathons_cleaned)
print('Runners in raw DataFrame: ', all_runners, sep='')
print('Runners to be considered for analysis: ', considered_runners, sep='')
print('Unconsidered runners: ', all_runners - considered_runners, sep='')

# Analysis of data

## Evolution of number of participants

In [None]:
# Column order used for the table and the plot
columns = ['Marathon', '10 km', 'Semi-marathon']
# Build the distribution with the filter_by_years helper, then keep the columns in the order above
distribution_of_runners = lausanne_1999_2016_utils.generate_distributions(
    lausanne_all_marathons_10km,
    lausanne_all_marathons_21km,
    lausanne_all_marathons_42km,
    lausanne_1999_2016_utils.filter_by_years
)[columns]

In [None]:
# Preview the first rows of the distribution table
distribution_of_runners.head()

In [None]:
# Plot the distribution table with the project plotting helper
lausanne_1999_2016_utils.plot_distribution_over_years(distribution_of_runners)

The graph makes some changes over the years easy to see. Among other things, we identify the new direction taken by Lausanne Marathon: from 2009 onwards, the number of participants started to increase again. We also notice that the number of marathon runners is decreasing over the years, as RTSSport mentioned <a href="https://www.rts.ch/sport/athletisme/1258113-le-marathon-de-lausanne-change-dorientation.html">in an article dedicated to Lausanne Marathon</a>, while, in the meantime, the number of runners in the 10 km race is increasing (and is now higher than that of the semi-marathon).

## Focus on gender

First, we retrieve gender distribution over the years into a unique DataFrame.

In [None]:
# Same helper as for the yearly distribution, this time with the filter_by_sex_and_years filter
gender_distributions = lausanne_1999_2016_utils.generate_distributions(
    lausanne_all_marathons_10km,
    lausanne_all_marathons_21km,
    lausanne_all_marathons_42km,
    lausanne_1999_2016_utils.filter_by_sex_and_years
)

In [None]:
# Preview the first six rows of the gender distribution table
gender_distributions.head(6)

To plot the results, we separate the data according to the gender of participants.

In [None]:
# Column order used for the per-gender tables
columns = ['10 km', 'Semi-marathon', 'Marathon']

# For each gender: select its rows on the first index level, drop that level,
# and keep the columns in the order above. Insertion order is female, then male.
gender_data = collections.OrderedDict()
for gender in ('female', 'male'):
    runners = gender_distributions[gender_distributions.index.get_level_values(0) == gender]
    runners.index = runners.index.droplevel(0)
    gender_data[gender] = runners[columns]

female_runners = gender_data['female']
male_runners = gender_data['male']

In [None]:
# Plot the per-gender tables (ordered female, male) with the project plotting helper
lausanne_1999_2016_utils.plot_gender_distributions_over_years(gender_data)

What we have seen for Lausanne Marathon 2016 can be generalized here. First, we see that women tend to prefer the 10 km race, while for men, participants are more evenly spread across the different races. Second, we see that globally, for the 10 km race and the marathon, the evolution is similar for the two genders. However, we observe much more variation for men than for women when we consider the semi-marathon, even when taking into account the new direction of Lausanne Marathon established in 2009.

## Evolution of age of participants

In [None]:
# Median-age statistics from the project helper; the cells below read its 'global' and 'detailed' entries
statistics = lausanne_1999_2016_utils.generate_median_age_statistics(lausanne_all_marathons_cleaned)

In [None]:
# Show the global statistics rounded to one decimal, transposed for display
statistics['global'].round(1).T

In [None]:
# Name the first two index levels so that later cells can refer to 'Gender' and 'Year'.
# The names are set on the MultiIndex itself: since pandas 1.0 level names are stored
# separately from the level Index objects, so renaming `index.levels[i]` in place no
# longer changes the names seen by the DataFrame.
statistics['detailed'].index.set_names(['Gender', 'Year'], level=[0, 1], inplace=True)
statistics['detailed'].round(1)

In [None]:
# Swap the two index levels, then move index level 1 into a regular column.
# Given the level names set above, this presumably leaves 'Year' as the index and
# 'Gender' as a column (the OLS cell below filters on a 'Gender' column) - TODO confirm.
median_age_by_gender = statistics['detailed'].swaplevel().reset_index(level=1)
lausanne_1999_2016_utils.plot_median_age_evolution(median_age_by_gender)

What an interesting thing!

We clearly see that over the years, participants of Lausanne Marathon are younger and younger! In 1999, median age was about 54 years for women and 57 years for men, while, in 2016, it was 36 years and 40 years, respectively!

Globally, it seems that the decrease is linear. To verify this assumption, we use the <a href="https://en.wikipedia.org/wiki/Ordinary_least_squares">Ordinary least squares</a> method on the two datasets.

In [None]:
# One frame per gender, with the index turned back into regular columns
gender_column = median_age_by_gender['Gender']
female_data = median_age_by_gender[gender_column == 'female'].reset_index()
male_data = median_age_by_gender[gender_column == 'male'].reset_index()

# 'Year' is converted to a plain integer (going through float first) before the regression
for frame in (female_data, male_data):
    frame['Year'] = frame['Year'].apply(lambda year: int(float(year)))

# Fit one OLS model per gender: 'Median age (all runnings)' explained by 'Year'
gender_data = {'female': female_data, 'male': male_data}
ols_results_gender = study_utils.run_ols_test(gender_data, 'Year', 'Median age (all runnings)')

In [None]:
# Regression summary for female runners
ols_results_gender['female'].summary()

In [None]:
# Regression summary for male runners
ols_results_gender['male'].summary()

In [None]:
# One figure per gender: observed values against the values fitted on 'Year'
for gender in ols_results_gender:
    ols_result = ols_results_gender[gender]
    print('GENDER: ', gender, sep='')
    fig, ax = plt.subplots()
    fig = sm.graphics.plot_fit(ols_result, 'Year', ax=ax)
    plt.show()

From OLS, we find high R-squared values, so linear correlation can be assumed for the two genders. Graphically, we also see that original results are included in the interval defined by error bars associated with fitted values.

What are the equations of linear regressions?

In [None]:
# Print the fitted line for each gender.
for gender, ols_result in ols_results_gender.items():
    # Read the coefficients by position through a plain array: integer-position lookups
    # with [] on a labelled pandas Series are deprecated, and this form also works if
    # `params` is a NumPy array. Position 0 is taken to be the intercept and position 1
    # the 'Year' slope, as in the original cell - TODO confirm against run_ols_test.
    intercept, slope = np.asarray(ols_result.params)[:2]
    print('GENDER: ' + gender)
    print('Y = Â * X + Ê')
    print('Â = ' + str(slope))
    print('Ê = ' + str(intercept) + '\n')

Each year, median age decreased by 1 year approximately.

If this evolution continues, this means that, for 2017, median age of runners would be approximately:

In [None]:
# Year for which the linear trend is extrapolated
PREDICTION_YEAR = 2017

for gender, ols_result in ols_results_gender.items():
    # Coefficients are read by position through a plain array (integer-position lookups
    # with [] on a labelled pandas Series are deprecated). Position 0 is taken to be the
    # intercept and position 1 the 'Year' slope - TODO confirm against run_ols_test.
    intercept, slope = np.asarray(ols_result.params)[:2]
    print('Median age of ' + gender + ' runners: ' + str(slope * PREDICTION_YEAR + intercept))

It is important to note that even if during the past 18 years, median age continued to decrease, this evolution will obviously not continue to be linear in the following years. We expect stabilization, or inverse behavior (linear increase).

## Evolution of performance over the years

### Static visualisation

In [None]:
# Boxplot of 'time' per 'year', split by 'distance (km)'; the y axis is formatted
# with the project's seconds-to-time converter.
study_utils.display_boxplot(
    lausanne_all_marathons_cleaned,
    'year',
    'time',
    'distance (km)',
    title='Evolution of performance over the years for all runnings of Lausanne Marathon',
    y_format=study_utils.convert_seconds_to_time
)

If we consider all the runners, it seems that there are no visible differences over the years (except for outliers).

### Dynamic visualisation

First, we initialize visualisation parameters and we create Plotly figure.

In [None]:
# Choices offered by the two dropdowns
all_age_categories = ['All', '15-25 years', '26-30 years', '31-35 years', '36-40 years', '41-45 years', '46-50 years', '51-55 years', '56-60 years', '61-65 years', '65+ years']
all_performance_criteria = ['Time', 'Speed (m/s)']

# Initial selection (updated afterwards by the dropdown handlers)
selected_age_category = 'All'
selected_performance_criterion = 'Time'

# Copy of the cleaned DataFrame whose 'time' column holds datetimes parsed from
# the '%H:%M:%S' strings produced by convert_seconds_to_time
formatted_times = [study_utils.convert_seconds_to_time(seconds) for seconds in lausanne_all_marathons_cleaned['time']]
lausanne_all_marathons_cleaned_with_datetime = lausanne_all_marathons_cleaned.copy()
lausanne_all_marathons_cleaned_with_datetime['time'] = pd.to_datetime(formatted_times, format='%H:%M:%S')

# Pre-build every figure; the handlers look them up as figures[age category][performance criterion]
all_performance_figures = lausanne_1999_2016_utils.generate_all_performance_figures(
    lausanne_all_marathons_cleaned_with_datetime,
    all_age_categories,
    all_performance_criteria
)

We then create the dropdowns and their respective handlers.

In [None]:
# ----------------------------------------------------------------------------------------
# HANDLERS

def update_plot():
    """Redraw the performance figure for the current dropdown selections.

    Clears the current output, then renders the pre-built Plotly figure stored in
    ``all_performance_figures`` under the selected age category and performance
    criterion (both read from module-level variables).
    """
    clear_output()
    figures_for_age_category = all_performance_figures[selected_age_category]
    plotly.offline.iplot(figures_for_age_category[selected_performance_criterion])

def update_age_category(change):
    """Handle a change notification from the age-category dropdown.

    Stores the newly selected category in the module-level ``selected_age_category``
    and redraws the figure.

    Args:
        change: change dictionary passed by ``observe``; ``change['new']`` is the new
            trait value and ``change['name']`` (when present) the name of the trait.
    """
    global selected_age_category
    # When the handler is registered without `names=`, it is also called for traits other
    # than 'value'. Some of them (such as the selected label) carry a string that is a
    # valid category too, which would redraw the figure twice for a single selection.
    if change.get('name', 'value') != 'value':
        return
    if change['new'] not in all_age_categories:
        return
    selected_age_category = change['new']
    update_plot()
    
def update_performance_criterion(change):
    """Handle a change notification from the performance-criterion dropdown.

    Stores the newly selected criterion in the module-level
    ``selected_performance_criterion`` and redraws the figure.

    Args:
        change: change dictionary passed by ``observe``; ``change['new']`` is the new
            trait value and ``change['name']`` (when present) the name of the trait.
    """
    global selected_performance_criterion
    # When the handler is registered without `names=`, it is also called for traits other
    # than 'value'. Some of them (such as the selected label) carry a string that is a
    # valid criterion too, which would redraw the figure twice for a single selection.
    if change.get('name', 'value') != 'value':
        return
    if change['new'] not in all_performance_criteria:
        return
    selected_performance_criterion = change['new']
    update_plot()

# HANDLERS (END)
# ----------------------------------------------------------------------------------------
    
# Widget for age categories
age_label = widgets.Label(value='Age category')
age_widget = widgets.Dropdown(options=all_age_categories, value=selected_age_category)
# Subscribe to the 'value' trait only, so the handler runs once per selection
# instead of once per trait that changes along with it.
age_widget.observe(update_age_category, names='value')

# Widget for performance criteria
performance_label = widgets.Label(value='Performance')
performance_widget = widgets.Dropdown(options=all_performance_criteria, value=selected_performance_criterion)
# Same as above: only react to changes of the selected value
performance_widget.observe(update_performance_criterion, names='value')

Finally, we display the graph.

In [None]:
# Show the labels and dropdowns in order, followed by the figure for the initial selection
display(age_label, age_widget, performance_label, performance_widget)
initial_figure = all_performance_figures[selected_age_category][selected_performance_criterion]
plotly.offline.iplot(initial_figure)