In [None]:
# Imports
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import collections
from matplotlib.pyplot import show
import sys
sys.path.append('..')

# Our code (marked for autoreload at every cell execution - useful in development mode)
%load_ext autoreload
%autoreload 1
%aimport study_utils
%aimport lausanne_1999_2016_utils

In [None]:
# Directory handed to study_utils.get_data in the data-wrangling section.
# NOTE(review): the path is relative, so it presumably resolves against the
# notebook's working directory - confirm if the notebook is moved.
PATH_TO_DATA = '../../Scraping/DataSport/Data/'

# Data wrangling

First, we retrieve all the data corresponding to Lausanne Marathons, from 1999 to 2016.

In [None]:
# Load the raw results through the project helper, passing it the data directory.
# This raw DataFrame is kept as-is so it can be compared with the cleaned one later.
lausanne_all_marathons = study_utils.get_data(PATH_TO_DATA)

In [None]:
# Preview only the first rows of the raw DataFrame: rendering the whole frame
# bloats the saved notebook without adding information, and the other previews
# in this notebook already rely on head().
lausanne_all_marathons.head()

We then apply different computations in order to clean the DataFrame and format its columns for further analysis.

In [None]:
# Cleaning and column formatting are delegated to study_utils.apply_computations.
# NOTE(review): whether the helper copies or mutates the raw DataFrame is not
# visible from this notebook - confirm, since the raw frame is reused below.
lausanne_all_marathons_cleaned = study_utils.apply_computations(lausanne_all_marathons)

In [None]:
# Evaluate the participant filter once per row (axis=1), then keep only the
# rows for which study_utils.filter_participants returned a truthy value.
participants_to_keep = lausanne_all_marathons_cleaned.apply(
    study_utils.filter_participants,
    axis=1,
)
lausanne_all_marathons_cleaned = lausanne_all_marathons_cleaned[participants_to_keep]

We also remove outliers and compute the overall rank for each year; the original `rank` column is then dropped.

In [None]:
# Discard outliers; the criteria are defined in lausanne_1999_2016_utils.remove_outliers.
lausanne_all_marathons_cleaned = lausanne_1999_2016_utils.remove_outliers(lausanne_all_marathons_cleaned)

In [None]:
# Compute the overall rank for each year (see compute_overall_rank for the exact definition).
lausanne_all_marathons_cleaned = lausanne_1999_2016_utils.compute_overall_rank(lausanne_all_marathons_cleaned)

In [None]:
# Drop the 'rank' column from the cleaned DataFrame.
# The result is rebound instead of using inplace=True, and errors='ignore' is
# passed, so this cell can be executed again without raising a KeyError once
# the column has already been removed.
lausanne_all_marathons_cleaned = lausanne_all_marathons_cleaned.drop('rank', axis=1, errors='ignore')

We finally obtain the cleaned DataFrame, ready to be used for analysis.

In [None]:
# Preview the first rows of the cleaned DataFrame.
lausanne_all_marathons_cleaned.head()

In [None]:
# One subset of the cleaned DataFrame per race distance (10, 21 and 42 in the
# 'distance (km)' column), unpacked in that order.
(
    lausanne_all_marathons_10km,
    lausanne_all_marathons_21km,
    lausanne_all_marathons_42km,
) = (
    lausanne_all_marathons_cleaned[lausanne_all_marathons_cleaned['distance (km)'] == distance]
    for distance in (10, 21, 42)
)

## Statistics about outliers

Let's display the number of runners for whom one or more fundamental attributes are missing.

In [None]:
# Attributes that are checked by get_statistics_outliers on the raw DataFrame.
columns_to_check = ['sex', 'time', 'rank']
# As sex is used to find outliers, we add the column into the original DataFrame.
# This adds (or overwrites) the 'sex' column on the raw DataFrame in place, one row at a time.
lausanne_all_marathons['sex'] = lausanne_all_marathons.apply(study_utils.get_sex_of_runner, axis=1)
# Last expression of the cell: whatever the helper returns is what the notebook displays.
study_utils.get_statistics_outliers(lausanne_all_marathons, columns_to_check)

Finally, let's display the results of the cleaning process.

In [None]:
# Row counts before and after cleaning.
all_runners = len(lausanne_all_marathons)
considered_runners = len(lausanne_all_marathons_cleaned)

# (label, value) pairs, printed one per line in this order.
cleaning_summary = (
    ('Runners in raw DataFrame: ', all_runners),
    ('Runners to be considered for analysis: ', considered_runners),
    ('Unconsidered runners: ', all_runners - considered_runners),
)
print('\n'.join(label + str(value) for label, value in cleaning_summary))

# Analysis of data

## Focus on gender

First, we gather the gender distribution over the years into a single DataFrame.

In [None]:
# Build a single DataFrame of gender distributions over the years from the
# 10 km, 21 km and 42 km subsets (passed in that order).
gender_distributions = lausanne_1999_2016_utils.generate_gender_distributions_over_years(lausanne_all_marathons_10km, lausanne_all_marathons_21km, lausanne_all_marathons_42km)

In [None]:
# Preview the first six rows of the gender distributions.
gender_distributions.head(6)

To plot the results, we separate the data according to the gender of the participants.

In [None]:
def _select_gender(distributions, gender):
    """Return the rows of `distributions` whose first index level equals `gender`.

    The first index level is removed from the returned DataFrame's index, so
    the result is indexed by the remaining level(s) only.
    """
    first_level = distributions.index.get_level_values(0)
    selection = distributions[first_level == gender]
    selection.index = selection.index.droplevel(0)
    return selection


female_runners = _select_gender(gender_distributions, 'female')
male_runners = _select_gender(gender_distributions, 'male')

In [None]:
# Plot both series; the helper receives them as a list, female first, then male.
lausanne_1999_2016_utils.plot_gender_distributions_over_years([female_runners, male_runners])

The graph makes some changes over the years easy to see. Among other things, we can identify the new direction taken by the Lausanne Marathon: as of 2009, the number of participants started to increase again. We also notice that the number of runners is decreasing over the years, as RTSSport mentioned <a href="https://www.rts.ch/sport/athletisme/1258113-le-marathon-de-lausanne-change-dorientation.html">in an article dedicated to the Lausanne Marathon</a>.