In [None]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import collections
from matplotlib.pyplot import show

# Our own code: mark it for autoreload at every cell execution (useful in development mode).
%load_ext autoreload
%autoreload 1
%aimport runners_utils

In [None]:
# Relative path to the folder containing the runner CSV files read below.
PATH_TO_DATA = '../../Scraping/DataSport/Data/Runners/'
# TODO: rename these to the global file names (as on the website); first check
# whether the website still works.
RUNNERS_FILE = 'runners_2009.csv'
RUNS_FILE = 'runs_2009.csv'

In [None]:
# Load the two input tables: `runners` and `runs`, one CSV file each.
runners = pd.read_csv(PATH_TO_DATA + RUNNERS_FILE)
runs = pd.read_csv(PATH_TO_DATA + RUNS_FILE)

In [None]:
# Preview the first rows of the runner table.
runners.head()

In [None]:
# Preview the first rows of the run table as loaded from disk.
runs.head()

In [None]:
# Keep an untouched copy of the raw runs: the abandon statistics computed later
# in the notebook need the rows that the preprocessing below may remove.
runs_before_preprocessing = runs.copy()

We remove the useless columns and every entry indicating that the runner dropped out of the race.

In [None]:
# Clean the runner table. The return value is not assigned, so the helper
# presumably modifies `runners` in place -- TODO confirm in runners_utils.
runners_utils.preprocess_runners(runners)

In [None]:
# Replace `runs` with the frame returned by the outlier filter
# (the raw data stays available in `runs_before_preprocessing`).
runs = runners_utils.remove_outliers(runs)

For each race we compute the distance, and we remove all sports other than running to keep the study consistent.

In [None]:
# Compute the distance of every run, row by row, from the run's own fields.
runs['distance (km)'] = runs.apply(
    runners_utils.compute_distance_from_category,
    axis=1,
)
# Keep only the runs for which a (non-null) distance was obtained.
runs = runs[runs['distance (km)'].notnull()]

For each runner, we compute the gender (male/female).

In [None]:
# Apply compute_sex to every run, passing the runner table as extra argument.
# NOTE(review): the output is stored in `result`, not in a column of `runs`,
# and `result` is reassigned later in the notebook -- so unless compute_sex has
# side effects, the computed values are never used. Was an assignment to a
# column of `runs` intended? TODO confirm.
result = runs.apply(runners_utils.compute_sex, args=(runners,), axis=1)

In order to have enough races for each runner, we keep only the runners who have run more than 100 races from 1999 to 2016.

In [None]:
# Restrict `runs` to the runners selected by the helper with a threshold of 100
# runs (the exact comparison, > or >=, is defined in runners_utils).
runs = runners_utils.select_runners_by_numbers_of_runs(runs, 100)

For each runner, we compute their age at the time of the race.

In [None]:
# Row-wise computation of the `age` column; the runner table is passed to the
# helper as an extra argument.
runs['age'] = runs.apply(runners_utils.compute_age, args=(runners,), axis=1)

In [None]:
# Row-wise conversion of each run's result into a duration in seconds.
runs['time (s)'] = runs.apply(
    runners_utils.transform_string_to_second,
    axis=1,
)
# Average speed: distance converted from km to m, divided by the duration.
runs['speed (m/s)'] = runs['distance (km)'].mul(1000).div(runs['time (s)'])

In [None]:
# Row-wise computation of the `eventDate` column for every run.
runs['eventDate'] = runs.apply(runners_utils.compute_date_event, axis=1)

In [None]:
# Drop the columns that are no longer needed. The return value is not
# assigned, so the helper presumably modifies `runs` in place -- TODO confirm.
runners_utils.remove_useless_columns(runs)

After all the preprocessing steps, we obtain 10 runners who have finished more than 100 running races.

In [None]:
# Number of distinct `acode` values (one per runner) left after preprocessing.
len(runs['acode'].unique())

In [None]:
# Preview the preprocessed run table.
runs.head()

## Runner study.

In [None]:
# Extract the races of three individual runners, identified by their `acode`.
# NOTE(review): the meaning of the 59 / 38 / 26 suffixes is not visible here.
races_59, races_38, races_26 = (
    runs[runs['acode'] == runner_code]
    for runner_code in ('FU3KDU5G', 'G756ZTL6', 'FFJBKFSN')
)

In [None]:
# Speed-by-distance overview for the three selected runners.
#
# The original cell contained a bare `fig.tight_layout` (attribute access, never
# called), so no layout adjustment ever happened. Enabling tight layout on the
# figure itself applies it at draw time, i.e. after the helper has added its
# axes. The `rect` keeps the top 5% of the figure free for the suptitle.
fig = plt.figure(figsize=(15, 10), tight_layout={'rect': (0, 0, 1, 0.95)})
fig.suptitle('Runnner speed by distance', fontsize=14)
# [text, two (x, y) pairs] handed to the helper; presumably the annotation
# label, the annotated point and the text position -- TODO confirm.
annotation_plot2 = ['performance \n gap', (9.9, 3), (9, 4)]
runners_utils.presentation_performance_runners(fig, [races_59, races_38, races_26], annotation_plot2)

As the graph above shows, there are enormous disparities in speed between races of the same distance.
Furthermore, another interesting fact is that the speed seems lower for the 31 km race (http://www.sierre-zinal.com) than for marathons.
This study focuses on visualising several factors that could explain these gaps:
 - Age
 - Training (as we don't have any personal information, we use the number of races they run as a reference)
 - Weather / altitude

## Performance study

### Training Part

In [None]:
# Per runner (`acode`), count:
#   - the races kept after preprocessing,
#   - the races present in the raw data,
#   - the raw races whose `resultState` is 'non classé'.
# `groupby(...).size()` already yields one count per runner, so it is turned
# into a one-column frame directly.
group_by_runs = runs.groupby('acode').size().to_frame('number_race')
group_by_runs_before_process = (
    runs_before_preprocessing
    .groupby('acode')
    .size()
    .to_frame('overall number race')
)
group_by_runs_resign = (
    runs_before_preprocessing[runs_before_preprocessing['resultState'] == 'non classé']
    .groupby('acode')
    .size()
    .to_frame('number abandon')
)

# Left joins keep exactly the runners that survived preprocessing; a runner
# with no 'non classé' race gets 0 instead of NaN.
result = (
    group_by_runs
    .join(group_by_runs_before_process)
    .join(group_by_runs_resign, how='left')
    .fillna(0)
)

# Replace the runner codes by anonymous labels "Runner 1", "Runner 2", ...
# (The original loop iterated over an undefined name `test` and wrote into
# `result.index.values` in place; assigning a fresh index is the supported way.)
result.index = ['Runner ' + str(position + 1) for position in range(len(result))]

result

The study covers 10 runners. As we can see, they are not beginners: some of them have more than 500 events over 17 years, which means more than 2 events each month during a period of 17 years... Even the runners with the fewest events have a ratio of 1 race every 2 months, which is huge knowing that, for a beginner, the recommended waiting time before racing again is 3-4 (TODO: the unit is missing here, check the source) in order to recover completely (see http://www.runnersworld.com/ask-coach-jenny/how-many-marathons-can-you-run-in-a-year).

The results found by this study are clearly individual: everybody reacts differently. We should push the study further (by getting additional data) to confirm any hypothesis.

In [None]:
# The raw snapshot was taken before `eventDate` was computed on `runs`, so it
# is computed here as well, row by row.
runs_before_preprocessing['eventDate'] = runs_before_preprocessing.apply(
    runners_utils.compute_date_event,
    axis=1,
)
# Integer year of each event, used as grouping key in the next cell.
runs_before_preprocessing['year'] = runs_before_preprocessing['eventDate'].map(
    lambda event_date: int(event_date.year)
)

In [None]:
# Per (runner, year): number of raw races and number of raw races whose
# `resultState` is 'non classé'.
group_by_runs_before_process = (
    runs_before_preprocessing
    .groupby(['acode', 'year'])
    .size()
    .to_frame('overall number race')
)
group_by_runs_resign = (
    runs_before_preprocessing[runs_before_preprocessing['resultState'] == 'non classé']
    .groupby(['acode', 'year'])
    .size()
    .to_frame('number abandon')
)

# Left merge on the (acode, year) index keeps every runner-year pair; pairs
# without any 'non classé' race get 0. The index is then flattened so that
# `year` is available as a regular column for plotting.
result = (
    pd.merge(group_by_runs_before_process, group_by_runs_resign,
             how='left', left_index=True, right_index=True)
    .fillna(0)
    .reset_index()
)

# The year 2017 contains too small an amount of data.
result = result[result['year'] != 2017]

# Let's see whether the number of abandons is linked to the overall number of
# events: one regression panel per year.
# NOTE(review): seaborn 0.9 renamed `size` to `height`; `size` is kept because
# the rest of the notebook uses the older seaborn API -- revisit on upgrade.
ax = sns.lmplot(x="overall number race", y="number abandon", data=result, col='year', col_wrap=3, size=4)


The result above is based on a total of 61 runners. As we could expect, the number of abandons generally grows proportionally with the number of races done.
An interesting fact is the 

In [None]:
# Joint distribution of races vs. abandons (all years pooled) with a regression
# fit; `runners_utils.r2` is passed as the statistic shown on the plot.
sns.jointplot(
    x="overall number race",
    y="number abandon",
    data=result,
    kind="reg",
    stat_func=runners_utils.r2,
)

The R-squared value is too small to conclude anything about a relationship between the number of races run and the number of abandons.

## TEST

In [None]:
# Scratch analysis of a single runner.
# NOTE(review): `races_59` is rebound here to runner 'FFJBKFSN', which the
# "Runner study" cell stored under `races_26`; `races_38` is not used below.
races_59 = runs.loc[runs['acode'].eq('FFJBKFSN')]  # FU3KDU5G
races_38 = runs.loc[runs['acode'].eq('G756ZTL6')]
# Races of that runner whose computed distance is exactly 42 km.
runs_42 = races_59.loc[races_59['distance (km)'].eq(42)]
# Column sums per age, with ages kept in order of first appearance
# (sort=False) and `age` turned back into a regular column.
group_by_age = races_59.groupby('age', sort=False).sum().reset_index()

# Speed distribution per age over the 42 km races.
sns.boxplot(x="age", y="speed (m/s)", data=runs_42)


In [None]:
# Summed distance per age for the runner selected in the previous cell.
# NOTE(review): the ages are not sorted (sort=False upstream), so the line may
# zig-zag if they do not appear in increasing order -- verify.
group_by_age.plot(x='age', y='distance (km)')