# Assumptions (potential problems)
 - ath_name == '' means the race was cancelled (checked for a few events)
   - correct, afaik
 - ['season', 'venue', 'event', 'date', 'gender'] uniquely identify a race
     - we do not get the same number of races as https://en.wikipedia.org/wiki/FIS_Alpine_Ski_World_Cup
 - ['season', 'event', 'date', 'gender', 'ath_name'] is a unique index for the data frames

In [None]:
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd
import seaborn as sns

In [None]:
def parseCSV(isMale=True):
    """Load one World Cup results file into a DataFrame.

    Reads ``../../data/wcm.csv`` when ``isMale`` is true and
    ``../../data/wcf.csv`` otherwise, blanks out missing values, parses the
    ``date`` column and tags every row with the gender code.

    Args:
        isMale: Select the men's file (``True``) or the women's file
            (``False``).

    Returns:
        The loaded DataFrame with NaN replaced by ``''``, ``date`` converted
        to datetimes and an added ``gender`` column (``'m'`` or ``'f'``).
    """
    if isMale:
        gender = 'm'
    else:
        gender = 'f'

    results = pd.read_csv(f'../../data/wc{gender}.csv')
    # Missing cells become empty strings so that later cells can compare
    # against '' instead of testing for NaN.
    results = results.replace(np.nan, '', regex=True)
    results['date'] = pd.to_datetime(results['date'], format='%Y-%m-%d')
    results['gender'] = gender
    return results

In [None]:
def remove_cancelled_race(df):
    """Return the rows of ``df`` whose ``ath_name`` is not the empty string.

    The notebook treats an empty athlete name as the marker of a cancelled
    race, so those rows are filtered out.
    """
    has_athlete = df['ath_name'] != ''
    return df.loc[has_athlete]

In [None]:
def number_season(df):
    """Return how many distinct values the ``season`` column holds."""
    seasons = df['season']
    return seasons.nunique(dropna=False)

def number_races(df):
    """Return the number of distinct races in ``df``.

    A race is identified by the combination of ``season``, ``venue``,
    ``event``, ``date`` and ``gender``.
    """
    race_key = ['season', 'venue', 'event', 'date', 'gender']
    distinct_races = df.drop_duplicates(subset=race_key)
    return distinct_races.shape[0]

def number_athletes(df):
    """Return how many distinct values the ``ath_name`` column holds."""
    names = df['ath_name']
    return names.nunique(dropna=False)

def number_country(df):
    """Return how many distinct values the ``ath_country`` column holds."""
    countries = df['ath_country']
    return countries.nunique(dropna=False)

def number_races_events(df):
    """Count the distinct races of each event type.

    Rows are first reduced to one per race (``season``, ``venue``, ``event``,
    ``date``, ``gender``); the races are then grouped by ``event``.

    Returns:
        A DataFrame indexed by ``event`` whose single ``venue`` column holds
        the number of races for that event.
    """
    race_key = ['season', 'venue', 'event', 'date', 'gender']
    races = df.drop_duplicates(subset=race_key)
    per_event = races[['venue', 'event']].groupby('event')
    return per_event.count()

def number_races_years(df):
    """Count the distinct races held in each season.

    Rows are first reduced to one per race (``season``, ``venue``, ``event``,
    ``date``, ``gender``); the races are then grouped by ``season``.

    Returns:
        A DataFrame indexed by ``season`` whose single ``event`` column holds
        the number of races in that season.
    """
    race_key = ['season', 'venue', 'event', 'date', 'gender']
    races = df.drop_duplicates(subset=race_key)
    per_season = races[['event', 'season']].groupby('season')
    return per_season.count()
    

In [None]:
def show_basic_stats(df):
    """Print the headline counts for ``df`` followed by the per-event table.

    Prints the number of seasons, races, athletes and countries (one per
    line), then the DataFrame returned by ``number_races_events``.
    """
    print(f'# seasons  : {number_season(df)}')
    print(f'# races    : {number_races(df)}')
    print(f'# athletes : {number_athletes(df)}')
    print(f'# country  : {number_country(df)}')

    races_by_event = number_races_events(df)
    print(races_by_event)

In [None]:
# Load the men's and women's results and drop the rows that mark cancelled
# races (empty athlete name).
dfm = remove_cancelled_race(parseCSV(isMale=True))
dff = remove_cancelled_race(parseCSV(isMale=False))

# Stack both circuits for the combined statistics and the plots below.
both = pd.concat([dfm, dff])

print('-----  World Cup Men -----')
show_basic_stats(dfm)

print('-----  World Cup Women -----')
show_basic_stats(dff)

# This third table covers the combined data, so it gets its own banner
# rather than repeating the women's one.
print('-----  World Cup Men & Women -----')
show_basic_stats(both)

In [None]:
# Bar chart of the number of distinct races per season (men and women
# combined), saved next to the notebook as races_per_year.png.
fig, axes = plt.subplots(figsize=(15, 5))
axes = number_races_years(both).plot.bar(
    ax=axes,
    title='Number of races per season',
    color=(0, 134/255.0, 1),
)
axes.set(xlabel="Season", ylabel="Number of races")
fig.savefig('races_per_year.png')

In [None]:
# Build a table with one row per decade and one column per event type,
# holding the number of distinct races (men and women combined).
event_columns = ['Downhill', 'Super G', 'Giant Slalom', 'Slalom', 'Combined', 'Parallel']
decades = ["70's", "80's", "90's", "00's", "10's"]

per_decade = []
for i, decade in enumerate(decades):
    # Seasons in the half-open range [start, end): 1970-1979, 1980-1989, ...
    start = 1970 + 10 * i
    end = start + 10
    e = number_races_events(both[(both['season'] >= start) & (both['season'] < end)])
    # Turn the per-event counts into a single row with one column per event.
    e = e.transpose().reset_index(drop=True)
    e['decade'] = decade
    per_decade.append(e)

# Concatenate once, after the loop: this avoids concatenating onto an empty
# frame (deprecated dtype handling in pandas) and repeated copying.
events = pd.concat(per_decade).set_index('decade')

# Keep the fixed event order first (all-NaN column if an event never
# occurs), followed by any event names found in the data but not listed.
extra_columns = [c for c in events.columns if c not in event_columns]
events = events.reindex(columns=event_columns + extra_columns)
events = events.rename_axis(columns=None)

In [None]:
# Display the per-decade event counts built in the previous cell.
events

In [None]:
# Grouped bar chart of the per-decade event counts, saved next to the
# notebook as events_per_decades.png.
fig, axes = plt.subplots(figsize=(15, 5))
axes = events.plot.bar(
    ax=axes,
    title='Number of events per decade',
    rot=0,
)
# Move the legend just outside the right edge so it does not cover the bars.
axes.legend(loc="center right", bbox_to_anchor=(1.13, 0.5))
axes.set(xlabel="Decades", ylabel="Number of events")

fig.savefig('events_per_decades.png')

In [None]:
# Rows whose first-run time differs from 0, ordered by season.
# NOTE(review): parseCSV replaces NaN with '' on load, so rows with a missing
# first-run time also satisfy `!= 0` — confirm this filter does what is intended.
both.loc[both['ath_time_run_1'].ne(0)].sort_values(by='season')

## Corrections

Attention: after parsing, WCM has 2 Kitzbühel Downhill races on 1995-01-14. One of them actually took place on 1995-01-13. Corrected manually.

In [None]:
# Index the men's results by race plus athlete and check that no key repeats.
dfm_indexed = dfm.set_index(
    keys=['season', 'date', 'venue', 'country', 'event', 'ath_name'],
)
dfm_indexed.index.is_unique

In [None]:
# Index the women's results by race plus athlete and check that no key repeats.
dff_indexed = dff.set_index(
    keys=['season', 'date', 'venue', 'country', 'event', 'ath_name'],
)
dff_indexed.index.is_unique