_______________
# 06. Data Analysis & Visualisation (Kaggle Example)
_______________

The following commands can be used (with minor adjustments, of course) to download and unzip a Kaggle dataset without leaving the Jupyter interpreter. However, you would first have to install the Kaggle-specific library within your conda environment to do so. In the present case, this has been simplified for you — the data has already been downloaded and stored within the `data` directory:

    data/athlete_events.csv
    data/noc_regions.csv


In [None]:
# !kaggle datasets download -d heesoo37/120-years-of-olympic-history-athletes-and-results

In [None]:
# !unzip -qq 120-years-of-olympic-history-athletes-and-results.zip -d ./data

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
os.listdir('data')

# Importing data

In [None]:
data = pd.read_csv('./data/athlete_events.csv')

In [None]:
regions = pd.read_csv('./data/noc_regions.csv')

# Main information about the data

In [None]:
data.head()

In [None]:
data.tail()

In [None]:
data.head(3).T

In [None]:
data.describe()

In [None]:
data.info()

In [None]:
# Attach the region columns to every athlete row via the shared 'NOC' key;
# the left join keeps athlete rows even when no region entry matches.
df = data.merge(regions, how='left', on='NOC')

In [None]:
df.head().T

In [None]:
df.info()

# Distribution of the age of gold medalists

In [None]:
df.Medal == 'Gold'

In [None]:
# Two interchangeable ways to select the gold-medal rows with the same
# boolean mask: plain [] indexing and .loc. The second assignment simply
# overwrites the first with the same selection; it is shown for comparison.
gold_medals = df[df.Medal == 'Gold']
gold_medals = df.loc[df.Medal == 'Gold']

In [None]:
gold_medals.head()

In [None]:
gold_medals.isnull().sum()

In [None]:
# Keep only the gold-medal rows whose Age is a finite number,
# which discards the rows where the age is missing (NaN).
gold_medals = gold_medals.loc[np.isfinite(gold_medals['Age'])]

In [None]:
# Count how many gold medals were won at each athlete age.
plt.figure(figsize=(20, 10))
# Name the column through `x=`/`data=`: seaborn >= 0.12 only accepts `data`
# positionally, so a bare positional Series is no longer interpreted as `x`.
sns.countplot(x='Age', data=gold_medals)
plt.title('Distribution of Gold Medals')

In [None]:
# Number of gold-medal entries won by athletes older than 50
# (row and column selected in a single .loc call).
gold_medals.loc[gold_medals['Age'] > 50, 'ID'].count()

In [None]:
gold_medals[gold_medals['Age'] < 15].Sport.value_counts()

In [None]:
# Show which sports produced gold medalists older than 50.
plt.figure(figsize=(15, 5))
# Pass the Series as `x=`: seaborn >= 0.12 only accepts `data` positionally,
# so a bare positional Series is no longer interpreted as `x`.
sns.countplot(x=gold_medals.loc[gold_medals['Age'] > 50, 'Sport'])
plt.title('Gold medals for over 50')
# tight_layout adjusts the artists that already exist on the figure,
# so it has to run after the plot and title have been created.
plt.tight_layout()

In [None]:
# Rows for female athletes, restricted to the Summer Games.
women_in_olympics = df.loc[(df['Sex'] == 'F') & (df['Season'] == 'Summer')]

In [None]:
women_in_olympics.sample(4).T

In [None]:
plt.figure(figsize=(15, 5))
sns.countplot(x='Year', data=women_in_olympics)
plt.title('Women participations per year')

In [None]:
women_in_olympics.loc[women_in_olympics['Year'] == 1900]

# Medals per country

In [None]:
# Top 10 regions by number of gold medals won after 1990, as a two-column
# frame: 'index' (region name) and 'Medals' (count).
# The axis name is pinned to 'index' before reset_index because pandas >= 2.0
# names the value_counts index after the counted column ('region'), whereas
# older versions leave it unnamed and reset_index then falls back to 'index'.
total_recent_gold_medals = (
    gold_medals[gold_medals.Year > 1990]
    .region
    .value_counts()
    .rename_axis('index')
    .reset_index(name="Medals")
    .head(10)
)

In [None]:
# Bar chart of the regions held in total_recent_gold_medals:
# one bar per 'index' label, bar height taken from the 'Medals' column.
g = sns.catplot(
    data=total_recent_gold_medals,
    x='index',
    y='Medals',
    kind='bar',
    palette='muted',
    aspect=3,
)
g.despine(left=True)
# Set both axis labels in one call.
g.set_axis_labels('Top 10 countries after 1990', 'Number of medals')
plt.title('Medals per country')