In [1]:
import pandas as pd
import os
dataset_path = 'dataset'


In [2]:
silver_df = pd.read_parquet(os.path.join(dataset_path, 'silver_df.parquet'))
silver_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 269728 entries, 0 to 271115
Data columns (total 15 columns):
 #   Column  Non-Null Count   Dtype   
---  ------  --------------   -----   
 0   ID      269728 non-null  int64   
 1   Name    269728 non-null  object  
 2   Sex     269728 non-null  category
 3   Age     260414 non-null  Int8    
 4   Height  210917 non-null  Int16   
 5   Weight  208204 non-null  float32 
 6   Team    269728 non-null  category
 7   NOC     269728 non-null  category
 8   Games   269728 non-null  category
 9   Year    269728 non-null  uint16  
 10  Season  269728 non-null  category
 11  City    269728 non-null  category
 12  Sport   269728 non-null  category
 13  Event   269728 non-null  category
 14  Medal   39772 non-null   category
dtypes: Int16(1), Int8(1), category(9), float32(1), int64(1), object(1), uint16(1)
memory usage: 12.2+ MB


In [3]:
# Missing weight and height feature based on year

from IPython.display import display
from pandasql import sqldf
import numpy as np
pysqldf = lambda q: sqldf(q, globals())

subquery_missing_height = \
                ' (SELECT Year, COUNT(*) AS missing_height' \
                ' FROM silver_df' \
                ' WHERE Height IS NULL' \
                ' GROUP BY Year' \
                ' ORDER BY Year ASC) missingH'

subquery_missing_weight = \
                ' (SELECT Year, COUNT(*) AS missing_weight' \
                ' FROM silver_df' \
                ' WHERE Weight IS NULL' \
                ' GROUP BY Year' \
                ' ORDER BY Year ASC) missingW'

query_missing = f'SELECT silver_df.Year, COUNT(1) AS total_entries, missing_height, missing_weight' \
               f' FROM silver_df' \
               f' LEFT JOIN ' \
               f'{subquery_missing_height}' \
               f' ON silver_df.Year = missingH.Year' \
               f' LEFT JOIN ' \
               f'{subquery_missing_weight}' \
               f' ON silver_df.Year = missingW.Year' \
               f' GROUP BY silver_df.Year' \
               f' ORDER BY silver_df.Year DESC'
gold_missing_data_df = pysqldf(query_missing)

gold_missing_data_df = gold_missing_data_df.assign(
    missing_weight_perc = 100 * np.divide(gold_missing_data_df.missing_weight, gold_missing_data_df.total_entries))
gold_missing_data_df = gold_missing_data_df.assign(
    missing_height_perc = 100 * np.divide(gold_missing_data_df.missing_height, gold_missing_data_df.total_entries))

display(gold_missing_data_df)

Unnamed: 0,Year,total_entries,missing_height,missing_weight,missing_weight_perc,missing_height_perc
0,2016,13688,176,223,1.629164,1.285798
1,2014,4891,20,218,4.457166,0.408914
2,2012,12920,168,360,2.786378,1.30031
3,2010,4402,2,24,0.545207,0.045434
4,2008,13602,151,159,1.168946,1.110131
5,2006,4382,6,16,0.36513,0.136924
6,2004,13443,36,37,0.275236,0.267797
7,2002,4109,29,47,1.143831,0.705768
8,2000,13821,123,126,0.911656,0.88995
9,1998,3605,84,86,2.385576,2.330097


In [8]:
import plotly.express as px


import plotly.graph_objects as go
data = [
    go.Scatter(x=gold_missing_data_df.Year, y=gold_missing_data_df.missing_weight_perc, name='weight'),
    go.Scatter(x=gold_missing_data_df.Year, y=gold_missing_data_df.missing_height_perc, name='height')
    ]

fig = go.Figure(data=data)
fig.update_layout(title='Missing height and weight data based on the Olympic year',
                   xaxis_title='Year',
                   yaxis_title='Missing data (%)')
fig.show()

In [26]:
# Drop values before 1960
silver_df = silver_df.loc[silver_df.Year >= 1960]

In [27]:
# Characterize men and women heights
import plotly.express as px

gold_height_df = silver_df.dropna(subset=['Height'])
fig = px.histogram(x=gold_height_df.Height, color=gold_height_df.Sex, nbins=30, barmode='overlay', histnorm='probability density')
fig.show()



In [28]:
print('Women:')
gold_height_df['Height'].loc[gold_height_df.Sex=='F'].describe()

Women:


count    65925.000000
mean       167.899279
std          8.805629
min        127.000000
25%        162.000000
50%        168.000000
75%        173.000000
max        213.000000
Name: Height, dtype: float64

In [29]:
print('Men:')
gold_height_df['Height'].loc[gold_height_df.Sex=='M'].describe()


Men:


count    132028.000000
mean        179.078052
std           9.429131
min         127.000000
25%         173.000000
50%         179.000000
75%         185.000000
max         226.000000
Name: Height, dtype: float64

In [36]:
subquery_count_winning = '( SELECT NOC, Sport, Event, 1 as medals' \
                ' FROM silver_df' \
                ' WHERE Medal IS NOT NULL' \
                ' GROUP BY NOC, Sport, Event) winning'

winning_teams = f' SELECT silver_df.NOC, COUNT(1) as total_competing, COUNT(medals) as medals_no' \
                f' FROM silver_df' \
                f' LEFT JOIN' \
                f' {subquery_count_winning}' \
                f' ON silver_df.NOC = winning.NOC' \
                f' GROUP BY silver_df.NOC' \
                f' ORDER BY medals DESC' \
                f' LIMIT 15'


gold_winning_teams_df = pysqldf(winning_teams)
# gold_winning_teams_df
display(gold_winning_teams_df)



Unnamed: 0,NOC,total_competing,medals_no
0,ZIM,1545,1545
1,ZAM,366,366
2,YUG,75825,75825
3,WIF,40,40
4,VIE,616,616
5,VEN,10673,10673
6,UZB,11880,11880
7,USA,4124010,4124010
8,URU,646,646
9,URS,1176016,1176016
