In [2]:
import pandas as pd
import os
import numpy as np
from IPython.display import display
# pd.set_option('display.max_rows', )

In [3]:
# Directory that holds the input CSV files and receives the parquet outputs
datasets_path = 'dataset/'

# Read the athlete events CSV into the raw dataframe
athlete_events_csv = os.path.join(datasets_path, 'athlete_events.csv')
raw_df = pd.read_csv(athlete_events_csv)

# Print the column dtypes and non-null counts
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271116 entries, 0 to 271115
Data columns (total 15 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   ID      271116 non-null  int64  
 1   Name    271116 non-null  object 
 2   Sex     271116 non-null  object 
 3   Age     261642 non-null  float64
 4   Height  210945 non-null  float64
 5   Weight  208241 non-null  float64
 6   Team    271116 non-null  object 
 7   NOC     271116 non-null  object 
 8   Games   271116 non-null  object 
 9   Year    271116 non-null  int64  
 10  Season  271116 non-null  object 
 11  City    271116 non-null  object 
 12  Sport   271116 non-null  object 
 13  Event   271116 non-null  object 
 14  Medal   39783 non-null   object 
dtypes: float64(3), int64(2), object(10)
memory usage: 31.0+ MB


In [144]:
# Show the first five rows (the default for head) to sanity-check the loaded values
display(raw_df.head())

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [145]:
import re

# Strip the numeric "-1", "-2", ... markers from team and athlete names.
# The pattern is a raw string: in a plain string literal '\d' is an invalid
# escape sequence, which newer Python versions warn about. The capture group
# and case=False were dropped because they have no effect on a pattern made
# only of a hyphen and digits.
# NOTE(review): the pattern is not anchored, so a hyphen followed by digits is
# removed wherever it occurs in the value, not only at the end -- confirm
# that this is intended.
p = r'-\d+'
raw_df.Team = raw_df.Team.str.replace(p, '', regex=True)
raw_df.Name = raw_df.Name.str.replace(p, '', regex=True)

In [146]:
# Convert each column to a compact, explicit dtype.

# Categoricals with a fixed, explicit set of levels
sex_dtype = pd.CategoricalDtype(categories=['M', 'F'])
medal_dtype = pd.CategoricalDtype(categories=['Gold', 'Silver', 'Bronze'], ordered=True)
raw_df['Sex'] = raw_df['Sex'].astype(sex_dtype)
raw_df['Medal'] = raw_df['Medal'].astype(medal_dtype)

# Numeric columns: Age and Height use pandas nullable integer dtypes,
# Height is floored to a whole number before the cast
raw_df['Age'] = raw_df['Age'].astype('Int8')
height_numeric = pd.to_numeric(raw_df['Height'], errors='coerce')
raw_df['Height'] = np.floor(height_numeric).astype('Int16')
raw_df['Weight'] = raw_df['Weight'].astype('float32')
raw_df['Year'] = raw_df['Year'].astype('uint16')

# Remaining text columns become categoricals with levels inferred from the data
for column in ['NOC', 'Games', 'Season', 'City', 'Sport', 'Event', 'Team']:
    raw_df[column] = raw_df[column].astype('category')

In [147]:
# Columns that identify one athlete-event entry. Age, Height, Weight and NOC
# are not part of the key, so rows that differ only in those columns are
# treated as duplicates of each other.
dedup_keys = ['ID', 'Name', 'Sex', 'Team', 'Games', 'Year', 'Season', 'City',
              'Sport', 'Event', 'Medal']

# Flag every member of each duplicate group (keep=False) so they can be inspected
dups = raw_df.duplicated(subset=dedup_keys, keep=False)
dups_df = raw_df[dups]
display(dups_df)

# Keep the first row of each duplicate group. Using the same key list as the
# inspection step guarantees the rows displayed are the ones being collapsed.
raw_df = raw_df.drop_duplicates(subset=dedup_keys)

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
1251,704,Dsir Antoine Acket,M,27,,,Belgium,BEL,1932 Summer,1932,Summer,Los Angeles,Art Competitions,"Art Competitions Mixed Painting, Unknown Event",
1252,704,Dsir Antoine Acket,M,27,,,Belgium,BEL,1932 Summer,1932,Summer,Los Angeles,Art Competitions,"Art Competitions Mixed Painting, Unknown Event",
4281,2449,William Truman Aldrich,M,48,,,United States,USA,1928 Summer,1928,Summer,Amsterdam,Art Competitions,"Art Competitions Mixed Painting, Drawings And ...",
4282,2449,William Truman Aldrich,M,48,,,United States,USA,1928 Summer,1928,Summer,Amsterdam,Art Competitions,"Art Competitions Mixed Painting, Drawings And ...",
4283,2449,William Truman Aldrich,M,48,,,United States,USA,1928 Summer,1928,Summer,Amsterdam,Art Competitions,"Art Competitions Mixed Painting, Drawings And ...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269997,135072,Anna Katrina Zinkeisen (-Heseltine),F,46,,,Great Britain,GBR,1948 Summer,1948,Summer,London,Art Competitions,"Art Competitions Mixed Painting, Unknown Event",
269998,135073,Doris Clare Zinkeisen (-Johnstone),F,49,,,Great Britain,GBR,1948 Summer,1948,Summer,London,Art Competitions,"Art Competitions Mixed Painting, Unknown Event",
269999,135073,Doris Clare Zinkeisen (-Johnstone),F,49,,,Great Britain,GBR,1948 Summer,1948,Summer,London,Art Competitions,"Art Competitions Mixed Painting, Unknown Event",
270199,135173,Henri Achille Zo,M,58,,,France,FRA,1932 Summer,1932,Summer,Los Angeles,Art Competitions,"Art Competitions Mixed Painting, Unknown Event",


In [148]:
# Preview the cleaned frame and print its dtypes / non-null counts
display(raw_df.head())
raw_df.info()

# Persist the cleaned athletes table as the silver parquet file
silver_path = os.path.join(datasets_path, 'silver_df.parquet')
raw_df.to_parquet(silver_path)



Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


<class 'pandas.core.frame.DataFrame'>
Int64Index: 269728 entries, 0 to 271115
Data columns (total 15 columns):
 #   Column  Non-Null Count   Dtype   
---  ------  --------------   -----   
 0   ID      269728 non-null  int64   
 1   Name    269728 non-null  object  
 2   Sex     269728 non-null  category
 3   Age     260414 non-null  Int8    
 4   Height  210917 non-null  Int16   
 5   Weight  208204 non-null  float32 
 6   Team    269728 non-null  category
 7   NOC     269728 non-null  category
 8   Games   269728 non-null  category
 9   Year    269728 non-null  uint16  
 10  Season  269728 non-null  category
 11  City    269728 non-null  category
 12  Sport   269728 non-null  category
 13  Event   269728 non-null  category
 14  Medal   39772 non-null   category
dtypes: Int16(1), Int8(1), category(9), float32(1), int64(1), object(1), uint16(1)
memory usage: 12.2+ MB


In [4]:
# Load the NOC regions CSV
noc_regions_csv = os.path.join(datasets_path, 'noc_regions.csv')
noc_df = pd.read_csv(noc_regions_csv)

# Print dtypes / non-null counts, then show the first ten rows as the cell output
noc_df.info()
noc_df.head(n=10)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230 entries, 0 to 229
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   NOC     230 non-null    object
 1   region  228 non-null    object
 2   notes   21 non-null     object
dtypes: object(3)
memory usage: 5.5+ KB


Unnamed: 0,NOC,region,notes
0,AFG,Afghanistan,
1,AHO,Curacao,Netherlands Antilles
2,ALB,Albania,
3,ALG,Algeria,
4,AND,Andorra,
5,ANG,Angola,
6,ANT,Antigua,Antigua and Barbuda
7,ANZ,Australia,Australasia
8,ARG,Argentina,
9,ARM,Armenia,


In [5]:
# Flag every row whose (NOC, region) pair also appears in another row, then
# print the distinct flag values: a lone False means no duplicates were found
dups = noc_df[['NOC', 'region']].duplicated(keep=False)
print(dups.unique())

[False]


In [6]:
# Write the NOC regions table to its silver parquet file
noc_silver_path = os.path.join(datasets_path, 'silver_noc.parquet')
noc_df.to_parquet(noc_silver_path)
