In [146]:
import pandas as pd
import os
import numpy as np
from IPython.display import display
import pycountry
# pd.set_option('display.max_rows', )

In [133]:
datasets_path = 'dataset/'

# Load the csv dataset
raw_df = pd.read_csv(os.path.join(datasets_path, 'athlete_events.csv'))

# Visualize df info
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271116 entries, 0 to 271115
Data columns (total 15 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   ID      271116 non-null  int64  
 1   Name    271116 non-null  object 
 2   Sex     271116 non-null  object 
 3   Age     261642 non-null  float64
 4   Height  210945 non-null  float64
 5   Weight  208241 non-null  float64
 6   Team    271116 non-null  object 
 7   NOC     271116 non-null  object 
 8   Games   271116 non-null  object 
 9   Year    271116 non-null  int64  
 10  Season  271116 non-null  object 
 11  City    271116 non-null  object 
 12  Sport   271116 non-null  object 
 13  Event   271116 non-null  object 
 14  Medal   39783 non-null   object 
dtypes: float64(3), int64(2), object(10)
memory usage: 31.0+ MB


In [134]:
# First 5 rows to make sure of data types
display(raw_df.head(5))

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [135]:
import re

# Find the -1 -2 suffixes from names and teams and take them off
p = '(-\d+)'
raw_df.Team = raw_df.Team.str.replace(p, '', case=False, regex=True)

raw_df.Name = raw_df.Name.str.replace(p, '', case=False, regex=True)

In [136]:
# Cast to appropriate data types

# Sex
raw_df.Sex = raw_df.Sex.astype(pd.CategoricalDtype(categories=['M', 'F']))
raw_df.Age = raw_df.Age.astype('Int8')
raw_df.Height = np.floor(pd.to_numeric(raw_df.Height, errors='coerce')).astype('Int16')
raw_df.Weight = raw_df.Weight.astype('float32')
raw_df.NOC = raw_df.NOC.astype('category')
raw_df.Games = raw_df.Games.astype('category')
raw_df.Year = raw_df.Year.astype('uint16')
raw_df.Season = raw_df.Season.astype('category')
raw_df.City = raw_df.City.astype('category')
raw_df.Sport = raw_df.Sport.astype('category')
raw_df.Event = raw_df.Event.astype('category')
raw_df.Medal = raw_df.Medal.astype(pd.CategoricalDtype(categories=['Gold', 'Silver', 'Bronze'], ordered=True))
raw_df.Team = raw_df.Team.astype('category')

In [137]:
# Find duplicates
dups = raw_df.duplicated(subset=['ID', 'Name', 'Sex', 'Team', 'Games', 'Year', 'Season', 'City', 'Sport', 'Event', 'Medal'], keep=False)
dups_df = raw_df[dups]
display(dups_df)

raw_df = raw_df.drop_duplicates(subset=['ID', 'Name', 'Sex', 'Team', 'Games',
                                        'Year', 'Season', 'City', 'Sport',
                                        'Event', 'Medal'])

raw_df['Medal'] = raw_df['Medal'].astype(pd.CategoricalDtype(categories=['Gold', 'Silver', 'Bronze', 'NoMedal'], ordered=True))
raw_df['Medal'] = raw_df['Medal'].fillna(value='NoMedal')


Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
1251,704,Dsir Antoine Acket,M,27,,,Belgium,BEL,1932 Summer,1932,Summer,Los Angeles,Art Competitions,"Art Competitions Mixed Painting, Unknown Event",
1252,704,Dsir Antoine Acket,M,27,,,Belgium,BEL,1932 Summer,1932,Summer,Los Angeles,Art Competitions,"Art Competitions Mixed Painting, Unknown Event",
4281,2449,William Truman Aldrich,M,48,,,United States,USA,1928 Summer,1928,Summer,Amsterdam,Art Competitions,"Art Competitions Mixed Painting, Drawings And ...",
4282,2449,William Truman Aldrich,M,48,,,United States,USA,1928 Summer,1928,Summer,Amsterdam,Art Competitions,"Art Competitions Mixed Painting, Drawings And ...",
4283,2449,William Truman Aldrich,M,48,,,United States,USA,1928 Summer,1928,Summer,Amsterdam,Art Competitions,"Art Competitions Mixed Painting, Drawings And ...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269997,135072,Anna Katrina Zinkeisen (-Heseltine),F,46,,,Great Britain,GBR,1948 Summer,1948,Summer,London,Art Competitions,"Art Competitions Mixed Painting, Unknown Event",
269998,135073,Doris Clare Zinkeisen (-Johnstone),F,49,,,Great Britain,GBR,1948 Summer,1948,Summer,London,Art Competitions,"Art Competitions Mixed Painting, Unknown Event",
269999,135073,Doris Clare Zinkeisen (-Johnstone),F,49,,,Great Britain,GBR,1948 Summer,1948,Summer,London,Art Competitions,"Art Competitions Mixed Painting, Unknown Event",
270199,135173,Henri Achille Zo,M,58,,,France,FRA,1932 Summer,1932,Summer,Los Angeles,Art Competitions,"Art Competitions Mixed Painting, Unknown Event",


In [138]:
display(raw_df.head(5))
raw_df.info()
# Save to a parquet silver table
raw_df.to_parquet(os.path.join(datasets_path, 'silver_df.parquet'))



Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,NoMedal
1,2,A Lamusi,M,23,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,NoMedal
2,3,Gunnar Nielsen Aaby,M,24,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,NoMedal
3,4,Edgar Lindenau Aabye,M,34,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,NoMedal


<class 'pandas.core.frame.DataFrame'>
Int64Index: 269728 entries, 0 to 271115
Data columns (total 15 columns):
 #   Column  Non-Null Count   Dtype   
---  ------  --------------   -----   
 0   ID      269728 non-null  int64   
 1   Name    269728 non-null  object  
 2   Sex     269728 non-null  category
 3   Age     260414 non-null  Int8    
 4   Height  210917 non-null  Int16   
 5   Weight  208204 non-null  float32 
 6   Team    269728 non-null  category
 7   NOC     269728 non-null  category
 8   Games   269728 non-null  category
 9   Year    269728 non-null  uint16  
 10  Season  269728 non-null  category
 11  City    269728 non-null  category
 12  Sport   269728 non-null  category
 13  Event   269728 non-null  category
 14  Medal   269728 non-null  category
dtypes: Int16(1), Int8(1), category(9), float32(1), int64(1), object(1), uint16(1)
memory usage: 12.2+ MB


In [151]:
noc_df = pd.read_csv(os.path.join(datasets_path, 'noc_regions.csv'))

# Visualize df info
noc_df.info()
noc_df.head(10)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230 entries, 0 to 229
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   NOC     230 non-null    object
 1   region  228 non-null    object
 2   notes   21 non-null     object
dtypes: object(3)
memory usage: 5.5+ KB


Unnamed: 0,NOC,region,notes
0,AFG,Afghanistan,
1,AHO,Curacao,Netherlands Antilles
2,ALB,Albania,
3,ALG,Algeria,
4,AND,Andorra,
5,ANG,Angola,
6,ANT,Antigua,Antigua and Barbuda
7,ANZ,Australia,Australasia
8,ARG,Argentina,
9,ARM,Armenia,


In [152]:
# Check for duplicates
dups = noc_df.duplicated(subset=['NOC', 'region'], keep=False)
print(dups.unique())

[False]


In [141]:
# noc_df.to_parquet(os.path.join(datasets_path, 'silver_noc.parquet'))

In [153]:
query_country_df = noc_df.dropna(subset=['region'])


countries_alpha_3 = []
for region, code in zip(query_country_df.region, query_country_df.NOC):
    try:
        country_returned = pycountry.countries.lookup(region)
        alpha3_print = country_returned.alpha_3
    except LookupError:
        # Try to find a match with the code
        try:
            country_returned = pycountry.countries.lookup(code)
            alpha3_print = country_returned.alpha_3
        except LookupError:
            # Try to find a fuzzy match
            try:
                country_returned = pycountry.countries.search_fuzzy(region)
                if country_returned == 1:
                    alpha3_print = country_returned[0].alpha_3
                else:
                    print(f'Region {region} ({code}), multiple match found: {country_returned}')
                    alpha3_print = ''
            except LookupError:
                print(f'Region {region} ({code}), no match found')
                alpha3_print = ''
    countries_alpha_3.append(alpha3_print)
query_country_df = query_country_df.assign(alpha_3 = countries_alpha_3)
query_country_df = query_country_df.set_index('NOC')
display(query_country_df)

Region Curacao (AHO), multiple match found: [Country(alpha_2='NL', alpha_3='NLD', name='Netherlands', numeric='528', official_name='Kingdom of the Netherlands'), Country(alpha_2='CW', alpha_3='CUW', name='Curaçao', numeric='531', official_name='Curaçao')]
Region Antigua (ANT), multiple match found: [Country(alpha_2='AG', alpha_3='ATG', name='Antigua and Barbuda', numeric='028')]
Region Brunei (BRU), multiple match found: [Country(alpha_2='BN', alpha_3='BRN', name='Brunei Darussalam', numeric='096')]
Region Republic of Congo (CGO), no match found
Region Russia (EUN), multiple match found: [Country(alpha_2='RU', alpha_3='RUS', name='Russian Federation', numeric='643')]
Region Individual Olympic Athletes (IOA), no match found
Region Iran (IRI), multiple match found: [Country(alpha_2='IR', alpha_3='IRN', name='Iran, Islamic Republic of', numeric='364', official_name='Islamic Republic of Iran'), Country(alpha_2='AL', alpha_3='ALB', name='Albania', numeric='008', official_name='Republic of A

Unnamed: 0_level_0,region,notes,alpha_3
NOC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AFG,Afghanistan,,AFG
AHO,Curacao,Netherlands Antilles,
ALB,Albania,,ALB
ALG,Algeria,,DZA
AND,Andorra,,AND
...,...,...,...
YEM,Yemen,,YEM
YMD,Yemen,South Yemen,YEM
YUG,Serbia,Yugoslavia,SRB
ZAM,Zambia,,ZMB


In [154]:
query_country_df['alpha_3'].loc['AHO'] = 'CUW'
query_country_df['alpha_3'].loc['ANT'] = 'ATG'
query_country_df['alpha_3'].loc['BRU'] = 'BRN'
query_country_df['alpha_3'].loc['CGO'] = 'COG'
query_country_df['alpha_3'].loc['EUN'] = 'RUS'
query_country_df['alpha_3'].loc['IOA'] = ''
query_country_df['alpha_3'].loc['IRI'] = 'IRN'
query_country_df['alpha_3'].loc['KOS'] = ''
query_country_df['alpha_3'].loc['PLE'] = 'PSE'
query_country_df['alpha_3'].loc['SKN'] = 'KNA'
query_country_df['alpha_3'].loc['UAR'] = 'SYR'
query_country_df['alpha_3'].loc['URS'] = 'SUN'
query_country_df['alpha_3'].loc['VIN'] = 'VCT'
query_country_df['alpha_3'].loc['WIF'] = 'TTO'
query_country_df['alpha_3'].loc['ISV'] = 'VIR'
query_country_df['alpha_3'].loc['HKG'] = 'HKG'
display(query_country_df.head(3))

Unnamed: 0_level_0,region,notes,alpha_3
NOC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AFG,Afghanistan,,AFG
AHO,Curacao,Netherlands Antilles,CUW
ALB,Albania,,ALB


In [155]:
# Get country full names
country_names = []
for alpha_3 in query_country_df.alpha_3:
    print(f'{alpha_3}')
    if not alpha_3:
        print(f'No country code')
        country_names.append('None')
    else:
        try:
            country_returned = pycountry.countries.get(alpha_3= alpha_3)
            if not country_returned:
                country_returned = pycountry.historic_countries.get(alpha_3= alpha_3)
            if country_returned:
                country_names.append(country_returned.name)
            else:
                print(f'{alpha_3} not found')
                country_names.append('None')
        except LookupError:
            print(f'{alpha_3} not found')
            country_names.append('None')


AFG
CUW
ALB
DZA
AND
AGO
ATG
AUS
ARG
ARM
ABW
ASM
AUS
AUT
AZE
BHS
BGD
BRB
BDI
BEL
BEN
BMU
BTN
BIH
BLZ
BLR
CZE
BOL
BWA
BRA
BHR
BRN
BGR
BFA
CAF
KHM
CAN
CYM
COG
TCD
CHL
CHN
CIV
CMR
COD
COK
COL
COM
CPV
CRI
HRV
GRC
CUB
CYP
CZE
DNK
DJI
DMA
DOM
ECU
EGY
ERI
SLV
ESP
EST
ETH
RUS
FJI
FIN
FRA
DEU
FSM
GAB
GMB
GBR
GNB
DEU
GEO
GNQ
DEU
GHA
GRC
GRD
GTM
GIN
GUM
GUY
HTI
HKG
HND
HUN
IDN
IND

No country code
IRN
IRL
IRQ
ISL
ISR
VIR
ITA
VGB
JAM
JOR
JPN
KAZ
KEN
KGZ
KIR
KOR

No country code
SAU
KWT
LAO
LVA
LBY
LBR
LCA
LSO
LBN
LIE
LTU
LUX
MDG
MYS
MAR
MYS
MWI
MDA
MDV
MEX
MNG
MHL
MKD
MLI
MLT
MNE
MCO
MOZ
MUS
MRT
MMR
NAM
MYS
NIC
NLD
NPL
CAN
NGA
NER
NOR
NRU
NZL
OMN
PAK
PAN
PRY
PER
PHL
PSE
PLW
PNG
POL
PRT
PRK
PRI
QAT
ZWE
ROU
ZAF
RUS
RWA
DEU
WSM
SRB
SEN
SYC
SGP
KNA
SLE
SVN
SMR
SLB
SOM
SRB
LKA
SSD
STP
SDN
CHE
SUR
SVK
SWE
SWZ
SYR
TZA
CZE
TON
THA
TJK
TKM
TLS
TGO
TWN
TTO
TUN
TUR
TUV
ARE
SYR
UGA
UKR
SUN
URY
USA
UZB
VUT
VEN
VNM
VCT
VNM
TTO
YEM
YEM
YEM
SRB
ZMB
ZWE


In [156]:
query_country_df = query_country_df.assign(iso_names=country_names)
query_country_df = query_country_df.reset_index()
query_country_df = query_country_df.drop(columns=['notes'])
query_country_df.to_parquet(os.path.join('dataset', 'iso_countries.parquet'))
display(query_country_df)


Unnamed: 0,NOC,region,alpha_3,iso_names
0,AFG,Afghanistan,AFG,Afghanistan
1,AHO,Curacao,CUW,Curaçao
2,ALB,Albania,ALB,Albania
3,ALG,Algeria,DZA,Algeria
4,AND,Andorra,AND,Andorra
...,...,...,...,...
223,YEM,Yemen,YEM,Yemen
224,YMD,Yemen,YEM,Yemen
225,YUG,Serbia,SRB,Serbia
226,ZAM,Zambia,ZMB,Zambia
