In [219]:
import pandas as pd
import os
from IPython.display import display
from pandasql import sqldf
import numpy as np
dataset_path = 'dataset'
pysqldf = lambda q: sqldf(q, globals())


silver_df = pd.read_parquet(os.path.join(dataset_path, 'silver_df.parquet'))


In [220]:
# Like for 3_descriptive_analysis
# Extract the number of different athletes for each NOC per Year for Summer Olympics

count_distinct_athletes = f" SELECT NOC, Year, COUNT(DISTINCT ID) as distinct_athletes_no" \
                f" FROM silver_df" \
                f" WHERE Season='Summer' " \
                 f" GROUP BY NOC, Year"
gold_distinct_athletes_df = pysqldf(count_distinct_athletes)
display(gold_distinct_athletes_df)


Unnamed: 0,NOC,Year,distinct_athletes_no
0,AFG,1936,15
1,AFG,1948,25
2,AFG,1956,12
3,AFG,1960,12
4,AFG,1964,8
...,...,...,...
2805,ZIM,2000,16
2806,ZIM,2004,12
2807,ZIM,2008,13
2808,ZIM,2012,7


In [221]:
# Validate the previous query by looking at the number of distinct athletes for team USA
# In the last 3 editions (2016, 2012, 2008)

# Results should be:
# - 2016: 554 (263 men and 291 women) in 33 sports
# - 2012: 530 (262 men and 268 women) in 31 sports
# - 2008: 588 (306 men and 282 women) in 32 sports

display(gold_distinct_athletes_df.loc[gold_distinct_athletes_df.NOC == 'USA'])
# Query result: 555 (+1), 530, 588.
# The team USA website reports 558 participants for Rio 2016


Unnamed: 0,NOC,Year,distinct_athletes_no
2673,USA,1896,14
2674,USA,1900,75
2675,USA,1904,524
2676,USA,1906,38
2677,USA,1908,122
2678,USA,1912,174
2679,USA,1920,288
2680,USA,1924,299
2681,USA,1928,280
2682,USA,1932,474


In [222]:
# Extract the number of competitions entries for each NOC per Year for Summer Olympics

count_events_partecipations = f" SELECT NOC, Year, COUNT(1) as events_partecipations_no" \
                f" FROM silver_df" \
                f" WHERE Season='Summer' " \
                 f" GROUP BY NOC, Year"
gold_events_partecipations_df = pysqldf(count_events_partecipations)
display(gold_events_partecipations_df)
display(gold_events_partecipations_df.loc[gold_events_partecipations_df.NOC == 'USA'])


Unnamed: 0,NOC,Year,events_partecipations_no
0,AFG,1936,16
1,AFG,1948,25
2,AFG,1956,12
3,AFG,1960,16
4,AFG,1964,8
...,...,...,...
2805,ZIM,2000,26
2806,ZIM,2004,14
2807,ZIM,2008,16
2808,ZIM,2012,9


Unnamed: 0,NOC,Year,events_partecipations_no
2673,USA,1896,27
2674,USA,1900,135
2675,USA,1904,1109
2676,USA,1906,81
2677,USA,1908,218
2678,USA,1912,364
2679,USA,1920,473
2680,USA,1924,459
2681,USA,1928,392
2682,USA,1932,544


In [223]:
# Extract number of medals by type for each NOC


subquery_count_winning = f" SELECT NOC, Sport, Event, Year, Season, Games, Team, Medal" \
                f" FROM silver_df" \
                f" WHERE Medal IS NOT NULL AND Season='Summer' " \
                 f" GROUP BY NOC, Sport, Event, Year, Season, Games, Team, Medal" \
                f" ORDER BY NOC"
medalists_df = pysqldf(subquery_count_winning)
display(medalists_df)

Unnamed: 0,NOC,Sport,Event,Year,Season,Games,Team,Medal
0,AFG,Taekwondo,Taekwondo Men's Featherweight,2012,Summer,2012 Summer,Afghanistan,Bronze
1,AFG,Taekwondo,Taekwondo Men's Flyweight,2008,Summer,2008 Summer,Afghanistan,Bronze
2,AHO,Sailing,Sailing Mixed Windsurfer,1988,Summer,1988 Summer,Netherlands Antilles,Silver
3,ALG,Athletics,"Athletics Men's 1,500 metres",1996,Summer,1996 Summer,Algeria,Gold
4,ALG,Athletics,"Athletics Men's 1,500 metres",2012,Summer,2012 Summer,Algeria,Gold
...,...,...,...,...,...,...,...,...
16067,ZIM,Swimming,Swimming Women's 200 metres Backstroke,2004,Summer,2004 Summer,Zimbabwe,Gold
16068,ZIM,Swimming,Swimming Women's 200 metres Backstroke,2008,Summer,2008 Summer,Zimbabwe,Gold
16069,ZIM,Swimming,Swimming Women's 200 metres Individual Medley,2004,Summer,2004 Summer,Zimbabwe,Bronze
16070,ZIM,Swimming,Swimming Women's 200 metres Individual Medley,2008,Summer,2008 Summer,Zimbabwe,Silver


In [224]:
# NOC, year, and different medals
noc_medals_df = pd.pivot_table(medalists_df,
                               index=['NOC', 'Year'],
                               columns='Medal',
                               aggfunc='count',
                               values='Event',
                               fill_value=0,
                               margins=True,
                               margins_name='Total')

noc_medals_df.columns.name = ''
noc_medals_df.reset_index(inplace=True)
display(noc_medals_df)


Unnamed: 0,NOC,Year,Bronze,Gold,Silver,Total
0,AFG,2008,1,0,0,1
1,AFG,2012,1,0,0,1
2,AHO,1988,0,0,1,1
3,ALG,1984,2,0,0,2
4,ALG,1992,1,1,0,2
...,...,...,...,...,...,...
1271,ZAM,1996,0,0,1,1
1272,ZIM,1980,0,1,0,1
1273,ZIM,2004,1,1,1,3
1274,ZIM,2008,0,1,3,4


In [225]:
# gold_partecipants_df contains NOC, Year, events_partecipations_no, distinct_athletes_no, Bronze, Silver, Gold, Total
gold_partecipants_df = gold_events_partecipations_df.merge(gold_distinct_athletes_df, on=['NOC', 'Year'], how='left')
gold_partecipants_df = gold_partecipants_df.merge(noc_medals_df, on=['NOC', 'Year'], how='left')
gold_partecipants_df = gold_partecipants_df.fillna(value=0)
gold_partecipants_df.Gold = gold_partecipants_df.Gold.astype(int)
gold_partecipants_df.Silver = gold_partecipants_df.Silver.astype(int)
gold_partecipants_df.Bronze = gold_partecipants_df.Bronze.astype(int)
gold_partecipants_df.Total = gold_partecipants_df.Total.astype(int)
gold_partecipants_df = gold_partecipants_df.loc[gold_partecipants_df.Year > 1960]
display(gold_partecipants_df)

Unnamed: 0,NOC,Year,events_partecipations_no,distinct_athletes_no,Bronze,Gold,Silver,Total
4,AFG,1964,8,8,0,0,0,0
5,AFG,1968,5,5,0,0,0,0
6,AFG,1972,8,8,0,0,0,0
7,AFG,1980,11,11,0,0,0,0
8,AFG,1988,5,5,0,0,0,0
...,...,...,...,...,...,...,...,...
2805,ZIM,2000,26,16,0,0,0,0
2806,ZIM,2004,14,12,1,1,1,3
2807,ZIM,2008,16,13,0,1,3,4
2808,ZIM,2012,9,7,0,0,0,0


In [226]:
# Read NOC table
noc_df = pd.read_parquet(os.path.join(dataset_path, 'silver_noc.parquet'))
gold_partecipants_df = gold_partecipants_df.merge(noc_df, on=['NOC'], how='left')
gold_partecipants_df.fillna('Not Found')
display(gold_partecipants_df)

Unnamed: 0,NOC,Year,events_partecipations_no,distinct_athletes_no,Bronze,Gold,Silver,Total,region,notes
0,AFG,1964,8,8,0,0,0,0,Afghanistan,
1,AFG,1968,5,5,0,0,0,0,Afghanistan,
2,AFG,1972,8,8,0,0,0,0,Afghanistan,
3,AFG,1980,11,11,0,0,0,0,Afghanistan,
4,AFG,1988,5,5,0,0,0,0,Afghanistan,
...,...,...,...,...,...,...,...,...,...,...
2175,ZIM,2000,26,16,0,0,0,0,Zimbabwe,
2176,ZIM,2004,14,12,1,1,1,3,Zimbabwe,
2177,ZIM,2008,16,13,0,1,3,4,Zimbabwe,
2178,ZIM,2012,9,7,0,0,0,0,Zimbabwe,


In [227]:
# Regions not found
display(gold_partecipants_df.loc[gold_partecipants_df.region == 'Not Found'])


Unnamed: 0,NOC,Year,events_partecipations_no,distinct_athletes_no,Bronze,Gold,Silver,Total,region,notes


In [228]:
# Column Temporary aide in population search
gold_partecipants_df['population_help'] = gold_partecipants_df.NOC


# Replacement countries needed for population fill
# 'GDR' 'GER'
# 'ROT'
# 'VNM' 'VIE'
# 'EUN' 'RUS'
# 'YUG' 'SRB'
# 'IOA'
# 'URS' 'RUS'
# 'FRG' 'GER'
# 'YAR' 'YEM'
# 'SCG' 'SRB'
# 'TCH' 'CZE'
# 'YMD' 'YEM'
gold_partecipants_df['population_help'] = gold_partecipants_df['population_help'].replace(
    to_replace=['GDR', 'VNM', 'EUN', 'YUG', 'URS', 'FRG', 'YAR', 'SCG', 'TCH', 'YMD'],
    value=['GER', 'VIE', 'RUS','SRB', 'RUS',  'GER', 'YEM', 'SRB', 'CZE', 'YEM']
)

display(gold_partecipants_df)


Unnamed: 0,NOC,Year,events_partecipations_no,distinct_athletes_no,Bronze,Gold,Silver,Total,region,notes,population_help
0,AFG,1964,8,8,0,0,0,0,Afghanistan,,AFG
1,AFG,1968,5,5,0,0,0,0,Afghanistan,,AFG
2,AFG,1972,8,8,0,0,0,0,Afghanistan,,AFG
3,AFG,1980,11,11,0,0,0,0,Afghanistan,,AFG
4,AFG,1988,5,5,0,0,0,0,Afghanistan,,AFG
...,...,...,...,...,...,...,...,...,...,...,...
2175,ZIM,2000,26,16,0,0,0,0,Zimbabwe,,ZIM
2176,ZIM,2004,14,12,1,1,1,3,Zimbabwe,,ZIM
2177,ZIM,2008,16,13,0,1,3,4,Zimbabwe,,ZIM
2178,ZIM,2012,9,7,0,0,0,0,Zimbabwe,,ZIM


In [229]:
# Load population data
# From https://data.worldbank.org/indicator/SP.POP.TOTL
# and https://population.un.org/wpp/Download/Standard/Population/
population_df = pd.read_parquet(os.path.join(dataset_path, 'populations.parquet'))
population_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16409 entries, 2 to 16405
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Country Name  16409 non-null  object
 1   Country Code  16409 non-null  object
 2   Year          16409 non-null  int32 
 3   Population    16306 non-null  Int64 
dtypes: Int64(1), int32(1), object(2)
memory usage: 592.9+ KB


In [230]:
# Check distinct countries in population_df
all_countries_codes = set(population_df['Country Code'])

# All distinct countries in my olympics df
all_olympics_codes = set(gold_partecipants_df.NOC)

print(all_olympics_codes - all_countries_codes)

{'GDR', 'ROT', 'VNM', 'EUN', 'YUG', 'IOA', 'URS', 'FRG', 'YAR', 'SCG', 'TCH', 'YMD'}


In [231]:
population_df = population_df.set_index(['Country Code', 'Year'])
display(population_df.head(5))

Unnamed: 0_level_0,Unnamed: 1_level_0,Country Name,Population
Country Code,Year,Unnamed: 2_level_1,Unnamed: 3_level_1
AFG,1960,Afghanistan,8996967
AFG,1961,Afghanistan,9169406
AFG,1962,Afghanistan,9351442
AFG,1963,Afghanistan,9543200
AFG,1964,Afghanistan,9744772


In [232]:
display((gold_partecipants_df.NOC, gold_partecipants_df.Year))

# Fill population
test_pop = population_df.loc[(gold_partecipants_df.NOC, gold_partecipants_df.Year.astype(int)), 'Population'].reset_index()
gold_partecipants_df = gold_partecipants_df.merge(test_pop, left_on=['population_help', 'Year'], right_on=['Country Code', 'Year'], how='left')
gold_partecipants_df = gold_partecipants_df.drop(columns=['population_help', 'notes', 'Country Code'])
gold_partecipants_df = gold_partecipants_df.dropna(subset=['Population'])
gold_partecipants_df = gold_partecipants_df.loc[gold_partecipants_df.Total>0]
gold_partecipants_df = gold_partecipants_df.assign(
    PopulationPerMedal_thousands = np.divide(gold_partecipants_df.Population/1000, gold_partecipants_df.Total) )
gold_partecipants_df = gold_partecipants_df.assign(
    EventPartecipationPerMedal = np.divide(gold_partecipants_df.events_partecipations_no, gold_partecipants_df.Total) )
gold_partecipants_df = gold_partecipants_df.assign(
    AthletePerMedal = np.divide(gold_partecipants_df.distinct_athletes_no, gold_partecipants_df.Total) )
gold_partecipants_df = gold_partecipants_df.assign(
    AthletePerEventPartecipation = np.divide(gold_partecipants_df.distinct_athletes_no, gold_partecipants_df.events_partecipations_no) )

(0       AFG
 1       AFG
 2       AFG
 3       AFG
 4       AFG
        ... 
 2175    ZIM
 2176    ZIM
 2177    ZIM
 2178    ZIM
 2179    ZIM
 Name: NOC, Length: 2180, dtype: object,
 0       1964
 1       1968
 2       1972
 3       1980
 4       1988
         ... 
 2175    2000
 2176    2004
 2177    2008
 2178    2012
 2179    2016
 Name: Year, Length: 2180, dtype: int64)

In [233]:
gold_partecipants_df.drop(columns='Year').describe()

Unnamed: 0,events_partecipations_no,distinct_athletes_no,Bronze,Gold,Silver,Total,Population,PopulationPerMedal_thousands,EventPartecipationPerMedal,AthletePerMedal,AthletePerEventPartecipation
count,861.0,861.0,861.0,861.0,861.0,861.0,861.0,861.0,861.0,861.0,861.0
mean,162.288037,119.869919,4.441347,3.996516,3.982578,12.420441,57597380.0,16619.76,25.607903,19.74804,0.785246
std,164.983912,119.404391,6.834275,8.565275,7.022451,21.66322,165493000.0,79553.59,24.84535,18.49284,0.119667
min,3.0,3.0,0.0,0.0,0.0,1.0,53200.0,50.24067,2.8,2.384615,0.355263
25%,43.0,36.0,1.0,0.0,0.0,2.0,5591572.0,1012.278,11.56,8.684211,0.713287
50%,95.0,74.0,2.0,1.0,1.0,4.0,14760090.0,2463.843,18.0,14.142857,0.785571
75%,232.0,163.0,5.0,4.0,4.0,13.0,49230580.0,9222.692,31.0,24.0,0.87037
max,839.0,648.0,46.0,82.0,69.0,195.0,1378665000.0,1129623.0,281.0,174.0,1.0


In [234]:
print('Gold:')
display(gold_partecipants_df.sort_values(by='Gold', ascending=False).head(5))
print('Silver:')
display(gold_partecipants_df.sort_values(by='Silver', ascending=False).head(5))
print('Bronze:')
display(gold_partecipants_df.sort_values(by='Bronze', ascending=False).head(5))
print('Total:')
display(gold_partecipants_df.sort_values(by='Total', ascending=False).head(5))


Gold:


Unnamed: 0,NOC,Year,events_partecipations_no,distinct_athletes_no,Bronze,Gold,Silver,Total,region,Population,PopulationPerMedal_thousands,EventPartecipationPerMedal,AthletePerMedal,AthletePerEventPartecipation
2082,USA,1984,693,522,30,82,61,173,United States,235825000,1363.150289,4.00578,3.017341,0.753247
2063,URS,1980,660,489,46,80,69,195,Russia,139010000,712.871795,3.384615,2.507692,0.740909
2064,URS,1988,647,481,46,54,31,131,Russia,146857000,1121.045802,4.938931,3.671756,0.743431
410,CHN,2008,730,599,28,51,21,100,China,1324655000,13246.55,7.3,5.99,0.820548
2061,URS,1972,531,371,22,50,27,99,Russia,131909000,1332.414141,5.363636,3.747475,0.698682


Silver:


Unnamed: 0,NOC,Year,events_partecipations_no,distinct_athletes_no,Bronze,Gold,Silver,Total,region,Population,PopulationPerMedal_thousands,EventPartecipationPerMedal,AthletePerMedal,AthletePerEventPartecipation
2063,URS,1980,660,489,46,80,69,195,Russia,139010000,712.871795,3.384615,2.507692,0.740909
2082,USA,1984,693,522,30,82,61,173,United States,235825000,1363.150289,4.00578,3.017341,0.753247
2062,URS,1976,574,410,35,49,41,125,Russia,135147000,1081.176,4.592,3.28,0.714286
2088,USA,2008,763,588,35,36,39,110,United States,304093966,2764.4906,6.936364,5.345455,0.770642
2087,USA,2004,726,533,26,36,39,101,United States,292805298,2899.062356,7.188119,5.277228,0.73416


Bronze:


Unnamed: 0,NOC,Year,events_partecipations_no,distinct_athletes_no,Bronze,Gold,Silver,Total,region,Population,PopulationPerMedal_thousands,EventPartecipationPerMedal,AthletePerMedal,AthletePerEventPartecipation
2064,URS,1988,647,481,46,54,31,131,Russia,146857000,1121.045802,4.938931,3.671756,0.743431
2063,URS,1980,660,489,46,80,69,195,Russia,139010000,712.871795,3.384615,2.507692,0.740909
742,GDR,1980,495,346,42,47,37,126,Germany,78288576,621.337905,3.928571,2.746032,0.69899
2090,USA,2016,719,555,38,46,37,121,United States,323071755,2670.014504,5.942149,4.586777,0.771905
2084,USA,1992,734,545,37,37,34,108,United States,256514000,2375.12963,6.796296,5.046296,0.742507


Total:


Unnamed: 0,NOC,Year,events_partecipations_no,distinct_athletes_no,Bronze,Gold,Silver,Total,region,Population,PopulationPerMedal_thousands,EventPartecipationPerMedal,AthletePerMedal,AthletePerEventPartecipation
2063,URS,1980,660,489,46,80,69,195,Russia,139010000,712.871795,3.384615,2.507692,0.740909
2082,USA,1984,693,522,30,82,61,173,United States,235825000,1363.150289,4.00578,3.017341,0.753247
2064,URS,1988,647,481,46,54,31,131,Russia,146857000,1121.045802,4.938931,3.671756,0.743431
742,GDR,1980,495,346,42,47,37,126,Germany,78288576,621.337905,3.928571,2.746032,0.69899
2062,URS,1976,574,410,35,49,41,125,Russia,135147000,1081.176,4.592,3.28,0.714286


In [235]:
gold_partecipants_df.info()
gold_partecipants_df.to_parquet(os.path.join(dataset_path, 'gold_partecipants.parquet'))


<class 'pandas.core.frame.DataFrame'>
Int64Index: 861 entries, 7 to 2177
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   NOC                           861 non-null    object 
 1   Year                          861 non-null    int64  
 2   events_partecipations_no      861 non-null    int64  
 3   distinct_athletes_no          861 non-null    int64  
 4   Bronze                        861 non-null    int32  
 5   Gold                          861 non-null    int32  
 6   Silver                        861 non-null    int32  
 7   Total                         861 non-null    int32  
 8   region                        858 non-null    object 
 9   Population                    861 non-null    Int64  
 10  PopulationPerMedal_thousands  861 non-null    Float64
 11  EventPartecipationPerMedal    861 non-null    float64
 12  AthletePerMedal               861 non-null    float64
 13  Athl