# Import Libraries

In [68]:
import pandas as pd
import numpy as np

# Obtain Data

Read csv file with medal counts into Pandas

In [69]:
# Load the medal table and relabel its five columns with descriptive names.
medal_column_names = ['Olympic Country', 'Gold', 'Silver', 'Bronze', 'Totals']
medals_df = pd.read_csv('Data/medals.csv')
medals_df.columns = medal_column_names
print(medals_df.shape)
medals_df.head()

(93, 5)


Unnamed: 0,Olympic Country,Gold,Silver,Bronze,Totals
0,United States,39,41,33,113
1,China,38,32,18,88
2,Japan,27,14,17,58
3,Great Britain,22,21,22,65
4,Russian Olympic Committee,20,28,23,71


Read csv file with country indicator data into pandas

In [70]:
# Descriptive labels for the indicator table, in the order the columns appear in the CSV.
indicator_column_names = [
    'Indicator Country',
    'Health Expenditure (% of GDP)',
    'GDP Per Capita',
    'Gini Index',
    'Education Expenditure (% of GDP)',
    'Migrant Population Proportion',
    'Air Pollution Exposure',
    'Ages 20-24 Female Population Proportion',
    'Ages 20-24 Male Population Proportion',
    'Ages 25-29 Female Population Proportion',
    'Ages 25-29 Male Population Proportion',
    'Ages 30-34 Female Population Proportion',
    'Ages 30-34 Male Population Proportion',
    'Population',
    'Urban Population Proportion',
]
indicators_df = pd.read_csv('Data/indicators.csv')
indicators_df.columns = indicator_column_names
indicators_df.head()

Unnamed: 0,Indicator Country,Health Expenditure (% of GDP),GDP Per Capita,Gini Index,Education Expenditure (% of GDP),Migrant Population Proportion,Air Pollution Exposure,Ages 20-24 Female Population Proportion,Ages 20-24 Male Population Proportion,Ages 25-29 Female Population Proportion,Ages 25-29 Male Population Proportion,Ages 30-34 Female Population Proportion,Ages 30-34 Male Population Proportion,Population,Urban Population Proportion
0,Afghanistan,9.395727,1978.961579,,3.21378,1.175547,56.910808,10.073862,10.098379,7.90726,8.063549,6.221683,6.503323,38928341.0,26.026
1,Africa Eastern and Southern,5.925882,3371.814704,,4.7193,1.734768,35.512988,9.218579,9.355607,7.906863,7.947182,6.763558,6.723442,677243299.0,36.783306
2,Africa Western and Central,4.00479,3995.039927,,3.03285,2.079691,58.064482,8.940792,9.041989,7.522335,7.578698,6.425444,6.427822,458803476.0,47.848625
3,Albania,5.262714,13295.410885,33.2,3.94576,1.989036,18.200603,7.853874,8.013442,8.138641,8.828823,7.16807,8.372488,2837743.0,62.112
4,Algeria,6.218427,10681.679297,27.6,6.10036,0.611072,38.884011,6.769581,6.910889,8.120237,8.17923,8.527349,8.474931,43851043.0,73.733


Read csv file with list of olympic countries into Pandas

In [71]:
# Load the list of Olympic countries and trim stray whitespace from the names
# so they can be used as merge keys.
countries_df = pd.read_csv('Data/olympic_countries.csv')
# The vectorized .str accessor leaves missing values as NaN, whereas calling
# x.strip() through apply() raises AttributeError on a NaN (float) entry.
countries_df['Country'] = countries_df['Country'].str.strip()
countries_df.head()

Unnamed: 0,Country
0,Afghanistan
1,Albania
2,Algeria
3,American Samoa
4,Andorra


# Scrub Data

View list of countries whose names are different in medals_df and countries_df 

In [72]:
# Right-merge so every medal row is kept; a missing 'Country' then marks a
# medal-table name that has no exact match in countries_df.
test_df = countries_df.merge(medals_df, how='right', left_on='Country', right_on='Olympic Country')
print(test_df.loc[test_df['Country'].isna(), 'Olympic Country'].values)

['United States' 'China' 'Russian Olympic Committee' 'South Korea' 'Iran'
 'Hong Kong' 'Ivory Coast' 'Syria' 'Moldova']


Replace the mismatched country names in medals_df with the corresponding names from countries_df, then merge the two dfs

In [73]:
# Medal-table spelling -> spelling used in countries_df.
countries_replace_dict = {
    'United States': 'United States of America',
    'China': "People's Republic of China",
    'Russian Olympic Committee': 'ROC*',
    'South Korea': 'Republic of Korea',
    'Iran': 'Islamic Republic of Iran',
    'Hong Kong': 'Hong Kong, China',
    'Ivory Coast': "Côte d'Ivoire",
    'Syria': 'Syrian Arab Republic',
    'Moldova': 'Republic of Moldova',
}
medals_df['Olympic Country'] = medals_df['Olympic Country'].replace(countries_replace_dict)

# Left-merge keeps every Olympic country; those without a medal row get NaN counts.
# The duplicate key column from medals_df is dropped afterwards.
df = (
    countries_df
    .merge(medals_df, how='left', left_on='Country', right_on='Olympic Country')
    .drop(columns='Olympic Country')
)
print(df.shape)
df.head()

(206, 5)


Unnamed: 0,Country,Gold,Silver,Bronze,Totals
0,Afghanistan,,,,
1,Albania,,,,
2,Algeria,,,,
3,American Samoa,,,,
4,Andorra,,,,


View list of countries whose names are different in df and indicators_df

In [74]:
# Left-merge so every row of df is kept; a missing 'Indicator Country' then marks
# a country name that has no exact match in indicators_df.
test2_df = df.merge(indicators_df, how='left', left_on='Country', right_on='Indicator Country')
print(test2_df.loc[test2_df['Indicator Country'].isna(), 'Country'].values)

['Bahamas' 'Cape Verde' 'Chinese Taipei' 'Congo' 'Cook Islands'
 "Côte d'Ivoire" "Democratic People's Republic of Korea"
 'Democratic Republic of the Congo' 'Dominique' 'Egypt'
 'Federated States of Micronesia' 'Gambia' 'Great Britain'
 'Hong Kong, China' 'Islamic Republic of Iran' 'Kyrgyzstan'
 "Lao People's Democratic Republic" 'Palestine'
 "People's Republic of China" 'Republic of Korea' 'Republic of Moldova'
 'ROC*' 'Saint Kitts and Nevis' 'Saint Lucia'
 'Samoa (until 1996 Western Samoa)' 'Slovakia'
 'St Vincent and the Grenadines' 'United Republic of Tanzania'
 'United States of America' 'Venezuela' 'Virgin Islands, British'
 'Virgin Islands, US' 'Yemen']


Replace the mismatched country names in indicators_df with the corresponding names from df, then merge the two dfs

In [75]:
# The spreadsheet pairs each indicator-table name with the name used in df.
indicator_replacements_df = pd.read_excel('Data/indicator_countries.xlsx')
indicator_countries = indicator_replacements_df['Indicator Country'].tolist()
indicator_replacements = indicator_replacements_df['Country'].tolist()
indicators_replace_dict = dict(zip(indicator_countries, indicator_replacements))

indicators_df['Indicator Country'] = indicators_df['Indicator Country'].replace(indicators_replace_dict)

# Inner-merge keeps only countries present in both tables, then the duplicate
# key column from indicators_df is dropped.
df = (
    df
    .merge(indicators_df, how='inner', left_on='Country', right_on='Indicator Country')
    .drop(columns=['Indicator Country'])
)
df.head()

Unnamed: 0,Country,Gold,Silver,Bronze,Totals,Health Expenditure (% of GDP),GDP Per Capita,Gini Index,Education Expenditure (% of GDP),Migrant Population Proportion,Air Pollution Exposure,Ages 20-24 Female Population Proportion,Ages 20-24 Male Population Proportion,Ages 25-29 Female Population Proportion,Ages 25-29 Male Population Proportion,Ages 30-34 Female Population Proportion,Ages 30-34 Male Population Proportion,Population,Urban Population Proportion
0,Afghanistan,,,,,9.395727,1978.961579,,3.21378,1.175547,56.910808,10.073862,10.098379,7.90726,8.063549,6.221683,6.503323,38928341.0,26.026
1,Albania,,,,,5.262714,13295.410885,33.2,3.94576,1.989036,18.200603,7.853874,8.013442,8.138641,8.828823,7.16807,8.372488,2837743.0,62.112
2,Algeria,,,,,6.218427,10681.679297,27.6,6.10036,0.611072,38.884011,6.769581,6.910889,8.120237,8.17923,8.527349,8.474931,43851043.0,73.733
3,American Samoa,,,,,,,,,41.802009,12.47382,,,,,,,55197.0,87.153
4,Andorra,,,,,6.710331,,,3.15061,59.713649,10.307621,,,,,,,77265.0,87.916


Drop columns not useful for modeling

In [76]:
# Keep only the overall medal total; the per-colour counts are removed.
df = df.drop(columns=['Gold', 'Silver', 'Bronze'])
df.head()

Unnamed: 0,Country,Totals,Health Expenditure (% of GDP),GDP Per Capita,Gini Index,Education Expenditure (% of GDP),Migrant Population Proportion,Air Pollution Exposure,Ages 20-24 Female Population Proportion,Ages 20-24 Male Population Proportion,Ages 25-29 Female Population Proportion,Ages 25-29 Male Population Proportion,Ages 30-34 Female Population Proportion,Ages 30-34 Male Population Proportion,Population,Urban Population Proportion
0,Afghanistan,,9.395727,1978.961579,,3.21378,1.175547,56.910808,10.073862,10.098379,7.90726,8.063549,6.221683,6.503323,38928341.0,26.026
1,Albania,,5.262714,13295.410885,33.2,3.94576,1.989036,18.200603,7.853874,8.013442,8.138641,8.828823,7.16807,8.372488,2837743.0,62.112
2,Algeria,,6.218427,10681.679297,27.6,6.10036,0.611072,38.884011,6.769581,6.910889,8.120237,8.17923,8.527349,8.474931,43851043.0,73.733
3,American Samoa,,,,,,41.802009,12.47382,,,,,,,55197.0,87.153
4,Andorra,,6.710331,,,3.15061,59.713649,10.307621,,,,,,,77265.0,87.916


Calculate the Ages 20-34 Population Proportion feature as the average of the female and male age 20-34 proportions (each summed over the three five-year age bands), then drop the six columns used to calculate it

In [77]:
# Sum the three five-year age-band columns separately for each sex.
female_20_34 = (
    df['Ages 20-24 Female Population Proportion']
    + df['Ages 25-29 Female Population Proportion']
    + df['Ages 30-34 Female Population Proportion']
)
male_20_34 = (
    df['Ages 20-24 Male Population Proportion']
    + df['Ages 25-29 Male Population Proportion']
    + df['Ages 30-34 Male Population Proportion']
)

# Average the female and male sums into one combined feature; a NaN in any
# band column propagates to the result.
df['Ages 20-34 Population Proportion'] = (female_20_34 + male_20_34) / 2

# The six band-level columns are no longer needed once the combined feature exists.
age_band_columns = [
    'Ages 20-24 Female Population Proportion',
    'Ages 20-24 Male Population Proportion',
    'Ages 25-29 Female Population Proportion',
    'Ages 25-29 Male Population Proportion',
    'Ages 30-34 Female Population Proportion',
    'Ages 30-34 Male Population Proportion',
]
df = df.drop(columns=age_band_columns)
df.head()

Unnamed: 0,Country,Totals,Health Expenditure (% of GDP),GDP Per Capita,Gini Index,Education Expenditure (% of GDP),Migrant Population Proportion,Air Pollution Exposure,Population,Urban Population Proportion,Ages 20-34 Population Proportion
0,Afghanistan,,9.395727,1978.961579,,3.21378,1.175547,56.910808,38928341.0,26.026,24.434028
1,Albania,,5.262714,13295.410885,33.2,3.94576,1.989036,18.200603,2837743.0,62.112,24.187669
2,Algeria,,6.218427,10681.679297,27.6,6.10036,0.611072,38.884011,43851043.0,73.733,23.491109
3,American Samoa,,,,,,41.802009,12.47382,55197.0,87.153,
4,Andorra,,6.710331,,,3.15061,59.713649,10.307621,77265.0,87.916,


Use df.info() to see what further scrubbing is needed

In [78]:
# Summarize per-column non-null counts and dtypes to spot remaining gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 203 entries, 0 to 202
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country                           203 non-null    object 
 1   Totals                            92 non-null     float64
 2   Health Expenditure (% of GDP)     189 non-null    float64
 3   GDP Per Capita                    188 non-null    float64
 4   Gini Index                        153 non-null    float64
 5   Education Expenditure (% of GDP)  177 non-null    float64
 6   Migrant Population Proportion     202 non-null    float64
 7   Air Pollution Exposure            191 non-null    float64
 8   Population                        203 non-null    float64
 9   Urban Population Proportion       202 non-null    float64
 10  Ages 20-34 Population Proportion  189 non-null    float64
dtypes: float64(10), object(1)
memory usage: 19.0+ KB


Drop rows missing 4 or more features

In [79]:
# Count missing feature values per row; 'Totals' is excluded from the count.
missing_feature_counts = df.drop(columns=['Totals']).isna().sum(axis=1)

# Keep rows with fewer than 4 missing features and renumber the index from 0.
df = df[missing_feature_counts < 4].reset_index(drop=True)
df.head()

Unnamed: 0,Country,Totals,Health Expenditure (% of GDP),GDP Per Capita,Gini Index,Education Expenditure (% of GDP),Migrant Population Proportion,Air Pollution Exposure,Population,Urban Population Proportion,Ages 20-34 Population Proportion
0,Afghanistan,,9.395727,1978.961579,,3.21378,1.175547,56.910808,38928341.0,26.026,24.434028
1,Albania,,5.262714,13295.410885,33.2,3.94576,1.989036,18.200603,2837743.0,62.112,24.187669
2,Algeria,,6.218427,10681.679297,27.6,6.10036,0.611072,38.884011,43851043.0,73.733,23.491109
3,Andorra,,6.710331,,,3.15061,59.713649,10.307621,77265.0,87.916,
4,Angola,,2.549005,6198.083841,51.3,1.82118,0.427005,32.388505,32866268.0,66.825,22.352392


In [80]:
# Re-check per-column non-null counts and dtypes after the sparse rows were dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country                           193 non-null    object 
 1   Totals                            91 non-null     float64
 2   Health Expenditure (% of GDP)     187 non-null    float64
 3   GDP Per Capita                    185 non-null    float64
 4   Gini Index                        152 non-null    float64
 5   Education Expenditure (% of GDP)  174 non-null    float64
 6   Migrant Population Proportion     193 non-null    float64
 7   Air Pollution Exposure            187 non-null    float64
 8   Population                        193 non-null    float64
 9   Urban Population Proportion       193 non-null    float64
 10  Ages 20-34 Population Proportion  186 non-null    float64
dtypes: float64(10), object(1)
memory usage: 16.7+ KB
