# Import Libraries

In [60]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

# Obtain Data

Read csv file with medal counts into Pandas

In [34]:
medals_df = pd.read_csv('Data/medals.csv')
medals_df.columns = ['Olympic Country', 'Gold', 'Silver', 'Bronze', 'Totals']
print(medals_df.shape)
medals_df.head()

(93, 5)


Unnamed: 0,Olympic Country,Gold,Silver,Bronze,Totals
0,United States,39,41,33,113
1,China,38,32,18,88
2,Japan,27,14,17,58
3,Great Britain,22,21,22,65
4,Russian Olympic Committee,20,28,23,71


Read csv file with country indicator data into pandas

In [35]:
indicators_df = pd.read_csv('Data/indicators.csv')
indicators_df.columns = ['Indicator Country', 'Health Expenditure (% of GDP)', 'GDP Per Capita', 'Gini Index', 
                         'Education Expenditure (% of GDP)', 'Migrant Population Proportion', 
                         'Air Pollution Exposure', 'Ages 20-24 Female Population Proportion',
                         'Ages 20-24 Male Population Proportion', 'Ages 25-29 Female Population Proportion',
                         'Ages 25-29 Male Population Proportion', 'Ages 30-34 Female Population Proportion',
                         'Ages 30-34 Male Population Proportion', 'Population', 'Urban Population Proportion']
indicators_df.head()

Unnamed: 0,Indicator Country,Health Expenditure (% of GDP),GDP Per Capita,Gini Index,Education Expenditure (% of GDP),Migrant Population Proportion,Air Pollution Exposure,Ages 20-24 Female Population Proportion,Ages 20-24 Male Population Proportion,Ages 25-29 Female Population Proportion,Ages 25-29 Male Population Proportion,Ages 30-34 Female Population Proportion,Ages 30-34 Male Population Proportion,Population,Urban Population Proportion
0,Afghanistan,9.395727,1978.961579,,3.21378,1.175547,56.910808,10.073862,10.098379,7.90726,8.063549,6.221683,6.503323,38928341.0,26.026
1,Africa Eastern and Southern,5.925882,3371.814704,,4.7193,1.734768,35.512988,9.218579,9.355607,7.906863,7.947182,6.763558,6.723442,677243299.0,36.783306
2,Africa Western and Central,4.00479,3995.039927,,3.03285,2.079691,58.064482,8.940792,9.041989,7.522335,7.578698,6.425444,6.427822,458803476.0,47.848625
3,Albania,5.262714,13295.410885,33.2,3.94576,1.989036,18.200603,7.853874,8.013442,8.138641,8.828823,7.16807,8.372488,2837743.0,62.112
4,Algeria,6.218427,10681.679297,27.6,6.10036,0.611072,38.884011,6.769581,6.910889,8.120237,8.17923,8.527349,8.474931,43851043.0,73.733


Read csv file with list of olympic countries into Pandas

In [36]:
countries_df = pd.read_csv('Data/olympic_countries.csv')
countries_df['Country'] = countries_df['Country'].apply(lambda x: x.strip())
countries_df.head()

Unnamed: 0,Country
0,Afghanistan
1,Albania
2,Algeria
3,American Samoa
4,Andorra


# Scrub Data

View list of countries whose names are different in medals_df and countries_df 

In [37]:
test_df = pd.merge(countries_df, medals_df, left_on='Country', right_on='Olympic Country', how='right')
print(test_df[test_df['Country'].isna()]['Olympic Country'].values)

['United States' 'China' 'Russian Olympic Committee' 'South Korea' 'Iran'
 'Hong Kong' 'Ivory Coast' 'Syria' 'Moldova']


Replace different country names from medals_df with the corresponding name from countries_df and merge the two dfs

In [38]:
countries_replace_dict = {'United States': 'United States of America', 'China': "People's Republic of China",
                          'Russian Olympic Committee': 'ROC*', 'South Korea': 'Republic of Korea', 'Iran':
                          'Islamic Republic of Iran', 'Hong Kong': 'Hong Kong, China', 
                          'Ivory Coast' : "Côte d'Ivoire", 'Syria' : 'Syrian Arab Republic', 
                          'Moldova' : 'Republic of Moldova'}
medals_df['Olympic Country'] = medals_df['Olympic Country'].replace(countries_replace_dict)
df = pd.merge(countries_df, medals_df, left_on='Country', right_on='Olympic Country', how='left')
df.drop(columns='Olympic Country', inplace=True)
print(df.shape)
df.head()

(206, 5)


Unnamed: 0,Country,Gold,Silver,Bronze,Totals
0,Afghanistan,,,,
1,Albania,,,,
2,Algeria,,,,
3,American Samoa,,,,
4,Andorra,,,,


View list of countries whose names are different in df and indicators_df

In [39]:
test2_df = pd.merge(df, indicators_df, left_on='Country', right_on='Indicator Country', how='left')
print(test2_df[test2_df['Indicator Country'].isna()]['Country'].values)

['Bahamas' 'Cape Verde' 'Chinese Taipei' 'Congo' 'Cook Islands'
 "Côte d'Ivoire" "Democratic People's Republic of Korea"
 'Democratic Republic of the Congo' 'Dominique' 'Egypt'
 'Federated States of Micronesia' 'Gambia' 'Great Britain'
 'Hong Kong, China' 'Islamic Republic of Iran' 'Kyrgyzstan'
 "Lao People's Democratic Republic" 'Palestine'
 "People's Republic of China" 'Republic of Korea' 'Republic of Moldova'
 'ROC*' 'Saint Kitts and Nevis' 'Saint Lucia'
 'Samoa (until 1996 Western Samoa)' 'Slovakia'
 'St Vincent and the Grenadines' 'United Republic of Tanzania'
 'United States of America' 'Venezuela' 'Virgin Islands, British'
 'Virgin Islands, US' 'Yemen']


Replace different country names from indicators_df with the corresponding name from df and merge the two dfs

In [40]:
indicator_replacements_df = pd.read_excel('Data/indicator_countries.xlsx')
indicator_countries = list(indicator_replacements_df['Indicator Country'].values)
indicator_replacements = list(indicator_replacements_df['Country'].values)
indicators_replace_dict = {}
for key, value in zip(indicator_countries, indicator_replacements):
    indicators_replace_dict[key] = value
indicators_df['Indicator Country'] = indicators_df['Indicator Country'].replace(indicators_replace_dict)
df = pd.merge(df, indicators_df, left_on='Country', right_on='Indicator Country', how='inner')
df.drop(columns=['Indicator Country'], inplace=True)
df.head()

Unnamed: 0,Country,Gold,Silver,Bronze,Totals,Health Expenditure (% of GDP),GDP Per Capita,Gini Index,Education Expenditure (% of GDP),Migrant Population Proportion,Air Pollution Exposure,Ages 20-24 Female Population Proportion,Ages 20-24 Male Population Proportion,Ages 25-29 Female Population Proportion,Ages 25-29 Male Population Proportion,Ages 30-34 Female Population Proportion,Ages 30-34 Male Population Proportion,Population,Urban Population Proportion
0,Afghanistan,,,,,9.395727,1978.961579,,3.21378,1.175547,56.910808,10.073862,10.098379,7.90726,8.063549,6.221683,6.503323,38928341.0,26.026
1,Albania,,,,,5.262714,13295.410885,33.2,3.94576,1.989036,18.200603,7.853874,8.013442,8.138641,8.828823,7.16807,8.372488,2837743.0,62.112
2,Algeria,,,,,6.218427,10681.679297,27.6,6.10036,0.611072,38.884011,6.769581,6.910889,8.120237,8.17923,8.527349,8.474931,43851043.0,73.733
3,American Samoa,,,,,,,,,41.802009,12.47382,,,,,,,55197.0,87.153
4,Andorra,,,,,6.710331,,,3.15061,59.713649,10.307621,,,,,,,77265.0,87.916


Dop columns not useful for modeling

In [41]:
df.drop(columns=['Gold', 'Silver', 'Bronze'], inplace=True)
df.head()

Unnamed: 0,Country,Totals,Health Expenditure (% of GDP),GDP Per Capita,Gini Index,Education Expenditure (% of GDP),Migrant Population Proportion,Air Pollution Exposure,Ages 20-24 Female Population Proportion,Ages 20-24 Male Population Proportion,Ages 25-29 Female Population Proportion,Ages 25-29 Male Population Proportion,Ages 30-34 Female Population Proportion,Ages 30-34 Male Population Proportion,Population,Urban Population Proportion
0,Afghanistan,,9.395727,1978.961579,,3.21378,1.175547,56.910808,10.073862,10.098379,7.90726,8.063549,6.221683,6.503323,38928341.0,26.026
1,Albania,,5.262714,13295.410885,33.2,3.94576,1.989036,18.200603,7.853874,8.013442,8.138641,8.828823,7.16807,8.372488,2837743.0,62.112
2,Algeria,,6.218427,10681.679297,27.6,6.10036,0.611072,38.884011,6.769581,6.910889,8.120237,8.17923,8.527349,8.474931,43851043.0,73.733
3,American Samoa,,,,,,41.802009,12.47382,,,,,,,55197.0,87.153
4,Andorra,,6.710331,,,3.15061,59.713649,10.307621,,,,,,,77265.0,87.916


Calculate Ages 20-34 Population Proportion interaction between features and drop columns used to calculate interaction

In [42]:
df['Ages 20-34 Population Proportion'] = ((df['Ages 20-24 Female Population Proportion'] +
                                           df['Ages 25-29 Female Population Proportion'] + 
                                           df['Ages 30-34 Female Population Proportion'])
                                          +
                                          (df['Ages 20-24 Male Population Proportion'] + 
                                           df['Ages 25-29 Male Population Proportion'] + 
                                           df['Ages 30-34 Male Population Proportion']))/2
df.drop(columns=['Ages 20-24 Female Population Proportion', 'Ages 20-24 Male Population Proportion',
                'Ages 25-29 Female Population Proportion', 'Ages 25-29 Male Population Proportion',
                'Ages 30-34 Female Population Proportion', 'Ages 30-34 Male Population Proportion'], inplace=True)
df.head()

Unnamed: 0,Country,Totals,Health Expenditure (% of GDP),GDP Per Capita,Gini Index,Education Expenditure (% of GDP),Migrant Population Proportion,Air Pollution Exposure,Population,Urban Population Proportion,Ages 20-34 Population Proportion
0,Afghanistan,,9.395727,1978.961579,,3.21378,1.175547,56.910808,38928341.0,26.026,24.434028
1,Albania,,5.262714,13295.410885,33.2,3.94576,1.989036,18.200603,2837743.0,62.112,24.187669
2,Algeria,,6.218427,10681.679297,27.6,6.10036,0.611072,38.884011,43851043.0,73.733,23.491109
3,American Samoa,,,,,,41.802009,12.47382,55197.0,87.153,
4,Andorra,,6.710331,,,3.15061,59.713649,10.307621,77265.0,87.916,


use df.info to see what further scrubbing needed

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 203 entries, 0 to 202
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country                           203 non-null    object 
 1   Totals                            92 non-null     float64
 2   Health Expenditure (% of GDP)     189 non-null    float64
 3   GDP Per Capita                    188 non-null    float64
 4   Gini Index                        153 non-null    float64
 5   Education Expenditure (% of GDP)  177 non-null    float64
 6   Migrant Population Proportion     202 non-null    float64
 7   Air Pollution Exposure            191 non-null    float64
 8   Population                        203 non-null    float64
 9   Urban Population Proportion       202 non-null    float64
 10  Ages 20-34 Population Proportion  189 non-null    float64
dtypes: float64(10), object(1)
memory usage: 19.0+ KB


Replace NaN values in Totals column with 0

In [44]:
totals_replace_dict = {np.nan: 0}
df['Totals'] = df['Totals'].replace(totals_replace_dict)
df.head()

Unnamed: 0,Country,Totals,Health Expenditure (% of GDP),GDP Per Capita,Gini Index,Education Expenditure (% of GDP),Migrant Population Proportion,Air Pollution Exposure,Population,Urban Population Proportion,Ages 20-34 Population Proportion
0,Afghanistan,0.0,9.395727,1978.961579,,3.21378,1.175547,56.910808,38928341.0,26.026,24.434028
1,Albania,0.0,5.262714,13295.410885,33.2,3.94576,1.989036,18.200603,2837743.0,62.112,24.187669
2,Algeria,0.0,6.218427,10681.679297,27.6,6.10036,0.611072,38.884011,43851043.0,73.733,23.491109
3,American Samoa,0.0,,,,,41.802009,12.47382,55197.0,87.153,
4,Andorra,0.0,6.710331,,,3.15061,59.713649,10.307621,77265.0,87.916,


Drop rows missing 4 or more features

In [46]:
df.drop(index=df[df.isna().sum(axis=1)>=4].index, inplace=True)
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,Country,Totals,Health Expenditure (% of GDP),GDP Per Capita,Gini Index,Education Expenditure (% of GDP),Migrant Population Proportion,Air Pollution Exposure,Population,Urban Population Proportion,Ages 20-34 Population Proportion
0,Afghanistan,0.0,9.395727,1978.961579,,3.21378,1.175547,56.910808,38928341.0,26.026,24.434028
1,Albania,0.0,5.262714,13295.410885,33.2,3.94576,1.989036,18.200603,2837743.0,62.112,24.187669
2,Algeria,0.0,6.218427,10681.679297,27.6,6.10036,0.611072,38.884011,43851043.0,73.733,23.491109
3,Andorra,0.0,6.710331,,,3.15061,59.713649,10.307621,77265.0,87.916,
4,Angola,0.0,2.549005,6198.083841,51.3,1.82118,0.427005,32.388505,32866268.0,66.825,22.352392


Impute missing values

In [89]:
impute_df = df.drop(columns=['Country'])
impute_x = impute_df.drop(columns=['Totals'])
impute_y = impute_df['Totals']
impute_x_train, impute_x_test, impute_y_train, impute_y_test = train_test_split(impute_x, impute_y
                                                                                , random_state=93)
impute_ss = StandardScaler()
impute_ss.fit(impute_x_train)
impute_x_train = pd.DataFrame(impute_ss.transform(impute_x_train), columns=impute_x_train.columns)
impute_x_test = pd.DataFrame(impute_ss.transform(impute_x_test), columns=impute_x_test.columns)

train_imputer = KNNImputer()
impute_x_train = pd.DataFrame(train_imputer.fit_transform(impute_x_train), columns=impute_x.columns)
impute_x_train = pd.DataFrame(impute_ss.inverse_transform(impute_x_train), columns = impute_x_train.columns)
impute_x_train.index = impute_y_train.index

test_imputer = KNNImputer()
impute_x_test = pd.DataFrame(test_imputer.fit_transform(impute_x_test), columns=impute_x.columns)
impute_x_test = pd.DataFrame(impute_ss.inverse_transform(impute_x_test), columns = impute_x_test.columns)
impute_x_test.index = impute_y_test.index

scaled_df = pd.concat([impute_x_train, impute_x_test], axis=0)
scaled_df.sort_index(inplace=True)
scaled_df['Medals'] = df['Totals']

In [115]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country                           193 non-null    object 
 1   Totals                            193 non-null    float64
 2   Health Expenditure (% of GDP)     187 non-null    float64
 3   GDP Per Capita                    185 non-null    float64
 4   Gini Index                        152 non-null    float64
 5   Education Expenditure (% of GDP)  174 non-null    float64
 6   Migrant Population Proportion     193 non-null    float64
 7   Air Pollution Exposure            187 non-null    float64
 8   Population                        193 non-null    float64
 9   Urban Population Proportion       193 non-null    float64
 10  Ages 20-34 Population Proportion  186 non-null    float64
dtypes: float64(10), object(1)
memory usage: 16.7+ KB


In [114]:
df['Totals'].eq(scaled_df['Medals']).values

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [105]:
scaled_df.iloc[68]

Health Expenditure (% of GDP)       7.715441e+00
GDP Per Capita                      2.728708e+04
Gini Index                          3.290000e+01
Education Expenditure (% of GDP)    3.593950e+00
Migrant Population Proportion       1.134238e+01
Air Pollution Exposure              1.621827e+01
Population                          1.071555e+07
Urban Population Proportion         7.971500e+01
Ages 20-34 Population Proportion    1.589443e+01
Medals                              4.000000e+00
Name: 68, dtype: float64

In [None]:
drop dependent variable
train test split
standard scale
impute missing values
remerge




In [104]:
impute_x_test

Unnamed: 0,Health Expenditure (% of GDP),GDP Per Capita,Gini Index,Education Expenditure (% of GDP),Migrant Population Proportion,Air Pollution Exposure,Population,Urban Population Proportion,Ages 20-34 Population Proportion
68,7.715441,27287.083401,32.9,3.59395,11.342377,16.218266,10715549.0,79.715,15.894429
60,9.037323,47090.725779,27.3,6.2693,5.739683,5.861331,5530719.0,85.517,18.415371
154,16.06308,1648.05336,35.7,9.26147,1.413457,21.625947,7976985.0,42.923,24.563726
65,11.429951,51259.239556,31.9,4.99274,14.879051,12.028767,83240525.0,77.453,17.698328
80,2.87053,11444.960683,38.2,2.84185,0.127676,16.502653,273523621.0,56.641,23.230719
50,8.136465,10329.198753,45.7,4.13413,2.400299,14.886101,17643060.0,64.166,24.935185
63,3.093796,2159.441909,35.9,2.8753,9.670886,33.982639,2416664.0,62.582,24.394556
41,4.190038,5174.100553,41.5,3.7004,9.582599,25.886266,26378275.0,51.706,24.272789
175,6.931829,23822.92306,35.98,3.56124,3.66763,24.108568,1399491.0,53.214,21.429955
188,3.56269,9668.786391,36.7,1.34408,4.514737,17.008554,28435943.0,88.279,21.786909


In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country                           193 non-null    object 
 1   Totals                            193 non-null    float64
 2   Health Expenditure (% of GDP)     187 non-null    float64
 3   GDP Per Capita                    185 non-null    float64
 4   Gini Index                        152 non-null    float64
 5   Education Expenditure (% of GDP)  174 non-null    float64
 6   Migrant Population Proportion     193 non-null    float64
 7   Air Pollution Exposure            187 non-null    float64
 8   Population                        193 non-null    float64
 9   Urban Population Proportion       193 non-null    float64
 10  Ages 20-34 Population Proportion  186 non-null    float64
dtypes: float64(10), object(1)
memory usage: 16.7+ KB


In [52]:
df

Unnamed: 0,Country,Totals,Health Expenditure (% of GDP),GDP Per Capita,Gini Index,Education Expenditure (% of GDP),Migrant Population Proportion,Air Pollution Exposure,Population,Urban Population Proportion,Ages 20-34 Population Proportion
0,Afghanistan,0.0,9.395727,1978.961579,,3.21378,1.175547,56.910808,38928341.0,26.026,24.434028
1,Albania,0.0,5.262714,13295.410885,33.2,3.94576,1.989036,18.200603,2837743.0,62.112,24.187669
2,Algeria,0.0,6.218427,10681.679297,27.6,6.10036,0.611072,38.884011,43851043.0,73.733,23.491109
3,Andorra,0.0,6.710331,,,3.15061,59.713649,10.307621,77265.0,87.916,
4,Angola,0.0,2.549005,6198.083841,51.3,1.82118,0.427005,32.388505,32866268.0,66.825,22.352392
...,...,...,...,...,...,...,...,...,...,...,...
188,Venezuela,4.0,3.562690,,,1.34408,4.514737,17.008554,28435943.0,88.279,21.786909
189,Vietnam,0.0,5.917897,8200.331867,35.7,4.06197,0.077897,29.626728,97338583.0,37.340,24.478725
190,Yemen,0.0,4.882502,,36.7,,1.282529,50.456007,29825968.0,37.908,27.029662
191,Zambia,0.0,4.934843,3270.035112,57.1,4.46518,0.789026,27.438035,18383956.0,44.629,23.790278
