In [32]:
import numpy as np
import pandas as pd
import plotly.express as px

# Athletes

In [33]:
# Load the athlete records from the JSON export.
athlete_dataset = pd.read_json('../olympic_athletes.json')

# Number of missing values per column (the only output this cell renders).
athlete_dataset.isna().sum()

athlete_url                 0
athlete_full_name           0
games_participations        0
first_game                 22
athlete_year_birth       2456
athlete_medals          60552
bio                     53062
dtype: int64

# Hosts

In [34]:
# Load the host-city table from the XML export and discard its 'index' column.
hosts_dataset = pd.read_xml('../olympic_hosts.xml').drop(columns=['index'])

# Number of missing values per column (the only output this cell renders).
hosts_dataset.isna().sum()

game_slug          0
game_end_date      0
game_start_date    0
game_location      0
game_name          0
game_season        0
game_year          0
dtype: int64

# Medals

In [35]:
# Load the medal table, rename the Games key to 'game_slug' (the name used by
# the hosts table), and drop the 'Unnamed: 0', 'participant_title' and
# 'athlete_url' columns.
# NOTE(review): this table contains rows flagged by duplicated(); they are left
# in place. Polo (paris-1900, london-1908) and men's Hockey (london-1908) are
# the slices to look at when revisiting that decision.
medals_dataset = (
    pd.read_excel('../olympic_medals.xlsx')
    .rename(columns={'slug_game': 'game_slug'})
    .drop(columns=['Unnamed: 0', 'participant_title', 'athlete_url'])
)

# Last rows of the cleaned table (the only output this cell renders).
medals_dataset.tail()

Unnamed: 0,discipline_title,game_slug,event_title,event_gender,medal_type,participant_type,athlete_full_name,country_name,country_code,country_3_letter_code
21692,Weightlifting,athens-1896,heavyweight - one hand lift men,Men,SILVER,Athlete,Viggo JENSEN,Denmark,DK,DEN
21693,Weightlifting,athens-1896,heavyweight - one hand lift men,Men,BRONZE,Athlete,Alexandros Nikolopoulos,Greece,GR,GRE
21694,Weightlifting,athens-1896,heavyweight - two hand lift men,Men,GOLD,Athlete,Viggo JENSEN,Denmark,DK,DEN
21695,Weightlifting,athens-1896,heavyweight - two hand lift men,Men,SILVER,Athlete,Launceston ELLIOT,Great Britain,GB,GBR
21696,Weightlifting,athens-1896,heavyweight - two hand lift men,Men,BRONZE,Athlete,Sotirios VERSIS,Greece,GR,GRE


In [36]:
# Results

In [37]:
# Load the first table found in the HTML export, drop the 'Unnamed: 0' and
# 'athlete_url' columns, and rename the Games key to 'game_slug' so it matches
# the hosts table.
results_dataset = (
    pd.read_html('../olympic_results.html')[0]
    .drop(columns=['Unnamed: 0', 'athlete_url'])
    .rename(columns={'slug_game': 'game_slug'})
)

# Remaining column names (the only output this cell renders).
results_dataset.columns

Index(['discipline_title', 'event_title', 'game_slug', 'participant_type',
       'medal_type', 'athletes', 'rank_equal', 'rank_position', 'country_name',
       'country_code', 'country_3_letter_code', 'athlete_full_name',
       'value_unit', 'value_type'],
      dtype='object')

# Separate winter and summer games

In [38]:
# Attach season and year to every result row through the shared 'game_slug' key.
game_types = hosts_dataset.loc[:, ['game_slug', 'game_season', 'game_year']]
merged_hosts_results = pd.merge(results_dataset, game_types, on='game_slug')

# Split by season. The summer frame drops 'game_season' (constant after the
# filter); the winter frame keeps every column.
is_summer = merged_hosts_results['game_season'] == 'Summer'
is_winter = merged_hosts_results['game_season'] == 'Winter'
summer_results = merged_hosts_results[is_summer].copy()
summer_results = summer_results.drop(columns=['game_season'])
winter_results = merged_hosts_results[is_winter].copy()

# Summer rows that exactly repeat an earlier row (the output this cell renders).
# They are only listed here, not removed.
duplicate_summer_results = summer_results[summer_results.duplicated()]
duplicate_summer_results

Unnamed: 0,discipline_title,event_title,game_slug,participant_type,medal_type,athletes,rank_equal,rank_position,country_name,country_code,country_3_letter_code,athlete_full_name,value_unit,value_type,game_year
154361,Water Polo,Water Polo Women,antwerp-1920,GameTeam,,,,1,Netherlands,NL,NED,,2.0,SCORE,1920
155336,Sailing,8m mixed,stockholm-1912,GameTeam,,,,5,Russian Federation,RU,RUS,,0.0,POINTS,1912
157014,Shooting,trap 125 targets men,stockholm-1912,Athlete,GOLD,,,1,United States of America,US,USA,Jay Graham,96.0,POINTS,1912
157015,Shooting,trap 125 targets men,stockholm-1912,Athlete,SILVER,,,2,Germany,DE,GER,Alfred GOELDEL,94.0,POINTS,1912
157016,Shooting,trap 125 targets men,stockholm-1912,Athlete,BRONZE,,,3,Russian Federation,RU,RUS,Harry Blaus,91.0,POINTS,1912
157017,Shooting,trap 125 targets men,stockholm-1912,Athlete,,,,4,Great Britain,GB,GBR,Harry Robinson HUMBY,88.0,POINTS,1912
157018,Shooting,trap 125 targets men,stockholm-1912,Athlete,,,,5,Germany,DE,GER,Albert PREUSS,88.0,POINTS,1912
157019,Shooting,trap 125 targets men,stockholm-1912,Athlete,,,,6,Greece,GR,GRE,Anastasios METAXAS,88.0,POINTS,1912
157020,Shooting,trap 125 targets men,stockholm-1912,Athlete,,,,7,Germany,DE,GER,"Franz, Baron von Zedlitz und Leipe",88.0,POINTS,1912
157021,Shooting,trap 125 targets men,stockholm-1912,Athlete,,,,8,Finland,FI,FIN,Gustaf-Adolf SCHNITT,88.0,POINTS,1912


# Calculate the number of medals of each type per Games and country

In [39]:
# Flag every summer result row with 0/1 medal indicators, then total them per
# (game_year, country_name).

# Rows without a medal get the literal label 'None' so every row has a string label.
summer_results['medal_type'] = summer_results['medal_type'].fillna('None')

# Vectorised comparisons (instead of a Python lambda per row); the explicit
# int64 cast keeps the indicator columns as integer 0/1.
summer_results['total_medals'] = summer_results['medal_type'].ne('None').astype('int64')
for medal_label, indicator_column in [
    ('GOLD', 'gold_medals'),
    ('SILVER', 'silver_medals'),
    ('BRONZE', 'bronze_medals'),
]:
    summer_results[indicator_column] = summer_results['medal_type'].eq(medal_label).astype('int64')

medal_columns = ['total_medals', 'gold_medals', 'silver_medals', 'bronze_medals']

# One row per (game_year, country_name), ordered chronologically and, within a
# Games, from the most to the fewest medals.
medals_by_country = (
    summer_results
    .groupby(['game_year', 'country_name'])[medal_columns]
    .sum()
    .sort_values(by=['game_year', 'total_medals'], ascending=[True, False])
    .reset_index()
)
medals_by_country

Unnamed: 0,game_year,country_name,total_medals,gold_medals,silver_medals,bronze_medals
0,1896,Greece,47,10,18,19
1,1896,United States of America,20,11,7,2
2,1896,Germany,13,6,5,2
3,1896,France,11,5,4,2
4,1896,Great Britain,7,2,3,2
5,1896,Denmark,6,1,2,3
6,1896,Hungary,6,2,1,3
7,1896,Austria,5,2,1,2
8,1896,Switzerland,3,1,2,0
9,1896,Australia,2,2,0,0


# Calculate the number of disciplines and events per Games and country

In [40]:
def _count_distinct_per_country(results, column, count_name):
    """Count the distinct values of ``column`` per (game_year, country_name).

    Parameters
    ----------
    results : pandas.DataFrame
        Frame containing 'game_year', 'country_name' and ``column``.
    column : str
        Column whose distinct values are counted.
    count_name : str
        Name of the resulting count column.

    Returns
    -------
    pandas.DataFrame
        Columns 'game_year', 'country_name' and ``count_name``, one row per
        (game_year, country_name), sorted by those two keys. Rows with a
        missing value in any of the three columns are ignored.
    """
    return (
        results[['game_year', 'country_name', column]]
        .dropna()
        .drop_duplicates()
        .groupby(['game_year', 'country_name'])
        .size()
        .reset_index(name=count_name)
    )


# Number of distinct disciplines each country appears in at each Games.
sports_by_country = _count_distinct_per_country(summer_results, 'discipline_title', 'sports')

# Number of distinct event titles each country appears in at each Games.
# NOTE(review): events are told apart by title only, so two events with the
# same title in different disciplines are counted once -- confirm this is intended.
events_by_country = _count_distinct_per_country(summer_results, 'event_title', 'events')

# Combine medal totals with the participation breadth. Both merges use the
# default inner join on (game_year, country_name).
olympic_data = (
    medals_by_country
    .merge(sports_by_country, on=['game_year', 'country_name'])
    .merge(events_by_country, on=['game_year', 'country_name'])
    .sort_values(by=['game_year', 'total_medals'], ascending=[True, False])
)
olympic_data

Unnamed: 0,game_year,country_name,total_medals,gold_medals,silver_medals,bronze_medals,sports,events
0,1896,Greece,47,10,18,19,10,39
1,1896,United States of America,20,11,7,2,3,17
2,1896,Germany,13,6,5,2,7,27
3,1896,France,11,5,4,2,7,20
4,1896,Great Britain,7,2,3,2,9,20
5,1896,Denmark,6,1,2,3,6,14
6,1896,Hungary,6,2,1,3,6,18
7,1896,Austria,5,2,1,2,3,8
8,1896,Switzerland,3,1,2,0,2,5
9,1896,Australia,2,2,0,0,2,4


In [41]:
# Per-country history features:
#   game_part            -- 0-based index of this appearance among the country's
#                           summer Games present in the data
#   previous_game_*      -- the country's medal counts at its previous appearance
#                           (0 for its first appearance)
# Computed for all countries at once with groupby, which avoids slicing and
# re-concatenating one frame per country and needs no warning suppression.
medal_columns = ['total_medals', 'gold_medals', 'silver_medals', 'bronze_medals']
lag_columns = {
    'total_medals': 'previous_game_medal',
    'gold_medals': 'previous_game_gold',
    'silver_medals': 'previous_game_silver',
    'bronze_medals': 'previous_game_bronze',
}
history_columns = ['game_part', *lag_columns.values()]

# Medal totals per (country_name, game_year). groupby sorts by its keys, so the
# rows of each country are in chronological order -- cumcount() and shift()
# below rely on that ordering.
games_participation = summer_results.groupby(['country_name', 'game_year'])[medal_columns].sum()
games_participation_reset = games_participation.reset_index()

by_country = games_participation_reset.groupby('country_name')
participation_history = games_participation_reset.assign(
    game_part=by_country.cumcount(),
    **{
        lag_column: by_country[medal_column].shift(1, fill_value=0)
        for medal_column, lag_column in lag_columns.items()
    },
)

# Keep only the merge keys and the new features.
participation_history = participation_history.sort_values(
    by=['game_year', 'total_medals'], ascending=[True, False]
)[['game_year', 'country_name', *history_columns]]

# Historical name of this frame (it holds every country, not just France);
# kept as an alias in case another cell still refers to it.
games_participation_france = participation_history

# Dropping any existing history columns first makes the cell safe to re-run:
# without it a second execution would produce '_x' / '_y' suffixed duplicates.
olympic_data = (
    olympic_data
    .drop(columns=history_columns, errors='ignore')
    .merge(participation_history, on=['game_year', 'country_name'])
    .sort_values(by=['game_year', 'total_medals'], ascending=[True, False])
)

# Training set to prepare for regression models

In [42]:
# Integer id for every country, numbered in order of first appearance in olympic_data.
# NOTE(review): these ids are arbitrary, yet a linear model will treat them as
# ordered magnitudes -- consider one-hot or target encoding.
country_names = list(olympic_data.country_name.unique())
country_dict = {country: idx for idx, country in enumerate(country_names)}

# Temporal split: Games before 2020 are used for fitting, the 2020 Games for evaluation.
train_data = olympic_data.loc[olympic_data.game_year < 2020]
test_data = olympic_data.loc[olympic_data.game_year == 2020]

feature_columns = [
    'country_name', 'sports', 'events', 'game_part',
    'previous_game_medal', 'previous_game_gold', 'previous_game_silver', 'previous_game_bronze',
]

# Encode only the 'country_name' column. Series.map yields integer ids directly,
# whereas a frame-wide replace() scans every column and depends on implicit
# object -> int downcasting. Every country is a key of country_dict because the
# dict is built from the same frame, so map() introduces no missing values.
X_train = train_data[feature_columns].assign(
    country_name=train_data['country_name'].map(country_dict))
X_test = test_data[feature_columns].assign(
    country_name=test_data['country_name'].map(country_dict))

# One target per medal type.
y_train_total = train_data['total_medals']
y_train_gold = train_data['gold_medals']
y_train_silver = train_data['silver_medals']
y_train_bronze = train_data['bronze_medals']

y_test_total = test_data['total_medals']
y_test_gold = test_data['gold_medals']
y_test_silver = test_data['silver_medals']
y_test_bronze = test_data['bronze_medals']

## Linear Regression

In [43]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fit ordinary least squares on the pre-2020 Games and predict the 2020 totals.
linear_model = LinearRegression().fit(X_train, y_train_total)
linear_predictions = linear_model.predict(X_test)

# First ten predictions (rounded to whole medals) next to the first ten actual values.
print('Predicted labels : ', linear_predictions[:10].round())
print('Actual labels : ', y_test_total.iloc[:10])

# Error metrics are computed on the unrounded predictions.
linear_mse = mean_squared_error(y_test_total, linear_predictions)
linear_rmse = np.sqrt(linear_mse)
linear_r2 = r2_score(y_test_total, linear_predictions)
print("MSE:", linear_mse)
print("RMSE:", linear_rmse)
print("R2:", linear_r2)

# Comparison table, indexed like test_data; the tree and boosting cells add
# their own prediction columns to it.
results_2020_total_medals = (
    test_data[['country_name']]
    .rename(columns={'country_name': 'country'})
    .assign(
        linear_total_medals_pred=linear_predictions.round(),
        total_medals_actual=test_data['total_medals'],
    )
)

# Show every row of displayed frames from here on (global pandas option).
pd.set_option('display.max_rows', None)

results_2020_total_medals

Predicted labels :  [102.  61.  37.  60.  37.  31.  33.  45.  18.  30.]
Actual labels :  2699    113
2700     88
2701     70
2702     65
2703     57
2704     45
2705     40
2706     37
2707     36
2708     33
Name: total_medals, dtype: int64
MSE: 32.30623748434444
RMSE: 5.683857623511029
R2: 0.8449657359930957


Unnamed: 0,country,linear_total_medals_pred,total_medals_actual
2699,United States of America,102.0,113
2700,People's Republic of China,61.0,88
2701,ROC,37.0,70
2702,Great Britain,60.0,65
2703,Japan,37.0,57
2704,Australia,31.0,45
2705,Italy,33.0,40
2706,Germany,45.0,37
2707,Netherlands,18.0,36
2708,France,30.0,33


## Decision Tree Regressor

In [44]:
from sklearn.tree import DecisionTreeRegressor

# Fit a single regression tree on the pre-2020 Games and predict the 2020 totals.
# random_state is fixed because scikit-learn permutes the features at every
# split and breaks ties between equally good splits at random; without a seed
# the fitted tree (and every metric below) can change from run to run.
tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(X_train, y_train_total)
tree_predictions = tree_model.predict(X_test)

# First ten predictions (rounded to whole medals) next to the first ten actual values.
print('Predicted labels : ', np.round(tree_predictions)[:10])
print('Actual labels : ', y_test_total[:10])

# Error metrics are computed on the unrounded predictions.
tree_mse = mean_squared_error(y_test_total, tree_predictions)
print("MSE:", tree_mse)

tree_rmse = np.sqrt(tree_mse)
print("RMSE:", tree_rmse)

tree_r2 = r2_score(y_test_total, tree_predictions)
print("R2:", tree_r2)

# Add the tree's rounded predictions to the comparison table.
results_2020_total_medals['tree_total_medals_pred'] = np.round(tree_predictions)
results_2020_total_medals

Predicted labels :  [104.  56. 112.  63.  19.  29.  29.  38.  19.  10.]
Actual labels :  2699    113
2700     88
2701     70
2702     65
2703     57
2704     45
2705     40
2706     37
2707     36
2708     33
Name: total_medals, dtype: int64
MSE: 34.52105263157895
RMSE: 5.875461907933618
R2: 0.8343370691163279


Unnamed: 0,country,linear_total_medals_pred,total_medals_actual,tree_total_medals_pred
2699,United States of America,102.0,113,104.0
2700,People's Republic of China,61.0,88,56.0
2701,ROC,37.0,70,112.0
2702,Great Britain,60.0,65,63.0
2703,Japan,37.0,57,19.0
2704,Australia,31.0,45,29.0
2705,Italy,33.0,40,29.0
2706,Germany,45.0,37,38.0
2707,Netherlands,18.0,36,19.0
2708,France,30.0,33,10.0


## Gradient Boosting Regressor

In [45]:
from sklearn.ensemble import GradientBoostingRegressor

# Fit gradient-boosted trees on the pre-2020 Games and predict the 2020 totals.
# random_state seeds the individual trees (including the random feature
# permutation at each split), so the reported metrics are reproducible.
gbr_model = GradientBoostingRegressor(random_state=42)
gbr_model.fit(X_train, y_train_total)
gbr_predictions = gbr_model.predict(X_test)

# First ten predictions (rounded to whole medals) next to the first ten actual values.
print('Predicted labels : ', np.round(gbr_predictions)[:10])
print('Actual labels : ', y_test_total[:10])

# Error metrics are computed on the unrounded predictions.
gbr_mse = mean_squared_error(y_test_total, gbr_predictions)
print("MSE:", gbr_mse)

gbr_rmse = np.sqrt(gbr_mse)
print("RMSE:", gbr_rmse)

gbr_r2 = r2_score(y_test_total, gbr_predictions)
print("R2:", gbr_r2)

# Add the boosted model's rounded predictions to the comparison table.
results_2020_total_medals['gbr_total_medals_pred'] = np.round(gbr_predictions)
results_2020_total_medals.columns

Predicted labels :  [112.  60.  52.  66.  38.  28.  30.  38.  19.  34.]
Actual labels :  2699    113
2700     88
2701     70
2702     65
2703     57
2704     45
2705     40
2706     37
2707     36
2708     33
Name: total_medals, dtype: int64
MSE: 15.142308201996796
RMSE: 3.8913118870114736
R2: 0.9273336423469333


Index(['country', 'linear_total_medals_pred', 'total_medals_actual',
       'tree_total_medals_pred', 'gbr_total_medals_pred'],
      dtype='object')

### Results

In [46]:
# Put the three model predictions side by side, with the actual totals last.
column_order = [
    'country',
    'linear_total_medals_pred',
    'tree_total_medals_pred',
    'gbr_total_medals_pred',
    'total_medals_actual',
]
results_2020_total_medals = results_2020_total_medals.loc[:, column_order]
results_2020_total_medals

Unnamed: 0,country,linear_total_medals_pred,tree_total_medals_pred,gbr_total_medals_pred,total_medals_actual
2699,United States of America,102.0,104.0,112.0,113
2700,People's Republic of China,61.0,56.0,60.0,88
2701,ROC,37.0,112.0,52.0,70
2702,Great Britain,60.0,63.0,66.0,65
2703,Japan,37.0,19.0,38.0,57
2704,Australia,31.0,29.0,28.0,45
2705,Italy,33.0,29.0,30.0,40
2706,Germany,45.0,38.0,38.0,37
2707,Netherlands,18.0,19.0,19.0,36
2708,France,30.0,10.0,34.0,33
