In [1]:
# Importing the necessary libraries
import warnings
warnings.simplefilter('ignore')
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from numpy import isnan
import numpy as np

In [2]:
# Load the Olympic results CSV; the literal string "NA" is parsed as missing.
olympic_data = pd.read_csv(
    "recent_olympic_data.csv",
    na_values="NA",
)

In [3]:
# Preview the first rows to sanity-check that the file loaded as expected
olympic_data.head()

Unnamed: 0,id,sex,age,height,weight,noc,year,season,city,sport,event,medal,region
0,1,M,24.0,180.0,80.0,CHN,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,,China
1,2,M,23.0,170.0,60.0,CHN,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,,China
2,5,F,21.0,185.0,82.0,NED,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,,Netherlands
3,5,F,21.0,185.0,82.0,NED,1988,Winter,Calgary,Speed Skating,"Speed Skating Women's 1,000 metres",,Netherlands
4,5,F,25.0,185.0,82.0,NED,1992,Winter,Albertville,Speed Skating,Speed Skating Women's 500 metres,,Netherlands


In [4]:
# Collect the column labels into a plain Python list and display it
olympic_data_columns = list(olympic_data.columns)
olympic_data_columns


['id',
 'sex',
 'age',
 'height',
 'weight',
 'noc',
 'year',
 'season',
 'city',
 'sport',
 'event',
 'medal',
 'region']

In [5]:
# Show the dtype pandas inferred for each column
olympic_data.dtypes

id          int64
sex        object
age       float64
height    float64
weight    float64
noc        object
year        int64
season     object
city       object
sport      object
event      object
medal      object
region     object
dtype: object

In [6]:
# Summary statistics from DataFrame.describe(); the result is stored in
# data_stats rather than displayed by this cell
data_stats = olympic_data.describe()

In [7]:
# Replace missing 'medal' entries with the explicit label 'No Medal',
# then preview the frame to confirm the change
olympic_data['medal'] = olympic_data['medal'].fillna('No Medal')
olympic_data.head()

Unnamed: 0,id,sex,age,height,weight,noc,year,season,city,sport,event,medal,region
0,1,M,24.0,180.0,80.0,CHN,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,No Medal,China
1,2,M,23.0,170.0,60.0,CHN,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,No Medal,China
2,5,F,21.0,185.0,82.0,NED,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,No Medal,Netherlands
3,5,F,21.0,185.0,82.0,NED,1988,Winter,Calgary,Speed Skating,"Speed Skating Women's 1,000 metres",No Medal,Netherlands
4,5,F,25.0,185.0,82.0,NED,1992,Winter,Albertville,Speed Skating,Speed Skating Women's 500 metres,No Medal,Netherlands


In [8]:
# Numeric columns handed to the imputer.
# NOTE(review): 'year' is reported as int64 above, so it has no missing values
# of its own; presumably it is included only as a predictor for the other
# columns -- confirm that is intended.
cols_to_impute = ['year', 'age', 'height', 'weight']
numeric_block = olympic_data[cols_to_impute]

# Clamp imputed values to the per-column range observed in the data
iter_imp = IterativeImputer(
    min_value=numeric_block.min(),
    max_value=numeric_block.max(),
)

# Fit the imputer, fill the gaps, and write the result back into the frame
imputed_cols = iter_imp.fit_transform(numeric_block)
olympic_data[cols_to_impute] = imputed_cols

In [9]:
# Sanity check after imputation: count the missing values that remain in every
# column except 'region'.
# NOTE(review): the X and y built here are scratch values -- both names are
# rebound to the real model features/target before any model is fitted.
held_out_col = 'region'
# Look the column up by name instead of relying on the positional index 12,
# so the check keeps working if columns are added or reordered.
target_idx = olympic_data.columns.get_loc(held_out_col)
data = olympic_data.values
ix = [i for i in range(data.shape[1]) if i != target_idx]
X, y = data[:, ix], data[:, target_idx]
# Vectorised count; avoids a Python-level sum() over every cell of the array
print('Missing: %d' % pd.isnull(X).sum())

Missing: 0


In [10]:
# NOC codes of the five delegations kept for modelling (chosen by the author
# as the countries with the most data)
countries = [
    "USA",
    "BRA",
    "GER",
    "AUS",
    "FRA",
]
# NOTE(review): 'sports' is not referenced by any of the cells shown here
sports = ["Athletics"]

In [11]:
# Keep Summer-Games rows for the selected countries that have age, height and
# weight, then derive height in metres and BMI (weight / height_m^2).
keep_rows = (
    olympic_data['noc'].isin(countries)
    & (olympic_data['season'] == 'Summer')
    & olympic_data['height'].notna()
    & olympic_data['age'].notna()
    & olympic_data['weight'].notna()
)
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the unfiltered data (chained-assignment hazard).
olympic_data = olympic_data[keep_rows].copy()
olympic_data['height (m)'] = olympic_data['height'] / 100
olympic_data['BMI'] = (
    olympic_data['weight'] / (olympic_data['height (m)'] * olympic_data['height (m)'])
).round(2)

In [12]:
# Collapse the three podium values into a single positive class, 'medal'.
# Everything else keeps the 'No Medal' label assigned when the missing values
# were filled earlier in the notebook.
podium = ['Gold', 'Silver', 'Bronze']
olympic_data.loc[olympic_data['medal'].isin(podium), 'medal'] = 'medal'
# Fallback for any entry that is still missing. It must use the same spelling
# as the existing negative label ('No Medal'); a different spelling such as
# 'No-Medal' would silently introduce a third class.
olympic_data.loc[olympic_data['medal'].isna(), 'medal'] = 'No Medal'

In [13]:
# Hold out the 2016 Games: rows before 2016 go to olympics_df, rows from
# 2016 go to olympics_2016
games_year = olympic_data['year']
olympics_df = olympic_data[games_year.lt(2016)]
olympics_2016 = olympic_data[games_year.eq(2016)]

In [14]:
# Model inputs: age, height, weight and BMI plus one-hot encoded sex and NOC;
# the target is the binary medal label
feature_frame = olympics_df[["sex", "age", "height", "weight", "BMI", "noc"]]
X = pd.get_dummies(feature_frame)
y = olympics_df["medal"]
print(X.shape, y.shape)


(24997, 11) (24997,)


In [15]:
# Random train/test split of the pre-2016 rows (seeded so it is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Encode the 2016 rows, then align them to the training columns. The 2016
# frame is one-hot encoded on its own, so a category absent from 2016 would
# otherwise yield a different column set/order than the models were fitted on;
# reindex restores the training layout and fills any absent dummy with 0.
X_2016 = pd.get_dummies(
    olympics_2016[["sex", "age", "height", "weight", "BMI", "noc"]]
).reindex(columns=X.columns, fill_value=0)
y_2016 = olympics_2016["medal"]

In [16]:
# Baseline model: logistic regression with scikit-learn's default settings.
# NOTE(review): warnings are silenced in the import cell, so a solver
# convergence warning (if any) would not be shown -- confirm the fit converged.
classifier = LogisticRegression()
classifier.fit(X=X_train, y=y_train)

LogisticRegression()

In [17]:
# Accuracy on the training and test splits (closer to 1.0 means more labels
# predicted correctly). For context, the classification report further down
# lists 4869 of the 6250 test rows as 'No Medal' (about 78%), so a score near
# 0.78 is roughly what always predicting 'No Medal' would achieve.
train_acc = classifier.score(X_train, y_train)
test_acc = classifier.score(X_test, y_test)
print(f"Training Data Score: {train_acc}")
print(f"Testing Data Score: {test_acc}")

Training Data Score: 0.7712700698778471
Testing Data Score: 0.78208


In [18]:
# Apply the logistic-regression model to the held-out 2016 rows and compare
# the first ten predictions with the true labels
predictions = classifier.predict(X_2016)
first_predicted = predictions[:10]
first_actual = y_2016.head(10).tolist()
print(f"First 10 Predictions: {first_predicted}")
print(f"First 10 Actual labels: {first_actual}")

First 10 Predictions: ['No Medal' 'No Medal' 'No Medal' 'No Medal' 'No Medal' 'No Medal'
 'No Medal' 'No Medal' 'No Medal' 'No Medal']
First 10 Actual labels: ['medal', 'No Medal', 'No Medal', 'No Medal', 'No Medal', 'No Medal', 'medal', 'No Medal', 'No Medal', 'No Medal']


In [19]:
# Side-by-side table of predicted vs. actual 2016 labels, renumbered from 0
Testing = pd.DataFrame({"Prediction": predictions, "Actual": y_2016})
Testing = Testing.reset_index(drop=True)

In [20]:

# Cross-tabulate actual vs. predicted labels (stored, not displayed), then
# show the rows for which the model predicted 'medal'
Testing_crosstab = pd.crosstab(index=Testing['Actual'], columns=Testing['Prediction'])
predicted_medal = Testing['Prediction'].eq('medal')
Testing[predicted_medal]

Unnamed: 0,Prediction,Actual
13,medal,medal
16,medal,medal
17,medal,medal
18,medal,medal
19,medal,medal
...,...,...
2522,medal,medal
2604,medal,medal
2648,medal,medal
2686,medal,No Medal


In [21]:
# Second model: a random forest of 100 trees, seeded so results are repeatable
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X=X_train, y=y_train)

RandomForestClassifier(random_state=42)

In [22]:
# Predict the held-out test split and compute the accuracy of those predictions
rf_predictions = rf_model.predict(X_test)
acc_score = accuracy_score(y_true=y_test, y_pred=rf_predictions)

In [23]:
# Confusion matrix of true test labels (rows) against forest predictions (columns)
cm = confusion_matrix(y_true=y_test, y_pred=rf_predictions)

In [24]:
# Wrap the confusion matrix in a labelled DataFrame and print the evaluation
# summary. Class 0 is 'No Medal' and class 1 is 'medal': the row sums of the
# matrix match the per-class supports in the classification report.
row_labels = ["Actual 0", "Actual 1"]
col_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(cm, index=row_labels, columns=col_labels)

print("Confusion Matrix")
print(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, rf_predictions))

Confusion Matrix
          Predicted 0  Predicted 1
Actual 0         4331          538
Actual 1          947          434
Accuracy Score : 0.7624
Classification Report
              precision    recall  f1-score   support

    No Medal       0.82      0.89      0.85      4869
       medal       0.45      0.31      0.37      1381

    accuracy                           0.76      6250
   macro avg       0.63      0.60      0.61      6250
weighted avg       0.74      0.76      0.75      6250



In [25]:
# Rank the features by random-forest importance, largest first
importance_pairs = list(zip(rf_model.feature_importances_, X.columns))
importance_pairs.sort(reverse=True)
importance_pairs

[(0.3029195434080895, 'age'),
 (0.2502865306936226, 'BMI'),
 (0.18375550534724355, 'weight'),
 (0.18321408137696185, 'height'),
 (0.03421204191562858, 'noc_USA'),
 (0.010699503568939965, 'noc_FRA'),
 (0.008505050587300266, 'sex_F'),
 (0.008258571343592154, 'sex_M'),
 (0.006329597589326147, 'noc_GER'),
 (0.006133539261136529, 'noc_AUS'),
 (0.005686034908158824, 'noc_BRA')]

In [26]:
# Apply the random forest to the held-out 2016 rows.
# NOTE(review): this rebinds rf_predictions, which previously held the
# test-split predictions used for the accuracy score and confusion matrix;
# re-running those cells after this one would pair y_test with 2016 output.
rf_predictions = rf_model.predict(X=X_2016)

In [27]:
# Side-by-side table of the forest's 2016 predictions vs. the actual labels,
# renumbered from 0 (replaces the logistic-regression table of the same name)
Testing = pd.DataFrame({"Prediction": rf_predictions, "Actual": y_2016})
Testing = Testing.reset_index(drop=True)