In [2]:
from sklearn import tree
import numpy as np
import pandas as pd
import os

# Read the CSV and one-hot encode the categorical columns with get_dummies

In [3]:
# Load the men's gymnastics results. NOTE: the variable is called `swimming`
# even though the file holds gymnastics data; the name is kept because the
# following cells refer to it.
# The path components are passed separately so os.path.join actually builds
# the path with the platform's separator (a single-argument join is a no-op).
swimming = pd.read_csv(os.path.join("data", "Summer_events", "summer_male_gym.csv"))
swimming.head()

Unnamed: 0,Age,Height,Weight,Team,Year,City,Event,Medal
0,28.0,175.0,64.0,Finland,1948,London,Gymnastics Men's Individual All-Around,Bronze
1,32.0,175.0,64.0,Finland,1952,Helsinki,Gymnastics Men's Individual All-Around,---
2,23.0,167.0,64.0,Spain,2016,Rio de Janeiro,Gymnastics Men's Individual All-Around,---
3,22.0,167.0,63.0,Egypt,2000,Sydney,Gymnastics Men's Individual All-Around,---
4,20.0,159.0,64.0,Portugal,1968,Mexico City,Gymnastics Men's Individual All-Around,---


In [4]:
# Work on a copy so the frame loaded from disk is left as it was read.
swimmingcopied = swimming.copy()

# One-hot encode the categorical columns; the remaining columns pass through.
categorical_columns = ["Team", "City", "Event"]
swimming_dummies = pd.get_dummies(swimmingcopied, columns=categorical_columns)
swimming_dummies.head()

Unnamed: 0,Age,Height,Weight,Year,Medal,Team_Algeria,Team_Argentina,Team_Armenia,Team_Australia,Team_Austria,...,City_Moskva,City_Munich,City_Paris,City_Rio de Janeiro,City_Roma,City_Seoul,City_St. Louis,City_Sydney,City_Tokyo,Event_Gymnastics Men's Individual All-Around
0,28.0,175.0,64.0,1948,Bronze,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,32.0,175.0,64.0,1952,---,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,23.0,167.0,64.0,2016,---,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,22.0,167.0,63.0,2000,---,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
4,20.0,159.0,64.0,1968,---,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# Select Feature columns

In [5]:
# Features are every column except the label; the label is the "Medal" column.
data = swimming_dummies.drop(columns="Medal")
target = swimming_dummies["Medal"]
# Keep the column labels so importances can be paired with feature names.
feature_names = data.columns
print(target.shape, data.shape)

(1408,) (1408, 101)


# Create a Train Test Split

In [6]:
from sklearn.model_selection import train_test_split

# Hold out part of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
# NOTE(review): the split is not stratified on the medal label - confirm that
# every class ends up in both partitions.
split_parts = train_test_split(data, target, random_state=42)
X_train, X_test, y_train, y_test = split_parts
X_train.head()

Unnamed: 0,Age,Height,Weight,Year,Team_Algeria,Team_Argentina,Team_Armenia,Team_Australia,Team_Austria,Team_Azerbaijan,...,City_Moskva,City_Munich,City_Paris,City_Rio de Janeiro,City_Roma,City_Seoul,City_St. Louis,City_Sydney,City_Tokyo,Event_Gymnastics Men's Individual All-Around
497,29.0,162.0,59.0,2008,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
881,22.0,171.0,69.0,2012,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1370,20.0,160.0,54.0,2000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
846,21.0,178.0,69.0,1968,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
10,21.0,156.0,55.0,1992,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# Pre-process Data

In [7]:
# Feature scaling and label encoding
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Map the medal labels to integer codes; the encoder is fitted on the
# training labels and reused for the test labels.
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train.values.ravel())
encoded_y_test = label_encoder.transform(y_test.values.ravel())
encoded_y_test

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
       0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,

# Train Random Forest Classifier

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 200-tree random forest on the unscaled training features.
# random_state is fixed so the fitted forest, its score and its feature
# importances are the same on every run (without it each run differs).
# The forest is trained on unscaled features, so any later predict/score
# call must be given unscaled features as well.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train, encoded_y_train)
# Mean accuracy on the held-out rows.
rf.score(X_test, encoded_y_test)

0.96875

In [9]:
# Pair each importance with its feature name and list the pairs from the
# most to the least important.
importance_pairs = list(zip(rf.feature_importances_, feature_names))
importance_pairs.sort(reverse=True)
importance_pairs

[(0.1723333157871064, 'Height'),
 (0.169727855738957, 'Weight'),
 (0.16502128113184084, 'Age'),
 (0.08331233854847454, 'Year'),
 (0.05344057134219808, 'Team_Soviet Union'),
 (0.05313713767088264, 'Team_Japan'),
 (0.022443098681458933, 'Team_Unified Team'),
 (0.02077420501712445, 'City_London'),
 (0.013891296654092208, 'Team_China'),
 (0.01360818946200282, 'City_Los Angeles'),
 (0.012487184560426929, 'Team_United States'),
 (0.011415446914053745, 'City_Sydney'),
 (0.01140812809035311, 'City_Montreal'),
 (0.011127810011419501, 'City_Munich'),
 (0.01112622483558853, 'City_Tokyo'),
 (0.010644859397058068, 'City_Beijing'),
 (0.00991615665863769, 'City_Seoul'),
 (0.009796446808025482, 'Team_Great Britain'),
 (0.009449869201270803, 'Team_Russia'),
 (0.00943067158104139, 'City_Atlanta'),
 (0.009047524274834521, 'Team_Belarus'),
 (0.008449568390135009, 'City_Melbourne'),
 (0.008405741367934238, 'City_Mexico City'),
 (0.008387898371246153, 'City_Moskva'),
 (0.008120271720701077, 'City_Athina'),


In [10]:
# Predict the encoded medal class for every held-out row (unscaled features).
test_set_predictions = rf.predict(X_test)
test_set_predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [11]:
# The forest was fitted on the unscaled X_train, so it must also predict on
# the unscaled X_test; feeding it X_test_scaled would put every feature on a
# different scale from the one the trees' split thresholds were learned on.
predictions = rf.predict(X_test)
print(f"Training Data Score before hypertuning: {rf.score(X_train, encoded_y_train)}")
print(f"Testing Data Score before hypertuning: {rf.score(X_test, encoded_y_test)}")

Training Data Score before hypertuning: 1.0
Testing Data Score before hypertuning: 0.96875


In [12]:
# Put the predicted and the true encoded labels next to each other, one row
# per test sample, on a fresh 0..n-1 index.
prediction_columns = {"Prediction": predictions, "Actual": encoded_y_test}
results_df = pd.DataFrame(prediction_columns).reset_index(drop=True)
results_df.head(10)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


# Export model for future use

In [13]:
# Persist the fitted random forest to disk with joblib.
import joblib
filename = 'models/random_forest_model.sav'
# Create the target directory first; joblib.dump does not create missing
# parent directories and would fail on a fresh checkout.
os.makedirs(os.path.dirname(filename), exist_ok=True)
joblib.dump(rf, filename)

['models/random_forest_model.sav']

# Reset the X_test index so it lines up with the predictions/actuals DataFrame, merge the two into a single DataFrame, then export the result to a CSV file

In [14]:
# Give X_test a fresh 0..n-1 index (discarding the original row labels) so it
# lines up row-for-row with frames built from plain arrays. The result is
# reassigned instead of mutating the frame with inplace=True.
X_test = X_test.reset_index(drop=True)
X_test.head()

Unnamed: 0,Age,Height,Weight,Year,Team_Algeria,Team_Argentina,Team_Armenia,Team_Australia,Team_Austria,Team_Azerbaijan,...,City_Moskva,City_Munich,City_Paris,City_Rio de Janeiro,City_Roma,City_Seoul,City_St. Louis,City_Sydney,City_Tokyo,Event_Gymnastics Men's Individual All-Around
0,23.0,166.0,70.0,1964,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
1,27.0,170.0,66.0,1984,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,26.0,164.0,58.0,1976,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,23.0,155.0,51.0,1976,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,23.0,168.0,67.0,2000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1


In [15]:
# Place the feature columns and the prediction/actual columns side by side;
# rows are aligned on the index.
merge_results = pd.concat([X_test, results_df], axis="columns")
merge_results

Unnamed: 0,Age,Height,Weight,Year,Team_Algeria,Team_Argentina,Team_Armenia,Team_Australia,Team_Austria,Team_Azerbaijan,...,City_Paris,City_Rio de Janeiro,City_Roma,City_Seoul,City_St. Louis,City_Sydney,City_Tokyo,Event_Gymnastics Men's Individual All-Around,Prediction,Actual
0,23.0,166.0,70.0,1964,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
1,27.0,170.0,66.0,1984,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,26.0,164.0,58.0,1976,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,23.0,155.0,51.0,1976,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,23.0,168.0,67.0,2000,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
347,24.0,160.0,60.0,1964,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
348,31.0,167.0,65.0,2008,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
349,22.0,163.0,62.0,1964,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
350,26.0,175.0,65.0,1964,1,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0


In [16]:
# Keep a handful of feature columns plus the prediction and actual columns.
report_columns = [
    "Age",
    "Height",
    "Weight",
    "Team_Soviet Union",
    "Team_Japan",
    "Event_Gymnastics Men's Individual All-Around",
    "Prediction",
    "Actual",
]
shorter_merge_results = merge_results[report_columns]
shorter_merge_results

Unnamed: 0,Age,Height,Weight,Team_Soviet Union,Team_Japan,Event_Gymnastics Men's Individual All-Around,Prediction,Actual
0,23.0,166.0,70.0,0,0,1,0,0
1,27.0,170.0,66.0,0,1,1,0,0
2,26.0,164.0,58.0,0,0,1,0,0
3,23.0,155.0,51.0,0,0,1,0,0
4,23.0,168.0,67.0,0,0,1,0,0
...,...,...,...,...,...,...,...,...
347,24.0,160.0,60.0,0,0,1,0,0
348,31.0,167.0,65.0,0,0,1,0,0
349,22.0,163.0,62.0,0,0,1,0,0
350,26.0,175.0,65.0,0,0,1,0,0


In [17]:
# Write the trimmed results table to CSV (header row included, no index column).
results_path = "data/results/results_Event_male_gym_rf_model.csv"
# Make sure the output directory exists; to_csv does not create it.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
shorter_merge_results.to_csv(results_path, index=False, header=True)