In [1]:
from sklearn import tree
import numpy as np
import pandas as pd
import os

# Read CSV and get_dummies

In [2]:
# Load the source CSV into a DataFrame and preview the first rows.
# The frame is named `swimming` because later cells refer to it by that name,
# although the file name and the preview point to gymnastics data.
csv_path = os.path.join("data/Summer_events/summer_female_gym.csv")
swimming = pd.read_csv(csv_path)
swimming.head()

Unnamed: 0,Age,Height,Weight,Team,Year,City,Event,Medal
0,19.0,160.0,48.0,East Germany,1972,Munich,Gymnastics Women's Individual All-Around,---
1,25.0,165.0,55.0,Germany,2008,Beijing,Gymnastics Women's Individual All-Around,---
2,18.0,153.0,48.0,Israel,1960,Roma,Gymnastics Women's Individual All-Around,---
3,22.0,148.0,46.0,Japan,1960,Roma,Gymnastics Women's Individual All-Around,---
4,26.0,148.0,46.0,Japan,1964,Tokyo,Gymnastics Women's Individual All-Around,---


In [3]:
# One-hot encode the categorical columns. The encoding is applied to a copy,
# so `swimming` keeps the data exactly as it was read from disk.
categorical_columns = ["Team", "City", "Event"]
swimmingcopied = swimming.copy()
swimming_dummies = pd.get_dummies(swimmingcopied, columns=categorical_columns)
swimming_dummies.head()

Unnamed: 0,Age,Height,Weight,Year,Medal,Team_Algeria,Team_Argentina,Team_Armenia,Team_Australia,Team_Austria,...,City_Mexico City,City_Montreal,City_Moskva,City_Munich,City_Rio de Janeiro,City_Roma,City_Seoul,City_Sydney,City_Tokyo,Event_Gymnastics Women's Individual All-Around
0,19.0,160.0,48.0,1972,---,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1,25.0,165.0,55.0,2008,---,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,18.0,153.0,48.0,1960,---,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3,22.0,148.0,46.0,1960,---,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
4,26.0,148.0,46.0,1964,---,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


# Select Feature columns

In [4]:
# Separate the label column ("Medal") from the feature matrix (everything else).
target = swimming_dummies["Medal"]
data = swimming_dummies.drop(columns=["Medal"])
# Column labels are kept so feature importances can be reported by name later.
feature_names = data.columns
print(target.shape, data.shape)

(1328,) (1328, 98)


# Create a Train Test Split

In [5]:
from sklearn.model_selection import train_test_split

# Split features and labels into train/test parts using scikit-learn's default
# proportions; the fixed random_state makes the split repeatable.
split_parts = train_test_split(data, target, random_state=42)
X_train, X_test, y_train, y_test = split_parts
X_train.head()

Unnamed: 0,Age,Height,Weight,Year,Team_Algeria,Team_Argentina,Team_Armenia,Team_Australia,Team_Austria,Team_Belarus,...,City_Mexico City,City_Montreal,City_Moskva,City_Munich,City_Rio de Janeiro,City_Roma,City_Seoul,City_Sydney,City_Tokyo,Event_Gymnastics Women's Individual All-Around
287,18.0,168.0,59.0,1960,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1280,16.0,165.0,52.0,2008,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1105,16.0,166.0,51.0,1992,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
294,21.0,164.0,52.0,1960,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
415,28.0,158.0,52.0,1972,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


# Pre-process Data

In [6]:
# Scale your data
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Learn the scaling parameters from the training features only, then apply
# the same fitted scaler to both splits.
X_scaler = StandardScaler()
X_scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Convert the medal labels to integer class ids. The encoder is fitted on the
# training labels and then reused unchanged for the test labels.
train_labels = y_train.values.ravel()
test_labels = y_test.values.ravel()
label_encoder = LabelEncoder()
label_encoder.fit(train_labels)
encoded_y_train = label_encoder.transform(train_labels)
encoded_y_test = label_encoder.transform(test_labels)
encoded_y_test

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0,
       0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

# Train Random Forest Classifier

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed (same value as the train/test split) so that re-running the
# notebook rebuilds the same forest and reproduces the score and the
# feature importances reported below.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
# The forest is fitted on the unscaled features, so any later predict/score
# call on `rf` has to be given unscaled features as well.
rf = rf.fit(X_train, encoded_y_train)
# Mean accuracy on the held-out split.
rf.score(X_test, encoded_y_test)

0.963855421686747

In [8]:
# Pair every importance score with its column name and list them from the
# highest score to the lowest.
importance_pairs = list(zip(rf.feature_importances_, feature_names))
importance_pairs.sort(reverse=True)
importance_pairs

[(0.18748965993265004, 'Weight'),
 (0.18618697065791628, 'Height'),
 (0.16872844486550093, 'Age'),
 (0.08110317645212342, 'Team_Soviet Union'),
 (0.06313928509455322, 'Year'),
 (0.04871400860339647, 'Team_United States'),
 (0.029898898479623855, 'Team_Romania'),
 (0.01958946853291778, 'Team_Russia'),
 (0.017774033387291208, 'Team_China'),
 (0.016408064878335647, 'Team_East Germany'),
 (0.013608197741908733, 'Team_Czechoslovakia'),
 (0.011936511356502606, 'City_Beijing'),
 (0.011661884282112357, 'City_London'),
 (0.010660510153860017, 'City_Moskva'),
 (0.010496029013332618, 'City_Atlanta'),
 (0.010353809674033775, 'City_Rio de Janeiro'),
 (0.0101945181430038, 'City_Sydney'),
 (0.010062582714966793, 'City_Athina'),
 (0.009826627975872424, 'City_Mexico City'),
 (0.008702320503976183, 'City_Roma'),
 (0.008346684675413269, 'City_Tokyo'),
 (0.007031192003699022, 'City_Montreal'),
 (0.005682750632723575, 'City_Los Angeles'),
 (0.005529966489985507, 'City_Seoul'),
 (0.0055253814713389655, 'Cit

In [9]:
# Display the encoded class predicted for every row of the unscaled test features.
rf.predict(X_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [10]:
# `rf` was fitted on the unscaled X_train, so the predictions must be made on
# the unscaled X_test too. Passing X_test_scaled would compare standardized
# values against split thresholds learned on raw values, and the stored
# predictions would no longer match the test score printed below.
predictions = rf.predict(X_test)
print(f"Training Data Score before hypertuning: {rf.score(X_train, encoded_y_train)}")
print(f"Testing Data Score before hypertuning: {rf.score(X_test, encoded_y_test)}")

Training Data Score before hypertuning: 1.0
Testing Data Score before hypertuning: 0.963855421686747


In [11]:
# Put each test-set prediction next to its encoded true label, indexed 0..n-1.
results_df = pd.DataFrame({"Prediction": predictions, "Actual": encoded_y_test})
results_df = results_df.reset_index(drop=True)
results_df.head(10)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


# Export model for future use

In [12]:
# Persist the fitted random forest so it can be reloaded without retraining
import joblib
filename = 'models/random_forest_model.sav'
# joblib.dump does not create missing folders, so make sure the target
# directory exists before writing.
os.makedirs(os.path.dirname(filename), exist_ok=True)
joblib.dump(rf, filename)

['models/random_forest_model.sav']

# Reset the X_test index so it lines up with the predictions/actuals DataFrame, merge the two into a single DataFrame of features and predictions, then export the result to a CSV file

In [13]:
# Renumber the test rows 0..n-1 so they can be joined positionally with the
# prediction table. Rebinding the name (rather than inplace=True) leaves the
# frame returned by the split untouched and keeps this cell safe to re-run.
X_test = X_test.reset_index(drop=True)
X_test.head()

Unnamed: 0,Age,Height,Weight,Year,Team_Algeria,Team_Argentina,Team_Armenia,Team_Australia,Team_Austria,Team_Belarus,...,City_Mexico City,City_Montreal,City_Moskva,City_Munich,City_Rio de Janeiro,City_Roma,City_Seoul,City_Sydney,City_Tokyo,Event_Gymnastics Women's Individual All-Around
0,16.0,159.0,45.0,1980,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,17.0,153.0,50.0,1968,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,15.0,149.0,41.0,1992,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,20.0,160.0,56.0,2000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
4,17.0,157.0,51.0,1960,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [14]:
# Place the test features and the prediction/actual columns side by side.
# pd.concat(axis=1) aligns rows on index labels, so the features are
# renumbered 0..n-1 here to match results_df. This keeps the join correct
# whether or not X_test was already re-indexed earlier.
merge_results = pd.concat([X_test.reset_index(drop=True), results_df], axis=1)
merge_results

Unnamed: 0,Age,Height,Weight,Year,Team_Algeria,Team_Argentina,Team_Armenia,Team_Australia,Team_Austria,Team_Belarus,...,City_Moskva,City_Munich,City_Rio de Janeiro,City_Roma,City_Seoul,City_Sydney,City_Tokyo,Event_Gymnastics Women's Individual All-Around,Prediction,Actual
0,16.0,159.0,45.0,1980,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
1,17.0,153.0,50.0,1968,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,15.0,149.0,41.0,1992,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,20.0,160.0,56.0,2000,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
4,17.0,157.0,51.0,1960,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327,16.0,160.0,53.0,2008,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
328,17.0,149.0,39.0,1992,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
329,14.0,156.0,45.0,1996,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
330,14.0,153.0,41.0,1968,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [22]:
# Keep only a handful of feature columns plus the prediction and the true
# label, which makes the table easier to read and export.
summary_columns = [
    "Age",
    "Height",
    "Weight",
    "Team_Soviet Union",
    "Team_United States",
    "Event_Gymnastics Women's Individual All-Around",
    "Prediction",
    "Actual",
]
shorter_merge_results = merge_results[summary_columns]
shorter_merge_results.head(250)

Unnamed: 0,Age,Height,Weight,Team_Soviet Union,Team_United States,Event_Gymnastics Women's Individual All-Around,Prediction,Actual
0,16.0,159.0,45.0,0,0,1,0,0
1,17.0,153.0,50.0,0,0,1,0,0
2,15.0,149.0,41.0,0,0,1,0,0
3,20.0,160.0,56.0,0,0,1,0,0
4,17.0,157.0,51.0,0,0,1,0,0
...,...,...,...,...,...,...,...,...
245,14.0,141.0,30.0,0,0,1,0,0
246,18.0,150.0,45.0,0,0,1,0,0
247,19.0,157.0,49.0,0,0,1,0,0
248,14.0,148.0,38.0,0,0,1,0,0


In [23]:
# Write the trimmed prediction table to disk without the index column.
# The output folder is created first because DataFrame.to_csv fails when the
# target directory does not exist.
results_path = "data/results/results_Event_Gymnastics Women's Individual All-Around_rf_model.csv"
os.makedirs(os.path.dirname(results_path), exist_ok=True)
shorter_merge_results.to_csv(results_path, index=False, header=True)