In [6]:
from sklearn import tree
import numpy as np
import pandas as pd
import os

# Read the CSV and one-hot encode the categorical columns with get_dummies

In [7]:
# Load the men's tennis singles records.
# The path is assembled from its components: the previous one-argument
# os.path.join call returned its input unchanged and added nothing.
# NOTE(review): the frame is called `swimming` although it holds tennis data;
# the name is kept because later cells refer to it.
swimming = pd.read_csv(os.path.join("data", "Summer_events", "summer_male_tennis.csv"))
swimming.head()

Unnamed: 0,Age,Height,Weight,Team,Year,City,Event,Medal
0,26.0,180.0,80.0,United States,1996,Atlanta,Tennis Men's Singles,Gold
1,23.0,180.0,79.0,Haiti,1988,Seoul,Tennis Men's Singles,---
2,31.0,180.0,79.0,Haiti,1996,Atlanta,Tennis Men's Singles,---
3,19.0,185.0,85.0,Morocco,1992,Barcelona,Tennis Men's Singles,---
4,27.0,185.0,85.0,Morocco,2000,Sydney,Tennis Men's Singles,---


In [8]:
# Work on a copy so the frame loaded above is left exactly as it was read.
swimmingcopied = swimming.copy()

# Replace the categorical text columns with one indicator column per category.
categorical_columns = ["Team", "City", "Event"]
swimming_dummies = pd.get_dummies(swimmingcopied, columns=categorical_columns)
swimming_dummies.head()

Unnamed: 0,Age,Height,Weight,Year,Medal,Team_Algeria,Team_Argentina,Team_Armenia,Team_Australia,Team_Austria,...,City_Barcelona,City_Beijing,City_London,City_Paris,City_Rio de Janeiro,City_Seoul,City_St. Louis,City_Stockholm,City_Sydney,Event_Tennis Men's Singles
0,26.0,180.0,80.0,1996,Gold,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,23.0,180.0,79.0,1988,---,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
2,31.0,180.0,79.0,1996,---,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,19.0,185.0,85.0,1992,---,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,27.0,185.0,85.0,2000,---,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


# Select Feature columns

In [9]:
# "Medal" is the label to predict; every remaining column is a feature.
label_column = "Medal"
target = swimming_dummies[label_column]
data = swimming_dummies.drop(columns=[label_column])

# Column labels are kept so feature importances can be matched to names later.
feature_names = data.columns
print(target.shape, data.shape)

(515,) (515, 99)


# Create a Train Test Split

In [10]:
from sklearn.model_selection import train_test_split

# No test_size is passed, so scikit-learn's default split proportion applies;
# the fixed random_state makes the shuffle repeatable between runs.
train_test_parts = train_test_split(data, target, random_state=42)
X_train, X_test, y_train, y_test = train_test_parts
X_train.head()

Unnamed: 0,Age,Height,Weight,Year,Team_Algeria,Team_Argentina,Team_Armenia,Team_Australia,Team_Austria,Team_Bahamas,...,City_Barcelona,City_Beijing,City_London,City_Paris,City_Rio de Janeiro,City_Seoul,City_St. Louis,City_Stockholm,City_Sydney,Event_Tennis Men's Singles
278,24.0,187.0,75.0,1988,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
467,31.0,180.0,72.0,2000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
513,26.0,183.0,80.0,2008,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
154,23.0,189.0,79.0,1988,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
25,25.0,188.0,87.0,2016,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1


# Pre-process Data

In [11]:
# Scale your data
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Fit the scaler on the training features only, then apply that same
# transform to both partitions so no test statistics leak into the fit.
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Label-encode the medal column.
# The encoder learns its classes from the train and test labels together:
# medal classes are rare, so one may end up only in the test partition, and
# transform() raises ValueError for a label that was not seen during fit.
# Classes are stored sorted, so the integer codes are identical to a
# train-only fit whenever the training labels already contain every class.
label_encoder = LabelEncoder()
label_encoder.fit(pd.concat([y_train, y_test]))
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)
encoded_y_test

array([0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 0,
       0, 0, 0, 2, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 3])

# Train Random Forest Classifier

In [12]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the bootstrap samples and per-split feature draws, and
# therefore the scores and feature importances below, are the same on every
# Restart & Run All. 42 matches the seed used for the train/test split.
rf = RandomForestClassifier(n_estimators=200, random_state=42)

# The forest is fitted on the unscaled training features and encoded labels.
rf = rf.fit(X_train, encoded_y_train)

# Mean accuracy on the held-out partition.
rf.score(X_test, encoded_y_test)

0.9147286821705426

In [13]:
# Pair every importance score with its column name, then order the pairs from
# most to least important.
importance_pairs = list(zip(rf.feature_importances_, feature_names))
importance_pairs.sort(reverse=True)
importance_pairs

[(0.16031629830559826, 'Weight'),
 (0.1437295349291838, 'Height'),
 (0.13908843783711689, 'Age'),
 (0.06653310872259399, 'Year'),
 (0.04681747912750093, 'Team_United States'),
 (0.03705379953405698, 'Team_Czechoslovakia'),
 (0.03589893120167479, 'Team_Chile'),
 (0.02831970452661544, 'Team_Serbia'),
 (0.025703062958722905, 'City_Seoul'),
 (0.024675348151213087, 'Team_Switzerland'),
 (0.023016402100434565, 'Team_Sweden'),
 (0.02075313774869684, 'City_Sydney'),
 (0.02013114390381745, 'Team_Great Britain'),
 (0.019842537699977188, 'Team_Spain'),
 (0.019247495425611855, 'Team_Japan'),
 (0.019225171102620668, 'Team_Argentina'),
 (0.018908029080348266, 'City_London'),
 (0.018883001643767213, 'Team_India'),
 (0.017604185882518296, 'Team_France'),
 (0.017330574610250168, 'City_Beijing'),
 (0.015073621175902262, 'Team_Russia'),
 (0.014848559752538203, 'City_Rio de Janeiro'),
 (0.013039934957364729, 'City_Athina'),
 (0.01242796234513156, 'City_Atlanta'),
 (0.004812207817630799, 'City_Barcelona'),

In [14]:
# Encoded class predicted for each row of the unscaled held-out features.
test_class_codes = rf.predict(X_test)
test_class_codes

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [15]:
# rf was fitted on the unscaled X_train, so it must also predict on the
# unscaled X_test. Feeding it X_test_scaled puts every feature on a different
# scale from the one the trees' split thresholds were learned on, which makes
# the stored predictions inconsistent with the scores printed below.
predictions = rf.predict(X_test)
print(f"Training Data Score before hypertuning: {rf.score(X_train, encoded_y_train)}")
print(f"Testing Data Score before hypertuning: {rf.score(X_test, encoded_y_test)}")

Training Data Score before hypertuning: 1.0
Testing Data Score before hypertuning: 0.9147286821705426


In [16]:
# One row per test sample: the predicted class code next to the encoded true label.
prediction_vs_actual = {"Prediction": predictions, "Actual": encoded_y_test}
results_df = pd.DataFrame(prediction_vs_actual)
results_df = results_df.reset_index(drop=True)
results_df.head(10)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,2
9,0,0


# Export model for future use

In [17]:
# Persist the fitted forest to disk so it can be reloaded later.
import joblib

filename = 'models/random_forest_model.sav'

# Create the target folder first: the dump cannot open the file for writing
# when the directory does not exist (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(filename), exist_ok=True)

# joblib.dump returns the list of files it wrote, which the cell displays.
joblib.dump(rf, filename)

['models/random_forest_model.sav']

# Reset the X_test index so it lines up with the predictions DataFrame, merge the two into a single DataFrame, then export the result to a CSV file

In [18]:
# Renumber the test rows from 0 so they align with results_df, which carries a
# fresh 0-based index. Rebinding the name instead of using inplace=True keeps
# the cell chain-friendly and avoids mutating the split's frame in place.
X_test = X_test.reset_index(drop=True)
X_test.head()

Unnamed: 0,Age,Height,Weight,Year,Team_Algeria,Team_Argentina,Team_Armenia,Team_Australia,Team_Austria,Team_Bahamas,...,City_Barcelona,City_Beijing,City_London,City_Paris,City_Rio de Janeiro,City_Seoul,City_St. Louis,City_Stockholm,City_Sydney,Event_Tennis Men's Singles
0,31.0,184.0,80.0,2012,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
1,27.0,183.0,79.0,2012,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,23.0,191.0,84.0,2016,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,29.0,179.0,77.0,2016,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
4,22.0,180.0,72.0,1992,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


In [19]:
# Put the test features and the Prediction/Actual columns side by side;
# a column-wise concat matches rows by index label.
frames_side_by_side = [X_test, results_df]
merge_results = pd.concat(frames_side_by_side, axis="columns")
merge_results

Unnamed: 0,Age,Height,Weight,Year,Team_Algeria,Team_Argentina,Team_Armenia,Team_Australia,Team_Austria,Team_Bahamas,...,City_London,City_Paris,City_Rio de Janeiro,City_Seoul,City_St. Louis,City_Stockholm,City_Sydney,Event_Tennis Men's Singles,Prediction,Actual
0,31.0,184.0,80.0,2012,0,0,0,0,1,0,...,1,0,0,0,0,0,0,1,0,0
1,27.0,183.0,79.0,2012,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
2,23.0,191.0,84.0,2016,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,29.0,179.0,77.0,2016,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,22.0,180.0,72.0,1992,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124,23.0,166.0,63.0,2004,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
125,30.0,175.0,65.0,2004,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
126,25.0,178.0,75.0,1996,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
127,31.0,185.0,79.0,2012,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0


In [22]:
# Keep only a handful of readable columns for the exported summary.
summary_columns = [
    "Age",
    "Height",
    "Weight",
    "Team_United States",
    "Team_Chile",
    "Event_Tennis Men's Singles",
    "Prediction",
    "Actual",
]
shorter_merge_results = merge_results[summary_columns]
shorter_merge_results


Unnamed: 0,Age,Height,Weight,Team_United States,Team_Chile,Event_Tennis Men's Singles,Prediction,Actual
0,31.0,184.0,80.0,0,0,1,0,0
1,27.0,183.0,79.0,0,0,1,0,0
2,23.0,191.0,84.0,1,0,1,0,0
3,29.0,179.0,77.0,0,0,1,0,0
4,22.0,180.0,72.0,0,0,1,0,0
...,...,...,...,...,...,...,...,...
124,23.0,166.0,63.0,0,0,1,0,0
125,30.0,175.0,65.0,0,0,1,0,0
126,25.0,178.0,75.0,0,0,1,0,0
127,31.0,185.0,79.0,0,0,1,0,0


In [23]:
# Export the trimmed results table as CSV (header row kept, index column omitted).
results_csv_path = "data/results/results_Event_male_tennis_rf_model.csv"

# Make sure the output folder exists; to_csv does not create missing directories.
os.makedirs(os.path.dirname(results_csv_path), exist_ok=True)
shorter_merge_results.to_csv(results_csv_path, index=False, header=True)