In [1]:
from sklearn import tree
import numpy as np
import pandas as pd
import os

# Read CSV and get_dummies

In [2]:
swimming = pd.read_csv(os.path.join("data/Summer_events/summer_female_tennis.csv"))
swimming.head()

Unnamed: 0,Age,Height,Weight,Team,Year,City,Event,Medal
0,24.0,190.0,85.0,Uzbekistan,2008,Beijing,Tennis Women's Singles,---
1,26.0,175.0,64.0,Estonia,2008,Beijing,Tennis Women's Singles,---
2,20.0,169.0,60.0,Belgium,1992,Barcelona,Tennis Women's Singles,---
3,24.0,169.0,60.0,Belgium,1996,Atlanta,Tennis Women's Singles,---
4,28.0,169.0,60.0,Belgium,2000,Sydney,Tennis Women's Singles,---


In [3]:
swimmingcopied = swimming.copy()

swimming_dummies = pd.get_dummies(swimmingcopied, columns=["Team", "City", "Event"])
swimming_dummies.head()

Unnamed: 0,Age,Height,Weight,Year,Medal,Team_Argentina,Team_Australia,Team_Austria,Team_Belarus,Team_Belgium,...,Team_Zimbabwe,City_Athina,City_Atlanta,City_Barcelona,City_Beijing,City_London,City_Rio de Janeiro,City_Seoul,City_Sydney,Event_Tennis Women's Singles
0,24.0,190.0,85.0,2008,---,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
1,26.0,175.0,64.0,2008,---,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,20.0,169.0,60.0,1992,---,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1
3,24.0,169.0,60.0,1996,---,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
4,28.0,169.0,60.0,2000,---,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,1


# Select Feature columns

In [4]:
target = swimming_dummies["Medal"]
data = swimming_dummies.drop("Medal", axis=1)
feature_names = data.columns
print(target.shape, data.shape)

(487,) (487, 77)


# Create a Train Test Split

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)
X_train.head()

Unnamed: 0,Age,Height,Weight,Year,Team_Argentina,Team_Australia,Team_Austria,Team_Belarus,Team_Belgium,Team_Bosnia and Herzegovina,...,Team_Zimbabwe,City_Athina,City_Atlanta,City_Barcelona,City_Beijing,City_London,City_Rio de Janeiro,City_Seoul,City_Sydney,Event_Tennis Women's Singles
459,28.0,185.0,75.0,2008,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
131,23.0,175.0,59.0,1992,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
57,22.0,175.0,63.0,2012,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
181,23.0,172.0,60.0,1992,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
317,23.0,177.0,65.0,2000,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,1


# Pre-process Data

In [6]:
# Scale your data
from sklearn.preprocessing import StandardScaler, LabelEncoder

# scale the data with StandardScaler
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Label Encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train.values.ravel())
encoded_y_train = label_encoder.transform(y_train.values.ravel())
encoded_y_test = label_encoder.transform(y_test.values.ravel())
encoded_y_test

array([0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# Train Random Forest Classifier

In [7]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, encoded_y_train)
rf.score(X_test, encoded_y_test)

0.9590163934426229

In [8]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.17031070591322287, 'Height'),
 (0.13818953662983888, 'Age'),
 (0.13713027569869202, 'Weight'),
 (0.072742033186787, 'Year'),
 (0.06620092599510573, 'Team_United States'),
 (0.03088601231800304, 'Team_Russia'),
 (0.02771401053196688, 'Team_Czech Republic'),
 (0.027643688873747862, 'Team_Germany'),
 (0.024321831083827385, 'Team_Puerto Rico'),
 (0.02384059183620616, 'City_Sydney'),
 (0.023724307413028882, 'Team_Spain'),
 (0.02203759833646729, 'City_Atlanta'),
 (0.021981902384581406, 'Team_Belarus'),
 (0.020980540193891766, 'City_Barcelona'),
 (0.020550290435833948, 'City_London'),
 (0.019553455893115808, 'Team_West Germany'),
 (0.018362781089531464, 'City_Athina'),
 (0.017983567885321395, 'Team_Bulgaria'),
 (0.017103553153177485, 'City_Seoul'),
 (0.01674168802533981, 'City_Beijing'),
 (0.015506277501378417, 'Team_Belgium'),
 (0.014718961661493859, 'Team_France'),
 (0.013477874187793556, 'City_Rio de Janeiro'),
 (0.009632912164280397, 'Team_Argentina'),
 (0.0029898241813035987, 'Team_A

In [9]:
rf.predict(X_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [10]:
predictions = rf.predict(X_test_scaled)
print(f"Training Data Score before hypertuning: {rf.score(X_train, encoded_y_train)}")
print(f"Testing Data Score before hypertuning: {rf.score(X_test, encoded_y_test)}")

Training Data Score before hypertuning: 1.0
Testing Data Score before hypertuning: 0.9590163934426229


In [11]:
results_df = pd.DataFrame({"Prediction": predictions, "Actual": encoded_y_test}).reset_index(drop=True)
results_df.head(10)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,3
9,0,0


# Export model for future use

In [12]:
# save your model by updating "your_name" with your name
import joblib
filename = 'models/random_forest_model.sav'
joblib.dump(rf, filename)

['models/random_forest_model.sav']

# View X_test df to reset index to merge with y_test df to merge into one df for data and predictions merged, then export into csv file

In [13]:
X_test.reset_index(inplace = True, drop = True)
X_test.head()

Unnamed: 0,Age,Height,Weight,Year,Team_Argentina,Team_Australia,Team_Austria,Team_Belarus,Team_Belgium,Team_Bosnia and Herzegovina,...,Team_Zimbabwe,City_Athina,City_Atlanta,City_Barcelona,City_Beijing,City_London,City_Rio de Janeiro,City_Seoul,City_Sydney,Event_Tennis Women's Singles
0,21.0,183.0,72.0,2004,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1,24.0,175.0,62.0,2004,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,26.0,168.0,64.0,2000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
3,21.0,177.0,77.0,1992,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
4,19.0,162.0,62.0,2008,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1


In [14]:
merge_results = pd.concat([X_test, results_df], axis=1)
merge_results

Unnamed: 0,Age,Height,Weight,Year,Team_Argentina,Team_Australia,Team_Austria,Team_Belarus,Team_Belgium,Team_Bosnia and Herzegovina,...,City_Atlanta,City_Barcelona,City_Beijing,City_London,City_Rio de Janeiro,City_Seoul,City_Sydney,Event_Tennis Women's Singles,Prediction,Actual
0,21.0,183.0,72.0,2004,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,24.0,175.0,62.0,2004,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,26.0,168.0,64.0,2000,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
3,21.0,177.0,77.0,1992,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
4,19.0,162.0,62.0,2008,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,19.0,172.0,70.0,2004,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
118,24.0,182.0,69.0,2016,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
119,19.0,162.0,56.0,1988,1,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
120,21.0,171.0,57.0,1992,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0


In [17]:
shorter_merge_results = merge_results[["Age", "Height", "Weight", "Team_United States", "Team_Russia", "Event_Tennis Women's Singles", "Prediction", "Actual"]]
merge_results

Unnamed: 0,Age,Height,Weight,Year,Team_Argentina,Team_Australia,Team_Austria,Team_Belarus,Team_Belgium,Team_Bosnia and Herzegovina,...,City_Atlanta,City_Barcelona,City_Beijing,City_London,City_Rio de Janeiro,City_Seoul,City_Sydney,Event_Tennis Women's Singles,Prediction,Actual
0,21.0,183.0,72.0,2004,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,24.0,175.0,62.0,2004,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,26.0,168.0,64.0,2000,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
3,21.0,177.0,77.0,1992,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
4,19.0,162.0,62.0,2008,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,19.0,172.0,70.0,2004,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
118,24.0,182.0,69.0,2016,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
119,19.0,162.0,56.0,1988,1,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
120,21.0,171.0,57.0,1992,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0


In [19]:
shorter_merge_results.to_csv("data/results/results_Event_female_tennis_rf_model.csv", index=False, header=True)