In [1]:
from sklearn import tree
import pandas as pd
import os

# Read CSV and get_dummies

In [2]:
swimming = pd.read_csv(os.path.join("data/Summer_events/summer_male_Swimming_400_freestyle.csv"))
swimming.head()

Unnamed: 0,Age,Height,Weight,Team,Year,City,Event,Medal
0,22.0,190.0,78.0,Soviet Union,1972,Munich,Swimming Men's 400 metres Freestyle,---
1,15.0,177.0,64.0,Portugal,1976,Montreal,Swimming Men's 400 metres Freestyle,---
2,14.0,181.0,77.0,Qatar,2004,Athina,Swimming Men's 400 metres Freestyle,---
3,20.0,187.0,87.0,Czechoslovakia,1980,Moskva,Swimming Men's 400 metres Freestyle,---
4,18.0,168.0,65.0,Malta,2004,Athina,Swimming Men's 400 metres Freestyle,---


In [3]:
swimmingcopied = swimming.copy()

swimming_dummies = pd.get_dummies(swimmingcopied, columns=["Team", "City", "Event"])
swimming_dummies.head()

Unnamed: 0,Age,Height,Weight,Year,Medal,Team_Algeria,Team_Andorra,Team_Argentina,Team_Australasia,Team_Australia,...,City_Moskva,City_Munich,City_Paris,City_Rio de Janeiro,City_Roma,City_Seoul,City_Stockholm,City_Sydney,City_Tokyo,Event_Swimming Men's 400 metres Freestyle
0,22.0,190.0,78.0,1972,---,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1,15.0,177.0,64.0,1976,---,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,14.0,181.0,77.0,2004,---,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,20.0,187.0,87.0,1980,---,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,18.0,168.0,65.0,2004,---,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


<!-- dummy df exported, now importing again to make it easier for random forest tree due to x and x copy -->

# Select Feature columns

In [4]:
target = swimming_dummies["Medal"]
data = swimming_dummies.drop("Medal", axis=1)
feature_names = data.columns
print(target.shape, data.shape)

(597,) (597, 118)


# Create a Train Test Split

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)
X_train.head()

Unnamed: 0,Age,Height,Weight,Year,Team_Algeria,Team_Andorra,Team_Argentina,Team_Australasia,Team_Australia,Team_Austria,...,City_Moskva,City_Munich,City_Paris,City_Rio de Janeiro,City_Roma,City_Seoul,City_Stockholm,City_Sydney,City_Tokyo,Event_Swimming Men's 400 metres Freestyle
512,22.0,182.0,78.0,1980,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
382,23.0,182.0,72.0,1984,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
329,22.0,189.0,78.0,1968,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
235,24.0,185.0,71.0,1972,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
563,17.0,170.0,62.0,1976,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# Pre-process Data

In [6]:
# Scale your data
from sklearn.preprocessing import StandardScaler, LabelEncoder

# scale the data with StandardScaler
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Label Encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train.values.ravel())
encoded_y_train = label_encoder.transform(y_train.values.ravel())
encoded_y_test = label_encoder.transform(y_test.values.ravel())

# Decision Tree Classifier

In [7]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train_scaled, encoded_y_train)
clf.score(X_test_scaled, encoded_y_test)

0.8266666666666667

In [8]:
sorted(zip(clf.feature_importances_, feature_names), reverse=True)

[(0.14347024348788168, 'Age'),
 (0.14189400027172713, 'Year'),
 (0.1399045349365042, 'Weight'),
 (0.08773170372663834, 'Height'),
 (0.05675595543970723, 'Team_United States'),
 (0.05675055194302389, 'Team_Australia'),
 (0.04593304713189622, 'City_London'),
 (0.04243780516159031, 'City_Beijing'),
 (0.040872203244325986, 'City_Rio de Janeiro'),
 (0.03188740462048994, 'City_Moskva'),
 (0.027714485003487558, 'City_Munich'),
 (0.025471290079874517, 'City_Los Angeles'),
 (0.0244770856681895, 'Team_Unified Team'),
 (0.02446930662931281, 'Team_Australasia'),
 (0.02078586375261566, 'City_Atlanta'),
 (0.01847632333565836, 'Team_France'),
 (0.013006701446874675, 'Team_China'),
 (0.011339993417417269, 'Team_Japan'),
 (0.01039293187630784, 'City_Barcelona'),
 (0.007875212507152625, 'Team_South Korea'),
 (0.006062543594512901, 'City_Tokyo'),
 (0.005593580067210265, 'Team_Soviet Union'),
 (0.0054758064780218345, 'Team_Canada'),
 (0.0048305176326501244, 'Team_Germany'),
 (0.002303551603195921, 'Team_S

In [9]:
clf.predict(X_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# Train the Model

In [10]:
predictions = clf.predict(X_test)
print(f"Training Data Score: {clf.score(X_train_scaled, encoded_y_train)}")
print(f"Testing Data Score: {clf.score(X_test_scaled, encoded_y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.8266666666666667


In [16]:
results_df = pd.DataFrame({"Prediction": predictions, "Actual": encoded_y_test}).reset_index(drop=True)
results_df.head(10)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,2
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


# Hyperparameter Tuning with GridSearchCV to tune the model's parameters

# Export model for future use

In [21]:
# save your model by updating "your_name" with your name
import joblib
filename = 'models/decision_tree_classifier_model.sav'
joblib.dump(clf, filename)

['models/decision_tree_classifier_model.sav']

# View X_test df to reset index to merge with y_test df to merge into one df for data and predictions merged, then export into csv file

In [23]:
X_test.head()

Unnamed: 0,Age,Height,Weight,Year,Team_Algeria,Team_Andorra,Team_Argentina,Team_Australasia,Team_Australia,Team_Austria,...,City_Moskva,City_Munich,City_Paris,City_Rio de Janeiro,City_Roma,City_Seoul,City_Stockholm,City_Sydney,City_Tokyo,Event_Swimming Men's 400 metres Freestyle
109,22.0,174.0,78.0,1972,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
480,24.0,182.0,77.0,2000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
135,20.0,185.0,77.0,2004,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
77,21.0,175.0,70.0,1968,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
396,19.0,168.0,71.0,1964,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [24]:
X_test.reset_index(inplace = True, drop = True)

In [25]:
X_test

Unnamed: 0,Age,Height,Weight,Year,Team_Algeria,Team_Andorra,Team_Argentina,Team_Australasia,Team_Australia,Team_Austria,...,City_Moskva,City_Munich,City_Paris,City_Rio de Janeiro,City_Roma,City_Seoul,City_Stockholm,City_Sydney,City_Tokyo,Event_Swimming Men's 400 metres Freestyle
0,22.0,174.0,78.0,1972,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1,24.0,182.0,77.0,2000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
2,20.0,185.0,77.0,2004,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,21.0,175.0,70.0,1968,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,19.0,168.0,71.0,1964,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,30.0,192.0,88.0,2008,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
146,25.0,200.0,95.0,2000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
147,20.0,176.0,70.0,1972,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
148,22.0,190.0,78.0,1972,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [26]:
merge_results = pd.concat([X_test, results_df], axis=1)
merge_results

Unnamed: 0,Age,Height,Weight,Year,Team_Algeria,Team_Andorra,Team_Argentina,Team_Australasia,Team_Australia,Team_Austria,...,City_Paris,City_Rio de Janeiro,City_Roma,City_Seoul,City_Stockholm,City_Sydney,City_Tokyo,Event_Swimming Men's 400 metres Freestyle,Prediction,Actual
0,22.0,174.0,78.0,1972,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,24.0,182.0,77.0,2000,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
2,20.0,185.0,77.0,2004,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,21.0,175.0,70.0,1968,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,2
4,19.0,168.0,71.0,1964,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,30.0,192.0,88.0,2008,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
146,25.0,200.0,95.0,2000,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
147,20.0,176.0,70.0,1972,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
148,22.0,190.0,78.0,1972,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [27]:
merge_results

Unnamed: 0,Age,Height,Weight,Year,Team_Algeria,Team_Andorra,Team_Argentina,Team_Australasia,Team_Australia,Team_Austria,...,City_Paris,City_Rio de Janeiro,City_Roma,City_Seoul,City_Stockholm,City_Sydney,City_Tokyo,Event_Swimming Men's 400 metres Freestyle,Prediction,Actual
0,22.0,174.0,78.0,1972,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,24.0,182.0,77.0,2000,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
2,20.0,185.0,77.0,2004,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,21.0,175.0,70.0,1968,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,2
4,19.0,168.0,71.0,1964,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,30.0,192.0,88.0,2008,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
146,25.0,200.0,95.0,2000,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
147,20.0,176.0,70.0,1972,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
148,22.0,190.0,78.0,1972,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [30]:
shorter_merge_results = merge_results[["Age", "Height", "Weight", "Team_Algeria", "Team_Argentina", "City_Roma", "Prediction", "Actual"]]
shorter_merge_results

Unnamed: 0,Age,Height,Weight,Team_Algeria,Team_Argentina,City_Roma,Prediction,Actual
0,22.0,174.0,78.0,0,0,0,0,0
1,24.0,182.0,77.0,0,0,0,0,0
2,20.0,185.0,77.0,0,0,0,0,0
3,21.0,175.0,70.0,0,0,0,0,2
4,19.0,168.0,71.0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
145,30.0,192.0,88.0,0,0,0,0,0
146,25.0,200.0,95.0,0,0,0,0,0
147,20.0,176.0,70.0,0,0,0,0,0
148,22.0,190.0,78.0,0,0,0,0,0


In [31]:
shorter_merge_results.to_csv("data/results/results_mens_swimming_shorter_clf_model.csv", index=False, header=True)