In [63]:
import pandas as pd
import numpy as np
import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [64]:
df = pd.read_csv("data/athlete_events.csv")
# drop cols which have NA for suggusted features
df = df.dropna(subset=['Age', 'Height', 'Weight', 'Medal'])
# filter for just Athletics Events
df_athletics = df[df["Sport"] == "Athletics"]
df_athletics = df_athletics[['Age', 'Height', 'Weight', 'Medal', 'Event']]
df_athletics.head()

Unnamed: 0,Age,Height,Weight,Medal,Event
163,22.0,179.0,80.0,Silver,Athletics Women's Javelin Throw
191,26.0,182.0,67.0,Bronze,"Athletics Men's 1,500 metres"
620,26.0,190.0,125.0,Gold,Athletics Men's Hammer Throw
655,21.0,160.0,50.0,Bronze,"Athletics Men's 10,000 metres"
720,22.0,166.0,58.0,Gold,Athletics Men's Marathon


In [65]:
df_athletics.groupby(['Event']).size().reset_index(name='Count').sort_values("Count", ascending=False)

Unnamed: 0,Event,Count
21,Athletics Men's 4 x 400 metres Relay,295
20,Athletics Men's 4 x 100 metres Relay,282
64,Athletics Women's 4 x 100 metres Relay,248
65,Athletics Women's 4 x 400 metres Relay,182
6,Athletics Men's 100 metres,84
...,...,...
17,"Athletics Men's 3,200 metres Steeplechase",2
46,Athletics Men's Pentathlon (Ancient),2
31,Athletics Men's All-Around Championship,1
8,"Athletics Men's 2,500 metres Steeplechase",1


In [66]:
value_counts = df_athletics['Event'].value_counts()
df_athletics = df_athletics[df_athletics['Event'].isin(value_counts[value_counts >= 50].index)]

In [67]:
athletic_events = df_athletics["Event"].unique()
print(len(athletic_events))

32


In [68]:
results = {"Event":[], "Accuracy":[], "N":[]}

for event in athletic_events:
    # filter dataframe for this event
    df_event = df_athletics[df_athletics["Event"] == event]

    # split into features and test train split
    y = df_event["Medal"]
    X = df_event[["Age", "Height", "Weight"]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # scale features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # train model
    mlp = MLPClassifier(hidden_layer_sizes=(100, 50), activation="relu", solver="adam", max_iter=5000, random_state=42)
    mlp.fit(X_train, y_train)

    # see how model performs
    y_pred = mlp.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # append scores to the results
    results["Event"].append(event)
    results["Accuracy"].append(accuracy)
    results["N"].append(df_event.shape[0])


In [69]:
df_results = pd.DataFrame(results).sort_values("Accuracy", ascending=False)
df_results

Unnamed: 0,Event,Accuracy,N
10,Athletics Women's Shot Put,0.545455,51
11,Athletics Men's 400 metres Hurdles,0.533333,74
21,Athletics Women's Discus Throw,0.5,59
23,Athletics Men's 200 metres,0.5,78
20,Athletics Men's Shot Put,0.4375,80
19,Athletics Men's 110 metres Hurdles,0.4375,78
14,Athletics Women's 4 x 100 metres Relay,0.42,248
8,Athletics Men's 100 metres,0.411765,84
22,"Athletics Men's 5,000 metres",0.4,71
30,Athletics Men's Javelin Throw,0.4,73


In [None]:
results = {"Event":[], "Accuracy":[], "N":[]}

for event in athletic_events:
    # filter dataframe for this event
    df_event = df_athletics[df_athletics["Event"] == event]

    # split into features and test train split
    y = df_event["Medal"]
    X = df_event[["Age", "Height", "Weight"]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # scale features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # train model
    mlp = MLPClassifier(hidden_layer_sizes=(100, 50), activation="relu", solver="adam", max_iter=5000, random_state=42)
    mlp.fit(X_train, y_train)

    # see how model performs
    y_pred = mlp.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # append scores to the results
    results["Event"].append(event)
    results["Accuracy"].append(accuracy)
    results["N"].append(df_event.shape[0])

In [86]:
df = pd.read_csv("data/athlete_events.csv")
# drop cols which have NA for suggusted features
df = df.dropna(subset=['Age', 'Height', 'Weight', 'Sex'])
df.groupby(['Sport']).size().reset_index(name='Count').sort_values("Count", ascending=False).iloc[:10,:]

Unnamed: 0,Sport,Count
3,Athletics,32374
44,Swimming,18776
22,Gymnastics,18271
33,Rowing,7790
14,Cycling,7775
12,Cross Country Skiing,7529
37,Shooting,7260
17,Fencing,6537
0,Alpine Skiing,6322
11,Canoeing,5550


In [91]:
df_pred_events = df[df["Sport"].isin(["Swimming", "Gymnastics", "Rowing", "Cycling", "Fencing"])]
df_pred_events = df_pred_events[["Age", "Height", "Weight", "Sex", "Sport"]]

df_pred_events["Sex_F"] = np.where(df_pred_events['Sex'] == "F", 1, 0)
df_pred_events["Sex_M"] = np.where(df_pred_events['Sex'] == "M", 1, 0)
df_pred_events = df_pred_events.drop('Sex', axis=1)

df_pred_events.head()

Unnamed: 0,Age,Height,Weight,Sport,Sex_F,Sex_M
41,28.0,175.0,64.0,Gymnastics,0,1
42,28.0,175.0,64.0,Gymnastics,0,1
43,28.0,175.0,64.0,Gymnastics,0,1
44,28.0,175.0,64.0,Gymnastics,0,1
45,28.0,175.0,64.0,Gymnastics,0,1


In [98]:
def scale(x):
    return (x - min(x)) / (max(x) - min(x))

df_pred_events[["Age", "Height", "Weight"]] = df_pred_events[["Age", "Height", "Weight"]].apply(scale)

# split into features and test train split
y = df_pred_events["Sport"]
X = df_pred_events[["Age", "Height", "Weight", "Sex_F", "Sex_M"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [99]:
# train model
mlp = MLPClassifier(hidden_layer_sizes=(200, 100, 50), activation="relu", solver="adam", max_iter=1000, random_state=42)
mlp.fit(X_train, y_train)

# see how model performs
y_pred = mlp.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(accuracy)

0.6575655114116653


In [121]:
results = {"labels":y_test, "predicted":y_pred}
df_results = pd.DataFrame(results)

df_results = df_results.groupby(["labels", "predicted"]).size().reset_index(name='Count')

df_sum = df_results.groupby('labels').sum('count').reset_index().rename(columns={'Count': 'Sum'})

df_results = pd.merge(df_results, df_sum, on="labels")
df_results["Decimal"] = df_results["Count"] / df_results["Sum"]

df_results.head()

Unnamed: 0,labels,predicted,Count,Sum,Decimal
0,Cycling,Cycling,421,1583,0.265951
1,Cycling,Fencing,195,1583,0.123184
2,Cycling,Gymnastics,378,1583,0.238787
3,Cycling,Rowing,92,1583,0.058117
4,Cycling,Swimming,497,1583,0.313961
