In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [3]:
df = pd.read_csv("data/athlete_events.csv")
# drop cols which have NA for suggusted features
df = df.dropna(subset=['Age', 'Height', 'Weight', 'Medal'])
# filter for just Athletics Events
df_athletics = df[df["Sport"] == "Athletics"]
df_athletics = df_athletics[['Age', 'Height', 'Weight', 'Medal', 'Event']]
df_athletics.head()

Unnamed: 0,Age,Height,Weight,Medal,Event
163,22.0,179.0,80.0,Silver,Athletics Women's Javelin Throw
191,26.0,182.0,67.0,Bronze,"Athletics Men's 1,500 metres"
620,26.0,190.0,125.0,Gold,Athletics Men's Hammer Throw
655,21.0,160.0,50.0,Bronze,"Athletics Men's 10,000 metres"
720,22.0,166.0,58.0,Gold,Athletics Men's Marathon


In [4]:
df_athletics.groupby(['Event']).size().reset_index(name='Count').sort_values("Count", ascending=False)

Unnamed: 0,Event,Count
21,Athletics Men's 4 x 400 metres Relay,295
20,Athletics Men's 4 x 100 metres Relay,282
64,Athletics Women's 4 x 100 metres Relay,248
65,Athletics Women's 4 x 400 metres Relay,182
6,Athletics Men's 100 metres,84
...,...,...
17,"Athletics Men's 3,200 metres Steeplechase",2
46,Athletics Men's Pentathlon (Ancient),2
31,Athletics Men's All-Around Championship,1
8,"Athletics Men's 2,500 metres Steeplechase",1


In [5]:
value_counts = df_athletics['Event'].value_counts()
df_athletics = df_athletics[df_athletics['Event'].isin(value_counts[value_counts >= 50].index)]

In [6]:
athletic_events = df_athletics["Event"].unique()
print(len(athletic_events))

32


In [7]:
results = {"Event":[], "Accuracy":[], "N":[]}

for event in athletic_events:
    # filter dataframe for this event
    df_event = df_athletics[df_athletics["Event"] == event]

    # split into features and test train split
    y = df_event["Medal"]
    X = df_event[["Age", "Height", "Weight"]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # scale features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # train model
    mlp = MLPClassifier(hidden_layer_sizes=(100, 50), activation="relu", solver="adam", max_iter=5000, random_state=42)
    mlp.fit(X_train, y_train)

    # see how model performs
    y_pred = mlp.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # append scores to the results
    results["Event"].append(event)
    results["Accuracy"].append(accuracy)
    results["N"].append(df_event.shape[0])


In [8]:
df_results = pd.DataFrame(results).sort_values("Accuracy", ascending=False)
df_results

Unnamed: 0,Event,Accuracy,N
17,Athletics Men's 800 metres,0.5625,77
23,Athletics Men's 200 metres,0.5625,78
28,Athletics Men's Decathlon,0.533333,72
1,"Athletics Men's 1,500 metres",0.5,77
8,Athletics Men's 100 metres,0.470588,84
3,"Athletics Men's 10,000 metres",0.466667,71
11,Athletics Men's 400 metres Hurdles,0.466667,74
27,Athletics Women's High Jump,0.461538,62
7,Athletics Men's 4 x 400 metres Relay,0.423729,295
9,Athletics Men's 4 x 100 metres Relay,0.421053,282


In [9]:
results = {"Event":[], "Accuracy":[], "N":[]}

for event in athletic_events:
    # filter dataframe for this event
    df_event = df_athletics[df_athletics["Event"] == event]

    # split into features and test train split
    y = df_event["Medal"]
    X = df_event[["Age", "Height", "Weight"]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # scale features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # train model
    mlp = MLPClassifier(hidden_layer_sizes=(100, 50), activation="relu", solver="adam", max_iter=5000, random_state=42)
    mlp.fit(X_train, y_train)

    # see how model performs
    y_pred = mlp.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # append scores to the results
    results["Event"].append(event)
    results["Accuracy"].append(accuracy)
    results["N"].append(df_event.shape[0])