# Olympics Project
Description: In this notebook, we construct a Decision Tree, Random Forest Classifier, Bagging, and Boosting models in attempt to model the data from the Olympics. 
### Authors: Kelvin Wang, Ethan Baird, Arnav Singh, Tuna Akmehmet

In [12]:
#importing data
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

df = pd.read_csv("data/athlete_events.csv") ## Here we have the raw .csv, feel free to download from Kaggle
print(df.shape)

(271116, 15)


In [13]:
#subsetting
# subset = df[['Sex', 'Age', 'Height', 'Weight', 'NOC', 'Year', 'Sport', 'Event', 'Medal']]
# subset['Medal'].fillna('No Medal', inplace=True)
# 
# 
# #Medal is 0 if they did not get a medal, 1 if they got bronze, silver, or gold
# subset['Medal'] = subset['Medal'].apply(lambda x: 0 if x == 'No Medal' else 1)
# 
# #Sex is 0 if male, 1 if female
# subset['Sex'] = subset['Sex'].apply(lambda x: 0 if x == 'M' else 1)
# 
# subset = subset.dropna()
# 
# subset.shape

###Better way without errors
# Create a subset of the dataframe
subset = df[['Sex', 'Age', 'Height', 'Weight', 'NOC', 'Year', 'Sport', 'Event', 'Medal']].copy()

# Fill missing values in the 'Medal' column
subset['Medal'] = subset['Medal'].fillna('No Medal')

# Convert 'Medal' to 0 if no medal, 1 if any medal
subset['Medal'] = subset['Medal'].apply(lambda x: 0 if x == 'No Medal' else 1)

# Convert 'Sex' to 0 if male, 1 if female
subset['Sex'] = subset['Sex'].apply(lambda x: 0 if x == 'M' else 1)

# Drop rows with any missing values
subset = subset.dropna()

# Display the first few rows of the modified subset, same size as before
subset.shape

(206165, 9)

In [14]:
def reduce_no_medal(df, n):
    '''
    :param df: the dataframe of the athlete events
    :param n: The requested size to reduce the non-medal count too, in order to accurately compare to the medal count.
    :return: the reduced_df with a more accurate size of medals vs. non-medals. We did this because size of non-medals is far greater than with medals
    '''
    no_medals = df[df['Medal'] == 0]
    won_medals = df[df['Medal'] == 1]

    no_medal_sample = no_medals.sample(n=n, random_state=42)

    reduced_df = pd.concat([no_medal_sample, won_medals])

    #shuffle the data, not sure if needed
    reduced_df = reduced_df.sample(frac=1, random_state=42).reset_index(drop=True)

    return reduced_df

In [15]:
winners = subset[subset['Medal'] == 1]

# Step 2: Group by Sport/Event and count the occurrences
#sport_counts = medal_1_df['Sport'].value_counts()
sport_counts = winners['Event'].value_counts() # In this case we choose event

# Step 3: Filter to get events/sports with more than 20 entries, in order to get a good sample
sports_with_more_than_20 = sport_counts[sport_counts > 50].index.tolist()

#temp_subset = temp[temp['Sport'].isin(sports_with_more_than_20)]
subset_enough_winners = subset[subset['Event'].isin(sports_with_more_than_20)]

# Display the subset DataFrame
subset_enough_winners.shape

(107065, 9)

In [16]:
#mean_weight_medal_1 = temp[temp['Medal'] == 1].groupby('Sport')['Height'].mean()
mean_weight_medal_1 = subset_enough_winners[subset_enough_winners['Medal'] == 1].groupby('Event')['Height'].mean()

# Group by Sport and calculate the mean Weight for Medal == 0
#mean_weight_medal_0 = temp[temp['Medal'] == 0].groupby('Sport')['Height'].mean()
mean_weight_medal_0 = subset_enough_winners[subset_enough_winners['Medal'] == 0].groupby('Event')['Height'].mean()

# Display the results
df1 = mean_weight_medal_1.to_frame(name='Mean_Weight_Medal_1')
df2 = mean_weight_medal_0.to_frame(name='Mean_Weight_Medal_0')

# Merge the two DataFrames on the 'Sport' index
merged_df = pd.merge(df1, df2, left_index=True, right_index=True)

merged_df["diff"] = merged_df["Mean_Weight_Medal_1"] - merged_df["Mean_Weight_Medal_0"]

merged_df.sort_values(by="diff", ascending = True)

Unnamed: 0_level_0,Mean_Weight_Medal_1,Mean_Weight_Medal_0,diff
Event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Gymnastics Men's Floor Exercise,165.274510,167.676226,-2.401717
Gymnastics Men's Individual All-Around,165.352941,167.669123,-2.316182
Gymnastics Men's Horse Vault,165.615385,167.625000,-2.009615
Gymnastics Men's Team All-Around,165.982877,167.942632,-1.959755
"Wrestling Men's Featherweight, Greco-Roman",164.714286,166.044898,-1.330612
...,...,...,...
Swimming Women's 4 x 100 metres Freestyle Relay,175.461847,172.109306,3.352542
Rhythmic Gymnastics Women's Group,171.420000,168.030189,3.389811
Swimming Men's 4 x 100 metres Freestyle Relay,190.467337,186.833583,3.633753
Cycling Men's 100 kilometres Team Time Trial,180.417476,176.557542,3.859934


In [46]:
# BEGINNING OF UI FOR SELECTION OF SPORT/EVENT
###Creating a UI
import ipywidgets as widgets
from IPython.display import display

model_data = None

# Create a dropdown for selecting between "Sport" and "Event"
category_dropdown = widgets.Dropdown(
    options=["Sport", "Event"],
    value="Sport",  # Default value
    description='Select Sport/Event:',
    disabled=False,
)

# Create a text box for inputting the specific sport or event name (e.g., "Boxing")
input_box = widgets.Text(
    value='',
    placeholder='Type here',
    description='Input specific Sport/Event:',
    disabled=False
)

# Output widget to display results
output = widgets.Output()

# Function to update and display the selection
def on_button_click(b):
    global model_data
    '''
    :param b: button click
    :return: The selection for model
    '''
    with output:
        output.clear_output()
        selected_category = category_dropdown.value
        user_input = input_box.value
        print(f"Selected Category: {selected_category}, Input: {user_input}")

        # Example usage of the selected value in your code
        model_data = subset_enough_winners[subset_enough_winners[selected_category] == user_input]
        display(model_data.head())

# Create a button to trigger the selection
submit_button = widgets.Button(
    description='Submit',
    disabled=False,
    button_style='',  # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Click to submit selection',
)

# Attach the function to button click
submit_button.on_click(on_button_click)

# Display the widgets in the notebook
display(category_dropdown)
display(input_box)
display(submit_button)
display(output)
#model_data = subset_enough_winners[subset_enough_winners["Sport"] == "Boxing"]

Dropdown(description='Select Sport/Event:', options=('Sport', 'Event'), value='Sport')

Text(value='', description='Input specific Sport/Event:', placeholder='Type here')

Button(description='Submit', style=ButtonStyle(), tooltip='Click to submit selection')

Output()

In [47]:
### Getting rid of Unecessary columns for building the models
model_data = model_data.drop(columns = ["Sport", "Event", "Year"])
# Making sure the size medals vs. non-medals is similar
model_data = reduce_no_medal(model_data, len(model_data[model_data["Medal"] == 1]))

# Creating another model 1-hot encoding the country data
one_hot_model = pd.get_dummies(model_data, columns=['NOC'], dtype = 'int')

# Dropping the NOC column in the original data
model_data = model_data.drop(columns = ["NOC"])

In [48]:
## The model without geographic influence

X = model_data.drop(columns=['Medal'])
Y = model_data['Medal']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state = 42)

basic_data = (X_train, X_test, y_train, y_test)

In [49]:
#one_hot_model = one_hot_model.drop(columns = ["Sport", "Event", "Year"])

### The model with Geographic influence
one_hot_X = one_hot_model.drop(columns=['Medal'])
one_hot_Y = one_hot_model['Medal']

one_hot_X_train, one_hot_X_test, one_hot_y_train, one_hot_y_test = train_test_split(one_hot_X, one_hot_Y, test_size=0.1, random_state = 42)

one_hot_data = (one_hot_X_train, one_hot_X_test, one_hot_y_train, one_hot_y_test)

In [50]:
def model_scorer(model_name, model, data):
    (X_train, X_test, y_train, y_test) = data
    model.fit(X_train, y_train)
    return ((model_name, model.score(X_train, y_train), model.score(X_test, y_test)))

In [51]:
model_scores = []

In [52]:
dtc = DecisionTreeClassifier(max_leaf_nodes = 20)

# Fit model to training data

#dtc.fit(X_train, y_train)
#print("training accuracy: {}| testing accuracy: {}".format(round(dtc.score(X_train, y_train),2), round(dtc.score(X_test, y_test),2)))

#print(export_text(dtc, feature_names = X.columns))
model_scores.append(model_scorer("DTC", dtc, basic_data))

In [53]:
one_hot_dtc = DecisionTreeClassifier(max_leaf_nodes = 20)

# Fit model to training data

#one_hot_dtc.fit(one_hot_X_train, one_hot_y_train)
#print("training accuracy: {}| testing accuracy: {}".format(round(one_hot_dtc.score(one_hot_X_train, one_hot_y_train),2), round(one_hot_dtc.score(one_hot_X_test, one_hot_y_test),2)))

#print(export_text(one_hot_dtc, feature_names = one_hot_X.columns))
model_scores.append(model_scorer("one_hot_DTC", one_hot_dtc, one_hot_data))

In [55]:
#importing sklearn model constructors
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


'''ADD SOME CODE HERE'''
#creating logistic regression model
lg = LogisticRegression(max_iter = 1000)

#saving results
model_scores.append(model_scorer("Logistic", lg, basic_data))

#one hot logistic regression
one_hot_lg = LogisticRegression()

#saving results
model_scores.append(model_scorer("one_hot_Logistic", lg, one_hot_data))

In [56]:
#creating LDA model
lda = LinearDiscriminantAnalysis(store_covariance = True)

#saving results
model_scores.append(model_scorer("LDA", lda, basic_data))

#one hot version
one_hot_lda = LinearDiscriminantAnalysis(store_covariance = True)

#saving results
model_scores.append(model_scorer("one_hot_LDA", one_hot_lda, one_hot_data))


In [57]:
#creating random forest model
rfc = RandomForestClassifier()

#saving results
model_scores.append(model_scorer("RFC", rfc, basic_data))

#one hot version
one_hot_rfc = RandomForestClassifier()

#saving results
model_scores.append(model_scorer("one_hot_RFC", one_hot_rfc, one_hot_data))

#creating bagging model
bc = BaggingClassifier()

#saving results
model_scores.append(model_scorer("Bagging", bc, basic_data))

#one hot version
one_hot_bc = BaggingClassifier()

#saving results
model_scores.append(model_scorer("one_hot_Bagging", one_hot_bc, one_hot_data))

#creating boosting model
gbc = GradientBoostingClassifier()

#saving results
model_scores.append(model_scorer("Boosting", gbc, basic_data))

#one hot version
one_hot_gbc = GradientBoostingClassifier()

#saving results
model_scores.append(model_scorer("one_hot_Boosting", one_hot_gbc, one_hot_data))

In [58]:
#creating svc model
svc = SVC()

#saving results
model_scores.append(model_scorer("SVC", svc, basic_data))

#one hot version
one_hot_svc = SVC()

#saving results
model_scores.append(model_scorer("one_hot_SVC", one_hot_svc, one_hot_data))

In [59]:
model_scores = pd.DataFrame(model_scores, columns=['Model', 'Training Accuracy', 'Testing Accuracy'])

In [32]:
model_scores

Unnamed: 0,Model,Training Accuracy,Testing Accuracy
0,DTC,0.580581,0.576577
1,one_hot_DTC,0.703704,0.657658
2,Logistic,0.553554,0.54955
3,one_hot_Logistic,0.713714,0.657658
4,LDA,0.553554,0.558559
5,one_hot_LDA,0.725726,0.603604
6,RFC,0.920921,0.513514
7,one_hot_RFC,0.997998,0.612613
8,Bagging,0.8999,0.522523
9,one_hot_Bagging,0.95996,0.630631


In [34]:
test_df = pd.read_csv("data/cvs")
test_boxing = test_df[test_df['Sport']=="Boxing"].drop(columns=["Year", "Season", "City","Name", "Team","Unnamed: 13", "Event","Sport","NOC","Games"] )
test_boxing['Sex'] = test_boxing['Sex'].apply(lambda x: 0 if x == 'M' else 1)
print(test_boxing.shape)
predictions = lg.predict(test_boxing)
predictions

(0, 4)


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- NOC_AFG
- NOC_AHO
- NOC_ALB
- NOC_ALG
- NOC_AND
- ...


In [60]:
test_boxing_one_hot = test_boxing = test_df[test_df['Sport']=="Boxing"].drop(columns=["Year", "Season", "City","Name", "Team","Unnamed: 13", "Event","Sport","Games"] )
test_boxing_one_hot = pd.get_dummies(test_boxing_one_hot, columns=['NOC'], dtype = 'int')
list_of_col = list(one_hot_X_train.columns.values)
for col in list_of_col:
    test_boxing_one_hot[col] = 0

test_boxing_one_hot = test_boxing_one_hot[one_hot_X_train.columns]

predictions = one_hot_dtc.predict(test_boxing_one_hot)
print(predictions)

  test_boxing_one_hot[col] = 0
  test_boxing_one_hot[col] = 0
  test_boxing_one_hot[col] = 0
  test_boxing_one_hot[col] = 0
  test_boxing_one_hot[col] = 0
  test_boxing_one_hot[col] = 0
  test_boxing_one_hot[col] = 0
  test_boxing_one_hot[col] = 0
  test_boxing_one_hot[col] = 0
  test_boxing_one_hot[col] = 0
  test_boxing_one_hot[col] = 0
  test_boxing_one_hot[col] = 0
  test_boxing_one_hot[col] = 0
  test_boxing_one_hot[col] = 0
  test_boxing_one_hot[col] = 0
  test_boxing_one_hot[col] = 0
  test_boxing_one_hot[col] = 0
  test_boxing_one_hot[col] = 0
  test_boxing_one_hot[col] = 0
  test_boxing_one_hot[col] = 0
  test_boxing_one_hot[col] = 0
  test_boxing_one_hot[col] = 0
  test_boxing_one_hot[col] = 0
  test_boxing_one_hot[col] = 0


ValueError: Found array with 0 sample(s) (shape=(0, 124)) while a minimum of 1 is required by DecisionTreeClassifier.