In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [56]:
import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_rows", 15)

In [57]:
# Read the results table; the location is relative to the notebook's working directory.
RESULTS_CSV = "../data/2024_result/2024_result.csv"
df = pd.read_csv(RESULTS_CSV)

In [58]:
# Host flag: 1 for rows whose Team is France, 0 for every other row.
df['Host'] = df['Team'].eq('France').astype('int64')

In [59]:
# Preview the frame; the display.max_rows option set earlier (15) truncates the output.
df

Unnamed: 0,Bronze,Silver,Gold,ID,Sex,Age,Height,Weight,Year,Medal,BMI,Host,Event,Name,Team,Sport
0,0.900000,0.0,0.100000,110716,0,27.0,172.0,67.0,2016,0,22.647377,0,Alpine Skiing Women's Super G,Georgia Simmerling,Canada,Alpine Skiing
1,0.900000,0.0,0.100000,110716,0,27.0,172.0,67.0,2016,0,22.647377,0,Alpine Skiing Women's Super G,Georgia Simmerling,Canada,Alpine Skiing
2,0.900000,0.0,0.100000,110716,0,27.0,172.0,67.0,2016,0,22.647377,0,Alpine Skiing Women's Super G,Georgia Simmerling,Canada,Alpine Skiing
3,0.100000,0.0,0.900000,124031,1,27.0,180.0,83.0,2016,2,25.617284,1,Archery Men's Individual,Jean-Charles Valladont,France,Archery
4,0.100000,0.9,0.000000,64487,1,23.0,181.0,84.0,2016,1,25.640243,0,Archery Men's Individual,Ku Bon-Chan,South Korea,Archery
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
844,0.000000,0.4,0.600000,134776,0,23.0,164.0,58.0,2016,2,21.564545,0,"Wrestling Women's Lightweight, Freestyle",Valeriya Sergeyevna Zholobova-Koblova,Russia,Wrestling
845,0.716667,0.0,0.283333,111288,0,31.0,160.0,58.0,2016,0,22.656250,0,"Wrestling Women's Lightweight, Freestyle",Nataliya Siniin,Ukraine,Wrestling
846,0.200000,0.1,0.700000,74264,0,23.0,163.0,65.0,2016,2,24.464602,0,"Wrestling Women's Middleweight, Freestyle",Mariya Ruslanovna Mamoshuk,Belarus,Wrestling
847,0.600000,0.2,0.200000,58163,0,21.0,160.0,61.0,2016,1,23.828125,0,"Wrestling Women's Middleweight, Freestyle",Risako Kawai,Japan,Wrestling


In [60]:
# Store Sex with the pandas 'category' dtype instead of its original dtype.
df = df.astype({'Sex': 'category'})

In [61]:
# Remove the 'Medal' column from the working frame.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the column has already been dropped.
df = df.drop(columns=['Medal'], errors='ignore')

In [62]:
# For each row, record which of the three medal columns holds the largest value.
medal_columns = ['Bronze', 'Silver', 'Gold']
df['pred_medal'] = df[medal_columns].idxmax(axis='columns')

In [63]:
# Tally how many rows fall into each predicted medal label.
pred_medal_counts = df['pred_medal'].value_counts()
pred_medal_counts

pred_medal
Bronze    331
Gold      266
Silver    252
Name: count, dtype: int64

In [64]:
# Count rows per (Sport, predicted medal) and pivot the medal labels into
# columns; combinations that never occur are filled with 0.
sport_medal_sizes = df.groupby(['Sport', 'pred_medal']).size()
medals_by_sport = sport_medal_sizes.unstack().fillna(0)

# Order the sports from the largest to the smallest overall total.
sport_totals = medals_by_sport.sum(axis='columns')
medals_by_sport_sorted = sport_totals.sort_values(ascending=False).index
medals_by_sport_sorted_df = medals_by_sport.loc[medals_by_sport_sorted]

# Stacked bar chart of predicted medals by sport (sorted)
fig = px.bar(
    data_frame=medals_by_sport_sorted_df,
    barmode='stack',
    title='Predicted Medals by Sport',
)
fig.show()

In [65]:
# Keep only the sports with the most predicted medals.
# pred_medal holds string labels, so the per-sport tally must use count();
# sum() would concatenate the labels and rank sports by that text instead of
# by how many predictions they have.
TOP_N_SPORTS = 5
sports_with_predictions = (
    df.groupby('Sport')['pred_medal']
    .count()
    .sort_values(ascending=False)
    .index[:TOP_N_SPORTS]
)
df_filtered = df[df['Sport'].isin(sports_with_predictions)]

# Plot predicted medals by sport, one facet per Sex value
fig = px.bar(df_filtered, x='Sport', color='pred_medal', facet_col='Sex', title='Predicted Medals by Sport: Sex',
             labels={'pred_medal': 'Medal Type', 'Sport': 'Sport'}, category_orders={'pred_medal': ['Bronze', 'Silver', 'Gold']},
             color_discrete_sequence=['lightblue', 'lightgreen', 'lightcoral'])
fig.update_layout(xaxis_title=None, yaxis_title='Number of Medals', legend_title='Medal Type')
fig.show()


In [69]:
# Scatter plot of Age vs predicted medal, colored by sex.
# Sex is stored as 0/1 codes, so the 'M'/'F' entries in category_orders below
# never matched any value and the legend showed raw codes. Relabel Sex on a
# plotting copy; df itself is left unchanged.
# NOTE(review): the 0 -> 'F', 1 -> 'M' mapping is read off the Women's/Men's
# event rows in the frame preview -- TODO confirm against the data dictionary.
SEX_LABELS = {0: 'F', 1: 'M'}
age_plot_df = df.assign(Sex=df['Sex'].map(SEX_LABELS).astype(object))

fig = px.scatter(age_plot_df, x='Age', y='pred_medal', color='Sex', title='Age vs Predicted Medal by Sex',
                 labels={'Age': 'Age', 'pred_medal': 'Predicted Medal', 'Sex': 'Sex'},
                 color_discrete_sequence=['magenta', 'darkblue'],
                 category_orders={'pred_medal': ['Bronze', 'Silver', 'Gold'], 'Sex': ['M', 'F']})
fig.update_traces(marker=dict(size=8))
fig.update_layout(xaxis_title='Age', yaxis_title='Predicted Medal', legend_title='Sex')
fig.show()


In [67]:
# Age against Height, one color per predicted medal label
fig = px.scatter(
    data_frame=df,
    x='Age',
    y='Height',
    color='pred_medal',
    title='Age vs Height by Predicted Medal',
)
fig.update_layout(xaxis_title_text='Age', yaxis_title_text='Height')
fig.show()

# Weight distribution for each predicted medal label
fig = px.box(
    data_frame=df,
    x='pred_medal',
    y='Weight',
    title='Weight Distribution by Predicted Medal',
)
fig.update_layout(xaxis_title_text='Predicted Medal', yaxis_title_text='Weight')
fig.show()
