In [None]:
import datetime as dt
import os
from dis import show_code

import altair as alt
import matplotlib.pyplot as plt
import plotly.express as px
import polars as pl
import polars.selectors as cs
import seaborn as sns
from seaborn import color_palette

import hvplot.polars

In [None]:
# Location of the athlete_events.csv file read by the next cell.
# The default is a machine-specific absolute path; set the ATHLETE_EVENTS_CSV
# environment variable before starting the kernel to point the notebook at a
# different copy without editing this cell.
athlete_events_path = os.environ.get(
    "ATHLETE_EVENTS_CSV",
    r"F:\Datasets\CSV datasets\Olympic_Athletes\athlete_events.csv",
)

In [None]:
# Load the CSV into a polars DataFrame; the literal string "NA" is read as null.
df = pl.read_csv(
    athlete_events_path,
    null_values="NA",
)

In [None]:
# Show the loaded frame as this cell's output.
df

In [None]:
# Column names and dtypes as produced by the CSV reader, before any casting.
df.collect_schema()

In [None]:
# Narrow the numeric columns to smaller dtypes; every other column is left as read.
# Columns that share a target dtype are cast together.
df = df.with_columns(
    pl.col('ID').cast(pl.UInt32),
    pl.col('Age', 'Year').cast(pl.UInt16),
    pl.col('Height', 'Weight').cast(pl.Float32),
)

In [None]:
# Re-inspect the schema after the casts applied in the previous cell.
df.collect_schema()

In [None]:
# Estimated in-memory footprint of the frame, reported in megabytes.
df.estimated_size(unit='mb')

In [None]:
# Row count for each distinct Year, largest count first.
(
    df
    .group_by('Year')
    .len()
    .sort('len', descending=True)
)

In [None]:
# Row count for each (Name, Team) pair, largest count first.
(
    df
    .group_by(['Name', 'Team'])
    .len()
    .sort('len', descending=True)
)

In [None]:
# Row count per Sex value; the count lands in a column named 'len'.
# Reused by the three pie-chart cells that follow.
count_genre = df.group_by('Sex').len()

In [None]:
# Static matplotlib pie: one wedge per Sex value, sized by its row count and
# annotated with its percentage share.
plt.figure(figsize=(8, 8))
plt.pie(
    count_genre['len'],
    labels=count_genre['Sex'],
    autopct='%1.1f%%',
)
plt.title('Distribution of Athletes by Gender')
plt.legend()
plt.show()


In [None]:
# Interactive plotly version of the gender pie chart.
fig = px.pie(
    count_genre,
    names='Sex',
    values='len',
    title='Distribution of Athletes by Gender',
    labels=dict(len='Count', Sex='Gender'),
)

# update_layout is the cell's last expression, so its return value is what gets displayed.
fig.update_layout(width=700, height=700)

In [None]:
# Altair pie chart of row counts per Sex value.
#
# Every .mark_arc() call replaces the chart's mark and every .encode() call
# overrides channels that were already set, so only the last definition of each
# ever took effect. The mark and the channels are therefore declared once, with
# the values the original chain ended up with.
# The `text` channel is omitted: it has no visible effect on an arc mark
# (drawing value labels would need a separate mark_text layer).
alt.Chart(count_genre).mark_arc(
    innerRadius=0,
    stroke="#fff",
    padAngle=0.03,
).encode(
    theta=alt.Theta(field='len', type='quantitative', stack=True),
    color='Sex',
    tooltip=['Sex', 'len'],
).properties(
    title='Distribution of Athletes by Gender',
    width=400,
    height=400,
).configure_view(
    strokeWidth=0
)


In [None]:
# Mean Age per Sex value, rounded to two decimals, stored in column 'mean_age'.
age_mean = (
    df
    .group_by('Sex')
    .agg(mean_age=pl.col('Age').mean().round(2))
)

age_mean

In [None]:
# Pie chart with one wedge per Sex value, each annotated with that group's mean age.
plt.figure(figsize=(8, 8))

# group_by does not guarantee row order, so each wedge's label is derived from
# the row's own Sex value instead of a fixed ['Male', 'Female'] list, which could
# end up attached to the wrong wedge.
# Assumes the dataset encodes Sex as 'M'/'F' (TODO confirm); any other value is
# shown verbatim.
sex_display_names = {'M': 'Male', 'F': 'Female'}
wedge_labels = [sex_display_names.get(code, code) for code in age_mean['Sex']]

# autopct receives each wedge's percentage of the total; multiplying by the
# total converts it back to the underlying mean age. Computed once up front
# rather than on every callback invocation.
total_mean_age = age_mean['mean_age'].sum()

plt.pie(
    age_mean['mean_age'],
    labels=wedge_labels,
    autopct=lambda pct: f'{(pct / 100) * total_mean_age:.2f}',
)
plt.title('Distribution of Athletes by Gender (Mean Age)')
plt.legend()
plt.show()


In [None]:
# Plotly bar chart of mean age per Sex value, with the value printed above each bar.
fig = px.bar(
    age_mean,
    x='Sex',
    y='mean_age',
    color='Sex',
    text='mean_age',
)
fig.update_traces(textposition='outside')
fig.update_layout(width=700, height=700)

fig.show()


In [None]:
# Mean Height per Sex value, rounded to two decimals.
(
    df
    .group_by('Sex')
    .agg(mean_height=pl.col('Height').mean().round(2))
)

In [None]:
# Row count for each Team, largest count first.
(
    df
    .group_by('Team')
    .len()
    .sort('len', descending=True)
)

In [None]:
# Row count for each (Team, Sex) pair, largest count first.
(
    df
    .group_by(['Team', 'Sex'])
    .len()
    .sort('len', descending=True)
)

In [None]:
# Keep only the rows whose Medal value is 'Gold'.
gold_medal = df.filter(pl.col('Medal') == 'Gold')

# Ten teams with the most gold-medal rows, highest count first.
top_10_gold_winners = (
    gold_medal
    .group_by('Team')
    .len()
    .sort('len', descending=True)
    .head(10)
)

top_10_gold_winners

In [None]:
# Pie chart of the ten teams with the most gold-medal rows.
plt.figure(figsize=(10, 8))

gold_teams = top_10_gold_winners['Team']
gold_counts = top_10_gold_winners['len']

# Wedges show percentage shares; the legend carries the absolute counts.
plt.pie(gold_counts, labels=gold_teams, autopct='%1.1f%%')

legend_labels = [f"{team}: {count}" for team, count in zip(gold_teams, gold_counts)]

# Anchor the legend just outside the axes so it does not cover the pie.
plt.legend(
    legend_labels,
    loc='upper left',
    bbox_to_anchor=(1.05, 1),
    borderaxespad=0.,
)
plt.title('Top 10 Gold Medal Winners by Team')
plt.show()

In [None]:
# Horizontal bar chart of the ten teams with the most gold-medal rows,
# one colour per team, with the counts listed in the legend.
plt.figure(figsize=(10, 6), dpi=133)

# One Set2 colour per bar, converted to hex strings for matplotlib.
palette = color_palette("Set2", len(top_10_gold_winners))
colors = palette.as_hex()

bars = plt.barh(top_10_gold_winners['Team'], top_10_gold_winners['len'], color=colors)

# Build the labels straight from the polars columns. The previous version reused
# single quotes inside a single-quoted f-string, which is a SyntaxError before
# Python 3.12, and converted the frame to pandas only to iterate over its rows.
legend_labels = [
    f"{team} ({count})"
    for team, count in zip(top_10_gold_winners['Team'], top_10_gold_winners['len'])
]

# Anchor the legend just outside the axes so it does not cover the bars.
plt.legend(bars, legend_labels, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.title('Top 10 Gold Medal Winners by Team')
plt.xlabel('Count')
plt.ylabel('Team')
plt.show()

In [None]:
# Ten teams with the most rows whose Medal value is 'Silver', highest count first.
top_10_silver_medal = (
    df
    .filter(pl.col('Medal') == 'Silver')
    .group_by('Team')
    .len()
    .sort('len', descending=True)
    .head(10)
)

top_10_silver_medal

In [None]:
# Plotly pie chart of the top-10 silver-medal teams.
fig = px.pie(
    top_10_silver_medal,
    names='Team',
    values='len',
    title='Top 10 Silver Medal Winners by Team',
)
fig.update_layout(width=700, height=700)

fig.show()

In [None]:
# Plotly bar chart of the top-10 silver-medal teams, with each count printed on its bar.
fig = px.bar(
    top_10_silver_medal,
    x='Team',
    y='len',
    color='Team',
    text='len',
)
fig.update_layout(width=900, height=700)

fig.show()

In [None]:
# Ten teams with the most rows whose Medal value is 'Bronze', highest count first.
bronze_top_10_winners = (
    df
    .filter(pl.col('Medal') == 'Bronze')
    .group_by('Team')
    .len()
    .sort('len', descending=True)
    .head(10)
)

bronze_top_10_winners

In [None]:
# Interactive Altair bar chart of the top-10 bronze-medal teams:
# Team on the x axis, row count on the y axis, both shown in the tooltip.
(
    alt.Chart(bronze_top_10_winners)
    .mark_bar()
    .encode(
        x=alt.X('Team'),
        y=alt.Y('len'),
        tooltip=['Team', 'len'],
    )
    .properties(width=600, height=600)
    .interactive()
)
