# Importing

**In this notebook, I'll use <span style='color:red'>bold red colored text</span> to indicate conclusions or insights I drew from the dataset!**

In [None]:
import pandas as pd
import numpy as np

import missingno as msno
import plotly.graph_objs as go
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the CrossFit athletes CSV. The path follows the Kaggle input layout,
# so it needs adjusting when the notebook is run outside Kaggle.
df = pd.read_csv('/kaggle/input/crossfit-athletes/athletes.csv')
# Preview the first rows to confirm the file parsed as expected.
df.head()

In [None]:
# (rows, columns) of the frame as loaded, before any filtering.
df.shape

# Visualizing missing values

In [None]:
# Nullity matrix: shows, per column, where values are present and where they are missing.
msno.matrix(df)

In [None]:
# Bar chart of per-column completeness, sorted in descending order.
msno.bar(df, sort="descending")

In [None]:
# Percentage of missing entries per feature.
# The mean of the boolean null-mask is exactly (null count / row count).
df.isnull().mean() * 100

In [None]:
# filthy50 column has the highest number of missing values,
# so printing how many non-null values it has
# (Series.count() tallies only the non-null entries).

df['filthy50'].count()

**<span style='color:red'>While some of the feature columns in the dataset have a high percentage of missing values, it is important to note that the actual number of non-null values is still considerable for analysis. For example, the 'filthy50' column has a missing value percentage of 95.4235%, but the remaining 4.5765% of non-null values still amount to 19,359 observations. This should be sufficient for our analysis, and we can still gain valuable insights from this feature.</span>**

**<span style='color:red'>NOTE: I won't be doing any kind of imputation or manual insertion of information for any of the missing values. I will analyze it as it is.</span>**

# Exploratory Data Analysis

# 'Gender' insights

In [None]:
# Frequency of each distinct gender label (missing values are excluded by default).
df['gender'].value_counts()

**'--' should be null value. So, changing it.**

In [None]:
# Treat the '--' placeholder as a missing value. Series.mask blanks every entry
# where the condition holds in one vectorised pass, instead of calling a Python
# lambda per row; all other labels (and existing NaN) are left untouched.
df['gender'] = df['gender'].mask(df['gender'] == '--')

In [None]:
# Keep only athletes whose gender is known, then chart the split.
nonNull_gender = df[df['gender'].notna()]
gender_title = f'Gender distribution of {nonNull_gender.shape[0]} crossfit athelets'
fig = px.pie(nonNull_gender, names='gender', title=gender_title)
fig.show()

# 'Age' Insights

In [None]:
# Summary statistics for age, used to spot implausible extremes.
df['age'].describe()

**'Age' feature has some heavy outliers and is right skewed.**

In [None]:
# How often each age above 50 occurs, to judge where the outliers begin.
df.loc[df['age'] > 50, 'age'].value_counts()

In [None]:
# Keep athletes younger than 60. Rows with a missing age are dropped too,
# because a comparison against NaN evaluates to False.
df = df.loc[df['age'].lt(60)]

In [None]:
# Extract the non-null ages once so the title, the median and the quartiles all
# share one Series instead of each re-running dropna on the full frame.
age_values = df['age'].dropna()

fig = px.histogram(df, x='age', title=f'Age distribution of {age_values.shape[0]} crossfit athelets')

median = np.median(age_values)
q1, q3 = np.percentile(age_values, [25, 75])

# Mark the median and the interquartile bounds on the histogram.
fig.add_vline(x=median, line_dash="dash", line_color="black", annotation_text='Median')
fig.add_vline(x=q1, line_dash="dash", line_color="green", annotation_text='25%')
fig.add_vline(x=q3, line_dash="dash", line_color="red", annotation_text='75%')

fig.show()

**<span style='color:red'>The age distribution of CrossFit athletes is centered around 31 years old, with the middle half of athletes falling between 27 and 37 years. The most common age among CrossFit athletes is 29.</span>**

# 'weight' Insights

In [None]:
# Summary statistics for weight, used to spot implausible extremes.
df['weight'].describe()

In [None]:
# How often each weight above 351 occurs, to inspect the upper outliers.
df.loc[df['weight'] > 351, 'weight'].value_counts()

In [None]:
# Keep athletes lighter than 351. Rows with a missing weight are dropped too,
# because a comparison against NaN evaluates to False.
df = df.loc[df['weight'].lt(351)]

In [None]:
# Extract the non-null weights once so the title, the median and the quartiles
# all share one Series instead of each re-running dropna on the full frame.
weight_values = df['weight'].dropna()

fig = px.histogram(df, x='weight', title=f'Weight distribution of {weight_values.shape[0]} crossfit athelets')

median = np.median(weight_values)
q1, q3 = np.percentile(weight_values, [25, 75])

# Mark the median and the interquartile bounds on the histogram.
fig.add_vline(x=median, line_dash="dash", line_color="black", annotation_text='Median')
fig.add_vline(x=q1, line_dash="dash", line_color="green", annotation_text='25%')
fig.add_vline(x=q3, line_dash="dash", line_color="red", annotation_text='75%')

fig.show()

**<span style='color:red'>CrossFit athletes' weight centers around 170 lbs and typically falls within the range of 145 lbs to 192 lbs, with the weight of 185 lbs being the most common among the athletes.</span>**

# 'height' Insights

In [None]:
# Summary statistics for height, used to spot implausible extremes.
df['height'].describe()

In [None]:
# Inspect the upper tail: frequency of every height above 80.
df.loc[df['height'] > 80, 'height'].value_counts()

In [None]:
# Drop the upper outliers: keep heights strictly below 85.
# Rows with a missing height are dropped too (NaN comparisons are False).
df = df.loc[df['height'].lt(85)]

In [None]:
# Inspect the lower tail: frequency of every height below 55, most common first.
df.loc[df['height'] < 55, 'height'].value_counts().sort_values(ascending=False)

In [None]:
# Removing the lower outliers (keeps only heights strictly above 55)
df = df[ (df['height']>55) ]

In [None]:
# Extract the non-null heights once so the title, the median and the quartiles
# all share one Series instead of each re-running dropna on the full frame.
height_values = df['height'].dropna()

fig = px.histogram(df, x='height', title=f'Height distribution of {height_values.shape[0]} crossfit athelets')

median = np.median(height_values)
q1, q3 = np.percentile(height_values, [25, 75])

# Mark the median and the interquartile bounds on the histogram.
fig.add_vline(x=median, line_dash="dash", line_color="black", annotation_text='Median')
fig.add_vline(x=q1, line_dash="dash", line_color="green", annotation_text='25%')
fig.add_vline(x=q3, line_dash="dash", line_color="red", annotation_text='75%')

fig.show()

**<span style='color:red'>The height distribution of CrossFit athletes is centered around 5 feet and 9 inches and typically ranges from 5 feet and 6 inches to 6 feet, with the majority of athletes being around 5 feet 10 inches tall. Seems like most CrossFit athletes are not particularly tall.</span>**

# Stats vs performance

In [None]:
# List the available column names to choose features from.
df.columns

In [None]:
# Physical stats plus benchmark results to compare against each other.
# This selection is a separate frame built from df as it stands in this cell,
# so filters applied to df afterwards are not reflected in it.
stats_and_performance_features = df.loc[:, [
    'age', 'height', 'weight',
    'run400', 'run5k',
    'snatch', 'deadlift', 'backsq',
    'pullups',
]]

**I'll first clean the selected features manually by finding the lower and upper boundaries of each feature and removing the entries that fall outside of them.**

In [None]:
# run400 feature: keep values strictly between 44 and 150.
# Rows with a missing run400 fail both comparisons and are dropped as well.
df = df[(df['run400'] > 44) & (df['run400'] < 150)]

In [None]:
# run5k feature: keep values strictly between 910 and 2101.
# Rows with a missing run5k fail both comparisons and are dropped as well.
df = df[(df['run5k'] > 910) & (df['run5k'] < 2101)]

In [None]:
# snatch feature: keep values strictly between 55 and 301.
# Rows with a missing snatch fail both comparisons and are dropped as well.
df = df[(df['snatch'] > 55) & (df['snatch'] < 301)]

In [None]:
# deadlift feature: keep values strictly between 160 and 630.
# Rows with a missing deadlift fail both comparisons and are dropped as well.
df = df[(df['deadlift'] > 160) & (df['deadlift'] < 630)]

In [None]:
# backsq feature: keep values strictly between 124 and 540.
# Rows with a missing backsq fail both comparisons and are dropped as well.
df = df[(df['backsq'] > 124) & (df['backsq'] < 540)]

In [None]:
# pullups feature: keep values strictly between 0 and 80.
# Rows with a missing pullups value fail both comparisons and are dropped as well.
df = df[(df['pullups'] > 0) & (df['pullups'] < 80)]

In [None]:
# Summary statistics for pullups after the outlier filter above.
df['pullups'].describe()

In [None]:
# Frequency of the lowest pull-up counts (below 5) that remain in the data.
df.loc[df['pullups'] < 5, 'pullups'].value_counts()

In [None]:
# Re-select the feature columns from the *current* df so the correlations are
# computed on the rows that survived the outlier filtering.
# stats_and_performance_features itself was sliced out of df before the
# run400/run5k/snatch/deadlift/backsq/pullups filters ran, so plotting it
# directly would ignore that cleaning.
stats_and_performance_clean = df[stats_and_performance_features.columns]

# Explicit figure/axes handles keep the heatmap and its title on the same axes.
heatmap_fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(stats_and_performance_clean.corr(), cmap='coolwarm', annot=True, center=0, ax=ax)

ax.set_title('Stats vs Performance')
plt.show()

**<span style='color:red'>As expected, weight and height are highly correlated.</span>**

**<span style='color:red'>Good back squat performance transfers best to deadlift and snatch performances, with the greater transfer being on the deadlift.</span>**

**<span style='color:red'>Similarly, good deadlift performance transfers best to back squat and then slightly less to snatch performance.</span>**

**<span style='color:red'>Also, a good snatch transfers nearly equally to deadlift and back squat, with the transfer to back squat being slightly greater.</span>**

**<span style='color:red'>Interestingly, CrossFit athletes with a higher 5 km run time tend to do well in the deadlift.</span>**