In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime

In [None]:
# Configuration flag read by later cells:
#   True  -> the dataset is restricted to rows whose COUNTRY column equals 'CH'
#            and the country-distribution plot is skipped
#   False -> all rows are kept and the country-distribution plot is produced
only_CH = True

In [None]:
# Read the demographics table from disk (path is relative to the working directory)
df = pd.read_csv("../original_data/mhs_demographics_sorted.csv")

# When requested, keep only the rows whose COUNTRY value is 'CH'
if only_CH:
    is_ch_row = df["COUNTRY"].eq("CH")
    df = df[is_ch_row]

# Quick sanity check: first rows and total row count
print(df.head())
print(f"Number of entries in dataset: {len(df)}")

In [None]:
# Show how many rows fall into each FITNESS_LEVEL category
df.loc[:, "FITNESS_LEVEL"].value_counts()

In [None]:
# Row count per GENDER category
gender_counts = df["GENDER"].value_counts()

# Bar chart drawn through the object-oriented matplotlib interface
fig, ax = plt.subplots()
ax.bar(gender_counts.index, gender_counts.values, edgecolor="black")
ax.set_xlabel("Gender")
ax.set_ylabel("Count")
ax.set_title("Gender Distribution")
fig.savefig("descriptive_statistics_plots/gender_distribution.png", dpi=300, bbox_inches="tight")
plt.show()

# Also print the raw numbers behind the chart
print(gender_counts)

In [None]:
# The country breakdown is skipped when the data was restricted to 'CH' only
if not only_CH:
    # Row count per COUNTRY value
    country_counts = df["COUNTRY"].value_counts()
    top_ten = country_counts.iloc[:10]

    # Bar chart of the first ten entries of the count table
    fig, ax = plt.subplots()
    ax.bar(top_ten.index, top_ten.values, edgecolor="black")
    ax.set_xlabel("Country")
    ax.set_ylabel("Count")
    ax.set_title("Country Distribution of 10 most frequent countries")
    fig.savefig("descriptive_statistics_plots/country_distibution_10_most_frequent.png", dpi=300, bbox_inches="tight")
    plt.show()

    # Print the first twenty entries of the count table
    print(country_counts.iloc[:20])

In [None]:
# Calculate each row's age (in completed years) at the midpoint between its
# first and last submission date. Adds/overwrites the columns
# FIRST_SUBMISSION_DATE, LAST_SUBMISSION_DATE, BIRTHDAY (parsed to datetime),
# MIDPOINT_DATE and AGE on `df`.

# Parse the three date columns; re-running this on already-parsed columns is harmless
for date_col in ("FIRST_SUBMISSION_DATE", "LAST_SUBMISSION_DATE", "BIRTHDAY"):
    df[date_col] = pd.to_datetime(df[date_col])

# Midpoint of the submission period
df["MIDPOINT_DATE"] = df["FIRST_SUBMISSION_DATE"] + (df["LAST_SUBMISSION_DATE"] - df["FIRST_SUBMISSION_DATE"]) / 2

# Age = difference in calendar years, minus one when the birthday has not yet
# been reached in the midpoint year. This uses the vectorised .dt accessors
# instead of a row-wise df.apply(axis=1): it is much faster on large frames and
# still yields a Series when the frame is empty. Rows with a missing date
# produce NaN, as before.
midpoint = df["MIDPOINT_DATE"].dt
birthday = df["BIRTHDAY"].dt
birthday_not_reached = (midpoint.month < birthday.month) | (
    (midpoint.month == birthday.month) & (midpoint.day < birthday.day)
)
df["AGE"] = midpoint.year - birthday.year - birthday_not_reached.astype(int)

In [None]:
# Histogram of AGE in 10-year bins, followed by the matching per-bin counts.

# Bin edges 0, 10, 20, ... with the last edge strictly above the maximum age.
# The previous `range(0, max + 10, 10)` ended exactly on the maximum whenever
# it was a multiple of 10: plt.hist (last bin closed on both sides) counted
# that age in the preceding decade, while pd.cut(right=False) dropped it, so
# the plot and the printed table disagreed. int() also keeps range() working
# when AGE is stored as float.
max_age = int(df["AGE"].max())
bins = range(0, (max_age // 10 + 1) * 10 + 1, 10)

plt.hist(df["AGE"], bins=bins, edgecolor="black")
plt.xlabel("Age [years]")
plt.ylabel("Frequency")
plt.title("Age Distribution (10-year intervals)")
plt.savefig("descriptive_statistics_plots/age_distribution_10_year_intervals.png", dpi=300, bbox_inches="tight")
plt.show()

# Same bins as the histogram, left-closed intervals [0, 10), [10, 20), ...
age_bins = pd.cut(df["AGE"], bins=bins, right=False)
age_counts = age_bins.value_counts().sort_index()
print(age_counts)

In [None]:
# Side-by-side age histogram (10-year bins) for the 'male' and 'female' rows.
male_ages = df.loc[df["GENDER"] == "male", "AGE"]
female_ages = df.loc[df["GENDER"] == "female", "AGE"]

# Bin edges 0, 10, 20, ... with the last edge strictly above the maximum age,
# so that a maximum age that is a multiple of 10 gets its own decade bin
# instead of being merged into the previous one by plt.hist's closed last bin.
max_age = int(df["AGE"].max())
bins = range(0, (max_age // 10 + 1) * 10 + 1, 10)

plt.hist([male_ages, female_ages], bins=bins, edgecolor="black", label=["male", "female"], alpha=0.7)
plt.xlabel("Age [years]")
plt.ylabel("Frequency")
plt.title("Age Distribution by Gender (10-year intervals)")
plt.legend()
# Saved under its own file name: the four-gender figure produced further down
# writes to ".../age_distribution_10_year_intervals_by_gender.png", which used
# to silently overwrite this male/female figure.
plt.savefig("descriptive_statistics_plots/age_distribution_10_year_intervals_male_female.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Age histogram (10-year bins) split by the four listed GENDER labels, followed
# by the per-bin counts for each label.

# Bin edges 0, 10, 20, ... with the last edge strictly above the maximum age.
# With the previous `range(0, max + 10, 10)` a maximum age that is a multiple
# of 10 sat exactly on the last edge: plt.hist counted it (last bin closed),
# pd.cut(right=False) did not, so the plot and the printed counts disagreed.
max_age = int(df["AGE"].max())
bins = range(0, (max_age // 10 + 1) * 10 + 1, 10)

genders = ["male", "female", "non-binary", "i prefer not to choose"]
# AGE values per gender label; a label absent from the data yields an empty Series
gender_ages = {g: df.loc[df["GENDER"] == g, "AGE"] for g in genders}

plt.figure(figsize=(10, 6))
plt.hist(
    [gender_ages[g] for g in genders],
    bins=bins,
    edgecolor="black",
    label=genders,
    alpha=0.7
)
plt.xlabel("Age [years]")
plt.ylabel("Frequency")
plt.title("Age Distribution by Gender (10-year intervals)")
plt.legend()
plt.savefig("descriptive_statistics_plots/age_distribution_10_year_intervals_by_gender.png", dpi=300, bbox_inches="tight")
plt.show()

# Tabulate the same bins (left-closed intervals) for every gender label
for g in genders:
    print(f"\n{g.title()} counts by age bin:")
    binned = pd.cut(gender_ages[g], bins=bins, right=False)
    print(binned.value_counts().sort_index())

In [None]:
# Female-to-male ratio for every individual age (in years).

# Rows = AGE, columns = GENDER label, cells = number of rows (0 when absent)
gender_counts = df.pivot_table(index="AGE", columns="GENDER", aggfunc="size", fill_value=0)
# errors="ignore": these labels may be missing from a filtered dataset, in
# which case a plain drop() would raise KeyError
gender_counts = gender_counts.drop(columns=["i prefer not to choose", "non-binary"], errors="ignore")

# Ratio is undefined where there are no males (division yields inf or NaN);
# mask those ages with NaN so the line plot simply shows a gap. This replaces
# `replace([inf, nan], None, inplace=True)`, whose result depends on the pandas
# version (forward-fill in older releases, object-dtype None in newer ones).
gender_counts["f/m ratio"] = (gender_counts["female"] / gender_counts["male"]).where(gender_counts["male"] > 0)

plt.plot(gender_counts.index, gender_counts["f/m ratio"])
plt.xlabel("Age [years]")
plt.ylabel("female to male ratio")
plt.title("Female-to-Male Ratio for each Age (year)")
plt.savefig("descriptive_statistics_plots/female_to_male_ratio_year.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Female-to-male ratio aggregated into 10-year age groups.

# Rows = AGE, columns = GENDER label, cells = number of rows (0 when absent)
gender_counts = df.pivot_table(index="AGE", columns="GENDER", aggfunc="size", fill_value=0)
# errors="ignore": these labels may be missing from a filtered dataset
gender_counts = gender_counts.drop(columns=["i prefer not to choose", "non-binary"], errors="ignore")
# Lower bound of the decade each age belongs to (0, 10, 20, ...)
gender_counts["Age Group"] = (gender_counts.index // 10) * 10

# Sum the female / male counts within each decade
gender_counts_binned = gender_counts.groupby("Age Group")[["female", "male"]].sum()
# The ratio must be computed from the *binned* sums. Dividing the unbinned
# `gender_counts` columns (as before) aligned on the index and therefore
# plotted the single-year ratios at ages 0, 10, 20, ... instead of the
# decade ratios. Decades without males are masked with NaN (gap in the plot).
gender_counts_binned["f/m ratio"] = (
    gender_counts_binned["female"] / gender_counts_binned["male"]
).where(gender_counts_binned["male"] > 0)

plt.plot(gender_counts_binned.index, gender_counts_binned["f/m ratio"])
plt.xlabel("Age [years]")
plt.ylabel("female to male ratio")
# Stray closing parenthesis removed from the title
plt.title("Female-to-Male Ratio for each 10-Year Age Group")
plt.savefig("descriptive_statistics_plots/female_to_male_ratio_year_10years.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Histogram of WHOOP_HEIGHT in steps of `bin_width`, plus the per-bin counts.
bin_width = 0.1

# create bins for WHOOP_HEIGHT with step of 0.1 meters
min_height = df["WHOOP_HEIGHT"].min()
max_height = df["WHOOP_HEIGHT"].max()
bins = np.arange(min_height, max_height + bin_width, bin_width)
# Guarantee that the last edge lies strictly above the maximum. Otherwise the
# maximum sits on the last edge, where plt.hist counts it (last bin closed) but
# pd.cut(right=False) drops it; it also ensures at least two edges when all
# values are identical.
if bins[-1] <= max_height:
    bins = np.append(bins, bins[-1] + bin_width)


plt.hist(df["WHOOP_HEIGHT"], bins=bins, edgecolor="black")
plt.xlabel("Height [m]")
plt.ylabel("Frequency")
#plt.title("WHOOP Height Distribution (0.1 m intervals)")
plt.savefig("descriptive_statistics_plots/whoop_height_distribution_0.1m_intervals.png", 
           dpi=300, bbox_inches='tight', pad_inches=0.1, facecolor='white')
plt.show()

# Tabulate the same bins as left-closed intervals
height_bins = pd.cut(df["WHOOP_HEIGHT"], bins=bins, right=False)
height_counts = height_bins.value_counts().sort_index()
print(height_counts)

In [None]:
# Histogram of WHOOP_WEIGHT in steps of `bin_width`, plus the per-bin counts.
bin_width = 5

# create bins for WHOOP_WEIGHT with step of 5 kg
min_WHOOP_WEIGHT = df["WHOOP_WEIGHT"].min()
max_WHOOP_WEIGHT = df["WHOOP_WEIGHT"].max()
bins = np.arange(min_WHOOP_WEIGHT, max_WHOOP_WEIGHT + bin_width, bin_width)
# Guarantee that the last edge lies strictly above the maximum. When
# (max - min) is a whole multiple of the step, np.arange ends exactly on the
# maximum: plt.hist counts that value (last bin closed) but pd.cut(right=False)
# drops it, so the printed counts would miss the largest value(s).
if bins[-1] <= max_WHOOP_WEIGHT:
    bins = np.append(bins, bins[-1] + bin_width)

plt.hist(df["WHOOP_WEIGHT"], bins=bins, edgecolor="black")
plt.xlabel("WHOOP_WEIGHT [kg]")
plt.ylabel("Frequency")
#plt.title("WHOOP_WEIGHT Distribution (5 kg intervals)")
plt.savefig("descriptive_statistics_plots/WHOOP_WEIGHT_distribution_5kg_intervals.png", 
           dpi=300, bbox_inches='tight', pad_inches=0.1, facecolor='white')
plt.show()

# Tabulate the same bins as left-closed intervals
WHOOP_WEIGHT_bins = pd.cut(df["WHOOP_WEIGHT"], bins=bins, right=False)
WHOOP_WEIGHT_counts = WHOOP_WEIGHT_bins.value_counts().sort_index()
print(WHOOP_WEIGHT_counts)

In [None]:
# Histogram of WHOOP_BMI in steps of `bin_width`, plus the per-bin counts.
bin_width = 1

# create bins for WHOOP_BMI with step of 1
min_bmi = df["WHOOP_BMI"].min()
max_bmi = df["WHOOP_BMI"].max()
bins = np.arange(min_bmi, max_bmi + bin_width, bin_width)
# Guarantee that the last edge lies strictly above the maximum. Otherwise the
# maximum sits on the last edge, where plt.hist counts it (last bin closed) but
# pd.cut(right=False) drops it; it also ensures at least two edges when all
# values are identical.
if bins[-1] <= max_bmi:
    bins = np.append(bins, bins[-1] + bin_width)

plt.hist(df["WHOOP_BMI"], bins=bins, edgecolor="black")
plt.xlabel("BMI")
plt.ylabel("Frequency")
#plt.title("WHOOP BMI Distribution (1-unit intervals)")
plt.savefig("descriptive_statistics_plots/whoop_bmi_distribution_1unit_intervals.png", 
           dpi=300, bbox_inches='tight', pad_inches=0.1, facecolor='white')
plt.show()

# Tabulate the same bins as left-closed intervals
bmi_bins = pd.cut(df["WHOOP_BMI"], bins=bins, right=False)
bmi_counts = bmi_bins.value_counts().sort_index()
print(bmi_counts)