<a href="https://colab.research.google.com/github/anw-g01/strava-data-analysis/blob/main/eda_visualisations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports + Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib   # for cmap
import seaborn as sns
# matplotlib defaults: small monospace text for every figure in the notebook
plt.rcParams["font.size"] = 8
plt.rcParams["font.family"] = "monospace"

# pandas display: show ALL columns when a DataFrame is rendered
pd.set_option("display.max_columns", None)
# pd.reset_option("display.max_columns")      # reset to default

In [None]:
# read the activities dataset from the Parquet file (path is relative to the
# notebook's working directory) into a DataFrame:
df = pd.read_parquet("all_activities.parquet")

# print the DataFrame summary produced by DataFrame.info():
df.info()

In [None]:
# preview five randomly sampled rows (no seed is set, so rows differ per run):
df.sample(n=5)

# Exploratory Data Analysis (EDA) + Visualisations

## Describe Matrix

In [None]:
def _format_duration(seconds: float) -> str:
    """Render a number of seconds as ``HH:MM:SS`` ("n/a" for missing values)."""
    if pd.isna(seconds):
        return "n/a"
    hours, remainder = divmod(int(seconds), 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}"


def _format_pace(seconds_per_unit: float, unit: str) -> str:
    """Render seconds-per-distance-unit as ``MM:SS min/<unit>`` ("n/a" if missing).

    Minutes are not wrapped at 60, so very slow paces stay readable.
    """
    if pd.isna(seconds_per_unit):
        return "n/a"
    minutes, secs = divmod(int(seconds_per_unit), 60)
    return f"{minutes:02d}:{secs:02d} min/{unit}"


def _with_unit(values: pd.Series, spec: str, unit: str) -> pd.Series:
    """Format every value with format-spec ``spec`` followed by ``unit``."""
    return values.map(lambda v: f"{v:{spec}} {unit}")


def describe_runs(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise all ``"Run"`` activities as a table of formatted statistics.

    Rows whose ``type`` column equals ``"Run"`` are selected, descriptive
    statistics are computed with ``DataFrame.describe()`` (rounded to two
    decimals) and each metric is rendered as a human-readable string.

    Pace is computed per activity (seconds per km / mile, from the speed
    columns) *before* aggregating, so the mean / min / max / std rows are real
    statistics of pace rather than reciprocals of the speed statistics.
    Activities with a non-positive speed have no defined pace and are ignored
    for the pace rows. Statistics that are undefined (e.g. ``std`` with a
    single run) are shown as ``"n/a"`` for time and pace metrics.

    Args:
        df: Activities table. Must contain ``type`` and the numeric columns
            listed in ``source_columns`` below.

    Returns:
        One row per metric with the columns
        ``["metric", "mean", "min", "max", "std"]``.
    """
    runs = df[df["type"] == "Run"]

    # numeric inputs needed for the summary; restricting to these keeps
    # unrelated columns (ids, timestamps, ...) out of describe()
    source_columns = [
        "distance_km", "distance_miles",
        "moving_time_s", "elapsed_time_s",
        "avg_speed_km_h", "max_speed_km_h", "avg_speed_mph", "max_speed_mph",
        "avg_cadence_spm",
        "avg_hr", "max_hr",
        "total_elev_gain", "lowest_elev", "highest_elev",
        "num_comments", "num_achievements", "num_kudos",
    ]
    data = runs[source_columns].copy()

    # running pace per activity, in seconds per distance unit:
    pace_specs = [
        ("avg_speed_km_h", "avg_pace_km", "km"),
        ("max_speed_km_h", "max_pace_km", "km"),
        ("avg_speed_mph", "avg_pace_mile", "mile"),
        ("max_speed_mph", "max_pace_mile", "mile"),
    ]
    for speed_col, pace_col, _ in pace_specs:
        speed = data[speed_col]
        # speed is per hour -> 3600 s / speed; zero speed would divide by zero
        data[pace_col] = 3600 / speed.where(speed > 0)

    stats = data.describe().round(2)

    # time values:
    for seconds_col, clock_col in [
        ("moving_time_s", "moving_time"),
        ("elapsed_time_s", "elapsed_time"),
    ]:
        stats[clock_col] = stats[seconds_col].map(_format_duration)

    # paces - MM:SS with units:
    for _, pace_col, unit in pace_specs:
        stats[pace_col] = stats[pace_col].map(
            lambda s, u=unit: _format_pace(s, u)
        )

    # heart rates and cadence - add units:
    for hr_col in ["avg_hr", "max_hr"]:
        stats[hr_col] = _with_unit(stats[hr_col], ".0f", "bpm")
    stats["avg_cadence_spm"] = _with_unit(stats["avg_cadence_spm"], ".0f", "spm")

    # speeds - add km and miles units:
    for speed_col, unit in [
        ("avg_speed_mph", "mph"),
        ("max_speed_mph", "mph"),
        ("avg_speed_km_h", "km/h"),
        ("max_speed_km_h", "km/h"),
    ]:
        stats[speed_col] = _with_unit(stats[speed_col], ".2f", unit)

    # elevations - add comma and units
    for elev_col in ["total_elev_gain", "lowest_elev", "highest_elev"]:
        stats[elev_col] = _with_unit(stats[elev_col], ",.0f", "m")

    # distances - add units
    for dist_col, unit in [("distance_km", "km"), ("distance_miles", "mi")]:
        stats[dist_col] = _with_unit(stats[dist_col], ".1f", unit)

    # ordering of columns to include:
    columns = [
        "distance_km", "distance_miles",
        "moving_time", "elapsed_time",
        "avg_pace_km", "avg_pace_mile",
        "avg_cadence_spm",
        # "max_pace_km", "max_pace_mile",
        "avg_hr", "max_hr",
        "total_elev_gain", "lowest_elev", "highest_elev",
        "num_comments", "num_achievements",  "num_kudos",
        "avg_speed_mph", "avg_speed_km_h",
        # "max_speed_mph", "max_speed_km_h",
        # columns not included from API:
        # "num_photos",
        # "relative_effort",
        # "max_grade",
        # "calories",
        # "wind_speed",
        # "avg_temp",
        # "humidity"
    ]

    return (
        stats[columns]
        .transpose()
        .reset_index(names="metric")
        # keep only these statistics (drops count and percentiles), in order:
        .loc[:, ["metric", "mean", "min", "max", "std"]]
    )

# build the formatted summary table for all "Run" activities:
desc = describe_runs(df)

# bare expression as the last line of the cell so the notebook renders it:
desc

## Activities Overview (Pie Chart)

In [None]:
# look up the "Set3" colormap; as the cell's only expression it is displayed:
matplotlib.colormaps.get_cmap('Set3')

In [None]:
# get data: number of activities per type, most frequent first.
# The index and value columns are named explicitly so the frame has "type" and
# "count" columns regardless of how the installed pandas version names the
# output of value_counts().reset_index().
temp = (
    df["type"]
    .value_counts()
    .rename_axis("type")
    .reset_index(name="count")
)
types = temp["type"].tolist()
counts = temp["count"].tolist()

# create figure:
plt.figure(figsize=(10, 6))
plt.title("Proportion of Activity Types", fontsize=10)

# use the modern colormap interface
cmap = matplotlib.colormaps.get_cmap('Set3')
# integer inputs index the colormap's colour table directly; wrap the index
# around the palette size (cmap.N) so that extra activity types cycle through
# the palette instead of all receiving the same out-of-range colour
colours = cmap(np.arange(len(types)) % cmap.N)

plt.pie(
    counts,
    labels=types,
    autopct="%.1f%%",       # percentage label with one decimal on each wedge
    counterclock=False,
    wedgeprops={'edgecolor': 'white'},
    colors=colours
)
plt.tight_layout()
plt.show()