### Importing Libraries

In [1]:
import json
import pandas as pd
import numpy as np
import warnings
from datetime import datetime

# Suppress all warnings
warnings.filterwarnings("ignore")

### Load in the data

In [2]:
# # Open and read the JSON file (OLD Version, only public activities)
# with open("activities_2022-07-01_to_2024-11-17.json", "r") as file1:
#     data1 = json.load(file1)
# with open("activities_2020-07-18_to_2022-06-30.json", "r") as file2:
#     data2 = json.load(file2)
# with open("activities_2019-07-02_to_2020-07-16.json", "r") as file3:
#     data3 = json.load(file3)

In [8]:
import os
# Sanity check before loading: first line says whether the newest export file
# is reachable through its relative path, second line shows the working
# directory that relative paths resolve against.
print(
    os.path.exists("activities_data/all_activities_03-18-2025_05-25-2025.json"),
    os.getcwd(),
    sep="\n",
)

True
c:\Users\ajnet\Documents\StravaApp


In [9]:

def _load_activities(path):
    """Parse one exported activities JSON file and return the decoded object.

    The file is opened as UTF-8 explicitly so the result does not depend on
    the platform's default text encoding.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as activities_file:
        return json.load(activities_file)


# One variable per export file; the names are kept because later cells refer to them.
data1 = _load_activities("activities_data/all_activities_08-03-2020_12-25-2021.json")
data2 = _load_activities("activities_data/all_activities_12-25-2021_11-23-2023.json")
data3 = _load_activities("activities_data/all_activities_11-22-2023_3-18-2025.json")
data4 = _load_activities("activities_data/all-activities_07-02-2019_08-03-2020.json")
data5 = _load_activities("activities_data/all_activities_03-18-2025_05-25-2025.json")

### Convert the data from JSON to dataframe

In [11]:
# Extract relevant fields
# Keys copied as-is from every activity record (a missing one raises KeyError).
required_fields = [
    "id",
    "name",
    "distance",
    "moving_time",
    "elapsed_time",
    "total_elevation_gain",
    "type",
    "start_date",
    "start_date_local",
    "timezone",
    "kudos_count",
    "average_speed",
    "max_speed",
    "sport_type",
]
# Keys that some records lack; those records get 0 instead.
optional_fields = ["elev_high", "elev_low"]

datasets = [data1, data2, data3, data4, data5]
extracted_data = []
seen_activity_ids = set()
for dataset in datasets:
    for activity in dataset:
        # The export files' date ranges share boundary days (see the file names),
        # so one activity can be present in two files. Keep only the first copy,
        # otherwise it would be scored and ranked twice.
        if activity["id"] in seen_activity_ids:
            continue
        seen_activity_ids.add(activity["id"])

        record = {field: activity[field] for field in required_fields}
        record.update({field: activity.get(field, 0) for field in optional_fields})
        extracted_data.append(record)


# Create DataFrame
df = pd.DataFrame(extracted_data)
filtered_activities = df[["name", "start_date", "type", "sport_type","distance", "total_elevation_gain", "elev_high", "elev_low"]]

### Hardcoded adjustments for activities with mixed types

In [12]:
# Divisor applied to an activity's distance (in miles) when scoring, keyed by
# sport_type. Sport types that are not listed are treated as 1 at lookup time.
activities_distance_divisors = dict([
    ("Ride", 4),
    ("MountainBikeRide", 2.5),
    ("GravelRide", 3),
    ("Run", 1),
    ("TrailRun", 1),
    ("NordicSki", 1),
    ("Hike", 4 / 3),
])

# Divisor applied to an activity's elevation score, keyed by sport_type.
# Everything is left unscaled (1) except hikes.
activities_elevation_divisors = dict.fromkeys(
    ["Ride", "MountainBikeRide", "GravelRide", "Run", "TrailRun", "NordicSki"],
    1,
)
activities_elevation_divisors["Hike"] = 4 / 3

# Hand-maintained value per activity name, consumed by update_ptra().
# NOTE(review): presumably the share of the outing that was actually run -
# confirm. update_ptra() treats exactly 0.25 as "logged as a Hike".
partial_trail_run_adjustment = dict([
    ("Mt. Teneriffe + Mt. Si", 0.25),
    ("Spray Park Loop", 0.5),
    ("Oyster Dome Loop", 0.75),
    ("Cutthroat Pass", 0.25),
    ("Trappers Peak / Thornton Lakes", 0.25),
    ("Trap Pass", 0.25),
    ("Mailbox Peak", 0.5),
    ("Goat Lake", 0.75),
    ("Green Mountain", 0.75),
])

def update_ptra(partial_trail_run_adjustment):
    """Turn the per-activity adjustment values into score multipliers.

    Args:
        partial_trail_run_adjustment: mapping of activity name -> adjustment value.

    Returns:
        A new dict with the same keys in the same order. Each value ``v`` becomes
        ``3/4 + v/4``; when ``v`` is exactly 0.25 that result is additionally
        multiplied by ``4/3``. The input mapping is not modified.
    """
    def _multiplier(value):
        blended = 3/4 + value/4
        if value == 0.25:
            # 0.25 marks an activity designated as a hike, which has already been
            # scaled, so the scaling is undone here.
            return (4/3)*blended
        # Any other value marks an activity categorized as a TrailRun, which
        # only needs to be scaled down.
        return blended

    return {
        activity_name: _multiplier(value)
        for activity_name, value in partial_trail_run_adjustment.items()
    }

# Multipliers derived from partial_trail_run_adjustment via update_ptra();
# looked up by activity name when the difficulty score is computed.
updated_partial_trail_run_adjustment = update_ptra(partial_trail_run_adjustment)

# Distance-score multiplier for specific rides, looked up by activity name.
# All listed rides share one factor. The 2.5 and 3 match the MountainBikeRide
# and GravelRide distance divisors.
# NOTE(review): presumably these rides are logged as MountainBikeRide but were
# about 75% gravel - confirm.
partial_gravel_ride_adjustment = dict.fromkeys(
    ["BLOM / Island Lake", "Island Lake", "Waterloo + DTE"],
    2.5/(0.25*2.5 + 0.75*3),
)

def convert_timestamp(timestamp):
    """Reformat a ``YYYY-MM-DDTHH:MM:SSZ`` timestamp string as ``MM/DD/YY``.

    Raises:
        ValueError: (from ``datetime.strptime``) if ``timestamp`` does not match
            that layout.
    """
    parsed = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ")
    return parsed.strftime("%m/%d/%y")


# Performance-capacity factor per elevation band. Keys are elevations in feet in
# steps of 1000 (0 .. 12000); the scoring cell rounds an activity's average
# elevation to the nearest 1000 ft and divides the score by the matching factor.
elevation_to_capacity = dict(zip(
    range(0, 13000, 1000),
    [1.0, 0.992, 0.983, 0.972, 0.959, 0.944, 0.927, 0.907, 0.886, 0.863, 0.837, 0.809, 0.78],
))
     

### Compute the difficulty score

In [13]:
# Compute the difficulty score by calculating the distance and elevation
# contributions with the sport type adjustments.
METERS_TO_MILES = 0.000621371
METERS_TO_FEET = 3.28084

# Start by removing ski.
# The working frame is rebuilt from `df` with an explicit copy on every run:
#   * this cell overwrites elev_high with its value in feet at the end, so
#     starting from the raw frame keeps a re-run from converting it twice;
#   * the copy makes the column assignments below target an independent frame
#     instead of a filtered slice of `df`.
base_columns = ["name", "start_date", "type", "sport_type", "distance", "total_elevation_gain", "elev_high", "elev_low"]
filtered_activities = df.loc[df["sport_type"] != "AlpineSki", base_columns].copy()

sport_types = filtered_activities["sport_type"]
activity_names = filtered_activities["name"]

# Distance contribution: miles / per-sport divisor, then the per-activity gravel factor.
distance_divisor = sport_types.map(lambda sport: activities_distance_divisors.get(sport, 1))
gravel_factor = activity_names.map(lambda name: partial_gravel_ride_adjustment.get(name, 1))
filtered_activities["distance_score"] = (
    (filtered_activities["distance"] * METERS_TO_MILES) / distance_divisor * gravel_factor
).round(2)

# Elevation contribution: 3 points per 1000 ft of gain / per-sport divisor.
elevation_divisor = sport_types.map(lambda sport: activities_elevation_divisors.get(sport, 1))
filtered_activities["elevation_score"] = (
    filtered_activities["total_elevation_gain"] * 3 * METERS_TO_FEET * 0.001
) / elevation_divisor

filtered_activities["difficulty_score"] = filtered_activities["distance_score"] + filtered_activities["elevation_score"]

# Per-activity multiplier for outings that mixed hiking and trail running.
trail_run_factor = activity_names.map(lambda name: updated_partial_trail_run_adjustment.get(name, 1))
filtered_activities["difficulty_score_without_elevation"] = (
    filtered_activities["difficulty_score"] * trail_run_factor
).round(2)

# Add adjustments for the average elevation of the activity
filtered_activities["average_elevation"] = (
    ((filtered_activities["elev_high"] + filtered_activities["elev_low"]) / 2) * METERS_TO_FEET
).round(2)
# elevation_to_capacity only has entries for its own range of 1000 ft bands.
# Clamp the rounded elevation to that range so activities outside it use the
# nearest table entry instead of getting NaN (which would also turn their
# difficulty_score into NaN).
elevation_band = filtered_activities["average_elevation"].round(-3).clip(
    lower=min(elevation_to_capacity), upper=max(elevation_to_capacity)
)
filtered_activities["performance_capacity"] = elevation_band.map(elevation_to_capacity)
filtered_activities["difficulty_score"] = (
    filtered_activities["difficulty_score_without_elevation"] / filtered_activities["performance_capacity"]
).round(2)

# Convert raw distance, elevation, and date to readable formats
filtered_activities["Distance (miles)"] = (filtered_activities["distance"] * METERS_TO_MILES).round(2)
filtered_activities["Total Elevation Gain (ft)"] = (
    filtered_activities["total_elevation_gain"] * METERS_TO_FEET
).round(0).astype(int)
filtered_activities["Date"] = filtered_activities["start_date"].apply(convert_timestamp)

# From here on elev_high is expressed in feet (elev_low is left untouched).
filtered_activities["elev_high"] = filtered_activities["elev_high"] * METERS_TO_FEET
# Extract only cleaned columns
cleaned_activities = filtered_activities[["name", "Date", "sport_type", "Distance (miles)", "Total Elevation Gain (ft)", "elev_high","average_elevation", "performance_capacity", "difficulty_score_without_elevation", "difficulty_score"]]

### Filter and analyze output data

In [14]:
#filtered_activities[filtered_activities["sport_type"].isin(["TrailRun", "Hike"])].sort_values(by="difficulty_score", ascending=False).head(40)
def _rank_by_difficulty(activities, sport_types=None):
    """Return ``activities`` ordered from highest to lowest difficulty_score.

    Args:
        activities: DataFrame with ``sport_type`` and ``difficulty_score`` columns.
        sport_types: optional list of sport_type values to keep; ``None`` keeps
            every row.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    if sport_types is not None:
        activities = activities[activities["sport_type"].isin(sport_types)]
    return activities.sort_values(by="difficulty_score", ascending=False).reset_index(drop=True)


# TrailRun intentionally appears in both the hiking and the running tables.
hardest_hiking_activities = _rank_by_difficulty(cleaned_activities, ["TrailRun", "Hike"])
hardest_biking_activities = _rank_by_difficulty(cleaned_activities, ["MountainBikeRide", "GravelRide", "Ride"])
hardest_running_activities = _rank_by_difficulty(cleaned_activities, ["TrailRun", "Run"])
hardest_offroad_riding_activities = _rank_by_difficulty(cleaned_activities, ["MountainBikeRide", "GravelRide"])
hardest_overall_activities = _rank_by_difficulty(cleaned_activities)