In [1]:
import os
from datetime import datetime
from getpass import getpass
from typing import Optional
from urllib.parse import quote_plus

import plotly.express as px
import polars as pl
import streamlit
from sqlalchemy import create_engine
from sqlalchemy import text

In [2]:
# Connect to DB
# The password is read from the STRAVA_DB_PASSWORD environment variable (falling
# back to an interactive prompt) so that it is not stored in the notebook.
db_password = os.environ.get('STRAVA_DB_PASSWORD') or getpass('Password for strava_db_user: ')
# quote_plus keeps characters such as '@' or '/' in the password from breaking the URL.
connection_string = (
    'mysql+mysqlconnector://strava_db_user:'
    f'{quote_plus(db_password)}@localhost:3306/strava_db'
)
engine = create_engine(connection_string)


In [3]:
# Load the whole activities table into a polars frame; the connection is
# closed again as soon as the read has finished.
with engine.connect() as connection:
    activities = pl.read_database(
        "SELECT * FROM activities",
        connection=connection,
        # sport_type is read as a Categorical column.
        schema_overrides={'sport_type': pl.Categorical},
        infer_schema_length=10_000,
    )

In [4]:
# Quick schema check: prints one line per column with its dtype and leading values.
activities.glimpse()

Rows: 450
Columns: 20
$ total_elevation_gain             <f64> 0.0, 0.0, 10.6, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
$ sport_type                       <cat> Run, Walk, Walk, Run, Walk, Walk, Run, Run, Walk, Walk
$ start_date_local        <datetime[μs]> 2014-11-05 06:45:20, 2014-11-08 20:42:58, 2014-11-09 20:11:46, 2014-11-09 20:29:15, 2014-11-09 20:52:26, 2014-11-14 20:53:40, 2014-11-14 21:08:21, 2014-11-14 21:19:07, 2014-11-14 21:35:55, 2014-11-15 21:14:01
$ gear_id                          <str> None, None, None, None, None, None, None, None, None, None
$ pr_count                         <i64> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ activity_id                      <i64> 215643518, 216862601, 217313186, 217335592, 217335641, 219142311, 219142342, 219142389, 219142398, 219496241
$ activity_name                    <str> 'Morning Run', 'Evening Walk', 'Evening Walk', 'Evening Run', 'Evening walk ', 'Evening Walk', 'Night Run', 'Night Run', 'Night Walk', 'Night Walk'
$ distance_km                   

In [5]:
# Fun metrics
def _time_since_last(df, sport_type, reference_time):
    """Return the time elapsed between the latest `sport_type` activity and `reference_time`.

    Returns None when `df` holds no activity of that sport type, instead of
    failing on `datetime - None`.
    """
    latest_start = (
        df.filter(pl.col('sport_type') == sport_type)
        .select('start_date_local')
        .max()
        .item()
    )
    return None if latest_start is None else reference_time - latest_start


# One timestamp for every metric, so all deltas are measured against the same instant.
metrics_reference_time = datetime.now()

# NOTE: despite its name, first_use_date is a timedelta (time since the first
# recorded activity); the name is kept so existing references keep working.
first_use_date = metrics_reference_time - activities.select('start_date_local').min().item()
first_use_delta = first_use_date.days

last_run_delta = _time_since_last(activities, 'Run', metrics_reference_time)
last_walk_delta = _time_since_last(activities, 'Walk', metrics_reference_time)
last_hike_delta = _time_since_last(activities, 'Hike', metrics_reference_time)
last_ride_delta = _time_since_last(activities, 'Ride', metrics_reference_time)

In [6]:
# Number of activities per sport type, most frequent first; sport types that
# occur only once are left out.
activities_by_sport_type = (
    activities
    .group_by('sport_type')
    .len()
    .sort('len', descending=True)
    .filter(pl.col('len') > 1)
)

# Bar chart of the counts, one colour per sport type.
px.bar(
    data_frame=activities_by_sport_type,
    x='sport_type',
    y='len',
    color='sport_type',
    title='Activities done across Strava',
    labels={'sport_type': '', 'len': 'Count of activities'},
    template="simple_white",
    color_discrete_sequence=px.colors.sequential.Aggrnyl,
).update_layout(hovermode='x').update_traces(hovertemplate=None)

In [7]:
# Display the activities frame as this cell's output.
activities

total_elevation_gain,sport_type,start_date_local,gear_id,pr_count,activity_id,activity_name,distance_km,moving_time_hr,average_speed_km_per_hr,max_speed_km_per_hr,start_latitude,start_longitude,start_latlng,end_latlng,activity_hour,activity_weekday,activity_year,activity_month,row_num
f64,cat,datetime[μs],str,i64,i64,str,f64,f64,f64,f64,f64,f64,null,null,i64,i64,i64,i64,i64
0.0,"""Run""",2014-11-05 06:45:20,,0,215643518,"""Morning Run""",2.53,0.29,8.77,18.36,12.929582,77.678667,,,,,,,1
0.0,"""Walk""",2014-11-08 20:42:58,,0,216862601,"""Evening Walk""",3.64,0.63,5.8,11.52,12.92891,77.678536,,,,,,,1
10.6,"""Walk""",2014-11-09 20:11:46,,0,217313186,"""Evening Walk""",1.62,0.28,5.87,23.76,12.929202,77.678646,,,,,,,1
0.0,"""Run""",2014-11-09 20:29:15,,0,217335592,"""Evening Run""",2.62,0.29,8.94,16.56,12.92628,77.671726,,,,,,,1
0.0,"""Walk""",2014-11-09 20:52:26,,0,217335641,"""Evening walk """,1.49,0.31,4.85,12.24,12.926293,77.671747,,,,,,,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
11.2,"""Walk""",2025-07-17 21:11:26,,0,15150480680,"""Unwind walk 🍃""",3.12,0.65,4.77,9.5,,,,,21,4,2025,7,0
8.0,"""Run""",2025-07-17 20:20:40,"""g17360762""",0,15150243997,"""Nike Run Club: Running For Mor…",4.61,0.58,7.9,22.25,,,,,20,4,2025,7,0
2.2,"""Walk""",2025-09-12 21:58:50,,0,15792075655,"""Night Walk""",1.45,0.34,4.26,8.64,,,,,21,5,2025,9,0
17.0,"""Run""",2025-09-13 20:51:25,"""g17360762""",0,15803236441,"""Nike Run Club: Another Thank Y…",5.53,0.75,7.38,18.14,,,,,20,6,2025,9,0


In [8]:
# Derive the calendar parts of each activity from its local start timestamp.
# Columns with these names that already exist in the frame are replaced.
activities = activities.with_columns(
    activity_year=pl.col('start_date_local').dt.year(),
    activity_month=pl.col('start_date_local').dt.month(),
    activity_weekday=pl.col('start_date_local').dt.weekday(),
    activity_hour=pl.col('start_date_local').dt.hour(),
)

In [9]:
def remove_outliers_z_score(
    df: pl.DataFrame,
    column: str,
    grouping: Optional[str] = None,
    z_threshold: float = 3
) -> pl.DataFrame:
    """Drop rows whose value in `column` is a z-score outlier.

    Args:
        df: Frame to filter.
        column: Numeric column the z-score is computed on.
        grouping: Optional column name. When given, the mean and standard
            deviation are computed per group (window over `grouping`);
            otherwise they are computed over the whole frame.
        z_threshold: A row is kept only while
            ``|value - mean| / std < z_threshold``.

    Returns:
        A new frame without the outlier rows. Rows whose `column` is null are
        removed as well. When the standard deviation is zero (all values equal)
        or undefined (fewer than two values), no value can be an outlier, so
        every non-null row of that frame/group is kept.
    """
    value = pl.col(column)

    if grouping is None:
        # global mean & std
        mean, std = value.mean(), value.std()
    else:
        # mean & std per group using window functions
        mean, std = value.mean().over(grouping), value.std().over(grouping)

    # A zero or null std would turn the z-score into NaN/null and silently
    # drop every row, so that case is handled separately.
    no_spread = std.fill_null(0.0) == 0
    within_threshold = ((value - mean).abs() / std) < z_threshold

    return df.filter(
        pl.when(no_spread).then(value.is_not_null()).otherwise(within_threshold)
    )


In [10]:
# Mean moving time per sport type and year for runs, walks and rides, newest
# year first. Outliers are then removed using a z-score computed over all
# aggregated rows together.
moving_time_over_years = remove_outliers_z_score(
    (
        activities
        .filter(pl.col('sport_type').is_in(['Run', 'Walk', 'Ride']))
        .group_by('sport_type', 'activity_year')
        .agg(pl.col('moving_time_hr').mean())
        .sort('activity_year', descending=True)
    ),
    column='moving_time_hr',
)

In [11]:
# Yearly average moving time, one panel per sport type. Title and axis labels
# are set so the figure can be read on its own.
px.line(
    data_frame=moving_time_over_years,
    x='activity_year',
    y='moving_time_hr',
    facet_col='sport_type',
    title='Average moving time per activity over the years',
    labels={
        'activity_year': 'Year',
        'moving_time_hr': 'Average moving time (hr)',
        'sport_type': 'Sport',
    },
)

In [12]:
# Mean moving time per sport type, year and month for runs, walks and rides,
# ordered by month. Outliers are then removed using a z-score computed over
# all aggregated rows together.
moving_time_by_months = remove_outliers_z_score(
    (
        activities
        .filter(pl.col('sport_type').is_in(['Run', 'Walk', 'Ride']))
        .group_by('sport_type', 'activity_year', 'activity_month')
        .agg(pl.col('moving_time_hr').mean())
        .sort('activity_month')
    ),
    column='moving_time_hr',
)

In [13]:
# Inspect the monthly averages that remain after outlier removal.
moving_time_by_months

sport_type,activity_year,activity_month,moving_time_hr
cat,i32,i8,f64
"""Ride""",2018,1,0.37
"""Walk""",2022,1,1.05
"""Walk""",2021,1,0.535
"""Run""",2023,1,0.28
"""Ride""",2024,1,0.67
…,…,…,…
"""Run""",2014,12,0.123333
"""Run""",2021,12,0.15
"""Run""",2022,12,0.33
"""Ride""",2023,12,0.9


In [14]:
# Monthly average running time, one panel per year from 2022 onwards.
px.line(
    data_frame=moving_time_by_months.filter(
        (pl.col('activity_year') >= 2022) & (pl.col('sport_type') == 'Run')),
    x='activity_month',
    y='moving_time_hr',
    facet_col='activity_year',
    # Leading space removed from the title; axis labels added for readability.
    title='Running time over years',
    labels={
        'activity_month': 'Month',
        'moving_time_hr': 'Average moving time (hr)',
        'activity_year': 'Year',
    },
)

In [15]:
# Weekly Snapshot

# Metrics to show:
# Activities this week
# Distance moved this week
# Time active this week
#
# The result is a single row for the most recent week that has any activity,
# with the totals of the active week before it in the previous_week_* columns
# (null when there is no earlier week).

weekly_snapshot = (
    activities
    # Rows without a start timestamp cannot be assigned to a week.
    .filter(pl.col('start_date_local').is_not_null())
    .with_columns(
        # dt.week() is the ISO week number, which belongs to the ISO year:
        # 2024-12-30, for example, is in week 1 of ISO year 2025. Grouping by the
        # calendar year would merge such days with the first week of January
        # of their own calendar year, so the ISO year is used as the year key.
        pl.col('start_date_local').dt.iso_year().alias('activity_year'),
        pl.col('start_date_local').dt.week().alias('week'),
    )
    .group_by('activity_year', 'week')
    .agg(
        pl.len().alias('activities'),
        pl.col('distance_km').sum().alias('distance'),
        pl.col('moving_time_hr').sum().alias('time'),
    )
    # Sort explicitly: the shift below needs newest-first order, and top_k
    # does not guarantee any particular row order in its output.
    .sort('activity_year', 'week', descending=True)
    .head(2)
    .with_columns(  # Add previous week metrics
        pl.col('activities').shift(-1).alias('previous_week_activities'),
        pl.col('distance').shift(-1).alias('previous_week_distance'),
        pl.col('time').shift(-1).alias('previous_week_time'),
    )
    .head(1)  # Keep only this week row
)

In [16]:
# Today's date as a compact YYYYMMDD string.
now = datetime.today()
date_str = now.strftime("%Y%m%d")
print(date_str)

20250918


In [17]:
# Show the activities newest-first; the sorted frame is only displayed, not assigned.
activities.sort(by='start_date_local', descending=True)

total_elevation_gain,sport_type,start_date_local,gear_id,pr_count,activity_id,activity_name,distance_km,moving_time_hr,average_speed_km_per_hr,max_speed_km_per_hr,start_latitude,start_longitude,start_latlng,end_latlng,activity_hour,activity_weekday,activity_year,activity_month,row_num
f64,cat,datetime[μs],str,i64,i64,str,f64,f64,f64,f64,f64,f64,null,null,i8,i8,i32,i8,i64
12.0,"""Run""",2025-09-15 20:24:27,"""g17360762""",0,15825696708,"""Nike Run Club: Running Towards…",4.92,0.67,7.37,27.86,,,,,20,1,2025,9,0
17.0,"""Run""",2025-09-13 20:51:25,"""g17360762""",0,15803236441,"""Nike Run Club: Another Thank Y…",5.53,0.75,7.38,18.14,,,,,20,6,2025,9,0
2.2,"""Walk""",2025-09-12 21:58:50,,0,15792075655,"""Night Walk""",1.45,0.34,4.26,8.64,,,,,21,5,2025,9,0
74.6,"""Ride""",2025-09-10 18:48:47,"""b9747906""",0,15770008703,"""Evening Ride""",12.0,0.84,14.36,39.1,,,,,18,3,2025,9,0
9.0,"""Run""",2025-09-09 19:41:02,"""g17360762""",0,15758472677,"""Nike Run Club: Thirty Minute R…",3.38,0.43,7.79,22.61,,,,,19,2,2025,9,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.0,"""Walk""",2014-11-09 20:52:26,,0,217335641,"""Evening walk """,1.49,0.31,4.85,12.24,12.926293,77.671747,,,20,7,2014,11,1
0.0,"""Run""",2014-11-09 20:29:15,,0,217335592,"""Evening Run""",2.62,0.29,8.94,16.56,12.92628,77.671726,,,20,7,2014,11,1
10.6,"""Walk""",2014-11-09 20:11:46,,0,217313186,"""Evening Walk""",1.62,0.28,5.87,23.76,12.929202,77.678646,,,20,7,2014,11,1
0.0,"""Walk""",2014-11-08 20:42:58,,0,216862601,"""Evening Walk""",3.64,0.63,5.8,11.52,12.92891,77.678536,,,20,6,2014,11,1
