# Objective: Predicting Calorie Expenditure During Cardio Training

In [297]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from src.features import time_to_seconds, calc_avg_speed

### Import raw dataset  
*Dataset imported from Garmin Connect application*

In [298]:
df_activities = pd.read_csv("data_files/activities_bronze.csv")

* Display data

In [299]:
df_activities

Unnamed: 0,Activity Type,Date,Favorite,Title,Distance,Calories,Time,Avg HR,Max HR,Aerobic TE,Avg Bike Cadence,Max Bike Cadence,Avg Speed,Max Speed,Total Ascent,Total Descent,Avg Stride Length,Avg Vertical Ratio,Avg Vertical Oscillation,Avg Ground Contact Time,Avg GAP,Normalized Power® (NP®),Training Stress Score®,Avg Power,Max Power,Steps,Total Reps,Total Sets,Body Battery Drain,Decompression,Best Lap Time,Number of Laps,Avg Resp,Min Resp,Max Resp,Stress Change,Stress Start,Stress End,Avg Stress,Max Stress,Moving Time,Elapsed Time,Min Elevation,Max Elevation
0,Cycling,2025-11-06 14:37:17,False,Poznań Kolarstwo,5.25,169,00:22:48,128,164,2.0,--,--,13.8,26.8,24,32,--,--,--,--,--,--,0.0,--,--,--,--,--,-7,No,00:01:32.0,6,--,--,--,--,--,--,--,--,00:21:04,01:22:36,60,78
1,Strength Training,2025-11-06 06:34:30,False,Siła,0.00,113,00:30:31,96,117,0.3,--,--,--,--,--,--,--,--,--,--,--,--,0.0,--,--,--,--,1,-4,No,00:30:31,1,--,--,--,--,--,--,--,--,00:30:31,00:31:22,--,--
2,Cycling,2025-11-05 16:06:31,False,Poznań Kolarstwo,7.08,183,00:36:12,107,158,2.0,--,--,11.7,25.5,35,38,--,--,--,--,--,--,0.0,--,--,--,--,--,-8,No,00:01:26.0,8,--,--,--,--,--,--,--,--,00:31:03,01:18:05,57,85
3,Indoor Cycling,2025-10-28 17:58:46,False,Jazda na rowerze treningowym,0.00,454,00:55:10,141,169,3.5,--,--,--,--,--,--,--,--,--,--,--,--,0.0,--,--,--,--,--,-10,No,00:55:10,1,--,--,--,--,--,--,--,--,00:00:00,00:55:10,--,--
4,Yoga,2025-10-28 06:31:35,False,Joga,--,57,00:27:33,78,96,0.1,--,--,--,--,--,--,--,--,--,--,--,--,0.0,--,--,--,--,1,-5,No,00:27:33,1,17,10,21,-14,24,10,54,84,00:27:33,00:27:33,--,--
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433,Cycling,2023-04-22 15:54:40,False,Poznań Kolarstwo,17.08,304,01:12:07,--,--,--,--,--,14.2,27.2,111,107,--,--,--,--,--,--,0.0,--,--,--,--,--,--,No,00:00:00.7,2,--,--,--,--,--,--,--,--,01:09:02,01:20:41,54,98
434,Cycling,2023-04-22 14:35:57,False,Poznań Kolarstwo,14.42,282,01:11:06,--,--,--,--,--,12.2,25.1,147,152,--,--,--,--,--,--,0.0,--,--,--,--,--,--,No,00:00:00.9,2,--,--,--,--,--,--,--,--,01:03:57,01:18:36,44,96
435,Walking,2023-04-21 09:53:01,False,Chodzenie,3.28,182,00:44:44,--,--,--,--,--,13:39,6:24,--,--,--,--,--,--,--,--,0.0,--,--,--,--,--,--,No,00:00:00.5,2,--,--,--,--,--,--,--,--,00:33:06,00:45:56,--,--
436,Running,2023-04-21 06:38:44,False,Poznań Bieganie,3.04,177,00:21:44,--,--,--,--,--,7:09,3:19,30,32,--,--,--,--,--,--,0.0,--,--,--,--,--,--,No,00:00:00.4,2,--,--,--,--,--,--,--,--,00:21:00,00:21:44,50,71


* Set display.max_columns option, to increase readability of dataframe

In [300]:
pd.set_option('display.max_columns', None)

* Display columns and remove obsolete ones

In [301]:
df_activities.columns

Index(['Activity Type', 'Date', 'Favorite', 'Title', 'Distance', 'Calories',
       'Time', 'Avg HR', 'Max HR', 'Aerobic TE', 'Avg Bike Cadence',
       'Max Bike Cadence', 'Avg Speed', 'Max Speed', 'Total Ascent',
       'Total Descent', 'Avg Stride Length', 'Avg Vertical Ratio',
       'Avg Vertical Oscillation', 'Avg Ground Contact Time', 'Avg GAP',
       'Normalized Power® (NP®)', 'Training Stress Score®', 'Avg Power',
       'Max Power', 'Steps', 'Total Reps', 'Total Sets', 'Body Battery Drain',
       'Decompression', 'Best Lap Time', 'Number of Laps', 'Avg Resp',
       'Min Resp', 'Max Resp', 'Stress Change', 'Stress Start', 'Stress End',
       'Avg Stress', 'Max Stress', 'Moving Time', 'Elapsed Time',
       'Min Elevation', 'Max Elevation'],
      dtype='object')

In [302]:
df_activities = df_activities[['Activity Type', 'Date', 'Distance', 'Calories', 'Avg HR',  'Avg Bike Cadence','Avg Speed', 'Total Ascent',
       'Total Descent', 'Avg Stride Length', 'Steps', 'Body Battery Drain', 'Elapsed Time',
       'Min Elevation', 'Max Elevation']]

* Normalize column names - replace space with underscore, lower text and remove polish special characters

In [303]:
columns = df_activities.columns
new_columns = [column.replace(" ", "_").lower() for column in columns]

for column, new_column in zip(columns, new_columns):
    df_activities.rename(columns={column: new_column}, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_activities.rename(columns={column: new_column}, inplace=True)


* Explore data types and dataframe shape

In [304]:
df_activities.dtypes

activity_type         object
date                  object
distance              object
calories              object
avg_hr                object
avg_bike_cadence      object
avg_speed             object
total_ascent          object
total_descent         object
avg_stride_length     object
steps                 object
body_battery_drain    object
elapsed_time          object
min_elevation         object
max_elevation         object
dtype: object

In [305]:
df_activities.shape

(438, 15)

* Narrow data only to cardio activities

In [306]:
df_activities['activity_type'].unique()

array(['Cycling', 'Strength Training', 'Indoor Cycling', 'Yoga',
       'Running', 'Walking', 'Treadmill Running', 'Pilates', 'Breathwork',
       'Hiking', 'Swimming', 'Other', 'Resort Skiing'], dtype=object)

In [307]:
df_activities = df_activities[df_activities['activity_type'].isin(['Running', 'Cycling',
       'Indoor Cycling', 'Walking', 'Treadmill Running', 'Hiking'])]

* Verify dataframe dimension

In [308]:
df_activities.shape

(264, 15)

* Look for NaN values
    * no NaN because missing values represented as "--"
    * replace "--" with NaN


In [309]:
df_activities.isna().sum()

activity_type         0
date                  0
distance              0
calories              0
avg_hr                0
avg_bike_cadence      0
avg_speed             0
total_ascent          0
total_descent         0
avg_stride_length     0
steps                 0
body_battery_drain    0
elapsed_time          0
min_elevation         0
max_elevation         0
dtype: int64

In [310]:
df_activities.replace('--', np.nan, inplace=True)

* Display DF to look for other data issues

In [311]:
df_activities

Unnamed: 0,activity_type,date,distance,calories,avg_hr,avg_bike_cadence,avg_speed,total_ascent,total_descent,avg_stride_length,steps,body_battery_drain,elapsed_time,min_elevation,max_elevation
0,Cycling,2025-11-06 14:37:17,5.25,169,128,,13.8,24,32,,,-7,01:22:36,60,78
2,Cycling,2025-11-05 16:06:31,7.08,183,107,,11.7,35,38,,,-8,01:18:05,57,85
3,Indoor Cycling,2025-10-28 17:58:46,0.00,454,141,,,,,,,-10,00:55:10,,
6,Running,2025-10-24 17:45:37,7.02,437,161,169,5:48,31,33,1.01,6926,-12,00:44:06,51,68
7,Cycling,2025-10-23 14:41:35,4.00,85,101,,12.5,26,21,,,-5,01:18:21,56,74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431,Walking,2023-04-24 13:48:06,5.04,226,,,15:49,,,,,,01:22:31,,
433,Cycling,2023-04-22 15:54:40,17.08,304,,,14.2,111,107,,,,01:20:41,54,98
434,Cycling,2023-04-22 14:35:57,14.42,282,,,12.2,147,152,,,,01:18:36,44,96
435,Walking,2023-04-21 09:53:01,3.28,182,,,13:39,,,,,,00:45:56,,


* Remove commas from suma_kalorii, kroki, neccessary for type casting


In [312]:
df_activities.loc[:, ['calories', 'steps']] = df_activities[['calories', 'steps']].map(lambda x: str(x).replace(',', ''))

* Verify datatypes

In [313]:
df_activities.dtypes

activity_type         object
date                  object
distance              object
calories              object
avg_hr                object
avg_bike_cadence      object
avg_speed             object
total_ascent          object
total_descent         object
avg_stride_length     object
steps                 object
body_battery_drain    object
elapsed_time          object
min_elevation         object
max_elevation         object
dtype: object

In [314]:
df_activities.columns

Index(['activity_type', 'date', 'distance', 'calories', 'avg_hr',
       'avg_bike_cadence', 'avg_speed', 'total_ascent', 'total_descent',
       'avg_stride_length', 'steps', 'body_battery_drain', 'elapsed_time',
       'min_elevation', 'max_elevation'],
      dtype='object')

* Convert numeric columns to numeric

In [315]:
numeric_cols = ['distance', 'calories',
       'avg_hr', 'avg_bike_cadence', 'avg_speed',
       'total_ascent', 'total_descent', 'avg_stride_length',
       'steps', 'body_battery_drain', 'min_elevation',
       'max_elevation']

for col in numeric_cols:
    df_activities[col] = pd.to_numeric(df_activities[col], errors='coerce')

In [316]:
df_activities.dtypes

activity_type          object
date                   object
distance              float64
calories                int64
avg_hr                float64
avg_bike_cadence      float64
avg_speed             float64
total_ascent          float64
total_descent         float64
avg_stride_length     float64
steps                 float64
body_battery_drain    float64
elapsed_time           object
min_elevation         float64
max_elevation         float64
dtype: object

* Convert activity time column to seconds using *time_to_seconds* function from features.py

In [317]:
df_activities['elapsed_time'] = df_activities['elapsed_time'].apply(time_to_seconds)

In [318]:
df_activities.dtypes

activity_type          object
date                   object
distance              float64
calories                int64
avg_hr                float64
avg_bike_cadence      float64
avg_speed             float64
total_ascent          float64
total_descent         float64
avg_stride_length     float64
steps                 float64
body_battery_drain    float64
elapsed_time          float64
min_elevation         float64
max_elevation         float64
dtype: object

* Convert datetime column to datetime type

In [319]:
df_activities['date'] = pd.to_datetime(df_activities['date'])

In [320]:
df_activities.dtypes

activity_type                 object
date                  datetime64[ns]
distance                     float64
calories                       int64
avg_hr                       float64
avg_bike_cadence             float64
avg_speed                    float64
total_ascent                 float64
total_descent                float64
avg_stride_length            float64
steps                        float64
body_battery_drain           float64
elapsed_time                 float64
min_elevation                float64
max_elevation                float64
dtype: object

*  Unify activities type to main 3 groups

In [321]:
df_activities['activity_type'].unique() 

array(['Cycling', 'Indoor Cycling', 'Running', 'Walking',
       'Treadmill Running', 'Hiking'], dtype=object)

In [322]:
map_activities = {
    'Running': 'running',
    'Treadmill Running': 'running',
    'Cycling': 'cycling',
    'Indoor Cycling': 'cycling',
    'Walking': 'walking',
    'Hiking': 'walking'
}

df_activities['activity_type'] = df_activities['activity_type'].replace(map_activities)

In [323]:
df_activities

Unnamed: 0,activity_type,date,distance,calories,avg_hr,avg_bike_cadence,avg_speed,total_ascent,total_descent,avg_stride_length,steps,body_battery_drain,elapsed_time,min_elevation,max_elevation
0,cycling,2025-11-06 14:37:17,5.25,169,128.0,,13.8,24.0,32.0,,,-7.0,4956.0,60.0,78.0
2,cycling,2025-11-05 16:06:31,7.08,183,107.0,,11.7,35.0,38.0,,,-8.0,4685.0,57.0,85.0
3,cycling,2025-10-28 17:58:46,0.00,454,141.0,,,,,,,-10.0,3310.0,,
6,running,2025-10-24 17:45:37,7.02,437,161.0,169.0,,31.0,33.0,1.01,6926.0,-12.0,2646.0,51.0,68.0
7,cycling,2025-10-23 14:41:35,4.00,85,101.0,,12.5,26.0,21.0,,,-5.0,4701.0,56.0,74.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431,walking,2023-04-24 13:48:06,5.04,226,,,,,,,,,4951.0,,
433,cycling,2023-04-22 15:54:40,17.08,304,,,14.2,111.0,107.0,,,,4841.0,54.0,98.0
434,cycling,2023-04-22 14:35:57,14.42,282,,,12.2,147.0,152.0,,,,4716.0,44.0,96.0
435,walking,2023-04-21 09:53:01,3.28,182,,,,,,,,,2756.0,,


* Verify NaN values

In [324]:
df_activities.isna().sum()

activity_type           0
date                    0
distance                0
calories                0
avg_hr                 17
avg_bike_cadence      246
avg_speed             240
total_ascent           47
total_descent          49
avg_stride_length     246
steps                 164
body_battery_drain    233
elapsed_time            0
min_elevation          57
max_elevation          49
dtype: int64

* Remove unnecessary columns

In [325]:
# Remove unnecessary columns
df_activities = df_activities[['activity_type', 'distance', 'calories', 'avg_hr', 'avg_speed', 'elapsed_time']]

In [326]:
df_activities.isna().sum()

activity_type      0
distance           0
calories           0
avg_hr            17
avg_speed        240
elapsed_time       0
dtype: int64

* Imput values for column 'srednia_predkosc'. Value is calculated based on columns dystans and czas_ruchu, if present. Using *calc_avg_speed* function from features.py

In [327]:
df_activities['avg_speed'] = df_activities.apply(lambda row: calc_avg_speed(row['elapsed_time'], row['distance']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_activities['avg_speed'] = df_activities.apply(lambda row: calc_avg_speed(row['elapsed_time'], row['distance']), axis=1)


In [328]:
df_activities.isna().sum()

activity_type     0
distance          0
calories          0
avg_hr           17
avg_speed        25
elapsed_time      0
dtype: int64

* Create new column 'aktywnosc_stacjonarna' that indicates if activity is outdoor or indoor

In [329]:
df_activities['indoor_activity'] = np.where((df_activities['avg_speed'].isna() | df_activities['avg_hr'].isna()), 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_activities['indoor_activity'] = np.where((df_activities['avg_speed'].isna() | df_activities['avg_hr'].isna()), 1, 0)


In [330]:
df_activities

Unnamed: 0,activity_type,distance,calories,avg_hr,avg_speed,elapsed_time,indoor_activity
0,cycling,5.25,169,128.0,3.81,4956.0,0
2,cycling,7.08,183,107.0,5.44,4685.0,0
3,cycling,0.00,454,141.0,,3310.0,1
6,running,7.02,437,161.0,9.55,2646.0,0
7,cycling,4.00,85,101.0,3.06,4701.0,0
...,...,...,...,...,...,...,...
431,walking,5.04,226,,3.66,4951.0,1
433,cycling,17.08,304,,12.70,4841.0,1
434,cycling,14.42,282,,11.01,4716.0,1
435,walking,3.28,182,,4.28,2756.0,1


* Verify current dimension and missing values

In [331]:
df_activities.isna().sum()

activity_type       0
distance            0
calories            0
avg_hr             17
avg_speed          25
elapsed_time        0
indoor_activity     0
dtype: int64

In [332]:
df_activities.shape

(264, 7)

* Imput 0.0 in 'srednia_predkosc' column, as this data was not collected in raw data. Model will recognize it and will learn that 0.0 and aktywnosc_stacjonarna == 1, means that activity should be calculated based on time and hr

In [333]:
df_activities['avg_speed'] = df_activities['avg_speed'].fillna(0.0).astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_activities['avg_speed'] = df_activities['avg_speed'].fillna(0.0).astype(float)


* Imput 'srednie_tetno' median for each activity group

In [334]:
mediany_tetna = df_activities.groupby('activity_type')['avg_hr'].median()
mediany_tetna

activity_type
cycling    132.0
running    166.0
walking     99.0
Name: avg_hr, dtype: float64

In [335]:
df_activities['avg_hr'] = df_activities.groupby('activity_type')['avg_hr'].transform(lambda x: x.fillna(x.median()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_activities['avg_hr'] = df_activities.groupby('activity_type')['avg_hr'].transform(lambda x: x.fillna(x.median()))


* Confirm missing values

In [336]:
df_activities.isna().sum()

activity_type      0
distance           0
calories           0
avg_hr             0
avg_speed          0
elapsed_time       0
indoor_activity    0
dtype: int64

In [337]:
df_activities

Unnamed: 0,activity_type,distance,calories,avg_hr,avg_speed,elapsed_time,indoor_activity
0,cycling,5.25,169,128.0,3.81,4956.0,0
2,cycling,7.08,183,107.0,5.44,4685.0,0
3,cycling,0.00,454,141.0,0.00,3310.0,1
6,running,7.02,437,161.0,9.55,2646.0,0
7,cycling,4.00,85,101.0,3.06,4701.0,0
...,...,...,...,...,...,...,...
431,walking,5.04,226,99.0,3.66,4951.0,1
433,cycling,17.08,304,132.0,12.70,4841.0,1
434,cycling,14.42,282,132.0,11.01,4716.0,1
435,walking,3.28,182,99.0,4.28,2756.0,1


* Export cleaned data to CSV file

In [338]:
df_activities.to_csv("data_files/activities_silver.csv", index=False)