# 1. Define the problem

- To predict the time it will take the rider to complete the set track.
- This is a problem of the type **regression**.


# 2. Collect the data

- Data has been collected from my Strava account, transformed to CSV, and stored in the data folder.
- It has been merged into one big csv, adding an id column to identify the activity.
- It has been tried to get data from Kaggle or other sources, but it was not possible.
- As our problem is to predict the time given a track, we will only keep the GPS data.

In [6]:
import pandas as pd

activity_df = pd.read_csv("data/processed_activities.csv")

# 3. Exploratory Data Analysis, new features, and data cleaning
   - Descriptive statistical analysis (means, medians, standard deviations, etc.)
   - Visualizations (histograms, bar charts, boxplots, correlation matrices, etc.)
   - Identify missing data
   - Detect outliers
   - Create new features
   - Data cleaning (remove duplicates, fix data types, etc.)

In [7]:
# First we need to group the data by the id and create new columns

activity_df.describe()

Unnamed: 0,id,position_lat,position_long,distance,speed,cadence,power,temperature,altitude,grade
count,426104.0,425936.0,425936.0,426008.0,425964.0,201917.0,201905.0,425874.0,423863.0,423821.0
mean,10923400000.0,438140900.0,-53237380.0,12679.586805,4.145886,72.429984,143.165568,16.579927,121.796346,1.571252
std,813670600.0,353151.6,364993.9,9676.908156,2.024192,30.693327,87.105788,5.374928,132.378802,5.321067
min,9364723000.0,437425600.0,-54846290.0,0.0,0.0,0.0,0.0,5.0,-34.2,-34.86
25%,10515560000.0,437813500.0,-53319870.0,5019.8425,2.28,79.0,106.0,13.0,20.8,0.0
50%,11283060000.0,438168100.0,-53186320.0,10036.84,4.258,85.0,147.0,16.0,61.2,0.0
75%,11540950000.0,438447800.0,-53023150.0,18587.265,5.631,88.0,186.0,20.0,207.2,3.65
max,11926310000.0,439118300.0,-52308470.0,41829.96,36.559,166.0,879.0,33.0,642.8,25.57


In [8]:
# timestamp column to datetime
activity_df["timestamp"] = pd.to_datetime(activity_df["timestamp"])

# See types
activity_df.dtypes

id                        int64
position_lat            float64
position_long           float64
distance                float64
speed                   float64
timestamp        datetime64[ns]
cadence                 float64
power                   float64
temperature             float64
altitude                float64
grade                   float64
dtype: object

In [9]:
cols = ["id", "timestamp", "position_lat", "position_long", "distance", "altitude", "grade", "temperature"]
activity_df = activity_df[cols]
activity_df = activity_df.sort_values(by=["id", "timestamp"])

activity_df.head()

Unnamed: 0,id,timestamp,position_lat,position_long,distance,altitude,grade,temperature
0,9364722629,2023-03-16 15:41:18,438138152.0,-53432178.0,0.0,,,
1,9364722629,2023-03-16 15:41:39,438138243.0,-53435415.0,0.0,,,20.0
2,9364722629,2023-03-16 15:41:40,438138288.0,-53435785.0,2.5,,,20.0
3,9364722629,2023-03-16 15:41:41,438138379.0,-53436143.0,5.23,,,20.0
4,9364722629,2023-03-16 15:41:42,438138379.0,-53436467.0,7.68,,,20.0


In [10]:
def categorize_grade(df):
    # Categorize in cats:
    # flat -5 to 5
    # dh -5 to -10
    # hard_dh -10 to -inf
    # up 5 to 10
    # hard_up 10 to 20
    # extreme_up 20 to inf

    cuts = [-float("inf"), -10, -5, 5, 10, 20, float("inf")]
    labels = ["hard_dh", "dh", "flat", "up", "hard_up", "extreme_up"]
    df["grade_cat"] = pd.cut(df["grade"], bins=cuts, labels=labels)
    return df

activity_df = categorize_grade(activity_df)
activity_df.tail()

Unnamed: 0,id,timestamp,position_lat,position_long,distance,altitude,grade,temperature,grade_cat
426099,11926306295,2024-04-11 18:48:43,438135093.0,-52892641.0,15268.7,36.2,-0.92,14.0,flat
426100,11926306295,2024-04-11 18:48:44,438134784.0,-52891999.0,15274.35,36.2,0.0,14.0,flat
426101,11926306295,2024-04-11 18:48:45,438134611.0,-52891536.0,15278.21,36.0,-0.17,14.0,flat
426102,11926306295,2024-04-11 18:48:46,438134575.0,-52891252.0,15280.49,35.8,-1.13,14.0,flat
426103,11926306295,2024-04-11 18:48:47,438134657.0,-52891008.0,15282.54,35.8,-1.25,14.0,flat


In [11]:
def time_of_day(timestamp):
    if 6 <= timestamp.hour < 12:
        return "morning"
    elif 12 <= timestamp.hour < 18:
        return "afternoon"
    else:
        return "night"
    
def season(timestamp):
    if timestamp.month in [12, 1, 2]:
        return "summer"
    elif timestamp.month in [3, 4, 5]:
        return "autumn"
    elif timestamp.month in [6, 7, 8]:
        return "winter"
    else:
        return "spring"

def group_dataset(df):
    # Group by id 
    # Create the following columns:
    # - total_distance
    # - total time in seconds
    # - ascent_meters %
    # - meters in each grade category %
    # - time of the day - morning (6-12), afternoon (12-18), night (18-24)
    # - season

    # Group by id
    df = df.groupby("id")
    df = df.agg(
        total_distance=("distance", "max"),
        total_time=("timestamp", lambda x: (x.max() - x.min()).total_seconds()),
        time_of_day=("timestamp", lambda x: time_of_day(x.min())),
        season=("timestamp", lambda x: season(x.min())),
        ascent_pctg=("grade_cat", lambda x: (x == "up").sum() / len(x)),
        hard_dh_pctg=("grade_cat", lambda x: (x == "hard_dh").sum() / len(x)),
        dh_pctg=("grade_cat", lambda x: (x == "dh").sum() / len(x)),
        flat_pctg=("grade_cat", lambda x: (x == "flat").sum() / len(x)),
        up_pctg=("grade_cat", lambda x: (x == "up").sum() / len(x)),
        hard_up_pctg=("grade_cat", lambda x: (x == "hard_up").sum() / len(x)),
        extreme_up_pctg=("grade_cat", lambda x: (x == "extreme_up").sum() / len(x))
    )

    return df

activity_df = group_dataset(activity_df)
activity_df.head()

Unnamed: 0_level_0,total_distance,total_time,time_of_day,season,ascent_pctg,hard_dh_pctg,dh_pctg,flat_pctg,up_pctg,hard_up_pctg,extreme_up_pctg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9364722629,20985.8,6098.0,afternoon,autumn,0.230218,0.022398,0.072884,0.545621,0.230218,0.12539,0.000184
9374174799,41829.96,13446.0,morning,autumn,0.216995,0.035323,0.075957,0.549401,0.216995,0.118193,0.001517
9395240266,35638.09,8104.0,afternoon,autumn,0.094512,0.013388,0.039369,0.833378,0.094512,0.018955,0.000265
9407119831,21860.83,6442.0,afternoon,autumn,0.208517,0.028334,0.080081,0.452664,0.208517,0.224126,0.0
9469049236,14268.97,5579.0,morning,autumn,0.180618,0.066436,0.086347,0.462617,0.180618,0.188135,0.0


In [12]:
# Predict variable
pred_var = "total_time"

X = activity_df.drop(columns=[pred_var])
y = activity_df[pred_var]

print("X shape:", X.shape)
print("y shape:", y.shape)



X shape: (74, 10)
y shape: (74,)


In [13]:
# Convert to np arrays

X = X.values
y = y.values

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (74, 10)
y shape: (74,)


In [14]:
# One hot encode columns with categorical data

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# One hot encode the columns
print(X[:5])
ct = ColumnTransformer(transformers=[("encoder", OneHotEncoder(), [1, 2])], remainder="passthrough")
X = ct.fit_transform(X)


print(X[:5])
print("X shape:", X.shape)
print("y shape:", y.shape)

[[20985.8 'afternoon' 'autumn' 0.23021846888195338 0.02239765008261428
  0.07288415641637598 0.5456214429961447 0.23021846888195338
  0.12539012300348815 0.00018358729575913347]
 [41829.96 'morning' 'autumn' 0.21699544764795145 0.035322879784184795
  0.07595683695835441 0.5494014500084303 0.21699544764795145
  0.11819254763109088 0.0015174506828528073]
 [35638.09 'afternoon' 'autumn' 0.09451219512195122 0.0133881230116649
  0.03936903499469777 0.8333775185577943 0.09451219512195122
  0.018955461293743373 0.0002651113467656416]
 [21860.83 'afternoon' 'autumn' 0.20851713607058026 0.02833389888021717
  0.08008143875127248 0.45266372582287073 0.20851713607058026
  0.22412623006447235 0.0]
 [14268.97 'morning' 'autumn' 0.1806176351076798 0.06643640796424218
  0.08634701340918326 0.46261682242990654 0.1806176351076798
  0.18813490451036163 0.0]]
[[1.0 0.0 0.0 1.0 0.0 0.0 0.0 20985.8 0.23021846888195338
  0.02239765008261428 0.07288415641637598 0.5456214429961447
  0.23021846888195338 0.12539

In [15]:
# Split the data into train and test

from sklearn.model_selection import train_test_split
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

X_train = np.array(X_train).astype(np.float32)
X_test = np.array(X_test).astype(np.float32)
y_train = np.array(y_train).astype(np.float32)
y_test = np.array(y_test).astype(np.float32)


print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# Types
print("X_train type:", type(X_train))
print("X_test type:", type(X_test))


X_train shape: (59, 15)
X_test shape: (15, 15)
y_train shape: (59,)
y_test shape: (15,)
X_train type: <class 'numpy.ndarray'>
X_test type: <class 'numpy.ndarray'>


In [16]:
# Use Ridge regression with lienar kernel

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

ridge = Ridge()
ridge.fit(X_train, y_train)

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [17]:
# Get accuracy, precision, recall and f1 score
y_pred = ridge.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean squared error:", mse)


Mean squared error: 680366.06


In [18]:
# SVR lineal
from sklearn.svm import SVR
svr = SVR(kernel="linear")
svr.fit(X_train, y_train)

# Get accuracy, precision, recall and f1 score
y_pred = svr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean squared error:", mse)

Mean squared error: 1056653.1772810658


In [19]:
# Get r2 score of both models
from sklearn.metrics import r2_score

r2_ridge = r2_score(y_test, ridge.predict(X_test))
r2_svr = r2_score(y_test, svr.predict(X_test))

print("R2 Ridge:", r2_ridge)
print("R2 SVR:", r2_svr)

R2 Ridge: 0.781035134692907
R2 SVR: 0.6599331571806867
