In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Start from the original DataFrame
df = runs.copy()

# Filter to running activities (adjust string if your labels differ)
df = df[df["Activity Type"] == "Run"]

# Drop rows with missing critical values
critical_cols = ["Pace", "Distance", "Person"]
df = df.dropna(subset=critical_cols)

# Define target and features
target_col = "Pace"

numeric_features = [
    "Distance",
    "Elevation Gain",
    "Elevation Loss",
    "Elevation Low",
    "Elevation High",
    "Average Grade",
    "Max Grade",
    # add any other numeric columns you trust
]

categorical_features = [
    "Person",
    # "Activity Type",   # we filtered, but you could include if there are multiple types
]

X = df[numeric_features + categorical_features]
y = df[target_col]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape

In [None]:
import pandas as pd
import os

data_folder = "Data"   # folder inside repo containing all runner CSVs

# ------- HELPERS --------

def seconds_to_mmss(value):
    value = float(value)
    minutes = int(value) // 60
    seconds = int(value) % 60
    return f"{minutes}:{seconds:02d}"

def mph_to_pace(mph):
    if pd.isna(mph) or mph <= 0:
        return None
    pace = 60 / mph
    minutes = int(pace)
    seconds = int(round((pace - minutes) * 60))
    if seconds == 60:
        minutes += 1
        seconds = 0
    return f"{minutes}:{seconds:02d}"

# ------- READ & PROCESS ALL CSV FILES -------

all_dataframes = []

for filename in os.listdir(data_folder):
    if filename.endswith(".csv"):
        
        # Extract person name by removing extension (e.g., "Alex.csv" → "Alex")
        person_name = os.path.splitext(filename)[0]

        # Read the file
        file_path = os.path.join(data_folder, filename)
        df = pd.read_csv(file_path)

        # Add person column
        df["Person"] = person_name

        # Keep only the needed columns
        columns_to_keep = [
            "Activity Date", "Activity Type", "Elapsed Time", "Distance", "Moving Time",
            "Max Speed", "Average Speed", "Elevation Gain", "Elevation Loss",
            "Elevation Low", "Elevation High", "Max Grade", "Average Grade",
            "Average Grade Adjusted Pace", "Person"
        ]
        df = df[columns_to_keep]

        # Filter to runs only
        df = df[df["Activity Type"] == "Run"]

        # Format date
        df["Activity Date"] = pd.to_datetime(df["Activity Date"], format="mixed")
        df["Activity Date"] = df["Activity Date"].dt.strftime("%-m/%-d/%y")  # use %#m/%#d/%y on Windows

        # Convert time columns
        df["Elapsed Time"] = df["Elapsed Time"].apply(seconds_to_mmss)
        df["Moving Time"] = df["Moving Time"].apply(seconds_to_mmss)

        # Convert distance (km → miles)
        df["Distance"] = (df["Distance"] * 0.621371).round(2)

        # Convert speeds (m/s → mph)
        df["Max Speed"] = (df["Max Speed"] * 2.23694).round(2)
        df["Average Speed"] = (df["Average Speed"] * 2.23694).round(2)
        df["Average Grade Adjusted Pace"] = (df["Average Grade Adjusted Pace"] * 2.23694).round(2)

        # Convert elevation (m → ft)
        meters_to_feet = 3.28084
        elevation_cols = ["Elevation Gain", "Elevation Loss", "Elevation Low", "Elevation High"]
        df[elevation_cols] = (df[elevation_cols] * meters_to_feet).round(1)

        # Add new pace columns
        df["Pace"] = df["Average Speed"].apply(mph_to_pace)
        df["Grade Adjusted Pace"] = df["Average Grade Adjusted Pace"].apply(mph_to_pace)

        # Store processed dataframe
        all_dataframes.append(df)

# ------- MERGE ALL RUNNERS INTO ONE DF -------

runs = pd.concat(all_dataframes, ignore_index=True)

runs = runs[
    [
        "Activity Date", "Activity Type", "Elapsed Time", "Distance", "Pace",
        "Moving Time", "Max Speed", "Average Speed", "Elevation Gain",
        "Elevation Loss", "Elevation Low", "Elevation High", "Max Grade",
        "Average Grade", "Average Grade Adjusted Pace", "Grade Adjusted Pace",
        "Person"
    ]
]

runs

Unnamed: 0,Activity Date,Activity Type,Elapsed Time,Distance,Pace,Moving Time,Max Speed,Average Speed,Elevation Gain,Elevation Loss,Elevation Low,Elevation High,Max Grade,Average Grade,Average Grade Adjusted Pace,Grade Adjusted Pace,Person
0,9/5/23,Run,24:16,2.04,10:56,22:18,9.92,5.49,42.0,42.0,33.5,59.4,7.7,0.0,,,Karina
1,11/19/24,Run,18:49,1.07,9:10,9:50,11.36,6.55,9.8,149.9,282.8,429.1,14.5,-2.6,6.38,9:24,Karina
2,11/21/24,Run,29:14,1.62,17:45,28:38,10.36,3.38,197.8,197.8,196.5,403.5,35.7,0.0,3.76,15:57,Karina
3,11/24/24,Run,119:21,6.15,13:22,82:14,9.91,4.49,418.6,417.7,192.9,444.6,46.8,0.0,4.64,12:56,Karina
4,12/17/24,Run,73:39,2.97,9:17,27:38,11.05,6.46,93.8,93.8,27.9,58.7,28.8,0.0,6.53,9:11,Karina
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419,10/30/25,Run,33:39,4.14,7:14,30:00,11.32,8.29,91.5,96.8,320.5,367.1,5.7,0.0,8.33,7:12,Zubin
420,11/2/25,Run,173:16,10.13,7:50,79:24,16.33,7.66,376.6,749.3,9.8,444.6,22.7,-0.7,7.65,7:51,Zubin
421,11/14/25,Run,85:10,10.21,7:40,78:13,12.03,7.83,237.9,238.5,294.6,367.1,7.1,0.0,7.87,7:37,Zubin
422,11/20/25,Run,48:31,5.31,9:06,48:25,8.77,6.59,55.4,53.1,321.5,340.2,3.4,0.0,6.60,9:05,Zubin


# 1. Exploratory Analysis, Data Collection, Pre-Preprocessing and Discussion


## 1.1 Context

Our dataset, stored in a DataFrame called `runs`, comes from our own exported GPS activity logs from Strava (a popular social media app to record workouts). We each downloaded our Strava data directly from Strava and merged the "activity.cvs" files together to create one comprehensive dataset with all of our data. We even got some of our friends who also use Strava regularly to send us their statistics as well. Each row corresponds to a single activity with summary statistics such as:

- `Activity Date`, `Activity Type`
- `Elapsed Time`, `Moving Time`
- `Distance`
- `Pace`
- `Elevation Gain`, `Elevation Loss`, `Elevation Low`, `Elevation High`
- `Max Grade`, `Average Grade`
- `Person`

We focus on **running activities** where `Activity Type` indicates a run.

In [3]:
runs.sample(5)

Unnamed: 0,Activity Date,Activity Type,Elapsed Time,Distance,Pace,Moving Time,Max Speed,Average Speed,Elevation Gain,Elevation Loss,Elevation Low,Elevation High,Max Grade,Average Grade,Average Grade Adjusted Pace,Grade Adjusted Pace,Person
378,6/8/25,Run,100:28,3.46,9:53,34:14,9.26,6.07,235.9,236.2,315.6,444.6,47.7,0.1,6.11,9:49,Zubin
415,10/17/25,Run,34:06,4.58,7:26,34:05,10.78,8.07,221.8,221.8,229.0,367.1,11.9,0.0,8.19,7:20,Zubin
26,11/11/25,Run,42:25,2.32,9:42,22:31,10.96,6.19,203.4,183.7,184.4,404.5,35.3,0.0,6.51,9:13,Karina
273,11/21/24,Run,10:16,1.01,9:58,10:03,9.89,6.02,43.0,21.7,407.8,442.3,30.3,0.4,6.08,9:52,Audrey
180,3/27/25,Run,47:22,6.23,7:36,47:22,10.6,7.89,119.1,118.8,138.8,217.2,10.2,0.0,7.93,7:34,Alex


## 1.2 Discussion

Key preprocessing steps:

- Add person column 
- Keep only relevant columns (activity, time, elevation, grade, person)
- Format columns to not be in imperial system (time column, mph column, distance, speed) 
- Filter to **running activities** only
- Merge all runners into one dataframe 
- Split into train/test

In [None]:
import pandas as pd
import os

data_folder = "Data"   # folder inside repo containing all runner CSVs

# ------- HELPERS --------

def seconds_to_mmss(value):
    value = float(value)
    minutes = int(value) // 60
    seconds = int(value) % 60
    return f"{minutes}:{seconds:02d}"

def mph_to_pace(mph):
    if pd.isna(mph) or mph <= 0:
        return None
    pace = 60 / mph
    minutes = int(pace)
    seconds = int(round((pace - minutes) * 60))
    if seconds == 60:
        minutes += 1
        seconds = 0
    return f"{minutes}:{seconds:02d}"

# ------- READ & PROCESS ALL CSV FILES -------

all_dataframes = []

for filename in os.listdir(data_folder):
    if filename.endswith(".csv"):
        
        # Extract person name by removing extension (e.g., "Alex.csv" → "Alex")
        person_name = os.path.splitext(filename)[0]

        # Read the file
        file_path = os.path.join(data_folder, filename)
        df = pd.read_csv(file_path)

        # Add person column
        df["Person"] = person_name

        # Keep only the needed columns
        columns_to_keep = [
            "Activity Date", "Activity Type", "Elapsed Time", "Distance", "Moving Time",
            "Max Speed", "Average Speed", "Elevation Gain", "Elevation Loss",
            "Elevation Low", "Elevation High", "Max Grade", "Average Grade",
            "Average Grade Adjusted Pace", "Person"
        ]
        df = df[columns_to_keep]

        # Filter to runs only
        df = df[df["Activity Type"] == "Run"]

        # Format date
        df["Activity Date"] = pd.to_datetime(df["Activity Date"], format="mixed")
        df["Activity Date"] = df["Activity Date"].dt.strftime("%-m/%-d/%y")  # use %#m/%#d/%y on Windows

        # Convert time columns
        df["Elapsed Time"] = df["Elapsed Time"].apply(seconds_to_mmss)
        df["Moving Time"] = df["Moving Time"].apply(seconds_to_mmss)

        # Convert distance (km → miles)
        df["Distance"] = (df["Distance"] * 0.621371).round(2)

        # Convert speeds (m/s → mph)
        df["Max Speed"] = (df["Max Speed"] * 2.23694).round(2)
        df["Average Speed"] = (df["Average Speed"] * 2.23694).round(2)
        df["Average Grade Adjusted Pace"] = (df["Average Grade Adjusted Pace"] * 2.23694).round(2)

        # Convert elevation (m → ft)
        meters_to_feet = 3.28084
        elevation_cols = ["Elevation Gain", "Elevation Loss", "Elevation Low", "Elevation High"]
        df[elevation_cols] = (df[elevation_cols] * meters_to_feet).round(1)

        # Add new pace columns
        df["Pace"] = df["Average Speed"].apply(mph_to_pace)
        df["Grade Adjusted Pace"] = df["Average Grade Adjusted Pace"].apply(mph_to_pace)

        # Store processed dataframe
        all_dataframes.append(df)

# ------- MERGE ALL RUNNERS INTO ONE DF -------

runs = pd.concat(all_dataframes, ignore_index=True)

runs = runs[
    [
        "Activity Date", "Activity Type", "Elapsed Time", "Distance", "Pace",
        "Moving Time", "Max Speed", "Average Speed", "Elevation Gain",
        "Elevation Loss", "Elevation Low", "Elevation High", "Max Grade",
        "Average Grade", "Average Grade Adjusted Pace", "Grade Adjusted Pace",
        "Person"
    ]
]

## 1.3 Code 
##### Support your analysis with tables, plots, statistics, etc.

# 2. Predictive Task

In [None]:
#Convert MM:SS pace to numeric 
def pace_str_to_float(p):
    if pd.isna(p):
        return np.nan
    if isinstance(p, str) and ":" in p:
        m, s = p.split(":")
        return float(m) + float(s)/60
    return float(p)

runs["Pace_numeric"] = runs["Pace"].apply(pace_str_to_float)


In [12]:
#Sort Data by Date (Important for time trends)
runs["Activity Date"] = pd.to_datetime(runs["Activity Date"])
runs = runs.sort_values(["Person", "Activity Date"]).reset_index(drop=True)

In [14]:
#create rolling trend based features (using past 5 runs to predict next run)
import numpy as np
WINDOW = 5

feature_rows = []

for person, person_df in runs.groupby("Person"):
    person_df = person_df.sort_values("Activity Date").reset_index(drop=True)

    for i in range(WINDOW, len(person_df) - 1):
        hist = person_df.iloc[i-WINDOW:i]
        next_run = person_df.iloc[i+1]

        row = {
            "Person": person,

            # ----- PACE HISTORY -----
            "pace_mean_5": hist["Pace_numeric"].mean(),
            "pace_std_5": hist["Pace_numeric"].std(),
            "pace_last": hist["Pace_numeric"].iloc[-1],
            "pace_slope_5": np.polyfit(range(WINDOW), hist["Pace_numeric"], 1)[0],

            # ----- DISTANCE HISTORY -----
            "dist_mean_5": hist["Distance"].mean(),
            "dist_std_5": hist["Distance"].std(),
            "dist_last": hist["Distance"].iloc[-1],
            "dist_slope_5": np.polyfit(range(WINDOW), hist["Distance"], 1)[0],

            # ----- TERRAIN HISTORY -----
            "elev_gain_mean_5": hist["Elevation Gain"].mean(),
            "grade_mean_5": hist["Average Grade"].mean(),

            # ----- TIME GAP FEATURE -----
            "days_since_last": (
                person_df.iloc[i]["Activity Date"] -
                person_df.iloc[i-1]["Activity Date"]
            ).days,

            # ----- TARGETS (NEXT RUN) -----
            "next_run_pace": next_run["Pace_numeric"],
            "next_run_distance": next_run["Distance"]
        }

        feature_rows.append(row)

trend_df = pd.DataFrame(feature_rows)
trend_df = trend_df.dropna()
trend_df.head()


Unnamed: 0,Person,pace_mean_5,pace_std_5,pace_last,pace_slope_5,dist_mean_5,dist_std_5,dist_last,dist_slope_5,elev_gain_mean_5,grade_mean_5,days_since_last,next_run_pace,next_run_distance
0,Alex,7.3,0.37951,7.016667,-0.158333,2.036,0.32997,2.08,0.154,4.54,-1.94,3,7.35,1.03
1,Alex,7.05,0.644097,6.116667,-0.375,1.902,0.563844,1.01,-0.158,10.52,-2.06,34,7.6,1.38
2,Alex,6.933333,0.474634,7.35,-0.05,1.756,0.690167,1.03,-0.375,10.72,-2.12,8,7.016667,2.0
3,Alex,7.026667,0.561694,7.6,0.143333,1.6,0.663664,1.38,-0.329,9.4,-2.44,5,6.55,2.01
4,Alex,7.02,0.561545,7.016667,0.148333,1.5,0.515218,2.0,0.021,21.14,-1.9,6,7.6,3.0


In [15]:
#feature matrix and targets
X = trend_df.drop(columns=["next_run_pace", "next_run_distance"])
y_pace = trend_df["next_run_pace"]
y_dist = trend_df["next_run_distance"]


In [16]:
#One hot encode person and do the train/test split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

numeric_features = X.drop(columns=["Person"]).columns.tolist()
categorical_features = ["Person"]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)

X_train, X_test, y_pace_train, y_pace_test = train_test_split(
    X, y_pace, test_size=0.2, random_state=42
)

_, _, y_dist_train, y_dist_test = train_test_split(
    X, y_dist, test_size=0.2, random_state=42
)


In [17]:
#Linear regression baselines 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

linreg_pace = Pipeline([
    ("preprocess", preprocessor),
    ("model", LinearRegression())
])

linreg_dist = Pipeline([
    ("preprocess", preprocessor),
    ("model", LinearRegression())
])

linreg_pace.fit(X_train, y_pace_train)
linreg_dist.fit(X_train, y_dist_train)

pace_pred_lr = linreg_pace.predict(X_test)
dist_pred_lr = linreg_dist.predict(X_test)

print("Linear Regression — Pace MAE:", mean_absolute_error(y_pace_test, pace_pred_lr))
print("Linear Regression — Pace RMSE:", mean_squared_error(y_pace_test, pace_pred_lr, squared=False))

print("Linear Regression — Distance MAE:", mean_absolute_error(y_dist_test, dist_pred_lr))
print("Linear Regression — Distance RMSE:", mean_squared_error(y_dist_test, dist_pred_lr, squared=False))


Linear Regression — Pace MAE: 8.42841004549719
Linear Regression — Pace RMSE: 19.979106193877797
Linear Regression — Distance MAE: 1.5823890301924006
Linear Regression — Distance RMSE: 2.324928601240988




In [18]:
#Ridge and GridSearchCV - opimized model 
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

ridge_pipe = Pipeline([
    ("preprocess", preprocessor),
    ("model", Ridge())
])

param_grid = {
    "model__alpha": [0.01, 0.1, 1, 10, 50, 100]
}

#ridge for pace 
ridge_grid_pace = GridSearchCV(
    ridge_pipe, param_grid,
    cv=5,
    scoring="neg_mean_absolute_error",
    n_jobs=-1
)

ridge_grid_pace.fit(X_train, y_pace_train)

best_ridge_pace = ridge_grid_pace.best_estimator_
pace_pred_ridge = best_ridge_pace.predict(X_test)

print("Best Ridge alpha (Pace):", ridge_grid_pace.best_params_)
print("Ridge Pace MAE:", mean_absolute_error(y_pace_test, pace_pred_ridge))
print("Ridge Pace RMSE:", mean_squared_error(y_pace_test, pace_pred_ridge, squared=False))

#ridge for distance 
ridge_grid_dist = GridSearchCV(
    ridge_pipe, param_grid,
    cv=5,
    scoring="neg_mean_absolute_error",
    n_jobs=-1
)

ridge_grid_dist.fit(X_train, y_dist_train)

best_ridge_dist = ridge_grid_dist.best_estimator_
dist_pred_ridge = best_ridge_dist.predict(X_test)

print("Best Ridge alpha (Distance):", ridge_grid_dist.best_params_)
print("Ridge Distance MAE:", mean_absolute_error(y_dist_test, dist_pred_ridge))
print("Ridge Distance RMSE:", mean_squared_error(y_dist_test, dist_pred_ridge, squared=False))



Best Ridge alpha (Pace): {'model__alpha': 100}
Ridge Pace MAE: 5.055894056823785
Ridge Pace RMSE: 6.904936067333741
Best Ridge alpha (Distance): {'model__alpha': 0.01}
Ridge Distance MAE: 1.5841714721188922
Ridge Distance RMSE: 2.3277022153770885




In [19]:
#strong baseline 
last_pace_baseline = X_test["pace_last"]
last_dist_baseline = X_test["dist_last"]

print("Last-Run Pace MAE:", mean_absolute_error(y_pace_test, last_pace_baseline))
print("Last-Run Distance MAE:", mean_absolute_error(y_dist_test, last_dist_baseline))


Last-Run Pace MAE: 1.022573839662447
Last-Run Distance MAE: 1.8837974683544303


In [20]:
results = pd.DataFrame({
    "Model": ["Last Run", "Linear Regression", "Ridge Regression"],
    "Pace MAE": [
        mean_absolute_error(y_pace_test, last_pace_baseline),
        mean_absolute_error(y_pace_test, pace_pred_lr),
        mean_absolute_error(y_pace_test, pace_pred_ridge)
    ],
    "Distance MAE": [
        mean_absolute_error(y_dist_test, last_dist_baseline),
        mean_absolute_error(y_dist_test, dist_pred_lr),
        mean_absolute_error(y_dist_test, dist_pred_ridge)
    ]
})

results


Unnamed: 0,Model,Pace MAE,Distance MAE
0,Last Run,1.022574,1.883797
1,Linear Regression,8.42841,1.582389
2,Ridge Regression,5.055894,1.584171


## 2.1 Task Definition

## 2.2 Evaluation Strategy

We evaluate our models using a **train/validation/test split**:

- Split the dataset into:
  - **Training set**: fit the model and (via cross-validation) tune hyperparameters.
  - **Test set**: held-out data for final performance reporting.

We use 
- **Mean Absolute Error (MAE)**:  
  Interpretable as the average absolute error in pace units and distance units


## 2.3 Relevant Baselines 

## 2.4 Validity of Predictions

# 3 Modeling

## 3.1 Context 

## 3.2 Discussion

## 3.3 Code