In [33]:
import pandas as pd
import os

data_folder = "Data"   # folder inside repo containing all runner CSVs

# ------- HELPERS --------

def seconds_to_mmss(value):
    value = float(value)
    minutes = int(value) // 60
    seconds = int(value) % 60
    return f"{minutes}:{seconds:02d}"

def mph_to_pace(mph):
    if pd.isna(mph) or mph <= 0:
        return None
    pace = 60 / mph
    minutes = int(pace)
    seconds = int(round((pace - minutes) * 60))
    if seconds == 60:
        minutes += 1
        seconds = 0
    return f"{minutes}:{seconds:02d}"

# ------- READ & PROCESS ALL CSV FILES -------

all_dataframes = []

for filename in os.listdir(data_folder):
    if filename.endswith(".csv"):
        
        # Extract person name by removing extension (e.g., "Alex.csv" → "Alex")
        person_name = os.path.splitext(filename)[0]

        # Read the file
        file_path = os.path.join(data_folder, filename)
        df = pd.read_csv(file_path)

        # Add person column
        df["Person"] = person_name

        # Keep only the needed columns
        columns_to_keep = [
            "Activity Date", "Activity Type", "Elapsed Time", "Distance", "Moving Time",
            "Max Speed", "Average Speed", "Elevation Gain", "Elevation Loss",
            "Elevation Low", "Elevation High", "Max Grade", "Average Grade",
            "Average Grade Adjusted Pace", "Person"
        ]
        df = df[columns_to_keep]

        # Filter to runs only
        df = df[df["Activity Type"] == "Run"]

        # Format date
        df["Activity Date"] = pd.to_datetime(df["Activity Date"], format="mixed")
        df["Activity Date"] = df["Activity Date"].dt.strftime("%-m/%-d/%y")  # use %#m/%#d/%y on Windows

        # Convert time columns
        df["Elapsed Time"] = df["Elapsed Time"].apply(seconds_to_mmss)
        df["Moving Time"] = df["Moving Time"].apply(seconds_to_mmss)

        # Convert distance (km → miles)
        df["Distance"] = (df["Distance"] * 0.621371).round(2)

        # Convert speeds (m/s → mph)
        df["Max Speed"] = (df["Max Speed"] * 2.23694).round(2)
        df["Average Speed"] = (df["Average Speed"] * 2.23694).round(2)
        df["Average Grade Adjusted Pace"] = (df["Average Grade Adjusted Pace"] * 2.23694).round(2)

        # Convert elevation (m → ft)
        meters_to_feet = 3.28084
        elevation_cols = ["Elevation Gain", "Elevation Loss", "Elevation Low", "Elevation High"]
        df[elevation_cols] = (df[elevation_cols] * meters_to_feet).round(1)

        # Add new pace columns
        df["Pace"] = df["Average Speed"].apply(mph_to_pace)
        df["Grade Adjusted Pace"] = df["Average Grade Adjusted Pace"].apply(mph_to_pace)

        # Store processed dataframe
        all_dataframes.append(df)

# ------- MERGE ALL RUNNERS INTO ONE DF -------

runs = pd.concat(all_dataframes, ignore_index=True)

runs = runs[
    [
        "Activity Date", "Activity Type", "Elapsed Time", "Distance", "Pace",
        "Elevation Loss", "Elevation Low", "Elevation High", "Max Grade",
        "Average Grade", "Average Grade Adjusted Pace", "Grade Adjusted Pace",
        "Person"
    ]
]

# 1. Exploratory Analysis, Data Collection, Pre-Preprocessing and Discussion


## 1.1 Context
**Where does your dataset come from? What is it for, how was it collected, etc.?**

Our dataset, stored in a DataFrame called `runs`, comes from our own exported GPS activity logs from Strava (a popular social media app to record workouts). We each downloaded our Strava data directly from Strava and merged the "activity.cvs" files together to create one comprehensive dataset with all of our data. We even got some of our friends who also use Strava regularly to send us their statistics as well. Each row corresponds to a single activity with summary statistics such as:

- `Activity Date`, `Activity Type`
- `Elapsed Time`, `Moving Time`
- `Distance`
- `Pace`
- `Elevation Gain`, `Elevation Loss`, `Elevation Low`, `Elevation High`
- `Max Grade`, `Average Grade`
- `Person`

We focus on **running activities** where `Activity Type` indicates a run.

In [3]:
runs.sample(5)

Unnamed: 0,Activity Date,Activity Type,Elapsed Time,Distance,Pace,Moving Time,Max Speed,Average Speed,Elevation Gain,Elevation Loss,Elevation Low,Elevation High,Max Grade,Average Grade,Average Grade Adjusted Pace,Grade Adjusted Pace,Person
378,6/8/25,Run,100:28,3.46,9:53,34:14,9.26,6.07,235.9,236.2,315.6,444.6,47.7,0.1,6.11,9:49,Zubin
415,10/17/25,Run,34:06,4.58,7:26,34:05,10.78,8.07,221.8,221.8,229.0,367.1,11.9,0.0,8.19,7:20,Zubin
26,11/11/25,Run,42:25,2.32,9:42,22:31,10.96,6.19,203.4,183.7,184.4,404.5,35.3,0.0,6.51,9:13,Karina
273,11/21/24,Run,10:16,1.01,9:58,10:03,9.89,6.02,43.0,21.7,407.8,442.3,30.3,0.4,6.08,9:52,Audrey
180,3/27/25,Run,47:22,6.23,7:36,47:22,10.6,7.89,119.1,118.8,138.8,217.2,10.2,0.0,7.93,7:34,Alex


## 1.2 Discussion
**Report how you processed the data (or how it was already processed)**

Key preprocessing steps:

- Add person column 
- Keep only relevant columns (activity, time, elevation, grade, person)
- Format columns to not be in imperial system (time column, mph column, distance, speed) 
- Filter to **running activities** only
- Merge all runners into one dataframe 
- Split into train/test

In [None]:
import pandas as pd
import os

data_folder = "Data"   # folder inside repo containing all runner CSVs

# ------- HELPERS --------

def seconds_to_mmss(value):
    value = float(value)
    minutes = int(value) // 60
    seconds = int(value) % 60
    return f"{minutes}:{seconds:02d}"

def mph_to_pace(mph):
    if pd.isna(mph) or mph <= 0:
        return None
    pace = 60 / mph
    minutes = int(pace)
    seconds = int(round((pace - minutes) * 60))
    if seconds == 60:
        minutes += 1
        seconds = 0
    return f"{minutes}:{seconds:02d}"

# ------- READ & PROCESS ALL CSV FILES -------

all_dataframes = []

for filename in os.listdir(data_folder):
    if filename.endswith(".csv"):
        
        # Extract person name by removing extension (e.g., "Alex.csv" → "Alex")
        person_name = os.path.splitext(filename)[0]

        # Read the file
        file_path = os.path.join(data_folder, filename)
        df = pd.read_csv(file_path)

        # Add person column
        df["Person"] = person_name

        # Keep only the needed columns
        columns_to_keep = [
            "Activity Date", "Activity Type", "Elapsed Time", "Distance", "Moving Time",
            "Max Speed", "Average Speed", "Elevation Gain", "Elevation Loss",
            "Elevation Low", "Elevation High", "Max Grade", "Average Grade",
            "Average Grade Adjusted Pace", "Person"
        ]
        df = df[columns_to_keep]

        # Filter to runs only
        df = df[df["Activity Type"] == "Run"]

        # Format date
        df["Activity Date"] = pd.to_datetime(df["Activity Date"], format="mixed")
        df["Activity Date"] = df["Activity Date"].dt.strftime("%-m/%-d/%y")  # use %#m/%#d/%y on Windows

        # Convert time columns
        df["Elapsed Time"] = df["Elapsed Time"].apply(seconds_to_mmss)
        df["Moving Time"] = df["Moving Time"].apply(seconds_to_mmss)

        # Convert distance (km → miles)
        df["Distance"] = (df["Distance"] * 0.621371).round(2)

        # Convert speeds (m/s → mph)
        df["Max Speed"] = (df["Max Speed"] * 2.23694).round(2)
        df["Average Speed"] = (df["Average Speed"] * 2.23694).round(2)
        df["Average Grade Adjusted Pace"] = (df["Average Grade Adjusted Pace"] * 2.23694).round(2)

        # Convert elevation (m → ft)
        meters_to_feet = 3.28084
        elevation_cols = ["Elevation Gain", "Elevation Loss", "Elevation Low", "Elevation High"]
        df[elevation_cols] = (df[elevation_cols] * meters_to_feet).round(1)

        # Add new pace columns
        df["Pace"] = df["Average Speed"].apply(mph_to_pace)
        df["Grade Adjusted Pace"] = df["Average Grade Adjusted Pace"].apply(mph_to_pace)

        # Store processed dataframe
        all_dataframes.append(df)

# ------- MERGE ALL RUNNERS INTO ONE DF -------

runs = pd.concat(all_dataframes, ignore_index=True)

runs = runs[
    [
        "Activity Date", "Activity Type", "Elapsed Time", "Distance", "Pace",
        "Moving Time", "Max Speed", "Average Speed", "Elevation Gain",
        "Elevation Loss", "Elevation Low", "Elevation High", "Max Grade",
        "Average Grade", "Average Grade Adjusted Pace", "Grade Adjusted Pace",
        "Person"
    ]
]

## 1.3 Code 
**Support your analysis with tables, plots, statistics, etc.**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Set style for better-looking plots
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# --- BASIC DATASET STATISTICS ---
print("=" * 60)
print("DATASET OVERVIEW")
print("=" * 60)
print(f"Total number of runs: {len(runs)}")
print(f"Number of unique runners: {runs['Person'].nunique()}")
print(f"Date range: {runs['Activity Date'].min()} to {runs['Activity Date'].max()}")
print()

# --- RUNS PER PERSON ---
print("=" * 60)
print("RUNS PER PERSON")
print("=" * 60)
runs_per_person = runs.groupby('Person').size().sort_values(ascending=False)
print(runs_per_person)
print()

# --- VISUALIZE RUNS PER PERSON ---
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
runs_per_person.plot(kind='bar', ax=ax, color='steelblue')
ax.set_title('Number of Runs per Person', fontsize=14, fontweight='bold')
ax.set_xlabel('Person', fontsize=12)
ax.set_ylabel('Number of Runs', fontsize=12)
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()

# --- CONVERT PACE TO NUMERIC FOR ANALYSIS ---
def pace_str_to_float(p):
    if pd.isna(p):
        return np.nan
    if isinstance(p, str) and ":" in p:
        m, s = p.split(":")
        return float(m) + float(s)/60
    return float(p)

runs['Pace_numeric'] = runs['Pace'].apply(pace_str_to_float)
runs['Grade Adjusted Pace_numeric'] = runs['Grade Adjusted Pace'].apply(pace_str_to_float)

# --- REMOVE EXTREME OUTLIERS (pace > 30 min/mile is unrealistic for running) ---
print("=" * 60)
print("OUTLIER REMOVAL")
print("=" * 60)
print(f"Runs before outlier removal: {len(runs)}")
runs_clean = runs[runs['Pace_numeric'] <= 30].copy()
print(f"Runs after removing pace outliers (pace > 30 min/mile): {len(runs_clean)}")
print(f"Removed {len(runs) - len(runs_clean)} outlier runs")
print()

# Use cleaned data for all subsequent analysis
runs = runs_clean

# --- SUMMARY STATISTICS BY PERSON ---
print("=" * 60)
print("SUMMARY STATISTICS BY PERSON")
print("=" * 60)
summary_stats = runs.groupby('Person').agg({
    'Distance': ['mean', 'std', 'min', 'max'],
    'Pace_numeric': ['mean', 'std', 'min', 'max'],
    'Elevation Gain': ['mean', 'std'],
    'Average Grade': ['mean', 'std']
}).round(2)
print(summary_stats)
print()

# --- DISTRIBUTION OF DISTANCES ---
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Overall distance distribution
axes[0].hist(runs['Distance'], bins=30, color='skyblue', edgecolor='black', alpha=0.7)
axes[0].set_title('Distribution of Run Distances', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Distance (miles)', fontsize=12)
axes[0].set_ylabel('Frequency', fontsize=12)
axes[0].axvline(runs['Distance'].mean(), color='red', linestyle='--', linewidth=2, label=f'Mean: {runs["Distance"].mean():.2f}')
axes[0].legend()

# Distance distribution by person
for person in runs['Person'].unique():
    person_data = runs[runs['Person'] == person]['Distance']
    axes[1].hist(person_data, bins=20, alpha=0.5, label=person, edgecolor='black')
axes[1].set_title('Distance Distribution by Person', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Distance (miles)', fontsize=12)
axes[1].set_ylabel('Frequency', fontsize=12)
axes[1].legend()

plt.tight_layout()
plt.show()

# --- DISTRIBUTION OF PACE ---
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Overall pace distribution
axes[0].hist(runs['Pace_numeric'].dropna(), bins=30, color='lightcoral', edgecolor='black', alpha=0.7)
axes[0].set_title('Distribution of Pace (Outliers Removed)', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Pace (min/mile)', fontsize=12)
axes[0].set_ylabel('Frequency', fontsize=12)
axes[0].axvline(runs['Pace_numeric'].mean(), color='darkred', linestyle='--', linewidth=2, label=f'Mean: {runs["Pace_numeric"].mean():.2f}')
axes[0].legend()

# Pace distribution by person (boxplot)
runs.boxplot(column='Pace_numeric', by='Person', ax=axes[1], patch_artist=True)
axes[1].set_title('Pace Distribution by Person (Outliers Removed)', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Person', fontsize=12)
axes[1].set_ylabel('Pace (min/mile)', fontsize=12)
plt.suptitle('')  # Remove the automatic title from boxplot

plt.tight_layout()
plt.show()

# --- ELEVATION GAIN ANALYSIS ---
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Overall elevation gain distribution
axes[0].hist(runs['Elevation Gain'].dropna(), bins=30, color='lightgreen', edgecolor='black', alpha=0.7)
axes[0].set_title('Distribution of Elevation Gain', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Elevation Gain (ft)', fontsize=12)
axes[0].set_ylabel('Frequency', fontsize=12)
axes[0].axvline(runs['Elevation Gain'].mean(), color='darkgreen', linestyle='--', linewidth=2, label=f'Mean: {runs["Elevation Gain"].mean():.2f}')
axes[0].legend()

# Elevation gain by person
elevation_by_person = runs.groupby('Person')['Elevation Gain'].mean().sort_values()
elevation_by_person.plot(kind='barh', ax=axes[1], color='seagreen')
axes[1].set_title('Average Elevation Gain by Person', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Average Elevation Gain (ft)', fontsize=12)
axes[1].set_ylabel('Person', fontsize=12)

plt.tight_layout()
plt.show()

# --- CORRELATION ANALYSIS ---
print("=" * 60)
print("CORRELATION ANALYSIS")
print("=" * 60)
numeric_cols = ['Distance', 'Pace_numeric', 'Elevation Gain', 'Elevation Loss',
                'Elevation Low', 'Elevation High', 'Max Grade', 'Average Grade']
correlation_matrix = runs[numeric_cols].corr()
print(correlation_matrix.round(2))
print()

# Visualize correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Correlation Matrix of Running Features', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# --- PACE vs DISTANCE SCATTER ---
plt.figure(figsize=(10, 6))
for person in runs['Person'].unique():
    person_data = runs[runs['Person'] == person]
    plt.scatter(person_data['Distance'], person_data['Pace_numeric'],
                alpha=0.6, label=person, s=50)
plt.xlabel('Distance (miles)', fontsize=12)
plt.ylabel('Pace (min/mile)', fontsize=12)
plt.title('Pace vs Distance by Person', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# --- TEMPORAL TRENDS ---
runs['Activity Date'] = pd.to_datetime(runs['Activity Date'])
runs_sorted = runs.sort_values('Activity Date')

plt.figure(figsize=(14, 6))
for person in runs['Person'].unique():
    person_data = runs_sorted[runs_sorted['Person'] == person]
    plt.plot(person_data['Activity Date'], person_data['Distance'],
             marker='o', alpha=0.7, label=person, markersize=4)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Distance (miles)', fontsize=12)
plt.title('Running Distance Over Time by Person', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# --- KEY INSIGHTS ---
print("=" * 60)
print("KEY INSIGHTS")
print("=" * 60)
print(f"• Average run distance: {runs['Distance'].mean():.2f} miles")
print(f"• Average pace: {runs['Pace_numeric'].mean():.2f} min/mile")
print(f"• Average elevation gain: {runs['Elevation Gain'].mean():.2f} feet")
print(f"• Most active runner: {runs_per_person.idxmax()} with {runs_per_person.max()} runs")
print(f"• Fastest average pace: {runs.groupby('Person')['Pace_numeric'].mean().idxmin()}")
print(f"• Longest average distance: {runs.groupby('Person')['Distance'].mean().idxmax()}")
print("=" * 60)

# 2. Predictive Task

## 2.1 Task Definition
**Identify the predictive task you will study**

We study a **supervised regression** task focused on modeling short-term training dynamics in runners.

> **Given a runner’s recent training history, predict the pace and distance of their *next run*.**

Each data point is constructed using a sliding time window over past runs for an individual runner, and the target corresponds to the immediately following run. This allows us to model **temporal trends, fitness adaptation, and training consistency**.

### Inputs (Features)

For each runner and each prediction point, we use summary **trend-based features** computed from the previous 5 runs:

- **Pace history**
  - Mean pace over last 5 runs  
  - Standard deviation of pace  
  - Most recent pace  
  - Linear slope of pace over last 5 runs (trend)

- **Distance history**
  - Mean distance over last 5 runs  
  - Standard deviation of distance  
  - Most recent distance  
  - Linear slope of distance over last 5 runs (trend)

- **Training load & terrain**
  - Mean elevation gain over last 5 runs  
  - Mean grade over last 5 runs  

- **Recovery**
  - Days since last run  

- **Identity**
  - Runner identity (`Person`, one-hot encoded)

These features jointly encode **current fitness level, training direction, consistency, and recovery behavior**.

### Outputs (Targets)

We perform **two regression tasks**:

- `next_run_pace`: numeric pace of the next run (minutes per mile)
- `next_run_distance`: numeric distance of the next run (miles)

## 2.2 Evaluation Strategy
**Describe how you will evaluate your model at this predictive task**


We evaluate our models using a **train/validation/test split**:

- Split the dataset into:
  - **Training set**: fit the model and (via cross-validation) tune hyperparameters.
  - **Test set**: held-out data for final performance reporting.

We use 
- **Mean Absolute Error (MAE)**:  
  Interpretable as the average absolute error in pace units and distance units

## 2.3 Relevant Baselines 
**What relevant baselines can be used for comparison?**

## 2.4 Validity of Predictions
**How you will assess the validity of your model’s predictions?**

# 3 Modeling

## 3.1 Context 
**How do you formulate your task as an ML problem, e.g. what are the inputs, outputs, and what is being optimized? What models are appropriate for the task?**

## 3.2 Discussion
**Discuss the advantages and disadvantages of different modeling approaches (complexity, efficiency, challenges in implementation, etc.)**

## 3.3 Code
**Walk through your code, explaining architectural choices and any implementation details.**

In [23]:
#Convert MM:SS pace to numeric 
def pace_str_to_float(p):
    if pd.isna(p):
        return np.nan
    if isinstance(p, str) and ":" in p:
        m, s = p.split(":")
        return float(m) + float(s)/60
    return float(p)

runs["Pace_numeric"] = runs["Pace"].apply(pace_str_to_float)
#Sort Data by Date (Important for time trends)
runs["Activity Date"] = pd.to_datetime(runs["Activity Date"])
runs = runs.sort_values(["Person", "Activity Date"]).reset_index(drop=True)

In [24]:
#create rolling trend based features (using past 5 runs to predict next run)
import numpy as np
WINDOW = 5

feature_rows = []

for person, person_df in runs.groupby("Person"):
    person_df = person_df.sort_values("Activity Date").reset_index(drop=True)

    for i in range(WINDOW, len(person_df) - 1):
        hist = person_df.iloc[i-WINDOW:i]
        next_run = person_df.iloc[i+1]

        row = {
            "Person": person,

            # ----- PACE HISTORY -----
            "pace_mean_5": hist["Pace_numeric"].mean(),
            "pace_std_5": hist["Pace_numeric"].std(),
            "pace_last": hist["Pace_numeric"].iloc[-1],
            "pace_slope_5": np.polyfit(range(WINDOW), hist["Pace_numeric"], 1)[0],

            # ----- DISTANCE HISTORY -----
            "dist_mean_5": hist["Distance"].mean(),
            "dist_std_5": hist["Distance"].std(),
            "dist_last": hist["Distance"].iloc[-1],
            "dist_slope_5": np.polyfit(range(WINDOW), hist["Distance"], 1)[0],

            # ----- TERRAIN HISTORY -----
            "elev_gain_mean_5": hist["Elevation Gain"].mean(),
            "grade_mean_5": hist["Average Grade"].mean(),

            # ----- TIME GAP FEATURE -----
            "days_since_last": (
                person_df.iloc[i]["Activity Date"] -
                person_df.iloc[i-1]["Activity Date"]
            ).days,

            # ----- TARGETS (NEXT RUN) -----
            "next_run_pace": next_run["Pace_numeric"],
            "next_run_distance": next_run["Distance"]
        }

        feature_rows.append(row)

trend_df = pd.DataFrame(feature_rows)
trend_df = trend_df.dropna()
trend_df.head()

Unnamed: 0,Person,pace_mean_5,pace_std_5,pace_last,pace_slope_5,dist_mean_5,dist_std_5,dist_last,dist_slope_5,elev_gain_mean_5,grade_mean_5,days_since_last,next_run_pace,next_run_distance
0,Alex,7.3,0.37951,7.016667,-0.158333,2.036,0.32997,2.08,0.154,4.54,-1.94,3,7.35,1.03
1,Alex,7.05,0.644097,6.116667,-0.375,1.902,0.563844,1.01,-0.158,10.52,-2.06,34,7.6,1.38
2,Alex,6.933333,0.474634,7.35,-0.05,1.756,0.690167,1.03,-0.375,10.72,-2.12,8,7.016667,2.0
3,Alex,7.026667,0.561694,7.6,0.143333,1.6,0.663664,1.38,-0.329,9.4,-2.44,5,6.55,2.01
4,Alex,7.02,0.561545,7.016667,0.148333,1.5,0.515218,2.0,0.021,21.14,-1.9,6,7.6,3.0


In [25]:
#feature matrix and targets
X = trend_df.drop(columns=["next_run_pace", "next_run_distance"])
y_pace = trend_df["next_run_pace"]
y_dist = trend_df["next_run_distance"]

#One hot encode person and do the train/test split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

numeric_features = X.drop(columns=["Person"]).columns.tolist()
categorical_features = ["Person"]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)

X_train, X_test, y_pace_train, y_pace_test = train_test_split(
    X, y_pace, test_size=0.2, random_state=42
)

_, _, y_dist_train, y_dist_test = train_test_split(
    X, y_dist, test_size=0.2, random_state=42
)


In [26]:
#Linear regression baselines 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

linreg_pace = Pipeline([
    ("preprocess", preprocessor),
    ("model", LinearRegression())
])

linreg_dist = Pipeline([
    ("preprocess", preprocessor),
    ("model", LinearRegression())
])

linreg_pace.fit(X_train, y_pace_train)
linreg_dist.fit(X_train, y_dist_train)

pace_pred_lr = linreg_pace.predict(X_test)
dist_pred_lr = linreg_dist.predict(X_test)

print("Linear Regression — Pace MAE:", mean_absolute_error(y_pace_test, pace_pred_lr))
print("Linear Regression — Pace RMSE:", mean_squared_error(y_pace_test, pace_pred_lr, squared=False))

print("Linear Regression — Distance MAE:", mean_absolute_error(y_dist_test, dist_pred_lr))
print("Linear Regression — Distance RMSE:", mean_squared_error(y_dist_test, dist_pred_lr, squared=False))


Linear Regression — Pace MAE: 8.42841004549719
Linear Regression — Pace RMSE: 19.979106193877797
Linear Regression — Distance MAE: 1.5823890301924006
Linear Regression — Distance RMSE: 2.324928601240988




In [27]:
#Ridge and GridSearchCV - opimized model 
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

ridge_pipe = Pipeline([
    ("preprocess", preprocessor),
    ("model", Ridge())
])

param_grid = {
    "model__alpha": [0.01, 0.1, 1, 10, 50, 100]
}

#ridge for pace 
ridge_grid_pace = GridSearchCV(
    ridge_pipe, param_grid,
    cv=5,
    scoring="neg_mean_absolute_error",
    n_jobs=-1
)

ridge_grid_pace.fit(X_train, y_pace_train)

best_ridge_pace = ridge_grid_pace.best_estimator_
pace_pred_ridge = best_ridge_pace.predict(X_test)

print("Best Ridge alpha (Pace):", ridge_grid_pace.best_params_)
print("Ridge Pace MAE:", mean_absolute_error(y_pace_test, pace_pred_ridge))
print("Ridge Pace RMSE:", mean_squared_error(y_pace_test, pace_pred_ridge, squared=False))

#ridge for distance 
ridge_grid_dist = GridSearchCV(
    ridge_pipe, param_grid,
    cv=5,
    scoring="neg_mean_absolute_error",
    n_jobs=-1
)

ridge_grid_dist.fit(X_train, y_dist_train)

best_ridge_dist = ridge_grid_dist.best_estimator_
dist_pred_ridge = best_ridge_dist.predict(X_test)

print("Best Ridge alpha (Distance):", ridge_grid_dist.best_params_)
print("Ridge Distance MAE:", mean_absolute_error(y_dist_test, dist_pred_ridge))
print("Ridge Distance RMSE:", mean_squared_error(y_dist_test, dist_pred_ridge, squared=False))



Best Ridge alpha (Pace): {'model__alpha': 100}
Ridge Pace MAE: 5.055894056823785
Ridge Pace RMSE: 6.904936067333741
Best Ridge alpha (Distance): {'model__alpha': 0.01}
Ridge Distance MAE: 1.5841714721188922
Ridge Distance RMSE: 2.3277022153770885




In [28]:
#strong baseline 
last_pace_baseline = X_test["pace_last"]
last_dist_baseline = X_test["dist_last"]

print("Last-Run Pace MAE:", mean_absolute_error(y_pace_test, last_pace_baseline))
print("Last-Run Distance MAE:", mean_absolute_error(y_dist_test, last_dist_baseline))


Last-Run Pace MAE: 1.022573839662447
Last-Run Distance MAE: 1.8837974683544303


In [29]:
results = pd.DataFrame({
    "Model": ["Last Run", "Linear Regression", "Ridge Regression"],
    "Pace MAE": [
        mean_absolute_error(y_pace_test, last_pace_baseline),
        mean_absolute_error(y_pace_test, pace_pred_lr),
        mean_absolute_error(y_pace_test, pace_pred_ridge)
    ],
    "Distance MAE": [
        mean_absolute_error(y_dist_test, last_dist_baseline),
        mean_absolute_error(y_dist_test, dist_pred_lr),
        mean_absolute_error(y_dist_test, dist_pred_ridge)
    ]
})

results


Unnamed: 0,Model,Pace MAE,Distance MAE
0,Last Run,1.022574,1.883797
1,Linear Regression,8.42841,1.582389
2,Ridge Regression,5.055894,1.584171


# 4. Evaluation 

## 4.1 Context 
**How should your task be evaluated? Can you justify why your particular metrics are more appropriate than others?**

## 4.2 Discussion 
**What are some baselines (trivial or otherwise) for your task? How do you demonstrate that your method is better than these methods?**

## 4.3 Code 
**Walk through the implementation of your evaluation protocol, and support your evaluation with tables, plots, statistics, etc.**

# 5. Discussion of Related Work  

## 5.1 
**How has this dataset been used before?**

## 5.2 
**How has prior work approached this same task?**

## 5.3 
**How do your results match or differ from what has been reported in related work?**