# üõ†Ô∏è 02 - Preprocessing and Feature Engineering

In this notebook, we prepare the data for modeling by:
- Cleaning and standardizing columns
- Engineering domain-specific features (e.g. HRV ratios, training deviations, ACWR)
- Creating rolling averages and binary flags
- Preparing the final dataset for machine learning
- Splitting this dataset in training and testing parts


In [53]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GroupShuffleSplit

# Display settings
pd.set_option('display.max_columns', None)
sns.set(style="whitegrid")


## Load Merged Data

We load the merged dataset prepared during EDA.


In [54]:
# Load from disk and make a copy to work with
eda_merged = pd.read_pickle("../simulated_data/processed/merged.pkl")
merged = eda_merged.copy()

# Quick check
print(f"Shape: {merged.shape}")
merged.head()

Shape: (278496, 92)


Unnamed: 0,athlete_id,gender,age,height_cm,weight_kg,genetic_factor,hrv_baseline,hrv_range,max_hr,resting_hr_norm,lthr,hr_zones,vo2max,running_threshold_pace,ftp,css,training_experience,weekly_training_hours,recovery_rate,lifestyle,sleep_time_norm,sleep_quality_norm,nutrition_factor,stress_factor,smoking_factor,drinking_factor,date,resting_hr_daily,hrv,sleep_hours,deep_sleep,light_sleep,rem_sleep,sleep_quality_daily,body_battery_morning,stress,body_battery_evening,planned_tss,actual_tss,injury,bike_duration_minutes,bike_hr_zones,bike_power_zones,bike_tss,bike_intensity_factor,bike_distance_km,bike_avg_speed_kph,bike_avg_hr,bike_max_hr,bike_avg_power,bike_normalized_power,bike_work_kilojoules,bike_elevation_gain,run_duration_minutes,run_hr_zones,run_power_zones,run_tss,run_intensity_factor,run_distance_km,run_avg_speed_kph,run_avg_hr,run_max_hr,run_avg_power,run_normalized_power,run_work_kilojoules,run_elevation_gain,swim_duration_minutes,swim_hr_zones,swim_power_zones,swim_tss,swim_intensity_factor,swim_distance_km,swim_avg_speed_kph,swim_avg_hr,swim_max_hr,swim_avg_power,swim_normalized_power,swim_work_kilojoules,swim_elevation_gain,strength_duration_minutes,strength_hr_zones,strength_power_zones,strength_tss,strength_intensity_factor,strength_distance_km,strength_avg_speed_kph,strength_avg_hr,strength_max_hr,strength_avg_power,strength_normalized_power,strength_work_kilojoules,strength_elevation_gain
0,f8173b91-1cd8-4abc-b235-b3126a2e0463,female,35,164,57.7,0.91,97.5,"(np.float64(82.9), np.float64(112.1))",187.5,44.9,188,"{'Z1': (np.float64(67.39847066423759), 150.4),...",68.9,4.28,210.3,94.1,4,13.6,1.09,Highly Disciplined Athlete,7.733992,0.905808,0.986618,0.141615,0.0,0.060112,2024-01-01,42.883056,100.134386,7.710502,1.189322,4.807406,1.713774,0.841,93,6.1,40.2,69,68.0,0,75.0,"{'Z1': 20.666666666666668, 'Z2': 78.6666666666...","{'Z1': 30.0, 'Z2': 70.0, 'Z3': 0.0, 'Z4': 0.0,...",50.0,0.63,36.5,29.2,154.0,170.0,119.0,120.0,536.0,182.0,36.0,"{'Z1': 99.30555555555556, 'Z2': 0.0, 'Z3': 0.0...",,18.0,0.55,5.94,9.9,126.0,135.0,0.0,0.0,0.0,59.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,f8173b91-1cd8-4abc-b235-b3126a2e0463,female,35,164,57.7,0.91,97.5,"(np.float64(82.9), np.float64(112.1))",187.5,44.9,188,"{'Z1': (np.float64(67.39847066423759), 150.4),...",68.9,4.28,210.3,94.1,4,13.6,1.09,Highly Disciplined Athlete,7.733992,0.905808,0.986618,0.141615,0.0,0.060112,2024-01-02,41.417897,105.083406,7.904715,1.247279,4.882419,1.775016,0.841,97,5.5,40.6,86,86.0,0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.0,"{'Z1': 15.625, 'Z2': 3.515625, 'Z3': 1.953125,...",,86.0,0.9,4.148212,3.888949,174.0,188.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,f8173b91-1cd8-4abc-b235-b3126a2e0463,female,35,164,57.7,0.91,97.5,"(np.float64(82.9), np.float64(112.1))",187.5,44.9,188,"{'Z1': (np.float64(67.39847066423759), 150.4),...",68.9,4.28,210.3,94.1,4,13.6,1.09,Highly Disciplined Athlete,7.733992,0.905808,0.986618,0.141615,0.0,0.060112,2024-01-03,46.199856,91.069844,7.650187,1.134905,4.840656,1.674626,0.802,73,16.5,26.0,91,91.0,0,78.0,"{'Z1': 17.94871794871795, 'Z2': 20.19230769230...","{'Z1': 55.769230769230774, 'Z2': 44.2307692307...",52.0,0.63,37.31,28.7,162.0,178.0,114.0,115.0,534.0,187.0,78.0,"{'Z1': 100.0, 'Z2': 0.0, 'Z3': 0.0, 'Z4': 0.0,...",,39.0,0.55,12.22,9.4,134.0,149.0,0.0,0.0,0.0,122.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,f8173b91-1cd8-4abc-b235-b3126a2e0463,female,35,164,57.7,0.91,97.5,"(np.float64(82.9), np.float64(112.1))",187.5,44.9,188,"{'Z1': (np.float64(67.39847066423759), 150.4),...",68.9,4.28,210.3,94.1,4,13.6,1.09,Highly Disciplined Athlete,7.733992,0.905808,0.986618,0.141615,0.0,0.060112,2024-01-04,48.7403,89.847933,7.556995,1.098222,4.819924,1.638849,0.802,83,18.2,21.3,83,83.0,0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,"{'Z1': 19.25, 'Z2': 46.25, 'Z3': 34.5, 'Z4': 0...",,83.0,0.71,16.55,9.93,159.0,176.0,0.0,0.0,0.0,166.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,f8173b91-1cd8-4abc-b235-b3126a2e0463,female,35,164,57.7,0.91,97.5,"(np.float64(82.9), np.float64(112.1))",187.5,44.9,188,"{'Z1': (np.float64(67.39847066423759), 150.4),...",68.9,4.28,210.3,94.1,4,13.6,1.09,Highly Disciplined Athlete,7.733992,0.905808,0.986618,0.141615,0.0,0.060112,2024-01-05,48.944653,87.347495,7.818202,1.130043,4.996613,1.691546,0.802,90,18.4,41.2,63,18.0,0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0,"{'Z1': 100.0, 'Z2': 0.0, 'Z3': 0.0, 'Z4': 0.0,...",,18.0,0.55,5.66,9.44,132.0,144.0,0.0,0.0,0.0,57.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Basic Preprocessing

Since this dataset was synthetically created this step does not involve much, missing data is minimal and only exists for sport specific variables (e.g. cycling power on running workouts is naturally missing). Also no duplicates exist. We will do the following:

- Convert dates
- Handle missing values
- drop hrv_range as this is simply 85% - 115% of baseline
- drop athlete hr_zones as they are simply fixed percentages of lthr
- flatten the zone dictionaries into individual columns


In [55]:
# Convert to datetime
merged['date'] = pd.to_datetime(merged['date'])

# Fill or impute missing values
merged.fillna(0, inplace=True)

merged.drop(['hrv_range', 'hr_zones'], axis=1, inplace=True)

  merged.fillna(0, inplace=True)


In [56]:
# List of dictionary columns to flatten
dict_columns = [
    'bike_hr_zones', 'run_hr_zones', 
    'bike_power_zones', 'run_power_zones',
    'swim_hr_zones', 'swim_power_zones',
    'strength_hr_zones', 'strength_power_zones'
]

# Function to safely convert to dictionary
def safe_convert_to_dict(x):
    if pd.isna(x):
        return {}
    if isinstance(x, dict):
        return x
    if isinstance(x, str):
        try:
            return ast.literal_eval(x)
        except:
            return {}
    return {}

# Function to flatten the dictionary
def flatten_dict_column(df, col_name):
    # Check if column exists in the dataframe
    if col_name not in df.columns:
        return df
    
    # Safely convert to dictionary
    df[col_name] = df[col_name].apply(safe_convert_to_dict)
    
    # Create new column names
    prefix = col_name + '_'
    
    # Get all possible keys across all dictionaries
    all_keys = set()
    for d in df[col_name].dropna():
        if isinstance(d, dict):
            all_keys.update(d.keys())
    
    # Create a new column for each key
    for key in sorted(all_keys):  # Sort keys to ensure consistent order
        new_col = prefix + key
        df[new_col] = df[col_name].apply(lambda x: x.get(key, 0) if isinstance(x, dict) else 0)
    
    return df

# Apply the function to each dictionary column
for col in dict_columns:
    if col in merged.columns:  # Only process columns that exist
        merged = flatten_dict_column(merged, col)

# Optionally, drop the original dictionary columns
merged = merged.drop(columns=[col for col in dict_columns if col in merged.columns])

merged.head()

Unnamed: 0,athlete_id,gender,age,height_cm,weight_kg,genetic_factor,hrv_baseline,max_hr,resting_hr_norm,lthr,vo2max,running_threshold_pace,ftp,css,training_experience,weekly_training_hours,recovery_rate,lifestyle,sleep_time_norm,sleep_quality_norm,nutrition_factor,stress_factor,smoking_factor,drinking_factor,date,resting_hr_daily,hrv,sleep_hours,deep_sleep,light_sleep,rem_sleep,sleep_quality_daily,body_battery_morning,stress,body_battery_evening,planned_tss,actual_tss,injury,bike_duration_minutes,bike_tss,bike_intensity_factor,bike_distance_km,bike_avg_speed_kph,bike_avg_hr,bike_max_hr,bike_avg_power,bike_normalized_power,bike_work_kilojoules,bike_elevation_gain,run_duration_minutes,run_tss,run_intensity_factor,run_distance_km,run_avg_speed_kph,run_avg_hr,run_max_hr,run_avg_power,run_normalized_power,run_work_kilojoules,run_elevation_gain,swim_duration_minutes,swim_tss,swim_intensity_factor,swim_distance_km,swim_avg_speed_kph,swim_avg_hr,swim_max_hr,swim_avg_power,swim_normalized_power,swim_work_kilojoules,swim_elevation_gain,strength_duration_minutes,strength_tss,strength_intensity_factor,strength_distance_km,strength_avg_speed_kph,strength_avg_hr,strength_max_hr,strength_avg_power,strength_normalized_power,strength_work_kilojoules,strength_elevation_gain,bike_hr_zones_Z1,bike_hr_zones_Z2,bike_hr_zones_Z3,bike_hr_zones_Z4,bike_hr_zones_Z5,bike_hr_zones_Z6,run_hr_zones_Z1,run_hr_zones_Z2,run_hr_zones_Z3,run_hr_zones_Z4,run_hr_zones_Z5,run_hr_zones_Z6,bike_power_zones_Z1,bike_power_zones_Z2,bike_power_zones_Z3,bike_power_zones_Z4,bike_power_zones_Z5,bike_power_zones_Z6,bike_power_zones_Z7,swim_hr_zones_Z1,swim_hr_zones_Z2,swim_hr_zones_Z3,swim_hr_zones_Z4,swim_hr_zones_Z5,swim_hr_zones_Z6,strength_hr_zones_Z1,strength_hr_zones_Z2,strength_hr_zones_Z3,strength_hr_zones_Z4,strength_hr_zones_Z5,strength_hr_zones_Z6
0,f8173b91-1cd8-4abc-b235-b3126a2e0463,female,35,164,57.7,0.91,97.5,187.5,44.9,188,68.9,4.28,210.3,94.1,4,13.6,1.09,Highly Disciplined Athlete,7.733992,0.905808,0.986618,0.141615,0.0,0.060112,2024-01-01,42.883056,100.134386,7.710502,1.189322,4.807406,1.713774,0.841,93,6.1,40.2,69,68.0,0,75.0,50.0,0.63,36.5,29.2,154.0,170.0,119.0,120.0,536.0,182.0,36.0,18.0,0.55,5.94,9.9,126.0,135.0,0.0,0.0,0.0,59.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.666667,78.666667,0.333333,0.0,0.0,0.0,99.305556,0.0,0.0,0.0,0.0,0.0,30.0,70.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,f8173b91-1cd8-4abc-b235-b3126a2e0463,female,35,164,57.7,0.91,97.5,187.5,44.9,188,68.9,4.28,210.3,94.1,4,13.6,1.09,Highly Disciplined Athlete,7.733992,0.905808,0.986618,0.141615,0.0,0.060112,2024-01-02,41.417897,105.083406,7.904715,1.247279,4.882419,1.775016,0.841,97,5.5,40.6,86,86.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.0,86.0,0.9,4.148212,3.888949,174.0,188.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.625,3.515625,1.953125,78.90625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,f8173b91-1cd8-4abc-b235-b3126a2e0463,female,35,164,57.7,0.91,97.5,187.5,44.9,188,68.9,4.28,210.3,94.1,4,13.6,1.09,Highly Disciplined Athlete,7.733992,0.905808,0.986618,0.141615,0.0,0.060112,2024-01-03,46.199856,91.069844,7.650187,1.134905,4.840656,1.674626,0.802,73,16.5,26.0,91,91.0,0,78.0,52.0,0.63,37.31,28.7,162.0,178.0,114.0,115.0,534.0,187.0,78.0,39.0,0.55,12.22,9.4,134.0,149.0,0.0,0.0,0.0,122.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.948718,20.192308,61.858974,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,55.769231,44.230769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,f8173b91-1cd8-4abc-b235-b3126a2e0463,female,35,164,57.7,0.91,97.5,187.5,44.9,188,68.9,4.28,210.3,94.1,4,13.6,1.09,Highly Disciplined Athlete,7.733992,0.905808,0.986618,0.141615,0.0,0.060112,2024-01-04,48.7403,89.847933,7.556995,1.098222,4.819924,1.638849,0.802,83,18.2,21.3,83,83.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,83.0,0.71,16.55,9.93,159.0,176.0,0.0,0.0,0.0,166.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.25,46.25,34.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,f8173b91-1cd8-4abc-b235-b3126a2e0463,female,35,164,57.7,0.91,97.5,187.5,44.9,188,68.9,4.28,210.3,94.1,4,13.6,1.09,Highly Disciplined Athlete,7.733992,0.905808,0.986618,0.141615,0.0,0.060112,2024-01-05,48.944653,87.347495,7.818202,1.130043,4.996613,1.691546,0.802,90,18.4,41.2,63,18.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0,18.0,0.55,5.66,9.44,132.0,144.0,0.0,0.0,0.0,57.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Notice that some features are sport specific, (such as power, normalized power and work in kilojoules) and are therefore always zero for all other sports. We will drop these as they don't add any value and add unneccesary complexity.

In [57]:
null_columns = ['run_avg_power', 'run_normalized_power', 'run_work_kilojoules',
                'swim_avg_power', 'swim_normalized_power', 'swim_work_kilojoules', 'swim_elevation_gain',
                'strength_avg_power', 'strength_normalized_power', 'strength_work_kilojoules', 'strength_elevation_gain', 'strength_distance_km', 'strength_avg_speed_kph']

merged.drop(null_columns, axis=1, inplace=True)

merged.head()

Unnamed: 0,athlete_id,gender,age,height_cm,weight_kg,genetic_factor,hrv_baseline,max_hr,resting_hr_norm,lthr,vo2max,running_threshold_pace,ftp,css,training_experience,weekly_training_hours,recovery_rate,lifestyle,sleep_time_norm,sleep_quality_norm,nutrition_factor,stress_factor,smoking_factor,drinking_factor,date,resting_hr_daily,hrv,sleep_hours,deep_sleep,light_sleep,rem_sleep,sleep_quality_daily,body_battery_morning,stress,body_battery_evening,planned_tss,actual_tss,injury,bike_duration_minutes,bike_tss,bike_intensity_factor,bike_distance_km,bike_avg_speed_kph,bike_avg_hr,bike_max_hr,bike_avg_power,bike_normalized_power,bike_work_kilojoules,bike_elevation_gain,run_duration_minutes,run_tss,run_intensity_factor,run_distance_km,run_avg_speed_kph,run_avg_hr,run_max_hr,run_elevation_gain,swim_duration_minutes,swim_tss,swim_intensity_factor,swim_distance_km,swim_avg_speed_kph,swim_avg_hr,swim_max_hr,strength_duration_minutes,strength_tss,strength_intensity_factor,strength_avg_hr,strength_max_hr,bike_hr_zones_Z1,bike_hr_zones_Z2,bike_hr_zones_Z3,bike_hr_zones_Z4,bike_hr_zones_Z5,bike_hr_zones_Z6,run_hr_zones_Z1,run_hr_zones_Z2,run_hr_zones_Z3,run_hr_zones_Z4,run_hr_zones_Z5,run_hr_zones_Z6,bike_power_zones_Z1,bike_power_zones_Z2,bike_power_zones_Z3,bike_power_zones_Z4,bike_power_zones_Z5,bike_power_zones_Z6,bike_power_zones_Z7,swim_hr_zones_Z1,swim_hr_zones_Z2,swim_hr_zones_Z3,swim_hr_zones_Z4,swim_hr_zones_Z5,swim_hr_zones_Z6,strength_hr_zones_Z1,strength_hr_zones_Z2,strength_hr_zones_Z3,strength_hr_zones_Z4,strength_hr_zones_Z5,strength_hr_zones_Z6
0,f8173b91-1cd8-4abc-b235-b3126a2e0463,female,35,164,57.7,0.91,97.5,187.5,44.9,188,68.9,4.28,210.3,94.1,4,13.6,1.09,Highly Disciplined Athlete,7.733992,0.905808,0.986618,0.141615,0.0,0.060112,2024-01-01,42.883056,100.134386,7.710502,1.189322,4.807406,1.713774,0.841,93,6.1,40.2,69,68.0,0,75.0,50.0,0.63,36.5,29.2,154.0,170.0,119.0,120.0,536.0,182.0,36.0,18.0,0.55,5.94,9.9,126.0,135.0,59.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.666667,78.666667,0.333333,0.0,0.0,0.0,99.305556,0.0,0.0,0.0,0.0,0.0,30.0,70.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,f8173b91-1cd8-4abc-b235-b3126a2e0463,female,35,164,57.7,0.91,97.5,187.5,44.9,188,68.9,4.28,210.3,94.1,4,13.6,1.09,Highly Disciplined Athlete,7.733992,0.905808,0.986618,0.141615,0.0,0.060112,2024-01-02,41.417897,105.083406,7.904715,1.247279,4.882419,1.775016,0.841,97,5.5,40.6,86,86.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.0,86.0,0.9,4.148212,3.888949,174.0,188.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.625,3.515625,1.953125,78.90625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,f8173b91-1cd8-4abc-b235-b3126a2e0463,female,35,164,57.7,0.91,97.5,187.5,44.9,188,68.9,4.28,210.3,94.1,4,13.6,1.09,Highly Disciplined Athlete,7.733992,0.905808,0.986618,0.141615,0.0,0.060112,2024-01-03,46.199856,91.069844,7.650187,1.134905,4.840656,1.674626,0.802,73,16.5,26.0,91,91.0,0,78.0,52.0,0.63,37.31,28.7,162.0,178.0,114.0,115.0,534.0,187.0,78.0,39.0,0.55,12.22,9.4,134.0,149.0,122.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.948718,20.192308,61.858974,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,55.769231,44.230769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,f8173b91-1cd8-4abc-b235-b3126a2e0463,female,35,164,57.7,0.91,97.5,187.5,44.9,188,68.9,4.28,210.3,94.1,4,13.6,1.09,Highly Disciplined Athlete,7.733992,0.905808,0.986618,0.141615,0.0,0.060112,2024-01-04,48.7403,89.847933,7.556995,1.098222,4.819924,1.638849,0.802,83,18.2,21.3,83,83.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,83.0,0.71,16.55,9.93,159.0,176.0,166.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.25,46.25,34.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,f8173b91-1cd8-4abc-b235-b3126a2e0463,female,35,164,57.7,0.91,97.5,187.5,44.9,188,68.9,4.28,210.3,94.1,4,13.6,1.09,Highly Disciplined Athlete,7.733992,0.905808,0.986618,0.141615,0.0,0.060112,2024-01-05,48.944653,87.347495,7.818202,1.130043,4.996613,1.691546,0.802,90,18.4,41.2,63,18.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0,18.0,0.55,5.66,9.44,132.0,144.0,57.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Lastly, before feature engineering we want to split the data into features and target (injury label)

In [46]:
X = merged.drop(columns=['injury'], errors='ignore')
y = merged['injury'] if 'injury' in merged.columns else None

## üß† Feature Engineering

In this section, we engineer meaningful features from the raw data that are relevant to injury prediction. These include:

- **Physiological Indicators**: HRV ratios, sleep quality, and associated risk flags.
- **Training Load Metrics**: TSS deviation, 7-day/28-day load trends, and adherence.
- **Load Management Indicators**: ACWR (Acute:Chronic Workload Ratio), training monotony, strain, and ramp rate.
- **Recovery Trends**: 7-day HRV slope and cumulative sleep debt.

These features help model short-term stress, long-term trends, and recovery states that may impact injury risk.

In [58]:
# --- HRV-Based Features ---
# HRV Ratio: Current HRV relative to personal baseline
X['hrv_ratio'] = X['hrv'] / X['hrv_baseline']

# HRV Zone: Categorize HRV into 'Below Normal', 'Normal', 'Above Normal'
X['hrv_zone'] = pd.cut(
    X['hrv_ratio'],
    bins=[0, 0.851, 1.15, np.inf],
    labels=['Below Normal', 'Normal', 'Above Normal']
)

# HRV Risk Flag: Binary indicator for low HRV
X['low_hrv_risk'] = (X['hrv_ratio'] < 0.85).astype(int)

# --- Sleep Features ---
# Sleep Quality Flag: Mark poor sleep quality (threshold < 0.6)
X['poor_sleep'] = (X['sleep_quality_daily'] < 0.6).astype(int)

# --- Training Load Features ---
# TSS Deviation: Difference and percent difference from planned TSS
X['tss_deviation'] = X['actual_tss'] - X['planned_tss']
X['tss_deviation_pct'] = (X['tss_deviation'] / X['planned_tss']) * 100

# Training Adherence: Undertraining, On Target, or Overtraining
X['training_adherence'] = pd.cut(
    X['tss_deviation_pct'],
    bins=[-np.inf, -15, 15, np.inf],
    labels=['Undertraining', 'On Target', 'Overtraining']
)

# 7-day rolling average of actual TSS
X['tss_7d_avg'] = (
    X.groupby('athlete_id')['actual_tss']
    .transform(lambda x: x.rolling(window=7, min_periods=1).mean())
)

# Acute Load: 7-day rolling sum of TSS
X['acute_load'] = (
    X.groupby('athlete_id')['actual_tss']
    .transform(lambda x: x.rolling(window=7, min_periods=1).sum())
)

# Chronic Load: 28-day rolling average √ó 7 for scale alignment
X['chronic_load'] = (
    X.groupby('athlete_id')['actual_tss']
    .transform(lambda x: x.rolling(window=28, min_periods=1).mean() * 7)
)

# Acute:Chronic Workload Ratio (ACWR)
X['acwr'] = X['acute_load'] / X['chronic_load']
X['acwr'].replace([np.inf, -np.inf], np.nan, inplace=True)
X['acwr'].fillna(0, inplace=True)

# ACWR Risk Zones: Categorize risk levels based on thresholds
X['acwr_risk'] = pd.cut(
    X['acwr'],
    bins=[0, 0.8, 1.3, 1.5, np.inf],
    labels=['Too Low', 'Optimal', 'Danger Zone', 'High Risk']
)

# --- Training Monotony & Strain ---
# 7-day rolling standard deviation
X['tss_7d_std'] = (
    X.groupby('athlete_id')['actual_tss']
    .transform(lambda x: x.rolling(window=7, min_periods=2).std())
)

# Training Monotony = avg / std dev (variability in training)
X['training_monotony'] = X['tss_7d_avg'] / X['tss_7d_std']
X['training_monotony'].replace([np.inf, -np.inf], np.nan, inplace=True)
X['training_monotony'].fillna(0, inplace=True)

# Training Strain = avg √ó days √ó monotony
X['training_strain'] = X['tss_7d_avg'] * 7 * X['training_monotony']

# --- Week-to-Week Load Change (Ramp Rate) ---
# Load from previous week
X['previous_week_load'] = (
    X.groupby('athlete_id')['acute_load']
    .transform(lambda x: x.shift(7))
)

X['previous_week_load'].fillna(0)

# Week-to-week percent change
X['week_to_week_change'] = (
    (X['acute_load'] - X['previous_week_load']) / X['previous_week_load'] * 100
)
X['week_to_week_change'].replace([np.inf, -np.inf], np.nan, inplace=True)
X['week_to_week_change'].fillna(0, inplace=True)

# Categorize ramp rate
X['ramp_rate_risk'] = pd.cut(
    X['week_to_week_change'],
    bins=[-np.inf, -5, 10, 20, np.inf],
    labels=['Decreasing', 'Optimal', 'Warning', 'High Risk']
)

# --- Recovery Metrics ---
# HRV 7-Day Trend (slope of HRV ratio over last 7 days)
X['hrv_7d_slope'] = (
    X.groupby('athlete_id')['hrv_ratio']
    .transform(lambda x: x.rolling(window=7, min_periods=3)
              .apply(lambda y: np.polyfit(range(len(y)), y, 1)[0] if len(y) > 2 else np.nan))
)

# Sleep Debt over 7 days (based on 8 hours recommended)
if 'sleep_hours' in X.columns:
    X['sleep_debt_7d'] = (
        X.groupby('athlete_id')['sleep_hours']
        .transform(lambda x: (8 - x).rolling(window=7, min_periods=1).sum())
    )

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X['acwr'].replace([np.inf, -np.inf], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X['acwr'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values 

## Feature Summary

Let's examine our newly created features.

In [59]:
# Show key engineered features
feature_cols = [
    'hrv_ratio', 'hrv_zone', 'tss_deviation_pct', 'training_adherence', 'poor_sleep', 
    'low_hrv_risk', 'acute_load', 'chronic_load', 'tss_7d_avg', 'acwr', 'acwr_risk',
    'tss_7d_std', 'training_monotony', 'training_strain', 'previous_week_load', 
    'week_to_week_change', 'ramp_rate_risk', 'hrv_7d_slope', 'sleep_debt_7d'
]

X[feature_cols].describe(include='all').T

  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
hrv_ratio,278496.0,,,,0.913332,0.054462,0.6,0.8706,0.908631,0.948123,1.15
hrv_zone,278496.0,3.0,Normal,235943.0,,,,,,,
tss_deviation_pct,278496.0,,,,inf,,-98.028169,0.0,0.0,0.0,inf
training_adherence,278496.0,3.0,On Target,190332.0,,,,,,,
poor_sleep,278496.0,,,,0.156566,0.363392,0.0,0.0,0.0,0.0,1.0
low_hrv_risk,278496.0,,,,0.037983,0.191155,0.0,0.0,0.0,0.0,1.0
acute_load,278496.0,,,,635.926323,176.577323,3.0,516.4,620.4,742.3,1823.4
chronic_load,278496.0,,,,638.634393,138.120943,21.0,538.375,627.65,729.320673,1495.9
tss_7d_avg,278496.0,,,,91.702052,24.187976,3.0,74.428571,89.025,106.328571,260.485714
acwr,278496.0,,,,0.994743,0.169765,0.142857,0.898111,1.002713,1.103027,1.74916


It is also noticeable that there are 311 days with an actual TSS greater or equal 400 which is extremely high. The planned_TSS on those days is also accordingly high. 29 (9.3%) out of these days are labelled with injury. These days appear to be special training events as they are planned and their correlation with injuries indicates they could be important signals rather than errors.

We have to carefully handle these outliers:
- preserve the original data while adding transformed versions (tree-based models like XGBoost and random forest can work with the original data while LASSO is sensitive to outliers.)
- create categorical markers for extreme days
- track post-extreme recovery periods
- provide both log-transformed and winsorized options for modelling 

In [60]:
# 1. Create categorical features for extreme TSS events 
X['extreme_tss'] = pd.cut(
    X['actual_tss'],
    bins=[0, 300, 400, 500, np.inf],
    labels=['Normal', 'High', 'Very High', 'Extreme']
)

# 2-3. Log-transform and percentile rank 
for col in ['actual_tss', 'planned_tss', 'acute_load', 'chronic_load']:
    if col in X.columns:
        X[f'{col}_log'] = np.log1p(X[col])

X['tss_percentile_rank'] = X.groupby('athlete_id')['actual_tss'].transform(
    lambda x: x.rank(pct=True)
)

# 4. Create post-extreme monitoring features 
# First create a binary indicator for extreme events
X['is_extreme_event'] = X['extreme_tss'].isin(['Very High', 'Extreme']).astype(int)

# For each athlete, create a date-based tracking system
X = X.sort_values(['athlete_id', 'date'])

# Initialize counter for days since extreme TSS (999 impossible value)
X['days_since_extreme_tss'] = 999

# Process each athlete group to calculate days since extreme event
for athlete, group in X.groupby('athlete_id'):
    # Find indices of extreme events
    extreme_indices = group[group['is_extreme_event'] == 1].index
    
    if len(extreme_indices) > 0:
        # For each day, find the most recent extreme event
        for idx in group.index:
            # Find all extreme events that happened before this day
            previous_extremes = extreme_indices[extreme_indices <= idx]
            if len(previous_extremes) > 0:
                # Get the most recent extreme event index
                last_extreme = previous_extremes[-1]
                # Calculate days difference (this assumes daily entries)
                days_diff = group.loc[idx:idx, 'date'].reset_index(drop=True)[0] - group.loc[last_extreme:last_extreme, 'date'].reset_index(drop=True)[0]
                X.loc[idx, 'days_since_extreme_tss'] = days_diff.days
    
# Mark post-extreme period (7 days after an extreme event)
X['post_extreme_period'] = ((X['days_since_extreme_tss'] <= 7) & 
                                (X['days_since_extreme_tss'].notna())).astype(int)

# 5. Winsorize TSS values 
for col in ['actual_tss', 'planned_tss']:
    X[f'{col}_winsor'] = X[col].clip(upper=X[col].quantile(0.99))

## Encoding Categorical Features

Many machine learning models require all input features to be numeric. Therefore, categorical features must be transformed before model training. Below, we apply appropriate encoding techniques based on feature types:

- **Ordinal Encoding** for features with a meaningful order
- **One-Hot Encoding** for nominal features with no inherent order
- **Label Encoding** for tree-based models or features with minimal categories

In [61]:
# Identify categorical columns and their types
categorical_columns = {
    # Binary categorical
    'binary': ['gender'],
    
    # Nominal categorical (no inherent order)
    'nominal': ['lifestyle'],
    
    # Ordinal categorical (clear order)
    'ordinal': {
        'hrv_zone': ['below normal', 'normal', 'above normal'],
        'training_adherence': ['undertraining', 'on target', 'overtraining'],
        'acwr_risk': ['too low', 'optimal', 'danger zone', 'high risk'],
        'ramp_rate_risk': ['decreasing', 'optimal', 'warning', 'high risk'],
        'extreme_tss': ['normal', 'high', 'very high', 'extreme']
    }
}

def encode_categorical_features(X):
    """
    Handle categorical features appropriately for tree-based models.
    
    Args:
        X: Pandas DataFrame containing the features
        
    Returns:
        X_encoded: DataFrame with encoded categorical features
        feature_names: List of feature names after encoding
    """

    # Make sure categorical values are strings first, then lowercase
    for cat_type in categorical_columns:
        if cat_type == 'ordinal':
            for col in categorical_columns[cat_type]:
                if col in X.columns:
                    X[col] = X[col].astype(str).str.lower()
        else:
            for col in categorical_columns[cat_type]:
                if col in X.columns:
                    X[col] = X[col].astype(str).str.lower()

    # Get all categorical column names
    all_cat_cols = (
        categorical_columns['binary'] + 
        categorical_columns['nominal'] + 
        list(categorical_columns['ordinal'].keys())
    )
    
    # Filter to only include columns that exist in the dataset
    all_cat_cols = [col for col in all_cat_cols if col in X.columns]
    
    # Split between ordinal and non-ordinal columns
    ordinal_cols = [col for col in categorical_columns['ordinal'] if col in X.columns]
    nonordinal_cols = [col for col in all_cat_cols if col not in ordinal_cols]
    
    # Create transformer list
    transformers = []
    
    # Add one-hot encoder for nominal features
    if nonordinal_cols:
        transformers.append(
            ('onehot', OneHotEncoder(sparse_output=False, drop='first'), nonordinal_cols)
        )
    
    # Add ordinal encoder for ordinal features
    if ordinal_cols:
        # Create ordinal encoder with proper category ordering
        for col in ordinal_cols:
            categories = [categorical_columns['ordinal'][col]]
            encoder = OrdinalEncoder(categories=categories)
            transformers.append((f'ordinal_{col}', encoder, [col]))
    
    # Create and apply column transformer
    preprocessor = ColumnTransformer(
        transformers=transformers,
        remainder='passthrough'  # Keep other columns as is
    )
    
    # Fit and transform the data
    X_transformed = preprocessor.fit_transform(X)
    
    # Get feature names after transformation
    feature_names = []
    
    # Get feature names from one-hot encoding
    if nonordinal_cols and transformers:
        onehot_idx = [i for i, (name, _, _) in enumerate(transformers) if name == 'onehot'][0]
        onehot_features = preprocessor.transformers_[onehot_idx][1].get_feature_names_out(nonordinal_cols)
        feature_names.extend(onehot_features)
    
    # Add ordinal feature names
    feature_names.extend(ordinal_cols)
    
    # Add remaining feature names that weren't transformed
    non_cat_cols = [col for col in X.columns if col not in all_cat_cols]
    feature_names.extend(non_cat_cols)
    
    # Create a DataFrame with transformed features and proper column names
    X_encoded = pd.DataFrame(X_transformed, columns=feature_names, index=X.index)
    
    return X_encoded, feature_names

X_encoded, feature_names = encode_categorical_features(X)

X_encoded.head()

Unnamed: 0,gender_male,lifestyle_health-conscious athlete,lifestyle_highly disciplined athlete,lifestyle_sleep-deprived workaholic,lifestyle_under-recovered athlete,lifestyle_weekend socializer,hrv_zone,training_adherence,acwr_risk,ramp_rate_risk,extreme_tss,athlete_id,age,height_cm,weight_kg,genetic_factor,hrv_baseline,max_hr,resting_hr_norm,lthr,vo2max,running_threshold_pace,ftp,css,training_experience,weekly_training_hours,recovery_rate,sleep_time_norm,sleep_quality_norm,nutrition_factor,stress_factor,smoking_factor,drinking_factor,date,resting_hr_daily,hrv,sleep_hours,deep_sleep,light_sleep,rem_sleep,sleep_quality_daily,body_battery_morning,stress,body_battery_evening,planned_tss,actual_tss,bike_duration_minutes,bike_tss,bike_intensity_factor,bike_distance_km,bike_avg_speed_kph,bike_avg_hr,bike_max_hr,bike_avg_power,bike_normalized_power,bike_work_kilojoules,bike_elevation_gain,run_duration_minutes,run_tss,run_intensity_factor,run_distance_km,run_avg_speed_kph,run_avg_hr,run_max_hr,run_elevation_gain,swim_duration_minutes,swim_tss,swim_intensity_factor,swim_distance_km,swim_avg_speed_kph,swim_avg_hr,swim_max_hr,strength_duration_minutes,strength_tss,strength_intensity_factor,strength_avg_hr,strength_max_hr,bike_hr_zones_Z1,bike_hr_zones_Z2,bike_hr_zones_Z3,bike_hr_zones_Z4,bike_hr_zones_Z5,bike_hr_zones_Z6,run_hr_zones_Z1,run_hr_zones_Z2,run_hr_zones_Z3,run_hr_zones_Z4,run_hr_zones_Z5,run_hr_zones_Z6,bike_power_zones_Z1,bike_power_zones_Z2,bike_power_zones_Z3,bike_power_zones_Z4,bike_power_zones_Z5,bike_power_zones_Z6,bike_power_zones_Z7,swim_hr_zones_Z1,swim_hr_zones_Z2,swim_hr_zones_Z3,swim_hr_zones_Z4,swim_hr_zones_Z5,swim_hr_zones_Z6,strength_hr_zones_Z1,strength_hr_zones_Z2,strength_hr_zones_Z3,strength_hr_zones_Z4,strength_hr_zones_Z5,strength_hr_zones_Z6,hrv_ratio,low_hrv_risk,poor_sleep,tss_deviation,tss_deviation_pct,tss_7d_avg,acute_load,chronic_load,acwr,tss_7d_std,training_monotony,training_strain,previous_week_load,week_to_week_change,hrv_7d_slope,sleep_debt_7d,actual_tss_log,planned_tss_log,acute_load_log,chronic_load_log,tss_percentile_rank,is_extreme_event,days_since_extreme_tss,post_extreme_period,actual_tss_winsor,planned_tss_winsor
241974,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,00176a66-e7e2-4e21-bfe4-68b79cfe8cd6,32,173,74.2,0.9,102.0,184.3,51.3,177,70.5,4.01,340.0,90.7,3,8.0,1.09,7.79921,0.96784,0.919775,0.276424,0.0,0.08713,2024-01-01,48.159313,112.656963,8.348198,1.38263,5.05534,1.910228,0.921,100,6.8,52.7,51,51.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61.0,51.0,0.71,11.08,10.9,144.0,157.0,111.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.262295,78.688525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.10448,0,0,0.0,0.0,51.0,51.0,357.0,0.142857,,0.0,0.0,,0.0,,-0.348198,3.951244,3.951244,3.951244,5.880533,0.38724,0,999,0,51.0,51
241975,1.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,1.0,0.0,00176a66-e7e2-4e21-bfe4-68b79cfe8cd6,32,173,74.2,0.9,102.0,184.3,51.3,177,70.5,4.01,340.0,90.7,3,8.0,1.09,7.79921,0.96784,0.919775,0.276424,0.0,0.08713,2024-01-02,43.605,117.3,7.929447,1.364821,4.715984,1.848642,0.868,100,11.5,34.9,70,121.2,52.0,51.2,0.77,32.59,37.6,165.0,184.0,245.0,252.0,764.0,163.0,84.0,70.0,0.71,15.41,11.01,143.0,159.0,154.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.461538,5.288462,2.403846,69.711538,7.211538,0.0,19.642857,77.97619,0.0,0.0,0.0,0.0,13.461538,14.423077,72.115385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.15,0,0,51.2,73.142857,86.1,172.2,602.7,0.285714,49.638896,1.734527,1045.399357,,0.0,,-0.277646,4.805659,4.26268,5.154447,6.403077,0.952522,0,999,0,121.2,70
241976,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,00176a66-e7e2-4e21-bfe4-68b79cfe8cd6,32,173,74.2,0.9,102.0,184.3,51.3,177,70.5,4.01,340.0,90.7,3,8.0,1.09,7.79921,0.96784,0.919775,0.276424,0.0,0.08713,2024-01-03,45.038714,116.157078,8.570574,1.256018,5.448742,1.865814,0.882,82,11.9,27.4,70,54.0,81.0,54.0,0.63,46.04,34.1,154.0,170.0,187.0,188.0,909.0,230.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.506173,19.444444,64.197531,0.308642,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.259259,65.740741,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.138795,0,0,-16.0,-22.857143,75.4,226.2,527.8,0.428571,39.692317,1.899612,1002.6152,,0.0,0.017157,-0.848219,4.007333,4.26268,5.425831,6.27061,0.440653,0,999,0,54.0,70
241977,1.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,1.0,0.0,00176a66-e7e2-4e21-bfe4-68b79cfe8cd6,32,173,74.2,0.9,102.0,184.3,51.3,177,70.5,4.01,340.0,90.7,3,8.0,1.09,7.79921,0.96784,0.919775,0.276424,0.0,0.08713,2024-01-04,47.355826,112.353308,7.566125,1.144558,4.753036,1.66853,0.841,85,14.1,29.3,0,53.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,41.0,53.0,0.88,9.47,13.86,172.0,184.0,95.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.585366,5.487805,1.829268,3.04878,76.219512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.101503,0,0,53.0,inf,69.8,279.2,488.6,0.571429,34.289357,2.035617,994.602496,,0.0,-0.002014,-0.414344,3.988984,0.0,5.635504,6.193589,0.422849,0,999,0,53.0,0
241978,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,00176a66-e7e2-4e21-bfe4-68b79cfe8cd6,32,173,74.2,0.9,102.0,184.3,51.3,177,70.5,4.01,340.0,90.7,3,8.0,1.09,7.79921,0.96784,0.919775,0.276424,0.0,0.08713,2024-01-06,50.274206,110.94499,7.807086,1.358544,4.619822,1.82872,0.868,100,12.1,42.2,90,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,108.0,90.0,0.71,19.6,10.89,146.0,161.0,196.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.212963,76.851852,2.314815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.087696,0,0,0.0,0.0,73.84,369.2,516.88,0.714286,31.039137,2.378932,1229.622451,,0.0,-0.008207,-0.22143,4.51086,4.51086,5.914043,6.249744,0.85905,0,999,0,90.0,90


## üíæ Save Processed Data

We export the processed features dataFrame and target variable.

In [62]:
# Save for the next notebook
X_encoded.to_csv('../simulated_data/processed/processed_features.csv', index=False)
y.to_csv('../simulated_data/processed/target_variable.csv', index=False)

## Split data into training and testing sets

Here there are two options:
1. **Split by time period** 
    - use earlier months for training
    - use later months for testing (e.g., last 60 days of the year)
    - keep the same athletes in both training and testing sets but at different time periods

2. **Split by athlete**
    - some athlete's entire data goes to training (e.g., 800 athletes)
    - some athlete's entire data goes to testing (e.g., 200 athletes)

The question guiding the decision between these option is:
- Do we want our model to generize well to completely new athletes or
- Do we want our model to predict future injuries for already known athletes?

In [63]:
# Option 1: time-based split
# Create a time-based cutoff 
cutoff_date = pd.to_datetime('2024-11-01')  

# Create train/test masks
train_mask = merged['date'] < cutoff_date
test_mask = merged['date'] >= cutoff_date

# Split the data
X_train_time_based = X_encoded[train_mask]
X_test_time_based = X_encoded[test_mask] 
y_train_time_based = y[train_mask]
y_test_time_based = y[test_mask]


# Option 2: athlete-based split
# Set random state for reproducibility
random_state = 42

# Create a GroupShuffleSplit to keep all data from one athlete together
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=random_state)

# Get the train/test indices based on athlete_id groups
train_idx, test_idx = next(splitter.split(merged, groups=merged['athlete_id']))

# Create masks for indexing
train_mask = np.zeros(len(merged), dtype=bool)
train_mask[train_idx] = True
test_mask = ~train_mask

# Split processed features and target
X_train_athlete_based = X_encoded.iloc[train_idx]
X_test_athlete_based = X_encoded.iloc[test_idx]
y_train_athlete_based = y.iloc[train_idx]
y_test_athlete_based = y.iloc[test_idx]

# check how many athletes are in each set
train_athletes = merged.iloc[train_idx]['athlete_id'].nunique()
test_athletes = merged.iloc[test_idx]['athlete_id'].nunique()
print(f"Training set: {train_athletes} athletes")
print(f"Testing set: {test_athletes} athletes")

# Save for the next notebook
X_train_time_based.to_csv('../simulated_data/processed/time_based/X_train.csv', index=False)
X_test_time_based.to_csv('../simulated_data/processed/time_based/X_test.csv', index=False)
y_train_time_based.to_csv('../simulated_data/processed/time_based/y_train.csv', index=False)
y_test_time_based.to_csv('../simulated_data/processed/time_based/y_test.csv', index=False)

X_train_athlete_based.to_csv('../simulated_data/processed/athlete_based/X_train.csv', index=False)
X_test_athlete_based.to_csv('../simulated_data/processed/athlete_based/X_test.csv', index=False)
y_train_athlete_based.to_csv('../simulated_data/processed/athlete_based/y_train.csv', index=False)
y_test_athlete_based.to_csv('../simulated_data/processed/athlete_based/y_test.csv', index=False)

  X_train_time_based = X_encoded[train_mask]
  X_test_time_based = X_encoded[test_mask]


Training set: 800 athletes
Testing set: 200 athletes
