# WHOOP Data Project 
## Ava Delanty

In [1]:
# Import libraries 
import pandas as pd
import numpy as np
import altair as alt
import streamlit as st
import plotly as px
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


## Whoop data import, filtering, and cleaning

In [2]:
# Read CSV files
sleep = pd.read_csv('sleeps.csv')
cycle = pd.read_csv('physiological_cycles.csv')

In [3]:
sleep.head()

Unnamed: 0,Cycle start time,Cycle end time,Cycle timezone,Sleep onset,Wake onset,Sleep performance %,Respiratory rate (rpm),Asleep duration (min),In bed duration (min),Light sleep duration (min),Deep (SWS) duration (min),REM duration (min),Awake duration (min),Sleep need (min),Sleep debt (min),Sleep efficiency %,Sleep consistency %,Nap
0,2024-12-09 03:40:05,,UTC-08:00,2024-12-09 03:40:05,2024-12-09 12:24:58,83.0,16.7,445.0,521.0,324.0,44.0,77.0,76.0,537.0,80.0,85.0,68.0,False
1,2024-12-08 07:56:32,2024-12-09 03:40:05,UTC-08:00,2024-12-08 07:56:32,2024-12-08 14:41:44,67.0,17.7,330.0,403.0,179.0,70.0,81.0,73.0,491.0,3.0,82.0,52.0,False
2,2024-12-07 03:47:29,2024-12-08 07:56:32,UTC-08:00,2024-12-07 03:47:29,2024-12-07 13:13:15,99.0,17.0,464.0,565.0,342.0,68.0,54.0,101.0,469.0,0.0,82.0,82.0,False
3,2024-12-06 04:36:56,2024-12-07 03:47:29,UTC-08:00,2024-12-06 04:36:56,2024-12-06 13:53:04,100.0,17.0,482.0,556.0,317.0,82.0,83.0,74.0,456.0,0.0,86.0,78.0,False
4,2024-12-05 03:40:11,2024-12-06 04:36:56,UTC-08:00,2024-12-05 03:40:11,2024-12-05 14:44:51,100.0,17.0,611.0,664.0,430.0,82.0,99.0,53.0,510.0,45.0,92.0,64.0,False


In [4]:
cycle.head()

Unnamed: 0,Cycle start time,Cycle end time,Cycle timezone,Recovery score %,Resting heart rate (bpm),Heart rate variability (ms),Skin temp (celsius),Blood oxygen %,Day Strain,Energy burned (cal),...,Asleep duration (min),In bed duration (min),Light sleep duration (min),Deep (SWS) duration (min),REM duration (min),Awake duration (min),Sleep need (min),Sleep debt (min),Sleep efficiency %,Sleep consistency %
0,2024-12-09 03:40:05,,UTC-08:00,91.0,68.0,64.0,34.6,96.1,,,...,445.0,521.0,324.0,44.0,77.0,76.0,537.0,80.0,85.0,68.0
1,2024-12-08 07:56:32,2024-12-09 03:40:05,UTC-08:00,12.0,92.0,20.0,35.7,94.33,4.3,1182.0,...,330.0,403.0,179.0,70.0,81.0,73.0,491.0,3.0,82.0,52.0
2,2024-12-07 03:47:29,2024-12-08 07:56:32,UTC-08:00,62.0,75.0,46.0,34.7,95.6,14.5,2299.0,...,464.0,565.0,342.0,68.0,54.0,101.0,469.0,0.0,82.0,82.0
3,2024-12-06 04:36:56,2024-12-07 03:47:29,UTC-08:00,93.0,70.0,60.0,34.44,95.6,11.1,1430.0,...,482.0,556.0,317.0,82.0,83.0,74.0,456.0,0.0,86.0,78.0
4,2024-12-05 03:40:11,2024-12-06 04:36:56,UTC-08:00,97.0,71.0,60.0,34.85,94.5,4.4,1502.0,...,611.0,664.0,430.0,82.0,99.0,53.0,510.0,45.0,92.0,64.0


In [5]:
# Convert 'Cycle start time' to datetime objects
cycle['Cycle start time'] = pd.to_datetime(cycle['Cycle start time'])

# Extract month and day
cycle['Month'] = cycle['Cycle start time'].dt.month
cycle['Day'] = cycle['Cycle start time'].dt.day

In [6]:
# Check for null values
cycle.isna().sum()

Cycle start time               0
Cycle end time                 1
Cycle timezone                 0
Recovery score %               3
Resting heart rate (bpm)       3
Heart rate variability (ms)    3
Skin temp (celsius)            3
Blood oxygen %                 3
Day Strain                     1
Energy burned (cal)            1
Max HR (bpm)                   1
Average HR (bpm)               1
Sleep onset                    3
Wake onset                     3
Sleep performance %            3
Respiratory rate (rpm)         3
Asleep duration (min)          3
In bed duration (min)          3
Light sleep duration (min)     3
Deep (SWS) duration (min)      3
REM duration (min)             3
Awake duration (min)           3
Sleep need (min)               3
Sleep debt (min)               3
Sleep efficiency %             3
Sleep consistency %            7
Month                          0
Day                            0
dtype: int64

In [7]:
# Delete null rows
cycle_1 = cycle.dropna()
cycle_1.isna().sum()

Cycle start time               0
Cycle end time                 0
Cycle timezone                 0
Recovery score %               0
Resting heart rate (bpm)       0
Heart rate variability (ms)    0
Skin temp (celsius)            0
Blood oxygen %                 0
Day Strain                     0
Energy burned (cal)            0
Max HR (bpm)                   0
Average HR (bpm)               0
Sleep onset                    0
Wake onset                     0
Sleep performance %            0
Respiratory rate (rpm)         0
Asleep duration (min)          0
In bed duration (min)          0
Light sleep duration (min)     0
Deep (SWS) duration (min)      0
REM duration (min)             0
Awake duration (min)           0
Sleep need (min)               0
Sleep debt (min)               0
Sleep efficiency %             0
Sleep consistency %            0
Month                          0
Day                            0
dtype: int64

In [8]:
cycle_1 = cycle_1.copy()

In [9]:
columns_to_convert = [
    'Asleep duration (min)',
    'In bed duration (min)',
    'Light sleep duration (min)',
    'Deep (SWS) duration (min)',
    'REM duration (min)',
    'Awake duration (min)'
]

for col in columns_to_convert:
    new_col = col.replace('(min)', '(hr)').strip()
    cycle_1.loc[:, new_col] = cycle_1[col] / 60
cycle_1.head()

Unnamed: 0,Cycle start time,Cycle end time,Cycle timezone,Recovery score %,Resting heart rate (bpm),Heart rate variability (ms),Skin temp (celsius),Blood oxygen %,Day Strain,Energy burned (cal),...,Sleep efficiency %,Sleep consistency %,Month,Day,Asleep duration (hr),In bed duration (hr),Light sleep duration (hr),Deep (SWS) duration (hr),REM duration (hr),Awake duration (hr)
1,2024-12-08 07:56:32,2024-12-09 03:40:05,UTC-08:00,12.0,92.0,20.0,35.7,94.33,4.3,1182.0,...,82.0,52.0,12,8,5.5,6.716667,2.983333,1.166667,1.35,1.216667
2,2024-12-07 03:47:29,2024-12-08 07:56:32,UTC-08:00,62.0,75.0,46.0,34.7,95.6,14.5,2299.0,...,82.0,82.0,12,7,7.733333,9.416667,5.7,1.133333,0.9,1.683333
3,2024-12-06 04:36:56,2024-12-07 03:47:29,UTC-08:00,93.0,70.0,60.0,34.44,95.6,11.1,1430.0,...,86.0,78.0,12,6,8.033333,9.266667,5.283333,1.366667,1.383333,1.233333
4,2024-12-05 03:40:11,2024-12-06 04:36:56,UTC-08:00,97.0,71.0,60.0,34.85,94.5,4.4,1502.0,...,92.0,64.0,12,5,10.183333,11.066667,7.166667,1.366667,1.65,0.883333
5,2024-12-04 05:31:38,2024-12-05 03:40:11,UTC-08:00,36.0,86.0,34.0,34.3,94.68,9.6,1321.0,...,87.0,50.0,12,4,7.716667,8.783333,5.883333,1.566667,0.266667,1.066667


In [10]:
# rename columns
cycle_1 = cycle_1.rename(columns={
    "Recovery score %": "recovery_score",
    "Resting heart rate (bpm)": "rhr",
    "Heart rate variability (ms)": "hrv",
    "Skin temp (celsius)": "skin_temp",
    "Blood oxygen %": "blood_oxygen",
    "Day Strain": "day_strain",
    "Sleep efficiency %": "sleep_efficiency",
    "Asleep duration (hr)": "asleep_hr",
    "In bed duration (hr)": "in_bed_hr",
    "Light sleep duration (hr)": "light_sleep_hr",
    "Deep (SWS) duration (hr)": "deep_sleep_hr",
    "REM duration (hr)": "rem_sleep_hr",
    "Awake duration (hr)": "awake_hr"
})


In [11]:
# Normalize dates
cycle_1["date"] = pd.to_datetime(cycle_1["Cycle start time"].dt.date)

# Feature Engineering 7-day averages of HRV, RHR, Recovery and Sleep efficiency 

In [12]:
# Sleep debt conversion 
cycle_1['Sleep_debt_hr'] = cycle_1['Sleep debt (min)'] / 60
cycle_1.head()

Unnamed: 0,Cycle start time,Cycle end time,Cycle timezone,recovery_score,rhr,hrv,skin_temp,blood_oxygen,day_strain,Energy burned (cal),...,Month,Day,asleep_hr,in_bed_hr,light_sleep_hr,deep_sleep_hr,rem_sleep_hr,awake_hr,date,Sleep_debt_hr
1,2024-12-08 07:56:32,2024-12-09 03:40:05,UTC-08:00,12.0,92.0,20.0,35.7,94.33,4.3,1182.0,...,12,8,5.5,6.716667,2.983333,1.166667,1.35,1.216667,2024-12-08,0.05
2,2024-12-07 03:47:29,2024-12-08 07:56:32,UTC-08:00,62.0,75.0,46.0,34.7,95.6,14.5,2299.0,...,12,7,7.733333,9.416667,5.7,1.133333,0.9,1.683333,2024-12-07,0.0
3,2024-12-06 04:36:56,2024-12-07 03:47:29,UTC-08:00,93.0,70.0,60.0,34.44,95.6,11.1,1430.0,...,12,6,8.033333,9.266667,5.283333,1.366667,1.383333,1.233333,2024-12-06,0.0
4,2024-12-05 03:40:11,2024-12-06 04:36:56,UTC-08:00,97.0,71.0,60.0,34.85,94.5,4.4,1502.0,...,12,5,10.183333,11.066667,7.166667,1.366667,1.65,0.883333,2024-12-05,0.75
5,2024-12-04 05:31:38,2024-12-05 03:40:11,UTC-08:00,36.0,86.0,34.0,34.3,94.68,9.6,1321.0,...,12,4,7.716667,8.783333,5.883333,1.566667,0.266667,1.066667,2024-12-04,0.9


In [13]:
# Capping values above 2.0 to 2.0
cycle_1['Sleep_debt_hr'] = cycle_1['Sleep_debt_hr'].clip(upper=2.0)
cycle_1['Sleep_debt_7d_avg'] = cycle_1['Sleep_debt_hr'].rolling(window=7, min_periods=1).mean()

In [14]:
#print(cycle_1.iloc[2:10][['Sleep_debt_hr', 'Sleep_debt_7d_avg']])

In [15]:
# 7-day rolling averages
cycle_1["hrv_7d_avg"] = cycle_1["hrv"].rolling(window=7, min_periods=1).mean()
cycle_1["rhr_7d_avg"] = cycle_1["rhr"].rolling(window=7, min_periods=1).mean()
cycle_1["recovery_7d_avg"] = cycle_1["recovery_score"].rolling(window=7, min_periods=1).mean()
cycle_1["sleep_efficiency_7d_avg"] = cycle_1["sleep_efficiency"].rolling(window=7, min_periods=1).mean()
#cycle_1['Sleep_debt_7d_avg'] = cycle_1['Sleep debt (min)'].rolling(window=7).mean()

In [16]:
# Lag features for predicting next day strain
cycle_1["prev_recovery_score"] = cycle_1["recovery_score"].shift(1)
cycle_1["prev_sleep_efficiency"] = cycle_1["sleep_efficiency"].shift(1)
cycle_1["prev_hrv"] = cycle_1["hrv"].shift(1)
cycle_1["prev_rhr"] = cycle_1["rhr"].shift(1)
cycle_1["prev_sleep_debt_hr"] = cycle_1["Sleep_debt_hr"].shift(1)

In [17]:
# Sleep proportions
total_sleep = cycle_1["light_sleep_hr"] + cycle_1["deep_sleep_hr"] + cycle_1["rem_sleep_hr"]
cycle_1["light_sleep_pct"] = cycle_1["light_sleep_hr"] / total_sleep
cycle_1["deep_sleep_pct"] = cycle_1["deep_sleep_hr"] / total_sleep
cycle_1["rem_sleep_pct"] = cycle_1["rem_sleep_hr"] / total_sleep

In [18]:
cycle_1 = cycle_1.dropna()

In [19]:
print(cycle_1.columns)

Index(['Cycle start time', 'Cycle end time', 'Cycle timezone',
       'recovery_score', 'rhr', 'hrv', 'skin_temp', 'blood_oxygen',
       'day_strain', 'Energy burned (cal)', 'Max HR (bpm)', 'Average HR (bpm)',
       'Sleep onset', 'Wake onset', 'Sleep performance %',
       'Respiratory rate (rpm)', 'Asleep duration (min)',
       'In bed duration (min)', 'Light sleep duration (min)',
       'Deep (SWS) duration (min)', 'REM duration (min)',
       'Awake duration (min)', 'Sleep need (min)', 'Sleep debt (min)',
       'sleep_efficiency', 'Sleep consistency %', 'Month', 'Day', 'asleep_hr',
       'in_bed_hr', 'light_sleep_hr', 'deep_sleep_hr', 'rem_sleep_hr',
       'awake_hr', 'date', 'Sleep_debt_hr', 'Sleep_debt_7d_avg', 'hrv_7d_avg',
       'rhr_7d_avg', 'recovery_7d_avg', 'sleep_efficiency_7d_avg',
       'prev_recovery_score', 'prev_sleep_efficiency', 'prev_hrv', 'prev_rhr',
       'prev_sleep_debt_hr', 'light_sleep_pct', 'deep_sleep_pct',
       'rem_sleep_pct'],
      dtype='ob

# Train Linear Reg model

In [20]:
# Modeling
features = [
    "prev_recovery_score", "prev_sleep_efficiency", "prev_hrv", "prev_rhr", "prev_sleep_debt_hr",
    "hrv_7d_avg", "rhr_7d_avg", "recovery_7d_avg", "sleep_efficiency_7d_avg",
    "Sleep_debt_7d_avg",
    "light_sleep_pct", "deep_sleep_pct", "rem_sleep_pct"
]
target = "day_strain"

X = cycle_1[features]
y = cycle_1[target]

print(X.head())
print(y.head())

   prev_recovery_score  prev_sleep_efficiency  prev_hrv  prev_rhr  \
2                 12.0                   82.0      20.0      92.0   
3                 62.0                   82.0      46.0      75.0   
4                 93.0                   86.0      60.0      70.0   
5                 97.0                   92.0      60.0      71.0   
6                 36.0                   87.0      34.0      86.0   

   prev_sleep_debt_hr  hrv_7d_avg  rhr_7d_avg  recovery_7d_avg  \
2                0.05        33.0   83.500000        37.000000   
3                0.00        42.0   79.000000        55.666667   
4                0.00        46.5   77.000000        66.000000   
5                0.75        44.0   78.800000        60.000000   
6                0.90        45.0   77.833333        62.666667   

   sleep_efficiency_7d_avg  Sleep_debt_7d_avg  light_sleep_pct  \
2                82.000000           0.025000         0.737069   
3                83.333333           0.016667         0.

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()

#  Train the model using the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Output
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


Mean Squared Error: 6.883631324007791
R-squared: 0.13160252581346965


## Ridge Analysis

In [22]:
#from sklearn.linear_model import Ridge

# Create a Ridge Regression model
ridge_model = Ridge(alpha=1.0)  # You can adjust alpha for regularization strength

# Train the Ridge model
ridge_model.fit(X_train, y_train)

# Make predictions
y_pred_ridge = ridge_model.predict(X_test)

# Evaluate the Ridge model
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

print(f"Ridge Model - Mean Squared Error: {mse_ridge}")
print(f"Ridge Model - R-squared: {r2_ridge}")

Ridge Model - Mean Squared Error: 6.8928792114960915
Ridge Model - R-squared: 0.13043586801929163


This might indicate that there isnt a strong enough analysis with the current target variable day_strain. 

In [23]:
# Compute correlations
correlations = cycle_1[features + [target]].corr()[target].drop(target)
correlation_df = correlations.reset_index()
correlation_df.columns = ["Feature", "Correlation"]

# Sort by correlation strength
correlation_df = correlation_df.sort_values(by="Correlation", ascending=False)

# Altair bar chart
chart = alt.Chart(correlation_df).mark_bar().encode(
    x=alt.X("Correlation:Q", scale=alt.Scale(domain=[-1, 1])),
    y=alt.Y("Feature:N", sort="-x"),
    color=alt.Color("Correlation:Q", scale=alt.Scale(scheme='redblue'), legend=None),
    tooltip=["Feature", "Correlation"]
).properties(
    width=500,
    height=400,
    title="Feature Correlation with Target (day_strain)"
)

chart

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


This visual explores the positive and negative correlations with day strain. Previous resting heart rate has a moderately positive relationship with strain. Additionally sleep debts are very high in correlation and will be explored further in this analyis. The neg correlation with previous recovery core - Higher recovery score = your body is well-rested and primed for activity.

But in the context of your data, if you're measuring the strain that follows, people likely push harder on days they feel more recovered.

So ironically, a high recovery score may lead to a higher strain later that day — but since your variable is *prev*_recovery_score and your target is today's strain, you’re seeing a negative correlation:

Poor recovery yesterday → less strain today
Good recovery yesterday → more capable of higher strain today

## Refining day strain model with sleep debt as the selected features

In [24]:
# Selected top correlated features
selected_features = ["prev_rhr", "prev_sleep_debt_hr", "Sleep_debt_7d_avg"]
X_refined = cycle_1[selected_features]
y = cycle_1["day_strain"]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_refined, y, test_size=0.2, random_state=42)

# Fit model
lr_refined = LinearRegression()
lr_refined.fit(X_train, y_train)

# Predict and evaluate
y_pred = lr_refined.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Refined Linear Regression - Mean Squared Error: {mse}")
print(f"Refined Linear Regression - R-squared: {r2}")

Refined Linear Regression - Mean Squared Error: 6.875987115443484
Refined Linear Regression - R-squared: 0.1325668731320495


This helps indicate the other features were not adding much to the model. 

In [25]:
# Create DataFrame for Altair
results_df = pd.DataFrame({
    "Actual": y_test.values,
    "Predicted": y_pred
})

# Altair scatter plot
chart = alt.Chart(results_df).mark_circle(size=60).encode(
    x="Actual:Q",
    y="Predicted:Q",
    tooltip=["Actual", "Predicted"]
).properties(
    width=400,
    height=400,
    title="Predicted vs Actual Day Strain"
).interactive()

# Add ideal line
line = alt.Chart(pd.DataFrame({'x': [y_test.min(), y_test.max()]})).mark_line(color='red').encode(
    x='x',
    y='x'
)

chart + line

We see a lot of clustering around the horizontal band- most between 7 and 8 which indicates that the model is underfitting by predicting average like value regardless of the input. Its clear so far that other relationships are missing or non linear. 

## Retuning model using decision trees

In [26]:
# Define model
tree_model = DecisionTreeRegressor(random_state=42)

# Fit model
tree_model.fit(X_train, y_train)

# Predict
y_pred_tree = tree_model.predict(X_test)

# Evaluate
mse_tree = mean_squared_error(y_test, y_pred_tree)
r2_tree = r2_score(y_test, y_pred_tree)

print(f"Decision Tree Model - Mean Squared Error: {mse_tree}")
print(f"Decision Tree Model - R-squared: {r2_tree}")

Decision Tree Model - Mean Squared Error: 13.004652777777778
Decision Tree Model - R-squared: -0.6405886796273004


In [27]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "max_depth": [2, 3, 5, 10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

grid_search = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error'
)

grid_search.fit(X_train, y_train)

best_tree = grid_search.best_estimator_
y_pred_best = best_tree.predict(X_test)

print("Best Parameters:", grid_search.best_params_)
print(f"Tuned Tree - MSE: {mean_squared_error(y_test, y_pred_best)}")
print(f"Tuned Tree - R2: {r2_score(y_test, y_pred_best)}")

Best Parameters: {'max_depth': 2, 'min_samples_leaf': 4, 'min_samples_split': 2}
Tuned Tree - MSE: 7.320894194070314
Tuned Tree - R2: 0.0764400753502209


Utilizing best parameters results in less chance of overfitting however it also underfits a bit with such shallow depth, resulting in weaker performance than the linear model on your current feature set. Lets bring back other features.

In [28]:
# Full feature set
features = [
    "prev_recovery_score", "prev_sleep_efficiency", "prev_hrv", "prev_rhr", "prev_sleep_debt_hr",
    "hrv_7d_avg", "rhr_7d_avg", "recovery_7d_avg", "sleep_efficiency_7d_avg",
    "Sleep_debt_7d_avg", "light_sleep_pct", "deep_sleep_pct", "rem_sleep_pct"
]
target = "day_strain"

X = cycle_1[features]
y = cycle_1[target]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Decision Tree + Grid Search
param_grid = {
    'max_depth': [2, 4, 6, 8, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

tree = DecisionTreeRegressor(random_state=42)
grid_search = GridSearchCV(tree, param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

# Best model evaluation
best_tree = grid_search.best_estimator_
y_pred = best_tree.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Best Parameters:", grid_search.best_params_)
print(f"Tuned Tree - MSE: {mse}")
print(f"Tuned Tree - R2: {r2}")

Best Parameters: {'max_depth': 2, 'min_samples_leaf': 4, 'min_samples_split': 2}
Tuned Tree - MSE: 7.319520085145251
Tuned Tree - R2: 0.07661342465723853


In [29]:
# Feature importance plot
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': best_tree.feature_importances_
}).sort_values(by='Importance', ascending=False)

importance_chart = alt.Chart(importance_df).mark_bar().encode(
    x=alt.X('Importance:Q', scale=alt.Scale(domain=[0, importance_df['Importance'].max()])),
    y=alt.Y('Feature:N', sort='-x'),
    tooltip=['Feature', 'Importance'],
    color=alt.Color('Importance:Q', scale=alt.Scale(scheme='blues'), legend=None)
).properties(
    title='Decision Tree Feature Importances',
    width=500,
    height=400
)

importance_chart

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [30]:
new_features = [ "prev_rhr", "sleep_efficiency_7d_avg"]
target = "day_strain"

X = cycle_1[new_features]
y = cycle_1[target]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define model
tree_model = DecisionTreeRegressor(random_state=42)

# Fit model
tree_model.fit(X_train, y_train)

# Predict
y_pred_tree = tree_model.predict(X_test)

# Evaluate
mse_tree = mean_squared_error(y_test, y_pred_tree)
r2_tree = r2_score(y_test, y_pred_tree)

print(f"Decision Tree Model - Mean Squared Error: {mse_tree}")
print(f"Decision Tree Model - R-squared: {r2_tree}")

Decision Tree Model - Mean Squared Error: 12.028368055555555
Decision Tree Model - R-squared: -0.5174264783182583


## Creating interaction terms for decision tree modeling:

In [31]:
df = cycle_1.copy()

# Create pairwise interaction terms
df["rhr_x_sleepdebt_hr"] = df["prev_rhr"] * df["prev_sleep_debt_hr"]
df["rhr_x_sleepdebt_7d"] = df["prev_rhr"] * df["Sleep_debt_7d_avg"]
df["sleepdebt_hr_x_sleepdebt_7d"] = df["prev_sleep_debt_hr"] * df["Sleep_debt_7d_avg"]

selected_features = [
    "prev_rhr",
    "prev_sleep_debt_hr",
    "Sleep_debt_7d_avg",
    "rhr_x_sleepdebt_hr",
    "rhr_x_sleepdebt_7d",
    "sleepdebt_hr_x_sleepdebt_7d"
]

In [32]:
target = "day_strain"

X = df[selected_features]
y = df[target]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()

#  Train the model using the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Output
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


Mean Squared Error: 7.2661483350733915
R-squared: 0.08334648323834992


In [33]:
#importance plot and decision trees:
tree = DecisionTreeRegressor(max_depth=2, min_samples_leaf=4, min_samples_split=2, random_state=42)
tree.fit(X_train, y_train)

In [34]:
# Feature importance plot
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': tree.feature_importances_
}).sort_values(by='Importance', ascending=False)

importance_chart = alt.Chart(importance_df).mark_bar().encode(
    x=alt.X('Importance:Q', scale=alt.Scale(domain=[0, importance_df['Importance'].max()])),
    y=alt.Y('Feature:N', sort='-x'),
    tooltip=['Feature', 'Importance'],
    color=alt.Color('Importance:Q', scale=alt.Scale(scheme='blues'), legend=None)
).properties(
    title='Decision Tree Feature Importances',
    width=500,
    height=400
)

importance_chart

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


## New target variable- recovery score

In [35]:
target = "recovery_score"  # or the correct column name if it's slightly different
# Select only features + new target
# Compute correlations
correlations = cycle_1[features + [target]].corr()[target].drop(target)
correlation_df = correlations.reset_index()
correlation_df.columns = ["Feature", "Correlation"]

# Sort by correlation strength
correlation_df = correlation_df.sort_values(by="Correlation", ascending=False)

# Altair bar chart
chart = alt.Chart(correlation_df).mark_bar().encode(
    x=alt.X("Correlation:Q", scale=alt.Scale(domain=[-1, 1])),
    y=alt.Y("Feature:N", sort="-x"),
    color=alt.Color("Correlation:Q", scale=alt.Scale(scheme='redblue'), legend=None),
    tooltip=["Feature", "Correlation"]
).properties(
    width=500,
    height=400,
    title="Feature Correlation with Target (Recovery Score)"
)

chart

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


recovery_7d_avg, hrv_7d_avg, prev_recovery_score, and prev_hrv are the top positive predictors of recovery score.
These make intuitive sense — if your recent average recovery, HRV, and prior recovery scores are high, it’s likely your current recovery will also be high.

prev_sleep_debt_hr and rhr_7d_avg have the strongest negative relationships.

That aligns well — more sleep debt and higher resting heart rate likely indicate poorer recovery.

Sleep_debt_7d_avg, prev_rhr, and low sleep stages like light/deep sleep pct also appear weakly negatively correlated.


In [36]:
recovery_features = [
    "recovery_7d_avg",
    "hrv_7d_avg",
    "prev_recovery_score",
    "prev_hrv",
    "rem_sleep_pct",
    "sleep_efficiency_7d_avg",
    "prev_sleep_debt_hr",  # strong negative
    "rhr_7d_avg"           # moderate negative
]
target = "recovery_score"

In [37]:
X = cycle_1[recovery_features]
y = cycle_1[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Recovery Score Model - Mean Squared Error: {mse}")
print(f"Recovery Score Model - R-squared: {r2}")

Recovery Score Model - Mean Squared Error: 294.2527667012076
Recovery Score Model - R-squared: 0.331144012339162


R-sqared indicates 33.5% variance in revocery score which is a good starting point. Recovery is likely influenced by non-linear or latenet factors. The best predictors recovery 7 day average and previous hrv help but theres room for improvement

In [38]:
# Create a DataFrame for plotting
scatter_df = pd.DataFrame({
    "Actual Recovery Score": y_test,
    "Predicted Recovery Score": y_pred
})

# Create Altair scatter plot
scatter_plot = alt.Chart(scatter_df).mark_circle(size=60).encode(
    x=alt.X("Actual Recovery Score", scale=alt.Scale(zero=False)),
    y=alt.Y("Predicted Recovery Score", scale=alt.Scale(zero=False)),
    tooltip=["Actual Recovery Score", "Predicted Recovery Score"]
).properties(
    width=500,
    height=500,
    title="Predicted vs Actual Recovery Score"
)

# Add identity line (perfect prediction)
line = alt.Chart(pd.DataFrame({
    "x": [scatter_df["Actual Recovery Score"].min(), scatter_df["Actual Recovery Score"].max()]
})).mark_line(color='red').encode(
    x='x:Q',
    y='x:Q'
)

# Combine scatter and identity line
scatter_plot + line

## Decision trees

In [39]:
#from sklearn.tree import DecisionTreeRegressor
# Initialize and train the model
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

# Predict on the test set
y_pred_dt = dt_model.predict(X_test)

# Evaluate performance
mse_dt = mean_squared_error(y_test, y_pred_dt)
r2_dt = r2_score(y_test, y_pred_dt)

print(f"Decision Tree Model - Mean Squared Error: {mse_dt}")
print(f"Decision Tree Model - R-squared: {r2_dt}")

Decision Tree Model - Mean Squared Error: 710.7916666666666
Decision Tree Model - R-squared: -0.6156764388631162


decision tree not performing well

In [40]:
# Feature importance plot
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': dt_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

importance_chart = alt.Chart(importance_df).mark_bar().encode(
    x=alt.X('Importance:Q', scale=alt.Scale(domain=[0, importance_df['Importance'].max()])),
    y=alt.Y('Feature:N', sort='-x'),
    tooltip=['Feature', 'Importance'],
    color=alt.Color('Importance:Q', scale=alt.Scale(scheme='blues'), legend=None)
).properties(
    title='Decision Tree Feature Importances',
    width=500,
    height=400
)

importance_chart

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


The importance variables include the previous sleep debt, 7 day average recovery and heart rate 7 day averages 

## Exploratory Data Analysis:

In [41]:
print(cycle_1.columns)

Index(['Cycle start time', 'Cycle end time', 'Cycle timezone',
       'recovery_score', 'rhr', 'hrv', 'skin_temp', 'blood_oxygen',
       'day_strain', 'Energy burned (cal)', 'Max HR (bpm)', 'Average HR (bpm)',
       'Sleep onset', 'Wake onset', 'Sleep performance %',
       'Respiratory rate (rpm)', 'Asleep duration (min)',
       'In bed duration (min)', 'Light sleep duration (min)',
       'Deep (SWS) duration (min)', 'REM duration (min)',
       'Awake duration (min)', 'Sleep need (min)', 'Sleep debt (min)',
       'sleep_efficiency', 'Sleep consistency %', 'Month', 'Day', 'asleep_hr',
       'in_bed_hr', 'light_sleep_hr', 'deep_sleep_hr', 'rem_sleep_hr',
       'awake_hr', 'date', 'Sleep_debt_hr', 'Sleep_debt_7d_avg', 'hrv_7d_avg',
       'rhr_7d_avg', 'recovery_7d_avg', 'sleep_efficiency_7d_avg',
       'prev_recovery_score', 'prev_sleep_efficiency', 'prev_hrv', 'prev_rhr',
       'prev_sleep_debt_hr', 'light_sleep_pct', 'deep_sleep_pct',
       'rem_sleep_pct'],
      dtype='ob

In [42]:
alt.Chart(cycle_1).mark_line().encode(
    x='Month:O',
    y='mean(Sleep_debt_hr):Q'
).properties(
    title="Average Sleep Debt Over the Months?",
    width=500,
    height=300
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


Sleep debt very high in july and april but lowest in august

In [43]:
alt.Chart(cycle_1).mark_line().encode(
    x='Month:O',
    y='mean(asleep_hr):Q'
).properties(
    title="What's my Average Sleep Duration Over the Months?",
    width=500,
    height=300
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


Around 7 hoursup to 9 hours of sleep duration over the months. Highest amount of sleep in august with lowest in april. 

In [44]:
# Comparing sleep with strain
alt.Chart(cycle_1).mark_line().encode(
    y='mean(day_strain):Q',
    x='Month:O'
).properties(
    title="Average Day Strains over the months",
    width=500,
    height=300
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


LIGHT (0-9): Minimal stress put on the body, room for active recovery MODERATE (10-13): Moderate stress on the body, generally good for maintaining fitness. More on the light strain through each month. It seems over the year i have done light strain mostly throughout the year. this results in not working out as much as i could etc

# Interactive Dashboard- Sleep debt influences your strain- how to improve it

In [45]:
# Replace this with your actual DataFrame
df = cycle_1.copy()

# Melt to long format for switching target
df_melted = df.melt(
    id_vars=["prev_sleep_debt_hr", "Sleep_debt_7d_avg", "hrv_7d_avg", "rhr_7d_avg"],
    value_vars=["recovery_score", "day_strain"],
    var_name="Target",
    value_name="TargetValue"
)

# Define dropdown selector using selection_point
target_dropdown = alt.binding_select(options=["recovery_score", "day_strain"], name="Target Variable: ")
target_selection = alt.param(name="target_param", bind=target_dropdown, value="recovery_score")

# Build interactive scatter plot
scatter = alt.Chart(df_melted).mark_circle(size=80).encode(
    x=alt.X("Sleep_debt_7d_avg", title="7-Day Avg Sleep Debt (hrs)"),
    y=alt.Y("TargetValue", title="Target Value"),
   # color=alt.Color("rhr", scale=alt.Scale(scheme="blueorange"), title="HRV 7D Avg"),
    tooltip=["Target", "TargetValue"]
).transform_filter(
    alt.datum.Target == target_selection
).add_params(
    target_selection
).properties(
    width=600,
    height=400,
    title="Sleep Debt vs Target (Recovery Score or Day Strain)"
)

scatter

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


target variable day strain and recovery score based on sleep debt.

In [46]:
cycle_1

Unnamed: 0,Cycle start time,Cycle end time,Cycle timezone,recovery_score,rhr,hrv,skin_temp,blood_oxygen,day_strain,Energy burned (cal),...,recovery_7d_avg,sleep_efficiency_7d_avg,prev_recovery_score,prev_sleep_efficiency,prev_hrv,prev_rhr,prev_sleep_debt_hr,light_sleep_pct,deep_sleep_pct,rem_sleep_pct
2,2024-12-07 03:47:29,2024-12-08 07:56:32,UTC-08:00,62.0,75.0,46.0,34.70,95.60,14.5,2299.0,...,37.000000,82.000000,12.0,82.0,20.0,92.0,0.050000,0.737069,0.146552,0.116379
3,2024-12-06 04:36:56,2024-12-07 03:47:29,UTC-08:00,93.0,70.0,60.0,34.44,95.60,11.1,1430.0,...,55.666667,83.333333,62.0,82.0,46.0,75.0,0.000000,0.657676,0.170124,0.172199
4,2024-12-05 03:40:11,2024-12-06 04:36:56,UTC-08:00,97.0,71.0,60.0,34.85,94.50,4.4,1502.0,...,66.000000,85.500000,93.0,86.0,60.0,70.0,0.000000,0.703764,0.134206,0.162029
5,2024-12-04 05:31:38,2024-12-05 03:40:11,UTC-08:00,36.0,86.0,34.0,34.30,94.68,9.6,1321.0,...,60.000000,85.800000,97.0,92.0,60.0,71.0,0.750000,0.762419,0.203024,0.034557
6,2024-12-03 04:50:45,2024-12-04 05:31:38,UTC-08:00,76.0,73.0,50.0,35.16,94.78,13.2,2019.0,...,62.666667,85.000000,36.0,87.0,34.0,86.0,0.900000,0.680108,0.155914,0.163978
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358,2023-12-13 02:21:05,2023-12-14 00:45:16,UTC-08:00,49.0,79.0,51.0,31.70,90.00,10.0,1539.0,...,72.857143,85.857143,56.0,89.0,49.0,82.0,2.000000,0.713805,0.111111,0.175084
359,2023-12-12 00:52:08,2023-12-13 02:21:05,UTC-08:00,81.0,79.0,61.0,33.70,95.64,4.9,1512.0,...,77.714286,85.714286,49.0,80.0,51.0,79.0,1.366667,0.655696,0.167089,0.177215
360,2023-12-11 01:02:40,2023-12-12 00:52:08,UTC-08:00,74.0,74.0,58.0,33.78,93.30,12.9,1679.0,...,75.428571,83.714286,81.0,86.0,61.0,79.0,1.466667,0.548673,0.123894,0.327434
361,2023-12-10 03:10:01,2023-12-11 01:02:40,UTC-08:00,83.0,80.0,61.0,34.60,94.08,5.9,1350.0,...,74.714286,84.000000,74.0,76.0,58.0,74.0,0.933333,0.506024,0.230924,0.263052


In [47]:
alt.Chart(df).mark_bar().encode(
    y='mean(Energy burned (cal)):Q',
    x='Month:O'
).properties(
    title="Calories burned over the months",
    width=500,
    height=300
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


## Sleep debt with risk zones

In [48]:
# Create the base histogram
base = alt.Chart(df).transform_filter(
    alt.datum.Sleep_debt_7d_avg != None
).mark_bar().encode(
    alt.X("Sleep_debt_7d_avg:Q", bin=alt.Bin(maxbins=30), title="7-Day Avg Sleep Debt (hrs)"),
    alt.Y("count():Q", title="Frequency"),
    tooltip=["count()"]
).properties(
    width=600,
    height=400,
    title="Sleep Debt Distribution with Risk Zones"
)

# Add colored rule overlays to indicate risk zones
zones = alt.Chart(pd.DataFrame({
    'x': [0, .5, 2],
    'zone': ['Low Risk (<2 hrs)', 'Moderate Risk (2-4 hrs)', 'High Risk (>4 hrs)'],
    'color': ['green', 'orange', 'red']
})).mark_rule(size=0).encode(
    x='x:Q',
    color=alt.Color('color:N', scale=None, legend=None)
)

risk_bands = alt.Chart(pd.DataFrame({
    'start': [0, .5, 2],
    'end': [.5, 2, df["Sleep_debt_7d_avg"].max()],
    'color': ['Low Risk', 'Moderate Risk', 'High Risk']
})).mark_rect(opacity=0.1).encode(
    x='start:Q',
    x2='end:Q',
    color=alt.Color('color:N', scale=alt.Scale(domain=['Low Risk', 'Moderate Risk', 'High Risk'],
                                               range=['green', 'orange', 'red']),
                    legend=alt.Legend(title="Risk Zone"))
)

# Combine bands and histogram
chart = (risk_bands + base).resolve_scale(color='independent')
chart

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [49]:
print(df.columns)

Index(['Cycle start time', 'Cycle end time', 'Cycle timezone',
       'recovery_score', 'rhr', 'hrv', 'skin_temp', 'blood_oxygen',
       'day_strain', 'Energy burned (cal)', 'Max HR (bpm)', 'Average HR (bpm)',
       'Sleep onset', 'Wake onset', 'Sleep performance %',
       'Respiratory rate (rpm)', 'Asleep duration (min)',
       'In bed duration (min)', 'Light sleep duration (min)',
       'Deep (SWS) duration (min)', 'REM duration (min)',
       'Awake duration (min)', 'Sleep need (min)', 'Sleep debt (min)',
       'sleep_efficiency', 'Sleep consistency %', 'Month', 'Day', 'asleep_hr',
       'in_bed_hr', 'light_sleep_hr', 'deep_sleep_hr', 'rem_sleep_hr',
       'awake_hr', 'date', 'Sleep_debt_hr', 'Sleep_debt_7d_avg', 'hrv_7d_avg',
       'rhr_7d_avg', 'recovery_7d_avg', 'sleep_efficiency_7d_avg',
       'prev_recovery_score', 'prev_sleep_efficiency', 'prev_hrv', 'prev_rhr',
       'prev_sleep_debt_hr', 'light_sleep_pct', 'deep_sleep_pct',
       'rem_sleep_pct'],
      dtype='ob

In [52]:
df['month_name'] = pd.to_datetime(df['date']).dt.strftime('%B')  # 'January', 'February', etc.
month_dropdown = alt.binding_select(options=df['month_name'].unique().tolist(), name='Select Month: ')
month_selection = alt.selection_point(fields=['month_name'], bind=month_dropdown)

In [53]:
# Step 1: Strip any extra spaces from column names
df.columns = df.columns.str.strip()

# Step 2: Convert to datetime with coercion (invalid formats will become NaT)
df['Sleep onset'] = pd.to_datetime(df['Sleep onset'], errors='coerce')

In [54]:
# Step 2: Extract hour and weekday from 'Sleep onset'
df['hour'] = df['Sleep onset'].dt.hour
df['weekday'] = df['Sleep onset'].dt.day_name()

# Step 3: (Optional) Order weekdays for plotting
weekday_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
df['weekday'] = pd.Categorical(df['weekday'], categories=weekday_order, ordered=True)

In [55]:
heatmap = alt.Chart(df).mark_rect().encode(
    x=alt.X('hour:O', title='Hour of Sleep Onset'),
    y=alt.Y('weekday:O', title='Day of Week', sort=weekday_order),
    color=alt.Color('mean(Sleep_debt_hr):Q', scale=alt.Scale(scheme='reds'), title='Avg Sleep Debt (hrs)'),
    tooltip=[
        'weekday:O', 'hour:O', alt.Tooltip('mean(Sleep_debt_hr):Q', title='Avg Sleep Debt')
    ]
).add_params(
    month_selection
).transform_filter(
    month_selection
).properties(
    width=500,
    height=300,
    title='Avg Sleep Debt by Hour & Weekday (Filtered by Month)'
)
heatmap

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


X: day of week

Y: hour of day

Color: average sleep debt

Not much sleep debt on sundays specifially months of nov, oct, august, 


In [58]:
boxplot = alt.Chart(df).mark_boxplot().encode(
    x=alt.X('weekday:O', sort=weekday_order, title='Day of Week'),
    y=alt.Y('Sleep_debt_hr:Q', title='Sleep Debt (hrs)'),
    color=alt.Color('weekday:O', legend=None)
).properties(
    width=500,
    height=500,
    title='Distribution of Sleep Debt by Day of Week'
)
boxplot

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [61]:
calendar_heatmap_facet = alt.Chart(df).mark_rect().encode(
    x=alt.X('Day:O', title='Day of Month'),
    y=alt.Y('weekday:O', title='Day of Week', sort=weekday_order),
    color=alt.Color('Sleep_debt_hr:Q', scale=alt.Scale(scheme='reds')),
    tooltip=[
        alt.Tooltip('date:T', title='Date'),
        alt.Tooltip('Sleep_debt_hr:Q', title='Avg Sleep Debt (hrs)', format=".2f")
    ]
).properties(
    width=350,
    height=150,
).facet(
    column='Month:N'
).resolve_scale(
    color='independent'
).properties(
    title='Calendar Heatmap: Avg Sleep Debt by Day, Faceted by Month'
)

calendar_heatmap_facet

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
