In [140]:
#| echo: false
from datetime import datetime, timedelta, time
import pandas as pd
import numpy as np
import os
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, accuracy_score, mean_absolute_error, matthews_corrcoef
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge
from xgboost import XGBRegressor

## Import and Clean Data

In [42]:
def hours_to_minutes(time):
    time = time.split(":")
    minutes = int(time[0])*60 + int(time[1])
    return minutes

In [32]:
def convert_to_24hr_time(timestamp):
    try:
        # Parse the timestamp string
        timestamp = timestamp.strip().lower()  # Strip any leading/trailing spaces and convert to lowercase
        
        # Check if the timestamp contains AM or PM
        is_pm = 'pm' in timestamp
        is_am = 'am' in timestamp
        
        if is_pm or is_am:
            # Remove 'am' or 'pm' from the timestamp string
            timestamp = timestamp.replace('am', '').replace('pm', '').strip()
            
            # Parse hours and minutes
            hours, minutes = map(int, timestamp.split(':'))
            
            # Adjust hours for PM time (add 12 to convert from 12-hour to 24-hour format)
            if is_pm and hours != 12:
                hours += 12
            
            # Create a time object
            time_obj = time(hours, minutes)
            
            return time_obj
        else:
            # Handle invalid timestamp format (missing 'am' or 'pm')
            print("Invalid timestamp format. Please include 'am' or 'pm'.")
            return None
        
    except (ValueError, AttributeError):
        # Handle any other parsing errors
        print("Invalid timestamp format. Please use format 'HH:MM AM/PM'.")
        return None

In [199]:
def more_than_eight(sleep_time):
    if int(sleep_time[0]) >= 8:
        return 1
    else:
        return 0

In [200]:
# File path
folder_path = "/Users/ben/Documents/GitHub/AdvancedMachineLearning/Data/FitbitDataKaggle/"

# Get file names
csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

# Read CSV files into DataFrames
dataframes = [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]

# Standardize column names
for frame in dataframes:
    frame.columns = ["Day", "Date", "Sleep Score", "Hours of Sleep", "REM Sleep", "Deep Sleep", "Heart Rate Below Resting", "Sleep Time"]

# Combine Dataframes
combined_df = pd.concat(dataframes)

# Drop blank rows
combined_df = combined_df.dropna()

# Reset index of combined DataFrame
combined_df = combined_df.reset_index(drop=True)

# Clean columns and create features
combined_df["REM Sleep"] = combined_df["REM Sleep"].apply(lambda x: float(x.strip("%")))
combined_df["Deep Sleep"] = combined_df["Deep Sleep"].apply(lambda x: float(x.strip("%")))
combined_df["Heart Rate Below Resting"] = combined_df["Heart Rate Below Resting"].apply(lambda x: float(x.strip("%")))

combined_df["Minutes of Sleep"] = combined_df["Hours of Sleep"].apply(lambda x: hours_to_minutes(x))
combined_df["Sleep Start"] = combined_df["Sleep Time"].apply(lambda x: x.split(" - ")[0])
combined_df.loc[125, "Sleep Start"] = "11:21pm"
combined_df.loc[135, "Sleep Start"] = "11:38pm"
combined_df["Sleep Start"] = combined_df["Sleep Start"].apply(lambda x: convert_to_24hr_time(x))
combined_df["Sleep End"] = combined_df["Sleep Time"].apply(lambda x: x.split(" - ")[1])
combined_df.loc[96, "Sleep End"] = "7:33am"
combined_df.loc[116, "Sleep End"] = "7:02am"
combined_df["Sleep End"] = combined_df["Sleep End"].apply(lambda x: convert_to_24hr_time(x))
combined_df["More than Eight"] = combined_df["Hours of Sleep"].apply(lambda x: more_than_eight(x))

combined_df

Unnamed: 0,Day,Date,Sleep Score,Hours of Sleep,REM Sleep,Deep Sleep,Heart Rate Below Resting,Sleep Time,Minutes of Sleep,Sleep Start,Sleep End,More than Eight
0,Saturday,01/01/2022,63.0,4:13:00,15.0,18.0,0.0,06:10am - 10:52am,253,06:10:00,10:52:00,0
1,Sunday,01/02/2022,90.0,8:09:00,22.0,18.0,92.0,02:51am - 11:59am,489,02:51:00,11:59:00,1
2,Monday,01/03/2022,83.0,7:27:00,16.0,13.0,96.0,11:27pm - 08:28am,447,23:27:00,08:28:00,0
3,Tuesday,01/04/2022,90.0,7:25:00,19.0,21.0,99.0,10:52pm - 07:19am,445,22:52:00,07:19:00,0
4,Wednesday,01/05/2022,81.0,7:10:00,18.0,15.0,89.0,10:42pm - 06:57am,430,22:42:00,06:57:00,0
...,...,...,...,...,...,...,...,...,...,...,...,...
174,Sunday,03/27/2022,86.0,7:03:00,20.0,15.0,95.0,12:04am - 8:02am,423,12:04:00,08:02:00,0
175,Monday,03/28/2022,80.0,6:33:00,14.0,22.0,95.0,10:12pm - 5:51am,393,22:12:00,05:51:00,0
176,Tuesday,03/29/2022,86.0,6:59:00,22.0,23.0,93.0,10:00pm - 5:51am,419,22:00:00,05:51:00,0
177,Wednesday,03/30/2022,87.0,6:34:00,20.0,20.0,96.0,10:08pm - 5:29am,394,22:08:00,05:29:00,0


In [44]:
combined_df["Sleep Score"].corr(combined_df["Minutes of Sleep"])

0.32533163032012014

In [None]:
combined_df["Sleep Score"].corr(combined_df["REM Sleep"])

0.3460549569646571

In [None]:
combined_df["Sleep Score"].corr(combined_df["Deep Sleep"])

0.23942985469168782

In [157]:
combined_df["Sleep Score"].corr(combined_df["Heart Rate Below Resting"])

0.49963533676497995

In [242]:
# X = combined_df.drop(["Date", "Sleep Score", "Hours of Sleep", "Sleep Time", "Sleep Start", "Sleep End", "More than Eight"], axis = 1)
X = combined_df.drop(["Date", "Sleep Score", "Hours of Sleep", "Sleep Time", "Sleep Start", "Sleep End", "More than Eight"], axis = 1)
y = combined_df["Sleep Score"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

## Bagging Model

In [225]:
# Bagging Regressor
ct = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False, handle_unknown="ignore", drop="first"), ["Day"]),
        ("standardize", StandardScaler(), ["Minutes of Sleep", "REM Sleep", "Deep Sleep", "Heart Rate Below Resting"])
    ]
)

my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("bagging", BaggingRegressor(n_jobs=-1, estimator=RandomForestRegressor(n_jobs=-1, criterion="absolute_error")))
    ]
)

parameters = {
    # "bagging__estimator": [DecisionTreeRegressor(), KNeighborsRegressor(), RandomForestRegressor(), LinearRegression()],
    "bagging__n_estimators": [25, 50, 100],
    "bagging__estimator__ccp_alpha": [0, .1, .001, .0001],
    "bagging__estimator__min_samples_leaf": [2, 3, 4, 5, 10],
    "bagging__estimator__min_samples_split": [2, 3, 4, 5, 10]
}

gscv = GridSearchCV(my_pipeline, parameters, cv = 5, scoring='neg_mean_absolute_error')
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]

gscv_fitted.best_estimator_

In [244]:
print(f"Average MAE for Bagged Random Forest Regressor: {-1*np.mean(test_scores)}")

Average MAE for Bagged Random Forest Regressor: 2.164377346111111


In [243]:
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("bagging", BaggingRegressor(n_jobs=-1, estimator = RandomForestRegressor(ccp_alpha=.001, criterion="absolute_error", min_samples_leaf=2, min_samples_split=3), n_estimators = 25, random_state=3))
    ]
)

fitted_model = my_pipeline.fit(X_train, y_train)

y_pred = fitted_model.predict(X_test)

print(f"Mean Absolute Error: {mean_absolute_error(y_true = y_test, y_pred = y_pred)}")

Mean Absolute Error: 1.7135555555555555


## Stacking Model

In [191]:
# Stacking Regressor
ct = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False, handle_unknown="ignore", drop="first"), ["Day"]),
        ("standardize", StandardScaler(), ["Minutes of Sleep", "REM Sleep", "Deep Sleep", "Heart Rate Below Resting"])
    ]
)

estimators = [
    ("Linear Regression", LinearRegression()),
    ("DecisionTree", DecisionTreeRegressor()),
    # ("Elastic Net", ElasticNet()),
    ("RandomForest", RandomForestRegressor()),
    ("Ridge", Ridge()),
    ("ADABoost", AdaBoostRegressor()),
    # ("XGBoost", XGBRegressor())
]

my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("stacking", StackingRegressor(n_jobs= -1, estimators = estimators))
    ]
)

fitted_pipeline = my_pipeline.fit(X = X_train, y = y_train)

y_pred = fitted_pipeline.predict(X_test)

print(f"Mean Absolute Error: {mean_absolute_error(y_true = y_test, y_pred = y_pred)}")

Mean Absolute Error: 1.8805849798450875
