In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np

In [None]:
# Load the sleep export and keep only rows whose GENDER is 'male' or 'female'.
df = pd.read_csv("working_data/mhs_sleep_ch.csv")
df = df[df['GENDER'].isin(['male', 'female'])]

# Drop the columns that are not used in the rest of this notebook.
# GENDER is dropped last on purpose: it is still needed for the row filter above.
df = df.drop(
    columns=[
        "AGE",
        "COUNTRY",
        "FITNESS_LEVEL",
        "WHOOP_HEIGHT",
        "WHOOP_WEIGHT",
        "WHOOP_BMI",
        'SLEEP_START_LOCAL',
        'SLEEP_END_LOCAL',
        'GENDER',
    ]
)

# tolist has to be *called*; printing the bare attribute only shows the
# bound method's repr instead of the remaining column names.
print(df.columns.tolist())
print(len(df))

In [None]:
# Remove every row that has at least one missing value and report how many rows remain.
df = df.dropna()
print(len(df))

# variables to drop because they don't seem necessary or reliable
print(len(df.columns))  # column count before the drop
df = df.drop(
    columns=[
        "SLEEP_PERFORMANCE_SCORE",
        "TIME_IN_BED_MINUTES",
        "SLEEP_DEBT_MINUTES",
        "SLEEP_NEED_MINUTES",
        "RECOVERY_SCORE",
        "SCALED_DAY_STRAIN",
        "DAY_STRAIN",
    ]
)
print(len(df.columns))  # column count after the drop

# tolist has to be *called*; printing the bare attribute only shows the
# bound method's repr instead of the remaining column names.
print(df.columns.tolist())

In [None]:
# logarithmic histograms to identify outliers

# The first two columns are skipped; every column after them gets its own figure.
columns = df.columns[2:]

for name in columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    # log=True puts the count axis on a log scale so sparsely populated bins stay visible
    df[name].dropna().hist(ax=ax, bins=30, edgecolor='black', log=True)
    ax.set_title(f'Histogram of {name}')
    ax.set_xlabel(name)
    ax.set_ylabel('Log Frequency')
    ax.grid(True)
    fig.tight_layout()
    plt.show()
    # release the figure so the loop does not accumulate open figures
    plt.close(fig)

In [None]:
# linear histograms to identify outliers

# The first two columns are skipped; every column after them gets its own figure.
columns = df.columns[2:]

for name in columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    df[name].dropna().hist(ax=ax, bins=30, edgecolor='black', log=False)
    # cap the count axis at 500 so tall bars are clipped and low-count bins remain readable
    ax.set_ylim(top=500)
    ax.set_title(f'Histogram of {name}')
    ax.set_xlabel(name)
    ax.set_ylabel('Frequency')
    ax.grid(True)
    fig.tight_layout()
    plt.show()
    # release the figure so the loop does not accumulate open figures
    plt.close(fig)

In [None]:
# define bounds by inspecting the histograms

# Maps a column name to its accepted (lower, upper) limits.
# Insertion order is kept identical to the original listing.
bounds = dict(
    # sleep onset, heart-rate and HRV readings
    SLEEP_LATENCY=(0, 75),
    RESTING_HEART_RATE=(30, 100),
    HRV=(10, 200),
    # overall sleep quantity / efficiency
    SLEEP_EFFICIENCY=(45, 100),
    MINUTES_OF_SLEEP=(0, 750),
    # per-stage durations and their *_PERCENT counterparts
    LIGHT_SLEEP_DURATION_MINUTES=(0, 550),
    LIGHT_SLEEP_PERCENT=(0, 1),
    REM_SLEEP_DURATION_MINUTES=(0, 290),
    REM_SLEEP_PERCENT=(0, 1),
    SLOW_WAVE_SLEEP_DURATION_MINUTES=(0, 190),
    SLOW_WAVE_SLEEP_PERCENT=(0, 1),
    WAKE_DURATION_MINUTES=(0, 300),
    WAKE_DURATION_PERCENT=(0, 1),
    RESTORATIVE_SLEEP_MINUTES=(0, 440),
    RESTORATIVE_SLEEP_PERCENT=(0, 1),
    # interruptions during sleep
    DISTURBANCES=(0, 35),
    AROUSAL_TIME_MINUTES=(0, 80),
    # daytime metrics
    CALORIES_BURNED=(0, 7200),
    DAY_AVG_HEART_RATE=(42, 105),
    DAY_MAX_HEART_RATE=(80, 210),
)

In [None]:
# remove datapoints that are out of bounds

# Narrow df one bounded column at a time; a row survives only if it passes every check.
for name, (lower, upper) in bounds.items():
    if name not in df.columns:
        # a bound for a column that is not in df is simply ignored
        continue
    # between() includes both end points, i.e. lower <= value <= upper
    within = df[name].between(lower, upper)
    df = df[within]

In [None]:
# number of rows left after the out-of-bounds filter
print(df.shape[0])

In [None]:
# check outlier-free data

# The first two columns are skipped; every column after them gets its own figure.
columns = df.columns[2:]

for name in columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    # log=True puts the count axis on a log scale so sparsely populated bins stay visible
    df[name].dropna().hist(ax=ax, bins=30, edgecolor='black', log=True)
    ax.set_title(f'Histogram of {name}')
    ax.set_xlabel(name)
    ax.set_ylabel('Log Frequency')
    ax.grid(True)
    fig.tight_layout()
    plt.show()
    # release the figure so the loop does not accumulate open figures
    plt.close(fig)

In [None]:
# Write the filtered frame to disk; index=False leaves the row index out of the file.
df.to_csv(
    path_or_buf="working_data/mhs_sleep_ch_without_outliers.csv",
    index=False,
)