In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import umap.umap_ as umap
import seaborn as sns
import matplotlib.colors as mcolors
import gc, numpy as np
from scipy.stats import median_abs_deviation

In [None]:
# Load the weekly sleep table. Later cells read USER_ID, WEEK_START and the
# day-suffixed columns (e.g. *_Mon ... *_Sun) from this frame.
df = pd.read_csv("working_data/mhs_sleep_weekly.csv")

In [None]:
# Discard the per-day weekday-label and calendar-date columns; they are not
# used by the feature computations below.
df = df.drop(
    columns=[
        f"{prefix}_{day}"
        for day in ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')
        for prefix in ('WEEKDAY', 'DATE')
    ]
)

In [None]:
# Build per-week summary features for every day-suffixed metric:
# MAD, TREND, MAXDROP, VAR, MEAN, MSSD, MAXSTREAK and CV.

days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
all_columns = df.columns

# Match on "_<day>" rather than the bare day name so a column that merely ends
# in e.g. "Sun" is not mistaken for a daily metric. The names are sorted so the
# generated feature columns come out in the same order on every run (iteration
# order of a set of strings changes between interpreter sessions).
day_suffixes = tuple(f"_{day}" for day in days)
metric_names = sorted({col.rsplit('_', 1)[0]
                       for col in all_columns
                       if col.endswith(day_suffixes)})

# Downcast numeric columns to float32 to save memory, but leave the key columns
# untouched: float32 represents integers exactly only up to 2**24, so a numeric
# USER_ID could otherwise be silently altered before it is copied to df_week.
key_cols = ['USER_ID', 'WEEK_START']
num_cols = df.select_dtypes('number').columns.drop(key_cols, errors='ignore')
df[num_cols] = df[num_cols].astype(np.float32)

df_week = df[key_cols].copy()

# Centred day index for the closed-form least-squares slope of a straight line:
#   slope = sum((x - mean(x)) * y) / sum((x - mean(x)) ** 2)
# This gives the same value as fitting a degree-1 polynomial per row, without a
# Python-level call for every week.
day_idx = np.arange(len(days), dtype=np.float64)
day_centered = day_idx - day_idx.mean()
slope_denom = np.square(day_centered).sum()

for m_idx, metric in enumerate(metric_names, start=1):
    week_cols = [f"{metric}_{day}" for day in days]
    # One row per week, one column per day (Mon..Sun).
    vals = df[week_cols].to_numpy(np.float32)

    mu  = vals.mean(1)
    dif = np.diff(vals, axis=1)      # day-to-day change
    abs_dif = np.abs(dif)

    df_week[f"{metric}_MAD"] = median_abs_deviation(vals, axis=1)
    # linear trend across the week (slope per day)
    df_week[f"{metric}_TREND"] = (vals.astype(np.float64) @ day_centered) / slope_denom
    # most negative day-to-day change (positive if the metric never dropped)
    df_week[f"{metric}_MAXDROP"] = dif.min(1)
    df_week[f"{metric}_VAR"] = vals.var(1, ddof=1)
    df_week[f"{metric}_MEAN"] = mu
    # mean absolute successive difference
    df_week[f"{metric}_MSSD"] = abs_dif.mean(1)

    # longest streak of nights with +/- 10 % of weekly mean
    # The tolerance uses |mean| so metrics with a negative weekly mean still
    # get a positive band instead of one that no night can satisfy.
    flags = np.abs(vals - mu[:, None]) < (0.10 * np.abs(mu)[:, None])
    def longest_true_run(a):
        """Return, for each row of a 2-D boolean array, the longest run of True.

        Parameters
        ----------
        a : array-like of shape (n_rows, n_cols)
            Flags to scan; values are interpreted as booleans.

        Returns
        -------
        numpy.ndarray of int8, shape (n_rows,)
            Length of the longest consecutive stretch of True values in each
            row (0 when the row has none). int8 is kept for compatibility with
            the previous return dtype, so rows are expected to have at most
            127 columns.

        Notes
        -----
        The previous implementation took ``np.diff`` of a boolean array, which
        yields booleans (element-wise "not equal") rather than +1/-1 steps, so
        no run end was ever detected and every row came back as 0.
        """
        a = np.asarray(a, dtype=bool)
        best = np.zeros(a.shape[0], dtype=np.int8)
        current = np.zeros(a.shape[0], dtype=np.int8)
        # Walk the columns left to right, carrying each row's running streak:
        # it grows by one on True and resets to zero on False.
        for col in a.T:
            current = np.where(col, current + 1, 0).astype(np.int8, copy=False)
            best = np.maximum(best, current)
        return best

    # Per-week length of the longest run of True flags, as reported by
    # longest_true_run.
    df_week[f"{metric}_MAXSTREAK"] = longest_true_run(flags)

    # Coefficient of variation: sample standard deviation over the weekly
    # mean, with a small constant added to the denominator.
    df_week[f"{metric}_CV"] = np.std(vals, axis=1, ddof=1) / (mu + 1e-6)

    # Drop the per-metric temporaries before moving on to the next metric.
    del vals, dif, abs_dif, flags
    gc.collect()

    print(f"processed {m_idx}/{len(metric_names)} metrics")
    
    
# stage entropy
# Sleep-stage percentage columns; each exists once per day suffix.
stages = [
    "REM_SLEEP_PERCENT",
    "SLOW_WAVE_SLEEP_PERCENT",
    "WAKE_DURATION_PERCENT",
    "LIGHT_SLEEP_PERCENT",
]

def stage_entropy(row):
    """Shannon entropy, in bits, of one night's sleep-stage percentages.

    Parameters
    ----------
    row : pandas.Series or numpy.ndarray
        Stage percentages for a single night. They are renormalised to sum
        to one, so they do not need to add up to exactly 100.

    Returns
    -------
    float
        Entropy in bits, or NaN when the values do not sum to a positive,
        finite total (for example when every stage is missing or zero).
        Previously such a night produced 0/0 and, because pandas reductions
        skip NaN, came back as -0.0, which is indistinguishable from a real
        zero-entropy night and biases the weekly mean.

    Notes
    -----
    When ``row`` is a pandas Series, its ``sum`` skips missing entries, so a
    night with only some stages missing is scored on the remaining ones.
    """
    p = row / 100.0
    total = p.sum()
    # `not (total > 0)` also catches NaN totals.
    if not (np.isfinite(total) and total > 0):
        return np.nan
    # The small offset keeps log2 defined for stages with a zero share.
    p = p / total + 1e-12
    return -(p * np.log2(p)).sum()

# One entropy column per day, computed row by row from that day's four
# stage-percentage columns.
ent_cols = [f"ENT_{d}" for d in days]
for d, colname in zip(days, ent_cols):
    day_stage_cols = [f"{s}_{d}" for s in stages]
    df_week[colname] = df[day_stage_cols].apply(stage_entropy, axis=1)
    print(f"processed {d} entropy")

# Weekly entropy: row-wise mean of the seven daily entropy columns.
df_week["ENT_MEAN"] = df_week[ent_cols].mean(axis=1)

In [None]:
# Toggle: keep the seven per-day entropy columns, or only the weekly mean.

use_daily_entropy = False

if not use_daily_entropy:
    df_week = df_week.drop(
        columns=[f"ENT_{day}" for day in ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')]
    )

In [None]:
# Column layout for the export: the two key columns first, then the MSSD /
# MAXSTREAK / CV features, then every remaining feature in its current order.
priority_cols = [c for c in df_week.columns
                 if c.endswith(('_MSSD', '_MAXSTREAK', '_CV'))]
other_cols = [
    c for c in df_week.columns
    if c not in ('USER_ID', 'WEEK_START') and c not in priority_cols
]

df_week = df_week[['USER_ID', 'WEEK_START', *priority_cols, *other_cols]]

# Write the feature table without the pandas index.
df_week.to_csv("working_data/mhs_sleep_weekly_features.csv", index=False)