In [2]:
%load_ext autoreload
%autoreload 2
import pandas as pd

Sleep records use one of two label sets:

- **classic**: `restless | asleep | awake`
- **stages**: `deep | light | rem | wake`

In [3]:
# Processing Sleep
# Both label sets (classic and stages) are collapsed onto one integer scale:
#   1 = restless / light, 2 = asleep / deep / rem, 3 = awake / wake.
# Any stage value not listed here becomes NaN in `sleep_classic`.
stages_map = {"restless":1,
              "light":1,
              "asleep":2,
              "deep":2,
              "rem":2,
              "awake":3,
              "wake":3}
sleep = pd.read_csv("/projects/bdata/datasets/covid-fitbit/processed/all_sleep.csv")
sleep["sleep_classic"] = sleep["stage"].map(stages_map)
sleep["participant_id"] = sleep["user"]

# Sort on the full-resolution time *before* flooring, and use a stable sort.
# Flooring first (or using the default quicksort) makes the order of records
# that share a minute arbitrary, so keep="last" below would not reliably keep
# the latest record of that minute.
sleep["datetime"] = pd.to_datetime(sleep["datetime"])
sleep = sleep.sort_values("datetime", kind="mergesort")
sleep["timestamp"] = sleep["datetime"].dt.floor("1min")

# At most one row per participant and minute: the latest one wins.
sleep = sleep.drop_duplicates(subset=["participant_id", "timestamp"], keep="last")
sleep = sleep[["participant_id","timestamp","sleep_classic","stage_duration"]].set_index("participant_id")

In [4]:
# Sanity check: the largest number of rows sharing one
# (participant_id, timestamp) key; 1 means the de-duplication worked.
rows_per_minute = sleep.groupby(["participant_id", "timestamp"]).size()
rows_per_minute.max()

1

In [5]:
# Load step counts, average any rows that share the same (user, datetime),
# truncate the mean to an integer, and index the result by participant.
steps = (
    pd.read_csv("/projects/bdata/datasets/covid-fitbit/processed/all_steps.csv")
    .assign(datetime=lambda raw: pd.to_datetime(raw["datetime"]))
    .groupby(["user", "datetime"])["steps"]
    .mean()
    .astype(int)
    .reset_index()
    .rename(columns={"datetime": "timestamp", "user": "participant_id"})
    .set_index("participant_id")
)

In [6]:
# Load heart-rate readings, bucket them into whole minutes, and reduce each
# (user, minute) bucket to the integer-truncated mean before indexing the
# result by participant.
heart_rate = (
    pd.read_csv("/projects/bdata/datasets/covid-fitbit/processed/all_hr.csv")
    .assign(datetime=lambda raw: pd.to_datetime(raw["datetime"]).dt.floor("1min"))
    .groupby(["user", "datetime"])["heartrate"]
    .mean()
    .astype(int)
    .reset_index()
    .rename(columns={"datetime": "timestamp", "user": "participant_id", "heartrate": "heart_rate"})
    .set_index("participant_id")
)

In [7]:

from src.data.make_dataset import explode_str_column, safe_loc, get_new_index
from src.data.utils import process_minute_level_pandas
# Build one minute-level frame per participant that has step data, then stack
# them. Steps are the left side of both joins, so only timestamps present in
# the step data are kept; heart rate and sleep are attached where available.
users_with_steps = steps.index.unique()
per_user_frames = []
for user in users_with_steps.values:
    # NOTE(review): presumably expands each sleep record over its
    # `stage_duration` into a timestamp-indexed frame - confirm against
    # src.data.make_dataset.explode_str_column.
    user_sleep = explode_str_column(
        safe_loc(sleep, user),
        target_col="sleep_classic",
        rename_target_column="sleep_classic",
        start_col="timestamp",
        dur_col="stage_duration",
        dtype=pd.Int8Dtype(),
        single_val=True,
    )
    user_hr = safe_loc(heart_rate, user).set_index("timestamp")
    user_steps = safe_loc(steps, user).set_index("timestamp")

    merged = user_steps.join(user_hr, how="left").join(user_sleep, how="left")
    processed = process_minute_level_pandas(minute_level_df=merged)

    # Keep datatypes in check: nullable Int16 keeps missing heart rates as <NA>.
    processed["heart_rate"] = processed["heart_rate"].astype(pd.Int16Dtype())
    processed["participant_id"] = user
    per_user_frames.append(processed)

all_results = pd.concat(per_user_frames)

# The sleep_classic_N columns are expected in the output of
# process_minute_level_pandas; any gaps left in them are filled with False.
for flag_col in ("sleep_classic_0", "sleep_classic_1", "sleep_classic_2", "sleep_classic_3"):
    all_results[flag_col] = all_results[flag_col].fillna(False)


In [8]:
# Double check that "awake" isn't getting treated as missing
sleep_class_0_counts = all_results["sleep_classic_0"].value_counts()
sleep_class_0_counts

True     11900292
False     2387696
Name: sleep_classic_0, dtype: int64

In [9]:
from src.data.utils import read_parquet_to_pandas, load_processed_table
# Load the previously processed minute-level table so its sleep_classic_0
# distribution can be compared with the one computed for `all_results`.
processed_original = read_parquet_to_pandas(
    "/homes/gws/mikeam/seattleflustudy/data/processed/processed_fitbit_minute_level_activity"
)
original_class_0_counts = processed_original["sleep_classic_0"].value_counts()
original_class_0_counts

True     579535620
False    272728228
Name: sleep_classic_0, dtype: int64

In [13]:
# Persist the merged minute-level table as a parquet dataset partitioned by
# participant (one sub-directory per participant_id).
# NOTE(review): re-running this cell against an existing output directory may
# add files next to the old ones instead of replacing them, depending on the
# parquet engine/version - confirm before re-running.
all_results.to_parquet("/projects/bdata/datasets/covid-fitbit/processed/minute_level_fitbit", partition_cols=["participant_id"])

We'll also process the labels:

In [41]:
# Load the per-participant label table. The code below reads the columns
# "ParticipantID", "covid_diagnosis_dates", "Symptom_dates" and "recovery_dates".
labels = pd.read_csv("/projects/bdata/datasets/covid-fitbit/processed/covid_dates.csv")
def ts_string_to_ts(x):
    """Turn the string repr of a Python literal containing timestamps back into objects.

    The input is expected to look like ``"[Timestamp('2023-12-11 00:00:00'), NaT]"``.
    Each ``Timestamp(...)`` call is parsed with ``pd.to_datetime`` and every
    ``NaT`` becomes ``None`` (so it can later be removed with ``dropna``).

    Parameters
    ----------
    x : str
        Expression to evaluate.

    Returns
    -------
    object
        Whatever the expression evaluates to (typically a list).

    Raises
    ------
    NameError
        If the expression references any name other than ``Timestamp``/``NaT``.
    """
    # SECURITY: eval() executes arbitrary expressions, so only use this on
    # trusted files. Builtins and module globals are withheld so the expression
    # can reach nothing beyond the two names it needs. This narrows the attack
    # surface but is not a sandbox.
    allowed_names = {
        "Timestamp": lambda value: pd.to_datetime(value),
        "NaT": None,
    }
    return eval(x, {"__builtins__": {}}, allowed_names)

# Parse each stringified date column with ts_string_to_ts, then switch to the
# participant_id naming used by the sensor tables.
date_list_columns = ["covid_diagnosis_dates", "Symptom_dates", "recovery_dates"]
parsed_columns = {name: labels[name].map(ts_string_to_ts) for name in date_list_columns}
labels = labels.assign(**parsed_columns).rename(columns={"ParticipantID": "participant_id"})

In [43]:
# Write one CSV per date column, with one row per (participant, date).
# NOTE(review): dropna() with no subset drops a row when *any* column is null,
# not only the exploded one - confirm that is intended.
exploded_label_outputs = {
    "covid_diagnosis_dates": "/projects/bdata/datasets/covid-fitbit/processed/covid_diagnosis_dates.csv",
    "Symptom_dates": "/projects/bdata/datasets/covid-fitbit/processed/Symptom_dates.csv",
    "recovery_dates": "/projects/bdata/datasets/covid-fitbit/processed/recovery_dates.csv",
}
for date_col, csv_path in exploded_label_outputs.items():
    labels.explode(date_col).dropna().to_csv(csv_path)

In [46]:
# One row per (participant, diagnosis date); rows containing any null value
# after the explode are dropped.
covid = labels.explode("covid_diagnosis_dates").dropna()

In [48]:
# Lookup keyed by (participant_id, diagnosis date); every key that occurs in
# `covid` maps to True.
diagnosis_counts = covid.groupby(["participant_id", "covid_diagnosis_dates"]).size()
(diagnosis_counts > 0).to_dict()

{('A0NVTRV', Timestamp('2023-12-11 00:00:00')): True,
 ('A0VFT1N', Timestamp('2023-10-16 00:00:00')): True,
 ('A1K5DRI', Timestamp('2028-06-21 00:00:00')): True,
 ('A1ZJ41O', Timestamp('2027-08-09 00:00:00')): True,
 ('A1ZJ41O', Timestamp('2027-08-10 00:00:00')): True,
 ('A36HR6Y', Timestamp('2023-04-16 00:00:00')): True,
 ('A3OU183', Timestamp('2024-11-27 00:00:00')): True,
 ('A4E0D03', Timestamp('2028-05-18 00:00:00')): True,
 ('A4G0044', Timestamp('2027-03-08 00:00:00')): True,
 ('A7EM0B6', Timestamp('2023-12-21 00:00:00')): True,
 ('A7EM0B6', Timestamp('2023-12-26 00:00:00')): True,
 ('A7EM0B6', Timestamp('2023-12-31 00:00:00')): True,
 ('AA2KP1S', Timestamp('2025-01-11 00:00:00')): True,
 ('AAXAA7Z', Timestamp('2023-04-13 00:00:00')): True,
 ('AFPB8J2', Timestamp('2026-07-17 00:00:00')): True,
 ('AHYIJDV', Timestamp('2025-01-22 00:00:00')): True,
 ('AIFDJZB', Timestamp('2023-12-20 00:00:00')): True,
 ('AJMQUVV', Timestamp('2024-09-06 00:00:00')): True,
 ('AJWW3IY', Timestamp('2024