In [24]:
import vaex
import json
import pandas as pd

In [25]:
GARMIN_ACTIVITY_REGEX = "/projects/bdata/datasets/gatesfoundation/raw/COVID_Signals/*/all_other_datafiles/garmin/activities/*.parquet"
garmin_activity = vaex.open(GARMIN_ACTIVITY_REGEX)

In [26]:
list(garmin_activity.columns)

['id_participant_external',
 'startTimeInSeconds',
 'startTimeOffsetInSeconds',
 'activityType',
 'durationInSeconds',
 'averageHeartRateInBeatsPerMinute',
 'averageRunCadenceInStepsPerMinute',
 'averageSpeedInMetersPerSecond',
 'averagePaceInMinutesPerKilometer',
 'distanceInMeters',
 'maxBikeCadenceInRoundsPerMinute',
 'maxHeartRateInBeatsPerMinute',
 'maxPaceInMinutesPerKilometer',
 'maxRunCadenceInStepsPerMinute',
 'maxSpeedInMetersPerSecond',
 'numberOfActiveLengths',
 'steps',
 'totalElevationGainInMeters',
 'totalElevationLossInMeters']

In [27]:
garmin_activity["activityType"].value_counts()

WALKING                          9250
RUNNING                          4683
INDOOR_CARDIO                    1315
STRENGTH_TRAINING                 770
OTHER                             352
CYCLING                           294
TREADMILL_RUNNING                 252
YOGA                              238
ELLIPTICAL                        211
INDOOR_CYCLING                    125
VIRTUAL_RIDE                       85
ROAD_BIKING                        47
LAP_SWIMMING                       39
PILATES                            29
HIKING                             21
FITNESS_EQUIPMENT                  13
STAIR_CLIMBING                     11
INDOOR_ROWING                      10
TRAIL_RUNNING                       9
VIRTUAL_RUN                         9
WINTER_SPORTS                       4
RESORT_SKIING_SNOWBOARDING_WS       3
CASUAL_WALKING                      2
SNOW_SHOE_WS                        1
INDOOR_RUNNING                      1
RECUMBENT_CYCLING                   1
BREATHWORK  

In [28]:
len(garmin_activity)

17789

In [29]:
garmin_activity["id_participant_external"].nunique()

519

In [30]:
garmin_activity.column_names

['id_participant_external',
 'startTimeInSeconds',
 'startTimeOffsetInSeconds',
 'activityType',
 'durationInSeconds',
 'averageHeartRateInBeatsPerMinute',
 'averageRunCadenceInStepsPerMinute',
 'averageSpeedInMetersPerSecond',
 'averagePaceInMinutesPerKilometer',
 'distanceInMeters',
 'maxBikeCadenceInRoundsPerMinute',
 'maxHeartRateInBeatsPerMinute',
 'maxPaceInMinutesPerKilometer',
 'maxRunCadenceInStepsPerMinute',
 'maxSpeedInMetersPerSecond',
 'numberOfActiveLengths',
 'steps',
 'totalElevationGainInMeters',
 'totalElevationLossInMeters']

In [31]:
GARMIN_SLEEPS_REGEX = "/projects/bdata/datasets/gatesfoundation/raw/COVID_Signals/*/all_other_datafiles/garmin/sleeps/*.parquet"
garmin_sleeps = vaex.open(GARMIN_SLEEPS_REGEX)

In [32]:
garmin_sleeps.column_names

['id_participant_external',
 'calendarDate',
 'startTimeInSeconds',
 'startTimeOffsetInSeconds',
 'durationInSeconds',
 'unmeasurableSleepInSeconds',
 'deepSleepDurationInSeconds',
 'lightSleepDurationInSeconds',
 'remSleepInSeconds',
 'awakeDurationInSeconds',
 'sleepLevelsMap',
 'validation',
 'timeOffsetSleepRespiration',
 'timeOffsetSleepSpo2']

In [33]:
garmin_sleeps = garmin_sleeps.to_pandas_df()

In [34]:
# Based off of the FitBit "classic" sleep api: https://dev.fitbit.com/build/reference/web-api/sleep/get-sleep-log-by-date/
GARMIN_SLEEP_STAGE_MAP={
    "light": 1,
    "deep":2,
    "rem":2,
    "awake":3
}

In [35]:
garmin_sleeps["sleepLevelsMap"] = garmin_sleeps["sleepLevelsMap"].map(json.loads)

In [36]:
exploded_sleep = (pd.melt(
    pd.concat([garmin_sleeps,pd.json_normalize(garmin_sleeps["sleepLevelsMap"])],axis=1),
    id_vars=["id_participant_external","startTimeOffsetInSeconds"],
    value_vars=["awake","light","rem","deep"],
    var_name="sleep_type"
    ).explode("value").dropna(subset=["value"])
)


In [37]:
processed_sleep = pd.concat([exploded_sleep.drop(columns="value").reset_index(drop=True),pd.json_normalize(exploded_sleep["value"])],axis=1)

In [38]:
processed_sleep["endTime"] = pd.to_datetime(processed_sleep["endTimeInSeconds"], unit="s") + pd.to_timedelta(processed_sleep["startTimeOffsetInSeconds"])
processed_sleep["startTime"] = pd.to_datetime(processed_sleep["startTimeInSeconds"], unit="s") + pd.to_timedelta(processed_sleep["startTimeOffsetInSeconds"])
processed_sleep["sleep_classic"] = processed_sleep["sleep_type"].map(GARMIN_SLEEP_STAGE_MAP) 
processed_sleep["id_participant_external"] = processed_sleep["id_participant_external"].astype("category") 
processed_sleep["durationInSeconds"] = (processed_sleep["endTime"] - processed_sleep["startTime"]).dt.seconds
processed_sleep["timestamp"] = processed_sleep["startTime"] 
processed_sleep = processed_sleep.dropna().set_index("id_participant_external")

In [39]:
processed_sleep.head(10)

Unnamed: 0_level_0,startTimeOffsetInSeconds,sleep_type,endTimeInSeconds,startTimeInSeconds,endTime,startTime,sleep_classic,durationInSeconds,timestamp
id_participant_external,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
108757,-21600,awake,1608625680,1608625320,2020-12-22 08:27:59.999978400,2020-12-22 08:21:59.999978400,3,360,2020-12-22 08:21:59.999978400
108757,-21600,awake,1608630420,1608630360,2020-12-22 09:46:59.999978400,2020-12-22 09:45:59.999978400,3,60,2020-12-22 09:45:59.999978400
108757,-21600,awake,1608630960,1608630540,2020-12-22 09:55:59.999978400,2020-12-22 09:48:59.999978400,3,420,2020-12-22 09:48:59.999978400
943285,-18000,awake,1608620220,1608619980,2020-12-22 06:56:59.999982000,2020-12-22 06:52:59.999982000,3,240,2020-12-22 06:52:59.999982000
682052,-18000,awake,1608636780,1608636720,2020-12-22 11:32:59.999982000,2020-12-22 11:31:59.999982000,3,60,2020-12-22 11:31:59.999982000
604242,-21600,awake,1608551340,1608551280,2020-12-21 11:48:59.999978400,2020-12-21 11:47:59.999978400,3,60,2020-12-21 11:47:59.999978400
604242,-21600,awake,1608551520,1608551460,2020-12-21 11:51:59.999978400,2020-12-21 11:50:59.999978400,3,60,2020-12-21 11:50:59.999978400
604242,-21600,awake,1608551940,1608551760,2020-12-21 11:58:59.999978400,2020-12-21 11:55:59.999978400,3,180,2020-12-21 11:55:59.999978400
604242,-21600,awake,1608552660,1608552600,2020-12-21 12:10:59.999978400,2020-12-21 12:09:59.999978400,3,60,2020-12-21 12:09:59.999978400
56076,-25200,awake,1608568380,1608568320,2020-12-21 16:32:59.999974800,2020-12-21 16:31:59.999974800,3,60,2020-12-21 16:31:59.999974800


In [40]:
garmin_activity.column_names

['id_participant_external',
 'startTimeInSeconds',
 'startTimeOffsetInSeconds',
 'activityType',
 'durationInSeconds',
 'averageHeartRateInBeatsPerMinute',
 'averageRunCadenceInStepsPerMinute',
 'averageSpeedInMetersPerSecond',
 'averagePaceInMinutesPerKilometer',
 'distanceInMeters',
 'maxBikeCadenceInRoundsPerMinute',
 'maxHeartRateInBeatsPerMinute',
 'maxPaceInMinutesPerKilometer',
 'maxRunCadenceInStepsPerMinute',
 'maxSpeedInMetersPerSecond',
 'numberOfActiveLengths',
 'steps',
 'totalElevationGainInMeters',
 'totalElevationLossInMeters']

In [47]:
processed_heart_rate = garmin_activity[["id_participant_external","startTimeInSeconds","startTimeOffsetInSeconds",
                                        'averageHeartRateInBeatsPerMinute',"durationInSeconds"]].to_pandas_df()
processed_heart_rate = processed_heart_rate.dropna(subset=["durationInSeconds"])                                        
processed_heart_rate["timestamp"] = pd.to_datetime(processed_heart_rate["startTimeInSeconds"],unit="s") +\
                                    pd.to_timedelta(processed_heart_rate["startTimeOffsetInSeconds"].astype(int), unit="s")
processed_heart_rate["durationInSeconds"] = processed_heart_rate["durationInSeconds"].astype(int)
processed_heart_rate["id_participant_external"] = processed_heart_rate["id_participant_external"].astype("category") 
processed_heart_rate = processed_heart_rate.dropna().set_index("id_participant_external")
processed_heart_rate.head(10)

Unnamed: 0_level_0,startTimeInSeconds,startTimeOffsetInSeconds,averageHeartRateInBeatsPerMinute,durationInSeconds,timestamp
id_participant_external,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
139209,1608597574,-18000,134,735,2020-12-21 19:39:34
760451,1608556071,-21600,99,944,2020-12-21 07:07:51
139209,1608593758,-18000,107,555,2020-12-21 18:35:58
403728,1608565909,-21600,112,1934,2020-12-21 09:51:49
737470,1608546973,-18000,66,610,2020-12-21 05:36:13
831129,1608578714,-21600,102,1426,2020-12-21 13:25:14
917852,1608559880,-18000,69,953,2020-12-21 09:11:20
807214,1608614949,-25200,89,2711,2020-12-21 22:29:09
480902,1608595236,-21600,100,946,2020-12-21 18:00:36
350093,1608585841,-18000,101,1546,2020-12-21 16:24:01


In [48]:
processed_steps = garmin_activity[["id_participant_external","startTimeInSeconds","startTimeOffsetInSeconds",
                                        'steps',"durationInSeconds"]].to_pandas_df()
processed_steps = processed_steps.dropna(subset=["durationInSeconds"])  
processed_steps["timestamp"] = pd.to_datetime(processed_steps["startTimeInSeconds"],unit="s") +\
                                    pd.to_timedelta(processed_steps["startTimeOffsetInSeconds"].astype(int), unit="s")
processed_steps["durationInSeconds"] = processed_steps["durationInSeconds"].astype(int)
processed_steps["id_participant_external"] = processed_steps["id_participant_external"].astype("category")
processed_steps = processed_steps.dropna().set_index("id_participant_external")
processed_steps.head(10)

Unnamed: 0_level_0,startTimeInSeconds,startTimeOffsetInSeconds,steps,durationInSeconds,timestamp
id_participant_external,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
139209,1608597574,-18000,40,735,2020-12-21 19:39:34
760451,1608556071,-21600,1704,944,2020-12-21 07:07:51
403728,1608565909,-21600,10,1934,2020-12-21 09:51:49
831129,1608578714,-21600,1590,1426,2020-12-21 13:25:14
917852,1608559880,-18000,86,953,2020-12-21 09:11:20
480902,1608595236,-21600,1434,946,2020-12-21 18:00:36
350093,1608585841,-18000,2470,1546,2020-12-21 16:24:01
861847,1608552241,-18000,1510,813,2020-12-21 07:04:01
202028,1608583933,-21600,984,778,2020-12-21 14:52:13
587800,1608534155,-18000,58,98,2020-12-21 02:02:35


In [43]:
from src.data.make_dataset import explode_str_column, get_new_index, safe_loc
from src.data.utils import  process_minute_level_pandas
from tqdm import tqdm

In [50]:
processed_out_path = "/projects/bdata/datasets/gatesfoundation/processed/COVID_Signals/garmin_minute_level_activity"
users_with_steps = processed_steps.index.unique()
all_results = []

for user in tqdm(users_with_steps.values):
    exploded_sleep = explode_str_column(safe_loc(processed_sleep,user),
                                target_col = "sleep_classic",
                                # rename_target_column="sleep_classic",
                                start_col="startTime",
                                dur_col = "durationInSeconds",
                                single_val=True,
                                dtype=pd.Int8Dtype())
    exploded_hr =  explode_str_column(safe_loc(processed_heart_rate,user),
                                        target_col = "averageHeartRateInBeatsPerMinute",
                                        rename_target_column="heart_rate",
                                        start_col="timestamp",
                                        single_val=True,
                                        dur_col = "durationInSeconds",
                                        dtype=pd.Int8Dtype())
    exploded_steps = explode_str_column(safe_loc(processed_steps,user),
                                        target_col = "steps",
                                        # rename_target_column="averageHeartRateInBeatsPerMinute",
                                        start_col="timestamp",
                                        single_val=True,
                                        dur_col = "durationInSeconds",
                                        dtype=pd.Int8Dtype())
    steps_and_hr = exploded_steps.join(exploded_hr,how = "left") 
    merged = steps_and_hr.join(exploded_sleep,how="left")                        

    
    processed = process_minute_level_pandas(minute_level_df = merged)

    # Keep datatypes in check
    processed["heart_rate"] = processed["heart_rate"].astype(pd.Int16Dtype())
    processed["participant_id"] = user
    all_results.append(processed)

all_results = pd.concat(all_results)
all_results["sleep_classic_0"] = all_results["sleep_classic_0"].fillna(False)
all_results["sleep_classic_1"] = all_results["sleep_classic_1"].fillna(False)
all_results["sleep_classic_2"] = all_results["sleep_classic_2"].fillna(False)
all_results["sleep_classic_3"] = all_results["sleep_classic_3"].fillna(False)

all_results.to_parquet(path = processed_out_path, partition_cols=["date"], engine="fastparquet")

  new_index = pd.date_range(start_ts,end_ts,freq=freq,closed="left").values
  new_index = pd.date_range(start_ts,end_ts,freq=freq,closed="left").values
  new_index = pd.date_range(start_ts,end_ts,freq=freq,closed="left").values
  new_index = pd.date_range(start_ts,end_ts,freq=freq,closed="left").values
  new_index = pd.date_range(start_ts,end_ts,freq=freq,closed="left").values
  new_index = pd.date_range(start_ts,end_ts,freq=freq,closed="left").values
  new_index = pd.date_range(start_ts,end_ts,freq=freq,closed="left").values
  new_index = pd.date_range(start_ts,end_ts,freq=freq,closed="left").values
  new_index = pd.date_range(start_ts,end_ts,freq=freq,closed="left").values
  new_index = pd.date_range(start_ts,end_ts,freq=freq,closed="left").values
  new_index = pd.date_range(start_ts,end_ts,freq=freq,closed="left").values
  new_index = pd.date_range(start_ts,end_ts,freq=freq,closed="left").values
  new_index = pd.date_range(start_ts,end_ts,freq=freq,closed="left").values
  new_index 

In [45]:
processed_heart_rate.dtypes

startTimeInSeconds                          object
startTimeOffsetInSeconds                    object
averageHeartRateInBeatsPerMinute            object
durationInSeconds                           object
timestamp                           datetime64[ns]
dtype: object

In [52]:
processed_sleep["timestamp"].median()

Timestamp('2021-01-23 07:51:59.999978496')