# Metabolic Health ARDS Permissions

Prior to running the notebook, please request [AoD to Metabolic Health ARDS](https://grants.corp.google.com/#/grants?request=20h%2Fchr-ards-metabolichealth-deid-eng-team-sphinx:r&reason=b%2F283774208).

In [None]:
# @title Kernel Reproducibility
# Records the name of the kernel this notebook is meant to run on and calls
# colabtools' kernelinfo helper to show the kernel info links.
from colabtools import kernelinfo
# Kernel name kept for reference; it is not read by any other visible cell.
_KERNEL = "Ranklab (PY3) Metabolic Health Colab"
kernelinfo.show_info_links()

# Imports

In [None]:
import csv
import pathlib
import tarfile
import matplotlib.pyplot as plt
import pandas as pd
from google3.pyglib import gfile
from colabtools import googlefiles
import fnmatch
import json
import seaborn as sns
%matplotlib inline
import random
from ast import literal_eval
import datetime
pd.set_option('display.max_columns', None)
from scipy import stats
import statsmodels.api as sm
from scipy.stats import zscore
from statsmodels.graphics.gofplots import qqplot
import random
from scipy.stats import shapiro

from collections.abc import Sequence
import multiprocessing.pool
import os
from absl import app
from google3.pyglib import gfile
from time import sleep, time

import multiprocessing
import pdb
from google3.pyglib import gfile
# This option may not work if we are in a different runtime!
gfile.LEGACY_GROUP_WRITABLE_WORLD_READABLE
from time import sleep, time

import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf

# Per-feature normalization constants, keyed by feature column name.
# Each value is a two-element list.
# NOTE(review): the pairs look like [mean, standard deviation] - confirm
# against the code that produced them before relying on that ordering.
NORMALIZATION_PARAMETERS = {
    'HR': [82.406911, 13.9461201],
    'eda_level_real': [4.116634, 3.878952961],
    'leads_contact_counts': [230.76297, 52.76303698],
    'steps': [7.952935, 18.53001124],
    'jerk_auto': [203.441044, 33.11101136],
    'step_count': [11.440943, 15.95296346],
    'log_energy': [60.306033, 42.84693899],
    'covariance': [44.81157, 12.63844836],
    'log_energy_ratio': [44.714925, 21.32527317],
    'zero_crossing_std': [160.085565, 28.10161215],
    'zero_crossing_avg': [51.270075, 34.04430198],
    'axis_mean': [119.768427, 23.58453469],
    'altim_std': [0.005178, 0.0581546286],
    'kurtosis': [108.645938, 60.38419486],
    'sleep_coefficient': [8.706734, 4.003582277],
    'wrist_temperatures': [30.921362, 2.817617692],
    'hrv_shannon_entropy_rr': [3.277522, 0.468409277],
    'hrv_shannon_entropy_rrd': [2.974838, 0.4999503109],
    'hrv_percentage_of_nn_30': [0.348379, 0.1961256813],
    'ceda_magnitude_real_micro_siemens': [43.071381, 24.11546345],
    'ceda_slope_real_micro_siemens': [3.294176, 1.828755314],
    'rmssd_percentile_0595': [34.038394, 24.86136018],
    'sdnn_percentile_0595': [44.233053, 25.04521794],
    'msa_probability': [48.120677, 14.23343678],
    'hrv_percent_good': [0.2716, 0.2760073968],
    'hrv_rr_80th_percentile_mean': [821.738396, 105.621134],
    'hrv_rr_20th_percentile_mean': [731.996986, 84.6384433],
    'hrv_rr_median': [776.111350, 90.3199562],
    'hrv_rr_mean': [781.280325, 87.08971004],
    'hr_at_rest_mean': [83.199721, 10.66796299],
    'skin_temperature_magnitude': [26.393339, 10.98900771],
    'skin_temperature_slope': [0.267523, 17.79474941],
}

# Ordered list of feature column names to include.
# The names and their order are the same as the keys of
# NORMALIZATION_PARAMETERS; keep the two in sync when editing either one.
FEATURES_TO_INCLUDE = [
    'HR',
    'eda_level_real',
    'leads_contact_counts',
    'steps',
    'jerk_auto',
    'step_count',
    'log_energy',
    'covariance',
    'log_energy_ratio',
    'zero_crossing_std',
    'zero_crossing_avg',
    'axis_mean',
    'altim_std',
    'kurtosis',
    'sleep_coefficient',
    'wrist_temperatures',
    'hrv_shannon_entropy_rr',
    'hrv_shannon_entropy_rrd',
    'hrv_percentage_of_nn_30',
    'ceda_magnitude_real_micro_siemens',
    'ceda_slope_real_micro_siemens',
    'rmssd_percentile_0595',
    'sdnn_percentile_0595',
    'msa_probability',
    'hrv_percent_good',
    'hrv_rr_80th_percentile_mean',
    'hrv_rr_20th_percentile_mean',
    'hrv_rr_median',
    'hrv_rr_mean',
    'hr_at_rest_mean',
    'skin_temperature_magnitude',
    'skin_temperature_slope'
    ]

In [None]:
# Root output directory for this notebook. The copied vico/Fitbit CSVs and the
# BY_SUBJECT_<type> folders are all written underneath it. The value ends with
# a trailing slash because later cells build file names by string
# concatenation (path_to_save_lsm_data + name).
path_to_save_lsm_data = ("/cns/yq-d/home/fitbit-medical-sandboxes/e=1:kid=76381"
                         ":mkey=cns-deid/chr-ards-metabolichealth/deid/exp/"
                         "mdb=chr-ards-metabolichealth-deid-exp-decrypt/"
                         "foundational_llm_research/large_sensor_model_data/")

In [None]:
# Extraction specs, one dict per raw Fitbit data type:
#   type                    short name used in BY_SUBJECT_<type> folder names
#   raw_file                base name of the raw CSV
#   features_to_extract     columns pulled out of the raw file
#   timezone_offset_column  column holding the timezone offset
data = [
    {
        'type': 'steps',
        'raw_file': 'STEPS_COMPACT_DATA',
        'features_to_extract': ['steps'],
        'timezone_offset_column': 'activity_tm_timezone_offset',
    },
    {
        'type': 'momentary_stress_algorithm',
        'raw_file': 'MOMENTARY_STRESS_ALGORITHM_DATA',
        'features_to_extract': [
            'hrv_shannon_entropy_rr',
            'hrv_shannon_entropy_rrd',
            'hrv_percentage_of_nn_30',
            'ceda_magnitude_real_micro_siemens',
            'ceda_slope_real_micro_siemens',
            'rmssd_percentile_0595',
            'sdnn_percentile_0595',
            'msa_probability',
            'hrv_percent_good',
            'hrv_rr_80th_percentile_mean',
            'hrv_rr_20th_percentile_mean',
            'hrv_rr_median',
            'hrv_rr_mean',
            'hr_at_rest_mean',
            'skin_temperature_magnitude',
            'skin_temperature_slope',
        ],
        'timezone_offset_column': 'activity_tm_timezone_offset',
    },
    {
        'type': 'ceda',
        'raw_file': 'CONTINUOUS_EDA_DATA',
        'features_to_extract': ['eda_level_real', 'leads_contact_counts'],
        'timezone_offset_column': 'activity_tm_timezone_offset',
    },
    {
        'type': 'wrist_temperature',
        'raw_file': 'WRIST_TEMPERATURE_DATA',
        'features_to_extract': ['wrist_temperatures'],
        'timezone_offset_column': 'tz_offset_minutes',
    },
    {
        'type': 'sleep_coefficient',
        'raw_file': 'SLEEP_COEFFICIENT_COMPACT_DATA',
        'features_to_extract': ['sleep_coefficient', 'is_on_wrist'],
        'timezone_offset_column': 'tz_offset_minutes',
    },
    {
        'type': 'spo2',
        'raw_file': 'ABSOLUTE_SPO2_DATA',
        'features_to_extract': ['value', 'confidence', 'coverage', 'valid'],
        'timezone_offset_column': 'activity_tm_timezone_offset',
    },
    {
        'type': 'grok',
        'raw_file': 'GROK_FEATURE_DATA',
        'features_to_extract': [
            'jerk_auto',
            'step_count',
            'log_energy',
            'covariance',
            'log_energy_ratio',
            'zero_crossing_std',
            'zero_crossing_avg',
            'axis_mean',
            'altim_std',
            'kurtosis',
        ],
        'timezone_offset_column': 'activity_tm_timezone_offset',
    },
    {
        'type': 'heart_rate',
        'raw_file': 'HEART_RATE_DATA',
        'features_to_extract': ['bpm', 'confidence'],
        'timezone_offset_column': 'activity_tm_timezone_offset',
    },
]

In [None]:
# Folder path for the weekly TFRecords (per the path name). It is only defined
# here; none of the cells visible in this part of the notebook read it.
tf_record_folder = "/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-metabolichealth/deid/exp/aliheydari/metabolic_tfrecords_weekly_v01"


# STEP 0: Move Data from Selected Snapshot

### Vico Surveys

In [None]:
# Create the output root directory and set its mode bits to 0o775 so the
# files written below are not restricted to the creating user.
gfile.MakeDirs(path_to_save_lsm_data)
gfile.SetMode(path_to_save_lsm_data, 0o775)

In [None]:
## Import survey and phone data from vico files:
vico_folder = pathlib.PurePosixPath('/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-metabolichealth/deid/raw/vico/marmara')
vico_files = gfile.ListDir(vico_folder)
with gfile.Open(vico_folder / vico_files[-1], 'rb') as f:
  members = tarfile.open(fileobj=f, mode='r:gz').getmembers()
survey_phone_data = {}
with gfile.Open(vico_folder / vico_files[-1], 'rb') as f:
  with tarfile.open(fileobj=f, mode='r:gz') as tf:
    members = tf.getmembers()
    for member in members:
      files_excluded = ['battery_status_signals.csv']
      if member.name in files_excluded:
        continue
      survey_phone_data[member.name] = (pd.read_csv(tf.extractfile(member), delimiter='\t'))

      with googlefiles.OpenGoogleFiles():
        with open(path_to_save_lsm_data+member.name, 'w') as fs:
          survey_phone_data[member.name].to_csv(fs)

### Processing Raw Fitbit Data

In [None]:
# List the Fitbit snapshot folders and the files inside the last listed one.
# NOTE(review): "last listed" is treated as the snapshot to use; this assumes
# gfile.ListDir returns the newest snapshot last - confirm.
fitbit_folder = pathlib.PurePosixPath('/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-metabolichealth/deid/raw/fitbit/marmara/')
fitbit_snapshots = gfile.ListDir(fitbit_folder)
fitbit_files = gfile.ListDir(fitbit_folder / fitbit_snapshots[-1])

In [None]:
# File names (including the .csv suffix) copied from the Fitbit snapshot when
# `need_all_fitbit_files` is False.
subset_we_need = ['MSA_WINDOWS_DATA.csv',
 'RUN_VO2_MAX_DATA.csv',
 'SLEEP_COEFFICIENT_COMPACT_DATA.csv',
 'SLEEP_SCORE_DATA.csv',
 'SLEEP_STAGE_COMPACT_DATA.csv',
 'STEPS_COMPACT_DATA.csv',
 'USER_DATA.csv',
 'USER_DEVICE_PAIRING_PERIOD_DATA.csv',
 'WEIGHT_DATA.csv',
 'WRIST_TEMPERATURE_DATA.csv']

# Colab form toggle: True copies every file in the snapshot, False copies only
# the files named in `subset_we_need`.
need_all_fitbit_files = False #@param [True, False] {type:"boolean"}

In [None]:
import time
start = time.time()
# Locate the snapshot to copy: the last entry returned for the Fitbit folder.
fitbit_folder = pathlib.PurePosixPath('/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-metabolichealth/deid/raw/fitbit/marmara/')
fitbit_snapshots = gfile.ListDir(fitbit_folder)
latest_snapshot_dir = fitbit_folder / fitbit_snapshots[-1]
fitbit_files = gfile.ListDir(latest_snapshot_dir)
#fitbit_files = fnmatch.filter(fitbit_files, 'DAILY_*')
#fitbit_files.append('STEPS_COMPACT_DATA.csv')
#fitbit_files.append('SLEEP_SCORE_DATA.csv')

# Loaded tables, keyed by the snapshot file name.
fitbit_content = {}
if not need_all_fitbit_files:
  print("Looping over a *SUBSET* of Fitbit files")
  fitbit_files_to_loop = subset_we_need
else:
  print("Looping over a *ALL* Fitbit files")
  fitbit_files_to_loop = fitbit_files

for data_type in fitbit_files_to_loop:
  with gfile.Open(latest_snapshot_dir / data_type, 'r') as f:
    fitbit_frame = pd.read_csv(f)
    fitbit_content[data_type] = fitbit_frame

    # `data_type` already ends in ".csv", so the saved copy is named
    # "<NAME>.csv.csv"; the per-subject split cell loads it under that name.
    with googlefiles.OpenGoogleFiles():
      with open(path_to_save_lsm_data+data_type+'.csv', 'w') as fs:
        fitbit_frame.to_csv(fs)

print(f"Total time took {time.time() - start} seconds")

#fitbit_content['STEPS_COMPACT_DATA.csv']['activity_time'] = pd.to_datetime(fitbit_content['STEPS_COMPACT_DATA.csv']['activity_time'])
#fitbit_content['STEPS_COMPACT_DATA.csv']['date'] = fitbit_content['STEPS_COMPACT_DATA.csv']['activity_time'].dt.date
#print(len(fitbit_content['STEPS_COMPACT_DATA.csv']))
#participant_timezone = fitbit_content['STEPS_COMPACT_DATA.csv'].drop_duplicates(subset=['participant_id','date'], keep='first')
#participant_timezone
#print(len(participant_timezone))

# STEP 1: Prepare Individual Participant Sessions

In [None]:
# Print, for each data type, its per-subject folder name and how many entries
# that folder currently holds.
for d in data:
  subject_dir_name = "BY_SUBJECT_"+d['type']
  subject_entries = gfile.ListDir(os.path.join(path_to_save_lsm_data, subject_dir_name))
  print(subject_dir_name, len(subject_entries))

In [None]:
# @title Listing data available in root path
# `root_folder` is the name later cells use for the output root; the ListDir
# call is the cell's last expression, so the notebook displays its result.
root_folder = path_to_save_lsm_data
gfile.ListDir(root_folder)

In [None]:
_TRUE_TOKENS = frozenset({'true', 't', 'yes', 'y', '1'})
_FALSE_TOKENS = frozenset({'false', 'f', 'no', 'n', '0', ''})


def _parse_bool_token(token):
  """Interprets one textual array element as a boolean."""
  cleaned = token.strip().strip('\'"').lower()
  if cleaned in _TRUE_TOKENS:
    return True
  if cleaned in _FALSE_TOKENS:
    return False
  try:
    # Numeric spellings such as "0.0" / "1.0".
    return float(cleaned) != 0
  except ValueError:
    # Unknown text: fall back to plain truthiness of the token.
    return bool(cleaned)


def get_arrays(row,column,type=float):
  """Parses a bracketed, comma-separated string cell into a pandas Series.

  The cell text is expected to look like "[v1,v2,...]": the first and last
  characters are dropped and the rest is split on commas.

  Args:
    row: Mapping-like record (e.g. a DataFrame row) holding the cell.
    column: Key of the cell to parse.
    type: `bool` to parse the elements as booleans; any other value parses
      them as floats. (The name shadows the builtin but is kept so existing
      keyword callers keep working.)

  Returns:
    A Series of bool or float values. If the float conversion fails, a message
    is printed and an empty list is returned instead.
  """
  list_of_strings = row[column]
  tokens = list_of_strings[1:-1].split(',')
  if type == bool:
    # Series.astype(bool) on strings marks every non-empty token as True
    # (including "False" and "0"), so each token is parsed explicitly.
    return pd.Series([_parse_bool_token(token) for token in tokens], dtype=bool)

  try:
    return pd.Series(tokens).astype(float)
  except (ValueError, TypeError):
    print(f"Could not convert column '{column}' to float in array conversion.")
    return []

In [None]:
# Set mode 0o775 on the BY_SUBJECT_ceda folder. The hard-coded path is the
# same string as path_to_save_lsm_data followed by "BY_SUBJECT_ceda/".
gfile.SetMode("/cns/yq-d/home/fitbit-medical-sandboxes/e=1:kid=76381:mkey=cns-deid/chr-ards-metabolichealth/deid/exp/mdb=chr-ards-metabolichealth-deid-exp-decrypt/foundational_llm_research/large_sensor_model_data/BY_SUBJECT_ceda/", 0o775)

In [None]:
# Extraction specs consumed by the per-subject split below. Each dict names the
# data type, the raw CSV base name, the columns to extract and the column that
# holds the timezone offset.
# These are already done
data = [
    {
        'type': 'heart_rate',
        'raw_file': 'HEART_RATE_DATA',
        'features_to_extract': ['bpm', 'confidence'],
        'timezone_offset_column': 'activity_tm_timezone_offset',
    },
    {
        'type': 'steps',
        'raw_file': 'STEPS_COMPACT_DATA',
        'features_to_extract': ['steps'],
        'timezone_offset_column': 'activity_tm_timezone_offset',
    },
    {
        'type': 'wrist_temperature',
        'raw_file': 'WRIST_TEMPERATURE_DATA',
        'features_to_extract': ['wrist_temperatures'],
        'timezone_offset_column': 'tz_offset_minutes',
    },
    {
        'type': 'sleep_coefficient',
        'raw_file': 'SLEEP_COEFFICIENT_COMPACT_DATA',
        'features_to_extract': ['sleep_coefficient', 'is_on_wrist'],
        'timezone_offset_column': 'tz_offset_minutes',
    },
    {
        'type': 'momentary_stress_algorithm',
        'raw_file': 'MOMENTARY_STRESS_ALGORITHM_DATA',
        'features_to_extract': [
            'hrv_shannon_entropy_rr',
            'hrv_shannon_entropy_rrd',
            'hrv_percentage_of_nn_30',
            'ceda_magnitude_real_micro_siemens',
            'ceda_slope_real_micro_siemens',
            'rmssd_percentile_0595',
            'sdnn_percentile_0595',
            'msa_probability',
            'hrv_percent_good',
            'hrv_rr_80th_percentile_mean',
            'hrv_rr_20th_percentile_mean',
            'hrv_rr_median',
            'hrv_rr_mean',
            'hr_at_rest_mean',
            'skin_temperature_magnitude',
            'skin_temperature_slope',
        ],
        'timezone_offset_column': 'activity_tm_timezone_offset',
    },
    {
        'type': 'ceda',
        'raw_file': 'CONTINUOUS_EDA_DATA',
        'features_to_extract': ['eda_level_real', 'leads_contact_counts'],
        'timezone_offset_column': 'activity_tm_timezone_offset',
    },
    {
        'type': 'spo2',
        'raw_file': 'ABSOLUTE_SPO2_DATA',
        'features_to_extract': ['value', 'confidence', 'coverage', 'valid'],
        'timezone_offset_column': 'activity_tm_timezone_offset',
    },
    {
        'type': 'grok',
        'raw_file': 'GROK_FEATURE_DATA',
        'features_to_extract': [
            'jerk_auto',
            'step_count',
            'log_energy',
            'covariance',
            'log_energy_ratio',
            'zero_crossing_std',
            'zero_crossing_avg',
            'axis_mean',
            'altim_std',
            'kurtosis',
        ],
        'timezone_offset_column': 'activity_tm_timezone_offset',
    },
]



In [None]:
# @title All Files

def split_save(L):
  """De-nests one participant's rows of the current raw file and saves a CSV.

  Besides its argument, this function reads three module-level names that the
  driver loop sets before calling it: `df` (the loaded raw file), `d` (the
  extraction spec for the current data type) and `target_dir` (the output
  folder).

  For 'spo2' the participant's rows are saved as they are, with
  `activity_time` parsed, `valid` cast to int and a zero
  `millis_from_start_time` column added. For every other type each raw row
  holds array-valued cells; they are expanded into one output row per sample,
  a local timestamp `DT` is computed, and only the extracted features plus
  `ID` and `DT` are kept.

  Args:
    L: The participant_id whose rows should be extracted from `df`.

  Returns:
    None. Writes `<target_dir>/<type>_<L>.csv`; if that file already exists
    nothing is recomputed.
  """
  output_path = os.path.join(target_dir,d['type']+"_"+str(L)+".csv")
  if gfile.Exists(output_path):
    print(L, ' Already exists.')
    return

  # Work on a copy: the assignments below must not write into a slice of the
  # shared `df`, which other worker threads are reading.
  df2 = df[df['participant_id']==L].copy()

  ## Loop through raw file and de-nest:
  if d['type'] == 'spo2':
    df2['activity_time'] = pd.to_datetime(df2['activity_time'], format='mixed')
    df2['valid'] = df2['valid'].astype('int')
    df2['millis_from_start_time'] = 0
    tmp = df2
  else:
    list_arrays = []
    for _, row in df2.iterrows():

      tmp = pd.DataFrame()

      # Per-sample offset, in milliseconds, from the row's activity_time.
      if d['type'] in ('steps', 'wrist_temperature', 'sleep_coefficient'):
        # One sample per minute of the day.
        tmp['millis_from_start_time'] = pd.Series(range(0,1440))*1000*60
      elif d['type'] == 'heart_rate':
        # One sample per second of the day.
        tmp['millis_from_start_time'] = pd.Series(range(0,60*60*24))*1000
      elif d['type'] == 'momentary_stress_algorithm':
        # 'offsets' are scaled as minutes here.
        tmp['millis_from_start_time'] = get_arrays(row,'offsets')*1000*60
      elif d['type'] == 'ceda':
        tmp['millis_from_start_time'] = get_arrays(row,'millis_from_start_time')
      elif d['type'] == 'grok':
        tmp['millis_from_start_time'] = get_arrays(row,'millis_from_start_of_day')

      for feature in d['features_to_extract']:
        if feature == 'is_on_wrist':
          tmp[feature] = get_arrays(row,feature,bool)
        else:
          tmp[feature] = get_arrays(row,feature)
      tmp['activity_time'] = row['activity_time']
      # Stamp the id and timezone offset per raw row, so each day keeps its own
      # offset instead of every day inheriting the offset of the last row.
      tmp['participant_id'] = row['participant_id']
      tmp['activity_tm_timezone_offset'] = row[d['timezone_offset_column']]
      list_arrays.append(tmp)

    tmp = pd.concat(list_arrays, ignore_index=True)

    ## Convert time to LOCAL:
    tmp['activity_time'] = pd.to_datetime(tmp['activity_time'])
    tz_offset = tmp['activity_tm_timezone_offset'].astype('timedelta64[m]')
    sample_offset = tmp['millis_from_start_time'].astype('timedelta64[ms]')

    if d['type'] == 'ceda':
      # The timezone offset is added twice for ceda only.
      # NOTE(review): presumably the ceda millis are measured from a different
      # reference point than the other types - confirm this is intentional.
      tmp['activity_time_local'] = tmp['activity_time'] + tz_offset + tz_offset + sample_offset
    else:
      tmp['activity_time_local'] = tmp['activity_time'] + tz_offset + sample_offset

    ## Rename columns:
    tmp.rename(columns={'activity_time_local': 'DT', 'participant_id': 'ID'}, inplace=True)

    cols = d['features_to_extract'].copy()
    cols.append('ID')
    cols.append('DT')
    tmp = tmp[cols]

  # Close the handle deterministically so the CSV is flushed before the
  # success message is printed.
  with gfile.Open(output_path, 'w') as out_file:
    tmp.to_csv(out_file)
  print(L, ' successfully saved.')

# Driver for split_save: for every data type, load its raw CSV and write one
# CSV per participant. `d`, `df` and `target_dir` are module-level names on
# purpose, because split_save reads them.
for d in data:

  print(d['type'])

  ## Load raw file:
  # The raw copies were saved with a doubled ".csv.csv" suffix.
  raw_csv_path = os.path.join(root_folder, d['raw_file']+'.csv.csv')
  with gfile.Open(raw_csv_path, 'r') as f:
    df = pd.read_csv(f)
  df.reset_index(inplace=True)

  if not len(df):
    continue

  ## Make output dir and save per subject files:
  target_dir = os.path.join(root_folder,"BY_SUBJECT_"+d['type'])
  if not gfile.Exists(target_dir):
    gfile.MakeDirs(target_dir)
    gfile.SetMode(target_dir, 0o775)
  else:
    # An existing folder is reused as is; split_save skips files already there.
    print(target_dir)

  WORKER_COUNT = 20
  L = pd.unique(df.participant_id)
  with multiprocessing.pool.ThreadPool(WORKER_COUNT) as pool:
    output = list(pool.map(split_save, L))
    pool.close()
    pool.join()

In [None]:
# Re-bind root_folder to the output root; the bare name on the last line makes
# the notebook display the path.
root_folder = path_to_save_lsm_data
root_folder

In [None]:
# For every data type: report how many entries its per-subject folder holds,
# then open up the folder's permissions.
for d in data:
  subject_dir = os.path.join(root_folder, "BY_SUBJECT_"+d['type'])
  print(d['type'], len(gfile.ListDir(subject_dir)))
  # Set permission so that they are globally viewable in ARDS
  gfile.SetMode(subject_dir, 0o775)

# STEP 2: Sessionize

In [None]:
# Root folder used by the sessionizing step; same location as the output root.
root_folder = path_to_save_lsm_data

In [None]:
#@title Data session class definition

import abc
import dataclasses
import functools as ft
import jaxtyping as jt
from scipy.stats import zscore

class Sensor(abc.ABC):
  """Base class of the sensor wrappers; holds the shared resampling helper."""

  @staticmethod
  def resample(timeseries_data, input_timestamp_units='s', output_timestamp_units='1min'):
    """Resamples a dataframe onto a regular time grid using its 't' column.

    The 't' column is converted with `pd.to_datetime` into a 'DT' column, 't'
    is dropped, and the remaining columns are averaged within each
    `output_timestamp_units` bin. The input dataframe is left unmodified.

    Args:
      timeseries_data: A pandas dataframe with a column 't' of timestamps to
        use for resampling.
      input_timestamp_units: Unit passed to `pd.to_datetime` for interpreting
        the values in 't'.
      output_timestamp_units: Frequency string of the output bins.

    Returns:
      A new pandas dataframe indexed by 'DT' holding the mean of every other
      column per output bin.
    """
    # Declared static because it is called as Sensor.resample(frame) and takes
    # no instance; the decorator makes calls through an instance work as well.
    resampled = timeseries_data.drop(columns=['t'])
    resampled['DT'] = pd.to_datetime(
        timeseries_data['t'], unit=input_timestamp_units
    )
    return resampled.resample(output_timestamp_units, on='DT').mean()


class HeartRate(Sensor):
  """Heart rate sensor data."""

  sessions: list

  def __init__(self, data, sensor_key,input_timestamp_units='s', output_timestamp_units='1min'):
    """Builds the resampled heart-rate table `self.hr`.

    Args:
      data: Object whose `data` attribute maps sensor keys to lists of
        sessions. Each session is read for `activity_tm.seconds` and `bpm`.
      sensor_key: Key of the heart-rate sessions in `data.data`.
      input_timestamp_units: Forwarded to `Sensor.resample`.
      output_timestamp_units: Forwarded to `Sensor.resample`; the bin size of
        the output table.

    If `sensor_key` is missing, `self.hr` is an empty table indexed by 'DT'
    and a message is printed instead of raising.
    """
    if sensor_key in data.data.keys():
      sessions = data.data[sensor_key]
      hr = []
      for session in sessions:
        day_start = datetime.datetime.fromtimestamp(session.activity_tm.seconds, tz=datetime.timezone.utc)
        # One timestamp per second for 24 hours; assumes `bpm` holds exactly
        # 60*60*24 values, otherwise the DataFrame constructor raises.
        times = pd.date_range(day_start, periods=60*60*24, freq='1s')
        hr.append(pd.DataFrame({'t': times, 'HR': session.bpm}))
      # Forward the caller's units instead of silently ignoring them.
      self.hr = Sensor.resample(
          pd.concat(hr),
          input_timestamp_units=input_timestamp_units,
          output_timestamp_units=output_timestamp_units)
    else:
      self.hr = pd.DataFrame(columns=['DT',
                                      'HR']).set_index('DT')
      print(ValueError(sensor_key + ' not found in data.data.keys()'))


class ContinuousEDA(Sensor):
  """Continuous EDA sensor data."""
  sessions: list

  def __init__(self, data, sensor_key, input_timestamp_units='s', output_timestamp_units='1min'):
    """Builds the resampled continuous-EDA table `self.continuous_eda`.

    Args:
      data: Object whose `data` attribute maps sensor keys to lists of
        sessions. Each session is read for `millis_from_start_time`,
        `activity_tm_timezone_offset`, `activity_tm.seconds` and the five EDA
        value arrays.
      sensor_key: Key of the continuous-EDA sessions in `data.data`.
      input_timestamp_units: Forwarded to `Sensor.resample`.
      output_timestamp_units: Forwarded to `Sensor.resample`; the bin size of
        the output table.

    If `sensor_key` is missing, `self.continuous_eda` is an empty table
    indexed by 'DT' and a message is printed instead of raising.
    """

    if sensor_key in data.data.keys():
      sessions = data.data[sensor_key]
      continuous_eda = []
      for session in sessions:
        # Sample time = session start + timezone offset (scaled from minutes
        # to seconds) + per-sample milliseconds.
        # NOTE(review): the other sensors do not add the timezone offset -
        # confirm it is intended for this one.
        times = pd.DatetimeIndex([
            datetime.datetime.fromtimestamp(
                millis/1000 + session.activity_tm_timezone_offset*60 + session.activity_tm.seconds,
                tz=datetime.timezone.utc)
            for millis in session.millis_from_start_time
        ])
        continuous_eda_day = pd.DataFrame({'t': times,
                                          'eda_level_real': session.eda_level_real,
                                          'eda_level_imaginary': session.eda_level_imaginary,
                                          'eda_slope_real': session.eda_slope_real,
                                          'eda_slope_imaginary': session.eda_slope_imaginary,
                                          'leads_contact_counts': session.leads_contact_counts})
        continuous_eda.append(continuous_eda_day)
      # Forward the caller's units instead of silently ignoring them.
      self.continuous_eda = Sensor.resample(
          pd.concat(continuous_eda),
          input_timestamp_units=input_timestamp_units,
          output_timestamp_units=output_timestamp_units)
    else:
      self.continuous_eda = pd.DataFrame(columns=['DT',
                                                  'eda_level_real',
                                                  'eda_level_imaginary',
                                                  'eda_slope_real',
                                                  'eda_slope_imaginary',
                                                  'leads_contact_counts']).set_index('DT')
      print(ValueError(sensor_key + ' not found in data.data.keys()'))

class Steps(Sensor):
  """Steps sensor data."""
  sessions: list

  def __init__(self, data, sensor_key, input_timestamp_units='s', output_timestamp_units='1min'):
    """Builds the resampled steps table `self.steps`.

    Args:
      data: Object whose `data` attribute maps sensor keys to lists of
        sessions. Each session is read for `activity_tm.seconds` and `steps`.
      sensor_key: Key of the steps sessions in `data.data`.
      input_timestamp_units: Forwarded to `Sensor.resample`.
      output_timestamp_units: Forwarded to `Sensor.resample`; the bin size of
        the output table.

    If `sensor_key` is missing, `self.steps` is an empty table indexed by 'DT'
    and a message is printed instead of raising.
    """
    if sensor_key in data.data.keys():
      sessions = data.data[sensor_key]
      steps = []
      for session in sessions:
        day_start = datetime.datetime.fromtimestamp(session.activity_tm.seconds, tz=datetime.timezone.utc)
        # One timestamp per minute for 24 hours; assumes `steps` holds exactly
        # 60*24 values, otherwise the DataFrame constructor raises.
        times = pd.date_range(day_start, periods=60*24, freq='1min')
        steps.append(pd.DataFrame({'t': times, 'steps': session.steps}))
      # Forward the caller's units instead of silently ignoring them.
      self.steps = Sensor.resample(
          pd.concat(steps),
          input_timestamp_units=input_timestamp_units,
          output_timestamp_units=output_timestamp_units)
    else:
      self.steps = pd.DataFrame(columns=['DT',
                                         'steps']).set_index('DT')
      print(ValueError(sensor_key + ' not found in data.data.keys()'))


class Grok(Sensor):
  """Grok sensor data."""
  sessions: list

  def __init__(self, data, sensor_key, input_timestamp_units='s', output_timestamp_units='1min'):
    """Builds the resampled grok feature table `self.grok`.

    Args:
      data: Object whose `data` attribute maps sensor keys to lists of
        sessions. Each session is read for `activity_tms` (items exposing
        `.seconds`) and the ten grok feature arrays.
      sensor_key: Key of the grok sessions in `data.data`.
      input_timestamp_units: Forwarded to `Sensor.resample`.
      output_timestamp_units: Forwarded to `Sensor.resample`; the bin size of
        the output table.

    If `sensor_key` is missing, `self.grok` is an empty table indexed by 'DT'
    and a message is printed instead of raising.
    """

    if sensor_key in data.data.keys():
      # Read the sessions stored under `sensor_key`, the key that was just
      # checked, rather than a hard-coded 'grok_feature_data_with_dupes' entry.
      sessions = data.data[sensor_key]
      grok = []
      for session in sessions:
        times = pd.DatetimeIndex([
            datetime.datetime.fromtimestamp(activity_tm.seconds, tz=datetime.timezone.utc)
            for activity_tm in session.activity_tms
        ])
        grok_day = pd.DataFrame({'t': times,
                                'jerk_auto': session.jerk_auto,
                                'step_count': session.step_count,
                                'log_energy': session.log_energy,
                                'covariance': session.covariance,
                                'log_energy_ratio': session.log_energy_ratio,
                                'zero_crossing_std': session.zero_crossing_std,
                                'zero_crossing_avg': session.zero_crossing_avg,
                                'axis_mean': session.axis_mean,
                                'altim_std': session.altim_std,
                                'kurtosis': session.kurtosis})
        grok.append(grok_day)
      # Forward the caller's units instead of silently ignoring them.
      self.grok = Sensor.resample(
          pd.concat(grok),
          input_timestamp_units=input_timestamp_units,
          output_timestamp_units=output_timestamp_units)
    else:
      self.grok = pd.DataFrame(columns=['DT',
                                        'jerk_auto',
                                        'step_count',
                                        'log_energy',
                                        'covariance',
                                        'log_energy_ratio',
                                        'zero_crossing_std',
                                        'zero_crossing_avg',
                                        'axis_mean',
                                        'altim_std',
                                        'kurtosis']).set_index('DT')
      print(ValueError(sensor_key + ' not found in data.data.keys()'))

class SleepCoefficient(Sensor):
  """Sleep coefficient sensor data."""
  sessions: list

  def __init__(self, data, sensor_key, input_timestamp_units='s', output_timestamp_units='1min'):
    """Builds the resampled sleep-coefficient table `self.sleep_coefficient`.

    Args:
      data: Object whose `data` attribute maps sensor keys to lists of
        sessions. Each session is read for `activity_tm.seconds`,
        `sleep_coefficient` and `is_on_wrist`.
      sensor_key: Key of the sleep-coefficient sessions in `data.data`.
      input_timestamp_units: Forwarded to `Sensor.resample`.
      output_timestamp_units: Forwarded to `Sensor.resample`; the bin size of
        the output table.

    If `sensor_key` is missing, `self.sleep_coefficient` is an empty table
    indexed by 'DT' and a message is printed instead of raising.
    """
    if sensor_key in data.data.keys():
      # Read the sessions stored under `sensor_key`, the key that was just
      # checked, rather than a hard-coded 'sleep_coefficient_compact' entry.
      sessions = data.data[sensor_key]
      sleep_coefficient = []
      for session in sessions:
        day_start = datetime.datetime.fromtimestamp(session.activity_tm.seconds, tz=datetime.timezone.utc)
        # One timestamp every 30 seconds for 24 hours; assumes both arrays
        # hold exactly 60*24*2 values, otherwise the constructor raises.
        times = pd.date_range(day_start, periods=60*24*2, freq='30s')
        sleep_coefficient_day = pd.DataFrame({'t': times,
                                              'sleep_coefficient': session.sleep_coefficient,
                                              'is_on_wrist': session.is_on_wrist})
        sleep_coefficient.append(sleep_coefficient_day)
      # Forward the caller's units instead of silently ignoring them.
      self.sleep_coefficient = Sensor.resample(
          pd.concat(sleep_coefficient),
          input_timestamp_units=input_timestamp_units,
          output_timestamp_units=output_timestamp_units)
    else:
      self.sleep_coefficient = pd.DataFrame(columns=['DT',
                                        'sleep_coefficient',
                                        'is_on_wrist']).set_index('DT')
      print(ValueError(sensor_key + ' not found in data.data.keys()'))

class SkinTemp(Sensor):
  """Skin temperature sensor data."""
  sessions: list

  def __init__(self, data, sensor_key, input_timestamp_units='s', output_timestamp_units='1min'):
    """Builds the resampled wrist-temperature table `self.skin_temp`.

    Args:
      data: Object whose `data` attribute maps sensor keys to lists of
        sessions. Each session is read for `activity_tm.seconds` and
        `wrist_temperatures`.
      sensor_key: Key of the skin-temperature sessions in `data.data`.
      input_timestamp_units: Forwarded to `Sensor.resample`.
      output_timestamp_units: Forwarded to `Sensor.resample`; the bin size of
        the output table.

    If `sensor_key` is missing, `self.skin_temp` is an empty table indexed by
    'DT' and a message is printed instead of raising.
    """
    if sensor_key in data.data.keys():
      sessions = data.data[sensor_key]
      skin_temp = []
      for session in sessions:
        day_start = datetime.datetime.fromtimestamp(session.activity_tm.seconds, tz=datetime.timezone.utc)
        # One timestamp per minute for 24 hours; assumes `wrist_temperatures`
        # holds exactly 60*24 values, otherwise the constructor raises.
        times = pd.date_range(day_start, periods=60*24, freq='1min')
        skin_temp.append(pd.DataFrame({'t': times, 'wrist_temperatures': session.wrist_temperatures}))
      # Forward the caller's units instead of silently ignoring them.
      self.skin_temp = Sensor.resample(
          pd.concat(skin_temp),
          input_timestamp_units=input_timestamp_units,
          output_timestamp_units=output_timestamp_units)
    else:
      self.skin_temp = pd.DataFrame(columns=['DT',
                                        'wrist_temperatures']).set_index('DT')
      print(ValueError(sensor_key + ' not found in data.data.keys()'))

class MomentaryStressAlgorithm(Sensor):
  """Momentary stress algorithm sensor data."""
  sessions: list

  def __init__(self, data, sensor_key, input_timestamp_units='s', output_timestamp_units='1min'):
    """Builds the resampled MSA table `self.momentary_stress_algorithm`.

    Args:
      data: Object whose `data` attribute maps sensor keys to lists of
        sessions. Each session is read for `offsets`, `activity_tm.seconds`
        and the sixteen MSA feature arrays.
      sensor_key: Key of the MSA sessions in `data.data`.
      input_timestamp_units: Forwarded to `Sensor.resample`.
      output_timestamp_units: Forwarded to `Sensor.resample`; the bin size of
        the output table.

    If `sensor_key` is missing, `self.momentary_stress_algorithm` is an empty
    table indexed by 'DT' and a message is printed instead of raising.
    """

    if sensor_key in data.data.keys():
      sessions = data.data[sensor_key]
      momentary_stress_algorithm = []
      for session in sessions:
        # Each offset is scaled by 60 (minutes to seconds) and added to the
        # session start.
        times = pd.DatetimeIndex([
            datetime.datetime.fromtimestamp(offset*60 + session.activity_tm.seconds, tz=datetime.timezone.utc)
            for offset in session.offsets
        ])
        msa_day = pd.DataFrame({'t': times,
                                'hrv_shannon_entropy_rr': session.hrv_shannon_entropy_rr,
                                'hrv_shannon_entropy_rrd': session.hrv_shannon_entropy_rrd,
                                'hrv_percentage_of_nn_30': session.hrv_percentage_of_nn_30,
                                'ceda_magnitude_real_micro_siemens': session.ceda_magnitude_real_micro_siemens,
                                'ceda_slope_real_micro_siemens': session.ceda_slope_real_micro_siemens,
                                'rmssd_percentile_0595': session.rmssd_percentile_0595,
                                'sdnn_percentile_0595': session.sdnn_percentile_0595,
                                'msa_probability': session.msa_probability,
                                'hrv_percent_good': session.hrv_percent_good,
                                'hrv_rr_80th_percentile_mean': session.hrv_rr_80th_percentile_mean,
                                'hrv_rr_20th_percentile_mean': session.hrv_rr_20th_percentile_mean,
                                'hrv_rr_median': session.hrv_rr_median,
                                'hrv_rr_mean': session.hrv_rr_mean,
                                'hr_at_rest_mean': session.hr_at_rest_mean,
                                'skin_temperature_magnitude': session.skin_temperature_magnitude,
                                'skin_temperature_slope': session.skin_temperature_slope})
        momentary_stress_algorithm.append(msa_day)
      # Forward the caller's units instead of silently ignoring them.
      self.momentary_stress_algorithm = Sensor.resample(
          pd.concat(momentary_stress_algorithm),
          input_timestamp_units=input_timestamp_units,
          output_timestamp_units=output_timestamp_units)
    else:
      self.momentary_stress_algorithm = pd.DataFrame(columns=['DT',
                                                              'hrv_shannon_entropy_rr',
                                                              'hrv_shannon_entropy_rrd',
                                                              'hrv_percentage_of_nn_30',
                                                              'ceda_magnitude_real_micro_siemens',
                                                              'ceda_slope_real_micro_siemens',
                                                              'rmssd_percentile_0595',
                                                              'sdnn_percentile_0595',
                                                              'msa_probability',
                                                              'hrv_percent_good',
                                                              'hrv_rr_80th_percentile_mean',
                                                              'hrv_rr_20th_percentile_mean',
                                                              'hrv_rr_median',
                                                              'hrv_rr_mean',
                                                              'hr_at_rest_mean',
                                                              'skin_temperature_magnitude',
                                                              'skin_temperature_slope']).set_index('DT')
      # Only report the key as missing when it actually is missing.
      print(ValueError(sensor_key + ' not found in data.data.keys()'))


@dataclasses.dataclass(frozen=True)
class ProdSession:
  """Immutable bundle of the per-sensor tables that make up one session."""

  # A session specific identifier for a 24hr period of data collection.
  session_id: str
  # Heart rate table data.
  hr: HeartRate
  # Continuous EDA table data.
  continuous_eda: ContinuousEDA
  # Steps table data.
  steps: Steps
  # Grok table data.
  grok: Grok
  # Sleep Coefficient table data.
  sleep_coefficient: SleepCoefficient
  # Skin Temp table data.
  skin_temp: SkinTemp
  # MSA table data.
  momentary_stress_algorithm: MomentaryStressAlgorithm

  def join(self) -> pd.DataFrame:
    """Merges all sensor tables on 'DT' and normalizes the result.

    Each field may hold either the sensor's DataFrame or the sensor wrapper
    object; a wrapper stores its table under an attribute with the same name
    as the field, so it is unwrapped before merging.

    Returns:
      The outer join of all tables on 'DT', with rows where `is_on_wrist`
      equals 0 blanked to NaN, every column z-scored (NaNs omitted) and
      clipped to [-3, 3]. If the joined table has no 'is_on_wrist' column an
      empty DataFrame is returned.
    """
    table_names = ('hr', 'continuous_eda', 'steps', 'grok',
                   'sleep_coefficient', 'skin_temp',
                   'momentary_stress_algorithm')
    dfs = []
    for table_name in table_names:
      value = getattr(self, table_name)
      # pd.merge only accepts DataFrame/Series, so unwrap sensor objects.
      if not isinstance(value, pd.DataFrame):
        value = getattr(value, table_name)
      dfs.append(value)

    session = ft.reduce(lambda left, right: pd.merge(left, right, on='DT', how='outer'), dfs)
    if 'is_on_wrist' not in session.columns:
      return pd.DataFrame()

    # Blank out every reading taken while the device was off the wrist.
    # float('nan') is used because numpy is not imported by this notebook.
    session.loc[(session.is_on_wrist == 0), :] = float('nan')
    # z-score each column within this session; all-NaN columns are left as is.
    session = session.apply(lambda col: zscore(col, nan_policy='omit') if col.notna().any() else col)
    session = session.clip(-3,3)
    return session


# Step 3: Load and Process Labels

In [None]:
# @title New: Metabolic Health Labels (Pending Review by Daniel)

class MetHealthLabels:
  """Participant-level metabolic health labels and scores.

  The pickled table at `path_to_labels_and_scores` is loaded, rows without a
  `participant_id` are dropped and the table is indexed by integer participant
  id. Two binary labels are derived (`ir_binary`, `msss_binary`) and each label
  column is exposed as a `{participant_id: value}` lookup. Every
  `get_individual_*` accessor returns NaN when the participant is not present
  in the corresponding lookup.
  """

  def __init__(self, path_to_labels_and_scores:str=(
      "/namespace/fitbit-medical-sandboxes/partner/encrypted/"
      "chr-ards-metabolichealth/deid/exp/wear_me_study/extracted_features/"
      "labels_and_scores.pkl")):
    """Loads the labels table and builds the per-label lookups.

    Args:
      path_to_labels_and_scores: Location of the pickled labels DataFrame.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with gfile.Open(path_to_labels_and_scores, 'rb') as f:
      df = pd.read_pickle(f)
    df.dropna(subset=["participant_id"], inplace=True)
    df['participant_id'] = df['participant_id'].astype(int)
    df.set_index('participant_id', inplace=True)
    # The threshold for HOMA-IR comes from our manuscript
    df['ir_binary'] = (df['homa_ir'] > 2.9).astype(int)
    # Threshold for MSSS comes from https://diabetesjournals.org/care/article/41/11/2421/36563/Use-of-a-Metabolic-Syndrome-Severity-Z-Score-to
    df['msss_binary'] = (df['msss'] > 0).astype(int)
    # Since we want to have hypertension as a binary label, we convert all NaNs
    # to -1. The column is reassigned (rather than filled in place through a
    # chained selection) so the fill is guaranteed to reach the frame.
    df['hypertension'] = df['hypertension'].fillna(-1)
    self.labels_and_scores_df = df

    self.age = df['age'].to_dict()
    # Unfortunately, we did not ask users for biological sex, so the "sex"
    # column is actually gender.
    self.gender = df['sex'].to_dict()
    # Cardiometabolic scores
    self.bmi = df['bmi'].to_dict()
    self.homa_ir = df['homa_ir'].to_dict()
    self.homa_ir_binary = df['ir_binary'].to_dict()
    self.msss = df['msss'].to_dict()
    self.msss_binary = df['msss_binary'].to_dict()
    self.framingham_risk_category = df['framingham_risk_category'].to_dict()
    # APRI (AST to Platelet Ratio Index) is a score related to liver health
    self.apri = df['apri'].to_dict()
    # Kept as an integer Series (not a dict), indexed by participant id.
    self.hypertension = df['hypertension'].astype(int)
    self.hyperlipidemia = df['hyperlipidemia'].to_dict()
    self.cardiovascular_condition = df['CVD'].to_dict()
    self.diabetes_condition = df['diabetes'].to_dict()
    self.diabetes_type = df['diabetes_type'].to_dict()
    self.anxiety_condition = df['anxiety'].to_dict()
    self.respiratory_condition = df['respiratory'].to_dict()
    self.kidney_condition = df['kidney Disease'].to_dict()
    self.medication = df['medications'].to_dict()
    self.regular_mensturation = df['regular_periods'].to_dict()
    self.smoking = df['smoker'].to_dict()
    self.alcohol_consumption = df['alcohol'].to_dict()

  @staticmethod
  def _lookup(values, participant_id:int):
    """Returns `values[participant_id]`, or NaN when the id is unknown."""
    try:
      return values[participant_id]
    except KeyError:
      return np.nan

  def get_individual_age(self, participant_id:int):
    """Returns the participant's age, or NaN if unknown."""
    return self._lookup(self.age, participant_id)

  def get_individual_gender(self, participant_id:int):
    """Returns the participant's gender, or NaN if unknown."""
    return self._lookup(self.gender, participant_id)

  def get_individual_bmi(self, participant_id:int):
    """Returns the participant's BMI, or NaN if unknown."""
    return self._lookup(self.bmi, participant_id)

  def get_individual_homa_ir(self, participant_id:int):
    """Returns the participant's HOMA-IR score, or NaN if unknown."""
    return self._lookup(self.homa_ir, participant_id)

  def get_individual_homa_ir_binary(self, participant_id:int):
    """Returns 1 if HOMA-IR > 2.9, else 0; NaN if the id is unknown."""
    return self._lookup(self.homa_ir_binary, participant_id)

  def get_individual_msss(self, participant_id:int):
    """Returns the participant's MSSS score, or NaN if unknown."""
    return self._lookup(self.msss, participant_id)

  def get_individual_msss_binary(self, participant_id:int):
    """Returns 1 if MSSS > 0, else 0; NaN if the id is unknown."""
    return self._lookup(self.msss_binary, participant_id)

  def get_individual_framingham_risk_category(self, participant_id:int):
    """Returns the Framingham risk category, or NaN if unknown."""
    return self._lookup(self.framingham_risk_category, participant_id)

  def get_individual_apri(self, participant_id:int):
    """Returns the participant's APRI score, or NaN if unknown."""
    return self._lookup(self.apri, participant_id)

  def get_individual_hypertension(self, participant_id:int):
    """Returns the hypertension label (-1 when missing), or NaN if unknown."""
    return self._lookup(self.hypertension, participant_id)

  def get_individual_cardiovascular_condition(self, participant_id:int):
    """Returns the `CVD` label, or NaN if unknown."""
    return self._lookup(self.cardiovascular_condition, participant_id)

  def get_individual_diabetes_condition(self, participant_id:int):
    """Returns the `diabetes` label, or NaN if unknown."""
    return self._lookup(self.diabetes_condition, participant_id)

  def get_individual_anxiety_condition(self, participant_id:int):
    """Returns the `anxiety` label, or NaN if unknown."""
    return self._lookup(self.anxiety_condition, participant_id)

  def get_individual_respiratory_condition(self, participant_id:int):
    """Returns the `respiratory` label, or NaN if unknown."""
    return self._lookup(self.respiratory_condition, participant_id)

  def get_individual_kidney_condition(self, participant_id:int):
    """Returns the `kidney Disease` label, or NaN if unknown."""
    return self._lookup(self.kidney_condition, participant_id)

  def get_individual_medications(self, participant_id:int):
    """Returns the `medications` entry, or NaN if unknown."""
    return self._lookup(self.medication, participant_id)

  def get_individual_regular_mensturation(self, participant_id:int):
    """Returns the `regular_periods` entry, or NaN if unknown."""
    return self._lookup(self.regular_mensturation, participant_id)

  def get_individual_hyperlipidemia(self, participant_id:int):
    """Returns the `hyperlipidemia` label, or NaN if unknown."""
    return self._lookup(self.hyperlipidemia, participant_id)

  def get_individual_diabetes_type(self, participant_id:int):
    """Returns the `diabetes_type` entry, or NaN if unknown."""
    return self._lookup(self.diabetes_type, participant_id)

  def get_individual_smoking(self, participant_id:int):
    """Returns the `smoker` entry, or NaN if unknown."""
    return self._lookup(self.smoking, participant_id)

  def get_individual_alcohol_consumption(self, participant_id:int):
    """Returns the `alcohol` entry, or NaN if unknown."""
    return self._lookup(self.alcohol_consumption, participant_id)

  def get_complete_labels_and_scores(self):
    """Returns the full labels DataFrame indexed by participant id."""
    return self.labels_and_scores_df

In [None]:
# @title Sanity check: Checking how many participants could have labels

files = gfile.ListDir(os.path.join(root_folder,"BY_SUBJECT_momentary_stress_algorithm"))
# Keep the last nine characters of every file name (presumably
# "<5-digit participant id>.csv" -- verify against the directory listing).
ids = [s[-9:] for s in files]

obj = MetHealthLabels()
df = obj.get_complete_labels_and_scores()
df['participant_id'] = df.index

total_count = len(ids)
# A participant "could have labels" when its id is a row of the labels table.
num_available = sum(
    int(file_id.split(".")[0]) in df.index for file_id in ids
)

if total_count:
  print(f"Percentage of available potential labels: {(num_available/total_count)*100}")
else:
  # Avoid a ZeroDivisionError when the directory listing is empty.
  print("No participant files found; nothing to check.")

In [None]:
# @title Sanity check: Checking how many BMI labels are non-null
obj = MetHealthLabels()
df = obj.get_complete_labels_and_scores()
df['participant_id'] = df.index

files = gfile.ListDir(os.path.join(root_folder,"BY_SUBJECT_momentary_stress_algorithm"))
# Keep the last nine characters of every file name (presumably
# "<5-digit participant id>.csv" -- verify against the directory listing).
ids = [s[-9:] for s in files]

total_count = len(ids)
num_available = 0

for file_id in ids:
  bmi = obj.get_individual_bmi(int(file_id.split(".")[0]))
  # NaN compares False, so missing labels are not counted.
  if bmi > 0:
    num_available += 1

if total_count:
  print(f"Percentage of non-null labels: {(num_available/total_count)*100}")
else:
  # Avoid a ZeroDivisionError when the directory listing is empty.
  print("No participant files found; nothing to check.")

In [None]:
# @title Sanity check: Checking how many HOMA-IR labels are non-null
obj = MetHealthLabels()
df = obj.get_complete_labels_and_scores()
df['participant_id'] = df.index

files = gfile.ListDir(os.path.join(root_folder,"BY_SUBJECT_momentary_stress_algorithm"))
# Keep the last nine characters of every file name (presumably
# "<5-digit participant id>.csv" -- verify against the directory listing).
ids = [s[-9:] for s in files]

total_count = len(ids)
num_available = 0

for file_id in ids:
  homa_ir = obj.get_individual_homa_ir(int(file_id.split(".")[0]))
  # NaN compares False, so missing labels are not counted.
  if homa_ir > 0:
    num_available += 1

if total_count:
  print(f"Percentage of non-null labels: {(num_available/total_count)*100}")
else:
  # Avoid a ZeroDivisionError when the directory listing is empty.
  print("No participant files found; nothing to check.")

In [None]:
# Locate the per-participant files and collect the configured features.
root_folder = path_to_save_lsm_data
msa_folder = os.path.join(root_folder,"BY_SUBJECT_momentary_stress_algorithm")
files = gfile.ListDir(msa_folder)
# Last nine characters of each file name identify the participant file.
ids = [name[-9:] for name in files]
print(ids)

# Flat list of every feature requested across all table configs.
all_features = [
    feature for table in data for feature in table['features_to_extract']
]

# Table type of every config entry, in config order.
types = [table['type'] for table in data]
cnt = 0

def _clean_sensor_table(table_type, frame):
  """Applies the table-specific sentinel removal / range limits to `frame`."""
  if table_type == 'heart_rate':
    # -1 marks a missing reading.
    frame['bpm'] = frame['bpm'].mask(frame['bpm'] == -1)

  if table_type == 'ceda':
    frame['eda_level_real'] = frame['eda_level_real'].clip(lower=0, upper=60)

  if table_type == 'momentary_stress_algorithm':
    frame['ceda_slope_real_micro_siemens'] = (
        frame['ceda_slope_real_micro_siemens'].clip(lower=-5, upper=5)
    )

  if table_type == 'sleep_coefficient':
    # -1 marks a missing reading.
    frame['sleep_coefficient'] = frame['sleep_coefficient'].mask(
        frame['sleep_coefficient'] == -1
    )

  if table_type == 'wrist_temperature':
    temperatures = (frame['wrist_temperatures'] / 20000).clip(upper=41)
    frame['wrist_temperatures'] = temperatures.mask(temperatures < 0)

  if table_type == 'grok':
    frame['altim_std'] = frame['altim_std'] / 255

  return frame


def window(ids: list[str], window_length: str, timestamp_units: str):
  """Yields normalized, fixed-size sensor windows for each participant file.

  For every entry of `ids`, the CSV of each table in the module-level `data`
  config is read from `root_folder`, cleaned, resampled with `Sensor.resample`
  and outer-merged on `DT`. The merged session is restricted to
  `FEATURES_TO_INCLUDE` (missing features become all-NaN columns), standardized
  with `NORMALIZATION_PARAMETERS` ([mean, std] per feature), clipped to
  [-5, 5] and grouped into bins of `window_length`.

  Args:
    ids: File-name suffixes appended to "<table type>_" to locate each CSV.
    window_length: Pandas frequency string used to group the merged session.
    timestamp_units: Forwarded to `Sensor.resample` as `output_timestamp_units`.

  Yields:
    `(group_label, payload)` tuples where `payload` holds 'id' (the entry of
    `ids`), 'input' (window values with NaNs replaced by 0) and 'mask' (boolean
    array, True where the value was NaN). Windows with more than 80% missing
    values are reported and skipped.
  """
  # NOTE(review): only groups with exactly this many rows are emitted. The
  # value matches window_length='168h' with timestamp_units='1min'; other
  # argument combinations will yield nothing -- confirm this is intended.
  expected_rows = 168 * 60
  means = pd.Series(
      {f: NORMALIZATION_PARAMETERS[f][0] for f in FEATURES_TO_INCLUDE}
  )
  stds = pd.Series(
      {f: NORMALIZATION_PARAMETERS[f][1] for f in FEATURES_TO_INCLUDE}
  )

  for i in ids:
    print('ID: ', i)
    dfs = []
    for table in data:
      t = table['type']
      path = os.path.join(root_folder,"BY_SUBJECT_"+t,t+"_"+i)

      # Tables that cannot be read or parsed are skipped for this participant.
      try:
        with gfile.Open(path, 'r') as f:
          frame = pd.read_csv(f)
        frame = frame.rename(columns={'DT': 't'})
        frame['t'] = pd.to_datetime(frame['t'], format='mixed')
      except Exception as err:  # pylint: disable=broad-except
        print('.   Skipping ' + t + ': ' + str(err))
        continue

      # Work on an independent copy so the clean-up never writes into a slice.
      frame = frame[[*table['features_to_extract'], 't']].copy()
      frame = _clean_sensor_table(t, frame)

      if len(frame) > 0:
        dfs.append(
            Sensor.resample(
                frame,
                input_timestamp_units='s',
                output_timestamp_units=timestamp_units,
            )
        )

    if not dfs:
      continue
    session = ft.reduce(lambda left, right: pd.merge(left, right, on='DT', how='outer'), dfs)

    # Select the model features in order; absent ones become all-NaN columns.
    session = session.reindex(columns=FEATURES_TO_INCLUDE)
    session = ((session - means) / stds).clip(-5, 5)

    for name, group in session.groupby(pd.Grouper(freq=window_length)):
      if group.shape != (expected_rows, len(FEATURES_TO_INCLUDE)):
        continue
      values = group.to_numpy()
      nan_mask = np.isnan(values)
      if nan_mask.mean() > 0.8:
        print('.   Too much missingness.')
        continue
      yield name, {
          'id': i,
          'input': np.nan_to_num(values),
          'mask': nan_mask,
      }

w = window(ids[:10], '168h', '1min')

In [None]:
# Show the HOMA-IR value recorded for participant 12612.
df['participant_id'] = df.index
df.loc[df['participant_id'] == 12612, 'homa_ir']

# Step 4: Create TFRecords

In [None]:
import logging
import time
import enum

def _bytes_feature(value):
  """Wraps a bytes value (or an eager tensor holding one) in a Feature."""
  eager_tensor_type = type(tf.constant(0))
  # BytesList won't unpack a string from an EagerTensor.
  raw = value.numpy() if isinstance(value, eager_tensor_type) else value
  bytes_list = tf.train.BytesList(value=[raw])
  return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
  """Wraps a single float / double in a float_list Feature."""
  float_list = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
  """Wraps a single bool / enum / int / uint in an int64_list Feature."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)

def _string_feature(value):
  """Returns a bytes_list Feature from a string.

  Args:
    value: An eager string tensor, a `bytes` object or a Python `str`.

  Returns:
    A `tf.train.Feature` holding the value as a single bytes entry.
  """
  if isinstance(value, type(tf.constant("test"))):
    value = value.numpy()  # Ensure it's not a TensorFlow string tensor
  if isinstance(value, str):
    # BytesList only accepts bytes, so plain Python strings are UTF-8 encoded.
    value = value.encode('utf-8')
  return _bytes_feature(value)

def numpy_example(array, labels):
  """Builds a `tf.train.Example` from a sensor window and its label list.

  Args:
    array: Window values; stored serialized under 'array_raw'.
    labels: Positional label list: indices 0-3 are written as floats, 4-12 as
      int64 values and 13-17 as serialized string tensors.

  Returns:
    The assembled `tf.train.Example`.
  """
  float_keys = ('bmi', 'homa_ir', 'apri', 'msss')
  # NOTE(review): 'homa_ir_binray' looks like a typo of 'homa_ir_binary'; it is
  # kept as-is because readers of existing records may depend on this key.
  int_keys = (
      'hypertension_binary',
      'hyperlipidemia_binary',
      'cardiovascular_binary',
      'diabetes_binary',
      'anxiety_binary',
      'respiratory',
      'kidney_disease',
      'homa_ir_binray',
      'msss_binary',
  )
  string_keys = (
      'regular_menstruation_str',
      'smoker_str',
      'diabetes_type_str',
      'alcohol_str',
      'medications_str',
  )

  feature = {}
  for position, key in enumerate(float_keys):
    feature[key] = _float_feature(labels[position])
  for position, key in enumerate(int_keys, start=len(float_keys)):
    feature[key] = _int64_feature(labels[position])
  first_string = len(float_keys) + len(int_keys)
  for position, key in enumerate(string_keys, start=first_string):
    feature[key] = _string_feature(tf.io.serialize_tensor(labels[position]))

  feature['array_raw'] = _bytes_feature(tf.io.serialize_tensor(array))
  return tf.train.Example(features=tf.train.Features(feature=feature))

gfile.MakeDirs(tf_record_folder)

types = [table['type'] for table in data]
met_labels_object = MetHealthLabels()


def _participant_labels(labels_source, participant_id):
  """Returns the positional label list that `numpy_example` expects.

  Missing numeric labels are replaced by -999 and missing text labels by
  '-999' before conversion.

  Raises:
    KeyError, TypeError, ValueError: If a label cannot be converted.
  """
  numerical = {
      "bmi": labels_source.get_individual_bmi(participant_id),
      "homa_ir": labels_source.get_individual_homa_ir(participant_id),
      "homa_ir_binary": labels_source.get_individual_homa_ir_binary(participant_id),
      "msss": labels_source.get_individual_msss(participant_id),
      "msss_binary": labels_source.get_individual_msss_binary(participant_id),
      "apri": labels_source.get_individual_apri(participant_id),
      "hypertension": labels_source.get_individual_hypertension(participant_id),
      "hyperlipidemia": labels_source.get_individual_hyperlipidemia(participant_id),
      "CVD": labels_source.get_individual_cardiovascular_condition(participant_id),
      "diabetes": labels_source.get_individual_diabetes_condition(participant_id),
      "anxiety": labels_source.get_individual_anxiety_condition(participant_id),
      "respiratory": labels_source.get_individual_respiratory_condition(participant_id),
      "kidney Disease": labels_source.get_individual_kidney_condition(participant_id),
  }
  text = {
      "diabetes_type": labels_source.get_individual_diabetes_type(participant_id),
      "alcohol": labels_source.get_individual_alcohol_consumption(participant_id),
      "medications": labels_source.get_individual_medications(participant_id),
      "regular_periods": labels_source.get_individual_regular_mensturation(participant_id),
      "smoker": labels_source.get_individual_smoking(participant_id),
  }
  numerical = {k: -999 if pd.isna(v) else v for k, v in numerical.items()}
  text = {k: '-999' if pd.isna(v) else v for k, v in text.items()}

  # The order below must match the indices used by `numpy_example`.
  float_keys = ("bmi", "homa_ir", "apri", "msss")
  int_keys = ("hypertension", "hyperlipidemia", "CVD", "diabetes", "anxiety",
              "respiratory", "kidney Disease", "homa_ir_binary", "msss_binary")
  text_keys = ("regular_periods", "smoker", "diabetes_type", "alcohol",
               "medications")
  labels = [float(numerical[k]) for k in float_keys]
  labels.extend(int(numerical[k]) for k in int_keys)
  # Use the sanitized text labels so missing values are stored as '-999'.
  labels.extend(str(text[k]) for k in text_keys)
  return labels


# Iterate over the dataset and write each example to the TFRecord file
for i in ids[0:10]:
  # Strip the 4-character file extension to get the participant id.
  participant = i[0:-4]
  participant_id = int(participant)
  output_file = 'metabolichealth_' + participant +'.tfrecords'
  output_path = os.path.join(tf_record_folder,output_file)
  with tf.io.TFRecordWriter(output_path) as writer:
    for key, result in window([i], '168h', '1min'):
      print(key)
      try:
        labels = _participant_labels(met_labels_object, participant_id)
      except (KeyError, TypeError, ValueError):
        print('Label missing.')
        continue
      tf_example = numpy_example(result['input'], labels)
      writer.write(tf_example.SerializeToString())
      print('.   '+str(key))
    print(f'TFRecord file created: {output_path}')

In [None]:
# List every participant file and print the BMI label found for it.
root_folder = path_to_save_lsm_data
print(root_folder)
files = gfile.ListDir(os.path.join(root_folder,"BY_SUBJECT_momentary_stress_algorithm"))
ids = [s[-9:] for s in files]
print(ids)


for i in ids:
  # Strip the 4-character extension; avoid shadowing the builtin `id`.
  participant = i[0:-4]
  print(participant)
  print(met_labels_object.get_individual_bmi(int(participant)))

# Other Notes

In [None]:
# Exploratory cell: for up to 11 participants, merge all sensor tables,
# standardize them, attach the participant's labels and plot each 168h window.
root_folder = path_to_save_lsm_data

all_features = [
    feature for table in data for feature in table['features_to_extract']
]

print(all_features)

files = gfile.ListDir(os.path.join(root_folder,"BY_SUBJECT_momentary_stress_algorithm"))
ids = [s[-9:] for s in files]

types = [table['type'] for table in data]
met_labels_object = MetHealthLabels()

cnt=0
for i in ids:
  print('ID: ', i)

  dfs = []
  # ----------------Updated January 6th-----------------

  participant_id = int(i.split(".")[0])
  print('.   Loading cardiometabolic and hepatic labels: ')
  metabolic_labels = {"bmi": met_labels_object.get_individual_bmi(participant_id),
                      "homa_ir": met_labels_object.get_individual_homa_ir(participant_id),
                      "msss": met_labels_object.get_individual_msss(participant_id),
                      "apri": met_labels_object.get_individual_apri(participant_id),
                      "hypertension": met_labels_object.get_individual_hypertension(participant_id)}

  # -------------------------------------------------

  for table in data:
    t = table['type']

    try:
      with gfile.Open(os.path.join(root_folder,"BY_SUBJECT_"+t,t+"_"+i), 'r') as csv_file:
        frame = pd.read_csv(csv_file)
      print('.   Loaded '+t)
    except Exception:  # pylint: disable=broad-except
      print('.   Failed to load '+t)
      continue

    frame = frame.rename(columns={'DT': 't'})
    try:
      frame['t'] = pd.to_datetime(frame['t'])
    except ValueError:
      # Timestamps with inconsistent formats: retry with per-element parsing
      # and keep the table (previously it was converted and then discarded).
      print('.   Retrying timestamp conversion with mixed formats for '+t)
      frame['t'] = pd.to_datetime(frame['t'], format='mixed')

    # Work on an independent copy so the clean-up never writes into a slice.
    frame = frame[[*table['features_to_extract'], 't']].copy()

    if t == 'heart_rate':
      # -1 marks a missing reading.
      frame['bpm'] = frame['bpm'].mask(frame['bpm'] == -1)

    if t == 'ceda':
      frame['eda_level_real'] = frame['eda_level_real'].clip(lower=0, upper=60)

    if t == 'momentary_stress_algorithm':
      frame['ceda_slope_real_micro_siemens'] = (
          frame['ceda_slope_real_micro_siemens'].clip(lower=-5, upper=5)
      )

    if t == 'sleep_coefficient':
      # -1 marks a missing reading.
      frame['sleep_coefficient'] = frame['sleep_coefficient'].mask(
          frame['sleep_coefficient'] == -1
      )

    if t == 'wrist_temperature':
      temperatures = (frame['wrist_temperatures']/20000).clip(upper=41)
      frame['wrist_temperatures'] = temperatures.mask(temperatures < 0)

    if t == 'grok':
      frame['altim_std'] = frame['altim_std']/255

    if len(frame) > 0:
      dfs.append(Sensor.resample(frame, input_timestamp_units='s', output_timestamp_units='1min'))

  if not dfs:
    # Nothing could be loaded for this participant; reduce() needs >= 1 table.
    print('.   No sensor tables available, skipping.')
    continue

  session = ft.reduce(lambda left, right: pd.merge(left, right, on='DT', how='outer'), dfs)
  for feature in all_features:
    if feature not in session.columns:
      session[feature] = np.nan

  # Column-wise z-score that ignores NaNs and keeps the DataFrame (and its
  # index) intact; scipy's zscore would return a bare ndarray here, which
  # cannot be grouped by time below.
  session = (session - session.mean()) / session.std(ddof=0)
  sess = session.clip(-3,3)

  groups = []
  for name, group in sess.groupby(pd.Grouper(freq='168h')):
    if group.empty:
      # Empty bins have no time range to plot.
      continue
    # ----------------Updated January 6th-----------------
    # assign() returns a new frame, so the grouped data is never mutated.
    group = group.assign(**metabolic_labels)
    groups.append(group)
    # -------------------------------------------------

    fig, ax = plt.subplots(figsize=(25,5))
    x_lims = [group.index[0], group.index[-1]]
    y_lims = [0, len(group.columns)]
    im = ax.imshow(np.flip(np.flip(group.to_numpy()[0:-1,:].T,axis=0),axis=1), interpolation='nearest', aspect='auto', extent = [x_lims[0], x_lims[1],  y_lims[0], y_lims[1]])
    ax.set_yticks(range(0,len(group.columns)))
    ax.set_yticklabels(group.columns.to_list())
    fig.colorbar(im)
    plt.show()

  #  ----------------Added December 12th-----------------
  if groups:
    df_grouped = pd.concat(groups)
  # -------------------------------------------------

  cnt += 1
  if cnt >10:
    break

In [None]:
import tensorflow_datasets as tfds

class Lsm(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for the large sensor model (LSM) dataset."""

  VERSION = tfds.core.Version('2.0.0')
  RELEASE_NOTES = {
      '2.0.0': 'Initial release.',
  }

  def _info(self) -> tfds.core.DatasetInfo:
    """Returns the dataset metadata."""
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict(_init_feature_dict()),
        supervised_keys=None,
        citation=_CITATION,
    )

  def _split_generators(
      self, dl_manager: tfds.download.DownloadManager, pipeline: beam.Pipeline
  ) -> Any:
    """Returns SplitGenerators.

    Args:
      dl_manager: Unused; the data is read directly from `root_folder`.
      pipeline: Beam pipeline the example generation is attached to.

    Returns:
      A dict with a single 'train' split.
    """

    # Derive the id list from the file names in the steps directory.
    root_folder = ("/cns/yq-d/home/fitbit-medical-sandboxes/e=1:kid=76381"
                         ":mkey=cns-deid/chr-ards-metabolichealth/deid/exp/"
                         "mdb=chr-ards-metabolichealth-deid-exp-decrypt/"
                         "foundational_llm_research/large_sensor_model_data/")

    files = gfile.ListDir(os.path.join(root_folder,"BY_SUBJECT_steps"))
    # Drop the first six characters of each name (presumably the "steps_"
    # prefix -- verify against the directory listing).
    ids = [s[6:] for s in files]

    return {
        'train': pipeline | 'GenerateExamples' >> self._generate_examples(ids)
    }

  def _generate_examples(self, ids: list[str]) -> Any:
    """Returns a Beam transform expanding each id into its keyed examples."""
    return beam.Create(ids) | 'GetExampleGenerator' >> beam.FlatMap(
        get_example_generator
    )

In [None]:
from typing import Any

def get_example_generator(user_id: str) -> Any:
  """Yields `(key, example)` pairs for every window of one user.

  Args:
    user_id: Identifier passed to `sensor.window` as a single-element list.

  Yields:
    A key of the form "<random int>_<window index>_<user_id>" and a dict with
    the serialized 'input' and 'mask' arrays of the window. Any exception is
    logged as a warning and ends the generator.
  """

  logging.info('Retrieving Data for user %s', user_id)
  try:
    # Seed from the wall clock (milliseconds) before drawing the key prefixes.
    random.seed(int(time.time() * 1000))
    windows = sensor.window([user_id], '1min', 's')
    for index, (_, window_data) in enumerate(windows):
      prefix = random.randint(0, 1000000000)
      key = '%d_%d_%s' % (prefix, index, user_id)  # get unique key
      serialized_input = _serialize_numpy_array(window_data['input'])
      serialized_mask = _serialize_numpy_array(window_data['mask'])
      yield key, {
          constants.TFExampleKey.INPUT.value: serialized_input,
          constants.TFExampleKey.MASK.value: serialized_mask,
      }
  except Exception as e:  # pylint: disable=broad-except
    logging.warning(
        'Failed to retrieve data for user %s, error: %s', user_id, str(e)
    )


In [None]:
# Demo
demo = survey_phone_data['demographic_questionnaire_responses.csv'].copy()

try:
  demo.set_index(["#study_participant_id"], inplace=True)
except KeyError:
  # The id column is missing, e.g. because it already is the index.
  print('Index already reset.')

joined_df = demo

# BFI Responses
#contents['intake_survey_bfi_questionnaire.csv'] # Questions
bfi = survey_phone_data['intake_survey_bfi_questionnaire_responses.csv']
# BFI-10 scoring key (R = reverse-keyed item):
# Extraversion: 1R, 6 / Agreeableness: 2, 7R / Conscientiousness: 3R, 8 /
# Neuroticism: 4R, 9 / Openness to Experience: 5R, 10
mapping = {'Disagree strongly': 1, 'Disagree a little': 2, 'Neither agree nor disagree': 3, 'Agree a little': 4, 'Agree strongly': 5}
for c in range(1,11):
  bfi['intake_survey_-_bfi-10_q'+str(c)+'_group_score'] = bfi['intake_survey_-_bfi-10_q'+str(c)+'_group'].map(mapping)

# Each trait score is (forward-keyed item) - (reverse-keyed item).
# Extraversion pairs item 1R with item 6; item 5 belongs to Openness only.
bfi['extraversion_score'] = -bfi['intake_survey_-_bfi-10_q1_group_score'] + bfi['intake_survey_-_bfi-10_q6_group_score']
bfi['agreeableness_score'] = bfi['intake_survey_-_bfi-10_q2_group_score'] - bfi['intake_survey_-_bfi-10_q7_group_score']
bfi['conscientiousness_score'] = -bfi['intake_survey_-_bfi-10_q3_group_score'] + bfi['intake_survey_-_bfi-10_q8_group_score']
bfi['neuroticism_score'] = -bfi['intake_survey_-_bfi-10_q4_group_score'] + bfi['intake_survey_-_bfi-10_q9_group_score']
bfi['openness_score'] = -bfi['intake_survey_-_bfi-10_q5_group_score'] + bfi['intake_survey_-_bfi-10_q10_group_score']

bfi.reset_index(inplace=True)
bfi.set_index(["#study_participant_id"], inplace=True)
joined_df = joined_df.join(bfi, lsuffix='bfi_')

# PHQ-8 responses (intake and completion questionnaires).
# Every answer is mapped to 0..3; the total is the mean item score times 8.
mapping = {'Not at all': 0, 'Several days': 1, 'More than half the days': 2, 'Nearly every day': 3}
phq_items = ['little_interest','depression','sleep','tired','appetite','failure','trouble_concentrating','restlessness']

# (phase, whether the helper 'index' column is dropped before joining)
for phase, drop_index_column in (('intake', False), ('complete', True)):
  phq = survey_phone_data['phq_8_' + phase + '_questionnaire_responses.csv']

  for c in phq_items:
    phq[c + '_' + phase + '_score'] = phq[c].map(mapping)
  columns = [item + '_' + phase + '_score' for item in phq_items]
  total_column = 'phq_' + phase + '_score'
  phq[total_column] = phq[columns].mean(axis=1)*8

  # Keep only the id, the item scores and the total, keyed by participant.
  phq = phq[['#study_participant_id', *columns, total_column]]
  phq = phq.reset_index().set_index(["#study_participant_id"])
  if drop_index_column:
    phq = phq.drop(columns=['index'])
  joined_df = joined_df.join(phq, lsuffix='phq_')


# GAD-7 responses (intake and completion questionnaires).
# Every answer is mapped to 0..3; the total is the mean item score times 7.
mapping = {'Not at all': 0, 'Several days': 1, 'More than half the days': 2,
           'Nearly every day': 3}
gad_items = ['anxiety','cannot_stop_worry','too_much_worry','trouble_relaxing','restlessness','irritability','fear']

for phase in ('intake', 'complete'):
  gad = survey_phone_data['gad_7_' + phase + '_questionnaire_responses.csv'].copy()

  for c in gad_items:
    gad['gad_' + c + '_' + phase + '_score'] = gad[c].map(mapping)
  columns = ['gad_' + item + '_' + phase + '_score' for item in gad_items]
  total_column = 'gad_' + phase + '_score'
  gad[total_column] = gad[columns].mean(axis=1)*7

  # Keep only the id, the item scores and the total, keyed by participant.
  gad = gad[['#study_participant_id', *columns, total_column]]
  gad = gad.reset_index().set_index(["#study_participant_id"])
  gad = gad.drop(columns=['index'])
  joined_df = joined_df.join(gad, lsuffix='gad_')

# Sleep disturbance responses: map each answer to 1..5 per response scale,
# then combine the items into a single score.
sleep_disturbance = survey_phone_data['sleep_disturbance_intake_questionnaire_responses.csv']

sleep_disturbance_scales = [
    ({'Not at all': 1, 'A little bit': 2, 'Somewhat': 3, 'Quite a bit': 4, 'Very much': 5},
     ['restless','satisfied', 'refreshing','trouble_falling_asleep']),
    ({'Never': 1, 'Rarely': 2, 'Sometimes': 3, 'Often': 4, 'Always': 5},
     ['trouble_staying_asleep', 'trouble_sleeping', 'enough_sleep']),
    ({'Very poor': 1, 'Poor': 2, 'Fair': 3, 'Good': 4, 'Very good': 5},
     ['quality']),
]
for mapping, columns in sleep_disturbance_scales:
  for c in columns:
    sleep_disturbance[c+'_score'] = sleep_disturbance[c].map(mapping)

# Positively worded items (satisfied, refreshing, enough_sleep, quality) are
# reversed before summing.
# NOTE(review): items are coded 1..5, so `5 - x` maps to 4..0 rather than the
# usual `6 - x` (5..1); this shifts the total by a constant -- confirm intent.
sd = sleep_disturbance
sleep_disturbance['sleep_disturbance_score'] = (
    sd['restless_score']
    + (5 - sd['satisfied_score'])
    + (5 - sd['refreshing_score'])
    + sd['trouble_falling_asleep_score']
    + sd['trouble_staying_asleep_score']
    + sd['trouble_sleeping_score']
    + (5 - sd['enough_sleep_score'])
    + (5 - sd['quality_score'])
)
sleep_disturbance.reset_index(inplace=True)
sleep_disturbance.set_index(["#study_participant_id"], inplace=True)
joined_df = joined_df.join(sleep_disturbance, lsuffix='sleepdisturbance_')


# Sleep Impairment Responses
# NOTE(review): this is the shared frame from `survey_phone_data` (no copy), so
# the score columns and the index change below are visible through that dict.
sleep_impairment = survey_phone_data['sleep_impairment_intake_questionnaire_responses.csv']

# All eight items use the same five-level intensity wording, scored 1-5.
mapping = {'Not at all': 1, 'A little bit': 2, 'Somewhat': 3, 'Quite a bit': 4, 'Very much': 5}
columns = ['trouble_productivity', 'alertness', 'tiredness',
           'problems', 'trouble_concentrating', 'irritability',
           'sleepy_during_daytime', 'trouble_staying_awake']
for c in columns:
  sleep_impairment[c+'_score'] = sleep_impairment[c].map(mapping)

# A plain sum over all '*_score' columns is not used because `alertness` is
# positively worded and must be reversed. On a 1-5 scale the reverse of x is
# 6 - x (5 <-> 1); the earlier `5 - x` produced 0-4 and lowered the total by
# one point. `+` is used (rather than .sum) so a missing item keeps the total NaN.
sleep_impairment['sleep_impairment_score'] = (
    sleep_impairment['trouble_productivity_score']
    + (6 - sleep_impairment['alertness_score'])
    + sleep_impairment['tiredness_score']
    + sleep_impairment['problems_score']
    + sleep_impairment['trouble_concentrating_score']
    + sleep_impairment['irritability_score']
    + sleep_impairment['sleepy_during_daytime_score']
    + sleep_impairment['trouble_staying_awake_score']
)

# Index by participant id for the join; columns that clash with ones already
# in `joined_df` get the suffix on the left-hand (existing) column.
sleep_impairment.reset_index(inplace=True)
sleep_impairment.set_index(["#study_participant_id"], inplace=True)
joined_df = joined_df.join(sleep_impairment, lsuffix='sleepimpairment_')

# PSS Responses
pss = survey_phone_data['pss_intake_questionnaire_responses.csv']

# mapping_1 scores the negatively worded items (more often -> higher score);
# mapping_2 is the same scale reversed, for the positively worded items.
mapping_1 = {'Never': 0, 'Almost Never': 1, 'Sometimes': 2, 'Fairly Often': 3, 'Very Often': 4}
mapping_2 = {'Never': 4, 'Almost Never': 3, 'Sometimes': 2, 'Fairly Often': 1, 'Very Often': 0}

# 1. In the last month, how often have you been upset because of something that happened unexpectedly?
# 2. In the last month, how often have you felt that you were unable to control the important things in your life?
# 3. In the last month, how often have you felt nervous and stressed?
# 4. In the last month, how often have you felt confident about your ability to handle your personal problems?
# 5. In the last month, how often have you felt that things were going your way?
# 6. In the last month, how often have you found that you could not cope with all the things that you had to do?
# 7. In the last month, how often have you been able to control irritations in your life?
# 8. In the last month, how often have you felt that you were on top of things?
# 9. In the last month, how often have you been angered because of things that happened that were outside of your control?
# 10. In the last month, how often have you felt difficulties were piling up so high that you could not overcome them?

pss_items = ['upset', 'no_control', 'stress', 'handle_personal_problems',
             'things_positive', 'cannot_cope', 'control_irritation',
             'on_top_of_things', 'anger', 'overwhelm']
# Questions 4, 5, 7 and 8 above: scored with the reversed scale.
reversed_items = ('handle_personal_problems', 'things_positive',
                  'control_irritation', 'on_top_of_things')
newcolumns = []
for item in pss_items:
  score_col = 'pss_'+item+'_score'
  scale = mapping_2 if item in reversed_items else mapping_1
  pss[score_col] = pss[item].map(scale)
  newcolumns.append(score_col)

# Total over the ten item scores (pandas' sum skips NaN items).
columns = newcolumns
pss['pss_score'] = pss[columns].sum(axis=1)
columns.extend(['pss_score', '#study_participant_id'])
pss = pss[columns]

# Index by participant id and attach to the running participant table.
pss = pss.reset_index().set_index(["#study_participant_id"])
joined_df = joined_df.join(pss, lsuffix='pss_')

# Plot the number of missing values per column of the joined table.
plt.figure(figsize=(15, 3))
plt.plot(joined_df.isna().sum())
plt.xticks(rotation=90)
plt.show()

# Turn the current index back into a regular column (presumably the
# participant id used for the joins above - confirm) and use an integer
# positional index instead.
joined_df.reset_index(inplace=True)
#joined_df.dropna(inplace=True)
joined_df.index = joined_df.index.astype(int)
# `demo` is an alias of `joined_df`, not a copy: the column added below is
# visible through both names.
demo = joined_df
# Difference between the completion and intake PHQ scores.
demo['phq_delta'] = demo['phq_complete_score'] - demo['phq_intake_score']

# Per Type Processing (Deprecated)

In [None]:
# @title Steps
## Steps:
# Load the compact steps table, expand each row's packed 'steps' array into
# one record per minute, then write the combined table and one CSV per
# participant.
with gfile.Open(os.path.join(root_folder, 'STEPS_COMPACT_DATA.csv.csv'), 'r') as f:
  steps = pd.read_csv(f)
steps.reset_index(inplace=True)
list_arrays = []
for i, row in steps.iterrows():
  tmp = pd.DataFrame()
  # 1440 offsets, one per minute, expressed in milliseconds.
  # Assumes get_arrays(row, 'steps') yields 1440 values - TODO confirm.
  tmp['millis_from_start_time'] = pd.Series(range(0,1440))*1000*60
  tmp['steps'] = get_arrays(row,'steps')
  tmp['participant_id'] = row['participant_id']
  tmp['activity_time'] = row['activity_time']
  tmp['activity_tm_timezone_offset'] = row['activity_tm_timezone_offset']
  list_arrays.append(tmp)

steps = pd.concat(list_arrays, ignore_index=True)
# Drop negative step counts (presumably a missing-data sentinel - confirm).
steps = steps[steps['steps']>=0]
steps['activity_time'] = pd.to_datetime(steps['activity_time'])
# pd.to_timedelta with an explicit unit is used instead of
# astype('timedelta64[m]'), which newer pandas versions may reject.
steps['activity_time_local'] = (
    steps['activity_time']
    + pd.to_timedelta(steps['activity_tm_timezone_offset'], unit='min')
    + pd.to_timedelta(steps['millis_from_start_time'], unit='ms')
)

steps.rename(columns={'activity_time_local': 'DT', 'participant_id': 'ID'}, inplace=True)
steps = steps[['ID','DT','steps']]
# Use a context manager so the output handle is flushed and closed.
with gfile.Open(os.path.join(root_folder,"STEPS_COMPACT_DATA_PROCESSED.csv"), 'w') as f:
  steps.to_csv(f)

WORKER_COUNT = 20
# Make sure the per-participant output directory exists before writing.
gfile.MakeDirs(os.path.join(root_folder,"steps"))

def split_save(L):
  """Write the rows of `steps` whose ID equals `L` to steps/steps_<L>.csv."""
  with gfile.Open(os.path.join(root_folder,"steps","steps_"+str(L)+".csv"), 'w') as f:
    steps[steps['ID']==L].to_csv(f)

L = pd.unique(steps.ID)
begin = time()
with multiprocessing.pool.ThreadPool(WORKER_COUNT) as pool:
  output = list(pool.map(split_save, L))
  pool.close()
  pool.join()
end = time()

In [None]:
# @title Momentary Stress Algorithm

# Load the momentary-stress table and expand each row's packed arrays into
# one record per offset.
# The read mode belongs to gfile.Open; it was previously passed to
# os.path.join, which appended "/r" to the file path.
with gfile.Open(os.path.join(root_folder,"MOMENTARY_STRESS_ALGORITHM_DATA.csv.csv"), 'r') as f:
  msa = pd.read_csv(f)

# Array-valued columns copied verbatim from each input row.
msa_feature_columns = [
    'hrv_shannon_entropy_rr',
    'hrv_shannon_entropy_rrd',
    'hrv_percentage_of_nn_30',
    'ceda_magnitude_real_micro_siemens',
    'ceda_slope_real_micro_siemens',
    'rmssd_percentile_0595',
    'sdnn_percentile_0595',
    'msa_probability',
    'hrv_percent_good',
    'hrv_rr_80th_percentile_mean',
    'hrv_rr_20th_percentile_mean',
    'hrv_rr_median',
    'hrv_rr_mean',
    'hr_at_rest_mean',
    'skin_temperature_magnitude',
    'skin_temperature_slope',
]

list_arrays = []
for i, row in msa.iterrows():
  # Use a separate name for the per-row frame; rebinding `msa` inside a loop
  # over `msa.iterrows()` was confusing.
  tmp = pd.DataFrame()

  # 'offsets' is scaled by 60*1000 (presumably minutes -> milliseconds; confirm).
  tmp['millis_from_start_time'] = get_arrays(row,'offsets')*1000*60
  for feature in msa_feature_columns:
    tmp[feature] = get_arrays(row,feature)
  tmp['participant_id'] = row['participant_id']
  tmp['activity_time'] = row['activity_time']
  tmp['activity_tm_timezone_offset'] = row['activity_tm_timezone_offset']
  list_arrays.append(tmp)

msa = pd.concat(list_arrays, ignore_index=True)

msa['activity_time'] = pd.to_datetime(msa['activity_time'])
# NOTE(review): unlike the steps/EDA/grok cells, no timezone offset is added
# here - confirm 'activity_time' is already local for this table.
msa['activity_time_local'] = msa['activity_time']
msa['activity_time_true'] = msa['activity_time_local'] + pd.to_timedelta(msa['millis_from_start_time'], unit='ms')
msa['minute_today'] = msa['activity_time_true'].dt.hour*60 + msa['activity_time_true'].dt.minute
msa['count'] = 1

In [None]:
# Export the processed momentary-stress columns to a single CSV.
msa.rename(columns={'activity_time_true': 'DT', 'participant_id': 'ID'}, inplace=True)
msa_export_columns = [
    'ID',
    'DT',
    'hrv_shannon_entropy_rr',
    'hrv_shannon_entropy_rrd',
    'hrv_percentage_of_nn_30',
    'ceda_magnitude_real_micro_siemens',
    'ceda_slope_real_micro_siemens',
    'rmssd_percentile_0595',
    'sdnn_percentile_0595',
    'msa_probability',
    'hrv_percent_good',
    'hrv_rr_80th_percentile_mean',
    'hrv_rr_20th_percentile_mean',
    'hrv_rr_median',
    'hrv_rr_mean',
    'hr_at_rest_mean',
    'skin_temperature_magnitude',
    'skin_temperature_slope',
]
# Use a context manager so the output handle is flushed and closed.
with gfile.Open(os.path.join(root_folder,"MOMENTARY_STRESS_ALGORITHM_DATA_PROCESSED.csv"), 'w') as f:
  msa[msa_export_columns].to_csv(f)

In [None]:
WORKER_COUNT = 20
# MakeDirs takes a path; it was previously handed an opened file object.
gfile.MakeDirs(os.path.join(root_folder,"momentary_stress_algorithm"))

def split_save(L):
  """Write the rows of `msa` whose ID equals `L` to that participant's CSV."""
  out_path = os.path.join(root_folder,"momentary_stress_algorithm","momentary_stress_algorithm_"+str(L)+".csv")
  # Context manager guarantees the handle is flushed and closed.
  with gfile.Open(out_path, 'w') as f:
    msa[msa['ID']==L].to_csv(f)

L = pd.unique(msa.ID)
begin = time()
with multiprocessing.pool.ThreadPool(WORKER_COUNT) as pool:
  output = list(pool.map(split_save, L))
  pool.close()
  pool.join()
end = time()

In [None]:
# @title EDA
# Load the continuous EDA table and expand each row's packed arrays into one
# record per sample; rows that fail to expand are reported and skipped.
with gfile.Open("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/CONTINUOUS_EDA_DATA.csv.csv", 'r') as f:
  eda = pd.read_csv(f)
list_arrays = []
for i, row in eda.iterrows():
  # Separate name for the per-row frame; the loop used to rebind `eda` while
  # iterating over it.
  tmp = pd.DataFrame()
  try:
    tmp['millis_from_start_time'] = get_arrays(row,'millis_from_start_time')
    tmp['eda_level_real'] = get_arrays(row,'eda_level_real')
    tmp['leads_contact_counts'] = get_arrays(row,'leads_contact_counts')
    tmp['participant_id'] = row['participant_id']
    tmp['activity_time'] = row['activity_time']
    tmp['activity_tm_timezone_offset'] = row['activity_tm_timezone_offset']
    list_arrays.append(tmp)
  except Exception:
    # Best effort: skip bad rows, but no longer swallow KeyboardInterrupt /
    # SystemExit as the bare `except:` did.
    print('Could not process PID: ' + str(row['participant_id']))

eda = pd.concat(list_arrays, ignore_index=True)

eda['activity_time'] = pd.to_datetime(eda['activity_time'])
# pd.to_timedelta with an explicit unit is used instead of
# astype('timedelta64[m]'), which newer pandas versions may reject.
eda['activity_time_local'] = eda['activity_time'] + pd.to_timedelta(eda['activity_tm_timezone_offset'], unit='min')
eda['activity_time_true'] = eda['activity_time_local'] + pd.to_timedelta(eda['millis_from_start_time'], unit='ms')
eda['minute_today'] = eda['activity_time_true'].dt.hour*60 + eda['activity_time_true'].dt.minute
eda['count'] = 1

In [None]:
# Export the processed EDA columns to a single CSV.
eda.rename(columns={'activity_time_true': 'DT', 'participant_id': 'ID'}, inplace=True)
# Use a context manager so the output handle is flushed and closed.
with gfile.Open("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/CONTINUOUS_EDA_DATA_PROCESSED.csv", 'w') as f:
  eda[['ID',
       'DT',
       'eda_level_real',
       'leads_contact_counts',
       ]].to_csv(f)

In [None]:
WORKER_COUNT = 20
gfile.MakeDirs("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/continuous_eda")

def split_save(L):
  """Write the rows of `eda` whose ID equals `L` to that participant's CSV."""
  # Context manager guarantees the handle is flushed and closed.
  with gfile.Open("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/continuous_eda/continuous_eda_"+str(L)+".csv", 'w') as f:
    eda[eda['ID']==L].to_csv(f)

L = pd.unique(eda.ID)
begin = time()
with multiprocessing.pool.ThreadPool(WORKER_COUNT) as pool:
  output = list(pool.map(split_save, L))
  pool.close()
  pool.join()
end = time()

In [None]:
# @title Skin Temperature
# Load the wrist-temperature table and expand each row's packed array into
# one record per minute.
with gfile.Open("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/WRIST_TEMPERATURE_DATA.csv.csv", 'r') as f:
  skintemp = pd.read_csv(f)
skintemp.reset_index(inplace=True)
list_arrays = []
for i, row in skintemp.iterrows():
  tmp = pd.DataFrame()
  # 1440 offsets, one per minute, expressed in milliseconds.
  # Assumes get_arrays(row, 'wrist_temperatures') yields 1440 values - TODO confirm.
  tmp['millis_from_start_time'] = pd.Series(range(0,1440))*1000*60
  tmp['wrist_temperatures'] = get_arrays(row,'wrist_temperatures')
  tmp['participant_id'] = row['participant_id']
  tmp['activity_time'] = row['activity_time']
  tmp['activity_tm_timezone_offset'] = row['tz_offset_minutes']
  list_arrays.append(tmp)

skintemp = pd.concat(list_arrays, ignore_index=True)
# Drop negative readings (presumably a missing-data sentinel - confirm).
skintemp = skintemp[skintemp['wrist_temperatures']>=0]

skintemp['activity_time'] = pd.to_datetime(skintemp['activity_time'])
# NOTE(review): no timezone offset is added here - confirm 'activity_time' is
# already local for this table.
skintemp['activity_time_local'] = skintemp['activity_time']
skintemp['activity_time_true'] = skintemp['activity_time_local'] + pd.to_timedelta(skintemp['millis_from_start_time'], unit='ms')
skintemp['minute_today'] = skintemp['activity_time_true'].dt.hour*60 + skintemp['activity_time_true'].dt.minute
skintemp['count'] = 1

In [None]:
# Export the processed wrist-temperature columns to a single CSV.
skintemp.rename(columns={'activity_time_true': 'DT', 'participant_id': 'ID'}, inplace=True)
# Use a context manager so the output handle is flushed and closed.
with gfile.Open("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/WRIST_TEMPERATURE_DATA_PROCESSED.csv", 'w') as f:
  skintemp[['ID',
            'DT',
            'wrist_temperatures',
            ]].to_csv(f)

In [None]:
WORKER_COUNT = 20
gfile.MakeDirs("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/wrist_temperatures")

def split_save(L):
  """Write the rows of `skintemp` whose ID equals `L` to that participant's CSV."""
  # Context manager guarantees the handle is flushed and closed.
  with gfile.Open("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/wrist_temperatures/wrist_temperatures_"+str(L)+".csv", 'w') as f:
    skintemp[skintemp['ID']==L].to_csv(f)

L = pd.unique(skintemp.ID)
begin = time()
with multiprocessing.pool.ThreadPool(WORKER_COUNT) as pool:
  output = list(pool.map(split_save, L))
  pool.close()
  pool.join()
end = time()

In [None]:
# @title Sleep Coefficient
# Load the sleep-coefficient table and expand each row's packed arrays into
# one record per minute; rows that fail to expand are reported and skipped.
with gfile.Open("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/SLEEP_COEFFICIENT_COMPACT_DATA.csv.csv", 'r') as f:
  sleepcoefficient = pd.read_csv(f)
sleepcoefficient.reset_index(inplace=True)
list_arrays = []
for i, row in sleepcoefficient.iterrows():
  try:
    tmp = pd.DataFrame()
    # 1440 offsets, one per minute, expressed in milliseconds.
    tmp['millis_from_start_time'] = pd.Series(range(0,1440))*1000*60
    tmp['sleep_coefficient'] = get_arrays(row,'sleep_coefficient')
    # Parsed as booleans, then stored as 0/1 integers.
    tmp['is_on_wrist'] = get_arrays(row,'is_on_wrist',bool).astype('int')
    tmp['participant_id'] = row['participant_id']
    tmp['activity_time'] = row['activity_time']
    tmp['activity_tm_timezone_offset'] = row['tz_offset_minutes']
    list_arrays.append(tmp)
  except Exception:
    # Best effort: skip bad rows, but no longer swallow KeyboardInterrupt /
    # SystemExit as the bare `except:` did.
    print('Could not process PID: ' + str(row['participant_id']))

sleepcoefficient = pd.concat(list_arrays, ignore_index=True)

sleepcoefficient['activity_time'] = pd.to_datetime(sleepcoefficient['activity_time'])
# NOTE(review): no timezone offset is added here - confirm 'activity_time' is
# already local for this table.
sleepcoefficient['activity_time_local'] = sleepcoefficient['activity_time']
sleepcoefficient['activity_time_true'] = sleepcoefficient['activity_time_local'] + pd.to_timedelta(sleepcoefficient['millis_from_start_time'], unit='ms')
sleepcoefficient['minute_today'] = sleepcoefficient['activity_time_true'].dt.hour*60 + sleepcoefficient['activity_time_true'].dt.minute
sleepcoefficient['count'] = 1

In [None]:
# Export the processed sleep-coefficient columns to a single CSV.
sleepcoefficient.rename(columns={'activity_time_true': 'DT', 'participant_id': 'ID'}, inplace=True)
# Use a context manager so the output handle is flushed and closed.
with gfile.Open("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/SLEEP_COEFFICIENT_COMPACT_DATA_PROCESSED.csv", 'w') as f:
  sleepcoefficient[['ID',
                    'DT',
                    'sleep_coefficient',
                    'is_on_wrist',
                    ]].to_csv(f)

In [None]:
# Reload the processed sleep-coefficient table from disk; the context manager
# closes the read handle once parsing is done.
with gfile.Open("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/SLEEP_COEFFICIENT_COMPACT_DATA_PROCESSED.csv", 'r') as f:
  sleepcoefficient = pd.read_csv(f)

In [None]:
WORKER_COUNT = 20
gfile.MakeDirs("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/sleep_coefficient")

def split_save(L):
  """Write the rows of `sleepcoefficient` whose ID equals `L` to that participant's CSV."""
  # Context manager guarantees the handle is flushed and closed.
  with gfile.Open("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/sleep_coefficient/sleep_coefficient_"+str(L)+".csv", 'w') as f:
    sleepcoefficient[sleepcoefficient['ID']==L].to_csv(f)

L = pd.unique(sleepcoefficient.ID)
begin = time()
with multiprocessing.pool.ThreadPool(WORKER_COUNT) as pool:
  output = list(pool.map(split_save, L))
  pool.close()
  pool.join()
end = time()

In [None]:
# @title SpO2
# Load the SpO2 table; its rows are used as-is (no packed arrays to expand).
with gfile.Open("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/ABSOLUTE_SPO2_DATA.csv.csv", 'r') as f:
  spo2 = pd.read_csv(f)
spo2.reset_index(inplace=True)
list_arrays = []

# format='mixed' lets each timestamp string be parsed with its own format.
spo2['activity_time'] = pd.to_datetime(spo2['activity_time'], format='mixed')
# No offsets are applied: local and "true" time are copies of activity_time.
spo2['activity_time_local'] = spo2['activity_time']
spo2['activity_time_true'] = spo2['activity_time_local']
spo2['minute_today'] = spo2['activity_time_true'].dt.hour*60 + spo2['activity_time_true'].dt.minute
spo2['count'] = 1
spo2['valid'] = spo2['valid'].astype('int')

In [None]:
# Export the processed SpO2 columns to a single CSV.
spo2.rename(columns={'activity_time_true': 'DT', 'participant_id': 'ID'}, inplace=True)
# Use a context manager so the output handle is flushed and closed.
with gfile.Open("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/ABSOLUTE_SPO2_DATA_PROCESSED.csv", 'w') as f:
  spo2[['ID',
        'DT',
        'value',
        'confidence',
        'coverage',
        'valid',
        ]].to_csv(f)

In [None]:
WORKER_COUNT = 20
gfile.MakeDirs("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/spo2")

def split_save(L):
  """Write the rows of `spo2` whose ID equals `L` to that participant's CSV."""
  # Context manager guarantees the handle is flushed and closed.
  with gfile.Open("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/spo2/spo2_"+str(L)+".csv", 'w') as f:
    spo2[spo2['ID']==L].to_csv(f)

L = pd.unique(spo2.ID)
begin = time()
with multiprocessing.pool.ThreadPool(WORKER_COUNT) as pool:
  output = list(pool.map(split_save, L))
  pool.close()
  pool.join()
end = time()

In [None]:
# @title Grok
# Load the raw grok feature table; the context manager closes the read handle.
with gfile.Open("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/GROK_FEATURE_DATA.csv.csv", 'r') as f:
  grok = pd.read_csv(f)

In [None]:
#grok.reset_index(inplace=True)
# Expand each row's packed arrays into one record per sample.
# Array-valued columns copied verbatim from each input row.
grok_feature_columns = [
    'jerk_auto',
    'step_count',
    'log_energy',
    'covariance',
    'log_energy_ratio',
    'zero_crossing_std',
    'zero_crossing_avg',
    'axis_mean',
    'altim_std',
    'kurtosis',
]
list_arrays = []
for i, row in grok.iterrows():
  tmp = pd.DataFrame()
  tmp['millis_from_start_time'] = get_arrays(row,'millis_from_start_of_day')
  for feature in grok_feature_columns:
    tmp[feature] = get_arrays(row,feature)
  tmp['participant_id'] = row['participant_id']
  tmp['activity_time'] = row['activity_time']
  tmp['activity_tm_timezone_offset'] = row['activity_tm_timezone_offset']
  list_arrays.append(tmp)

grok = pd.concat(list_arrays, ignore_index=True)

grok['activity_time'] = pd.to_datetime(grok['activity_time'])
# pd.to_timedelta with an explicit unit is used instead of
# astype('timedelta64[m]'), which newer pandas versions may reject.
grok['activity_time_local'] = grok['activity_time'] + pd.to_timedelta(grok['activity_tm_timezone_offset'], unit='min')
grok['activity_time_true'] = grok['activity_time_local'] + pd.to_timedelta(grok['millis_from_start_time'], unit='ms')
grok['minute_today'] = grok['activity_time_true'].dt.hour*60 + grok['activity_time_true'].dt.minute
grok['count'] = 1

In [None]:
# Export the processed grok feature columns to a single CSV.
grok.rename(columns={'activity_time_true': 'DT', 'participant_id': 'ID'}, inplace=True)
# Use a context manager so the output handle is flushed and closed.
with gfile.Open("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/GROK_FEATURE_DATA_PROCESSED.csv", 'w') as f:
  grok[['ID',
        'DT',
        'jerk_auto',
        'step_count',
        'log_energy',
        'covariance',
        'log_energy_ratio',
        'zero_crossing_std',
        'zero_crossing_avg',
        'axis_mean',
        'altim_std',
        'kurtosis',
        ]].to_csv(f)

In [None]:
WORKER_COUNT = 20
gfile.MakeDirs("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/grok")

def split_save(L):
  """Write the rows of `grok` whose ID equals `L` to that participant's CSV."""
  # Context manager guarantees the handle is flushed and closed.
  with gfile.Open("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/grok/grok_"+str(L)+".csv", 'w') as f:
    grok[grok['ID']==L].to_csv(f)

L = pd.unique(grok.ID)
begin = time()
with multiprocessing.pool.ThreadPool(WORKER_COUNT) as pool:
  output = list(pool.map(split_save, L))
  pool.close()
  pool.join()
end = time()

In [None]:
# @title Heart Rate
# Load the raw heart-rate table; the context manager closes the read handle.
with gfile.Open("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/HEART_RATE_DATA.csv.csv", 'r') as f:
  heart_rate = pd.read_csv(f)
# Bare expression so the notebook displays the loaded frame.
heart_rate

In [None]:
# Expand each row's packed 'hr' array into one record per minute, then derive
# the per-sample timestamp and minute-of-day.
heart_rate = heart_rate.reset_index()
list_arrays = []
for _, day_row in heart_rate.iterrows():
  per_minute = pd.DataFrame()
  # 1440 offsets, one per minute, expressed in milliseconds.
  per_minute['millis_from_start_time'] = pd.Series(range(0,1440))*1000*60
  per_minute['hr'] = get_arrays(day_row,'hr')
  # Scalar row metadata is broadcast to every minute record.
  for target, source in (('participant_id', 'participant_id'),
                         ('activity_time', 'activity_time'),
                         ('activity_tm_timezone_offset', 'tz_offset_minutes')):
    per_minute[target] = day_row[source]
  list_arrays.append(per_minute)

heart_rate = pd.concat(list_arrays, ignore_index=True)

heart_rate['activity_time'] = pd.to_datetime(heart_rate['activity_time'])
# No timezone offset is applied in this cell: local time is a copy of
# activity_time.
heart_rate['activity_time_local'] = heart_rate['activity_time']
sample_offset = heart_rate['millis_from_start_time'].astype('timedelta64[ms]')
heart_rate['activity_time_true'] = heart_rate['activity_time_local'] + sample_offset
sample_time = heart_rate['activity_time_true'].dt
heart_rate['minute_today'] = sample_time.hour*60 + sample_time.minute
heart_rate['count'] = 1

In [None]:
# Export the processed heart-rate columns to a single CSV.
heart_rate.rename(columns={'activity_time_true': 'DT', 'participant_id': 'ID'}, inplace=True)
# Use a context manager so the output handle is flushed and closed.
with gfile.Open("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/HEART_RATE_PROCESSED.csv", 'w') as f:
  heart_rate[['ID',
              'DT',
              'hr',
              ]].to_csv(f)

In [None]:
WORKER_COUNT = 20
gfile.MakeDirs("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/heart_rate")

def split_save(L):
  """Write the rows of `heart_rate` whose ID equals `L` to that participant's CSV."""
  # Context manager guarantees the handle is flushed and closed.
  with gfile.Open("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/heart_rate/heart_rate_"+str(L)+".csv", 'w') as f:
    heart_rate[heart_rate['ID']==L].to_csv(f)

L = pd.unique(heart_rate.ID)
begin = time()
with multiprocessing.pool.ThreadPool(WORKER_COUNT) as pool:
  output = list(pool.map(split_save, L))
  pool.close()
  pool.join()
end = time()

In [None]:
# @title Heart Rate Variability
heart_rate_variability = pd.read_csv(gfile.Open("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/HEART_RATE_VARIABILITY_DATA.csv.csv", 'r'))
heart_rate_variability