https://grants.corp.google.com/#/grants?request=20h%2Fchr-ards-dwb-deid-eng-policy:r&reason=b%2F264556558%20-%20DWB%20RQ%20and%20Analysis

https://grants.corp.google.com/#/grants?request=20h%2Fchr-ards-dwb-deid-colab-jobs&reason=b%2F264556558%20-%20DWB%20RQ%20and%20Analysis

https://pantheon.corp.google.com/storage/browser/health-studies-digital-wellbeing-export/processed_data_files_for_ari?pageState=(%22StorageObjectListTable%22:(%22f%22:%22%255B%255D%22))

# Imports

In [None]:
from absl import app
import apache_beam as beam
from ast import literal_eval
from collections.abc import Sequence
import csv
import datetime
import fnmatch
from google3.pyglib import gfile  # This is repeated, you might want to remove one
from colabtools import googlefiles
import json
from matplotlib.colors import ListedColormap
import matplotlib.dates as mdates
from matplotlib.lines import Line2D
from matplotlib.patches import Patch, Rectangle
import matplotlib.pyplot as plt
import multiprocessing
import multiprocessing.pool
import numpy as np
import os
import pathlib
import pandas as pd
import pdb
import random  # This is repeated, you might want to remove one
import seaborn as sns
from scipy import stats
from scipy.stats import shapiro
from scipy.stats import zscore
import statsmodels.api as sm
from statsmodels.graphics.gofplots import qqplot
import tarfile
import tensorflow as tf
import tensorflow_datasets as tfds
from time import sleep, time
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Two numeric constants per feature, keyed by feature name. The keys are the
# same names, in the same order, as FEATURES_TO_INCLUDE.
# NOTE(review): each pair looks like [mean, standard deviation] used for
# z-score normalization — confirm against the code that consumes this table
# (not visible in this part of the notebook).
NORMALIZATION_PARAMETERS = {
    'HR': [82.406911, 13.9461201],
    'eda_level_real': [4.116634, 3.878952961],
    'leads_contact_counts': [230.76297, 52.76303698],
    'steps': [7.952935, 18.53001124],
    'jerk_auto': [203.441044, 33.11101136],
    'step_count': [11.440943, 15.95296346],
    'log_energy': [60.306033, 42.84693899],
    'covariance': [44.81157, 12.63844836],
    'log_energy_ratio': [44.714925, 21.32527317],
    'zero_crossing_std': [160.085565, 28.10161215],
    'zero_crossing_avg': [51.270075, 34.04430198],
    'axis_mean': [119.768427, 23.58453469],
    'altim_std': [0.005178, 0.0581546286],
    'kurtosis': [108.645938, 60.38419486],
    'sleep_coefficient': [8.706734, 4.003582277],
    'wrist_temperatures': [30.921362, 2.817617692],
    'hrv_shannon_entropy_rr': [3.277522, 0.468409277],
    'hrv_shannon_entropy_rrd': [2.974838, 0.4999503109],
    'hrv_percentage_of_nn_30': [0.348379, 0.1961256813],
    'ceda_magnitude_real_micro_siemens': [43.071381, 24.11546345],
    'ceda_slope_real_micro_siemens': [3.294176, 1.828755314],
    'rmssd_percentile_0595': [34.038394, 24.86136018],
    'sdnn_percentile_0595': [44.233053, 25.04521794],
    'msa_probability': [48.120677, 14.23343678],
    'hrv_percent_good': [0.2716, 0.2760073968],
    'hrv_rr_80th_percentile_mean': [821.738396, 105.621134],
    'hrv_rr_20th_percentile_mean': [731.996986, 84.6384433],
    'hrv_rr_median': [776.111350, 90.3199562],
    'hrv_rr_mean': [781.280325, 87.08971004],
    'hr_at_rest_mean': [83.199721, 10.66796299],
    'skin_temperature_magnitude': [26.393339, 10.98900771],
    'skin_temperature_slope': [0.267523, 17.79474941],
}

# Ordered list of the feature columns kept for every windowed session: the
# windowing code adds any of these that are missing as all-NaN columns and then
# selects exactly this list, so the order here is the output column order.
FEATURES_TO_INCLUDE = [
    'HR',
    'eda_level_real',
    'leads_contact_counts',
    'steps',
    'jerk_auto',
    'step_count',
    'log_energy',
    'covariance',
    'log_energy_ratio',
    'zero_crossing_std',
    'zero_crossing_avg',
    'axis_mean',
    'altim_std',
    'kurtosis',
    'sleep_coefficient',
    'wrist_temperatures',
    'hrv_shannon_entropy_rr',
    'hrv_shannon_entropy_rrd',
    'hrv_percentage_of_nn_30',
    'ceda_magnitude_real_micro_siemens',
    'ceda_slope_real_micro_siemens',
    'rmssd_percentile_0595',
    'sdnn_percentile_0595',
    'msa_probability',
    'hrv_percent_good',
    'hrv_rr_80th_percentile_mean',
    'hrv_rr_20th_percentile_mean',
    'hrv_rr_median',
    'hrv_rr_mean',
    'hr_at_rest_mean',
    'skin_temperature_magnitude',
    'skin_temperature_slope'
    ]

In [None]:
# Catalogue of the raw Fitbit tables processed by this notebook. Each entry has:
#   'type'                   short name used in output folder and file names,
#   'raw_file'               base name of the exported raw table,
#   'features_to_extract'    columns de-nested from that table,
#   'timezone_offset_column' column that holds the row's timezone offset.
data = [
    {
        'type': 'steps',
        'raw_file': 'STEPS_COMPACT_DATA',
        'features_to_extract': ['steps'],
        'timezone_offset_column': 'activity_tm_timezone_offset',
    },
    {
        'type': 'momentary_stress_algorithm',
        'raw_file': 'MOMENTARY_STRESS_ALGORITHM_DATA',
        'features_to_extract': [
            'hrv_shannon_entropy_rr',
            'hrv_shannon_entropy_rrd',
            'hrv_percentage_of_nn_30',
            'ceda_magnitude_real_micro_siemens',
            'ceda_slope_real_micro_siemens',
            'rmssd_percentile_0595',
            'sdnn_percentile_0595',
            'msa_probability',
            'hrv_percent_good',
            'hrv_rr_80th_percentile_mean',
            'hrv_rr_20th_percentile_mean',
            'hrv_rr_median',
            'hrv_rr_mean',
            'hr_at_rest_mean',
            'skin_temperature_magnitude',
            'skin_temperature_slope',
        ],
        'timezone_offset_column': 'activity_tm_timezone_offset',
    },
    {
        'type': 'ceda',
        'raw_file': 'CONTINUOUS_EDA_DATA',
        'features_to_extract': ['eda_level_real', 'leads_contact_counts'],
        'timezone_offset_column': 'activity_tm_timezone_offset',
    },
    {
        'type': 'wrist_temperature',
        'raw_file': 'WRIST_TEMPERATURE_DATA',
        'features_to_extract': ['wrist_temperatures'],
        'timezone_offset_column': 'tz_offset_minutes',
    },
    {
        'type': 'sleep_coefficient',
        'raw_file': 'SLEEP_COEFFICIENT_COMPACT_DATA',
        'features_to_extract': ['sleep_coefficient', 'is_on_wrist'],
        'timezone_offset_column': 'tz_offset_minutes',
    },
    {
        'type': 'spo2',
        'raw_file': 'ABSOLUTE_SPO2_DATA',
        'features_to_extract': ['value', 'confidence', 'coverage', 'valid'],
        'timezone_offset_column': 'activity_tm_timezone_offset',
    },
    {
        'type': 'grok',
        'raw_file': 'GROK_FEATURE_DATA',
        'features_to_extract': [
            'jerk_auto',
            'step_count',
            'log_energy',
            'covariance',
            'log_energy_ratio',
            'zero_crossing_std',
            'zero_crossing_avg',
            'axis_mean',
            'altim_std',
            'kurtosis',
        ],
        'timezone_offset_column': 'activity_tm_timezone_offset',
    },
    {
        'type': 'heart_rate',
        'raw_file': 'HEART_RATE_DATA',
        'features_to_extract': ['bpm', 'confidence'],
        'timezone_offset_column': 'activity_tm_timezone_offset',
    },
]

In [None]:
def visualize_features(array_feature):

  fig = plt.figure(figsize=(20, 7))
  ax1 = plt.subplot2grid((1, 12), (0, 0), colspan=12)
  group = array_feature.numpy()

  ax1 = sns.heatmap(group.T, cmap="Reds", cbar=True, linewidths=0.0,
                    linecolor='black', alpha=0.8, ax=ax1, yticklabels=True)

  for tick in ax1.get_xticklabels():
      tick.set_fontname('Ubuntu')
      tick.set_style('italic')
  ax1.tick_params(axis='x', labelsize=10.5)

  for tick in ax1.get_yticklabels():
      tick.set_fontname('Ubuntu')
  ax1.tick_params(axis='y', labelsize=10.5)

  plt.xticks(rotation=45)  # Rotate labels for better readability
  plt.tight_layout()

  ax1.set_ylabel("Feature", fontname='Ubuntu', fontsize=14)

  ax1.axhline(y=0, color='k',linewidth=1, alpha=1)
  ax1.axhline(y=group.shape[1], color='k', alpha=1,linewidth=1)
  ax1.axvline(x=0, color='k',linewidth=1, alpha=1)
  ax1.axvline(x=group.shape[0], color='k', alpha=1,linewidth=1);

  for i in np.arange(0,group.shape[0],60):
    ax1.axvline(x=i, color='k', alpha=0.4,linewidth=1);
  for i in np.arange(0,group.shape[1],1):
    ax1.axhline(y=i, color='k', alpha=0.4,linewidth=1);

  fig.savefig(f'example_heatmap.pdf', format='pdf', bbox_inches="tight")
  %download_file example_heatmap.pdf
  plt.show()

In [None]:
# Folder path for the TFRecord files (version 6 of the dataset, going by the name).
# NOTE(review): not referenced in the visible cells — presumably read or
# written by a later part of the notebook; confirm before changing.
tf_record_folder = "/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_tfrecords_v6"


# STEP 0: Move Data from Selected Snapshot:

## Load First Time:

In [None]:
# Create the working folder that the export cells below write their CSV files into.
gfile.MakeDirs("/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm")

In [None]:
## Import survey and phone data from vico files:
# Every regular member of the last-listed vico archive is parsed as a
# tab-separated table, kept in `survey_phone_data` (keyed by member name) and
# re-exported as CSV into the working folder with an 'x_vico_' prefix.
vico_folder = pathlib.PurePosixPath('/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/raw/vico/dwb')
vico_files = gfile.ListDir(vico_folder)
survey_phone_data = {}
with gfile.Open(vico_folder / vico_files[-1], 'rb') as f:
  # The archive handle is deliberately NOT called `tf`: that name is the
  # notebook's `tensorflow` import and must stay usable after this cell.
  with tarfile.open(fileobj=f, mode='r:gz') as vico_tar:
    members = vico_tar.getmembers()
    for member in members:
      extracted = vico_tar.extractfile(member)
      if extracted is None:
        # extractfile() yields None for members that are not regular files
        # (e.g. directories); there is nothing to parse for those.
        continue
      survey_phone_data[member.name] = (pd.read_csv(extracted, delimiter='\t'))
      with googlefiles.OpenGoogleFiles():
        with open('/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/x_vico_'+member.name, 'w') as fs:
          survey_phone_data[member.name].to_csv(fs)

In [None]:
# Load every table of the last-listed Fitbit snapshot into `fitbit_content`
# (keyed by the listed file name) and write each one back out as
# '<file name>.csv' in the working folder.
fitbit_folder = pathlib.PurePosixPath('/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/raw/fitbit/dwb')
fitbit_snapshots = gfile.ListDir(fitbit_folder)
fitbit_files = gfile.ListDir(fitbit_folder / fitbit_snapshots[-1])

fitbit_content = {}
for data_type in fitbit_files:
  source_path = fitbit_folder / fitbit_snapshots[-1] / data_type
  export_path = '/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/'+data_type+'.csv'
  with gfile.Open(source_path, 'r') as f:
    table = pd.read_csv(f)
    fitbit_content[data_type] = table
    with googlefiles.OpenGoogleFiles(), open(export_path, 'w') as fs:
      table.to_csv(fs)

In [None]:
# Copy the heart-rate table of the snapshot at index -140 of the listing into
# the working folder; the copy is named '<data_type>2.csv'.
data_type = 'HEART_RATE_DATA.csv'
fitbit_folder = pathlib.PurePosixPath('/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/raw/fitbit/dwb')
fitbit_snapshots = gfile.ListDir(fitbit_folder)
fitbit_content = {}
chosen_snapshot = fitbit_snapshots[-140]
print(chosen_snapshot)
gfile.Copy(
    fitbit_folder / chosen_snapshot / data_type,
    '/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/'+data_type+'2.csv',
)

In [None]:
# Copy the continuous-EDA table of the snapshot at index -140 of the listing
# into the working folder; the copy is named '<data_type>2.csv'.
data_type = 'CONTINUOUS_EDA_DATA.csv'
fitbit_folder = pathlib.PurePosixPath('/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/raw/fitbit/dwb')
fitbit_snapshots = gfile.ListDir(fitbit_folder)
fitbit_content = {}
chosen_snapshot = fitbit_snapshots[-140]
print(chosen_snapshot)
gfile.Copy(
    fitbit_folder / chosen_snapshot / data_type,
    '/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm/'+data_type+'2.csv',
)

# STEP 1: Prepare Individual Participant Sessions

In [None]:
# Print, for every data type, how many per-participant files its
# BY_SUBJECT_<type> folder contains.
# NOTE: `root_folder` is assigned in the cell below; run that cell first.
for d in data:
  subject_folder = "BY_SUBJECT_"+d['type']
  file_count = len(gfile.ListDir(os.path.join(root_folder, subject_folder)))
  print(subject_folder, file_count)

In [None]:
# Working folder: holds the exported raw tables and the BY_SUBJECT_<type>
# folders with the per-participant CSV files.
root_folder = "/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm"

In [None]:
def get_arrays(row,column,type=float):
  """Parses a bracketed, comma-separated string cell into a pandas Series.

  The cell is expected to look like '[1,2,3]': the first and last characters
  are dropped and the remainder is split on commas.

  Args:
    row: Mapping-like record (for example a DataFrame row) holding the cell.
    column: Key of the cell inside `row`.
    type: Pass `bool` to parse truth values; any other value parses floats.
      (The name shadows the builtin but is kept so existing calls still work.)

  Returns:
    A boolean Series when `type` is `bool`, otherwise a float Series. If a
    token cannot be converted to float, a message is printed and an empty
    list is returned instead.
  """
  tokens = row[column][1:-1].split(',')
  series = pd.Series(tokens)

  if type == bool:
    def _to_bool(token):
      # `astype(bool)` on strings is True for ANY non-empty token, so
      # 'false' and '0' would come out as True; interpret the text instead.
      text = token.strip().strip('\'"').lower()
      try:
        return float(text) != 0
      except ValueError:
        return text in ('true', 't', 'yes', 'y')

    return series.map(_to_bool).astype(bool)

  try:
    return series.astype(float)
  except (ValueError, TypeError):
    print("Could not convert millis to float in array conversion.")
    return []

In [None]:
def split_save(L):
  """Writes one participant's rows of the current raw table as a flat CSV.

  Relies on three module-level names set by the driver loop that calls it:
  `df` (the raw table), `d` (the entry of `data` being processed) and
  `target_dir` (the output folder). Nothing is written if the output file
  already exists.

  For every type except 'spo2', each raw row holds one day of array-valued
  cells; these are de-nested into one output row per sample, and a local
  timestamp 'DT' is computed from the row's activity time, its timezone offset
  and the per-sample millisecond offset. 'spo2' rows are written as-is with
  parsed timestamps.

  Args:
    L: Participant id to export.
  """
  table = d
  kind = table['type']
  out_path = os.path.join(target_dir,kind+"_"+str(L)+".csv")

  if gfile.Exists(out_path):
    print(L, ' Already exists.')
    return

  # Copy so the column assignments below never write into a view of `df`.
  df2 = df[df['participant_id']==L].copy()

  ## Loop through raw file and de-nest:
  if kind == 'spo2':
    df2['activity_time'] = pd.to_datetime(df2['activity_time'], format='mixed')
    df2['valid'] = df2['valid'].astype('int')
    tmp = df2
    tmp['millis_from_start_time'] = 0
  else:
    list_arrays = []
    for _, row in df2.iterrows():

      tmp = pd.DataFrame()

      # Per-sample offset from the start of the row's day, in milliseconds.
      if kind in ('steps', 'wrist_temperature', 'sleep_coefficient'):
        tmp['millis_from_start_time'] = pd.Series(range(0,1440))*1000*60
      elif kind == 'heart_rate':
        tmp['millis_from_start_time'] = pd.Series(range(0,60*60*24))*1000
      elif kind == 'momentary_stress_algorithm':
        tmp['millis_from_start_time'] = get_arrays(row,'offsets')*1000*60
      elif kind == 'ceda':
        tmp['millis_from_start_time'] = get_arrays(row,'millis_from_start_time')
      elif kind == 'grok':
        tmp['millis_from_start_time'] = get_arrays(row,'millis_from_start_of_day')

      for feature in table['features_to_extract']:
        if feature == 'is_on_wrist':
          tmp[feature] = get_arrays(row,feature,bool)
        else:
          tmp[feature] = get_arrays(row,feature)

      # Row-level metadata is attached per row, so each day keeps its OWN
      # timezone offset rather than the offset of the last row processed.
      tmp['activity_time'] = row['activity_time']
      tmp['participant_id'] = row['participant_id']
      tmp['activity_tm_timezone_offset'] = row[table['timezone_offset_column']]
      list_arrays.append(tmp)

    tmp = pd.concat(list_arrays, ignore_index=True)

    ## Convert time to LOCAL:
    tmp['activity_time'] = pd.to_datetime(tmp['activity_time'])
    # to_timedelta states the unit explicitly and does not depend on which
    # timedelta64 resolutions `astype` accepts.
    tz_offset = pd.to_timedelta(tmp['activity_tm_timezone_offset'], unit='m')
    sample_offset = pd.to_timedelta(tmp['millis_from_start_time'], unit='ms')

    if kind == 'ceda':
      # NOTE(review): the timezone offset is added twice for ceda only. This
      # is kept exactly as the original did it — confirm it is intended.
      tmp['activity_time_local'] = tmp['activity_time'] + tz_offset + tz_offset + sample_offset
    else:
      tmp['activity_time_local'] = tmp['activity_time'] + tz_offset + sample_offset

    ## Rename columns:
    tmp.rename(columns={'activity_time_local': 'DT', 'participant_id': 'ID'}, inplace=True)

    cols = table['features_to_extract'].copy()
    cols.append('ID')
    cols.append('DT')
    tmp = tmp[cols]

  # Context manager so the output handle is always flushed and closed.
  with gfile.Open(out_path, 'w') as out_file:
    tmp.to_csv(out_file)
  print(L, ' successfully saved.')

# Driver: for every raw table, load it and fan the per-participant export out
# over a thread pool. `df`, `d` and `target_dir` are read by split_save as
# module-level names, so they must keep these exact names.
for d in data:

  print(d['type'])

  ## Load raw file:
  raw_path = os.path.join(root_folder, d['raw_file']+'.csv.csv')
  with gfile.Open(raw_path, 'r') as f:
    df = pd.read_csv(f)
  df.reset_index(inplace=True)

  # Nothing to split for an empty table.
  if len(df) == 0:
    continue

  ## Make output dir and save per subject files:
  target_dir = os.path.join(root_folder,"BY_SUBJECT_"+d['type'])
  if not gfile.Exists(target_dir):
    gfile.MakeDirs(target_dir)
  else:
    # An existing folder is kept (not wiped); files already present in it are
    # skipped by split_save.
    print(target_dir)

  WORKER_COUNT = 20
  L = pd.unique(df.participant_id)
  with multiprocessing.pool.ThreadPool(WORKER_COUNT) as pool:
    output = list(pool.map(split_save, L))
    pool.close()
    pool.join()

# STEP 2: Sessionize

In [None]:
#@title Data session class definition

import abc
import dataclasses
import functools as ft
import jaxtyping as jt
from scipy.stats import zscore
from google3.fitbit.research.sensing.common.proto import data_key_pb2

class Sensor(abc.ABC):
  """Base class for the per-sensor tables; provides the shared resampler."""

  @staticmethod
  def resample(timeseries_data, input_timestamp_units='s', output_timestamp_units='1min'):
    """Resamples a dataframe onto a regular time grid using its 't' column.

    The input frame is modified in place: a datetime column 'DT' is derived
    from 't' and the 't' column is dropped.

    Args:
      timeseries_data: A pandas dataframe with a column 't' of timestamps to
        use for resampling.
      input_timestamp_units: Unit passed to `pd.to_datetime` when 't' holds
        numeric timestamps.
      output_timestamp_units: Frequency string of the output grid.

    Returns:
      A pandas dataframe indexed by 'DT' holding, for every output bin, the
      mean of each remaining column.
    """
    timeseries_data['DT'] = pd.to_datetime(
        timeseries_data['t'], unit=input_timestamp_units
    )
    timeseries_data.drop(columns=['t'], inplace=True)
    return timeseries_data.resample(output_timestamp_units, on='DT').mean()


class HeartRate(Sensor):
  """Heart rate sensor data, exposed as `self.hr` (a DataFrame indexed by 'DT').

  Construction paths:
    * `data` is a DataFrame with 'DT' and 'bpm' columns: 'bpm' is renamed to
      'HR' and 'DT' is parsed to datetimes, both on the caller's frame.
    * `sensor_key` is a DataKey present in `data.data`: each session's `bpm`
      values are placed on a 1-second grid of 86400 samples starting at the
      session's `activity_tm` (UTC) and averaged per minute.
    * otherwise an empty table is stored and the problem is printed.

  `input_timestamp_units` and `output_timestamp_units` are accepted but not
  read by this constructor.
  """

  sessions: list

  def __init__(self, data, sensor_key, input_timestamp_units='s', output_timestamp_units='1min'):

    if isinstance(data, pd.DataFrame):
      data.rename(columns={'bpm': 'HR'}, inplace=True)
      data['DT'] = pd.to_datetime(data['DT'])
      self.hr = data[['DT','HR']].set_index('DT')
    elif isinstance(sensor_key, data_key_pb2.DataKey) and sensor_key in data.data.keys():
      sessions = data.data[sensor_key]
      hr = []
      for session in sessions:
        day_start = datetime.datetime.fromtimestamp(session.activity_tm.seconds, tz=datetime.timezone.utc)
        times = pd.date_range(day_start, periods=60*60*24, freq='1s')
        hr.append(pd.DataFrame({'t': times, 'HR': session.bpm}))
      self.hr = Sensor.resample(pd.concat(hr), input_timestamp_units='s', output_timestamp_units='1min')
    else:
      self.hr = pd.DataFrame(columns=['DT',
                                      'HR']).set_index('DT')
      # f-string: `sensor_key` is not necessarily a str, and `+` would raise.
      print(ValueError(f'{sensor_key} not found in data.data.keys()'))



class ContinuousEDA(Sensor):
  """Continuous EDA sensor data, exposed as `self.continuous_eda`.

  The table is a DataFrame indexed by 'DT' with the columns in `_FEATURES`.
  It is built from a DataFrame that already has those columns plus 'DT', or
  from the sessions stored under `sensor_key` in `data.data` (averaged per
  minute); otherwise it is empty and the problem is printed.

  `input_timestamp_units` and `output_timestamp_units` are accepted but not
  read by this constructor.
  """
  sessions: list

  # Value columns shared by all three construction paths.
  _FEATURES = ['eda_level_real',
               'eda_level_imaginary',
               'eda_slope_real',
               'eda_slope_imaginary',
               'leads_contact_counts']

  def __init__(self, data, sensor_key, input_timestamp_units='s', output_timestamp_units='1min'):

    if isinstance(data, pd.DataFrame):
      data['DT'] = pd.to_datetime(data['DT'])
      self.continuous_eda = data[['DT', *self._FEATURES]].set_index('DT')
    elif isinstance(sensor_key, data_key_pb2.DataKey) and sensor_key in data.data.keys():
      sessions = data.data[sensor_key]
      continuous_eda = []
      for session in sessions:
        # Sample time = session start + timezone offset (minutes) + per-sample
        # millisecond offset.
        # NOTE(review): this is the only sensor class here that adds the
        # timezone offset — confirm the tables are meant to be joined like this.
        times = pd.DatetimeIndex([
            datetime.datetime.fromtimestamp(i/1000 + session.activity_tm_timezone_offset*60 + session.activity_tm.seconds, tz=datetime.timezone.utc)
            for i in session.millis_from_start_time
        ])
        day_columns = {'t': times}
        day_columns.update({name: getattr(session, name) for name in self._FEATURES})
        continuous_eda.append(pd.DataFrame(day_columns))
      self.continuous_eda = Sensor.resample(pd.concat(continuous_eda), input_timestamp_units='s', output_timestamp_units='1min')
    else:
      self.continuous_eda = pd.DataFrame(columns=['DT', *self._FEATURES]).set_index('DT')
      # f-string: `sensor_key` is not necessarily a str, and `+` would raise.
      print(ValueError(f'{sensor_key} not found in data.data.keys()'))

class Steps(Sensor):
  """Steps sensor data, exposed as `self.steps` (a DataFrame indexed by 'DT').

  Built from a DataFrame with 'DT' and 'steps' columns, or from the sessions
  stored under `sensor_key` in `data.data`, whose `steps` values are placed on
  a 1-minute grid of 1440 samples starting at the session's `activity_tm`
  (UTC). Otherwise the table is empty and the problem is printed.

  `input_timestamp_units` and `output_timestamp_units` are accepted but not
  read by this constructor.
  """
  sessions: list

  def __init__(self, data, sensor_key, input_timestamp_units='s', output_timestamp_units='1min'):

    if isinstance(data, pd.DataFrame):
      data['DT'] = pd.to_datetime(data['DT'])
      self.steps = data[['DT',
                        'steps']].set_index('DT')
    elif isinstance(sensor_key, data_key_pb2.DataKey) and sensor_key in data.data.keys():
      sessions = data.data[sensor_key]
      steps = []
      for session in sessions:
        day_start = datetime.datetime.fromtimestamp(session.activity_tm.seconds, tz=datetime.timezone.utc)
        times = pd.date_range(day_start, periods=60*24, freq='1min')
        steps.append(pd.DataFrame({'t': times, 'steps': session.steps}))
      self.steps = Sensor.resample(pd.concat(steps), input_timestamp_units='s', output_timestamp_units='1min')
    else:
      self.steps = pd.DataFrame(columns=['DT',
                                         'steps']).set_index('DT')
      # f-string: `sensor_key` is not necessarily a str, and `+` would raise.
      print(ValueError(f'{sensor_key} not found in data.data.keys()'))


class Grok(Sensor):
  """Grok sensor data, exposed as `self.grok` (a DataFrame indexed by 'DT').

  The table carries the columns in `_FEATURES`. It is built from a DataFrame
  that already has those columns plus 'DT', or from session records in
  `data.data` (averaged per minute); otherwise it is empty and the problem is
  printed.

  `input_timestamp_units` and `output_timestamp_units` are accepted but not
  read by this constructor.
  """
  sessions: list

  # Value columns shared by all three construction paths.
  _FEATURES = ['jerk_auto',
               'step_count',
               'log_energy',
               'covariance',
               'log_energy_ratio',
               'zero_crossing_std',
               'zero_crossing_avg',
               'axis_mean',
               'altim_std',
               'kurtosis']

  def __init__(self, data, sensor_key, input_timestamp_units='s', output_timestamp_units='1min'):

    if isinstance(data, pd.DataFrame):
      data['DT'] = pd.to_datetime(data['DT'])
      self.grok = data[['DT', *self._FEATURES]].set_index('DT')
    elif isinstance(sensor_key, data_key_pb2.DataKey) and sensor_key in data.data.keys():
      grok = []
      # NOTE(review): `sensor_key` only gates this branch; the sessions are
      # read from the hard-coded 'grok_feature_data_with_dupes' entry, unlike
      # the classes that read data.data[sensor_key]. Confirm this is intended.
      for session in data.data['grok_feature_data_with_dupes']:
        times = pd.DatetimeIndex([
            datetime.datetime.fromtimestamp(i.seconds, tz=datetime.timezone.utc)
            for i in session.activity_tms
        ])
        day_columns = {'t': times}
        day_columns.update({name: getattr(session, name) for name in self._FEATURES})
        grok.append(pd.DataFrame(day_columns))
      self.grok = Sensor.resample(pd.concat(grok), input_timestamp_units='s', output_timestamp_units='1min')
    else:
      self.grok = pd.DataFrame(columns=['DT', *self._FEATURES]).set_index('DT')
      # f-string: `sensor_key` is not necessarily a str, and `+` would raise.
      print(ValueError(f'{sensor_key} not found in data.data.keys()'))

class SleepCoefficient(Sensor):
  """Sleep coefficient sensor data, exposed as `self.sleep_coefficient`.

  The table is a DataFrame indexed by 'DT' with 'sleep_coefficient' and
  'is_on_wrist' columns. It is built from a DataFrame that already has those
  columns plus 'DT', or from session records in `data.data`, placed on a
  30-second grid of 2880 samples starting at the session's `activity_tm`
  (UTC) and averaged per minute. Otherwise the table is empty and the problem
  is printed.

  `input_timestamp_units` and `output_timestamp_units` are accepted but not
  read by this constructor.
  """
  sessions: list

  def __init__(self, data, sensor_key, input_timestamp_units='s', output_timestamp_units='1min'):

    if isinstance(data, pd.DataFrame):
      data['DT'] = pd.to_datetime(data['DT'])
      self.sleep_coefficient = data[['DT',
                                     'sleep_coefficient',
                                     'is_on_wrist']].set_index('DT')
    elif isinstance(sensor_key, data_key_pb2.DataKey) and sensor_key in data.data.keys():
      sleep_coefficient = []
      # NOTE(review): `sensor_key` only gates this branch; the sessions are
      # read from the hard-coded 'sleep_coefficient_compact' entry, unlike the
      # classes that read data.data[sensor_key]. Confirm this is intended.
      for session in data.data['sleep_coefficient_compact']:
        day_start = datetime.datetime.fromtimestamp(session.activity_tm.seconds, tz=datetime.timezone.utc)
        times = pd.date_range(day_start, periods=60*24*2, freq='30s')
        sleep_coefficient.append(pd.DataFrame({'t': times,
                                               'sleep_coefficient': session.sleep_coefficient,
                                               'is_on_wrist': session.is_on_wrist}))
      self.sleep_coefficient = Sensor.resample(pd.concat(sleep_coefficient), input_timestamp_units='s', output_timestamp_units='1min')
    else:
      self.sleep_coefficient = pd.DataFrame(columns=['DT',
                                        'sleep_coefficient',
                                        'is_on_wrist']).set_index('DT')
      # f-string: `sensor_key` is not necessarily a str, and `+` would raise.
      print(ValueError(f'{sensor_key} not found in data.data.keys()'))

class SkinTemp(Sensor):
  """Skin temperature sensor data, exposed as `self.skin_temp`.

  The table is a DataFrame indexed by 'DT' with a 'wrist_temperatures' column.
  It is built from a DataFrame that already has that column plus 'DT', or from
  the sessions stored under `sensor_key` in `data.data`, placed on a 1-minute
  grid of 1440 samples starting at the session's `activity_tm` (UTC).
  Otherwise the table is empty and the problem is printed.

  `input_timestamp_units` and `output_timestamp_units` are accepted but not
  read by this constructor.
  """
  sessions: list

  def __init__(self, data, sensor_key, input_timestamp_units='s', output_timestamp_units='1min'):

    if isinstance(data, pd.DataFrame):
      data['DT'] = pd.to_datetime(data['DT'])
      self.skin_temp = data[['DT',
                              'wrist_temperatures']].set_index('DT')
    elif isinstance(sensor_key, data_key_pb2.DataKey) and sensor_key in data.data.keys():
      sessions = data.data[sensor_key]
      skin_temp = []
      for session in sessions:
        day_start = datetime.datetime.fromtimestamp(session.activity_tm.seconds, tz=datetime.timezone.utc)
        times = pd.date_range(day_start, periods=60*24, freq='1min')
        skin_temp.append(pd.DataFrame({'t': times, 'wrist_temperatures': session.wrist_temperatures}))
      self.skin_temp = Sensor.resample(pd.concat(skin_temp), input_timestamp_units='s', output_timestamp_units='1min')
    else:
      self.skin_temp = pd.DataFrame(columns=['DT',
                                        'wrist_temperatures']).set_index('DT')
      # f-string: `sensor_key` is not necessarily a str, and `+` would raise.
      print(ValueError(f'{sensor_key} not found in data.data.keys()'))

class MomentaryStressAlgorithm(Sensor):
  """Momentary stress algorithm data, exposed as `self.momentary_stress_algorithm`.

  The table is a DataFrame indexed by 'DT' with the columns in `_FEATURES`.
  It is built from a DataFrame that already has those columns plus 'DT', or
  from the sessions stored under `sensor_key` in `data.data`, where each
  sample time is the session's `activity_tm` plus its entry in `offsets`
  (minutes), in UTC, averaged per minute. Otherwise the table is empty and the
  problem is printed.

  `input_timestamp_units` and `output_timestamp_units` are accepted but not
  read by this constructor.
  """
  sessions: list

  # Value columns shared by all three construction paths.
  _FEATURES = ['hrv_shannon_entropy_rr',
               'hrv_shannon_entropy_rrd',
               'hrv_percentage_of_nn_30',
               'ceda_magnitude_real_micro_siemens',
               'ceda_slope_real_micro_siemens',
               'rmssd_percentile_0595',
               'sdnn_percentile_0595',
               'msa_probability',
               'hrv_percent_good',
               'hrv_rr_80th_percentile_mean',
               'hrv_rr_20th_percentile_mean',
               'hrv_rr_median',
               'hrv_rr_mean',
               'hr_at_rest_mean',
               'skin_temperature_magnitude',
               'skin_temperature_slope']

  def __init__(self, data, sensor_key, input_timestamp_units='s', output_timestamp_units='1min'):

    if isinstance(data, pd.DataFrame):
      data['DT'] = pd.to_datetime(data['DT'])
      self.momentary_stress_algorithm = data[['DT', *self._FEATURES]].set_index('DT')
    elif isinstance(sensor_key, data_key_pb2.DataKey) and sensor_key in data.data.keys():
      sessions = data.data[sensor_key]
      momentary_stress_algorithm = []
      for session in sessions:
        times = pd.DatetimeIndex([
            datetime.datetime.fromtimestamp(i*60 + session.activity_tm.seconds, tz=datetime.timezone.utc)
            for i in session.offsets
        ])
        day_columns = {'t': times}
        day_columns.update({name: getattr(session, name) for name in self._FEATURES})
        momentary_stress_algorithm.append(pd.DataFrame(day_columns))
      self.momentary_stress_algorithm = Sensor.resample(pd.concat(momentary_stress_algorithm), input_timestamp_units='s', output_timestamp_units='1min')
    else:
      self.momentary_stress_algorithm = pd.DataFrame(columns=['DT', *self._FEATURES]).set_index('DT')
      # Reported only on this fallback path, so a successful load neither
      # prints a "not found" message nor fails on a non-str `sensor_key`.
      print(ValueError(f'{sensor_key} not found in data.data.keys()'))


@dataclasses.dataclass(frozen=True)
class ProdSession:
  """Immutable bundle of the per-sensor tables that make up one session."""

  # A session specific identifier for a 24hr period of data collection.
  session_id: str
  # Heart rate table data.
  hr: HeartRate
  # Continuous EDA table data.
  continuous_eda: ContinuousEDA
  # Steps table data.
  steps: Steps
  # Grok table data.
  grok: Grok
  # Sleep Coefficient table data.
  sleep_coefficient: SleepCoefficient
  # Skin Temp table data.
  skin_temp: SkinTemp
  # MSA table data.
  momentary_stress_algorithm: MomentaryStressAlgorithm

  def join(self) -> pd.DataFrame:
    """Outer-joins all sensor tables on 'DT' and blanks the off-wrist rows.

    Returns:
      The merged table, with every column set to NaN on rows where
      `is_on_wrist` equals 0. If the merged table has no 'is_on_wrist'
      column, an empty DataFrame is returned instead.
    """

    def _table(field_name):
      value = getattr(self, field_name)
      # Each sensor wrapper stores its DataFrame under an attribute named like
      # the field (e.g. HeartRate.hr); pd.merge needs that DataFrame, not the
      # wrapper. A plain DataFrame is accepted unchanged.
      if isinstance(value, pd.DataFrame):
        return value
      return getattr(value, field_name)

    field_names = ('hr', 'continuous_eda', 'steps', 'grok',
                   'sleep_coefficient', 'skin_temp', 'momentary_stress_algorithm')
    dfs = [_table(name) for name in field_names]
    session = ft.reduce(lambda left, right: pd.merge(left, right, on='DT', how='outer'), dfs)
    if 'is_on_wrist' not in session.columns:
      return pd.DataFrame()

    session.loc[(session.is_on_wrist == 0), :] = np.nan
    # Per-column z-scoring and clipping to [-3, 3] are disabled here:
    #session = session.apply(lambda col: zscore(col, nan_policy='omit') if col.notna().any() else col)
    #session = session.clip(-3,3)
    return session


In [None]:
root_folder = "/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm"
files = gfile.ListDir(os.path.join(root_folder,"BY_SUBJECT_momentary_stress_algorithm"))

# Each entry of `ids` is the '<participant id>.csv' tail of a file name; it is
# appended to '<type>_' later to locate the same participant's other tables.
# Stripping the known prefix works for ids of any length; the original
# fixed-width slice is kept only as a fallback for unexpected names.
_MSA_FILE_PREFIX = "momentary_stress_algorithm_"
ids = [s[len(_MSA_FILE_PREFIX):] if s.startswith(_MSA_FILE_PREFIX) else s[-9:] for s in files]
print(ids)

# Flat list of every feature name across all tables (duplicates are kept).
all_features = []
for d in data:
  all_features.extend(d['features_to_extract'])

types = [table['type'] for table in data]
cnt=0

def _load_subject_table(table_type: str, subject_file: str) -> pd.DataFrame:
  """Reads one subject's CSV for `table_type`; 'DT' is renamed 't' and parsed."""
  path = os.path.join(
      root_folder, 'BY_SUBJECT_' + table_type, table_type + '_' + subject_file
  )
  # Close the handle as soon as the CSV has been parsed.
  with gfile.Open(path, 'r') as f:
    df = pd.read_csv(f)
  df = df.rename(columns={'DT': 't'})
  df['t'] = pd.to_datetime(df['t'])
  return df


def _clean_table(table_type: str, df: pd.DataFrame) -> pd.DataFrame:
  """Applies the table-specific sentinel/range rules; `df` is modified and returned."""
  if table_type == 'heart_rate':
    # -1 is treated as "no reading".
    df['bpm'] = df['bpm'].mask(df['bpm'] == -1)
  elif table_type == 'ceda':
    df['eda_level_real'] = df['eda_level_real'].clip(lower=0, upper=60)
  elif table_type == 'momentary_stress_algorithm':
    df['ceda_slope_real_micro_siemens'] = (
        df['ceda_slope_real_micro_siemens'].clip(lower=-5, upper=5)
    )
  elif table_type == 'sleep_coefficient':
    df['sleep_coefficient'] = df['sleep_coefficient'].mask(
        df['sleep_coefficient'] == -1
    )
  elif table_type == 'wrist_temperature':
    # Rescale, cap at 41, then drop negative readings (same order as before).
    temps = (df['wrist_temperatures'] / 20000).clip(upper=41)
    df['wrist_temperatures'] = temps.mask(temps < 0)
  elif table_type == 'grok':
    df['altim_std'] = df['altim_std'] / 255
  return df


def window(ids: list[str], window_length: str, timestamp_units: str):
  """Yields normalized, fixed-size feature windows for each subject file.

  For every entry of `ids`, each table listed in the module-level `data` is
  loaded, cleaned, resampled with `Sensor.resample` and outer-merged on 'DT'.
  The merged frame is restricted to `FEATURES_TO_INCLUDE` (absent features
  become all-NaN columns), standardized with `NORMALIZATION_PARAMETERS`,
  clipped to [-5, 5] and grouped into windows of `window_length`.

  Args:
    ids: Per-subject file names, i.e. the suffix appended to "<table>_" when
      the CSV path is built.
    window_length: Frequency string handed to `pd.Grouper(freq=...)`.
    timestamp_units: Forwarded to `Sensor.resample` as
      `output_timestamp_units`.

  Yields:
    `(window_start, example)` pairs. `example` is a dict with 'id' (the entry
    of `ids`), 'input' (window values with NaNs replaced by 0) and 'mask'
    (boolean array, True where the value was missing). Windows with more than
    80% missing values are reported and skipped.
  """
  # NOTE: a window is only emitted when it has exactly this many rows. The
  # count is fixed (168 * 60) and does not follow `window_length` or
  # `timestamp_units`; other settings will yield nothing.
  rows_per_window = 168 * 60
  n_features = len(FEATURES_TO_INCLUDE)

  for subject_file in ids:
    print('ID: ', subject_file)
    dfs = []
    for table in data:
      table_type = table['type']

      try:
        df = _load_subject_table(table_type, subject_file)
      except Exception as err:  # Best effort: skip tables that cannot be read.
        print(f'{table_type} could not be loaded: {err!r}')
        continue

      # Work on an explicit copy so the clean-up never writes into a view.
      df = df[[*table['features_to_extract'], 't']].copy()
      df = _clean_table(table_type, df)

      if len(df) > 0:
        dfs.append(
            Sensor.resample(
                df,
                input_timestamp_units='s',
                output_timestamp_units=timestamp_units,
            )
        )

    if not dfs:
      continue
    session = ft.reduce(
        lambda left, right: pd.merge(left, right, on='DT', how='outer'), dfs
    )

    # Select the model features in a fixed order; missing ones become NaN.
    session = session.reindex(columns=FEATURES_TO_INCLUDE)
    for feature in FEATURES_TO_INCLUDE:
      center = NORMALIZATION_PARAMETERS[feature][0]
      scale = NORMALIZATION_PARAMETERS[feature][1]
      session[feature] = (session[feature] - center) / scale
    session = session.clip(-5, 5)

    for name, group in session.groupby(pd.Grouper(freq=window_length)):
      if group.shape != (rows_per_window, n_features):
        continue
      nan_mask = np.isnan(group.to_numpy())
      if nan_mask.mean() > 0.8:
        print('.   Too much missingness.')
        continue
      yield name, {
          'id': subject_file,
          'input': np.nan_to_num(group),
          'mask': nan_mask,
      }

# Generator over the first 10 subjects; no data is read until it is iterated.
w = window(ids[:10], window_length='168h', timestamp_units='1min')

# STEP 3: Load and Process Labels

In [None]:
# @title Load and Process Surveys
# Demographics form the base table; every questionnaire below is joined onto it
# by participant ID.
with gfile.Open(os.path.join(root_folder, 'x_vico_demographic_questionnaire_responses.csv')) as f:
  demo = pd.read_csv(f)
try:
  demo.set_index(["#study_participant_id"], inplace=True)
except KeyError:
  # Previously this failure was swallowed silently; report it, because the
  # joins below rely on the participant-ID index.
  print("'#study_participant_id' column not found; index left unchanged.")

joined_df = demo

# BFI Responses
with gfile.Open(os.path.join(root_folder, 'x_vico_intake_survey_bfi_questionnaire_responses.csv')) as f:
  bfi = pd.read_csv(f)
# Code BFI (R = reverse-keyed item):
# Extraversion: 1R, 6 / Agreeableness: 2, 7R / Conscientiousness: 3R, 8 /
# Neuroticism: 4R, 9 / Openness to Experience: 5R, 10
mapping = {'Disagree strongly': 1, 'Disagree a little': 2, 'Neither agree nor disagree': 3, 'Agree a little': 4, 'Agree strongly': 5}
for c in range(1, 11):
  bfi[f'intake_survey_-_bfi-10_q{c}_group_score'] = bfi[f'intake_survey_-_bfi-10_q{c}_group'].map(mapping)

# trait -> (positively keyed item, reverse-keyed item). Each trait score is
# "keyed minus reversed". Extraversion previously used item 5, which is also
# Openness's reverse-keyed item, and item 6 was never used.
bfi_key = {
    'extraversion': (6, 1),
    'agreeableness': (2, 7),
    'conscientiousness': (8, 3),
    'neuroticism': (9, 4),
    'openness': (10, 5),
}
for trait, (keyed_item, reversed_item) in bfi_key.items():
  bfi[trait + '_score'] = (
      bfi[f'intake_survey_-_bfi-10_q{keyed_item}_group_score']
      - bfi[f'intake_survey_-_bfi-10_q{reversed_item}_group_score']
  )

# Index by participant directly; this avoids carrying the old positional index
# into the joined table as a stray 'index' column.
bfi = bfi.set_index(["#study_participant_id"])
joined_df = joined_df.join(bfi, lsuffix='bfi_')

# PHQ Responses (intake, then completion)
# Both questionnaires share the same items and answer scale, so they are scored
# by one loop. The total is the mean of the answered items times 8, so skipped
# items are prorated rather than counted as 0.
phq_items = ['little_interest','depression','sleep','tired','appetite','failure','trouble_concentrating','restlessness']
mapping = {'Not at all': 0, 'Several days': 1, 'More than half the days': 2, 'Nearly every day': 3}
for phase in ('intake', 'complete'):
  with gfile.Open(os.path.join(root_folder, f'x_vico_phq_8_{phase}_questionnaire_responses.csv')) as f:
    phq = pd.read_csv(f)

  columns = []
  for c in phq_items:
    phq[f'{c}_{phase}_score'] = phq[c].map(mapping)
    columns.append(f'{c}_{phase}_score')
  phq[f'phq_{phase}_score'] = phq[columns].mean(axis=1)*8

  # Keep only the ID and score columns, indexed by participant. Setting the
  # index directly means neither phase adds a positional 'index' column
  # (previously the intake phase did).
  phq = phq[['#study_participant_id', *columns, f'phq_{phase}_score']]
  phq = phq.set_index(["#study_participant_id"])
  joined_df = joined_df.join(phq, lsuffix='phq_')


# GAD Responses (intake, then completion)
# Each phase is read, its items mapped to 0-3, totalled as mean * 7, reduced to
# the ID and score columns, and joined onto `joined_df` by participant ID.
gad_items = ['anxiety','cannot_stop_worry','too_much_worry','trouble_relaxing','restlessness','irritability','fear']
for phase in ('intake', 'complete'):
  gad = pd.read_csv(gfile.Open(os.path.join(root_folder, 'x_vico_gad_7_' + phase + '_questionnaire_responses.csv')))
  mapping = {
      'Not at all': 0,
      'Several days': 1,
      'More than half the days': 2,
      'Nearly every day': 3,
  }

  columns = []
  for c in gad_items:
    score_name = 'gad_' + c + '_' + phase + '_score'
    gad[score_name] = gad[c].map(mapping)
    columns.append(score_name)

  total_name = 'gad_' + phase + '_score'
  gad[total_name] = gad[columns].mean(axis=1)*7

  # Equivalent to reset_index / set_index / drop('index') on the selection.
  gad = gad[['#study_participant_id', *columns, total_name]]
  gad = gad.set_index(["#study_participant_id"])
  joined_df = joined_df.join(gad, lsuffix='gad_')

# Sleep disturbance Responses
sleep_disturbance = pd.read_csv(gfile.Open(os.path.join(root_folder,'x_vico_sleep_disturbance_intake_questionnaire_responses.csv')))

# Each answer scale, paired with the items that use it.
answer_scales = [
    ({'Not at all': 1, 'A little bit': 2, 'Somewhat': 3, 'Quite a bit': 4, 'Very much': 5},
     ['restless', 'satisfied', 'refreshing', 'trouble_falling_asleep']),
    ({'Never': 1, 'Rarely': 2, 'Sometimes': 3, 'Often': 4, 'Always': 5},
     ['trouble_staying_asleep', 'trouble_sleeping', 'enough_sleep']),
    ({'Very poor': 1, 'Poor': 2, 'Fair': 3, 'Good': 4, 'Very good': 5},
     ['quality']),
]
for scale, items in answer_scales:
  for item in items:
    sleep_disturbance[item + '_score'] = sleep_disturbance[item].map(scale)

# Positively worded items (satisfied, refreshing, enough_sleep, quality) are
# flipped before summing.
# NOTE(review): the flip is `5 - x` on a 1-5 scale, giving 0-4; `6 - x` would
# keep the 1-5 range. The total is therefore offset by a constant - confirm
# this is intended.
sleep_disturbance['sleep_disturbance_score'] = (
    sleep_disturbance['restless_score']
    + (5 - sleep_disturbance['satisfied_score'])
    + (5 - sleep_disturbance['refreshing_score'])
    + sleep_disturbance['trouble_falling_asleep_score']
    + sleep_disturbance['trouble_staying_asleep_score']
    + sleep_disturbance['trouble_sleeping_score']
    + (5 - sleep_disturbance['enough_sleep_score'])
    + (5 - sleep_disturbance['quality_score'])
)
sleep_disturbance = sleep_disturbance.reset_index().set_index(["#study_participant_id"])
joined_df = joined_df.join(sleep_disturbance, lsuffix='sleepdisturbance_')

# Sleep Impairment Responses
sleep_impairment = pd.read_csv(gfile.Open(os.path.join(root_folder,'x_vico_sleep_impairment_intake_questionnaire_responses.csv')))

impairment_scale = {'Not at all': 1, 'A little bit': 2, 'Somewhat': 3, 'Quite a bit': 4, 'Very much': 5}
impairment_items = [
    'trouble_productivity', 'alertness', 'tiredness', 'problems',
    'trouble_concentrating', 'irritability', 'sleepy_during_daytime',
    'trouble_staying_awake',
]
for item in impairment_items:
  sleep_impairment[item + '_score'] = sleep_impairment[item].map(impairment_scale)

# Only 'alertness' is flipped before summing.
# NOTE(review): the flip is `5 - x` on a 1-5 scale (result 0-4); `6 - x` would
# keep the 1-5 range - confirm this is intended.
sleep_impairment['sleep_impairment_score'] = (
    sleep_impairment['trouble_productivity_score']
    + (5 - sleep_impairment['alertness_score'])
    + sleep_impairment['tiredness_score']
    + sleep_impairment['problems_score']
    + sleep_impairment['trouble_concentrating_score']
    + sleep_impairment['irritability_score']
    + sleep_impairment['sleepy_during_daytime_score']
    + sleep_impairment['trouble_staying_awake_score']
)

sleep_impairment = sleep_impairment.reset_index().set_index(["#study_participant_id"])
joined_df = joined_df.join(sleep_impairment, lsuffix='sleepimpairment_')

# PSS Responses
with gfile.Open(os.path.join(root_folder, 'x_vico_pss_intake_questionnaire_responses.csv')) as f:
  pss = pd.read_csv(f)

# mapping_1 scores negatively worded items; mapping_2 is the same scale reversed
# and is applied to the positively worded items (4, 5, 7 and 8 below).
mapping_1 = {'Never': 0, 'Almost Never': 1, 'Sometimes': 2, 'Fairly Often': 3, 'Very Often': 4}
mapping_2 = {'Never': 4, 'Almost Never': 3, 'Sometimes': 2, 'Fairly Often': 1, 'Very Often': 0}

# 1. In the last month, how often have you been upset because of something that happened unexpectedly?
# 2. In the last month, how often have you felt that you were unable to control the important things in your life?
# 3. In the last month, how often have you felt nervous and stressed?
# 4. In the last month, how often have you felt confident about your ability to handle your personal problems?
# 5. In the last month, how often have you felt that things were going your way?
# 6. In the last month, how often have you found that you could not cope with all the things that you had to do?
# 7. In the last month, how often have you been able to control irritations in your life?
# 8. In the last month, how often have you felt that you were on top of things?
# 9. In the last month, how often have you been angered because of things that happened that were outside of your control?
# 10. In the last month, how often have you felt difficulties were piling up so high that you could not overcome them?

pss_items = ['upset','no_control','stress','handle_personal_problems','things_positive','cannot_cope','control_irritation','on_top_of_things','anger','overwhelm']
reverse_scored = {'handle_personal_problems','things_positive','control_irritation','on_top_of_things'}
newcolumns = []
for c in pss_items:
  scale = mapping_2 if c in reverse_scored else mapping_1
  pss['pss_'+c+'_score'] = pss[c].map(scale)
  newcolumns.append('pss_'+c+'_score')

# min_count=1: a respondent with no mappable answer gets NaN instead of a
# fabricated total of 0. Partially answered rows are still summed over the
# answered items, as before.
pss['pss_score'] = pss[newcolumns].sum(axis=1, min_count=1)
columns = newcolumns + ['pss_score', '#study_participant_id']

# Index by participant directly (no stray positional 'index' column).
pss = pss[columns].set_index(["#study_participant_id"])
joined_df = joined_df.join(pss, lsuffix='pss_')

# Plot the number of missing values per column of the joined survey table.
missing_per_column = joined_df.isna().sum()
plt.figure(figsize=(25, 3))
plt.plot(missing_per_column)
plt.xticks(rotation=90)
plt.show()

# Move the participant ID back into a column and use an integer row index.
joined_df.reset_index(inplace=True)
joined_df.index = joined_df.index.astype(int)
demo = joined_df
# Change in PHQ total between intake and completion.
demo['phq_delta'] = demo['phq_complete_score'].sub(demo['phq_intake_score'])

# STEP 4: Create TFrecords

In [None]:
import logging
import time
import enum

def _bytes_feature(value):
  """Wraps one string / bytes value (or an eager tensor of one) as a Feature."""
  eager_tensor_type = type(tf.constant(0))
  if isinstance(value, eager_tensor_type):
    # BytesList won't unpack a string from an EagerTensor.
    value = value.numpy()
  payload = tf.train.BytesList(value=[value])
  return tf.train.Feature(bytes_list=payload)

def _float_feature(value):
  """Wraps one float / double as a single-element float_list Feature."""
  payload = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=payload)

def _int64_feature(value):
  """Wraps one bool / enum / int / uint as a single-element int64_list Feature."""
  payload = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=payload)

def numpy_example(array, labels):
  """Builds a `tf.train.Example` from one data array and its integer labels.

  Args:
    array: Data to store; it is serialized with `tf.io.serialize_tensor` under
      the 'array_raw' key.
    labels: Indexable of at least 8 integers, read positionally in the order
      of `label_names` below.

  Returns:
    A `tf.train.Example` with one int64 feature per label plus 'array_raw'.
  """
  label_names = (
      'phq_intake_score',
      'gad_intake_score',
      'pss_score',
      'sleep_disturbance_score',
      'sleep_impairment_score',
      'extraversion_score',
      'age',
      'gender_group',
  )
  feature = {
      name: _int64_feature(labels[position])
      for position, name in enumerate(label_names)
  }
  feature['array_raw'] = _bytes_feature(tf.io.serialize_tensor(array))
  return tf.train.Example(features=tf.train.Features(feature=feature))

gfile.MakeDirs(tf_record_folder)

# Collapse the free-form gender answers into a few clusters. Answers not listed
# here (including '<skipped>') map to NaN.
mapping = {'Female': 'Female', 'Male': 'Male', 'Genderqueer/Gender Non Conforming': 'Queer', 'Trans Female/Trans Woman': 'Trans', 'Trans Male/Trans Man': 'Trans', 'Different Identity': 'Trans'}
joined_df['gender_clustered'] = joined_df['gender'].map(mapping)
codes, uniques = pd.factorize(joined_df['gender_clustered'])
print(uniques)
# factorize encodes missing values as -1. Store NaN for those rows so a skipped
# or unmapped answer stays "missing" rather than becoming class -1. Previously
# the NaN written for '<skipped>' was overwritten by the raw codes.
joined_df['gender_group'] = np.where(codes == -1, np.nan, codes)

# Iterate over the dataset and write each example to the TFRecord file
# (one file per subject, one record per window).

# Label columns, in the positional order `numpy_example` expects.
LABEL_COLUMNS = (
    'phq_intake_score',
    'gad_intake_score',
    'pss_score',
    'sleep_disturbance_score',
    'sleep_impairment_score',
    'extraversion_score',
    'age',
    'gender_group',
)

for i in ids:
  subject_id = i[0:-4]  # Drop the trailing 4 characters (e.g. '.csv').
  w = window([i], '168h', '1min')
  output_file = 'dwb_' + subject_id + '.tfrecords'

  # Labels are per subject, so look them up once instead of once per window.
  try:
    subject_rows = joined_df[joined_df['#study_participant_id'] == int(subject_id)]
    labels = [int(subject_rows[column].values[0]) for column in LABEL_COLUMNS]
  except (IndexError, KeyError, TypeError, ValueError, OverflowError):
    # No row for this subject, a missing column, or a NaN / non-numeric label.
    labels = None

  with tf.io.TFRecordWriter(os.path.join(tf_record_folder, output_file)) as writer:
    for key, result in w:
      print(key)
      if labels is None:
        print('Label missing.')
        continue
      tf_example = numpy_example(result['input'], labels)
      writer.write(tf_example.SerializeToString())
  print(f'TFRecord file created: {os.path.join(tf_record_folder,output_file)}')

In [None]:
# Print every remaining window of the generator `w`: its start key, then its
# payload dict. (The loop variable is `result`; `value` was never bound here.)
for key, result in w:
  print(key)
  print(result)

In [None]:
# Integer label features, matching the keys written by `numpy_example`.
_LABEL_FEATURE_NAMES = (
    'phq_intake_score',
    'gad_intake_score',
    'pss_score',
    'sleep_disturbance_score',
    'sleep_impairment_score',
    'extraversion_score',
    'age',
    'gender_group',
)


def parse_tfrecord(example):
  """Parses one serialized example into `(label, array_feature)`.

  Args:
    example: A serialized `tf.train.Example` as written by `numpy_example`.

  Returns:
    label: Dict mapping each name in `_LABEL_FEATURE_NAMES` to its int64 value.
    array_feature: The tensor stored under 'array_raw', decoded as tf.double.
  """
  feature = {
      name: tf.io.FixedLenFeature([], tf.int64) for name in _LABEL_FEATURE_NAMES
  }
  feature['array_raw'] = tf.io.FixedLenFeature([], tf.string)
  parsed = tf.io.parse_single_example(example, feature)
  array_feature = tf.io.parse_tensor(parsed['array_raw'], out_type=tf.double)
  label = {name: parsed[name] for name in _LABEL_FEATURE_NAMES}
  return label, array_feature

# Create a TFRecordDataset
dataset = tf.data.TFRecordDataset(["/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_tfrecords/dwb_18457.tfrecords"])
dataset = dataset.map(parse_tfrecord)

# Iterate over the dataset
for label, array_feature in dataset:
  visualize_features(array_feature)
  print(label)

In [None]:
# Re-display a single record. This relies on `array_feature` and `label` still
# being bound from an earlier cell's loop over `dataset`.
visualize_features(array_feature)
print(label)

In [None]:
# @title Visualize

root_folder = "/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_data_lsm"
files = gfile.ListDir(os.path.join(root_folder, "BY_SUBJECT_momentary_stress_algorithm"))
# Show every available subject file ID (last 9 characters of each name)...
ids = [file_name[-9:] for file_name in files]
print(ids)

# ...but only visualize this one subject.
ids = ['18780.csv']
w = window(ids, window_length='168h', timestamp_units='1min')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
from matplotlib.patches import Rectangle
import matplotlib.dates as mdates




# Display names used to relabel feature columns before plotting.
# Note that 'steps' and 'step_count' share the same display name.
featureNameMapping = {
    'HR': 'Heart Rate',
    'eda_level_real': 'SCL',
    'leads_contact_counts': 'Leads Contact Cnt.',
    'steps': 'Steps (Min)',
    'jerk_auto': 'Acc. Jerk',
    'step_count': 'Steps (Min)',
    'log_energy': 'Acc. Log Energy',
    'covariance': 'Acc. Covariance',
    'log_energy_ratio': 'Acc. Log Energy Ratio',
    'zero_crossing_std': 'Acc. Zero Cross Std.',
    'zero_crossing_avg': 'Acc. Zero Cross Avg.',
    'axis_mean': 'Acc. Axis Mean',
    'altim_std': 'Altimeter Std.',
    'kurtosis': 'Acc. Kurtosis',
    'sleep_coefficient': 'Sleep Prob.',
    'wrist_temperatures': 'Skin Temperature',
    'hrv_shannon_entropy_rr': 'HRV Shannon Ent.',
    'hrv_shannon_entropy_rrd': 'HRV Shannon Ent. Diffs.',
    'hrv_percentage_of_nn_30': 'HRV % NN30',
    'ceda_magnitude_real_micro_siemens': 'SCL Mag.',
    'ceda_slope_real_micro_siemens': 'SCL Slope',
    'rmssd_percentile_0595': 'HRV RMSSD',
    'sdnn_percentile_0595': 'HRV SDNN',
    'msa_probability': 'MSA Prob.',
    'hrv_percent_good': 'HRV % Good',
    'hrv_rr_80th_percentile_mean': 'HRV RR 80th',
    'hrv_rr_20th_percentile_mean': 'HRV RR 20th',
    'hrv_rr_median': 'HRV RR Median',
    'hrv_rr_mean': 'HRV RR Mean',
    'hr_at_rest_mean': 'Resting Heart Rate',
    'skin_temperature_magnitude': 'Skin Temp. Mag',
    'skin_temperature_slope': 'Skin Temp. Slope',
}

for x in w:

  fig = plt.figure(figsize=(20, 7))
  ax1 = plt.subplot2grid((1, 12), (0, 0), colspan=12)

  #ax1.set_title('Antimicrobial resistance genes',size=12, pad=30, fontname='Ubuntu')

  #x[1]['input'].index = x[1]['input'].index.round('H')
  #x[1]['input'].index = x[1]['input'].index.floor('5T')
  group = x[1]['input']#.groupby(by='DT').mean()
  group.rename(columns=featureNameMapping, inplace=True)
  group = group.sort_index(axis=1)

  ax1 = sns.heatmap(group.T, cmap="Reds", cbar=True, linewidths=0.0,
                    linecolor='black', alpha=0.8, ax=ax1, yticklabels=True)

  for tick in ax1.get_xticklabels():
      tick.set_fontname('Ubuntu')
      tick.set_style('italic')
  ax1.tick_params(axis='x', labelsize=10.5)

  for tick in ax1.get_yticklabels():
      tick.set_fontname('Ubuntu')
  ax1.tick_params(axis='y', labelsize=10.5)

  #date_format = mdates.DateFormatter('%Y-%m-%d')
  #ax1.xaxis.set_major_formatter(date_format)
  plt.xticks(rotation=45)  # Rotate labels for better readability
  plt.tight_layout()

  ax1.set_ylabel("Feature", fontname='Ubuntu', fontsize=14)

  ax1.axhline(y=0, color='k',linewidth=1, alpha=1)
  ax1.axhline(y=group.shape[1], color='k', alpha=1,linewidth=1)
  ax1.axvline(x=0, color='k',linewidth=1, alpha=1)
  ax1.axvline(x=group.shape[0], color='k', alpha=1,linewidth=1);

  for i in np.arange(0,group.shape[0],60):
    ax1.axvline(x=i, color='k', alpha=0.4,linewidth=1);
  for i in np.arange(0,group.shape[1],1):
    ax1.axhline(y=i, color='k', alpha=0.4,linewidth=1);
  ax1.set_xticklabels([])

  fig.savefig(f'example_heatmap.png', format='png', bbox_inches="tight")
  %download_file example_heatmap.png
  plt.show()
  break


  '''
  group = x[1]['input']
  fig, ax = plt.subplots(figsize=(25,5))
  x_lims = [group.index[0], group.index[-1]]
  y_lims = [0, len(group.columns)]
  im = ax.imshow(np.flip(np.flip(group.to_numpy()[0:-1,:].T,axis=0),axis=1), interpolation='nearest', aspect='auto', extent = [x_lims[0], x_lims[1],  y_lims[0], y_lims[1]])
  ax.set_yticks(range(0,len(group.columns)))
  ax.set_yticklabels(group.columns.to_list())

  ax.axhline(y=0, color='k',linewidth=1)
  ax.axhline(y=group.shape[1], color='k',linewidth=1)
  ax.axvline(x=0, color='k',linewidth=1)
  ax.axvline(x=group.shape[1], color='k',linewidth=1);

  fig.colorbar(im)
  plt.show()
  '''