**To run the experiments for DWB use this AoD grant:**

https://grants.corp.google.com/#/grants?request=20h%2Fchr-ards-dwb-deid-eng-policy:r&reason=b%2F264556558%20-%20DWB%20RQ%20and%20Analysis


**To run the experiments for Metabolics use this AoD grant:**

https://grants.corp.google.com/#/grants?request=20h%2Fchr-ards-metabolichealth-deid-eng-team-sphinx:r&reason=b%2F283774208

**To run the experiments for Fitbit Prod use this AoD grant:**

https://grants.corp.google.com/#/grants?request=20h%2Fchr-ards-fitbit-prod-research-deid-eng-team:r&reason=%22b%2F285178698%22



# Imports

In [None]:
from absl import app
from ast import literal_eval
from collections.abc import Sequence
import csv
import datetime
import fnmatch
from google3.pyglib import gfile  # This is repeated, you might want to remove one
from colabtools import googlefiles
import json
from matplotlib.colors import ListedColormap
import matplotlib.dates as mdates
from matplotlib.lines import Line2D
from matplotlib.patches import Patch, Rectangle
import matplotlib.pyplot as plt
import multiprocessing
import multiprocessing.pool
import numpy as np
import os
import pathlib
import pandas as pd
import pdb
import random  # This is repeated, you might want to remove one
import seaborn as sns
from scipy import stats
from scipy.stats import shapiro
from scipy.stats import zscore
import statsmodels.api as sm
from statsmodels.graphics.gofplots import qqplot
import tarfile
import tensorflow as tf
import tensorflow_datasets as tfds
from time import sleep, time
from google3.pyglib import gfile

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# One spec per raw source: its type tag, the raw file name, the columns to
# extract from it, and the column that carries the timezone offset.
data = [
    {
        'type': 'steps',
        'raw_file': 'STEPS_COMPACT_DATA',
        'features_to_extract': ['steps'],
        'timezone_offset_column': 'activity_tm_timezone_offset',
    },
    {
        'type': 'momentary_stress_algorithm',
        'raw_file': 'MOMENTARY_STRESS_ALGORITHM_DATA',
        'features_to_extract': [
            'hrv_shannon_entropy_rr',
            'hrv_shannon_entropy_rrd',
            'hrv_percentage_of_nn_30',
            'ceda_magnitude_real_micro_siemens',
            'ceda_slope_real_micro_siemens',
            'rmssd_percentile_0595',
            'sdnn_percentile_0595',
            'msa_probability',
            'hrv_percent_good',
            'hrv_rr_80th_percentile_mean',
            'hrv_rr_20th_percentile_mean',
            'hrv_rr_median',
            'hrv_rr_mean',
            'hr_at_rest_mean',
            'skin_temperature_magnitude',
            'skin_temperature_slope',
        ],
        'timezone_offset_column': 'activity_tm_timezone_offset',
    },
    {
        'type': 'ceda',
        'raw_file': 'CONTINUOUS_EDA_DATA',
        'features_to_extract': ['eda_level_real', 'leads_contact_counts'],
        'timezone_offset_column': 'activity_tm_timezone_offset',
    },
    {
        'type': 'wrist_temperature',
        'raw_file': 'WRIST_TEMPERATURE_DATA',
        'features_to_extract': ['wrist_temperatures'],
        'timezone_offset_column': 'tz_offset_minutes',
    },
    {
        'type': 'sleep_coefficient',
        'raw_file': 'SLEEP_COEFFICIENT_COMPACT_DATA',
        'features_to_extract': ['sleep_coefficient', 'is_on_wrist'],
        'timezone_offset_column': 'tz_offset_minutes',
    },
    {
        'type': 'spo2',
        'raw_file': 'ABSOLUTE_SPO2_DATA',
        'features_to_extract': ['value', 'confidence', 'coverage', 'valid'],
        'timezone_offset_column': 'activity_tm_timezone_offset',
    },
    {
        'type': 'grok',
        'raw_file': 'GROK_FEATURE_DATA',
        'features_to_extract': [
            'jerk_auto',
            'step_count',
            'log_energy',
            'covariance',
            'log_energy_ratio',
            'zero_crossing_std',
            'zero_crossing_avg',
            'axis_mean',
            'altim_std',
            'kurtosis',
        ],
        'timezone_offset_column': 'activity_tm_timezone_offset',
    },
    {
        'type': 'heart_rate',
        'raw_file': 'HEART_RATE_DATA',
        'features_to_extract': ['bpm', 'confidence'],
        'timezone_offset_column': 'activity_tm_timezone_offset',
    },
]

# Feature names; other cells use the position of a name in this list as the
# column index into the per-example feature array, so the order matters.
FEATURES_TO_INCLUDE = [
    'HR', 'eda_level_real', 'leads_contact_counts', 'steps',
    'jerk_auto', 'log_energy', 'covariance', 'log_energy_ratio',
    'zero_crossing_std', 'zero_crossing_avg', 'axis_mean', 'altim_std',
    'kurtosis', 'sleep_coefficient', 'wrist_temperatures',
    'hrv_shannon_entropy_rr', 'hrv_shannon_entropy_rrd',
    'ceda_slope_real_micro_siemens', 'rmssd_percentile_0595',
    'sdnn_percentile_0595', 'hrv_percent_good',
    'hrv_rr_80th_percentile_mean', 'hrv_rr_20th_percentile_mean',
    'hrv_rr_median', 'hr_at_rest_mean', 'skin_temperature_slope',
]

In [None]:
# Helper Functions
def visualize_features(array_feature):

  fig = plt.figure(figsize=(20, 7))
  ax1 = plt.subplot2grid((1, 12), (0, 0), colspan=12)
  group = array_feature.numpy()

  ax1 = sns.heatmap(group.T, cmap="Reds", cbar=True, linewidths=0.0,
                    linecolor='black', alpha=0.8, ax=ax1, yticklabels=True)

  for tick in ax1.get_xticklabels():
      tick.set_fontname('Ubuntu')
      tick.set_style('italic')
  ax1.tick_params(axis='x', labelsize=10.5)

  for tick in ax1.get_yticklabels():
      tick.set_fontname('Ubuntu')
  ax1.tick_params(axis='y', labelsize=10.5)

  plt.xticks(rotation=45)  # Rotate labels for better readability
  plt.tight_layout()

  ax1.set_ylabel("Feature", fontname='Ubuntu', fontsize=14)

  ax1.axhline(y=0, color='k',linewidth=1, alpha=1)
  ax1.axhline(y=group.shape[1], color='k', alpha=1,linewidth=1)
  ax1.axvline(x=0, color='k',linewidth=1, alpha=1)
  ax1.axvline(x=group.shape[0], color='k', alpha=1,linewidth=1);

  for i in np.arange(0,group.shape[0],60):
    ax1.axvline(x=i, color='k', alpha=0.4,linewidth=1);
  for i in np.arange(0,group.shape[1],1):
    ax1.axhline(y=i, color='k', alpha=0.4,linewidth=1);

  fig.savefig(f'example_heatmap.pdf', format='pdf', bbox_inches="tight")
  %download_file example_heatmap.pdf
  plt.show()

def parse_dwb_tfrecord(example):
  """Decode one serialized DWB example.

  Args:
    example: Serialized example, as accepted by
      `tf.io.parse_single_example`.

  Returns:
    A 15-tuple: the fourteen int64 scalars listed in `int_keys` (in that
    order), followed by the tensor deserialized from `array_raw` as
    `tf.double`.
  """
  # Order matters: it is the order of the returned tuple.
  int_keys = [
      'phq_intake_score',
      'gad_intake_score',
      'pss_score',
      'sleep_disturbance_score',
      'sleep_impairment_score',
      'extraversion_score',
      'smartphoneaddiction_score',
      'phq_intake_score_binary',
      'gad_intake_score_binary',
      'pss_score_binary',
      'smartphoneaddiction_score_binary',
      'age',
      'gender_group',
      'bmi',
  ]
  spec = {key: tf.io.FixedLenFeature([], tf.int64) for key in int_keys}
  spec['array_raw'] = tf.io.FixedLenFeature([], tf.string)

  parsed = tf.io.parse_single_example(example, spec)
  array_feature = tf.io.parse_tensor(parsed['array_raw'], out_type=tf.double)

  scalars = [parsed[key] for key in int_keys]
  return tuple(scalars) + (array_feature,)



def parse_metabolic_tfrecord(example):
  """Decode one serialized metabolic-health example.

  Args:
    example: Serialized example, as accepted by
      `tf.io.parse_single_example`.

  Returns:
    A 22-tuple: the float32 scalars in `float_keys`, the int64 scalars in
    `int_keys`, the string scalars in `string_keys` (each group in listed
    order), then the tensors deserialized from `array_raw` and `array_mask`,
    both as `tf.double`.
  """
  # The three key lists, concatenated, define the order of the returned tuple.
  float_keys = ['bmi', 'homa_ir', 'apri', 'msss']
  int_keys = [
      'hypertension_binary',
      'hyperlipidemia_binary',
      'cardiovascular_binary',
      'diabetes_binary',
      'anxiety_binary',
      'respiratory',
      'kidney_disease',
      'msss_binary',
      'framingham',
      'age',
      'gender',
  ]
  string_keys = [
      'regular_menstruation_str',
      'smoker_str',
      'diabetes_type_str',
      'alcohol_str',
      'medications_str',
  ]
  serialized_tensor_keys = ['array_raw', 'array_mask']

  spec = {}
  for keys, dtype in (
      (float_keys, tf.float32),
      (int_keys, tf.int64),
      (string_keys + serialized_tensor_keys, tf.string),
  ):
    for key in keys:
      spec[key] = tf.io.FixedLenFeature([], dtype)

  parsed = tf.io.parse_single_example(example, spec)
  array_feature = tf.io.parse_tensor(parsed['array_raw'], out_type=tf.double)
  array_mask = tf.io.parse_tensor(parsed['array_mask'], out_type=tf.double)

  # 'homa_ir_binary' is intentionally not read (it was disabled upstream).
  scalars = [parsed[key] for key in float_keys + int_keys + string_keys]
  return tuple(scalars) + (array_feature, array_mask)


def parse_activity_tfrecord(example):
  """Decode one serialized activity example.

  Args:
    example: Serialized example, as accepted by
      `tf.io.parse_single_example`.

  Returns:
    `(array, mask, label)`: `array_raw` deserialized as `tf.double`,
    `array_mask` deserialized as `tf.bool`, and the int64 `label` scalar.
  """
  spec = {
      'label': tf.io.FixedLenFeature([], tf.int64),
      'array_raw': tf.io.FixedLenFeature([], tf.string),
      'array_mask': tf.io.FixedLenFeature([], tf.string),
  }
  parsed = tf.io.parse_single_example(example, spec)
  return (
      tf.io.parse_tensor(parsed['array_raw'], out_type=tf.double),
      tf.io.parse_tensor(parsed['array_mask'], out_type=tf.bool),
      parsed['label'],
  )

# STEP 1: Read TFRecords

In [None]:

dwb_record_folder = "/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_tfrecords_24h_missingness_80/"
metabolic_record_folder = "/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-metabolichealth/deid/exp/aliheydari/metabolic_tfrecords_daily_alllabels_v07/"
activity_record_folder = "/namespace/fitbit-medical-sandboxes/jg/partner/encrypted/chr-ards-fitbit-prod-research/deid/exp/dmcduff/ttl=52w/lsm_v2/activities_tfrecords_24h_missingness_80"

# Report how many entries each folder holds. `records` and `number_records`
# are overwritten on every pass, so after this cell they describe the
# activity folder (listed last).
for banner, folder in (
    ('DWB Records: ', dwb_record_folder),
    ('Metabolic Records: ', metabolic_record_folder),
    ('Activity Records: ', activity_record_folder),
):
  records = gfile.ListDir(folder)
  number_records = len(records)
  print(banner, number_records)



In [None]:
# Load the activities validation CSV into a DataFrame through gfile.
activities_val_csv = '/namespace/fitbit-medical-sandboxes/jg/partner/encrypted/chr-ards-fitbit-prod-research/deid/exp/dmcduff/ttl=52w/lsm_v2/datasets/raw/activities_all_val.csv'
with gfile.GFile(activities_val_csv, 'r') as f:
  d_val = pd.read_csv(f)

In [None]:
# Activity names; an integer activity label is used elsewhere as an index
# into this list, so the order must not change.
activity_classes = [
    'Walk', 'Bike', 'Sport', 'Run', 'Aerobics',
    'Elliptical', 'Treadmill', 'Spinning', 'Weightlifting', 'Swim',
    'Yoga', 'Circuit Training', 'Hike', 'Tennis', 'CrossFit',
    'Core training', 'Pilates', 'Stairclimber', 'Bootcamp', 'Dancing',
    'Indoor climbing', 'Mountain Bike', 'Golf', 'Kickboxing', 'Martial Arts',
    'Skiing', 'Rollerblading', 'Snowboarding', 'Kayaking', 'Surfing',
    'Paddleboarding',
]

# Preview: first validation row for each activity name that is in the list.
known_activity_rows = d_val[d_val['activity_name'].isin(activity_classes)]
known_activity_rows.groupby('activity_name').first().head(98)

In [None]:
# Build the activity feature matrix: one column per example holding the
# per-feature mean over the unmasked rows, plus the matching integer labels.
plotOn = True
Xd = []
labels = []
if plotOn:
  # List the activity folder explicitly instead of relying on the global
  # `records`, which other cells rebind to the DWB / metabolic folders.
  activity_records = gfile.ListDir(activity_record_folder)
  for r in activity_records:
    dataset = tf.data.TFRecordDataset(os.path.join(activity_record_folder, r))
    dataset = dataset.map(parse_activity_tfrecord)
    for array, mask, label in dataset:
      print(r, label.numpy())
      arr = array.numpy()
      # Entries selected by the mask are excluded from the mean below.
      arr[mask] = np.nan
      feature_means = np.nanmean(arr, axis=0)
      # A feature with no usable rows averages to NaN; store 0 instead.
      # `nan=` must be passed by keyword: the second positional parameter of
      # np.nan_to_num is `copy`, not the replacement value.
      Xd.append(np.nan_to_num(np.expand_dims(feature_means, 1), nan=0.0))
      labels.append(label.numpy().item())

if not Xd:
  raise ValueError(
      'No activity examples were loaded (plotOn is False or the record '
      'folder is empty), so the feature matrix cannot be built.')
# Shape: (n_features, n_examples); downstream cells transpose it.
Xd = np.concatenate(Xd, axis=1)
# To inspect a single example, call visualize_features(array) or plot
# arr[:, FEATURES_TO_INCLUDE.index(<feature name>)] inside the loop above.

In [None]:
# Fit PCA: standardise the features, then project every example onto the
# principal components. Xd is transposed so that rows are examples.
pca = PCA()
pipe = Pipeline(steps=[('scaler', StandardScaler()), ('pca', pca)])
plt.figure(figsize=(8, 6))
Xt = pipe.fit_transform(Xd.T)

In [None]:
# Histogram of the activity labels using 31 bins.
plt.hist(labels, bins=31)

In [None]:
# Scratch check of the modulo used below to cycle through marker shapes.
6 % 5

In [None]:
# Scatter the first two PCA dimensions, one series per activity label.
yd = np.array(labels)

plt.rcParams.update({
    'axes.facecolor': 'white',
    'axes.edgecolor': 'black',
})

fig = plt.figure(figsize=(12, 12))

# The colour changes every five labels; the marker shape cycles within each
# group of five.
colors = ['#32cd32', '#ffa500', '#ff2800', '#45cea2', '#9b59b6', '#999999', '#100c08']
shapes = ['o','^','*','s','p']
for class_id in np.unique(labels):
  members = yd == class_id
  plt.scatter(
      Xt[members, 0],
      Xt[members, 1],
      color=colors[int(class_id/5)],
      label=activity_classes[class_id],
      marker=shapes[class_id%5],
      alpha=0.4,
  )

plt.xlabel('PCA Dim 1')
plt.ylabel('PCA Dim 2')
plt.axis('square')
plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.show()

In [None]:
# Default-configured LDA, fitted and kept available as `clf`.
clf = LDA()
clf.fit(Xd.T, yd)

# LDA used for the projection. Original author's note: shrinkage='auto' can
# help with high dimensions and solver='lsqr' is faster for large datasets.
lda = LDA(
    n_components=None,
    priors=None,
    solver='svd',
    store_covariance=False,
    tol=0.0001,
)
lda.fit(Xd.T, yd)
X_r2 = lda.transform(Xd.T)

fig = plt.figure(figsize=(12, 12))

# The colour changes every five labels; the marker shape cycles within each
# group of five.
colors = ['#32cd32', '#ffa500', '#ff2800', '#45cea2', '#9b59b6', '#999999', '#100c08']
shapes = ['o','^','*','s','p']
for class_id in np.unique(labels):
  members = yd == class_id
  plt.scatter(
      X_r2[members, 0],
      X_r2[members, 1],
      color=colors[int(class_id/5)],
      label=activity_classes[class_id],
      marker=shapes[class_id%5],
      alpha=0.4,
  )

plt.xlabel('LDA Dim 1')
plt.ylabel('LDA Dim 2')
plt.axis('square')
plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.show()

In [None]:
# Default-configured LDA, fitted and kept available as `clf`.
clf = LDA()
clf.fit(Xd.T, yd)

# LDA used for the projection. Original author's note: shrinkage='auto' can
# help with high dimensions and solver='lsqr' is faster for large datasets.
lda = LDA(
    n_components=None,
    priors=None,
    solver='svd',
    store_covariance=False,
    tol=0.0001,
)
lda.fit(Xd.T, yd)
X_r2 = lda.transform(Xd.T)

fig = plt.figure(figsize=(12, 12))

# One marker per activity label, placed at the mean of that label's points
# in the first two LDA dimensions.
colors = ['#32cd32', '#ffa500', '#ff2800', '#45cea2', '#9b59b6', '#999999', '#100c08']
shapes = ['o','^','*','s','p']
for class_id in np.unique(labels):
  members = yd == class_id
  centroid_x = np.mean(X_r2[members, 0])
  centroid_y = np.mean(X_r2[members, 1])
  plt.scatter(
      centroid_x,
      centroid_y,
      color=colors[int(class_id/5)],
      label=activity_classes[class_id],
      marker=shapes[class_id%5],
      alpha=0.4,
  )

plt.xlabel('LDA Dim 1')
plt.ylabel('LDA Dim 2')
# Axis limits and the square aspect are deliberately left at their defaults.
plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.show()

In [None]:
# Plot selected feature traces for every metabolic example.
record_folder = metabolic_record_folder
records = gfile.ListDir(record_folder)
number_records = len(records)

# A feature's column is looked up by its position in FEATURES_TO_INCLUDE
# (assumes the array columns follow that order — TODO confirm). Names that
# are not in the list have no column and are skipped; the original lookup of
# 'ceda_magnitude_real_micro_siemens' raised ValueError for that reason.
requested_features = ['eda_level_real', 'ceda_magnitude_real_micro_siemens']
features_to_plot = []
for feature in requested_features:
  if feature in FEATURES_TO_INCLUDE:
    features_to_plot.append(feature)
  else:
    print('Skipping feature not in FEATURES_TO_INCLUDE:', feature)

for r in records:
  dataset = tf.data.TFRecordDataset(os.path.join(record_folder, r))
  dataset = dataset.map(parse_metabolic_tfrecord)
  # parse_metabolic_tfrecord yields 22 values; all of them must be unpacked
  # (the original omitted array_mask, which made the unpacking fail).
  for (bmi, homa_ir, apri, msss, hypertension_binary, hyperlipidemia_binary,
       cardiovascular_binary, diabetes_binary, anxiety_binary, respiratory,
       kidney_disease, msss_binary, framingham, age, gender,
       regular_menstruation_str, smoker_str, diabetes_type_str, alcohol_str,
       medications_str, array_feature, array_mask) in dataset:
    plt.figure(figsize=(20,4))
    arr = array_feature.numpy()
    for feature in features_to_plot:
      plt.plot(arr[:,FEATURES_TO_INCLUDE.index(feature)])
    plt.show()

In [None]:
# Stats: show which metabolic label values are missing.
# The *_vec lists are filled by the metabolic "Read and process" cell, which
# must have been run before this one.
import seaborn as sns

df = pd.DataFrame({'bmi':bmi_vec, 'hypertension':hypertension_vec, 'homa_ir':homa_ir_vec, 'apri':apri_vec})
# -999 and -1 are treated as missing and replaced with NaN.
for sentinel in (-999, -1):
  df[df == sentinel] = np.nan

plt.figure(figsize=(3, 20))
sns.heatmap(df.isnull(), cbar=False)
plt.show()

# DWB Data:

## Plot Data

In [None]:
# Read and process TFrecords: build the DWB feature matrix (one column of
# per-feature means per example) and collect the questionnaire scores.
record_folder = "/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_tfrecords_24h_missingness_80/"
records = gfile.ListDir(record_folder)
number_records = len(records)

# Every example is left-padded with zero rows up to this length before
# averaging (10080 == 7 * 24 * 60; presumably one week of minute-level rows
# — TODO confirm). np.pad raises if an example is longer than this.
PADDED_ROWS = 10080

Xd = []
phq_intake_score_vec = []
gad_intake_score_vec = []
pss_score_vec = []
sleep_disturbance_score_vec = []
sleep_impairment_score_vec = []
smartphoneaddiction_score_vec = []

phq_intake_score_binary_vec = []
gad_intake_score_binary_vec = []
pss_score_binary_vec = []
smartphoneaddiction_score_binary_vec = []

age_vec = []
gender_group_vec = []
bmi_vec = []

for r in records:
  dataset = tf.data.TFRecordDataset(os.path.join(record_folder, r))
  dataset = dataset.map(parse_dwb_tfrecord)

  for (phq_intake_score, gad_intake_score, pss_score,
       sleep_disturbance_score, sleep_impairment_score, extraversion_score,
       smartphoneaddiction_score, phq_intake_score_binary,
       gad_intake_score_binary, pss_score_binary,
       smartphoneaddiction_score_binary, age, gender_group, bmi,
       array_feature) in dataset:
    arr = array_feature.numpy()
    arr = np.nan_to_num(arr)
    arr = np.pad(arr, ((PADDED_ROWS - arr.shape[0], 0), (0, 0)))
    # NOTE(review): NaNs are zero-filled and zero rows are prepended before
    # averaging, so np.nanmean behaves like a plain mean over PADDED_ROWS
    # rows and missing samples pull the mean toward 0. The activity cell
    # averages over valid samples only — confirm which is intended here.
    # `nan=` must be passed by keyword: the second positional parameter of
    # np.nan_to_num is `copy`, not the replacement value.
    Xd.append(np.nan_to_num(np.expand_dims(np.nanmean(arr, axis=0), 1), nan=0.0))
    phq_intake_score_vec.append(phq_intake_score.numpy().item())
    gad_intake_score_vec.append(gad_intake_score.numpy().item())
    pss_score_vec.append(pss_score.numpy().item())
    sleep_disturbance_score_vec.append(sleep_disturbance_score.numpy().item())
    sleep_impairment_score_vec.append(sleep_impairment_score.numpy().item())
    smartphoneaddiction_score_vec.append(smartphoneaddiction_score.numpy().item())

    phq_intake_score_binary_vec.append(phq_intake_score_binary.numpy().item())
    gad_intake_score_binary_vec.append(gad_intake_score_binary.numpy().item())
    pss_score_binary_vec.append(pss_score_binary.numpy().item())
    smartphoneaddiction_score_binary_vec.append(smartphoneaddiction_score_binary.numpy().item())

    age_vec.append(age.numpy().item())
    gender_group_vec.append(gender_group.numpy().item())
    bmi_vec.append(bmi.numpy().item())

if not Xd:
  raise ValueError('No DWB examples were loaded from ' + record_folder)
# Shape: (n_features, n_examples); downstream cells transpose it.
Xd = np.concatenate(Xd, axis=1)

In [None]:
# Plot
def visualize_features(array_feature):
  """Display one example as a heatmap (nothing is written to disk).

  The array obtained from `array_feature.numpy()` is transposed for plotting,
  so its first axis runs along x and its second axis along y.

  Args:
    array_feature: Object exposing `.numpy()` that yields a 2-D array.
  """
  fig = plt.figure(figsize=(20, 7))
  canvas = plt.subplot2grid((1, 12), (0, 0), colspan=12)
  matrix = array_feature.numpy()

  canvas = sns.heatmap(matrix.T, cmap="Reds", cbar=True, linewidths=0.0,
                       linecolor='black', alpha=0.8, ax=canvas,
                       yticklabels=True)

  for label in canvas.get_xticklabels():
    label.set_fontname('Ubuntu')
    label.set_style('italic')
  canvas.tick_params(axis='x', labelsize=10.5)

  for label in canvas.get_yticklabels():
    label.set_fontname('Ubuntu')
  canvas.tick_params(axis='y', labelsize=10.5)

  plt.xticks(rotation=45)  # Slanted labels are easier to read.
  plt.tight_layout()

  canvas.set_ylabel("Feature", fontname='Ubuntu', fontsize=14)

  # Shared line styles: opaque frame, translucent grid.
  frame_style = dict(color='k', linewidth=1, alpha=1)
  grid_style = dict(color='k', linewidth=1, alpha=0.4)

  canvas.axhline(y=0, **frame_style)
  canvas.axhline(y=matrix.shape[1], **frame_style)
  canvas.axvline(x=0, **frame_style)
  canvas.axvline(x=matrix.shape[0], **frame_style)

  # A vertical line every 60 columns, a horizontal line per row.
  for column in np.arange(0, matrix.shape[0], 60):
    canvas.axvline(x=column, **grid_style)
  for row in np.arange(0, matrix.shape[1], 1):
    canvas.axhline(y=row, **grid_style)

  plt.show()

record_folder = "/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-dwb/deid/exp/dmcduff/dwb_tfrecords_24h_missingness_80/"

# Draw one heatmap per example. `records` is whatever listing is currently
# in scope; it is not refreshed here.
for r in records:
  dataset = tf.data.TFRecordDataset(os.path.join(record_folder, r))
  dataset = dataset.map(parse_dwb_tfrecord)
  for (phq_intake_score, gad_intake_score, pss_score,
       sleep_disturbance_score, sleep_impairment_score, extraversion_score,
       smartphoneaddiction_score, phq_intake_score_binary,
       gad_intake_score_binary, pss_score_binary,
       smartphoneaddiction_score_binary, age, gender_group, bmi,
       array_feature) in dataset:
    visualize_features(array_feature)

## Compute PCA/LDA Embeddings

In [None]:
# Fit PCA: standardise the features, then project every example onto the
# principal components. Xd is transposed so that rows are examples.
pca = PCA()
pipe = Pipeline(steps=[('scaler', StandardScaler()), ('pca', pca)])
plt.figure(figsize=(8, 6))
Xt = pipe.fit_transform(Xd.T)

In [None]:
# One PCA scatter panel per DWB outcome, coloured by low / med / high class.
labels = ['pss_score','phq_intake_score','sleep_disturbance_score','smartphoneaddiction_score']
colors = ['red', 'green', 'blue']

# Continuous scores are cut into three classes: below the first cut -> 0,
# from the first cut up to (not including) the second -> 1, otherwise 2.
# Keys must match the entries of `labels` exactly: the original tested for
# 'phq_score', which never matched 'phq_intake_score', so the PHQ panel
# silently reused the PSS classes.
score_cuts = {
    'pss_score': (pss_score_vec, (14, 22)),
    'phq_intake_score': (phq_intake_score_vec, (10, 20)),
    'sleep_disturbance_score': (sleep_disturbance_score_vec, (15, 25)),
}

plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams['axes.edgecolor'] = 'black'

fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(20, 5))
for idx, label in enumerate(labels):

  if label in score_cuts:
    scores, cuts = score_cuts[label]
    # np.digitize assigns all classes in one pass, so already-mapped values
    # can never be re-binned by a later threshold.
    yd = np.digitize(np.array(scores), cuts)
  elif label == 'smartphoneaddiction_score':
    # Already categorical: the binary label is used as-is.
    yd = np.array(smartphoneaddiction_score_binary_vec)

  # Columns 0 and 1 are the first two principal components, matching the
  # axis labels and the other PCA/LDA plots (the original plotted columns
  # 1 and 2 under the same labels).
  for i in range(3):
    axes[idx].scatter(Xt[yd == i, 0], Xt[yd == i, 1], c=colors[i], label=str(i), alpha=0.4)

  axes[idx].legend(['low','med','high'])
  axes[idx].set_title(label)
  axes[idx].set_xlabel('PCA Dim 1')
  axes[idx].set_ylabel('PCA Dim 2')
  axes[idx].axis('square')
plt.show()

In [None]:
# Fit LDA on the DWB feature matrix against a three-class PHQ label.

label = 'phq_score'

# Categorize continuous labels:
if label == 'phq_score':
    yd = np.array(phq_intake_score_vec)
    plt.hist(yd)
    plt.show()
    # Classes: < 10 -> 0, 10..13 -> 1, >= 14 -> 2. The original used a
    # strict `> 10`, which left a score of exactly 10 unmapped and created
    # a spurious fourth class.
    # NOTE(review): the PCA panel cell cuts PHQ at (10, 20) — confirm which
    # thresholds are intended.
    yd = np.digitize(yd, (10, 14))
    plt.hist(yd)
    plt.show()

# Sanity check: Xd is (n_features, n_examples); yd has one entry per example.
print(Xd.shape)
print(len(yd))

# Default-configured LDA, fitted and kept available as `clf`.
clf = LDA()
clf.fit(Xd.T, yd)
# Alternative noted by the original author: shrinkage='auto' with
# solver='lsqr' can help with high dimensions and is faster for large
# datasets.
lda = LDA(n_components=None, priors=None, shrinkage=None, solver='svd',store_covariance=False, tol=0.0001)
X_r2 = lda.fit(Xd.T, yd).transform(Xd.T)

In [None]:
# Scatter the first two LDA dimensions, one series per class (0, 1, 2).
plt.figure(figsize=(8, 8))
for class_id in range(3):
    in_class = yd == class_id
    plt.scatter(X_r2[in_class, 0], X_r2[in_class, 1],
                label=str(class_id), alpha=0.1)
plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.xlabel('LDA Dim 1')
plt.ylabel('LDA Dim 2')

# Metabolic Data:

## Plot Data

In [None]:
# Decode the smoker / regular-menstruation / medications answers of every
# metabolic example into numeric vectors.
record_folder = metabolic_record_folder
records = gfile.ListDir(record_folder)
number_records = len(records)


def _parse_yes_no(raw_tensor, separators):
  """Map a serialized answer to 1 ("Yes"), 0 ("No") or NaN (anything else).

  The decoded string is split on each separator in turn until one yields at
  least two parts; the second part is taken as the answer. NaN is returned
  when no separator matches (the original indexed `tmp[1]` regardless and
  raised IndexError in that case).
  """
  text = raw_tensor.numpy().decode(encoding="utf-8")
  for separator in separators:
    parts = text.split(separator)
    if len(parts) >= 2:
      answer = parts[1]
      break
  else:
    return np.nan
  if answer == "Yes":
    return 1
  if answer == "No":
    return 0
  return np.nan


def _parse_medications(raw_tensor):
  """Return 0 if the decoded string contains "\\x1d", 1 if not, NaN if undecodable."""
  try:
    text = raw_tensor.numpy().decode(encoding="utf-8")
  except UnicodeDecodeError:
    return np.nan
  # NOTE(review): presumably "\x1d" marks a populated medication list, which
  # would make 0 mean "takes medication" — confirm the intended polarity.
  return 0 if "\x1d" in text else 1


smoker_vec = []
regular_menstruation_vec = []
medications_vec = []
for r in records:
  dataset = tf.data.TFRecordDataset(os.path.join(record_folder, r))
  dataset = dataset.map(parse_metabolic_tfrecord)
  # parse_metabolic_tfrecord yields 22 values; all of them must be unpacked
  # (the original omitted array_mask, which made the unpacking fail).
  for (bmi, homa_ir, apri, msss, hypertension_binary, hyperlipidemia_binary,
       cardiovascular_binary, diabetes_binary, anxiety_binary, respiratory,
       kidney_disease, msss_binary, framingham, age, gender,
       regular_menstruation_str, smoker_str, diabetes_type_str, alcohol_str,
       medications_str, array_feature, array_mask) in dataset:
    smoker_vec.append(_parse_yes_no(smoker_str, ("\x02", "\x10", "\x03")))
    regular_menstruation_vec.append(
        _parse_yes_no(regular_menstruation_str, ("\x02", "\x10", "\x03", "\x14")))
    # The original appended its failure value to regular_menstruation_vec,
    # which left the three vectors with different lengths.
    medications_vec.append(_parse_medications(medications_str))

    #print(str(alcohol_str.numpy().decode(encoding="utf-8")).split("\x02"))


In [None]:
# Read and process TFrecords: build the metabolic feature matrix (per-feature
# mean followed by per-feature standard deviation, one row per example) and
# collect the labels.
record_folder = metabolic_record_folder
records = gfile.ListDir(record_folder)
number_records = len(records)


def _parse_yes_no(raw_tensor, separators):
  """Map a serialized answer to 1 ("Yes"), 0 ("No") or NaN (anything else).

  The decoded string is split on each separator in turn until one yields at
  least two parts; the second part is taken as the answer. NaN is returned
  when no separator matches (the original indexed `tmp[1]` regardless and
  raised IndexError in that case).
  """
  text = raw_tensor.numpy().decode(encoding="utf-8")
  for separator in separators:
    parts = text.split(separator)
    if len(parts) >= 2:
      answer = parts[1]
      break
  else:
    return np.nan
  if answer == "Yes":
    return 1
  if answer == "No":
    return 0
  return np.nan


def _parse_medications(raw_tensor):
  """Return 0 if the decoded string contains "\\x1d", 1 if not, NaN if undecodable."""
  try:
    text = raw_tensor.numpy().decode(encoding="utf-8")
  except UnicodeDecodeError:
    return np.nan
  # NOTE(review): presumably "\x1d" marks a populated medication list, which
  # would make 0 mean "takes medication" — confirm the intended polarity.
  return 0 if "\x1d" in text else 1


Xd = []
bmi_vec = []
homa_ir_vec = []
apri_vec = []
hypertension_vec = []
hyperlipidemia_vec = []
cardiovascular_vec = []
diabetes_vec = []
anxiety_vec = []
respiratory_vec = []
kidney_disease_vec = []
msss_binary_vec = []
framingham_vec = []
age_vec = []
gender_vec = []
regular_menstruation_vec = []
smoker_vec = []
# NOTE: diabetes_type_vec and alcohol_vec are initialised but never filled.
diabetes_type_vec = []
alcohol_vec = []
medications_vec = []
for r in records:
  dataset = tf.data.TFRecordDataset(os.path.join(record_folder, r))
  dataset = dataset.map(parse_metabolic_tfrecord)
  # parse_metabolic_tfrecord yields 22 values; all of them must be unpacked
  # (the original omitted array_mask, which made the unpacking fail).
  for (bmi, homa_ir, apri, msss, hypertension_binary, hyperlipidemia_binary,
       cardiovascular_binary, diabetes_binary, anxiety_binary, respiratory,
       kidney_disease, msss_binary, framingham, age, gender,
       regular_menstruation_str, smoker_str, diabetes_type_str, alcohol_str,
       medications_str, array_feature, array_mask) in dataset:
    arr = array_feature.numpy()
    arr = np.nan_to_num(arr)
    # Left-pad with zero rows to 10080 rows (== 7 * 24 * 60).
    arr = np.pad(arr,((10080-arr.shape[0],0),(0,0)))
    # Both statistics are taken over the row axis of the padded array. The
    # original overwrote `arr` with the mean first, so its np.std call
    # produced one scalar (the spread of the means) instead of a
    # per-feature standard deviation.
    feature_mean = np.mean(arr, axis=0)
    feature_std = np.std(arr, axis=0)
    Xd.append(np.expand_dims(np.concatenate([feature_mean, feature_std]), axis=0))
    bmi_vec.append(bmi.numpy().item())
    homa_ir_vec.append(homa_ir.numpy().item())
    apri_vec.append(apri.numpy().item())
    hypertension_vec.append(hypertension_binary.numpy().item())
    hyperlipidemia_vec.append(hyperlipidemia_binary.numpy().item())
    cardiovascular_vec.append(cardiovascular_binary.numpy().item())
    diabetes_vec.append(diabetes_binary.numpy().item())
    anxiety_vec.append(anxiety_binary.numpy().item())
    respiratory_vec.append(respiratory.numpy().item())
    kidney_disease_vec.append(kidney_disease.numpy().item())
    msss_binary_vec.append(msss_binary.numpy().item())
    framingham_vec.append(framingham.numpy().item())
    age_vec.append(age.numpy().item())
    gender_vec.append(gender.numpy().item())

    smoker_vec.append(_parse_yes_no(smoker_str, ("\x02", "\x10", "\x03")))
    regular_menstruation_vec.append(
        _parse_yes_no(regular_menstruation_str, ("\x02", "\x10", "\x03", "\x14")))
    medications_vec.append(_parse_medications(medications_str))

if not Xd:
  raise ValueError('No metabolic examples were loaded from ' + record_folder)
# Shape: (n_examples, 2 * n_features).
Xd = np.concatenate(Xd, axis=0)

In [None]:
# Plot
def visualize_features_nan(array_feature):
  """Display one example as a heatmap with exact zeros rendered as missing.

  Entries equal to 0 are replaced with NaN before plotting. The y tick
  labels are taken from FEATURES_TO_INCLUDE.

  Args:
    array_feature: Object exposing `.numpy()` that yields a 2-D float array.
  """
  fig = plt.figure(figsize=(20, 7))
  heat_ax = plt.subplot2grid((1, 12), (0, 0), colspan=12)
  values = array_feature.numpy()
  values[values == 0] = np.nan

  heat_ax = sns.heatmap(
      values.T,
      cmap="Reds",
      cbar=True,
      linewidths=0.0,
      linecolor='black',
      alpha=0.8,
      ax=heat_ax,
      yticklabels=True,
  )

  for x_label in heat_ax.get_xticklabels():
    x_label.set_fontname('Ubuntu')
    x_label.set_style('italic')
  heat_ax.tick_params(axis='x', labelsize=10.5)

  heat_ax.set_yticklabels(FEATURES_TO_INCLUDE)
  for y_label in heat_ax.get_yticklabels():
    y_label.set_fontname('Ubuntu')
  heat_ax.tick_params(axis='y', labelsize=10.5)

  plt.xticks(rotation=45)  # Slanted labels are easier to read.
  plt.tight_layout()

  heat_ax.set_ylabel("Feature", fontname='Ubuntu', fontsize=14)

  # Solid frame around the heatmap.
  n_steps, n_features = values.shape[0], values.shape[1]
  heat_ax.axhline(y=0, color='k', linewidth=1, alpha=1)
  heat_ax.axhline(y=n_features, color='k', alpha=1, linewidth=1)
  heat_ax.axvline(x=0, color='k', linewidth=1, alpha=1)
  heat_ax.axvline(x=n_steps, color='k', alpha=1, linewidth=1)

  # Faint grid: a vertical line every 60 columns, a horizontal line per row.
  for x_pos in np.arange(0, n_steps, 60):
    heat_ax.axvline(x=x_pos, color='k', alpha=0.4, linewidth=1)
  for y_pos in np.arange(0, n_features, 1):
    heat_ax.axhline(y=y_pos, color='k', alpha=0.4, linewidth=1)

  plt.show()

# Plot
def visualize_features(array_feature):

  fig = plt.figure(figsize=(20, 7))
  ax1 = plt.subplot2grid((1, 12), (0, 0), colspan=12)
  group = array_feature.numpy()

  ax1 = sns.heatmap(group.T, cmap="Reds", cbar=True, linewidths=0.0,
                    linecolor='black', alpha=0.8, ax=ax1, yticklabels=True)

  for tick in ax1.get_xticklabels():
      tick.set_fontname('Ubuntu')
      tick.set_style('italic')
  ax1.tick_params(axis='x', labelsize=10.5)

  ax1.set_yticklabels(FEATURES_TO_INCLUDE)
  for tick in ax1.get_yticklabels():
      tick.set_fontname('Ubuntu')
  ax1.tick_params(axis='y', labelsize=10.5)

  plt.xticks(rotation=45)  # Rotate labels for better readability
  plt.tight_layout()

  ax1.set_ylabel("Feature", fontname='Ubuntu', fontsize=14)

  ax1.axhline(y=0, color='k',linewidth=1, alpha=1)
  ax1.axhline(y=group.shape[1], color='k', alpha=1,linewidth=1)
  ax1.axvline(x=0, color='k',linewidth=1, alpha=1)
  ax1.axvline(x=group.shape[0], color='k', alpha=1,linewidth=1);

  for i in np.arange(0,group.shape[0],60):
    ax1.axvline(x=i, color='k', alpha=0.4,linewidth=1);
  for i in np.arange(0,group.shape[1],1):
    ax1.axhline(y=i, color='k', alpha=0.4,linewidth=1);

  plt.show()

# Walk every TFRecord file in the weekly metabolic dump and draw both heatmap
# variants (zeros blanked / raw values) for each parsed example.
record_folder = "/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-metabolichealth/deid/exp/aliheydari/metabolic_tfrecords_weekly_alllabels_v02"
records = gfile.ListDir(record_folder)
for r in records:
  record_path = os.path.join(record_folder, r)
  dataset = tf.data.TFRecordDataset(record_path).map(parse_metabolic_tfrecord)
  # Each parsed example is an 18-tuple; only the trailing feature array is
  # used here, the label fields are unpacked but ignored.
  for (bmi, homa_ir, apri, msss,
       hypertension_binary, hyperlipidemia_binary, cardiovascular_binary,
       diabetes_binary, anxiety_binary, respiratory, kidney_disease,
       msss_binary, regular_menstruation_str, smoker_str, diabetes_type_str,
       alcohol_str, medications_str, array_feature) in dataset:
    visualize_features_nan(array_feature)
    visualize_features(array_feature)

## Compute PCA/LDA Embeddings

In [None]:
# Display the shape of the feature matrix Xd.
# NOTE(review): this cell was headed "Collapse Feature to Average", but no
# averaging happens here — confirm whether a step is missing.
Xd.shape

In [None]:
# Fit PCA:
# Standardize every feature column, then project onto all principal
# components. `Xt` holds the projected samples used by the scatter plots.
# (The empty `plt.figure` that used to be created here was never drawn on.)
pca = PCA()
pipe = Pipeline([('scaler', StandardScaler()), ('pca', pca)])
Xt = pipe.fit_transform(Xd)

In [None]:
# Display the shape of the feature matrix Xd again.
Xd.shape

In [None]:
# Scatter the first two PCA dimensions, one panel per label, coloured by class.
labels = ['bmi','hypertension','homa_ir']
colors = ['green', 'blue', 'orange', 'red']

plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams['axes.edgecolor'] = 'black'

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))
for idx, label in enumerate(labels):

  # Categorize continuous labels. Bins are computed from the raw values in one
  # pass (np.select) rather than by overwriting the array step by step, so a
  # value that was already binned can never be re-binned. Samples that match
  # no bin get a code that is not plotted.
  if label == 'bmi':
    bmi_raw = np.asarray(bmi_vec, dtype=float)
    yd = np.select(
        [(bmi_raw > 0) & (bmi_raw < 20),
         (bmi_raw >= 20) & (bmi_raw <= 25),  # BMI == 20 used to fall in no bin
         (bmi_raw > 25) & (bmi_raw <= 30),
         (bmi_raw > 30) & (bmi_raw <= 50)],
        [0, 1, 2, 3],
        default=-1)
    class_names = ['Under Weight', 'Normal Weight', 'Over Weight', 'Obese']
  elif label == 'hypertension':
    yd = np.array(hypertension_vec)
    class_names = ['No Hypertension', 'Hypertension']
  elif label == 'homa_ir':
    homa_raw = np.asarray(homa_ir_vec, dtype=float)
    yd = np.select(
        [homa_raw < 0,
         (homa_raw >= 0) & (homa_raw <= 2.5),
         homa_raw > 2.5],
        [-1, 0, 1],
        default=np.nan)
    class_names = ['Low HOMA-IR', 'High HOMA-IR']

  for i, class_name in enumerate(class_names):
    axes[idx].scatter(Xt[yd == i, 0], Xt[yd == i, 1], c=colors[i],
                      label=class_name, alpha=0.1)
  axes[idx].legend()

  axes[idx].set_xlabel('PCA Dim 1')
  axes[idx].set_ylabel('PCA Dim 2')
  axes[idx].axis('square')
plt.show()

## Baseline Classification

In [None]:
def warn(*args, **kwargs):
    """No-op stand-in for `warnings.warn`; accepts and ignores any arguments."""
    pass
import warnings
# Replace `warnings.warn` for the whole process so that every warning raised
# through it after this point is silently dropped.
warnings.warn = warn

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
import matplotlib.gridspec as gridspec
from sklearn.metrics import f1_score


# Z-score age and BMI across all samples.
age_vec_norm = stats.zscore(age_vec)
bmi_vec_norm = stats.zscore(bmi_vec)

# One row per sample: flattened sensor features followed by
# [normalized age, gender, normalized bmi].
agg_features = [
    np.append(sensor_features, np.array([age, gender, bmi]))
    for sensor_features, age, gender, bmi
    in zip(Xd, age_vec_norm, gender_vec, bmi_vec_norm)
]

# Demographics-only baseline: [age, gender, bmi] per sample.
# NOTE(review): these use the raw age/bmi values, whereas `agg_features` above
# uses the z-scored ones — confirm that the difference is intended.
demo_features = [
    np.array([age, gender, bmi])
    for age, gender, bmi in zip(age_vec, gender_vec, bmi_vec)
]



# Classification tasks, kept as (display name, per-sample label vector) pairs
# so that the two parallel lists below cannot drift out of step.
# The cardiovascular task is intentionally left out.
task_table = [
    ('Smoking', smoker_vec),
    ('Menstration', regular_menstruation_vec),
    ('Medications', medications_vec),
    ('Homa-IR', homa_ir_vec),
    ('Hypertension', hypertension_vec),
    ('Hyperlipidemia', hyperlipidemia_vec),
    ('Diabetes', diabetes_vec),
    ('Anxiety', anxiety_vec),
    ('Respiratory Disease', respiratory_vec),
]
label_names = [task_name for task_name, _ in task_table]
labels = [task_vec for _, task_vec in task_table]

# Few-shot training-set sizes to sweep over.
number_samples = [10, 20, 50, 100, 200, 500, 1000, 3000]

# Classifier names and the line colour used for each when plotting.
methods = ['LR', 'XGBOOST', 'MLP']
colors = ['red', 'blue', 'green', 'orange']

# When True, label histograms and a confusion matrix are drawn for every fit.
plotOn = False

def _make_fewshot_model(method):
  """Returns a fresh, unfitted classifier for the given method name.

  Args:
    method: one of 'LR', 'XGBOOST' or 'MLP'.

  Raises:
    ValueError: if `method` is not a known name.
  """
  if method == 'LR':
    return LogisticRegression(solver='liblinear')
  if method == 'XGBOOST':
    return XGBClassifier(objective='binary:logistic',
                         n_estimators=50,
                         learning_rate=0.1,
                         max_depth=5,
                         random_state=42,
                         eval_metric='error', use_label_encoder=False)
  if method == 'MLP':
    return MLPClassifier(hidden_layer_sizes=(50, 25),  # Two hidden layers with 50 and 25 neurons respectively
                         activation='relu',            # ReLU activation function
                         solver='adam',                # Adam optimizer
                         alpha=0.001,                  # L2 regularization term
                         batch_size=32,                # Batch size for mini-batch learning
                         learning_rate='adaptive',     # Adaptive learning rate
                         max_iter=300,                 # Maximum number of iterations
                         random_state=42)
  raise ValueError(f'Unknown method: {method!r}')


# Labels for which only explicit 0/1 values are kept.
# The previous test `label_name == 'Smoking' or 'Menstration' or 'Medications'`
# was always truthy, so the 0/1 filter was silently applied to every label.
_ZERO_ONE_ONLY_LABELS = ('Smoking', 'Menstration', 'Medications')

# Few-shot sweep: for every task, feature set, classifier and training-set
# size, fit on a stratified random subset of the first 80% of the samples and
# score on the last 20%. One figure with the mean macro-F1 curves is drawn per
# task.
for label, label_name in zip(labels, label_names):

  scores = []
  feature_types = ['Demo','Sensor+Demo']
  for feature_type in feature_types:
    print(feature_type)

    label_mask = np.array(label)
    if feature_type == 'Demo':
      agg_features_mask = np.array(demo_features)
    else:
      agg_features_mask = np.array(agg_features)

    # Drop samples whose label is the -999 sentinel.
    mask = label_mask!=-999
    label_mask = label_mask[mask]
    agg_features_mask = agg_features_mask[mask,:]

    # Binarize continuous labels.
    if label_name == 'Homa-IR':
      label_mask = (label_mask>2.8).astype(int)
    if label_name == 'APRI':
      label_mask = (label_mask>0.5).astype(int)
    if label_name in _ZERO_ONE_ONLY_LABELS:
      is_zero_one = (label_mask == 0) | (label_mask == 1)
      agg_features_mask = agg_features_mask[is_zero_one,:]
      label_mask = label_mask[is_zero_one]

    # Fixed (unshuffled) 80/20 split. It does not depend on the method, the
    # sample size or the repetition, so compute it once per feature type.
    split_at = int(len(label_mask)*0.8)
    train_agg_features_mask = agg_features_mask[:split_at]
    train_label_mask = label_mask[:split_at]
    X_test = agg_features_mask[split_at:]
    y_test = label_mask[split_at:]

    for method in methods:
      for fwsht in number_samples:
        # train_test_split needs at least one sample left on the other side of
        # the split; skip sizes the training pool cannot supply.
        if fwsht >= len(train_label_mask):
          print(f'Skipping {fwsht} samples for {label_name}/{feature_type}: '
                f'only {len(train_label_mask)} training samples available.')
          continue

        reps=5
        for rep in range(reps):
          # The "test" side of this split is the stratified few-shot subset.
          _, X_train_fewshot, _, y_train_fewshot = train_test_split(train_agg_features_mask, train_label_mask, test_size=fwsht, random_state=rep, stratify=train_label_mask)

          # Train the model
          model = _make_fewshot_model(method)
          model.fit(X_train_fewshot, y_train_fewshot)

          # Make predictions on the test set
          y_pred = model.predict(X_test)

          # Evaluate the model
          accuracy = accuracy_score(y_test, y_pred)

          if plotOn:
            plt.subplot(1, 2, 1)
            plt.hist(y_train_fewshot)
            plt.title('Train Label Dist.')
            plt.subplot(1, 2, 2)
            plt.hist(y_test)
            plt.title('Test Label Dist.')

            cm = confusion_matrix(y_test, y_pred)
            disp = ConfusionMatrixDisplay(confusion_matrix=cm)
            disp.plot()
            plt.show()

          row = pd.DataFrame({'method': [method], 'feature_type': feature_type, 'fwsht': [fwsht], 'rep': [rep], 'score': [accuracy], 'f1': [f1_score(y_test, y_pred, average='macro')]})
          scores.append(row)

  if not scores:
    print(f'No results for {label_name}; nothing to plot.')
    continue

  scores = pd.concat(scores)
  print('=======================')
  fig = plt.figure(figsize=(22, 4))
  gs = gridspec.GridSpec(
      1,
      4,
      hspace=0.25,
      wspace=0.25,
      figure=fig,
  )
  ax0 = plt.subplot(gs[0, 1])
  for method, col in zip(methods,colors):
    for feature_type in feature_types:
      tmp = scores[(scores['method']==method) & (scores['feature_type']==feature_type)].groupby(['fwsht']).mean(numeric_only=True)
      # Dashed = demographics only, solid = sensor + demographics.
      line_style = '--' if feature_type=='Demo' else '-'
      ax0.plot(tmp.index, tmp.f1, line_style, label=f'{method} ({feature_type})', color=col)

  ax0.xaxis.set_tick_params(which='major', size=10, width=2, direction='in')
  ax0.xaxis.set_tick_params(which='minor', size=7, width=2, direction='in')
  ax0.yaxis.set_tick_params(which='major', size=10, width=2, direction='in')
  ax0.yaxis.set_tick_params(which='minor', size=7, width=2, direction='in')
  ax0.spines['left'].set_linewidth(3)
  ax0.spines['bottom'].set_linewidth(3)
  ax0.spines['left'].set_edgecolor("#000000")
  ax0.spines['bottom'].set_edgecolor("#000000")
  ax0.tick_params(bottom=True, left=True, width=2, direction='inout')
  ax0.spines['right'].set_visible(False)
  ax0.spines['top'].set_visible(False)
  ax0.grid(False)
  ax0.set_facecolor('xkcd:white')
  # The curves are the mean macro-F1 (a 0-1 fraction), not accuracy in percent.
  ax0.set_ylabel('Macro F1')
  ax0.set_xlabel('Number of Training Samples')
  ax0.set_ylim([0, 1])
  ax0.set_title(label_name)
  plt.legend()
  plt.show()
  print('=======================')

# Activity Data:

In [None]:
# Load the activity TFRecords: one mean feature vector, one label and one id
# string per example.
# NOTE(review): `records` and `activity_record_folder` are defined outside this
# cell — confirm that `records` was listed from `activity_record_folder`.
plotOn = True
Xd = []
labels = []
ids = []
if plotOn:
  for r in records:
    dataset = tf.data.TFRecordDataset(os.path.join(activity_record_folder, r))
    dataset = dataset.map(parse_activity_tfrecord)
    for array, mask, label in dataset:
      #print(r, label.numpy())
      arr = array.numpy()
      # Blank out the masked entries so they are ignored by nanmean.
      arr[mask] = np.nan
      mean_features = np.nanmean(arr, axis=0)
      # Columns that were entirely masked average to NaN; store them as 0.
      # (The fill value must be passed as `nan=`: the second positional
      # argument of np.nan_to_num is `copy`.)
      Xd.append(np.nan_to_num(np.expand_dims(mean_features, 1), nan=0.0))
      labels.append(label.numpy().item())
      # Id = file name without its first 9 and last 10 characters.
      ids.append(r[9:-10])

if Xd:
  # Stack the per-example column vectors side by side: one column per example.
  Xd = np.concatenate(Xd, axis=1)
else:
  print('No activity examples were loaded; Xd is left empty.')

In [None]:
# Histogram of the values currently held in `labels`.
plt.hist(labels)

In [None]:
def warn(*args, **kwargs):
    """No-op stand-in for `warnings.warn`; accepts and ignores any arguments."""
    pass
import warnings
# Replace `warnings.warn` for the whole process so that every warning raised
# through it after this point is silently dropped.
warnings.warn = warn

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
import matplotlib.gridspec as gridspec
from sklearn.metrics import f1_score, balanced_accuracy_score, average_precision_score

label_names = ['homa_ir_binary', 'hypertension', 'hyperlipidemia', 'anxiety', 'diabetes', 'respiratory']
feature_types = ['Sensor','Demo','Sensor+Demo']
methods = ['LR', 'XGBOOST']#,'MLP']
colors = ['red', 'blue', 'green', 'orange']

# Rows with an id below this value are used for training, the rest for testing.
_TEST_ID_START = 70467
_DEMO_COLUMNS = ['age','gender','bmi']


def _build_feature_matrix(frame, feature_type):
  """Builds the model input matrix for one split.

  Args:
    frame: DataFrame with a 'features' column (one array per row, stacked
      along axis 0) and the 'age', 'gender' and 'bmi' columns.
    feature_type: 'Sensor', 'Demo' or 'Sensor+Demo'.

  Returns:
    2-D NumPy array with one row per sample.

  Raises:
    ValueError: if `feature_type` is not a known name.
  """
  if feature_type == 'Demo':
    return frame[_DEMO_COLUMNS].to_numpy()
  sensor = np.concatenate(frame['features'].to_list(), axis=0)
  if feature_type == 'Sensor':
    return sensor
  if feature_type == 'Sensor+Demo':
    return np.concatenate((sensor, frame[_DEMO_COLUMNS].to_numpy()), axis=1)
  raise ValueError(f'Unknown feature type: {feature_type!r}')


scores = []

for label_name in label_names:
  for feature_type in feature_types:
    print(feature_type)

    # Keep only rows with a non-negative label, then split by id.
    data_complete = data[data[label_name]>=0]
    train = data_complete[data_complete['id']<_TEST_ID_START]
    test = data_complete[data_complete['id']>=_TEST_ID_START]

    train_features = _build_feature_matrix(train, feature_type)
    test_features = _build_feature_matrix(test, feature_type)

    train_labels = train[label_name].to_numpy()
    test_labels = test[label_name].to_numpy()
    test_ids = test['id'].to_numpy()

    for method in methods:
      if method == 'LR':
        model = LogisticRegression(solver='liblinear')
      elif method =='XGBOOST':
        model = XGBClassifier(objective='binary:logistic',
                              n_estimators=50,
                              learning_rate=0.1,
                              max_depth=5,
                              random_state=42,
                              eval_metric='error', use_label_encoder=False)
      elif method =='MLP':
        model = MLPClassifier(hidden_layer_sizes=(50, 25),  # Two hidden layers with 50 and 25 neurons respectively
                              activation='relu',            # ReLU activation function
                              solver='adam',                # Adam optimizer
                              alpha=0.001,                  # L2 regularization term
                              batch_size=32,                # Batch size for mini-batch learning
                              learning_rate='adaptive',     # Adaptive learning rate
                              max_iter=300,                 # Maximum number of iterations
                              random_state=42)
      else:
        # Previously an unknown name silently reused the last model.
        raise ValueError(f'Unknown method: {method!r}')

      # Train the model
      model.fit(train_features, train_labels)

      # Make predictions on the test set. Average precision is a ranking
      # metric, so it is computed from the positive-class probability rather
      # than from the thresholded 0/1 predictions.
      y_pred = model.predict(test_features)
      y_score = model.predict_proba(test_features)[:, 1]

      # Aggregate per id: mean label / prediction rounded back to a class,
      # mean probability kept as the per-id score.
      p = pd.DataFrame({'id': test_ids, 'label': test_labels, 'prediction': y_pred, 'score': y_score})
      p = p.groupby('id').mean()
      test_labels_p = p['label'].round()
      y_pred_p = p['prediction'].round()
      y_score_p = p['score']

      # Evaluate the model
      plt.figure(figsize=(10, 4))
      plt.subplot(1, 2, 1)
      plt.hist(train_labels)
      plt.title('Train Label Dist.')
      plt.subplot(1, 2, 2)
      plt.hist(test_labels)
      plt.title('Test Label Dist.')
      cm = confusion_matrix(test_labels, y_pred)
      disp = ConfusionMatrixDisplay(confusion_matrix=cm)
      disp.plot()
      plt.show()

      row = pd.DataFrame({'label': [label_name],
                          'feature_type': [feature_type],
                          'method': [method],
                          'accuracy': [accuracy_score(test_labels, y_pred)],
                          'f1': [f1_score(test_labels, y_pred, average='macro')],
                          'balanced_accuracy': [balanced_accuracy_score(test_labels, y_pred)],
                          'mAP': [average_precision_score(test_labels, y_score, average='macro')],
                          'accuracy_p': [accuracy_score(test_labels_p, y_pred_p)],
                          'f1_p': [f1_score(test_labels_p, y_pred_p, average='macro')],
                          'balanced_accuracy_p': [balanced_accuracy_score(test_labels_p, y_pred_p)],
                          'mAP_p': [average_precision_score(test_labels_p, y_score_p, average='macro')]})
      scores.append(row)
    print('=======================')

# One row per (label, feature type, method).
scores = pd.concat(scores)

# Inference from LSM - Get embeddings from LSM-2

In [None]:
import pickle

# Dump embedding data dir
train_data_dir = '/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-metabolichealth/deid/exp/aliheydari/metabolic_embedding_dump/metabolic_tfrecords_daily_alllabels_v04_embedding_dump_xid149985992_wid1_20250218153123'
valid_data_dir = '/namespace/fitbit-medical-sandboxes/partner/encrypted/chr-ards-metabolichealth/deid/exp/aliheydari/metabolic_embedding_dump/metabolic_tfrecords_daily_alllabels_v04_embedding_dump_xid149985992_wid1_20250218161703'

# Glob files
ext_pattern = '*.pickle'
train_fpattern = os.path.join(train_data_dir, ext_pattern)
valid_fpattern = os.path.join(valid_data_dir, ext_pattern)
train_flist = gfile.Glob(train_fpattern)
valid_flist = gfile.Glob(valid_fpattern)

if not train_flist:
  raise FileNotFoundError(f'No files match {train_fpattern}')

# Example file read
# Pickle is a binary format, so the file is opened in 'rb' mode.
# NOTE(review): pickle.load can execute arbitrary code; only read dumps that
# were produced by a trusted job.
# NOTE(review): this rebinds the notebook-level name `data`.
path = train_flist[0]
with gfile.GFile(path, mode='rb') as f:
  data = pickle.load(f)

print(f'Train num dumped files: {len(train_flist)}')
print(f'Valid num dumped files: {len(valid_flist)}\n')
print(f'Data Keys\n{data.keys()}\n')
print('Embedding Shape:', data['embedding_pre_logits'].shape)

In [None]:
# Read every validation dump and collect the flattened embeddings together
# with the matching label / mask vectors.
embeddings = []
bmi_vec = []
hypertension_vec = []
homa_ir_vec = []
batch_mask_vec = []
for path in valid_flist:
  # Pickle is a binary format, so the file is opened in 'rb' mode.
  with gfile.GFile(path, mode='rb') as f:
    try:
      data = pickle.load(f)
      emb = data['embedding_pre_logits']
      # Merge the two leading axes into one sample axis and the next two axes
      # into one feature axis.
      flattened_arr = emb.reshape(emb.shape[0] * emb.shape[1],
                                  emb.shape[2] * emb.shape[3])
      bmi_flat = data['bmi'].flatten()
      hypertension_flat = data['hypertension_binary'].flatten()
      homa_ir_flat = data['homa_ir'].flatten()
      batch_mask_flat = data['batch_mask'].flatten()
    except Exception as e:  # Best-effort load: report the bad file and move on.
      print(f'Skipping {path}: {type(e).__name__}: {e}')
      continue

  # Append only after every field was read successfully, so the five lists
  # always stay aligned even when a file is missing a key.
  embeddings.append(flattened_arr)
  bmi_vec.append(bmi_flat)
  hypertension_vec.append(hypertension_flat)
  homa_ir_vec.append(homa_ir_flat)
  batch_mask_vec.append(batch_mask_flat)

embeddings = np.concatenate( embeddings, axis=0 )
bmi_vec = np.concatenate( bmi_vec, axis=0 )
hypertension_vec = np.concatenate( hypertension_vec, axis=0 )
homa_ir_vec = np.concatenate( homa_ir_vec, axis=0 )
batch_mask_vec = np.concatenate( batch_mask_vec, axis=0 )
print(embeddings.shape)

In [None]:
# Fit a standardize-then-PCA pipeline on the first `n` embedding rows; `Xt`
# holds their projection. (The empty `plt.figure` that used to be created here
# was never drawn on.)
# NOTE(review): `batch_mask_vec` is collected alongside the embeddings but is
# not applied here — confirm whether masked rows should be dropped first.
n = 5824
pca = PCA()
pipe = Pipeline([('scaler', StandardScaler()), ('pca', pca)])
Xt = pipe.fit_transform(embeddings[:n])

In [None]:
# Scatter the first two PCA dimensions of the embeddings, one panel per label,
# coloured by class.
labels = ['bmi','hypertension','homa_ir']
colors = ['green', 'blue', 'orange', 'red']

plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams['axes.edgecolor'] = 'black'

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))
for idx, label in enumerate(labels):

  # Categorize continuous labels. `vec[:n]` is a view, so binning it in place
  # would overwrite bmi_vec / homa_ir_vec and make a re-run of this cell bin
  # already-binned values. np.select builds a new array from the raw values
  # instead. Samples that match no bin get a code that is not plotted.
  if label == 'bmi':
    bmi_raw = np.asarray(bmi_vec[:n], dtype=float)
    yd = np.select(
        [(bmi_raw > 0) & (bmi_raw < 20),
         (bmi_raw >= 20) & (bmi_raw <= 25),  # BMI == 20 used to fall in no bin
         (bmi_raw > 25) & (bmi_raw <= 30),
         (bmi_raw > 30) & (bmi_raw <= 50)],
        [0, 1, 2, 3],
        default=-1)
    class_names = ['Under Weight', 'Normal Weight', 'Over Weight', 'Obese']
  elif label == 'hypertension':
    yd = hypertension_vec[:n]
    class_names = ['No Hypertension', 'Hypertension']
  elif label == 'homa_ir':
    homa_raw = np.asarray(homa_ir_vec[:n], dtype=float)
    yd = np.select(
        [homa_raw < 0,
         (homa_raw >= 0) & (homa_raw <= 2.5),
         homa_raw > 2.5],
        [-1, 0, 1],
        default=np.nan)
    class_names = ['Low HOMA-IR', 'High HOMA-IR']

  for i, class_name in enumerate(class_names):
    axes[idx].scatter(Xt[yd == i, 0], Xt[yd == i, 1], c=colors[i],
                      label=class_name, alpha=0.1)
  axes[idx].legend()

  axes[idx].set_xlabel('PCA Dim 1')
  axes[idx].set_ylabel('PCA Dim 2')
  axes[idx].axis('square')
plt.show()

In [None]:
# Display the class codes currently held in `yd`.
yd