In [1]:
import pandas as pd
import numpy as np
from scipy import stats

import seaborn as sns
import pylab as plt
from statannot import add_stat_annotation
import statsmodels.api as sm
from statsmodels.stats.multitest import multipletests


import missingno as msn

from importlib import reload
from functools import reduce
import datetime

import sys
sys.path.insert(1,'../scripts')
import plots
import utils
import _preprocess

In [2]:
# --- filesystem locations used throughout the notebook ---
# where figures for the study-watch analysis are written
image_path = '/scratch/c.c21013066/images/ppmi/studywatch'
# root of the PPMI data (phenotype tables live below this)
data_path = '/scratch/c.c21013066/data/ppmi'
# smartwatch / accelerometer exports
path = '/scratch/c.c21013066/data/ppmi/accelerometer'

# Load Data

In [3]:
# load clinical scores
# `demo` holds one table of demographics (with parsed birth dates); `behavior`
# holds the clinical assessments with a parsed visit `date`.
demo = pd.read_csv(f'{data_path}/phenotypes2023/demographics_clean.csv',parse_dates=['date_birth'])
behavior = pd.read_csv(f'{data_path}/phenotypes2023/behavior_clean.csv',parse_dates=['date'])

# adjust to fit into bounds
# NOTE: these statements mutate `behavior` in place and are order-dependent.
# HVLT recognition: negative values become missing, values above 12 are capped at 12
behavior.loc[behavior['hvlt_recognition']<0,'hvlt_recognition'] = np.nan
behavior.loc[behavior['hvlt_recognition']>12,'hvlt_recognition'] = 12
# Benton: values above 15 are capped at 15
behavior.loc[behavior['benton']>15,'benton'] = 15
# STAI trait/state: values below 20 are treated as missing
behavior.loc[behavior['stai_trait']<20,'stai_trait'] = np.nan
behavior.loc[behavior['stai_state']<20,'stai_state'] = np.nan
# where the OFF-state UPDRS III score is missing, fall back to the NoMED score
behavior['updrs_iii_OFF'] = behavior['updrs_iii_OFF'].fillna(behavior['updrs_iii_NoMED'])
# attach birth dates; the right join keeps every row of `behavior`
behavior = pd.merge(demo[['participant','date_birth']],behavior,on='participant',how='right')
# order rows chronologically by visit date
behavior = behavior.set_index('date').sort_index().reset_index()
# NOTE(review): presumably adds the `visit_age` column used in later cells from
# `date` and `date_birth` -- confirm in scripts/_preprocess.py
behavior = _preprocess.get_visit_age(behavior)
behavior = behavior.drop(columns=['date_birth'])

# add medication
# the first CSV column is used as the index of `ledd`
ledd = pd.read_csv(f'{data_path}/phenotypes2023/ledd.csv',parse_dates=['date'],index_col=0)
# outer join: keeps rows that appear in only one of the two tables
behavior = pd.merge(ledd,behavior,on=['participant','date'],how='outer')

# define modalities: each list names the `behavior` columns of one clinical domain
neuropsychiatric = [
    'stai_trait',
    'stai_state',
    'gds',
    'quip',
]
cognition = [
    'semantic_fluency',
    'moca',
    'benton',
    'lns',
    'hvlt_recall',
    'hvlt_recognition',
    'hvlt_retention',
    'symbol_digit',
]
autonome = [
    'epworth',
    'rbd',
    'systolic_bp_drop',
    'scopa_aut',
]
daily = ['se_adl', 'updrs_i']
motor = ['updrs_ii', 'updrs_iii_OFF']
dat = ['datscan_caudate_l']
bio = ['ttau']
medication = ['LEDD', 'updrs_iv', 'updrs_iii_ON']

In [4]:
# load smartwatch data
# reload so edits to ../scripts/utils.py are picked up without restarting the kernel
reload(utils)
merged, ambulatory, step, sleep, pulse, pulsevar = utils.load_timeseries(demo,path)

# Draw a 10-participant subset with a fixed seed so that a fresh
# "Restart & Run All" merges the same participants every time
# (an unseeded sample() changes the subset on each run).
subset = demo['participant'].sample(10, random_state=0)
timeseries = utils.merge_timeseries([ambulatory, step, sleep, pulse, pulsevar],subset=subset)
# flatten the index, replace the `participant` column by `subject` (renamed to
# `participant`) and order the rows by their `date_local_adj` timestamp
timeseries = (
    timeseries
    .reset_index()
    .drop(columns=['participant'])
    .rename(columns={'subject':'participant'})
    .set_index('date_local_adj')
    .sort_index()
    .reset_index()
)

  df['date'] = df['time'].dt.to_period('D')
  df['date'] = df['time'].dt.to_period('D')
  df['date'] = df['time'].dt.to_period('D')
  features = df.groupby('subject')[df_raw.columns[5:]].agg(['size','mean','std','max','min'])
  df['date'] = df['time'].dt.to_period('D')
  df['date'] = df['time'].dt.to_period('D')
  dfs = [df.loc[(subset,slice(None)),:] for df in dfs]
  merged = reduce(lambda  left,right: pd.merge(left,right,right_index=True,left_index=True,
  merged = reduce(lambda  left,right: pd.merge(left,right,right_index=True,left_index=True,
  merged = reduce(lambda  left,right: pd.merge(left,right,right_index=True,left_index=True,


In [5]:
# get names of digital outcome measures
import re

# all digital columns; `predictors` stays unfiltered, `predictors_filt` drops
# every column whose name contains '_ms'
predictors = timeseries.filter(regex='(walking|step|efficiency|total_sleep|pulse|deep|light|rem|nrem|rmssd|wake)').columns
predictors_filt = [name for name in predictors if re.search('_ms', name) is None]

# per-domain column lists, again without the '_ms' columns
sleep_col = [
    name
    for name in timeseries.filter(regex='(efficiency|total_sleep|deep|light|rem|nrem|wake)').columns
    if re.search('_ms', name) is None
]
phys = [
    name
    for name in timeseries.filter(regex='(walking|step)').columns
    if re.search('_ms', name) is None
]
vital = [
    name
    for name in timeseries.filter(regex='(pulse|rmssd)').columns
    if re.search('_ms', name) is None
]

# Get overlap between clinic visits and digital data
To average over a month instead of a week, adjust `window` and `kind` in the next cell.

In [6]:
# merge with behavior to timeseries date
window = 3.5   # tolerance in days handed to merge_with_behavior (adjust for other timeframe: 15,3.5)
kind = 'week'  # label that later cells put into output file names
reload(utils)
ms = []
# The loop variable is deliberately NOT called `timeseries`: that name already
# holds the merged smartwatch frame built earlier in the notebook, and reusing
# it here would silently replace that frame with the last stream of this list.
for stream in [ambulatory,step,sleep,pulse,pulsevar]:
    m = utils.merge_with_behavior(stream,behavior,on='timeseries',tolerance=pd.Timedelta(window,'days'))
    # drop rows that have none of this stream's digital measures ...
    m = m.dropna(subset=np.intersect1d(predictors,stream.columns),how='all')
    # ... and rows that have no clinical score at all
    m = m.dropna(subset=np.hstack([motor,daily,cognition,neuropsychiatric,autonome,medication]),how='all')
    ms.append(m)

# outer-join the per-stream frames; columns that occur in more than one frame
# get the '__drop' suffix on the right-hand side and are removed afterwards
merged = reduce(lambda left,right: pd.merge(left,right,on=['participant','date_local_adj'],
                                            how='outer',suffixes=['','__drop']), ms)
merged = merged.dropna(subset=predictors,how='all')
# pass the column labels explicitly instead of relying on iteration over a DataFrame
merged = merged.drop(columns=merged.filter(regex='__drop').columns)

In [8]:
# clinic visit not representative normal day, so remove it from week around visit
# NOTE(review): `.dt.date` yields Python `date` objects; confirm that `date_y`
# holds the same type, otherwise the two comparisons below may never match.
def _rows_off_visit_day(sub):
    """Rows of one participant whose watch date differs from the visit date."""
    return sub.loc[sub['date_local_adj'].dt.date != sub['date_y'], :]


def _rows_on_visit_day(sub):
    """Rows of one participant whose watch date equals the visit date."""
    return sub.loc[sub['date_local_adj'].dt.date == sub['date_y'], :]


_per_participant = merged.groupby('participant')
dropvisit = _per_participant.apply(_rows_off_visit_day).reset_index(drop=True)
clinicvisit = _per_participant.apply(_rows_on_visit_day).reset_index(drop=True)

In [20]:
# Per-visit summaries: one row per (participant, matched clinic visit).
df = dropvisit.copy(deep=True)

# columns that are summarised, and the keys that define one visit
summary_cols = np.hstack([predictors,motor,cognition,neuropsychiatric,autonome,daily,medication,'visit_age'])
visit_keys = ['participant','date_y']

# all participants: mean over the rows matched to each visit
grouped_mean = df.groupby(visit_keys)[summary_cols].mean()

# PD only: mean, plus the number of non-missing rows behind each mean
merged_pd = df[df['diagnosis']=='pd']
grouped_mean_pd = merged_pd.groupby(visit_keys)[summary_cols].mean()
grouped_pd_missing = merged_pd.groupby(visit_keys)[summary_cols].count()

# healthy controls only
merged_hc = df[df['diagnosis']=='hc']
grouped_mean_hc = merged_hc.groupby(visit_keys)[summary_cols].mean()

In [21]:
# compute amount of missing data
# NOTE(review): 24*6 is the denominator used for the expected number of
# observations per visit -- TODO confirm what it represents.
expected_count = 24*6

# one row per participant, then treat a count of 0 as "no data" rather than 100 % missing
grouped_pd_missing = grouped_pd_missing.groupby('participant').last()
grouped_pd_missing[predictors_filt] = grouped_pd_missing[predictors_filt].replace([0],np.nan)

# percentage of the expected observations that are absent
missing_pct = (100/expected_count) * (expected_count-grouped_pd_missing)
# show count and mean per digital measure; to export, append
# .to_csv('/scratch/c.c21013066/data/ppmi/analyses/studywatch/clinicvisit_missingrate_digital.csv')
missing_pct[predictors_filt].describe().iloc[:2,:].T

In [11]:
# save digital weekly averages
# Write next to the smartwatch data using the `path` configured at the top of
# the notebook instead of repeating the absolute directory in every call.
grouped_mean.to_csv(f'{path}/{kind}ly_mean.csv')
grouped_mean_pd.to_csv(f'{path}/{kind}ly_mean_pd.csv')

# Correlate digital averages with clinical scores

In [33]:
# Re-read the previously saved PD averages from the configured `path` (rather
# than a second hard-coded copy of the directory). No `index_col` is passed, so
# the saved index levels come back as ordinary columns.
grouped_mean_pd_old = pd.read_csv(f'{path}/{kind}ly_mean_pd.csv')

In [12]:
# restrict to one visit per person to avoid overrepresentation
# GroupBy.last() returns the last *non-null value of each column*, so a
# participant's row could combine a clinical score from one visit with a
# digital average from another. tail(1) keeps the actual last row instead; rows
# are ordered by date_y within each participant because the (participant,
# date_y) groupby that built this frame sorts its keys.
last_visit = grouped_mean_pd.groupby('participant').tail(1)
if 'date_y' in last_visit.index.names:
    # keep the participant-only index that groupby(...).last() used to produce;
    # the guard keeps the cell safe to re-run once the level is already gone
    last_visit = last_visit.droplevel('date_y')
grouped_mean_pd = last_visit
print(grouped_mean_pd.shape)

(92, 44)


In [15]:
# Clinical columns per domain. `medication` is ordered to line up with the
# display labels in `cl_names` below.
neuropsychiatric = [
    'stai_trait', 'stai_state', 'gds', 'quip',
]
cognition = [
    'semantic_fluency', 'moca', 'benton', 'lns',
    'hvlt_recall', 'hvlt_recognition', 'hvlt_retention', 'symbol_digit',
]
autonome = [
    'epworth', 'rbd', 'systolic_bp_drop', 'scopa_aut',
]
daily = ['se_adl', 'updrs_i']
motor = ['updrs_ii', 'updrs_iii_OFF']
dat = ['datscan_caudate_l']
bio = ['ttau']
medication = ['updrs_iii_ON', 'updrs_iv', 'LEDD']

# Display labels, listed in the order
# cognition, neuropsychiatric, autonome, daily, motor, medication.
cl_names = [
    # cognition
    'Semantic Fluency', 'MOCA', 'Benton', 'Letter Number Sequencing',
    'HVLT Recall', 'HVLT Recognition', 'HVLT Retention', 'Symbol Digit',
    # neuropsychiatric
    'STAI trait', 'STAI state', 'GDS', 'QUIP',
    # autonome
    'ESS', 'RBDSQ', 'Systolic BP Drop', 'SCOPA autonome',
    # daily
    'Schwab England ADL', 'UPDRS I',
    # motor
    'UPDRS II', 'UPDRS III OFF',
    # medication
    'UPDRS III ON', 'UPDRS IV', 'LEDD',
]

In [16]:
# Pairwise Pearson correlation between all clinical scores and digital measures.
# `feature_cols` are the column names in `grouped_mean_pd`; `feature_labels` are
# the names used in the result table (display labels for clinical scores, the
# column name itself for digital measures). Both are built once, in matching order.
feature_cols = np.hstack([cognition,neuropsychiatric,autonome,daily,motor,medication,predictors_filt])
feature_labels = np.hstack([cl_names,predictors_filt])
# zip() would silently truncate and mislabel results if the lists drifted apart
assert len(feature_cols) == len(feature_labels), 'cl_names does not match the modality lists'

corr = pd.DataFrame(columns=['N','pearson r','p-value'],
                    index=pd.MultiIndex.from_product([feature_labels, feature_labels],
                                                     names=['f1','f2']))

labelled = list(zip(feature_cols, feature_labels))
for i, (p, m) in enumerate(labelled):
    # only the pairs after position i: each unordered pair is computed once
    for u, n in labelled[i+1:]:
        try:
            # `pair_data`, not `dat`: `dat` is the notebook-level list of
            # DaTscan columns and must not be overwritten here
            pair_data = grouped_mean_pd.dropna(subset=[p,u])
            corr.loc[(m,n),'N'] = pair_data.shape[0]
            r,pval = stats.pearsonr(pair_data[p],pair_data[u])
            corr.loc[(m,n),'pearson r'] = r
            corr.loc[(m,n),'p-value'] = pval
        except Exception as err:
            # best effort: report the pair and the reason, then carry on.
            # `Exception` (not a bare except) lets KeyboardInterrupt through.
            print(u,p,f'-> skipped ({err!r})')

In [39]:
# Save the correlation table below the data root configured in `data_path`
# instead of repeating the absolute prefix here.
corr.to_csv(f'{data_path}/analyses/studywatch/corr_clinicaldig_pd_{kind}_lastvisit_allcorr.csv')