In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import pingouin as pg

import seaborn as sns
import pylab as plt
import statsmodels.api as sm

from importlib import reload
from functools import reduce
import glob
import datetime
import re

import os
import sys
sys.path.insert(1,'../scripts')
import utils
import plots
#import classify
sys.path.insert(1,'../../phenotype')
import _loaders
import _preprocess

from sklearn import cluster, decomposition,preprocessing,linear_model,model_selection,metrics,neighbors,mixture
from sklearn.pipeline import Pipeline

import pickle

In [None]:
def plot_context():
    """Apply the seaborn "talk" plotting context used for the figures.

    Fonts are scaled by 0.9 through seaborn's ``font_scale`` argument; the base
    font, axes-title and axes-label sizes are then set explicitly via ``rc``.

    ``font_scale`` has to be passed as its own keyword: seaborn only keeps
    ``rc`` entries that are known context parameters, so a ``"font_scale"``
    key placed inside ``rc`` is dropped without any warning.
    """
    sns.set_context(
        "talk",
        font_scale=0.9,
        rc={"font.size": 18, "axes.titlesize": 18, "axes.labelsize": 16},
    )

In [None]:
# Root folder of the PPMI data.
data_path = '/scratch/c.c21013066/data/ppmi'
# The accelerometer data sits directly below the data root.
path = f'{data_path}/accelerometer'
# Output folder for the figures.
image_path = '/scratch/c.c21013066/images/paper/digitalPPMI'

# Load data

In [None]:
# Every phenotype table below is read from the same folder.
_phenotype_dir = f'{data_path}/phenotypes2021'

demo = pd.read_csv(f'{_phenotype_dir}/demographics.csv', parse_dates=['date_birth'])
behavior = _loaders.load_behavior(_phenotype_dir)
datscan = _loaders.load_datscan_all(_phenotype_dir)
genotypes = _loaders.load_genotypes(_phenotype_dir)
prodromal = _loaders.load_prodromalBerg(_phenotype_dir)

# utils.load_timeseries returns six objects, unpacked in the order given here.
merged, ambulatory, step, sleep, pulse, pulsevar = utils.load_timeseries(demo, path)

In [None]:
# Columns shared by the two biospecimen tables; used as merge keys.
_bsp_keys = ['participant', 'date', 'visit']

biospecimen = pd.read_csv(f'{data_path}/phenotypes2021/biospecimen_SAA_all_clean.csv', index_col=0)
bsp_rest = pd.read_csv(f'{data_path}/phenotypes2021/biospecimen_clean_add.csv', index_col=0)
bsp_rest = bsp_rest.drop(columns=['index'])

# The last column of the SAA table is left out before the outer merge.
biospecimen_all = biospecimen.iloc[:, :-1].merge(bsp_rest, on=_bsp_keys, how='outer')
# Keep only rows recorded at the 'BL' visit.
biospecimen_all = biospecimen_all.loc[biospecimen_all['visit'] == 'BL']

In [None]:
demo = pd.read_csv(f'{data_path}/phenotypes2021/demographics.csv',parse_dates=['date_birth'])

# --- Biospecimen: add demographics and visit age, then one row per participant ---
# NOTE(review): the feature list is taken positionally (last 7 columns) — confirm
# this still matches the column layout of the merged biospecimen table.
biospfeatures = biospecimen_all.columns[-7:]
biospecimen = pd.merge(biospecimen_all,demo,on='participant',how='left')
biospecimen = _preprocess.date_to_datetime(biospecimen)
biospecimen = _preprocess.get_visit_age(biospecimen)
biospecimen = biospecimen.sort_values(['participant','date'])
biospecimen_last = biospecimen.groupby('participant').last()

# --- DaTscan: same preparation, plus the measures added by _preprocess.get_DatScan_IDPs ---
datscan_ = pd.read_csv(f'{data_path}/phenotypes2021/datscan_all_clean.csv',index_col=0)
datscan_ = pd.merge(datscan_,demo,on='participant',how='left')
datscan_ = _preprocess.date_to_datetime(datscan_)
datscan_ = _preprocess.get_visit_age(datscan_)
datscan_ = _preprocess.get_DatScan_IDPs(datscan_)
datfeatures = ['datscan_caudate_mean', 'datscan_putamen_mean',
       'datscan_mean','datscan_asymmetry','datscan_caudate_asymmetry', 'datscan_putamen_asymmetry']
# Sort by date so that first()/last() below pick the earliest/latest entries.
datscan_ = datscan_.sort_values(['participant','date'])
datscan_last = datscan_.groupby('participant').last()
datscan_bl = datscan_.groupby('participant').first()
# `datscan` is rebound here; the table loaded earlier under that name is replaced.
datscan = _preprocess.get_DAT_deficit(datscan_)
dat_last = datscan.dropna(subset=['putamen_min'],how='any',axis='rows').sort_values('date').groupby('participant').last()

# Pair DaTscan rows with biospecimen rows via _preprocess.match_only_nearest_df
# (merge='visit_age', tolerance=2). An index-on-index pd.merge of biospecimen_last
# with datscan_ used to be assigned to `bio` just before this call; its result was
# overwritten here before any use, so that dead merge has been removed.
bio = _preprocess.match_only_nearest_df(datscan_.reset_index(),biospecimen.reset_index(),merge='visit_age',tolerance=2,suffixes=['','_bio'])

In [None]:
# Add the DaTscan-derived deficit columns to the prodromal table. Columns of the
# prodromal table that clash with them receive the '_visual' suffix.
# NOTE(review): `prodromal` is overwritten with the merged result, so this cell is
# presumably not safe to run twice without reloading `prodromal` — confirm.
prodromal = pd.merge(dat_last.reset_index()[['participant','dat_deficit','putamen_min']],prodromal,on='participant',how='right',suffixes=['','_visual'])
prodromal['dat_deficit_visual'] = prodromal['dat_deficit_visual'].replace(['negative','positive'],[0,1])

# Attach the behaviour row recorded at each participant's phenoconversion visit.
behprod = pd.merge(behavior.reset_index().drop(columns=['rbd']),prodromal,left_on=['participant','visit'],right_on=['participant','phenoconverted_visit'],how='right')
# For phenoconverted participants, the date of that visit is the conversion date.
converted = behprod['phenoconverted']==1
behprod.loc[converted,'phenoconverted_date'] = behprod.loc[converted,'date']

prod = pd.merge(demo,genotypes,on='participant')
prod = pd.merge(prod,behprod[np.hstack([prodromal.columns,'phenoconverted_date'])],on='participant')

# Age at phenoconversion in years. Dividing by np.timedelta64(1, "Y") is rejected by
# pandas 2.x (year is not an unambiguous timedelta unit). 365.2425 days is the year
# length numpy uses for its 'Y' unit, so the resulting values are unchanged.
prod['phenoconverted_age'] = (prod['phenoconverted_date'] - prod['date_birth'])/pd.Timedelta(days=365.2425)

In [None]:
# load extracted features
# NOTE(review): `path` is re-pointed here to a different location than the
# accelerometer folder configured near the top of the notebook. Every later cell
# that reads f'{path}/extracted_features' picks up this value — confirm that the
# change of location is intended.
#reload(utils)
path = '/rds/general/user/aschalka/home/data/ppmi/accelerometer'
# One feature table built from the five listed CSV files.
f = utils.read_extracted_features(f'{path}/extracted_features',names=['stepcount.csv','ambulatory.csv','prv.csv','pulserate.csv','sleepmetrics2.csv'])

In [None]:
# Per-subject min/max/mean/median of 'age_accelerometry', one frame per table.
age_stats = [
    table.groupby('subject')[['age_accelerometry']].agg(['min', 'max', 'mean', 'median'])
    for table in (ambulatory, step, sleep, pulse, pulsevar)
]

# Stack the frames; a subject that appears in several tables keeps the row from
# the first table in which it occurs.
ages = pd.concat(age_stats)
ages = ages[~ages.index.duplicated(keep='first')]

# Flatten the two-level column labels into '<column>_<statistic>'.
ages.columns = ['_'.join(col) for col in ages.columns.values]

In [None]:
# Attach the age summaries and the clinical table to every row of the feature
# table (right joins, so all feature rows are kept), indexed by participant.
m = ages.reset_index().merge(f, left_on='subject', right_on='participant', how='right')
m = prod.merge(m, on='participant', how='right')
m = m.set_index('participant')

# Participants for whom both ages are available are relabelled 'pd' in `diagnosis_own`.
# NOTE(review): the sign of the age difference is not checked — confirm that
# conversion after the accelerometry period should also count.
age_gap = m['age_accelerometry_min'] - m['phenoconverted_age']
converter = age_gap.dropna().index
m['diagnosis_own'] = m['diagnosis'].copy()
m.loc[converter, 'diagnosis_own'] = 'pd'

# Everything after the first column of the feature table is treated as a feature.
features = f.columns[1:]
covs = np.hstack([ages.columns, 'male', 'education'])
m['male'] = m['gender'].replace(['f', 'm'], [0, 1])
# One indicator column per value of `diagnosis_own`.
diagnosis_dummies = pd.get_dummies(m['diagnosis_own'])
m = pd.merge(m, diagnosis_dummies, on='participant')

# Classify PD vs HC using digital features
- timeframe: weekly, all data
- features: all sensors, sensor specific
- ML models

In [None]:
# per modality models
reload(utils)
# `classify` is used below, but its import at the top of the notebook is commented
# out; import it here so the cell also runs on a fresh kernel.
import classify

namess = [['stepcount.csv','ambulatory.csv'],['prv.csv','pulserate.csv'],['sleepmetrics2.csv']]
labels = ['physical activity','vital signs','sleep']
for label,names in zip(labels,namess):
    # NOTE(review): only the 'vital signs' modality is processed; the other two
    # labels are skipped — confirm whether this restriction is a leftover.
    if label != 'vital signs':
        continue

    # Rebuild the analysis table from this modality's feature files only.
    f = utils.read_extracted_features(f'{path}/extracted_features',names=names)
    m = pd.merge(ages.reset_index(),f,right_on='participant',left_on='subject',how='right')
    m = pd.merge(prod,m,on='participant',how='right')
    m = m.set_index('participant')
    converter = (m['age_accelerometry_min']-m['phenoconverted_age']).dropna().index
    m['diagnosis_own'] = m['diagnosis'].copy()
    m.loc[converter,'diagnosis_own'] = 'pd'

    features = f.columns[1:]
    print(len(features))
    covs = np.hstack([ages.columns,'male','education'])
    m['male'] = m['gender'].replace(['f','m'],[0,1])
    m = pd.merge(m,pd.get_dummies(m['diagnosis_own']),on='participant')
    # The target is taken from `diagnosis`; this replaces the 'pd' indicator
    # column created from `diagnosis_own` just above.
    m['pd'] = (m['diagnosis']=='pd').astype(int)

    # PD and HC rows are used for fitting; 'prod' rows form the external set.
    clean = m[m['diagnosis'].isin(['pd','hc'])].dropna(subset=np.hstack([features,covs]))
    external_test = m[m['diagnosis']=='prod'].dropna(subset=np.hstack([features,covs]))

    # NOTE(review): `select_features` is not defined or imported anywhere in the
    # visible notebook — presumably tsfresh's; confirm and import it explicitly.
    X_filtered = select_features(clean[features], clean['pd'])
    # Keep the full feature list when the preselection leaves nothing.
    if len(X_filtered.columns)>0:
        features = X_filtered.columns
    print(len(features))

    folder_path = f'/scratch/c.c21013066/data/ppmi/analyses/classifyPDHC/digital_tsfresh_{label}/'
    if not os.path.exists(folder_path):
        # Create the folder if it doesn't exist
        os.makedirs(folder_path)
        print(f"Folder created at {folder_path}")
    else:
        print(f"Folder already exists at {folder_path}")

    # baseline: covariates only (empty feature list)
    join_dig = pd.concat([clean,external_test])
    coefs,cl,join_dig = classify.run_classification(clean.reset_index(),[],covs,'pd',join_dig,
                                       save=folder_path)
    # features + covariates, starting again from a freshly built `join_dig`
    join_dig = pd.concat([clean,external_test])
    coefs,cl,join_dig = classify.run_classification(clean.reset_index(),features,covs,'pd',join_dig,
                                       save=folder_path)

In [None]:
# all modalities combined model
# modify to either select whole or last week
#names = ['stepcount_lastweek.csv','ambulatory_lastweek.csv','prv_lastweek.csv','pulserate_lastweek.csv','sleepmetrics2_lastweek.csv']
names = ['stepcount.csv','ambulatory.csv','prv.csv','pulserate.csv','sleepmetrics2.csv']
f = utils.read_extracted_features(f'{path}/extracted_features',names=names)
m = pd.merge(ages.reset_index(),f,right_on='participant',left_on='subject',how='right')
m = pd.merge(prod,m,on='participant',how='right')
m = m.set_index('participant')
converter = (m['age_accelerometry_min']-m['phenoconverted_age']).dropna().index
m['diagnosis_own'] = m['diagnosis'].copy()
m.loc[converter,'diagnosis_own'] = 'pd'

features = f.columns[1:]
covs = np.hstack([ages.columns,'male','education'])
m['male'] = m['gender'].replace(['f','m'],[0,1])
m = pd.merge(m,pd.get_dummies(m['diagnosis_own']),on='participant')
# The target is taken from `diagnosis`; this replaces the 'pd' indicator column
# created from `diagnosis_own` just above.
m['pd'] = (m['diagnosis']=='pd').astype(int)

# PD and HC rows are used for fitting; 'prod' rows form the external set.
clean = m[m['diagnosis'].isin(['pd','hc'])].dropna(subset=np.hstack([features,covs]))
external_test = m[m['diagnosis']=='prod'].dropna(subset=np.hstack([features,covs]))

# preselect features
# NOTE(review): `select_features` is not defined or imported anywhere in the
# visible notebook — presumably tsfresh's; confirm and import it explicitly.
X_filtered = select_features(clean[features], clean['pd'])

# `classify` must be imported before it can be reloaded; its import at the top of
# the notebook is commented out, which makes reload(classify) fail on a fresh kernel.
import classify
reload(classify)
reload(plots)
join_dig = pd.concat([clean,external_test])
coefs,cl,join_dig = classify.run_classification(clean.reset_index(),X_filtered.columns,covs,'pd',join_dig,
                                       save='/scratch/c.c21013066/data/ppmi/analyses/classifyPDHC/digital_tsfresh/')


In [None]:
# ML models
# NOTE(review): the keyword is spelled `saveing` — confirm it matches the parameter
# name of classify.run_classification_models.
# NOTE(review): `join_dig` is passed in as returned by the preceding run rather
# than rebuilt from `clean` and `external_test` — confirm this is intended.
coefs, cl, join_dig = classify.run_classification_models(
    clean.reset_index(),
    X_filtered.columns,
    covs,
    'pd',
    join_dig,
    saveing='/scratch/c.c21013066/data/ppmi/analyses/classifyPDHC',
)

In [None]:
# baseline model: covariates only (empty feature list)
# NOTE(review): `join_dig` is passed in as returned by the preceding run rather
# than rebuilt from `clean` and `external_test` — confirm this is intended.
coefs, cl, join_dig = classify.run_classification(
    clean.reset_index(),
    [],
    covs,
    'pd',
    join_dig,
    save='/scratch/c.c21013066/data/ppmi/analyses/classifyPDHC/digital_tsfresh/baseline/',
)