In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import pingouin as pg

import seaborn as sns
import pylab as plt
from statannotations.Annotator import Annotator
import statsmodels.api as sm
from statsmodels import stats as sm_stats
from sklearn import metrics

from importlib import reload
from functools import reduce
import glob
import datetime
import re

import sys
sys.path.insert(1,'../scripts')
import plots
sys.path.insert(1,'../../phenotype')
import utils
import _preprocess

import pickle
from statsmodels.stats.multitest import multipletests

In [None]:
def plot_context():
    """Apply the seaborn "talk" plotting context with this notebook's font-size overrides.

    NOTE(review): "font_scale" is passed as a key of ``rc`` rather than as the
    ``font_scale`` argument of ``sns.set_context`` -- confirm it has the intended effect.
    """
    font_overrides = {"font.size":18,"axes.titlesize":18,"axes.labelsize":16,"font_scale":0.9}
    sns.set_context("talk", rc=font_overrides)

In [None]:
# Paths for a first system; both names are reassigned a few lines below,
# so these two values are never in effect once the whole cell has run.
data_path = '/scratch/c.c21013066/data/ppmi/analyses'
image_path = '/scratch/c.c21013066/images/paper/digitalPPMI'

# Paths in effect for the rest of the notebook:
#   path       -> root of the PPMI data directory
#   data_path  -> analysis outputs (read and written below)
#   image_path -> where figures are saved
path = '/rds/general/user/aschalka/home/data/ppmi'
data_path = '/rds/general/user/aschalka/home/data/ppmi/analyses'
image_path = '/rds/general/user/aschalka/home/images/paper/prodromalPPMI'

In [None]:
# Build `merged`: one row per participant holding, for every model in `names`,
# the columns prob_<name>, age_<name> and (when the input provides it) risk_<name>.
names = ['prodromal_Heinzel','prodromal_Heinzel_noDaT','UPSIT','digital']
files = [
        'prodromal/Heinzel_Yan2024_>1.csv','prodromal/Heinzel_Yan2024_noDaT_>1.csv','classifyPDHC/upsit/predictions',
         'classifyPDHC/digital_tsfresh/predictions']
for (n,name),file in zip(enumerate(names),files):
    if 'prodromal' in name:
        # prodromal-criteria results come as a single table
        data = pd.read_csv(f'{data_path}/{file}')
        data = data.rename(columns={'PD':f'risk_{name}'})
        data = data.rename(columns={'age':'baseline_age','current_age':'age'})
    else:
        # classifier predictions are stored per cross-validation fold: <file>0.csv ... <file>4.csv
        for cv in range(5):
            fold = pd.read_csv(f'{data_path}/{file}{cv}.csv')
            if cv==0:
                folds = pd.DataFrame(index=fold.participant,columns=np.arange(5))
            folds.loc[fold.participant,cv] = fold['pred_proba'].values
            # the age column differs between visit-based and accelerometry-based models
            if 'clinical' in name or 'SAA' in name or 'DaTscan' in name or 'UPSIT' in name:
                folds.loc[fold.participant,'age'] = fold['visit_age'].values
            else:
                folds.loc[fold.participant,'age'] = fold['age_accelerometry_mean'].values
        fold_cols = np.arange(5)
        folds['prob_PD'] = folds.loc[:,fold_cols].mean(axis=1)
        # spread across folds (was computed with .mean, duplicating prob_PD);
        # cast to float because `folds` was created empty and holds object dtype
        folds[f'prob_{name}_std'] = folds.loc[:,fold_cols].astype(float).std(axis=1)
        data = folds[['prob_PD','age']].reset_index()

    data = data.rename(columns={'prob_PD':f'prob_{name}','pred_proba':f'prob_{name}','age':f'age_{name}'})

    if n == 0:
        merged = data[['participant',f'prob_{name}','diagnosis',f'risk_{name}',f'age_{name}']]
    else:
        # only the prodromal tables carry a risk_<name> column; select it when present
        # instead of relying on a bare `except:` to detect its absence
        keep = ['participant',f'prob_{name}']
        if f'risk_{name}' in data.columns:
            keep.append(f'risk_{name}')
        keep.append(f'age_{name}')
        merged = pd.merge(merged,data[keep],on='participant',how='outer')

In [None]:
# add all risk info
raw = pd.read_csv(f'{data_path}/prodromal/Heinzel_Yan2024_>1.csv',index_col=0)
raw = pd.merge(raw.drop(columns=['diagnosis']),merged,on='participant',how='outer')
converter = pd.read_csv(f'{data_path}/prodromal/converterInfo.csv',index_col=0)
# keep the first converter record per participant
converter = converter.groupby('participant').first().reset_index()
phenocon_cols = np.hstack(['participant',converter.filter(regex='phenocon').columns])
# Columns present on both sides get this suffix on the left (`raw`) copy so it can be
# discarded in favour of the converter table's values.
drop_suffix = '_drop$'
raw = pd.merge(raw,converter[phenocon_cols],on='participant',how='outer',suffixes=[drop_suffix,''])
# Match the suffix literally: as a regex, '_drop$' anchors at end-of-string and never
# matches a name that ends with the '$' character, so nothing was being dropped.
raw = raw.drop(columns=[c for c in raw.columns if str(c).endswith(drop_suffix)])

In [None]:
# Recode the genetic / family-history flags from 0/1 to 'no'/'yes' so that get_dummies
# produces <col>_no / <col>_yes indicator columns.
raw[['GBA','SNCA','LRRK2','1st_degree_family_history']] = raw[['GBA','SNCA','LRRK2','1st_degree_family_history']].replace([0,1],['no','yes'])
raw = raw.set_index('participant')
yesno = pd.get_dummies(raw[['GBA','SNCA','LRRK2','1st_degree_family_history']]).set_index(raw.index)
raw = pd.merge(raw,yesno,right_index=True,left_index=True,how='outer')
raw = raw.reset_index()
# NOTE: `names` is redefined here (it previously listed the models loaded from file).
names = ['prodromal_Heinzel','prodromal_Heinzel_noDaT','hyposmia','digital','DaTscan','SAA']
# For every method, derive diagnosis_update_<method> from `diagnosis` using the age at
# which that method's measurement was taken.
for method,age in zip(names, ['age_prodromal_Heinzel','age_prodromal_Heinzel_noDaT','age_UPSIT','age_digital','age_DaTscan','age_SAA']):
                      # earlier alternative age columns, kept for reference:
                      #['age_clinical_updrs','age_digital_tsfresh','age_clinical_updrs','age_bio_last','age_dat_last','age_bio_bl_all_saa_positive','age_bio_bl_saa_mean','age_dat_first']):
                            # 'age_bio_bl','age_bio_bl_all_pos','age_bio_bl_saa_pos','age_bio_bl_saa_mean','age_bio_bl_fmax']):
    print(method)
    raw[f'diagnosis_update_{method}'] = raw['diagnosis'].copy()
    # phenoconverted participants whose conversion age is below the measurement age -> 'converted'
    raw.loc[np.logical_and(raw['phenoconverted']==1,(raw['phenoconverted_age']-raw[age])<0),f'diagnosis_update_{method}'] = 'converted'
    # prodromal participants without a measurement age for this method -> 'no data'
    raw.loc[np.logical_and(raw[f'diagnosis_update_{method}']=='prod',raw[age].isna()),f'diagnosis_update_{method}'] = 'no data'
    # prodromal participants flagged as phenoconverted but with no conversion age -> 'no data'
    raw.loc[np.logical_and(raw[f'diagnosis_update_{method}']=='prod',
                           np.logical_and(raw['phenoconverted']==1,raw['phenoconverted_age'].isna())),f'diagnosis_update_{method}'] = 'no data'

In [None]:
# Participants labelled 'converted' in at least one diagnosis_update_* column.
cols = raw.filter(regex='diagnosis_update').columns
has_converted = raw[cols].eq('converted').any(axis=1)
convs = raw.loc[has_converted,np.hstack([cols,'participant'])]

In [None]:
# Load SAA results, add SAA_no / SAA_yes indicators and the mean of the three Fmax
# readings, then attach everything to `raw` by participant.
saa_positive = pd.read_csv(f'{path}/phenotypes2021/biospecimen_SAA_all_clean.csv')
saa_labels = saa_positive['SAA_positive'].replace([0,1],['no','yes'])
saa_positive['SAA_positive'] = saa_labels
saa_dummies = pd.get_dummies(saa_labels,prefix='SAA')
saa_positive = pd.concat([saa_positive,saa_dummies],axis=1)
fmax_cols = ['SAA_Fmax1','SAA_Fmax2','SAA_Fmax3']
saa_positive['SAA_mean'] = saa_positive[fmax_cols].mean(axis=1)
raw = raw.merge(saa_positive,on='participant',how='outer')

In [None]:
# Collapse each <v>_no / <v>_yes indicator pair into a single column <v> coded 0/1.
for v in ['male', 'pesticide_exposure', 'rbd_psgproven', 'rbd_test', 'HYPOSMIA', 'positiveDaT', 'SAA','constipation_','ErectileDysfunction_','UrinaryDysfunction_','OrthostaticHypotension_',
         'ExcessiveDaytimeSleepiness_','DepressionAnxiety_','1st_degree_family_history_']:
    # idxmax returns the name of the larger indicator column, which is then mapped to 0 / 1
    raw[v] = raw[[f'{v}_no',f'{v}_yes']].idxmax(axis=1).replace([f'{v}_no',f'{v}_yes'],[0,1])
    # rows where neither indicator is set carry no information -> NaN
    raw.loc[raw[[f'{v}_no',f'{v}_yes']].sum(axis=1)==0,v] = np.nan

In [None]:
# Categorise the Heinzel probability: (0, 0.05] low, (0.05, 0.8] medium, (0.8, 1] high risk
risk_labels = ['low risk','medium risk','high risk']
bins = pd.IntervalIndex.from_tuples([(0, 0.05), (0.05, 0.8), (0.8, 1)])
interval_to_label = dict(zip(bins, risk_labels))

heinzel_interval = pd.cut(raw['prob_prodromal_Heinzel'],bins,labels=risk_labels)
raw['Heinzel2019_cat'] = heinzel_interval.map(interval_to_label)

In [None]:
# Analysis cohort: participants with a digital probability, still labelled 'prod' at the
# time of digital data collection, and with DaT, SAA and both Heinzel probabilities available.
nona_all = raw.dropna(subset=raw.filter(regex='prob_dig').columns,how='all')
nona_all = nona_all[nona_all['diagnosis_update_digital'].isin(['prod'])]
nona_all = nona_all.dropna(subset=['positiveDaT','SAA','prob_prodromal_Heinzel','prob_prodromal_Heinzel_noDaT'],how='any')
# Later cells add and overwrite columns on nona_all; take an explicit copy so those
# assignments do not act on a slice of `raw` (SettingWithCopyWarning).
nona_all = nona_all.copy()

In [None]:
# For the carriers of each marker, print mean / sum / count / std of key variables by sex.
marker_groups = ['SAA','positiveDaT','LRRK2_yes','GBA_yes','HYPOSMIA']
summary_cols = ['LRRK2_yes','GBA_yes','positiveDaT',
                'SAA','HYPOSMIA','UPDRS>6','male','age_digital']
summary_stats = ['mean','sum','count','std']
for group in marker_groups:
    test = nona_all[nona_all[group]==1]
    print(group)
    by_sex = test.groupby(['male'])[summary_cols].agg(summary_stats)
    print(by_sex.T)

In [None]:
# Standard deviation of each marker in the analysis cohort.
# ('positiveDaT' was listed twice, which produced a duplicated row in the output.)
var = ['LRRK2_yes','GBA_yes','rbd_psgproven','HYPOSMIA','rbd_test',
                                  'constipation_','DepressionAnxiety_','ExcessiveDaytimeSleepiness_',
                                 'positiveDaT','UPDRS>6','ErectileDysfunction_','UrinaryDysfunction_',
                                 'OrthostaticHypotension_','DiabetesII','cognitive_impairment','SAA','male','age_digital']
pd.DataFrame(nona_all[var].std())

In [None]:
# Mean, standard deviation and N of each risk probability among carriers of each marker.
risks = ['prodromal_Heinzel','prodromal_Heinzel_noDaT','digital']
marker_groups = ['LRRK2_yes','GBA_yes','rbd_psgproven','HYPOSMIA','positiveDaT','SAA']
table_index = pd.MultiIndex.from_product([risks,marker_groups],names=['risk','group'])
table = pd.DataFrame(columns=['mean','std','N'],index=table_index)
for risk in risks:
    for group in marker_groups:
        carrier_probs = nona_all.loc[nona_all[group]==1,f'prob_{risk}']
        table.loc[(risk,group),'mean'] = carrier_probs.mean()
        table.loc[(risk,group),'std'] = carrier_probs.std()
        # N counts only carriers with a non-missing probability
        table.loc[(risk,group),'N'] = carrier_probs.dropna().shape[0]

In [None]:
# Show the mean / std / N summary per risk score and marker group.
table

In [None]:
# time between DaTscan / SAA measurement and accelerometer (digital) data collection
lag_datscan = nona_all['age_digital']-nona_all['age_DaTscan']
lag_saa = nona_all['age_digital']-nona_all['age_SAA']
print(lag_datscan.describe())
print(lag_saa.describe())
ax = sns.histplot(lag_datscan,label='DaTscan')
ax = sns.histplot(lag_saa,label='SAA',color='orange')
ax.set_xlabel('years to digital data collection end')
plt.legend()

# Difference in risk between prodromal risk carriers
- biological stages and definitions (NSD, SynNeurGe)
- prodromal and risk markers

In [None]:
# Welch t-test of each model's probability between carriers (<marker>_yes == 1) and
# non-carriers (<marker>_no == 1) of every marker, restricted to prodromal participants.
var = ['male','pesticide_exposure','caffeine','never_smoke','previous_smoke',
                                  'current_smoke','rbd_psgproven','rbd_test','HYPOSMIA',
                                  'constipation_','DepressionAnxiety_','ExcessiveDaytimeSleepiness_',
                                 'positiveDaT','UPDRS>6','ErectileDysfunction_','UrinaryDysfunction_',
                                 'OrthostaticHypotension_','1st_degree_family_history_','DiabetesII','cognitive_impairment','GBA','LRRK2','PRS_highQ','PRS_lowQ']
#names = ['prodromal_Heinzel','digital_tsfresh','clinical_updrs','bio_last','dat_last','bio_bl_all_saa_positive','bio_bl_saa_mean','dat_first']
names=['digital']

# columns returned by pg.ttest, in order
ttest_cols = ['T', 'dof', 'alternative', 'p-val', 'CI95%', 'cohen-d', 'BF10','power']
boxplots = pd.DataFrame(columns=ttest_cols+['N_cases','N_controls'],
                        index=pd.MultiIndex.from_product([names,var],names=['model','marker']))
for v in var:
    # harmonise the marker column itself to 0/1 (used directly by later cells)
    nona_all[v] = nona_all[v].replace([True,False],[1,0])
    nona_all[v] = nona_all[v].replace(['yes','no'],[1,0])
    nona_all[v] = nona_all[v].replace(['low risk','high risk'],[0,1])
    for m in names:
        prob = f'prob_{m}'
        try:
            prodromal = nona_all[nona_all[f'diagnosis_update_{m}']=='prod']
            if v == 'Heinzel2019_cat':
                # categorical risk is compared on the recoded column itself
                nona = prodromal.dropna(subset=[f'{v}',prob])
                is_case = nona[f'{v}']==1
                is_control = nona[f'{v}']==0
            else:
                nona = prodromal.dropna(subset=[f'{v}_yes',f'{v}_no',prob])
                if v == 'ErectileDysfunction_':
                    # this marker is only compared within men
                    nona = nona[nona['male']==1]
                is_case = nona[f'{v}_yes']==1
                is_control = nona[f'{v}_no']==1
            boxplots.loc[(m,v),ttest_cols] = pg.ttest(nona.loc[is_case,prob],nona.loc[is_control,prob],correction=True).values
            boxplots.loc[(m,v),'N_cases'] = nona.loc[is_case].shape[0]
            boxplots.loc[(m,v),'N_controls'] = nona.loc[is_control].shape[0]
        except Exception as err:
            # Best effort: skip markers that cannot be tested, but report the actual reason --
            # a missing column would otherwise be indistinguishable from a small group.
            print('too few people',v,f'({type(err).__name__}: {err})')

boxplots = boxplots.reset_index()

In [None]:
# keep only markers with at least 10 cases and at least 10 controls
enough_cases = boxplots['N_cases']>9
enough_controls = boxplots['N_controls']>9
boxplots = boxplots[enough_cases & enough_controls]

In [None]:
# FDR correction of the t-test p-values; only the adjusted p-values are stored.
rejected, fdr_pvals = sm_stats.multitest.fdrcorrection(boxplots['p-val'], alpha=0.05, method='indep', is_sorted=False)
boxplots['FDR-corrected'] = fdr_pvals

In [None]:
# Markers whose FDR-corrected p-value is below 0.05.
boxplots[boxplots['FDR-corrected']<0.05]

In [None]:
# Bonferroni-significant markers: alpha divided by the number of tests kept in `boxplots`.
# The previous threshold, 0.05/len(boxplots)-10, evaluates to roughly -10 and therefore
# never selected anything.
# NOTE(review): if "-10" was meant to reduce the number of tests, adjust n_tests accordingly.
n_tests = max(len(boxplots), 1)
boxplots_sign = boxplots[boxplots['p-val']<(0.05/n_tests)]
boxplots.to_csv(f'{data_path}/prodromal/digital_marker_ttest_welch>1.csv')

In [None]:
# Boxplots of the digital risk for three binary markers, annotated with the
# FDR-corrected p-value and the group sizes from `boxplots`.
reload(plots)
var = ['HYPOSMIA','UPDRS>6','DepressionAnxiety_']
# Only the first three label pairs are used: zip() below stops at the shortest input.
labels = [['Normal\n Smell','Hyposmia'],['UPDRS \n<= 6', 'UPDRS \n> 6'],['No\nDepression','Depression'],['normal PRS','low PRS'],['never smoked\ncurrent smoker','previous smoker']]
# NOTE: boxplots_sign is not referenced again in this cell.
boxplots_sign = boxplots[boxplots['p-val']<(0.05/len(var))]
fig,axes = plt.subplots(ncols=3,nrows=1,figsize=(15,4))
plots.plot_context()
for ax,v,label in zip(axes.flatten(),var,labels):
    ax = sns.boxplot(x=v,y='prob_digital',ax=ax,data=nona_all,palette=['gray','black'])
    pvals = boxplots[boxplots['marker']==v]['FDR-corrected'].values
    print(pvals)
    formatted_pvals = ['{:.2e}'.format(num) for num in pvals]
    # no test is run by the annotator; the precomputed p-values are supplied instead
    annotator = Annotator(ax, [(0,1)], data=nona_all, x=v, y='prob_digital',perform_stat_test=False)
    annotator.configure(test=None, text_format='full', loc='outside', verbose=2,comparisons_correction=None).set_pvalues(pvalues=pvals).set_custom_annotations(formatted_pvals)
    annotator.annotate()
    # passes the control and case counts for this marker to the labelling helper
    plots.add_median_labels(ax,boxplots[boxplots['marker']==v][['N_controls','N_cases']].T.iloc[:,0],fmt="%d",remove=0)
    ax.set_xticklabels(label)
    ax.set_xlabel('')
    ax.set_ylabel('digital risk')
plt.tight_layout()
plt.savefig(f'{image_path}/boxplot_digital_clinicalcategories_welch_>1_FDRcorrected.png',bbox_inches='tight',dpi=300)
plt.savefig(f'{image_path}/boxplot_digital_clinicalcategories_welch_>1_FDRcorrected.pdf',bbox_inches='tight',dpi=300)

In [None]:
# define biological stages
# SynNeurGe label: G (LRRK2 or GBA carrier), S (SAA), N (positive DaT), each '+' or '-'.
nona_all['SynNeurGe'] = ''
nona_all['G_p'] = (nona_all[['LRRK2','GBA']].sum(axis=1)>0).astype(int)
sign = {1:'+',0:'-'}
for g_flag in (1,0):
    for s_flag in (1,0):
        for n_flag in (1,0):
            in_stage = (nona_all['G_p']==g_flag) & (nona_all['SAA']==s_flag) & (nona_all['positiveDaT']==n_flag)
            nona_all.loc[in_stage,'SynNeurGe'] = 'G'+sign[g_flag]+'S'+sign[s_flag]+'N'+sign[n_flag]

# NSD label (SAA-positive only): 1/2 = without/with any clinical sign, A/B = DaT negative/positive.
# Everyone else keeps the empty string.
nona_all['NSD'] = ''
clinical_signs = ['rbd_psgproven','rbd_test','HYPOSMIA',
                  'constipation_','DepressionAnxiety_','cognitive_impairment','UPDRS>6']
nona_all['C'] = (nona_all[clinical_signs].sum(axis=1)>0).astype(int)
for c_flag,stage_number in ((0,'1'),(1,'2')):
    for dat_flag,stage_letter in ((0,'A'),(1,'B')):
        in_stage = (nona_all['C']==c_flag) & (nona_all['SAA']==1) & (nona_all['positiveDaT']==dat_flag)
        nona_all.loc[in_stage,'NSD'] = stage_number+stage_letter

In [None]:
# Number of participants per SynNeurGe label and per NSD stage ('' = no stage assigned).
nona_all['SynNeurGe'].value_counts(),nona_all['NSD'].value_counts()

In [None]:
# Mean and std of the three risk scores per biological stage.
# Both tables are passed to display(): as bare expressions only the last one was shown
# and the SynNeurGe summary was silently discarded.
risk_cols = ['prob_digital','prob_prodromal_Heinzel','prob_prodromal_Heinzel_noDaT']
display(nona_all.groupby(['SynNeurGe'])[risk_cols].agg(['mean','std']))
display(nona_all.groupby(['NSD'])[risk_cols].agg(['mean','std']))

In [None]:
# Pairwise t-tests of each risk score between biological stages.
# Rows are (g1, g2) stage pairs; columns are (risk score, pg.ttest statistic).
order_syn = ['G+S-N-','G+S+N-','G+S-N+','G-S+N-','G-S+N+']
ttest_syneurge = pd.DataFrame(columns=pd.MultiIndex.from_product([['prob_digital','prob_prodromal_Heinzel','prob_prodromal_Heinzel_noDaT'],
                                                                  ['T', 'dof', 'alternative', 'p-val', 'CI95%','cohen-d', 'BF10', 'power']],
                                                                 names=['risk','test']),
                              index=pd.MultiIndex.from_product([order_syn,order_syn],names=['g1','g2']))
# '' is the NSD value of participants that were not assigned a stage
order_nsd = ['','1A','2A','2B']
ttest_nsd = pd.DataFrame(columns=pd.MultiIndex.from_product([['prob_digital','prob_prodromal_Heinzel','prob_prodromal_Heinzel_noDaT'],
                                                                  ['T', 'dof', 'alternative', 'p-val', 'CI95%','cohen-d', 'BF10', 'power']],
                                                                 names=['risk','test']),
                              index=pd.MultiIndex.from_product([order_nsd,order_nsd],names=['g1','g2']))

for risk in ['prob_digital','prob_prodromal_Heinzel','prob_prodromal_Heinzel_noDaT']:
    for i,g1 in enumerate(order_syn):
        for j,g2 in enumerate(order_syn):
            # j>i: test every unordered pair once
            if j>i:
                ttest_syneurge.loc[(g1,g2),(risk,slice(None))] = pg.ttest(nona_all.loc[nona_all['SynNeurGe']==g1,risk],
                                                                          nona_all.loc[nona_all['SynNeurGe']==g2,risk]).values
    for i,g1 in enumerate(order_nsd):
        for j,g2 in enumerate(order_nsd):
            if j>i:
                ttest_nsd.loc[(g1,g2),(risk,slice(None))] = pg.ttest(nona_all.loc[nona_all['NSD']==g1,risk],
                                                                          nona_all.loc[nona_all['NSD']==g2,risk]).values
# Remove the rows that were never filled (self-pairs and mirrored pairs); note that this
# also removes any pair containing a missing value in any column.
ttest_syneurge.dropna(inplace=True)
ttest_nsd.dropna(inplace=True)

In [None]:
# p-values of the pairwise NSD comparisons for every risk score.
ttest_nsd.loc[:,(slice(None),'p-val')]

In [None]:
# Composite 3x3 figure:
#   row a - digital risk by binary clinical marker (same panels as the earlier boxplot cell)
#   row b - the three risk scores by SynNeurGe label
#   row c - the three risk scores by NSD stage
fig,axes = plt.subplots(ncols=3,nrows=3,figsize=(19,12))
plots.plot_context()

reload(plots)
var = ['HYPOSMIA','UPDRS>6','DepressionAnxiety_']
# Only the first three label pairs are used: zip() below stops at the shortest input.
labels = [['Normal\n Smell','Hyposmia'],['UPDRS \n<= 6', 'UPDRS \n> 6'],['No\nDepression','Depression'],['normal PRS','low PRS'],['never smoked\ncurrent smoker','previous smoker']]
# NOTE: boxplots_sign is not referenced again in this cell.
boxplots_sign = boxplots[boxplots['p-val']<(0.05/len(var))]

# panel letter for the first row
axes[0,0].text(-0.1, 1.1, 'a', transform=axes[0,0].transAxes, 
                size=20, weight='bold')
for ax,v,label in zip(axes[0,:],var,labels):
    ax = sns.boxplot(x=v,y='prob_digital',ax=ax,data=nona_all,palette=['gray','black'])
    pvals = boxplots[boxplots['marker']==v]['FDR-corrected'].values
    print(pvals)
    formatted_pvals = ['{:.2e}'.format(num) for num in pvals]
    # no test is run by the annotator; the precomputed p-values are supplied instead
    annotator = Annotator(ax, [(0,1)], data=nona_all, x=v, y='prob_digital',perform_stat_test=False)
    annotator.configure(test=None, text_format='full', loc='outside', verbose=2,comparisons_correction=None).set_pvalues(pvalues=pvals).set_custom_annotations(formatted_pvals)
    annotator.annotate()
    plots.add_median_labels(ax,boxplots[boxplots['marker']==v][['N_controls','N_cases']].T.iloc[:,0],fmt="%d",remove=0)
    ax.set_xticklabels(label)
    ax.set_xlabel('')
    ax.set_ylabel('digital risk')

# panel letters for the second and third row
axes[1,0].text(-0.1, 1.1, 'b', transform=axes[1,0].transAxes, 
                size=20, weight='bold')
axes[2,0].text(-0.1, 1.1, 'c', transform=axes[2,0].transAxes, 
                size=20, weight='bold')
# The six remaining axes are visited row by row: i < 3 is the SynNeurGe row, i >= 3 the NSD row.
for (i,ax),y,name in zip(enumerate(axes[1:,:].flatten()),['prob_digital','prob_prodromal_Heinzel','prob_prodromal_Heinzel_noDaT',
                                                   'prob_digital','prob_prodromal_Heinzel','prob_prodromal_Heinzel_noDaT'],
                         ['digital risk','MDS','MDS restricted','digital risk','MDS','MDS restricted']):
    if i<3:
        bp = sns.boxplot(y=y,x='SynNeurGe',data=nona_all,order=order_syn,ax=ax,palette='pastel')
        plots.add_median_labels(ax,nona_all['SynNeurGe'].value_counts()[order_syn],fmt="%d",remove=0)
        ax.set_ylabel(name)
    else:
        bp = sns.boxplot(y=y,x='NSD',data=nona_all,order=order_nsd,ax=ax,palette='colorblind')
        plots.add_median_labels(ax,nona_all['NSD'].value_counts()[order_nsd],fmt="%d",remove=0)
        ax.set_ylabel(name)        
plt.tight_layout()

plt.savefig(f'{image_path}/boxplot_drivingfactors.png',dpi=300,bbox_inches='tight')
plt.savefig(f'{image_path}/boxplot_drivingfactors.pdf',dpi=300,bbox_inches='tight')

In [None]:
# Scatter of the restricted (noDaT) Heinzel probability against the digital risk,
# coloured by SAA status and with the marker style set by DaT status.
plots.plot_context()
nona = nona_all.dropna(subset=['positiveDaT','SAA','prob_prodromal_Heinzel','prob_prodromal_Heinzel_noDaT'],how='any')
# Hard-coded digital-risk threshold.
# NOTE(review): presumably one of the values computed in the threshold cell further down -- confirm which.
dig_thresh = 0.5374367134240129
ax=sns.scatterplot(x='prob_prodromal_Heinzel_noDaT',y='prob_digital',data=nona,hue='SAA',style='positiveDaT',palette='colorblind',alpha=0.8)
# vertical lines at the 0.8 and 0.05 cut-offs, horizontal line at the digital threshold
ax.axvline(0.8,0,1,ls='--',color='k')
ax.axvline(0.05,0,1,ls='--',color='k')
ax.axhline(dig_thresh,0,1,ls='--',color='gray')
plt.fill_between(x=[plt.xlim()[0], 0.05], y1=plt.ylim()[0], y2=dig_thresh, alpha=0.2, color='green')  # lower-left region: x below 0.05 and y below the digital threshold
plt.fill_between(x=[0.8, plt.xlim()[1]], y1=dig_thresh, y2=plt.ylim()[1], alpha=0.2, color='red')  # upper-right region: x above 0.8 and y above the digital threshold
# NOTE(review): the x-axis shows the noDaT (restricted) variant but is labelled 'Heinzel 2019' -- confirm.
plt.xlabel('Heinzel 2019')
plt.ylabel('digital risk')
handles, labels = ax.get_legend_handles_labels()

# overwrite two legend entries by position
# NOTE(review): assumes entries 0 and 3 are the SAA and positiveDaT legend titles -- confirm.
labels[0] = "SAA+"
labels[3] = "DaT+"

ax.legend(handles=handles, labels=labels,bbox_to_anchor=(1,1))
#plt.savefig(f'{image_path}/scatterplot_HeinzelvsDigital_>1_SAAhue_DaTstyle.png',dpi=300,bbox_inches='tight')
#plt.savefig(f'{image_path}/scatterplot_HeinzelvsDigital_>1_SAAhue_DaTstyle.pdf',dpi=300,bbox_inches='tight')

## Correlation between risk scores

In [None]:
# Pearson correlation between every pair of risk measures, computed on the participants
# that have all five measures available.
names = ['DaTscan minimum\nputamen SBR','CSF alpha-synuclein\nSAA Fmax mean','MDS','MDS\nrestricted','digital']
xs = ['putamen_min','SAA_mean','prob_prodromal_Heinzel','prob_prodromal_Heinzel_noDaT','prob_digital']
corr_index = pd.MultiIndex.from_product([names,names],names=['risk1','risk2'])
corrs = pd.DataFrame(columns=['n','r','CI95%','p-val','BF10','power'],index=corr_index)
nona = nona_all.dropna(subset=xs,how='any')
labelled = list(zip(xs,names))
for i,(x1,name1) in enumerate(labelled):
    # pair each measure only with the measures that follow it in the list
    for x2,name2 in labelled[i+1:]:
        corrs.loc[(name1,name2),:] = pg.corr(nona[x1],nona[x2],kind='pearson').values
# rows for pairs that were not computed stay empty and are removed
corrs = corrs.dropna()

In [None]:
# FDR correction of the pairwise-correlation p-values; only the adjusted p-values are stored.
rejected, fdr_pvals = sm_stats.multitest.fdrcorrection(corrs['p-val'], alpha=0.05, method='indep', is_sorted=False)
corrs['FDR corrected p-val'] = fdr_pvals

In [None]:
# Show the pairwise correlation table including the FDR-corrected p-values.
corrs

In [None]:
def plot_pairplot_own():
    """Draw and save a lower-triangular pair plot of the five risk measures.

    Below the diagonal: regression scatter of each pair of measures, annotated with the
    Pearson r and FDR-corrected p-value from ``corrs``. On the diagonal: kernel density
    of the measure split by ``diagnosis_update_digital``. Above the diagonal: empty.

    Reads the notebook globals ``raw``, ``nona``, ``corrs`` and ``image_path``; the
    scatter panels use ``nona`` as left by the correlation cell.

    Returns:
        The matplotlib figure holding the grid. It is also written to
        ``{image_path}/pairplot.png`` and ``{image_path}/pairplot.pdf``.
    """
    import matplotlib.patches as mpatches
    plot_context()
    # order_ holds the original column names (used with `nona`), order the display names
    # (used with the renamed `data` and as keys into `corrs`); both are reversed together.
    order_ = ['putamen_min','SAA_mean','prob_prodromal_Heinzel','prob_prodromal_Heinzel_noDaT','prob_digital'][::-1]#'bio_bl_saa_pos','bio_bl_saa_mean','bio_bl_fmax','prodromal_Heinzel']
    #order_ = ['digital','SAA','DaTscan','UPSIT','prodromal_Heinzel_noDaT','prodromal_Heinzel']
    order = ['DaTscan minimum\nputamen SBR','CSF alpha-synuclein\nSAA Fmax mean','MDS','MDS\nrestricted','digital'][::-1]
    #order = ['digital\ntimeseries','SAA','DaTscan','UPSIT','Heinzel2019_restricted','Heinzel2019']
    data = raw.rename(columns={'prob_digital_average':'digital\naverage','prob_digital':'digital','prob_clinical_updrs':'clinical',
                                                 'prob_clinical_noupdrs':'clinical\nwithout UPDRS','prob_prodromal_Berg':'Berg2015',
                                                 'prob_prodromal_Heinzel':'MDS','prob_prodromal_Heinzel_noDaT':'MDS\nrestricted','prob_bio_last':'CSF',
                              'prob_bio_bl':'CSF_BL_noSAA','prob_bio_bl_all_saa_positive':'CSF_BL','prob_bio_bl_saa_pos':'SAA_pos','prob_bio_bl_saa_mean':'SAA_mean','prob_bio_bl_fmax':'SAA_fmax','prob_dat_last':'DaTscan',
                              'prob_dat_first':'DaTscan_BL','prob_DaTscan':'DaTscan','prob_SAA':'SAA','prob_UPSIT':'UPSIT',
                              'putamen_min':'DaTscan minimum\nputamen SBR','SAA_mean':'CSF alpha-synuclein\nSAA Fmax mean'})
    data = data[data['diagnosis_update_prodromal_Heinzel'].isin(['pd','prod','hc'])]
    #data.loc[data['phenoconverted']==1,'diagnosis'] = 'converter'
    # complete cases only (the earlier how='all' pass was subsumed by this one)
    data = data.dropna(subset=order,how='any')
    # A single figure is created here; the former extra plt.figure(figsize=(20,20)) only
    # left an empty figure behind.
    fig,axes = plt.subplots(ncols=len(order_),nrows=len(order_),figsize=(15,12))
    for (i,o_),o in zip(enumerate(order_),order):
        for (j,o_2),o2 in zip(enumerate(order_),order):
            if i>j:
                # scatterplot
                sns.regplot(x=o_2,y=o_,data=nona,ax=axes[i,j],color='green',scatter_kws={'s':2})
                r = corrs.loc[(o,o2),'r']
                p = corrs.loc[(o,o2),'FDR corrected p-val']
                axes[i,j].text(0.05,0.05,f'r: {r:.2f}\np: {p:.1e}',bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5), transform=axes[i,j].transAxes,fontsize=15)
            elif j==i:
                # kdeplot
                sns.kdeplot(data=data,x=o,ax=axes[i,j],hue='diagnosis_update_digital')
                axes[i,j].legend([],[],frameon=False)
            elif j>i:
                axes[i,j].set_axis_off()
            # axis labels only on the left column and the bottom row
            if j==0:
                axes[i,j].set_ylabel(o)
            else:
                axes[i,j].set_ylabel('')
            if i==(len(order)-1):
                axes[i,j].set_xlabel(o2)
            else:
                axes[i,j].set_xlabel('')

    # NOTE(review): the legend colours are hard-coded and assume the kde hue order is
    # PD, HC, Prodromal in the default palette -- confirm against the rendered figure.
    handles = [
               mpatches.Patch(color='blue', label='PD'),
              mpatches.Patch(color='orange', label='HC'),
              mpatches.Patch(color='green', label='Prodromal')]

    labels = ['PD','HC','Prodromal']
    plt.figlegend(handles, labels, loc='upper right')
    plt.tight_layout()

    plt.savefig(f'{image_path}/pairplot.png',dpi=300,bbox_inches='tight')
    plt.savefig(f'{image_path}/pairplot.pdf',dpi=300,bbox_inches='tight')
    return fig
plot_pairplot_own()

In [None]:
# Regress the digital risk on each biological / MDS measure: one panel per measure with
# the Pearson r and (uncorrected) p-value; the correlation table is saved as CSV.
from matplotlib.ticker import MaxNLocator
nona = nona_all.dropna(subset=['putamen_min','SAA_mean','prob_prodromal_Heinzel','prob_prodromal_Heinzel_noDaT'],how='any')
names = ['DaTscan minimum putamen SBR','SAA Fmax mean','Heinzel 2019','Heinzel 2019 restricted']
corr = pd.DataFrame(columns=['n','r','CI95%','p-val','BF10','power'],index=names)
fig, axes = plt.subplots(ncols=2,nrows=2,figsize=(15,8))
plots.plot_context()
for x,name,ax in zip(['putamen_min','SAA_mean','prob_prodromal_Heinzel','prob_prodromal_Heinzel_noDaT'],names,axes.flatten()):
    corr.loc[name,:] = pg.corr(nona[x],nona['prob_digital'],kind='pearson').values
    ax=sns.regplot(x=x,y='prob_digital',data=nona,ax=ax,color='k')
    r = corr.loc[name,'r']
    p = corr.loc[name,'p-val']
    ax.text(0.05,0.05,f'r: {r:.2f}, p-val: {p:.1e}',bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5), transform=ax.transAxes)
    ax.set_xlabel(name)
    ax.set_ylabel('digital risk')
    ax.xaxis.set_major_locator(MaxNLocator(integer=True, nbins=5))  # Decrease number of x-ticks

plt.tight_layout()
plt.savefig(f'{image_path}/regplot_Digital_>0.png',dpi=300,bbox_inches='tight')
plt.savefig(f'{image_path}/regplot_Digital_>0.pdf',dpi=300,bbox_inches='tight')

# Write next to the other analysis outputs: the previous hard-coded /scratch path is the
# location that the configuration cell overrides with `data_path`.
corr.to_csv(f'{data_path}/prodromal/correlation_dig_markers_>1.csv')

# Correlation with clinical tests

In [None]:
# correlation digital risk with clinical scores
# Read from the configured data root (`path`), as the SAA table is; the previous
# hard-coded /scratch path is the location overridden in the configuration cell.
behavior = pd.read_csv(f'{path}/phenotypes2021/behavior_clean.csv')
# last available record per participant
behavior = behavior.groupby('participant').last()
# where the OFF-state UPDRS III score is missing, use the no-medication score
behavior['updrs_iii_OFF'] = behavior['updrs_iii_OFF'].fillna(behavior['updrs_iii_NoMED'])
risk_clinic = pd.merge(behavior,nona_all,on='participant')
clinic = ['updrs_iii_OFF','updrs_ii','updrs_i','se_adl','moca','lns','benton','scopa_aut','epworth','quip','gds']
corr = pd.DataFrame(index=clinic,columns=['n', 'r', 'CI95%', 'p-val', 'BF10', 'power'])
for c in clinic:
    corr.loc[c,:] = pg.corr(risk_clinic['prob_digital'],risk_clinic[c]).values

In [None]:
# Show the correlations between the digital risk and the clinical scores.
corr

# Chaining of risk tests
Use DaTscan as the gold standard: chain the digital, hyposmia and SAA tests and see how closely the chained result matches DaT.

In [None]:
# Derive decision thresholds for the digital probability from the PD-vs-HC separation,
# using three criteria: maximal F1, maximal precision, and Youden's J (tpr - fpr).
# After the loop, `dig_thresh` and raw['risk_digital'] reflect the last criterion ('youden').
thrsh = [100]
names = ['digital']
truth = 'pd'
dig_threshs = pd.DataFrame(index=['F1','precision','youden'],columns=['value'])
for thresh_name in dig_threshs.index:
    for name,thr in zip(names,thrsh):
        prob = f'prob_{name}'
        # alternative cut-off (PD mean minus one std); only drawn when thr == 0
        pd_mean = raw.loc[raw['diagnosis']=='pd',prob].mean() - raw.loc[raw['diagnosis']=='pd',prob].std()
        if thr==0:
            pass
        else:
            nona = raw.dropna(subset=[prob])
            # explicit copy: a column is added below
            nona = nona[nona['diagnosis'].isin(['hc','pd'])].copy()
            nona['pd'] = (nona['diagnosis']=='pd').astype(int)
            nona = nona.dropna(subset=[truth])
            precision, recall, thresholds = metrics.precision_recall_curve(nona[truth], nona[prob])
            # precision/recall have one more element than thresholds (a final point with no
            # threshold); drop it so argmax can never index past the end of `thresholds`.
            precision, recall = precision[:-1], recall[:-1]
            if thresh_name == 'F1':
                with np.errstate(divide='ignore',invalid='ignore'):
                    f1 = (2*precision*recall)/(precision+recall)
                # 0/0 (precision == recall == 0) yields NaN, which argmax would select; score it as 0
                dig_thresh = thresholds[np.argmax(np.nan_to_num(f1,nan=0.0))]
            elif thresh_name=='precision':
                dig_thresh = thresholds[np.argmax(precision)]
            #dig_thresh = thresholds[np.argmax(precision + recall)]
            fpr, tpr, thresholds = metrics.roc_curve(nona[truth], nona[prob])
            if thresh_name == 'youden':
                dig_thresh = thresholds[np.argmax(tpr - fpr)]
            print(dig_thresh,thresh_name)
            dig_threshs.loc[thresh_name,'value'] = dig_thresh
            raw[f'risk_{name}'] = (raw[prob] > dig_thresh).astype(int)
            # participants without a probability get no risk label
            raw.loc[raw[prob].isna(),f'risk_{name}'] = np.nan
        ax = sns.histplot(data=raw[raw['diagnosis_update_prodromal_Heinzel'].isin(['pd','hc','prod','converted'])],hue='diagnosis_update_prodromal_Heinzel',
                     x=prob,kde=True,legend=False,hue_order=['pd','hc','prod'])
        if thr==0:
            ax.axvline(pd_mean,color='red')
        else:
            ax.axvline(dig_thresh,color='red')
    plt.tight_layout()
dig_threshs

In [None]:
# Confusion matrix and precision / recall / F-score of the thresholded digital risk
# against SAA and DaTscan positivity, for each of the thresholds in `dig_threshs`.
CMs = pd.DataFrame(columns=pd.MultiIndex.from_product([['SAA','DaTscan'],['TN','FP','FN','TP','precision','recall','fscore','support']],names=['true outcome','statistic']),
                       index=dig_threshs.index)

for truth,truth_var in zip(['SAA','DaTscan'],['SAA','positiveDaT']):
    for name in CMs.index:
        dig_thresh = dig_threshs.loc[name,'value']
        nona_all['risk_digital'] = (nona_all['prob_digital']>dig_thresh).astype(int)
        nona_all['digital_hyposmia'] = nona_all[['risk_digital','HYPOSMIA']].max(axis=1)
        nona = nona_all.dropna(subset=['prob_digital'],how='any')
        print(nona.shape[0])
        # The evaluated prediction is 'risk_digital'. This previously read nona[test], but
        # `test` is not defined in this cell and held whatever another cell had left in it.
        CM = metrics.confusion_matrix(nona[truth_var],nona['risk_digital'])
        # number of false negatives (truth positive, digital risk negative)
        print(nona[np.logical_and(nona[truth_var]==1,nona['risk_digital']==0)].shape)
        CMs.loc[name,(truth,['TN','FP','FN','TP'])] = CM.ravel()
        CMs.loc[name,(truth,['precision','recall','fscore','support'])] = metrics.precision_recall_fscore_support(nona[truth_var],nona['risk_digital'],average='binary')

In [None]:
# Show the per-threshold confusion-matrix statistics.
CMs

In [None]:
# Compare every risk test against three reference outcomes (SAA+, DaT+, either).
# `dig_thresh` is whatever value the preceding cells left in it.
#dig_thresh = 0.5374367134240129
nona_all['risk_digital'] = (nona_all['prob_digital']>dig_thresh).astype(int)
# combined tests: positive when either component is positive
nona_all['digital_hyposmia'] = nona_all[['risk_digital','HYPOSMIA']].max(axis=1)
nona_all['SAA_DaT'] = nona_all[['positiveDaT','SAA']].max(axis=1)
# evaluate all tests on the same complete-case subset
nona = nona_all.dropna(subset=['prob_prodromal_Heinzel','prob_prodromal_Heinzel_noDaT','prob_digital','SAA_positive','positiveDaT','HYPOSMIA'],how='any')
print(nona.shape[0])

# rows: test being evaluated; columns: (reference outcome, statistic)
CMs = pd.DataFrame(columns=pd.MultiIndex.from_product([['SAA+','DaT+','SAA+ or DaT+'],['TN','FP','FN','TP','precision','recall','fscore','support']],names=['true outcome','statistic']),
                   index=['MDS','MDS restricted','Hyposmia','digital','digital+hyposmia','SAA','DaTscan'])
for truth,truth_var in zip(['SAA+','DaT+','SAA+ or DaT+'],['SAA','positiveDaT','SAA_DaT']):
    for name,test in zip(CMs.index,['risk_prodromal_Heinzel','risk_prodromal_Heinzel_noDaT','HYPOSMIA','risk_digital','digital_hyposmia','SAA','positiveDaT']):
        #nona = nona_all.dropna(subset=[test,'positiveDaT'],how='any')
        CM = metrics.confusion_matrix(nona[truth_var],nona[test])
        # number of false negatives (reference positive, test negative)
        print(nona[np.logical_and(nona[truth_var]==1,nona[test]==0)].shape)
        CMs.loc[name,(truth,['TN','FP','FN','TP'])] = CM.ravel()
        CMs.loc[name,(truth,['precision','recall','fscore','support'])] = metrics.precision_recall_fscore_support(nona[truth_var],nona[test],average='binary')

In [None]:
# Show only the statistic columns that have no missing values.
CMs.dropna(axis='columns')

In [None]:
# Print the same table as LaTeX source.
print(CMs.dropna(axis='columns').to_latex())

In [None]:
# Save the same table alongside the other prodromal analysis outputs.
CMs.dropna(axis='columns').to_csv(f'{data_path}/prodromal/confusion_matrix_>1.csv')

In [None]:
#chaining
#dig_thresh = 0.5374367134240129
# Sequential ("chained") screening: each test is scored against SAA+ or DaT+ on the
# participants that were positive on every previous test in the chain.
# .copy() gives an independent frame, so adding SAA_DaT below does not write onto a
# filtered view of nona_all (SettingWithCopyWarning).
nona = nona_all.dropna(subset=['prob_digital','HYPOSMIA','SAA','positiveDaT'],how='any').copy()
print(nona.shape[0])
nona['SAA_DaT'] = nona[['positiveDaT','SAA']].max(axis=1)
#nona_all['risk_digital'] = (nona_all['prob_digital']>dig_thresh).astype(int)
count_cols = ['TN','FP','FN','TP']
score_cols = ['precision','recall','fscore','support']
CMschain = pd.DataFrame(columns=count_cols+score_cols,index=['MDS restricted','Hyposmia'])
for name,test in zip(CMschain.index,['risk_prodromal_Heinzel_noDaT','HYPOSMIA']):
    print(nona.shape,name)
    # labels=[0,1] keeps the matrix 2x2 even if a chain step leaves only one class,
    # so ravel() always yields TN, FP, FN, TP.
    CM = metrics.confusion_matrix(nona['SAA_DaT'],nona[test],labels=[0,1])
    # Address the columns by label: integer slices through .loc on string-labelled
    # columns (.loc[name,:4]) are positional and are rejected by pandas >= 2.0.
    CMschain.loc[name,count_cols] = CM.ravel()
    CMschain.loc[name,score_cols] = metrics.precision_recall_fscore_support(nona['SAA_DaT'],nona[test],average='binary')
    # Only participants positive on this test continue to the next step.
    # NOTE(review): this rebinds `nona`, so cells below that read `nona` see the chain-filtered cohort — confirm that is intended.
    nona = nona[nona[test]==1]

In [None]:
# Display the chained-screening confusion matrix and scores.
CMschain

In [None]:
# Display CMschain. NOTE(review): the preceding cell shows the same table; one of the two can probably be removed.
CMschain

In [None]:
# Row-wise sum of the first four columns (TN, FP, FN, TP), i.e. how many participants were scored at each chain step.
CMschain.iloc[:,:4].sum(axis=1)

# Proportion identified as PD

In [None]:
# Index raw by participant so the cells below can select rows by participant ID.
# Guarded so the cell is idempotent: once 'participant' is the index it is no longer
# a column, and an unconditional set_index would raise KeyError on re-run.
if raw.index.name != 'participant':
    raw = raw.set_index('participant')

In [None]:
# Binary digital risk: 1.0 above dig_thresh, 0.0 otherwise, NaN where no digital
# probability exists. Built in one step as a float column; the previous form created
# an int column and then wrote NaN into it, which relies on implicit dtype upcasting
# that recent pandas versions deprecate.
digital_prob = raw['prob_digital']
raw['risk_digital'] = (digital_prob > dig_thresh).astype(float).where(digital_prob.notna())

In [None]:
# For every test, tabulate per diagnosis group the number of participants with a
# non-missing test result ('N') and the mean of the test column
# ('proportion identified as PD').
groups = ['hc','pd','prod','converted','future_converters']
table_prop = pd.DataFrame(index=['prodromal_Heinzel','prodromal_Heinzel_noDaT','digital','SAA','DaTscan'],columns=pd.MultiIndex.from_product([groups,['N','proportion identified as PD']],names=['group','statistic']))
# Participants labelled 'converted' in the prodromal_Heinzel diagnosis update (raw is indexed by participant).
converters = raw.loc[raw['diagnosis_update_prodromal_Heinzel']=='converted'].index
for name,test in zip(['prodromal_Heinzel','prodromal_Heinzel_noDaT','digital','SAA','DaTscan'],['risk_prodromal_Heinzel','risk_prodromal_Heinzel_noDaT','risk_digital','SAA','positiveDaT']):
    status_col = f'diagnosis_update_{name}'
    # .copy(): the relabelling below must modify this subset only, not a view of raw.
    df = raw[raw[status_col].isin(['converted','hc','pd','prod'])].copy()
    have_converted = df.loc[df[status_col]=='converted'].index
    # Converters (per prodromal_Heinzel) that are in this subset but not labelled 'converted' in this test's update.
    will_convert = np.setdiff1d(converters,have_converted)
    will_convert = np.intersect1d(will_convert,df.index)
    df.loc[will_convert,status_col] = 'future_converters'
    # Groups absent from the data get count 0 / mean 0 through fill_value.
    temp = df.groupby(status_col)[test].agg(['count','mean']).reindex(groups, fill_value=0)

    # Row-major flattening gives (group, statistic) order, matching the column MultiIndex.
    # Unlike stack(), it keeps NaN means (group present but no non-missing test values),
    # so the row always has the expected length.
    table_prop.loc[name,:] = temp.to_numpy().ravel()


In [None]:
# Display N and proportion identified as PD per test and diagnosis group.
table_prop

## Who are the false negatives?
- Do they have elevated UPDRS scores?

In [None]:
# Recode yes/no strings and booleans as 0/1 so that indicator columns can be counted and averaged.
raw = raw.replace(['no','yes'],[0,1])
raw = raw.replace([False,True],[0,1])
# Indicator columns summarised per diagnosis group.
var = ['male','pesticide_exposure','caffeine','never_smoke','previous_smoke',
                                  'current_smoke','rbd_psgproven','rbd_test','HYPOSMIA',
                                  'constipation_','DepressionAnxiety_','ExcessiveDaytimeSleepiness_',
                                 'UPDRS>6','ErectileDysfunction_','UrinaryDysfunction_',
                                 'OrthostaticHypotension_','1st_degree_family_history','DiabetesII','cognitive_impairment','GBA','LRRK2','PRS_highQ','PRS_lowQ','current_age','SAA','positiveDaT']
# Pairs of values per indicator, listed in the same order as var_names below.
# NOTE(review): presumably [LR+, LR-] likelihood ratios — confirm against the source table.
LRs = [[1.2,0.8],[1.5,1],[0.88,1.35],[1.2,1],[0.91,1],[0.51,1],
                          [130,0.65],[2.8,0.89],[6.4,0.4],[2.5,0.82],[1.6,0.88],[2.7,0.86],[43.3,0.66],[9.6,0.55],[3.4,0.87],
                           [2.0,0.9],[3.2,0.8],[2.5,1],
                                        [1.57,1],[0.45,1],
                                        #[1.3,0.91],[1.8,1],[0.88,1],
       [1.8,0.88],[1.5,0.97]]
var_names = ['male','pesticide_exposure','caffeine','never_smoke','previous_smoke',
                                  'current_smoke','rbd_psgproven','rbd_test','HYPOSMIA',
                                  'constipation_','DepressionAnxiety_','ExcessiveDaytimeSleepiness_',
                                 'positiveDaT',
                                  'UPDRS>6','ErectileDysfunction_','UrinaryDysfunction_',
                                 'OrthostaticHypotension_','1st_degree_family_history',
                                 'PRS_highQ','PRS_lowQ',
                                  #'physical_inactivity','urate_low','urate_high',
             'cognitive_impairment','DiabetesII']
# Only used by the commented-out 'LR' column at the end of this cell.
LR_dict = dict(zip(var_names, LRs))

# Restrict raw to the participants present in nona_all. An earlier cell moves
# 'participant' from a column into the index, so look the IDs up wherever they live
# instead of assuming a column (raw['participant'] raises KeyError after set_index).
# The former filter on diagnosis_update_prodromal_Heinzel was overwritten by this
# selection immediately and never took effect, so it is not applied here either.
if 'participant' in raw.columns:
    participant_ids = raw['participant']
elif raw.index.name == 'participant':
    participant_ids = raw.index
else:
    raise KeyError("raw has neither a 'participant' column nor a 'participant' index")
subset = raw[participant_ids.isin(nona_all.participant)]
# Rows: (indicator, statistic); columns: diagnosis group.
table = subset.groupby('diagnosis_update_prodromal_Heinzel')[var].agg(['count','mean']).T
#table['LR'] = table.index.map(LR_dict)

In [None]:
# Move the statistic level into the columns, giving one row per indicator and
# (diagnosis group, statistic) column pairs, then give the statistics readable names.
df = table.unstack()
df.columns = pd.MultiIndex.from_tuples([
    (group, stat.replace('count', 'data available for N').replace('mean', 'proportion with indicator present'))
    for group, stat in df.columns
])
# Age is not an indicator variable, so its row is removed from the table.
df = df.drop(index='current_age')

In [None]:
# Display the per-group indicator availability / proportion table.
df

In [None]:
# Save the cohort description table to CSV.
df.to_csv(f'{path}/analyses/prodromal/study_cohort_prod.csv')

In [None]:
# Re-derive the binary digital risk on the full table (nona itself is not recomputed here).
nona_all['risk_digital'] = nona_all['prob_digital'].gt(dig_thresh).astype(int)

# Participants the digital test calls negative although the MDS criteria call them positive.
digital_negative = nona['risk_digital'].eq(0)
mds_positive = nona['risk_prodromal_Heinzel'].eq(1)
FN = nona[digital_negative & mds_positive]

In [None]:
# IDs of SAA-positive participants picked up by the digital test and by hyposmia, respectively.
saa_positive = nona['SAA'] == 1
TP = nona.loc[(nona['risk_digital'] == 1) & saa_positive, 'participant']
TP_hyp = nona.loc[(nona['HYPOSMIA'] == 1) & saa_positive, 'participant']

In [None]:
# Number of distinct participants in either TP set, followed by the size of each set.
len(np.union1d(TP,TP_hyp)),TP.shape,TP_hyp.shape

In [None]:
# Count of participants positive on the full MDS criteria, shape of the current FN frame, and a fixed ratio.
# NOTE(review): 1-2/17 is hard-coded — presumably counts copied from an earlier run; confirm or compute it from the data.
(nona['risk_prodromal_Heinzel']==1).sum(),FN.shape,1-2/17

In [None]:
# Columns tested against the digital false-negative flag and summarised for the
# agreed high-risk group in the cells below.
var = [
    'male', 'pesticide_exposure', 'caffeine',
    'never_smoke', 'previous_smoke', 'current_smoke',
    'rbd_psgproven', 'rbd_test', 'HYPOSMIA',
    'constipation_', 'DepressionAnxiety_', 'ExcessiveDaytimeSleepiness_',
    'positiveDaT', 'UPDRS>6',
    'ErectileDysfunction_', 'UrinaryDysfunction_', 'OrthostaticHypotension_',
    '1st_degree_family_history_', 'DiabetesII', 'cognitive_impairment',
    'GBA', 'LRRK2', 'PRS_highQ', 'PRS_lowQ',
    'SAA', 'Heinzel2019_cat',
]

In [None]:
# Chi-square test of each variable in var against the digital false-negative flag
# for the chosen reference outcome.
test = 'positiveDaT'
outcome_positive = nona[test] == 1
digital_negative = nona['risk_digital'] == 0
FN = nona[digital_negative & outcome_positive]
TP = nona[(nona['risk_digital'] == 1) & outcome_positive]
# True for false negatives; every other participant (not only true positives) is False.
nona[f'{test}_digital'] = digital_negative & outcome_positive
chi2 = pd.DataFrame(index=var, columns=['chi2', 'p-value', 'dof', 'expected_frequency'])
for v in var:
    crosstab = pd.crosstab(nona[v], nona[f'{test}_digital'])
    chi2.loc[v, :] = stats.chi2_contingency(crosstab)

In [None]:
# Display the chi-square results per variable.
chi2

In [None]:
# Load the item-level MDS-UPDRS part III table; 'UR' entries are read as missing.
updrs = pd.read_csv(f'{path}/phenotypes2021/MDS_UPDRS_Part_III.csv',na_values=['UR'],parse_dates=['INFODT'])
updrs_items = [
    'NP3SPCH', 'NP3FACXP', 'NP3RIGN', 'NP3RIGRU', 'NP3RIGLU',
    'NP3RIGRL', 'NP3RIGLL', 'NP3FTAPR', 'NP3FTAPL', 'NP3HMOVR',
    'NP3HMOVL', 'NP3PRSPR', 'NP3PRSPL', 'NP3TTAPR', 'NP3TTAPL',
    'NP3LGAGR', 'NP3LGAGL', 'NP3RISNG', 'NP3GAIT', 'NP3FRZGT',
    'NP3PSTBL', 'NP3POSTR', 'NP3BRADY', 'NP3PTRMR', 'NP3PTRML',
    'NP3KTRMR', 'NP3KTRML',
    'NP3RTARU', 'NP3RTALU', 'NP3RTARL',
    'NP3RTALL', 'NP3RTALJ', 'NP3RTCON',
]
# Total score per row: sum over all items.
updrs['updrs_iii'] = updrs[updrs_items].sum(axis=1)
# Maximum of every item and of the total over all rows of each participant.
max_updrs = updrs.groupby('PATNO')[updrs_items + ['updrs_iii']].max()
# Left join keeps every row of nona, with or without UPDRS records.
data = pd.merge(nona, max_updrs, right_on='PATNO', left_on='participant', how='left')

In [None]:
# Mann-Whitney U test per UPDRS III item (and the total): participants missed by the
# digital test although outcome-positive versus all remaining participants.
updrs_items = [
    'NP3SPCH', 'NP3FACXP', 'NP3RIGN', 'NP3RIGRU', 'NP3RIGLU',
    'NP3RIGRL', 'NP3RIGLL', 'NP3FTAPR', 'NP3FTAPL', 'NP3HMOVR',
    'NP3HMOVL', 'NP3PRSPR', 'NP3PRSPL', 'NP3TTAPR', 'NP3TTAPL',
    'NP3LGAGR', 'NP3LGAGL', 'NP3RISNG', 'NP3GAIT', 'NP3FRZGT',
    'NP3PSTBL', 'NP3POSTR', 'NP3BRADY', 'NP3PTRMR', 'NP3PTRML',
    'NP3KTRMR', 'NP3KTRML',
    'NP3RTARU', 'NP3RTALU', 'NP3RTARL',
    'NP3RTALL', 'NP3RTALJ', 'NP3RTCON',
]
mwu_columns = pd.MultiIndex.from_product(
    [['SAA','positiveDaT'],['U', 'alternative', 'p-val', 'RBC', 'CLES']],
    names=['true outcome','statistic'])
ttest = pd.DataFrame(index=updrs_items + ['updrs_iii'], columns=mwu_columns)
for test in ['SAA','positiveDaT']:
    outcome_positive = data[test] == 1
    missed = (data['risk_digital'] == 0) & outcome_positive
    FN = data[missed]
    TP = data[(data['risk_digital'] == 1) & outcome_positive]
    # Boolean flag: True = false negative, False = everyone else (true negatives included).
    data[f'{test}_digital'] = missed
    flagged = data[f'{test}_digital']
    for v in ttest.index:
        fn_values = data.loc[flagged == 1, v].dropna()
        other_values = data.loc[flagged == 0, v].dropna()
        # First (only) result row of pingouin's table fills the five statistics of this outcome.
        ttest.loc[v, (test, slice(None))] = pg.mwu(fn_values, other_values).values[0]

In [None]:
# clinical scores
behavior = pd.read_csv(f'{path}/phenotypes2021/behavior_clean.csv')
behavior = behavior.groupby('participant').max()
behavior['updrs_iii_OFF'] = behavior['updrs_iii_OFF'].fillna(behavior['updrs_iii_NoMED'])
data = pd.merge(behavior,nona_all,on='participant')
c = 'updrs_iii_OFF'
corr = pd.DataFrame(index=pd.MultiIndex.from_product([['digital','hyposmia','Heinzel2019 restricted'],['SAA','positiveDaT','SAA_DaT']],names=['prediction','true']),columns=['T', 'dof', 'alternative', 'p-val', 'CI95%', 'cohen-d', 'BF10',
       'power','mean FN','mean TP','std FN','std TP','N FN','N TP'])
for test in ['SAA','positiveDaT','SAA_DaT']:
    for risk,name in zip(['risk_digital','HYPOSMIA','risk_prodromal_Heinzel_noDaT'],['digital','hyposmia','Heinzel2019 restricted']):
        FN = data[np.logical_and(data[risk]==0, data[test]==1)]
        TP = data[np.logical_and(data[risk]==1, data[test]==1)]
        data[f'{test}_{name}'] = np.logical_and(data[risk]==0, data[test]==1).astype(int).replace([0],[np.nan])
        data.loc[TP.index,f'{test}_{name}'] = 0
        corr.loc[(name,test),['T', 'dof', 'alternative', 'p-val', 'CI95%', 'cohen-d', 'BF10','power']] = pg.ttest(data.loc[data[f'{test}_{name}']==1,c].dropna(),data.loc[data[f'{test}_{name}']==0,c],correction=True).values
        corr.loc[(name,test),'mean FN'] = data.loc[data[f'{test}_{name}']==1,c].dropna().mean()
        corr.loc[(name,test),'mean TP'] = data.loc[data[f'{test}_{name}']==0,c].dropna().mean()
        corr.loc[(name,test),'std FN'] = data.loc[data[f'{test}_{name}']==1,c].dropna().std()
        corr.loc[(name,test),'std TP'] = data.loc[data[f'{test}_{name}']==0,c].dropna().std()
        corr.loc[(name,test),'N FN'] = data.loc[data[f'{test}_{name}']==1,c].dropna().shape[0]
        corr.loc[(name,test),'N TP'] = data.loc[data[f'{test}_{name}']==0,c].dropna().shape[0]
        #print(data.loc[data[f'{test}_digital']==0,c].dropna().shape[0],data.loc[data[f'{test}_digital']==1,c].dropna().shape[0])

In [None]:
# Display the false-negative vs true-positive UPDRS III comparison table.
corr

In [None]:
# Save the false-negative vs true-positive UPDRS III comparison to CSV.
corr.to_csv(f'{path}/analyses/prodromal/falsenegatives_updrsiii_welch.csv')

In [None]:
# Item-level UPDRS III scores over time for the participants in FN, one panel per item.
# Read through the notebook-level `path` (as the other UPDRS cell does) instead of a
# hard-coded /scratch location, so the cell follows the configured data root.
# NOTE(review): FN is whatever the most recently executed cell assigned — confirm it
# is the false-negative set intended for this figure.
updrs = pd.read_csv(f'{path}/phenotypes2021/MDS_UPDRS_Part_III.csv',na_values=['UR'],parse_dates=['INFODT'])
updrs_items = [
    'NP3SPCH', 'NP3FACXP', 'NP3RIGN', 'NP3RIGRU', 'NP3RIGLU',
    'NP3RIGRL', 'NP3RIGLL', 'NP3FTAPR', 'NP3FTAPL', 'NP3HMOVR',
    'NP3HMOVL', 'NP3PRSPR', 'NP3PRSPL', 'NP3TTAPR', 'NP3TTAPL',
    'NP3LGAGR', 'NP3LGAGL', 'NP3RISNG', 'NP3GAIT', 'NP3FRZGT',
    'NP3PSTBL', 'NP3POSTR', 'NP3BRADY', 'NP3PTRMR', 'NP3PTRML',
    'NP3KTRMR', 'NP3KTRML',
    'NP3RTARU', 'NP3RTALU', 'NP3RTARL',
    'NP3RTALL', 'NP3RTALJ', 'NP3RTCON',
]
# Keep only rows of FN participants, with ID, visit date and the item scores.
data = updrs.loc[updrs['PATNO'].isin(FN['participant']),['PATNO','INFODT'] + updrs_items]

fig,axes = plt.subplots(nrows=6,ncols=6,figsize=(15,15))
for ax,item in zip(axes.flatten(),updrs_items):
    ax = sns.scatterplot(x='INFODT',y=item,hue='PATNO',data=data,ax=ax,palette='deep')
    # Replace the per-panel participant legend with an empty one.
    ax.legend([],[], frameon=False)
plt.tight_layout()
fig.autofmt_xdate()

In [None]:
# Per participant, count the rows on which each UPDRS III item was scored above zero
# (items as rows, participants as columns).
updrs_items = [
    'NP3SPCH', 'NP3FACXP', 'NP3RIGN', 'NP3RIGRU', 'NP3RIGLU',
    'NP3RIGRL', 'NP3RIGLL', 'NP3FTAPR', 'NP3FTAPL', 'NP3HMOVR',
    'NP3HMOVL', 'NP3PRSPR', 'NP3PRSPL', 'NP3TTAPR', 'NP3TTAPL',
    'NP3LGAGR', 'NP3LGAGL', 'NP3RISNG', 'NP3GAIT', 'NP3FRZGT',
    'NP3PSTBL', 'NP3POSTR', 'NP3BRADY', 'NP3PTRMR', 'NP3PTRML',
    'NP3KTRMR', 'NP3KTRML',
    'NP3RTARU', 'NP3RTALU', 'NP3RTARL',
    'NP3RTALL', 'NP3RTALJ', 'NP3RTCON',
]
data.groupby('PATNO')[updrs_items].apply(lambda x: (x>0).sum()).T

In [None]:
# Load demographics and the wearable time series relative to the notebook-level
# `path` instead of hard-coded /scratch locations, consistent with the other loaders.
# NOTE(review): assumes the accelerometer directory sits next to phenotypes2021 under `path` — confirm.
demo = pd.read_csv(f'{path}/phenotypes2021/demographics_clean.csv',parse_dates=['date_birth'])
merged, ambulatory, step, sleep, pulse, pulsevar = utils.load_timeseries(demo, f'{path}/accelerometer')

In [None]:
# Keep only the step-count rows whose first index level is one of the FN participants;
# slice(None) keeps every value of the second level.
# NOTE(review): assumes step has a two-level index with the participant ID first — confirm against utils.load_timeseries.
step = step.loc[(FN['participant'],slice(None)),:]

In [None]:
# Per subject: hours between first and last timestamp, and number of non-missing hourly step counts.
step_by_subject = step.reset_index().groupby('subject')
time_covered = step_by_subject['date_local_adj'].apply(
    lambda x: (x.max() - x.min()) / np.timedelta64(1, 'h'))
data_av = step_by_subject['hourly_step_count_sum'].count()

In [None]:
# Per subject: non-missing hourly step counts as a percentage of the hours spanned by the recording.
(100/time_covered)*(data_av)

# Agreed high risk

In [None]:
# Participants flagged by all four markers at once: digital risk, SAA, DaTscan and the MDS criteria.
all_positive = (
    (nona_all['risk_digital'] == 1)
    & (nona_all['SAA'] == 1)
    & (nona_all['positiveDaT'] == 1)
    & (nona_all['risk_prodromal_Heinzel'] == 1)
)
HR = nona_all[all_positive]

In [None]:
# Non-missing count and mean of every column in var within the agreed high-risk group (one row per variable).
HR[var].agg(['count','mean']).T