In [None]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
from scipy import stats

import seaborn as sns
import pylab as plt
from statannot import add_stat_annotation
import statsmodels.api as sm

import missingno as msn

from importlib import reload
from functools import reduce
import glob
import datetime
import re

import sys
sys.path.insert(1,'../scripts')
import utils
sys.path.insert(1,'../../phenotype') # this is the updated pympi package
import _loaders
import _preprocess
import _info2021
import data_wrangling

from sklearn import cluster, decomposition,preprocessing,linear_model,model_selection,metrics,neighbors,mixture
from sklearn.pipeline import Pipeline

import pickle

In [None]:
def plot_context():
    """Apply the seaborn "talk" plotting context shared by the figures.

    ``font_scale`` is an argument of ``sns.set_context`` and not an rc
    parameter: placed inside ``rc`` it is dropped without any warning, so the
    intended 0.9 scaling never took effect. It is passed as a keyword here.
    The explicit ``rc`` entries still override the scaled defaults for the
    base font size, the axes titles and the axes labels.
    """
    sns.set_context("talk", font_scale=0.9,
                    rc={"font.size":18,"axes.titlesize":18,"axes.labelsize":16})

In [None]:
# Root directory of the PPMI data; used as prefix for most of the read paths in the cells below.
data_path = '/scratch/c.c21013066/data/ppmi'
# Accelerometer directory. In the visible cells it is only referenced by the
# commented-out utils.load_timeseries call.
path = '/scratch/c.c21013066/data/ppmi/accelerometer'

# Load relevant data
using helper functions defined in "_loaders" of the pypmi adaptation

In [None]:
# Demographics; 'date_birth' is parsed as datetime because ages are derived from it further down.
demo = pd.read_csv(f'{data_path}/phenotypes2021/demographics.csv',parse_dates=['date_birth'])
# Phenotype tables, read through the project-local _loaders helpers.
behavior = _loaders.load_behavior(f'{data_path}/phenotypes2021')
# NOTE(review): datscan is loaded a second time in the DaTscan cell below, so this load looks redundant.
datscan = _loaders.load_datscan_all(f'{data_path}/phenotypes2021')
genotypes = _loaders.load_genotypes(f'{data_path}/phenotypes2021')
# NOTE(review): `prodromal` is reassigned by the merge cell before this value is read in the visible cells.
prodromal = _loaders.load_prodromal(f'{data_path}/phenotypes2021')
#merged, ambulatory, step, sleep, pulse, pulsevar = utils.load_timeseries(demo,path)

In [None]:
# Re-import the project modules so that on-disk edits are picked up without restarting the kernel.
reload(_info2021)
reload(_loaders)
# Load the 'prodromalBerg' table and index it by participant.
prodromalBerg = _loaders.load_prodromalBerg(f'{data_path}/phenotypes2021')
prodromalBerg = prodromalBerg.set_index('participant')

In [None]:
# Pick up on-disk edits to the helper modules before they are used.
reload(_info2021)
reload(data_wrangling)
# Feed the participant-indexed table through every data_wrangling helper,
# each one receiving the output of the previous step.
prod = (
    prodromalBerg
    .pipe(data_wrangling.get_nonuse_caffeine)
    .pipe(data_wrangling.get_smoker)
    .pipe(data_wrangling.get_rbdpositive)
    .pipe(data_wrangling.get_subthresholdUPDRS)
    .pipe(data_wrangling.get_updrs_scopa_identifiers)
    .pipe(data_wrangling.get_prodromalconditions)
    .pipe(data_wrangling.get_cognitiveimpairment)
    .pipe(data_wrangling.get_hyposmiatest)
)

In [None]:
# Pick up on-disk edits to _preprocess.
reload(_preprocess)
# Start from a freshly loaded DaTscan table and attach the demographics to every scan row.
datscan = _loaders.load_datscan_all(f'{data_path}/phenotypes2021')
datscan = pd.merge(datscan,demo,on='participant',how='left')
datscan = _preprocess.get_visit_age(datscan)
# NOTE(review): age_sex_correct='none' presumably disables any age/sex correction — confirm in _preprocess.
datscan = _preprocess.get_DAT_deficit(datscan,age_sex_correct='none')
# Drop scans without 'putamen_min', order by date and aggregate per participant.
# GroupBy.last() returns the last non-null value of each column, which is not
# necessarily one single row of the table.
dat_last = datscan.dropna(subset=['putamen_min'],how='any',axis='rows').sort_values('date').groupby('participant').last()

In [None]:
# merge it all together
# Outer-join the aggregated DaTscan result with the prodromal markers. Columns
# present on both sides keep their name on the DaTscan side and receive the
# '_visual' suffix on the `prod` side.
prodromal = pd.merge(dat_last.reset_index()[['participant','dat_deficit','putamen_min']],prod,
                     on='participant',how='outer',suffixes=['','_visual'])
# Encode the 'negative'/'positive' labels as 0/1.
prodromal['dat_deficit_visual'] = prodromal['dat_deficit_visual'].replace(['negative','positive'],[0,1])
# Attach the behavioural row of the phenoconversion visit; the right join keeps every row of `prodromal`.
behprod = pd.merge(behavior.reset_index().drop(columns=['rbd','upsit']),prodromal,left_on=['participant','visit'],
                   right_on=['participant','phenoconverted_visit'],how='right')
# For phenoconverters, the date of the matched visit becomes the conversion date.
behprod.loc[behprod['phenoconverted']==1,'phenoconverted_date'] = behprod.loc[behprod['phenoconverted']==1,'date']
prod = pd.merge(demo,genotypes,on='participant',how='outer')
prod = pd.merge(prod,behprod[np.hstack([prodromal.columns,'phenoconverted_date'])],on='participant',how='outer')
# pandas >= 2.0 refuses np.timedelta64(1,"Y") because a year is not an unambiguous
# duration. Divide by the same fixed length instead (365.2425 days, the length
# numpy uses for the 'Y' unit), which keeps the resulting ages unchanged.
prod['phenoconverted_age'] = (prod['phenoconverted_date'] - prod['date_birth'])/pd.Timedelta(days=365.2425)

In [None]:
# take care of dtypes
# Participants with a recorded 'family_history' but no '1st_degree_family_history' entry are set to False.
prod.loc[np.logical_and(~prod['family_history'].isna(),prod['1st_degree_family_history'].isna()),'1st_degree_family_history'] = False
# Age on the fixed reference date 2021-10-01. pandas >= 2.0 refuses
# np.timedelta64(1,'Y') as an ambiguous unit, so divide by the same fixed
# length (365.2425 days, numpy's definition of the 'Y' unit) as a Timedelta.
prod['current_age'] = (datetime.datetime(2021, 10, 1) - prod['date_birth']) / pd.Timedelta(days=365.2425)

In [None]:
#add genetic info
haplo = pd.read_csv(f'{data_path}/genotypes/haplotypes/PD_variants_clean.csv',index_col=0)
PRS = pd.read_csv(f'/scratch/c.c21013066/data/ppmi/genotypes/PRS/merged_celltypesall_PRSice_3Set_consensusPD_mergedQC51_cov_sample.csv',index_col=0)
# keep only the rows for tissue 'SN' computed over cells == 'All'
PRS = PRS[PRS['tissue'].isin(['SN']) & (PRS['cells']=='All')]
#merged_best = pd.read_csv(f'/scratch/c.c21013066/data/ppmi/genotypes/PRS/{ts}/merged_celltypes_PRSice_{base}_consensusPD_mergedQC51_cov_best.csv',index_col=0)
# table of the selected p-value threshold per cell type
best_thr = (
    pd.read_csv('/scratch/c.c21013066/data/ppmi/genotypes/PRS/SN/best_thresholds_percell_AUROC.csv',
                index_col=[0,1],header=[0,2])
    .droplevel(level=1,axis=1)
    .droplevel(level=0,axis=0)
    .rename(columns={'Unnamed: 2_level_0':'p-thresh'})
)
# for every cell type listed in best_thr, pick the PRS rows computed at its selected threshold
keep = [
    PRS.loc[(PRS['cells']==cell) & (PRS['p-thresh']==best_thr.loc[cell,'p-thresh']),
            ['cells','tissue','value','p-thresh']]
    for cell in best_thr.index
]
best = pd.concat(keep).reset_index().rename(columns={'value':'PRS'})

# quartile label of the score plus 0/1 flags for the top and bottom quartile
best['PRS_Q'] = pd.qcut(best['PRS'],q=4,labels=['low','mid1','mid2','high'])
best['PRS_highQ'] = best['PRS_Q'].eq('high').astype(int)
best['PRS_lowQ'] = best['PRS_Q'].eq('low').astype(int)

prod = prod.merge(best,on='participant',how='outer')

In [None]:
# Persist the merged table as it is before any likelihood ratio is computed;
# with the to_csv defaults the index is written as the first column.
prod.to_csv('/scratch/c.c21013066/data/ppmi/analyses/prodromal/raw_prod_Yan2024_>1.csv')

# Test prodromal model from Berg et al 2015 and Heinzel 2019

doi: 10.1002/mds.26431

"
- Step 1: Establish the prior from the table = 1.25%
- Step 2: Calculate total LR = 1.2 (male) × 1.5 (pesticide) × 0.88 (coffee) × 1.25 (nonsmoker) × 130 (RBD) × 4.0 (olfaction) × 0.8 (no constipation) × 0.85 (no depression or anxiety) × 0.88 (no somnolence) × 1.0 (borderline motor testing – result omitted) = 616.
- Step 3: Calculate post-test probability, using one of two methods: a. Make an exact quantitative probability calculation using calculators. Result = 89%, or b. From Table 2, LR must be > 300. Actual LR > 300, so patient meets criteria for probable prodromal PD."

In [None]:
def get_prior(df,age='age'):
    """Add the age-dependent prior of prodromal PD as column 'prior'.

    The age is rounded to whole years and looked up in fixed age bands
    (< 55, 55-59, 60-64, 65-69, 70-74, 75-79, 80-129). The values are
    percentages: get_post_prob divides 'prior' by 100 before using it.

    Rows whose age is missing or outside 0-129 receive NaN. They used to get a
    prior of 0, which silently turned "age unknown" into a posterior
    probability of exactly 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a numeric age column; it is modified in place.
    age : str
        Name of the age column.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df`` with the 'prior' column added.
    """
    band_edges = [0, 55, 60, 65, 70, 75, 80, 130]
    band_priors = [0.4, 0.75, 1.25, 2.0, 2.5, 3.5, 4.0]
    # right=False makes every band [lower, upper), matching range(lower, upper)
    bands = pd.cut(df[age].round(), bins=band_edges, right=False, labels=band_priors)
    df['prior'] = bands.astype(float)
    return df
def calculate_LR(df,kind='Heinzel'):
    """Multiply the per-marker likelihood ratios into a total LR for every row.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per participant. The raw columns read below must exist
        ('gender', 'pesticide_occupational_exposure', 'rbd', 'rbd_record',
        'rbd_screen_positive', 'hyposmia', 'Hyposmia', 'upsit', 'dat_deficit',
        'dat_deficit_visual', 'PRS', '1st_degree_family_history', the
        'updrs_i_*_cat' / 'scopa_aut_erectileDysfunction_cat' columns and
        their questionnaire counterparts) as well as the ready-made marker
        columns 'caffeine', 'never_smoke', 'previous_smoke', 'current_smoke',
        'UPDRS>6' and 'DiabetesII'. ``df`` itself is not modified.
    kind : {'Heinzel', 'Berg'}
        Which likelihood-ratio table to apply.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        First the marker columns shared by ``df`` and the table after one-hot
        encoding ('<marker>_yes' / '<marker>_no') together with the product
        'LR'; second the LR table itself (index = marker, columns 'yes' and
        'no'). For any other ``kind`` a message is printed and an empty list
        is returned.
    """
    if kind == 'Berg':
        # NOTE(review): Berg et al. 2015 appears to list LR+ = 2.1 for symptomatic
        # hypotension; the 1.2 used for 'OrthostaticHypotension' may be a
        # transposition — confirm against the paper.
        formula = pd.DataFrame([[1.2,0.8],[1.5,1],[0.88,1.35],[1.25,1],[0.8,1],[0.45,1],
                          [130,0.62],[2.3,0.76],[4,0.43],[2.2,0.8],[1.8,0.85],[2.2,0.88],[40,0.65],[10,0.7],[2.0,0.9],
                           [1.9,0.9],[1.2,0.87],[2.5,1]],
                           index=['male','pesticide_exposure','caffeine','never_smoke','previous_smoke',
                                  'current_smoke','rbd_psgproven','rbd_test','HYPOSMIA',
                                  'constipation','DepressionAnxiety','ExcessiveDaytimeSleepiness',
                                 'positiveDaT','UPDRS>6','ErectileDysfunction','UrinaryDysfunction',
                                 'OrthostaticHypotension','1st_degree_family_history'],
                           columns=['yes','no'])
    elif kind == 'Heinzel':
        # Disabled markers (kept for reference): physical_inactivity [1.3,0.91],
        # urate_low [1.8,1], urate_high [0.88,1].
        formula = pd.DataFrame([[1.2,0.8],[1.5,1],[0.88,1.35],[1.2,1],[0.91,1],[0.51,1],
                          [130,0.65],[2.8,0.89],[6.4,0.4],[2.5,0.82],[1.6,0.88],[2.7,0.86],[43.3,0.66],[9.6,0.55],[3.4,0.87],
                           [2.0,0.9],[3.2,0.8],[2.5,1],
                           [1.57,1],[0.45,1],
                           [1.8,0.88],[1.5,0.97]],
                           index=['male','pesticide_exposure','caffeine','never_smoke','previous_smoke',
                                  'current_smoke','rbd_psgproven','rbd_test','HYPOSMIA',
                                  'constipation_','DepressionAnxiety_','ExcessiveDaytimeSleepiness_',
                                 'positiveDaT',
                                  'UPDRS>6','ErectileDysfunction_','UrinaryDysfunction_',
                                 'OrthostaticHypotension_','1st_degree_family_history_',
                                 'PRS_highQ','PRS_lowQ',
                                  'cognitive_impairment','DiabetesII'],
                           columns=['yes','no'])
        print(formula)
    else:
        print("not a valid version: try 'Heinzel' or 'Berg'")
        return []

    # ---- derive the marker columns on a private copy of the input ----
    df_new = df.copy(deep=True)
    df_new['male'] = df_new['gender'].replace(['m','f'],[1,0])
    # 9999 is treated as a missing-value code
    df_new['pesticide_exposure'] = df_new['pesticide_occupational_exposure'].replace([9999],[np.nan])
    df_new['rbd_psgproven'] = df_new[['rbd','rbd_record']].max(axis=1)
    # the screening result is blanked wherever 'rbd_psgproven' is available,
    # so only one of the two RBD markers contributes per participant
    df_new['rbd_test'] = df_new['rbd_screen_positive'].copy()
    df_new.loc[~df_new['rbd_psgproven'].isna(),'rbd_test'] = np.nan
    df_new['HYPOSMIA'] = df_new[['hyposmia','Hyposmia','upsit']].max(axis=1)#upsit
    df_new['positiveDaT'] = df_new[['dat_deficit','dat_deficit_visual']].max(axis=1)
    # quartiles of the polygenic score, recomputed over the rows of this frame
    df_new['PRS_Q'] = pd.qcut(df_new['PRS'],q=4,labels=['low','mid1','mid2','high'])
    df_new['PRS_highQ'] = (df_new['PRS_Q']=='high').astype(int)
    df_new['PRS_lowQ'] = (df_new['PRS_Q']=='low').astype(int)
    # family history is blanked wherever a PRS quartile exists, so it only
    # contributes for participants without a PRS
    df_new['1st_degree_family_history_'] = df_new['1st_degree_family_history'].copy()
    df_new.loc[~df_new['PRS_Q'].isna(),'1st_degree_family_history_'] = np.nan
    # each '<marker>_' column is the row-wise maximum of its two source columns
    df_new['DepressionAnxiety_'] = df_new[['updrs_i_depression_cat','DepressionAnxiety']].max(axis=1)
    df_new['constipation_'] = df_new[['updrs_i_constipation_cat','constipation']].max(axis=1)
    df_new['ExcessiveDaytimeSleepiness_'] = df_new[['updrs_i_daytimesleepiness_cat','ExcessiveDaytimeSleepiness']].max(axis=1)
    df_new['OrthostaticHypotension_'] = df_new[['updrs_i_OH_cat','OrthostaticHypotension']].max(axis=1)
    df_new['UrinaryDysfunction_'] = df_new[['updrs_i_urinary_cat','UrinaryDysfunction']].max(axis=1)
    df_new['ErectileDysfunction_'] = df_new[['scopa_aut_erectileDysfunction_cat','ErectileDysfunction']].max(axis=1)
    # report the number of non-missing values per marker
    print(df_new.shape[0] - df_new[['male','pesticide_exposure','caffeine','never_smoke','previous_smoke',
                                  'current_smoke','rbd_psgproven','rbd_test',#'HYPOSMIA',
                                  'constipation_','DepressionAnxiety_','ExcessiveDaytimeSleepiness_','1st_degree_family_history_',
                                 #'positiveDaT',
                                    'UPDRS>6','ErectileDysfunction_','UrinaryDysfunction_',
                                 'OrthostaticHypotension_','PRS_highQ','PRS_lowQ','DiabetesII']].isna().sum())
    # Not implemented yet (inputs are not part of this table):
    #  - physical_inactivity: less than 1 hour per week of activity causing increased
    #    respiratory or heart rate or sweating, e.g. (total_MVPA_hours/2/60 < 1)
    #  - urate_low / urate_high: < 5 mg/dL resp. > 5.6 mg/dL, men only (Urate/59.48 converts umol/L)
    #  - TODO cognitive_deficit

    # ---- one-hot encode the markers that have an entry in the LR table ----
    intersect = np.intersect1d(df_new.columns,formula.index)
    print(intersect)
    df_new = df_new[intersect]
    df_new = df_new.replace([0,1],['no','yes'])
    df_new = pd.get_dummies(df_new)

    # ---- multiply the applicable LRs ----
    df_new['LR'] = 1
    for column in intersect:
        for answer in ('yes','no'):
            indicator = f'{column}_{answer}'
            # A marker nobody answered with this value (or with no data at all)
            # has no indicator column and contributes a neutral factor of 1.
            # This is checked explicitly instead of hiding every possible error
            # behind a bare except.
            if indicator not in df_new.columns:
                continue
            # indicator * LR is 0 where the answer does not apply; map that to the neutral 1
            df_new['LR'] *= (df_new[indicator] * formula.loc[column,answer]).replace(0,1)
    return df_new,formula

def get_post_prob(df,age='age'):
    """Turn prior and total LR into a posterior probability and a yes/no call.

    Adds three columns to ``df`` (in place):

    * 'PD_thresh' - the LR required in the participant's age band (age is
      rounded to whole years; bands < 55, 55-59, 60-64, 65-69, 70-74, 75-79,
      80-129),
    * 'PD'        - 1 if 'LR' reaches that threshold, otherwise 0,
    * 'prob_PD'   - posterior probability from the prior odds times 'LR',
      where 'prior' is given in percent.

    Rows whose age is missing or outside 0-129 get a NaN threshold and
    therefore 'PD' = 0. They used to get a threshold of 0, which classified
    every participant without a usable age as 'PD' = 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'LR', 'prior' and the age column.
    age : str
        Name of the age column.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df`` with the three columns added.
    """
    band_edges = [0, 55, 60, 65, 70, 75, 80, 130]
    band_thresholds = [1000, 515, 300, 180, 155, 110, 95]
    # right=False makes every band [lower, upper), matching range(lower, upper)
    bands = pd.cut(df[age].round(), bins=band_edges, right=False, labels=band_thresholds)
    df['PD_thresh'] = bands.astype(float)
    # a comparison against NaN is False, so unknown thresholds never yield PD = 1
    df['PD'] = (df['LR'] >= df['PD_thresh']).astype(int)
    prior_prob = df['prior']/100
    odds = (prior_prob/(1-prior_prob)) * df['LR']
    df['prob_PD'] = odds/(1+odds)
    return df

In [None]:
# Index the merged table by participant from here on.
# Not re-runnable as is: on a second execution 'participant' is no longer a column and set_index raises KeyError.
prod = prod.set_index('participant')

In [None]:
# calculate probability of having prodromal PD
for name,kind in zip(['Heinzel et al., 2019','Berg et al., 2015'],['Heinzel','Berg']):
    # only the 'Heinzel' entry is evaluated; every other entry is skipped
    if kind!='Heinzel':
        continue
    # prior -> total LR -> posterior, all based on the age at the reference date
    df = get_prior(prod,age='current_age')
    dfcalc,formula = calculate_LR(df,kind=kind)
    df = get_post_prob(pd.concat([df,dfcalc],axis=1),age='current_age')
    df.to_csv(f'/scratch/c.c21013066/data/ppmi/analyses/prodromal/{kind}_Yan2024_>1.csv')

    # distribution of the posterior per diagnosis group
    fig, ax = plt.subplots(figsize=(7,5))
    sns.histplot(data=df,hue='diagnosis',x='prob_PD',ax=ax)
    ax.set_xlabel(f'Probability of Prodromal PD ({name})');
    plt.show()

    # the same posterior as one box per diagnosis group
    fig, ax = plt.subplots(figsize=(7,5))
    sns.boxplot(data=df,x='diagnosis',y='prob_PD',ax=ax)
    ax.set_ylabel(f'Probability of Prodromal PD ({name})');
    plt.show()

In [None]:
# Histogram of the posterior probability, one colour per diagnosis group.
fig, ax = plt.subplots(figsize=(7,5))
#plots.plot_context()
sns.histplot(data=df,hue='diagnosis',x='prob_PD',ax=ax)
ax.set_xlabel('Probability of Prodromal PD');

In [None]:
# Box plot of the posterior probability, one box per diagnosis group.
fig, ax = plt.subplots(figsize=(7,5))
#plots.plot_context()
sns.boxplot(data=df,x='diagnosis',y='prob_PD',ax=ax)
ax.set_ylabel('Probability of Prodromal PD');

# Prodromal Cohort features

In [None]:
# The merged table is written above as 'raw_prod_Yan2024_>1.csv'; the doubled
# '.csv.csv' suffix pointed at a file this notebook never creates.
# NOTE(review): the '*_yes' / '*_no' indicator columns used in the cells below are
# only produced by calculate_LR and saved to 'Heinzel_Yan2024_>1.csv' — confirm
# which of the two files is meant to be read here.
df = pd.read_csv('/scratch/c.c21013066/data/ppmi/analyses/prodromal/raw_prod_Yan2024_>1.csv',index_col=0)

In [None]:
# Keep only the prodromal cohort. The explicit copy makes `prod` an independent
# frame: a later cell adds a column to it, which on a bare boolean-filtered
# slice triggers pandas' SettingWithCopyWarning.
prod = df[df['diagnosis']=='prod'].copy()

In [None]:
# percentage people with trait
# rapid eye movement behavioural sleep disorder (RBD),
# olfactory loss, LRRK2, GBA, SNCA, Parkin, Pink1
# with or without dopamine transporter (DAT) deficiency
# Columns inspected below; the '*_yes' names are the one-hot indicator columns.
traits = ['GBA','SNCA','LRRK2','HYPOSMIA_yes','rbd_psgproven_yes','positiveDaT_yes']
# Display labels, in the same order as `traits`.
trait_names = ['GBA','SNCA','LRRK2','olfactory\nloss','RBD','positive\nDaTscan']
# Summary statistics per column; for 0/1 columns the 'mean' row is the fraction with the trait.
prod[traits].describe()

In [None]:
# Rows whose trait columns sum to more than 2, i.e. at least three criteria if the columns are 0/1.
prod.loc[prod[traits].sum(axis=1)>2,traits]

In [None]:
# Mean of every trait column, reshaped by melt() into long format ('variable', 'value').
perc = prod[traits].agg(['mean']).melt()
perc

In [None]:
# Row-wise maximum over the three gene columns, i.e. 1 as soon as any of them is 1
# (assuming 0/1 coding). assign() returns a new frame instead of writing into
# `prod`, which was obtained by boolean filtering, so no SettingWithCopyWarning
# is raised.
prod = prod.assign(mutation=prod[['SNCA','GBA','LRRK2']].max(axis=1))

In [None]:
# Bar chart of the mean of each trait column (a fraction if the columns are 0/1).
fig = plt.figure(figsize=(8,5))
# Apply the shared font context before seaborn draws.
plot_context()
perc = prod[traits].agg(['mean']).melt()
ax = sns.barplot(perc,x='variable',y='value',color='gray')
ax.set_ylabel('fraction')
ax.set_xlabel('prodromal group criterium')
# Replace the column names on the x axis by the display labels (same order as `traits`).
ax.set_xticklabels(trait_names)
# Saved as PNG and PDF under the same base name.
plt.savefig('/scratch/c.c21013066/images/paper/digitalPPMI/barplot_prodromal_criteria.png',bbox_inches='tight',dpi=300)
plt.savefig('/scratch/c.c21013066/images/paper/digitalPPMI/barplot_prodromal_criteria.pdf',bbox_inches='tight',dpi=300)

In [None]:
# Same chart, but built from the melted raw values: seaborn aggregates the
# long-format rows per 'variable' itself (default estimator: mean).
fig = plt.figure(figsize=(8,5))
plot_context()
ax = sns.barplot(prod[traits].melt(),x='variable',y='value',color='gray')
ax.set_ylabel('fraction')
ax.set_xlabel('prodromal group criterium')
ax.set_xticklabels(trait_names)
# NOTE(review): these are the same two output paths as in the previous cell, so this figure overwrites that one.
plt.savefig('/scratch/c.c21013066/images/paper/digitalPPMI/barplot_prodromal_criteria.png',bbox_inches='tight',dpi=300)
plt.savefig('/scratch/c.c21013066/images/paper/digitalPPMI/barplot_prodromal_criteria.pdf',bbox_inches='tight',dpi=300)

In [None]:
# Define the sets
# Each set holds the index labels of the rows for which the indicator equals 1 (or 0).
rbd_yes = set(prod[prod['rbd_psgproven_yes'] == 1].index)
rbd_no = set(prod[prod['rbd_psgproven_no'] == 1].index)
# NOTE(review): datscan_yes / datscan_no are built from the 'mutation' column, not
# from a DaTscan column. The Venn label below ('mendelian mutation') matches the
# data; the variable names do not.
datscan_yes = set(prod[prod['mutation'] == 1].index)
datscan_no = set(prod[prod['mutation'] == 0].index)
hyposmia_yes = set(prod[prod['HYPOSMIA_yes'] == 1].index)
hyposmia_no = set(prod[prod['HYPOSMIA_no'] == 1].index)

# NOTE(review): import sits in the middle of the notebook; the *_no sets above are not used in this cell.
from matplotlib_venn import venn3
# Create the Venn diagram
venn3([rbd_yes, datscan_yes, hyposmia_yes], ('RBD', 'mendelian mutation', 'Hyposmia'))
plt.show()