In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
from functools import reduce
import pylab as plt

import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn import preprocessing,metrics,cross_decomposition,linear_model,model_selection
from scipy import stats
from statannot import add_stat_annotation

from importlib import reload
import itertools
from itertools import count

import yaml
import sys

sys.path.insert(1,'/scratch/c.c21013066/software/ukbb_parser/ukbb_parser')
sys.path.insert(1,'/scratch/c.c21013066/software/ukbb_parser/ukbb_parser/shared_utils')
import ukbb_parser as ukbb_parser
import ukbb_phenotype_dataset as ukbb_phenotype_dataset
from shared_utils.util import summarize

sys.path.insert(1,'../../resources')
sys.path.insert(1,'../../resources/utils')
#import _preprocess,_get_data#,linear_models,plots,evaluate_models
#import phenotypesnew as pheno_info

In [6]:
def plot_context():
    sns.set_context("talk", rc={"font.size":18,"axes.titlesize":18,"axes.labelsize":16,"font_scale":0.9})
image_path = '/scratch/c.c21013066/images/ukbiobank/accelerometer'
data_path = '/scratch/c.c21013066/data/ukbiobank'
sample_path = '/scratch/c.c21013066/data/ukbiobank/sample/withGP/noOsteo'

In [7]:
def read_traits_file(input_path: str):
    with open(input_path, 'r') as f:
        traits_data = yaml.load(f, Loader=yaml.BaseLoader)
    return traits_data

# Load accelerometry data

In [5]:
merged = pd.read_csv(f'{data_path}/phenotypes/demo_acc.csv',index_col=0)
merged = _preprocess.date_to_datetime(merged)
merged = _preprocess.date_to_datetime_end(merged)

# Extract healthy control matches for each disease case

1. ensure all HC also have complete lifestyle and blood info: prediction cohort, need matched (PD) and unmatched
2. just match on acc info: for residual, statistical analysis, need matched (PD) and unmatched

In [6]:
# define predictors
predictors_cat = []
predictors_norm = merged.columns[34:72]
predictors = np.hstack([predictors_cat,predictors_norm])
predictors = merged.columns[34:72]
scale_predictors = np.hstack([np.repeat([False],len(predictors_cat)),np.repeat([True],len(predictors_norm))])

In [91]:
# subsample for weartime corrected and get HC match
reload(_get_data)
reload(_preprocess)
reload(ukbb_parser)
include_risk = False # set to True if blood/genes/lifestyle columns should be checked if available and only retain those with complete info (for prediction cohort)
for name,exclude,drop_healthy in zip(['AllCauseDementia','AllCauseParkinsonism','AlzheimerDisease',
        'MultipleSystemAtrophy','ProgressiveSupranuclearPalsy','FrontoTemporalDementia','VascularDementia','ParkinsonDisease',
                                     'Dystonia','Osteoarthritis','Depression'],
                                     [['icd10_nonHC'],['icd10_nonHC'],['icd10_nonHC'],['icd10_nonHC'],
                                     ['icd10_nonHC'],['icd10_nonHC'],['icd10_nonHC'],['icd10_nonHC'],['icd10_nonHC'],
                                     ['icd10_nonHC'],['icd10_nonHC'],['icd10_nonHC'],['icd10_nonHC']],
                                    ['nonHC','nonHC','nonHC','nonHC','nonHC','nonHC','nonHC','nonHC','nonHC',
                                    'nonHC','nonHC','nonHC','nonHC']):
        if include_risk:
            risk = pd.read_csv(f'{sample_path}/{name}_controlNononHC_risk.csv',index_col=0)
            blood = pd.read_csv(f'{sample_path}/{name}_controlNononHC_blood.csv',index_col=0)
            riskblood = pd.merge(risk,blood,right_index=True,left_index=True,how='outer',suffixes=['_drop',''])
            riskblood = riskblood.drop(columns=riskblood.filter(regex='_drop').columns)
            riskblood = riskblood[~riskblood.index.duplicated(keep='first')]
            # get data of PRS
            traits = read_traits_file('../../resources/genetics/traits.yaml')
            traits = pd.DataFrame(traits)
            score1 = pd.read_csv(f'{data_path}/ukb52375.csv').set_index('eid')
            trait='26260-0.0'
            score_best = score1[trait]
            score1.columns = score1.columns.str.replace('-0.0','')
            PRSs = score1[traits.columns]
            PRSs.columns = traits.loc['full_name',PRSs.columns]
            genetics = PRSs.columns
            genetics_scale = genetics

            # merge data
            m = pd.merge(merged,score_best,right_index=True,left_index=True,how='left').rename(columns={trait:'PRS'})
            m = pd.merge(m,PRSs,right_index=True,left_index=True,how='left')

            m = pd.merge(m,riskblood[np.hstack([risk.columns[:-6],blood.columns[:-6]])],right_index=True,left_index=True,how='left',suffixes=['','_drop'])
            m = m.drop(columns=m.filter(regex='_drop').columns)

            covs = np.hstack(['visit_age','male','TownsendDeprivationIndex',risk.columns[:-6],blood.columns[:-6],genetics])
            covs_scale = np.hstack(['visit_age','TownsendDeprivationIndex',risk.columns[:-6],blood.columns[:-6],genetics])
            levels = [0,1]
            scale_covs = np.hstack([1,0,1,np.repeat(0,len(risk.columns[:-6])),np.repeat(0,len(blood.columns[:-6])),np.repeat(0,len(genetics))]).astype(bool)
        else:
            covs = np.hstack(['visit_age','male'])
            covs_scale = np.hstack(['visit_age'])
            levels = [0,1]
            scale_covs = np.hstack([1,0]).astype(bool)
            m = merged.copy()
            
        keep = [f'{name}_age','time_to_diagnosis','accelerometry_age','date_accelerometry','weartime_QC']
        merged_ = _get_data.get_healthy_disorder(m.copy(deep=True),name,covs=covs,
                             predictors=predictors,incident=False,exclude=drop_healthy) # drops any HC with incomplete covariate info
        merged_clean = _preprocess.make_categorical(merged_,covs[~scale_covs],levels)
        merged_clean = _preprocess.make_categorical(merged_clean,predictors[~scale_predictors],levels)
        #drop columns with too many nan
        #predictors, scale_predictors = _preprocess.clean_predictors(merged_clean,predictors,scale_predictors,
        #                     thresh=0.15)
        # drop subjects with too many nan (here any nan)
        merged_clean = _preprocess.clean_subjects(merged_clean,predictors,thresh=0)
        merged_clean = merged_clean[merged_clean['weartime_QC']==1]
        if include_risk:
            merged_clean[np.hstack([predictors,name,covs,keep])].to_csv(f'{sample_path}/{name}_controlNo{drop_healthy}_acc_QC_genebloodrisk.csv')
            matched_sample = _get_data.get_matched_acc(merged_clean,name,exclude=exclude,matched_cols=['accelerometry_age','male'],
                                                   file=f'{name}_controlNo{drop_healthy}_match_accage_acc_QC_genebloodrisk.txt',save=True)
        else:
            merged_clean[np.hstack([predictors,name,covs,keep])].to_csv(f'{sample_path}/{name}_controlNo{drop_healthy}_acc_QC.csv')
            matched_sample = _get_data.get_matched_acc(merged_clean,name,exclude=exclude,matched_cols=['accelerometry_age','male'],
                                                   file=f'{name}_controlNo{drop_healthy}_match_accage_acc_QC.txt',save=True)

/scratch/c.c21013066/data/ukbiobank/sample/withGP/noOsteo
people in HC and Case:  (0,)
people in HC and Case:  (0,)
Subjects get dropped due to too many NaN  11175


  last_update = pd.datetime(2021,3,1)


matched file does not exist, so creating one
(1369, 171)
(43753, 171)
/scratch/c.c21013066/data/ukbiobank/sample/withGP/noOsteo
people in HC and Case:  (0,)
people in HC and Case:  (0,)
Subjects get dropped due to too many NaN  3744


  last_update = pd.datetime(2021,3,1)


matched file does not exist, so creating one
(519, 171)
(43753, 171)
/scratch/c.c21013066/data/ukbiobank/sample/withGP/noOsteo
people in HC and Case:  (0,)
people in HC and Case:  (0,)
Subjects get dropped due to too many NaN  2951


  last_update = pd.datetime(2021,3,1)


matched file does not exist, so creating one
(212, 171)
(43753, 171)
/scratch/c.c21013066/data/ukbiobank/sample/withGP/noOsteo
people in HC and Case:  (0,)
people in HC and Case:  (0,)
Subjects get dropped due to too many NaN  109


  last_update = pd.datetime(2021,3,1)


matched file does not exist, so creating one
(8, 171)
(43753, 171)
/scratch/c.c21013066/data/ukbiobank/sample/withGP/noOsteo
people in HC and Case:  (0,)
people in HC and Case:  (0,)
Subjects get dropped due to too many NaN  152


  last_update = pd.datetime(2021,3,1)


matched file does not exist, so creating one
(17, 171)
(43753, 171)
/scratch/c.c21013066/data/ukbiobank/sample/withGP/noOsteo
people in HC and Case:  (0,)
people in HC and Case:  (0,)
Subjects get dropped due to too many NaN  326


  last_update = pd.datetime(2021,3,1)


matched file does not exist, so creating one
(20, 171)
(43753, 171)
/scratch/c.c21013066/data/ukbiobank/sample/withGP/noOsteo
people in HC and Case:  (0,)
people in HC and Case:  (0,)
Subjects get dropped due to too many NaN  1537


  last_update = pd.datetime(2021,3,1)


matched file does not exist, so creating one
(93, 171)
(43753, 171)
/scratch/c.c21013066/data/ukbiobank/sample/withGP/noOsteo
people in HC and Case:  (0,)
people in HC and Case:  (0,)
Subjects get dropped due to too many NaN  3327


  last_update = pd.datetime(2021,3,1)


matched file does not exist, so creating one
(469, 171)
(43753, 171)
/scratch/c.c21013066/data/ukbiobank/sample/withGP/noOsteo
people in HC and Case:  (0,)
people in HC and Case:  (0,)
Subjects get dropped due to too many NaN  524


  last_update = pd.datetime(2021,3,1)


matched file does not exist, so creating one
(81, 171)
(43753, 171)
/scratch/c.c21013066/data/ukbiobank/sample/withGP/noOsteo


  exec(code_obj, self.user_global_ns, self.user_ns)


people in HC and Case:  (0,)
people in HC and Case:  (0,)


  last_update = pd.datetime(2021,3,1)


Subjects get dropped due to too many NaN  61609
matched file does not exist, so creating one
(13407, 171)
(43753, 171)
no match found for  5473821
no match found for  5515304
no match found for  5540534
no match found for  5550868
no match found for  5554844
no match found for  5555170
no match found for  5562687
no match found for  5584356
no match found for  5591508
no match found for  5624261
no match found for  5697180
no match found for  5786056
no match found for  5897063
/scratch/c.c21013066/data/ukbiobank/sample/withGP/noOsteo
people in HC and Case:  (0,)
people in HC and Case:  (0,)


  last_update = pd.datetime(2021,3,1)


Subjects get dropped due to too many NaN  48698
matched file does not exist, so creating one
(9586, 171)
(43753, 171)


In [92]:
def match_acc(df,target='ParkinsonDisease',exclude=['AllCauseParkinsonism'],match_cols=['accelerometry_age_rounded','male']):
    '''match by age and gender to find healthy (do not have exclude disease or target disorder) for each patient'''
    df[match_cols[0]] = df[match_cols[0].replace('_rounded','')].round(0)
    case = df[df[target]==1]
    print(case.shape)
    control = df[np.logical_and(df[target]!=1,df[exclude].sum(axis=1) == 0)]
    print(control.shape)
    eids = pd.DataFrame(index=case.index,columns=['control_match'])
    no_matches = []
    for key,row in case.iterrows():
        # find match
        try:
            match = control[(control[match_cols[0]]==row[match_cols[0]]) & (control[match_cols[1]]==row[match_cols[1]])].sample(n=1)
            # append match and remove it from control pool for sampling without retaking
            eids.loc[key,'control_match'] = match.index.values[0]
            control = control[~control.index.isin(eids['control_match'])]
        except:
            print('no match found for ',key)
            eids = eids.drop(index=[key])
            no_matches.append([key])
    matched = df.loc[np.hstack([eids['control_match'],eids.index])]
    return eids, matched

In [93]:
# remove PD and Depressed comorbid cases
match = False
for name,age_onset in zip(['AllCauseDementia','AllCauseParkinsonism','AlzheimerDisease',
        'MultipleSystemAtrophy','ProgressiveSupranuclearPalsy','FrontoTemporalDementia','VascularDementia',
            'Dystonia','Osteoarthritis','Depression','ParkinsonDisease'],[20,20,20,20,20,20,20,0,0,0,20]):
        drop_healthy='nonHC'
        merged_clean = pd.read_csv(f'{sample_path}/{name}_controlNo{drop_healthy}_acc_QC.csv').set_index('eid')
        matched_eid = pd.read_csv(f'{sample_path}/{name}_controlNo{drop_healthy}_match_accage_acc_QC.txt',header=None,names=['eid'])
        xnames = merged_clean.filter(regex='Average').columns.str.replace('Average','').str.replace('___','-')
        diag_name = name
        merged_clean.loc[merged_clean[f'{name}_age']<=age_onset,f'{name}_age'] = merged_clean.filter(regex='_age')[merged_clean.filter(regex='_age')>age_onset].min(axis=1)
        merged_clean['diagnosis'] = merged_clean[name].replace([0,1],['Healthy',diag_name])
        merged_clean['acc_time_since_diagnosis'] = merged_clean[f'accelerometry_age'] - merged_clean[f'{name}_age']
        merged_clean['acc_time_to_diagnosis'] =  merged_clean[f'{name}_age'] - merged_clean[f'accelerometry_age']
        merged_clean['acc_incident'] = merged_clean[f'{name}_age'] > merged_clean[f'accelerometry_age']
        merged_clean.loc[merged_clean['acc_time_since_diagnosis'].isna(),'acc_incident'] = np.nan
        merged_clean['diagnosis_prod'] = merged_clean['diagnosis'].copy(deep=True)
        merged_clean.loc[np.logical_and(merged_clean['acc_incident']==1,merged_clean['diagnosis']==diag_name),'diagnosis_prod'] = 'Prodromal'
        merged_clean['diagnosis_prod_conservative'] = merged_clean['diagnosis_prod'].copy(deep=True)
        merged_clean.loc[np.logical_and(merged_clean['acc_time_to_diagnosis']<2,merged_clean['diagnosis_prod']=='Prodromal'),'diagnosis_prod_conservative'] = diag_name
        merged_clean = merged_clean[merged_clean['weartime_QC']==1]
        if match:
            matched_sample = merged_clean.loc[matched_eid['eid']]
        else:
            matched_sample = merged_clean.copy()
        # remove depressed and their HC matches
        depressed = pd.read_csv(f'{sample_path}/Depression.csv').set_index('eid')
        matched_sample['depressed'] = 0
        matched_sample.loc[np.intersect1d(depressed.index,matched_sample.index),'depressed'] = 1
        PD = pd.read_csv(f'{sample_path}/ParkinsonDisease.csv').set_index('eid')
        matched_sample['PD'] = 0
        matched_sample.loc[np.intersect1d(PD.index,matched_sample.index),'PD'] = 1
        match_cols=['accelerometry_age_rounded','male']
        matched_sample[match_cols[0]] = matched_sample[match_cols[0].replace('_rounded','')].round(0)
        control = matched_sample[matched_sample[name]==0]
        if match:
            if name != 'Depression':
                # also remove depressed HC match
                eids = pd.DataFrame(index=matched_sample[matched_sample['depressed']==1].index,
                                    columns=['control_match'])
                for key,row in matched_sample[matched_sample['depressed']==1].iterrows():
                    match = control[(control[match_cols[0]]==row[match_cols[0]]) & (control[match_cols[1]]==row[match_cols[1]])].sample(n=1)
                    eids.loc[key,'control_match'] = match.index.values[0]
                    control = control[~control.index.isin(eids['control_match'])]
                matched_sample = matched_sample[~(matched_sample.index.isin(depressed.index))]
                matched_sample = matched_sample.drop(index=eids['control_match'])
            if name != 'ParkinsonDisease':
                # also remove PD HC match
                eids = pd.DataFrame(index=matched_sample[matched_sample['PD']==1].index,
                                    columns=['control_match'])
                for key,row in matched_sample[matched_sample['PD']==1].iterrows():
                    match = control[(control[match_cols[0]]==row[match_cols[0]]) & (control[match_cols[1]]==row[match_cols[1]])].sample(n=1)
                    eids.loc[key,'control_match'] = match.index.values[0]
                    control = control[~control.index.isin(eids['control_match'])]
                matched_sample = matched_sample[~(matched_sample.index.isin(PD.index))]
                matched_sample = matched_sample.drop(index=eids['control_match'])
        else:
            if name != 'Depression':
                matched_sample = matched_sample[~(matched_sample.index.isin(depressed.index))]
            if name != 'ParkinsonDisease':
                matched_sample = matched_sample[~(matched_sample.index.isin(PD.index))]
        np.savetxt(f'{sample_path}/{name}_controlNo{drop_healthy}_acc_QC_NoDepressedNoPD.txt',matched_sample.index,fmt='%d')

# Now save each dataset

In [109]:
# merge curated datasets
match = True
dfs = []
for name,age_onset in zip(['AllCauseDementia','AllCauseParkinsonism','AlzheimerDisease',
        'MultipleSystemAtrophy','ProgressiveSupranuclearPalsy','FrontoTemporalDementia','VascularDementia',
            'Dystonia','Osteoarthritis','Depression','ParkinsonDisease'],[20,20,20,20,20,20,20,0,0,0,20]):
        drop_healthy='nonHC'
        merged_clean = pd.read_csv(f'{sample_path}/{name}_controlNo{drop_healthy}_acc_QC.csv').set_index('eid')
        matched_eid = pd.read_csv(f'{sample_path}/{name}_controlNo{drop_healthy}_match_accage_acc_QC.txt',
                                      header=None,names=['eid'])
        depressed = pd.read_csv(f'{sample_path}/Depression.csv').set_index('eid')
        PD = pd.read_csv(f'{sample_path}/ParkinsonDisease.csv').set_index('eid')
        diag_name = name
        merged_clean.loc[merged_clean[f'{name}_age']<=age_onset,f'{name}_age'] = merged_clean.filter(regex='_age')[merged_clean.filter(regex='_age')>age_onset].min(axis=1)
        merged_clean['diagnosis'] = merged_clean[name].replace([0,1],['Healthy',diag_name])
        merged_clean['acc_time_since_diagnosis'] = merged_clean[f'accelerometry_age'] - merged_clean[f'{name}_age']
        merged_clean['acc_time_to_diagnosis'] =  merged_clean[f'{name}_age'] - merged_clean[f'accelerometry_age']
        merged_clean['acc_incident'] = merged_clean[f'{name}_age'] > merged_clean[f'accelerometry_age']
        merged_clean.loc[merged_clean['acc_time_since_diagnosis'].isna(),'acc_incident'] = np.nan
        merged_clean['diagnosis_prod'] = merged_clean['diagnosis'].copy(deep=True)
        merged_clean.loc[np.logical_and(merged_clean['acc_incident']==1,merged_clean['diagnosis']==diag_name),'diagnosis_prod'] = 'Prodromal'
        merged_clean['diagnosis_prod_conservative'] = merged_clean['diagnosis_prod'].copy(deep=True)
        merged_clean.loc[np.logical_and(merged_clean['acc_time_to_diagnosis']<2,merged_clean['diagnosis_prod']=='Prodromal'),'diagnosis_prod_conservative'] = diag_name
        merged_clean = merged_clean[merged_clean['weartime_QC']==1]
        if match:
            matched_sample = merged_clean.loc[matched_eid['eid']]
            matched_sample['diagnosis'] = name
        else:
            matched_sample = merged_clean.copy()
        matched_sample['Status'] = matched_sample['diagnosis_prod_conservative'].replace(name,'Diseased')
        matched_sample['depressed'] = 0
        matched_sample.loc[np.intersect1d(depressed.index,matched_sample.index),'depressed'] = 1
        matched_sample['PD'] = 0
        matched_sample.loc[np.intersect1d(PD.index,matched_sample.index),'PD'] = 1
        dfs.append(matched_sample)
dfs = pd.concat(dfs)

In [102]:
# save unmatched: all HC, all population cases either NoDepressedNoPD or all
hc = dfs[dfs['diagnosis']=='Healthy']
dfs_new = dfs.drop(index=hc.index)
hc_unique = hc[~hc.index.duplicated(keep='first')]
subset = ['AllCauseDementia','AlzheimerDisease','AllCauseParkinsonism','ParkinsonDisease','Dystonia','Osteoarthritis','Depression']
dfs_new = dfs_new[dfs_new['diagnosis'].isin(subset)]
dfs_new = pd.concat([dfs_new,hc_unique])

In [103]:
#dfs_new.to_csv(f'{data_path}/phenotypes/accelerometer/unmatched_all_HCnoOsteo.csv')
#dfs_new.to_csv(f'{data_path}/phenotypes/accelerometer/unmatched_all_HCnoOsteo_NoDepressedNoPD.csv')
dfs_new.to_csv(f'{data_path}/phenotypes/accelerometer/unmatched_all_HCnoOsteo_genebloodrisk.csv')

In [110]:
# for matched version identify matches for each prod/pd case
match=True
match_cols=['accelerometry_age_rounded','male']
dfs[match_cols[0]] = dfs[match_cols[0].replace('_rounded','')].round(0)
dfs_new = []
subset = ['AllCauseDementia','AlzheimerDisease','AllCauseParkinsonism','ParkinsonDisease','Dystonia','Osteoarthritis','Depression']
for diagnosis in subset:
    print(diagnosis)
    if match:
        data = dfs[dfs['diagnosis']==diagnosis]
    else:
        data = dfs[dfs['diagnosis'].isin(['Healthy',diagnosis])]
    data['Status_group'] = 'None'
    control = data[data[diagnosis]==0]
    print(control.shape)
    for status in ['Diseased','Prodromal']:
        group = data[data['Status']==status] 
        eids = pd.DataFrame(index=group.index,columns=['control_match'])
        for key,row in group.iterrows():
            try:
                matcheid = control[(control[match_cols[0]]==row[match_cols[0]]) & (control[match_cols[1]]==row[match_cols[1]])].sample(n=1)
                eids.loc[key,'control_match'] = matcheid.index.values[0]
                control = control[~control.index.isin(eids['control_match'])]
            except:
                # no match found
                pass
        data.loc[eids.index,'Status_group'] = status
        data.loc[eids['control_match'].dropna(),'Status_group'] = status
    data = data.dropna(subset=['Status_group'])
    dfs_new.append(data)
dfs_new = pd.concat(dfs_new)
dfs_new['Group'] = (dfs_new['Status'] + '_' + dfs_new['Status_group']).replace(['Prodromal_Prodromal','Diseased_Diseased'],['Prodromal','Diseased'])

AllCauseDementia
(1369, 77)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Status_group'] = 'None'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cav

AlzheimerDisease
(212, 77)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Status_group'] = 'None'


AllCauseParkinsonism
(519, 77)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Status_group'] = 'None'


ParkinsonDisease
(469, 77)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Status_group'] = 'None'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cav

Dystonia
(81, 77)
Osteoarthritis
(13394, 77)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Status_group'] = 'None'


Depression
(9586, 77)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [111]:
dfs_new.to_csv(f'{data_path}/phenotypes/accelerometer/matched_all_HCnoOsteo.csv')
#dfs_new.to_csv(f'{data_path}/phenotypes/accelerometer/matched_all_HCnoOsteo_genebloodrisk.csv')

# Sup Table 1

In [27]:
dfs_new = pd.read_csv(f'{data_path}/phenotypes/accelerometer/unmatched_all_HCnoOsteo.csv',index_col=0)
PDmatch = pd.read_csv(f'{data_path}/phenotypes/accelerometer/matched_all_HCnoOsteo.csv',index_col=0)
PDmatch = PDmatch[PDmatch['diagnosis']=='ParkinsonDisease']
features_all = ['No_wear_time_bias_adjusted_average_acceleration',
                                             'accelerometry_age','visit_age',
                                               'acc_time_to_diagnosis']
fnames = {'No_wear_time_bias_adjusted_average_acceleration':'average acceleration [milligal]',
          'accelerometry_age':'age [years]','visit_age':'initial visit age [years]',
'acc_time_to_diagnosis':'time from acceleration data to diagnosis [years]'}
subset = ['AllCauseDementia','AlzheimerDisease','AllCauseParkinsonism','Dystonia','Osteoarthritis','Depression','Healthy']

In [33]:
table = dfs_new.groupby(['diagnosis','Status'])[features_all].agg(['mean','std']).rename(columns=fnames)
table_cat = dfs_new.groupby(['diagnosis','Status'])[['male','depressed','PD']].agg([lambda g: g.mean()*100])
table_size = dfs_new.groupby(['diagnosis','Status'])[['male']].agg(['size']).rename(columns={'male':'sample'})
tables = [table,table_cat,table_size]
tables = reduce(lambda  left,right: pd.merge(left,right,left_index=True,right_index=True,
                                            how='outer'), tables)

table = PDmatch.groupby(['diagnosis','Group'])[['No_wear_time_bias_adjusted_average_acceleration',
                                             'accelerometry_age','visit_age',
                                               'acc_time_to_diagnosis']].agg(['mean','std']).rename(columns={'No_wear_time_bias_adjusted_average_acceleration':'average acceleration [milligal]',
                                                                                                      'accelerometry_age':'age [years]','visit_age':'initial visit age [years]',
                                                                                                        
                                                                                                            'acc_time_to_diagnosis':'time from acceleration data to diagnosis [years]'})
table_cat = PDmatch.groupby(['diagnosis','Group'])[['male','depressed','PD']].agg([lambda g: g.mean()*100])
table_size = PDmatch.groupby(['diagnosis','Group'])[['male']].agg(['size']).rename(columns={'male':'sample'})
tablesPD = [table,table_cat,table_size]
tablesPD = reduce(lambda  left,right: pd.merge(left,right,left_index=True,right_index=True,
                                            how='outer'), tablesPD)
tablesPD.loc[(slice(None),['Healthy_Diseased','Healthy_Prodromal']),['time from acceleration data to diagnosis [years]']] = np.nan
tables.loc[(slice(None),['Healthy']),['time from acceleration data to diagnosis [years]']] = np.nan
tables.loc[(subset,slice(None),slice(None)),:]
tables = pd.concat([tablesPD,tables.loc[(subset,slice(None)),:]])
print(tables.round(2).to_latex())
tables.to_csv('/scratch/c.c21013066/data/ukbiobank/analyses/paper/prodromalPDacc/cohort_raw_HCnoOsteo_completeinfo.csv')

\begin{tabular}{llrrrrrrrrrrrr}
\toprule
        &         & \multicolumn{2}{l}{average acceleration [milligal]} & \multicolumn{2}{l}{age [years]} & \multicolumn{2}{l}{initial visit age [years]} & \multicolumn{2}{l}{time from acceleration data to diagnosis [years]} &     male & depressed &       PD & sample \\
        &         &                            mean &    std &        mean &   std &                      mean &   std &                                             mean &    std & <lambda> &  <lambda> & <lambda> &   size \\
diagnosis & Group &                                 &        &             &       &                           &       &                                                  &        &          &           &          &        \\
\midrule
ParkinsonDisease & Diseased &                           20.07 &   8.41 &       67.62 &  5.81 &                     61.96 &  5.81 &                                            -4.90 &   5.97 &    60.81 &     17.58 &   100.00 &    2

  print(tables.round(2).to_latex())


In [35]:
tables

Unnamed: 0_level_0,Unnamed: 1_level_0,average acceleration [milligal],average acceleration [milligal],age [years],age [years],initial visit age [years],initial visit age [years],time from acceleration data to diagnosis [years],time from acceleration data to diagnosis [years],male,depressed,PD,sample
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std,<lambda>,<lambda>,<lambda>,size
diagnosis,Group,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
ParkinsonDisease,Diseased,20.07381,8.405321,67.624154,5.812055,61.960571,5.80756,-4.902189,5.970795,60.805861,17.582418,100.0,273
ParkinsonDisease,Healthy_Diseased,26.789377,7.482977,67.624235,5.834623,61.751948,5.828059,,,60.805861,0.0,0.0,273
ParkinsonDisease,Healthy_Prodromal,26.323571,7.239688,69.237235,4.960416,63.253237,4.885422,,,69.387755,0.0,0.0,196
ParkinsonDisease,Prodromal,19.938367,7.439105,69.253132,4.965301,63.747499,4.817502,4.331318,1.295915,69.387755,12.755102,100.0,196
AllCauseDementia,Diseased,26.660081,8.862885,61.748858,8.162259,56.139398,8.126638,-7.917578,9.261143,37.221095,80.831643,1.217039,986
AllCauseDementia,Prodromal,23.352376,7.501628,69.760259,5.315065,64.115541,5.106116,4.660972,1.29487,53.002611,18.798956,13.577023,383
AlzheimerDisease,Diseased,26.010169,7.374207,69.618274,4.701989,63.609654,4.845792,-1.062753,2.734274,61.016949,23.728814,1.694915,59
AlzheimerDisease,Prodromal,24.149542,7.724969,70.478482,4.257444,64.780777,4.176705,4.791163,1.222173,46.405229,16.339869,5.882353,153
AllCauseParkinsonism,Diseased,20.628766,8.698801,67.206784,6.207731,61.558461,6.134287,-5.449525,6.506924,59.74026,19.155844,88.636364,308
AllCauseParkinsonism,Prodromal,20.004787,7.314587,69.078781,5.144211,63.544551,5.042844,4.339493,1.287435,67.298578,13.744076,92.890995,211


# population cohort with all HC info

In [168]:
# get and clean hc
extra = ['male','visit_age','TownsendDeprivationIndex']
acc = pd.read_csv(f'{sample_path}/ParkinsonDisease_controlNononHC_acc_QC.csv',index_col=0)
risk = pd.read_csv(f'{sample_path}/ParkinsonDisease_controlNononHC_risk.csv',index_col=0)
blood = pd.read_csv(f'{sample_path}/ParkinsonDisease_controlNononHC_blood.csv',index_col=0)
riskblood = pd.merge(blood,risk,right_index=True,left_index=True,how='outer',suffixes=['_drop',''])
riskblood = riskblood.drop(columns=riskblood.filter(regex='_drop').columns)
print(riskblood.columns)
hc = pd.merge(acc,riskblood,right_index=True,left_index=True,how='outer',suffixes=['','_drop'])
hc = hc.drop(columns=hc.filter(regex='_drop').columns)
depressed = pd.read_csv(f'{sample_path}/Depression.csv').set_index('eid')
PD = pd.read_csv(f'{sample_path}/ParkinsonDisease.csv').set_index('eid')
hc['Healthy_age'] = 999
hc['diagnosis'] = 'Healthy'
hc['Status'] = 'Healthy'
hc['depressed'] = 0
hc.loc[np.intersect1d(depressed.index,hc.index),'depressed'] = 1
hc['PD'] = 0
hc.loc[np.intersect1d(PD.index,hc.index),'PD'] = 1
hc['ParkinsonDisease'] = 0
hc.loc[np.intersect1d(PD.index,hc.index),'ParkinsonDisease'] = 1
hc = hc[hc['ParkinsonDisease']==0]
hc = hc[~hc.index.duplicated(keep='first')]
hc.to_csv('/scratch/c.c21013066/data/ukbiobank/phenotypes/accelerometer/allHCnoOsteo.csv')
# merge with dfs_new
#merged = pd.merge(dfs_new,hc)

Index(['Albumin', 'Alkalinephosphatase', 'Alanineaminotransferase',
       'ApolipoproteinA', 'ApolipoproteinB', 'Aspartateaminotransferase',
       'Urea', 'Calcium', 'Cholesterol', 'Creatinine', 'C_reactiveprotein',
       'CystatinC', 'Gammaglutamyltransferase', 'Glucose',
       'Glycatedhaemoglobin_HbA1c', 'HDLcholesterol', 'IGF_1', 'LDLdirect',
       'Phosphate', 'SHBG', 'Totalbilirubin', 'Testosterone', 'Totalprotein',
       'Triglycerides', 'Urate', 'VitaminD', 'AlcoholStatus_Current',
       'AlcoholStatus_Previous', 'SmokeStatus_Current', 'SmokeStatus_Previous',
       'DaytimeSleepiness_Often', 'AlcoholFrequency_LessThanWeekly',
       'family_Stroke', 'family_Diabetes', 'family_Severedepression',
       'family_Alzheimersdiseasedementia', 'family_Parkinsonsdisease', 'BMI',
       'Waist_Circumference', 'Hip_Circumference', 'Diastolic_BloodPressure',
       'PulseRate', 'BodyFat_Percentage', 'ParkinsonDisease', 'visit_age',
       'male', 'TownsendDeprivationIndex', 'Parki

In [128]:
# get and clean hc
acc = pd.read_csv(f'{sample_path}/ParkinsonDisease_controlNononHC_acc_QC.csv',index_col=0)
acc.columns

Index(['Overallaverage', 'Standard_deviationacceleration',
       'Monday_average_acceleration', 'Tuesday_average_acceleration',
       'Wednesday_average_acceleration', 'Thursday_average_acceleration',
       'Friday_average_acceleration', 'Saturday_average_acceleration',
       'Sunday_average_acceleration', 'Average0000___0059',
       'Average0100___0159', 'Average0200___0259', 'Average0300___0359',
       'Average0400___0459', 'Average0500___0559', 'Average0600___0659',
       'Average0700___0759', 'Average0800___0859', 'Average0900___0959',
       'Average1000___1059', 'Average1100___1159', 'Average1200___1259',
       'Average1300___1359', 'Average1400___1459', 'Average1500___1559',
       'Average1600___1659', 'Average1700___1759', 'Average1800___1859',
       'Average1900___1959', 'Average2000___2059', 'Average2100___2159',
       'Average2200___2259', 'Average2300___2359',
       'No_wear_time_bias_adjusted_average_acceleration',
       'No_wear_time_bias_adjustedstandard_dev