In [None]:
import datetime
import glob
import os
import sys
from importlib import reload

import numpy as np
import pandas as pd
import pylab as plt
import seaborn as sns

# packages borrowed and adapted from pypmi
sys.path.insert(1, '/scratch/c.c21013066/PPMI_DataPreparation/phenotype/')
import _info2021
import _utils
import _loaders
import _loadersSubitems
import _preprocess
import _thresholds2021 as thr

In [None]:
# Directory with the PPMI phenotype data; it is passed to the loaders and used
# as the output location for the cleaned CSV files written further down.
path = "/scratch/c.c21013066/data/ppmi/phenotypes2021"

In [None]:
# Load the demographics table, validate every column (except the first one)
# against the limits declared in thr.DEMOGRAPHIC_INFO and save the result.
demographics = _loaders.load_demographics(path=path)
reload(thr)  # pick up edits to the thresholds module without restarting the kernel
demographics_clean = demographics.copy(deep=True)

# For these (scale level -> column) pairs a missing value makes the whole row
# unusable, so such rows are removed instead of merely being reported.
required_columns = {'categorical': 'diagnosis', 'date': 'date_birth'}

for col in demographics.columns[1:]:
    print(col)
    info = thr.DEMOGRAPHIC_INFO[col]
    level = info['scale_level']
    print('    Are there NaN values? {}'.format(demographics_clean[col].isna().sum()))

    if required_columns.get(level) == col:
        print('    How many dropped due to no category? {}'.format(demographics_clean[col].isna().sum()))
        demographics_clean = demographics_clean.dropna(axis='rows', how='any', subset=[col])

    # Non-missing values of this column, taken after any row removal above.
    observed = demographics_clean[col].dropna()

    if level == 'categorical':
        # only the allowed categories may occur
        assert all(value in info['categories'] for value in set(observed))
    elif level == 'binomial':
        # every listed category must occur, and nothing else
        assert set(observed) == set(info['categories'])
    elif level in ('date', 'normal'):
        # dates and numeric scores share the same [min, max] range check
        assert (observed >= info['min']).all(), 'min exceeds bounds {}'.format(demographics_clean[col].min())
        assert (observed <= info['max']).all(), 'max exceeds bounds {}'.format(demographics_clean[col].max())

demographics_clean.to_csv('{}/demographics_clean.csv'.format(path))

In [None]:
# Check for outliers in the behavioural data.
# Steps: keep only participants with cleaned demographics, require a visit
# date, report (not assert) out-of-range values per column, drop rows without
# any listed measure, then clip / blank known invalid scores and save.
reload(thr)
behavior = _loaders.load_behavior(path = path)

# .copy() gives an independent frame, so the .loc assignments below do not
# write into a slice of `behavior` (avoids pandas' SettingWithCopyWarning).
behavior_clean = behavior[behavior['participant'].isin(demographics_clean['participant'])].copy()
print('Exclude subjects for which we do not know demographics: {}'.format(behavior.shape[0]-behavior_clean.shape[0]))
# The count is taken on the `date` column, so the message names the date.
print('Drop visits where no visit date given: {}'.format(behavior_clean['date'].isna().sum()))
behavior_clean = behavior_clean.dropna(axis='rows',how='all',subset=['date'])

# set date right: manual correction for one visit of one participant
fix_visit = (behavior_clean['participant']==3278) & (behavior_clean['visit']=='V14')
behavior_clean.loc[fix_visit,"date"] = pd.Timestamp(datetime.datetime(2018,12,1))

for col in behavior_clean.columns[2:]:
    print(col)
    info = thr.BEHAVIORAL_INFO[col]
    print(info)
    print('    Are there NaN values in composite? {}'.format(behavior_clean[col].isna().sum()))
    level = info['scale_level']

    if level == 'categorical': # check if only allowed categories used
        assert all(elem in info['categories']  for elem in set(behavior_clean[col].dropna()))

    if level == 'date' and col == 'date':
        # need visit date info to infer age, so rows without one are removed
        print('    How many dropped due to no visit date? {}'.format(behavior[col].isna().sum()))
        behavior_clean = behavior_clean.dropna(axis='rows',how='any',subset=[col])

    if level in ('date', 'normal'):
        # Bounds violations are only counted here; the known ones are repaired
        # explicitly after the loop.
        values = behavior_clean[col].dropna()
        print('    Min exceeded? {}'.format((values < info['min']).sum()))
        print('    Max exceeded? {}'.format((values > info['max']).sum()))

    if level == 'binomial':
        # show observed vs. allowed values side by side
        print(set(behavior_clean[col].dropna()))
        print(set(info['categories']))

# Drop rows in which every column listed in BEHAVIORAL_INFO is missing. Keys
# that are not columns of this frame are skipped so dropna cannot fail on them.
# NOTE(review): if 'date' is one of the keys this is effectively a no-op,
# because rows without a date were already removed - confirm the intended subset.
measure_cols = [key for key in thr.BEHAVIORAL_INFO.keys() if key in behavior_clean.columns]
behavior_clean = behavior_clean.dropna(axis='rows',how='all',subset=measure_cols)

# benton: negative scores are raised to 0
behavior_clean.loc[behavior_clean['benton']<0,'benton'] = 0
# hvlt recognition: clip to the range 0..12
behavior_clean.loc[behavior_clean['hvlt_recognition']<0,'hvlt_recognition'] = 0
behavior_clean.loc[behavior_clean['hvlt_recognition']>12,'hvlt_recognition'] = 12
# stai_trait / stai_state: scores below 20 are set to missing
# (presumably 20 is the scale minimum - TODO confirm against thr.BEHAVIORAL_INFO)
behavior_clean.loc[behavior_clean['stai_trait']<20,'stai_trait'] = np.nan
behavior_clean.loc[behavior_clean['stai_state']<20,'stai_state'] = np.nan

behavior_clean.to_csv("{}/behavior_clean.csv".format(path))

In [None]:
# DaTscan: keep rows of participants present in the cleaned demographics that
# have at least one of the four datscan_* values; print the shape after each step.
demographics_clean = pd.read_csv('{}/demographics_clean.csv'.format(path), index_col=0)
datscan = _loaders.load_datscan_all(path=path)
print(datscan.shape)

has_demographics = datscan['participant'].isin(demographics_clean['participant'])
datscan_clean = datscan[has_demographics]
print(datscan_clean.shape)

datscan_regions = ['datscan_putamen_r', 'datscan_caudate_r', 'datscan_putamen_l', 'datscan_caudate_l']
datscan_clean = datscan_clean.dropna(axis='rows', how='all', subset=datscan_regions)
print(datscan_clean.shape)

datscan_clean.to_csv('{}/datscan_clean.csv'.format(path))

In [None]:
# Biospecimen: keep rows of participants present in the cleaned demographics
# that have at least one marker value and a date; print the shape after each step.
reload(_loaders)
demographics_clean = pd.read_csv('{}/demographics_clean.csv'.format(path), index_col=0)
biospecimen = _loaders.load_biospecimen(path=path)
print(biospecimen.shape)

has_demographics = biospecimen['participant'].isin(demographics_clean['participant'])
biospecimen_clean = biospecimen[has_demographics]
print(biospecimen_clean.shape)

# Each pass removes rows in which all columns of the given subset are missing.
biomarkers = ['abeta_1-42', 'csf_alpha-synuclein', 'ptau', 'ttau', 'gfap', 'nfl']
for required in (biomarkers, ['date']):
    biospecimen_clean = biospecimen_clean.dropna(axis='rows', how='all', subset=required)
    print(biospecimen_clean.shape)

biospecimen_clean.to_csv('{}/biospecimen_clean.csv'.format(path))

In [None]:
# combine into one dataframe
# drop visit column, instead only deal with date
reload(_preprocess)
covs=['gender','education','diagnosis_bl_age','time','bl_age','visit_age','diagnosis','diagnosis_age','visit']
bio=['ptau','ttau','abeta_1-42','csf_alpha-synuclein']
dat=['datscan_putamen_l','datscan_putamen_r','datscan_caudate_l','datscan_caudate_r']

# Read the file written by the behaviour-cleaning cell; built from `path`
# instead of repeating the absolute location.
behavior = pd.read_csv(f'{path}/behavior_clean.csv',index_col=0)
targets_norm = ['updrs_i','updrs_ii','updrs_iii_OFF','updrs_iii_ON','updrs_iii_A','updrs_iii_NoMED','updrs_iv','semantic_fluency','upsit','epworth',
               'moca','stai_trait','stai_state','rbd','systolic_bp_drop','benton','gds','lns','quip','se_adl',
               'scopa_aut','hvlt_recall','hvlt_recognition','hvlt_retention','symbol_digit','tmtb-a']

# The second argument is the parent directory of `path`
# (same value as the previously hard-coded '/scratch/c.c21013066/data/ppmi').
data = _preprocess.combine_phenotypes_only_closest_final('behavior_clean',os.path.dirname(path),covs=covs,
               clinical=targets_norm,bio=bio,dat=dat)

# Rename 'tmtb-a' to 'tmtbminusa' and derive `clinical` from `targets_norm`
# with the same mapping, so the two lists cannot drift apart.
column_renames = {'tmtb-a':'tmtbminusa'}
data = data.rename(columns=column_renames)
clinical = [column_renames.get(name, name) for name in targets_norm]

In [None]:
# clean data
# Keep the diagnosis groups of interest, add the time since diagnosis in
# years and drop visits that carry no clinical score at all.
reload(_preprocess)
print("all data", data.shape,len(np.unique(data.index)))

# .copy() so that adding a column below modifies an independent frame and not
# a slice of `data` (avoids pandas' SettingWithCopyWarning).
data_clean = data[data['diagnosis'].isin(['pd','hc','prod'])].copy()
print("keep only pd, hc and prod", data_clean.shape,len(np.unique(data_clean.index)))

# Years between visit and diagnosis. A year is taken as 365.2425 days, the
# length pandas used to assign to np.timedelta64(1,'Y'); current pandas no
# longer accepts the ambiguous 'Y' unit in timedelta arithmetic.
data_clean['time_since_diagnosis'] = (data_clean['date'] - data_clean['date_diagnosis']) / pd.Timedelta(days=365.2425)
data_clean = data_clean.dropna(subset=clinical,how='all',axis='rows')
print(data_clean['diagnosis'].value_counts())
print("only visits where clinical data available",data_clean.shape,len(np.unique(data_clean.index)))

# NOTE(review): get_n_visits is neither defined nor imported in the visible
# part of this notebook - presumably it belongs to _preprocess; confirm.
data_clean = get_n_visits(data_clean)

In [None]:
# Build a consistent OFF-medication MDS-UPDRS III score per visit.
#   1. Per participant, find the date of the first visit with med == 1; for
#      participants never flagged, use their last visit plus one year.
#   2. Fill missing OFF scores from NoMED scores, from visits before that date
#      and from the unlabelled score ('updrs_iii_A') when only ON is known.
#   3. Produce a long-format table (updrs_iii_melt) and merge the corrected
#      ON/OFF scores back into the full data (data_cleaned).
updrs_cols = ['updrs_iii_OFF','updrs_iii_ON','updrs_iii_A','updrs_iii_NoMED']

first_med = pd.read_csv(f'{path}/MDS-UPDRS_Part_III.csv',usecols=['PATNO','EVENT_ID','INFODT','PDTRTMNT'],dtype={'PATNO':'int'},parse_dates=['INFODT']).rename(columns=
{'PATNO':'participant','EVENT_ID':'visit','INFODT':'date','PDTRTMNT':'med'})
first_med = first_med.set_index(['participant','date']).sort_index()
# Participants without any med == 1 visit: last such visit date, shifted by one year.
no_med = first_med[np.logical_or(first_med['med']==0,first_med['med'].isna())].reset_index().groupby('participant').last()
no_med['date'] = no_med['date'] + pd.offsets.DateOffset(years=1)
first_med = first_med[first_med['med']==1]
first_med = first_med.reset_index().groupby('participant').first()
no_med = no_med[~no_med.index.isin(first_med.index)]
# One row per participant: the two groups are disjoint after the filter above.
first_med = pd.concat([first_med,no_med])

updrs_iii_data = data_clean[list(covs) + list(bio) + list(dat) + ['date','time_since_diagnosis'] + updrs_cols].dropna(
    axis='rows',how='all',subset=updrs_cols)
print('How many OFF data missing', updrs_iii_data['updrs_iii_OFF'].isna().sum())

# all non medicated people when visit is OFF
# (.to_numpy() makes the assignments positional, independent of repeated index labels)
has_nomed = updrs_iii_data['updrs_iii_NoMED'].notna()
updrs_iii_data.loc[has_nomed,'updrs_iii_OFF'] = updrs_iii_data.loc[has_nomed,'updrs_iii_NoMED'].to_numpy()
print('NoMed visits are OFF',updrs_iii_data['updrs_iii_OFF'].isna().sum())

# all visits before first medication and no ON info are OFF
# Computed for all rows at once. The earlier per-participant loop selected only
# the 'date' column, so reading 'updrs_iii_ON' always failed and the bare
# except silently marked every visit as False.
participant_ids = updrs_iii_data.reset_index()['participant'].to_numpy()
first_med_date = pd.to_datetime(first_med['date'].reindex(participant_ids)).to_numpy()
visit_date = pd.to_datetime(updrs_iii_data['date']).to_numpy()
# Participants missing from first_med get NaT, and any comparison with NaT is False.
updrs_iii_data['before_med'] = (visit_date < first_med_date) & updrs_iii_data['updrs_iii_ON'].isna().to_numpy()

before_med = updrs_iii_data['before_med']
# 'updrs_iii_A' is the unlabelled score; a column 'updrs_iii_NAN' does not exist in this frame.
updrs_iii_data.loc[before_med,'updrs_iii_OFF'] = updrs_iii_data.loc[before_med,['updrs_iii_OFF','updrs_iii_A','updrs_iii_NoMED']].median(axis=1).to_numpy()
print('All visits before first med mention and before ON first reported are OFF',updrs_iii_data['updrs_iii_OFF'].isna().sum())

# when there is a ON visit but no OFF, the unclear visit is the OFF one
on_without_off = (updrs_iii_data['updrs_iii_ON'].notna() & updrs_iii_data['updrs_iii_OFF'].isna()).to_numpy()
updrs_iii_data.loc[on_without_off,'updrs_iii_OFF'] = updrs_iii_data.loc[on_without_off,['updrs_iii_OFF','updrs_iii_A']].median(axis=1).to_numpy()
print('All visits where ON is there and an undetermined, that one must be the OFF',updrs_iii_data['updrs_iii_OFF'].isna().sum())
print(updrs_iii_data['updrs_iii_OFF'].isna().sum())

# melt: one row per (visit, medication state) with a single 'updrs_iii' score
updrs_iii_melt = pd.melt(frame=updrs_iii_data.reset_index(),
                         id_vars=['participant','date','time_since_diagnosis'] + list(covs) + list(bio) + list(dat),
                         value_vars=['updrs_iii_OFF','updrs_iii_ON','updrs_iii_A'],
                         var_name='ON',value_name='updrs_iii')
updrs_iii_melt = updrs_iii_melt.dropna(how='all',axis='rows',subset=['updrs_iii'])
# turn the source column name into an ON flag: False = OFF, True = ON, NaN = unlabelled
updrs_iii_melt['ON'] = updrs_iii_melt['ON'].map({'updrs_iii_OFF': False,'updrs_iii_ON': True,'updrs_iii_A': np.nan})

# Original ON/OFF columns of data_clean get the suffix '_old'; the corrected ones keep their names.
data_cleaned = pd.merge(data_clean,updrs_iii_data[['updrs_iii_ON','updrs_iii_OFF','date']],on=['participant','date'],how='outer',suffixes=('_old',""))

data_cleaned['updrs_iii_OFF'] = data_cleaned['updrs_iii_OFF'].fillna(data_cleaned['updrs_iii_NoMED'])

In [None]:
# Save the merged table next to the other cleaned files; the location is built
# from `path` (same file as before, without repeating the absolute directory).
data_cleaned.to_csv(f'{path}/clinical_progression_raw_clean.csv')