In [26]:
import numpy as np
import pandas as pd
from functools import reduce

from importlib import reload
import itertools
from itertools import count

import sys

# CHANGE PATHS
sys.path.insert(1,'/scratch/c.c21013066/software/ukbb_parser/ukbb_parser')
sys.path.insert(1,'/scratch/c.c21013066/software/ukbb_parser/ukbb_parser/shared_utils')
import ukbb_parser as ukbb_parser
import ukbb_phenotype_dataset as ukbb_phenotype_dataset
from shared_utils.util import summarize
# CHANGE PATHS
sys.path.insert(1,'../../resources')
import phenotypesnew as pheno_info
# CHANGE PATHS
sys.path.insert(1,'../../resources/utils')
import _preprocess
import _get_data

In [2]:
# Root folder of the UK Biobank data; later cells read from and write to sub-folders of it
# (phenotypes/, record_level/). Cluster-specific absolute path: change it for your own setup.
data_path = '/scratch/c.c21013066/data/ukbiobank'
# Folder with the per-disorder sample CSVs: passed as `save=` to the extraction helper and
# used when reading/writing healthy_not_nonHC.csv further down.
sample_path = f'{data_path}/sample/withGP/noOsteo'

# Extract Death and Birth info for Prevalence/Incidence normalisation

In [None]:
# Number of subjects to read: None loads the complete dataset, a small integer gives a quick
# dry run of the code.
nrows = None

# Demographic fields (everything listed under DEMOGRAPHICS in the phenotype specification).
# Relatives and all ethnicities are kept. Genotype metadata is not loaded because that needs
# imputed gene data at the locations configured in .ukbb_paths.py.
# The call returns the subject identifiers (eid), the requested fields, and some generic
# covariates chosen by the author of the ukbb_parser package.
eid, demographics, covariates = ukbb_phenotype_dataset.create_phenotype_dataset(
    pheno_info.DEMOGRAPHICS,
    nrows=nrows,
    parse_dataset_covariates_kwargs={'use_genotyping_metadata':False},
    no_kinship=False,
    only_caucasians=False,
)
demographics = demographics.assign(eid=eid).set_index('eid')

# Ethnicity comes as numeric codes (e.g. 1 means white). The helper turns them into dummy
# columns named after their meaning (ethnicity_white, ethnicity_asian, ...) holding 1/0.
ethnicity_dummies = _preprocess.recode_ethnicity(demographics[['ethnicity']],1001)
demographics = pd.merge(demographics,ethnicity_dummies,on='eid')

# UK Biobank only provides year and month of birth; the helper combines them into one
# datetime using the 15th as the day for everybody.
demographics = _preprocess.get_birthdate(demographics)

# Death fields, extracted with the same settings. Here eid is kept as an ordinary column.
eid, death, covariates = ukbb_phenotype_dataset.create_phenotype_dataset(
    pheno_info.DEATH,
    nrows=nrows,
    parse_dataset_covariates_kwargs={'use_genotyping_metadata':False},
    no_kinship=False,
    only_caucasians=False,
)
death = death.assign(eid=eid)

In [None]:
# Write both tables to <data_path>/phenotypes. `death` still carries eid as a normal column
# (plus the default index), whereas `demographics` is indexed by eid.
death.to_csv(f'{data_path}/phenotypes/death.csv')
demographics.to_csv(f'{data_path}/phenotypes/demo.csv')

# Extract Disease info

In [None]:
# Re-import the helper modules so that edits made since their first import take effect.
for module in (_preprocess, pheno_info, ukbb_parser, ukbb_phenotype_dataset):
    reload(module)

# Number of subjects to read: None loads everyone, a small integer gives a quick test run.
nrows = None

# Demographics: same steps as in the birth/death section (index by eid, dummy-code
# ethnicity, build a birth date).
eid, demographics, covariates = ukbb_phenotype_dataset.create_phenotype_dataset(
    pheno_info.DEMOGRAPHICS,
    nrows=nrows,
    parse_dataset_covariates_kwargs={'use_genotyping_metadata':False},
    no_kinship=False,
    only_caucasians=False,
)
demographics = demographics.assign(eid=eid).set_index('eid')
ethnicity_dummies = _preprocess.recode_ethnicity(demographics[['ethnicity']],1001)
demographics = pd.merge(demographics,ethnicity_dummies,on='eid')
demographics = _preprocess.get_birthdate(demographics)

# Assessment-centre fields; every row is tagged as visit 0 and the visit date is parsed
# (unparseable dates become NaT).
eid, baseline, covariates = ukbb_phenotype_dataset.create_phenotype_dataset(
    pheno_info.ASSESSMENTS,
    nrows=nrows,
    parse_dataset_covariates_kwargs={'use_genotyping_metadata':False},
    no_kinship=False,
    only_caucasians=False,
)
baseline = baseline.assign(eid=eid).set_index('eid')
baseline = baseline.assign(
    visit=0,
    date_visit=lambda frame: pd.to_datetime(frame['date_visit'],format='%Y-%m-%d',errors='coerce'),
)

# Genetic principal components, used later as covariates. Columns are renamed to
# PC_0, PC_1, ... in their existing order.
eid, pcs, covariates = ukbb_parser.create_dataset(
    pheno_info.GeneticPCs,
    nrows=nrows,
    parse_dataset_covariates_kwargs={'use_genotyping_metadata':False},
    no_kinship=False,
    only_caucasians=False,
)
pcs = pcs.assign(eid=eid).set_index('eid')
pcs.columns = [f'PC_{position}' for position, _ in enumerate(pcs.columns)]

# Diagnoses from the three coded sources; each call passes the `code` argument used for
# that source.
eid, icd10diagnoses, covariates = ukbb_phenotype_dataset.create_phenotype_dataset(
    pheno_info.DIAGNOSESICD10,
    nrows=nrows,
    parse_dataset_covariates_kwargs={'use_genotyping_metadata':False},
    no_kinship=False,
    only_caucasians=False,
    code='19',
)
icd10diagnoses = icd10diagnoses.assign(eid=eid).set_index('eid')

eid, icd9diagnoses, covariates = ukbb_phenotype_dataset.create_phenotype_dataset(
    pheno_info.DIAGNOSESICD9,
    nrows=nrows,
    parse_dataset_covariates_kwargs={'use_genotyping_metadata':False},
    no_kinship=False,
    only_caucasians=False,
    code='87',
)
icd9diagnoses = icd9diagnoses.assign(eid=eid).set_index('eid')

eid, selfdiagnoses, covariates = ukbb_phenotype_dataset.create_phenotype_dataset(
    pheno_info.DIAGNOSESSELF,
    nrows=nrows,
    parse_dataset_covariates_kwargs={'use_genotyping_metadata':False},
    no_kinship=False,
    only_caucasians=False,
    code='6',
)
selfdiagnoses = selfdiagnoses.assign(eid=eid).set_index('eid')

# GP (primary care) diagnoses are handled by a dedicated routine of the phenotype module.
pheno_info.run_gpdiagnosis(pheno_info.DIAGNOSESGP)

In [None]:
# Date-of-diagnosis information for the main disorders, one table per source.
# NOTE(review): the prodromal-symptom cell below assigns the same variable names
# (collapsegp, gp_diags, collapseicd10, date_diagicd10, collapseicd9, date_diagicd9);
# the "Combine data" cell uses whichever of the two cells ran last.

# Self-report: the four disorders listed in `collapse_list`, returned under `names`.
selfreport = _preprocess.get_selfreported_diagnoses(
    demographics,
    collapse_list=['parkinsonsdisease','dementiaalzheimerscognitiveimpairment',
                   'osteoarthritis','depression'],
    nrows=nrows,
    names=['ParkinsonDisease','AllCauseDementia','Osteoarthritis','Depression'])

# GP records: one CSV per disorder under <data_path>/record_level, each with a
# `<name>_date` column parsed as datetime; outer-merge them on eid.
# (presumably these files are written by pheno_info.run_gpdiagnosis — confirm)
collapsegp = ['gp_ParkinsonDisease','gp_OtherParkinsonism','gp_AllCauseDementia','gp_AllCauseParkinsonism',
              'gp_AlzheimerDisease','gp_MultipleSystemAtrophy','gp_ProgressiveSupranuclearPalsy',
              'gp_FrontoTemporalDementia','gp_VascularDementia',
              'gp_Dystonia','gp_Osteoarthritis','gp_Depression','gp_neurology','gp_nonHC']
gp_diags = [pd.read_csv(f'{data_path}/record_level/{diagnosis}.csv',parse_dates=[f'{diagnosis}_date'])
            for diagnosis in collapsegp]
gp_diags = reduce(lambda left,right: pd.merge(left,right,on='eid',how='outer'), gp_diags)

# ICD-10: look up the code list of every disorder in the phenotype specification.
# codes_list is derived from collapseicd10 so the two can never get out of order.
collapseicd10 = ['icd10_ParkinsonDisease','icd10_OtherParkinsonism','icd10_AllCauseDementia','icd10_AllCauseParkinsonism',
                 'icd10_AlzheimerDisease','icd10_MultipleSystemAtrophy','icd10_ProgressiveSupranuclearPalsy',
                 'icd10_FrontoTemporalDementia','icd10_VascularDementia','icd10_Dystonia','icd10_Osteoarthritis',
                 'icd10_Depression','icd10_neurology','icd10_nonHC']
codes = pd.DataFrame.from_dict(pheno_info.DIAGNOSESICD10)
codes_list = [codes.loc[codes['name']==disorder,'codings'].values.tolist()[0] for disorder in collapseicd10]
date_diagicd10 = _preprocess.get_icd10diagnosis_source_date(codes_list,collapseicd10,nrows=nrows)
# flatten the two-level column labels into single strings joined by '_'
date_diagicd10.columns = date_diagicd10.columns.map('_'.join).str.strip('_')

# ICD-9: same procedure for the disorders that are looked up in ICD-9.
collapseicd9 = ['icd9_ParkinsonDisease','icd9_AllCauseDementia','icd9_AllCauseParkinsonism','icd9_AlzheimerDisease',
                'icd9_FrontoTemporalDementia','icd9_VascularDementia','icd9_Dystonia','icd9_Osteoarthritis',
                'icd9_Depression','icd9_neurology','icd9_nonHC']
codes = pd.DataFrame.from_dict(pheno_info.DIAGNOSESICD9)
codes_list = [codes.loc[codes['name']==disorder,'codings'].values.tolist()[0] for disorder in collapseicd9]
date_diagicd9 = _preprocess.get_icd9diagnosis_source_date(codes_list,collapseicd9,nrows=nrows)
date_diagicd9.columns = date_diagicd9.columns.map('_'.join).str.strip('_')

In [None]:
# Date-of-diagnosis information for the prodromal symptoms.
# NOTE(review): this cell overwrites collapsegp, gp_diags, collapseicd10, date_diagicd10,
# collapseicd9 and date_diagicd9 from the main-disorder cell above. Run only the one you
# need before the "Combine data" cell, which uses whichever ran last.

# GP records: one CSV per symptom under <data_path>/record_level, each with a
# `<name>_date` column parsed as datetime; outer-merge them on eid.
collapsegp = ['gp_Depression','gp_Anxiety','gp_Constipation','gp_ErectileDysfunction','gp_UrinaryIncontinence', 'gp_Hyposmia',
              'gp_RBD','gp_OrthostaticHypotension']
gp_diags = [pd.read_csv(f'{data_path}/record_level/{diagnosis}.csv',parse_dates=[f'{diagnosis}_date'])
            for diagnosis in collapsegp]
gp_diags = reduce(lambda left,right: pd.merge(left,right,on='eid',how='outer'), gp_diags)

# ICD-10: look up the code list of every symptom in the phenotype specification.
# codes_list is derived from collapseicd10 so the two can never get out of order.
collapseicd10 = ['icd10_Depression','icd10_Anxiety','icd10_Constipation','icd10_ErectileDysfunction',
                 'icd10_UrinaryIncontinence','icd10_Hyposmia','icd10_RBD',
                 'icd10_OrthostaticHypotension']
codes = pd.DataFrame.from_dict(pheno_info.DIAGNOSESICD10)
codes_list = [codes.loc[codes['name']==symptom,'codings'].values.tolist()[0] for symptom in collapseicd10]
date_diagicd10 = _preprocess.get_icd10diagnosis_source_date(codes_list,collapseicd10,nrows=nrows)
# flatten the two-level column labels into single strings joined by '_'
date_diagicd10.columns = date_diagicd10.columns.map('_'.join).str.strip('_')

# ICD-9: same procedure (no RBD entry in this list).
collapseicd9 = ['icd9_Depression','icd9_Anxiety','icd9_Constipation','icd9_ErectileDysfunction',
                'icd9_UrinaryIncontinence','icd9_Hyposmia','icd9_OrthostaticHypotension']
codes = pd.DataFrame.from_dict(pheno_info.DIAGNOSESICD9)
codes_list = [codes.loc[codes['name']==symptom,'codings'].values.tolist()[0] for symptom in collapseicd9]
date_diagicd9 = _preprocess.get_icd9diagnosis_source_date(codes_list,collapseicd9,nrows=nrows)
date_diagicd9.columns = date_diagicd9.columns.map('_'.join).str.strip('_')

In [None]:
# for HC definition we do not require age
# Add all-NaN self-reported age columns for the 'neurology' and 'nonHC' groups.
# NOTE(review): presumably downstream helpers expect a `selfreported_<name>_age` column for
# every self-report name passed to them — confirm against _preprocess.
selfreport['selfreported_neurology_age'] = np.nan
selfreport['selfreported_nonHC_age'] = np.nan

## Combine data

- after loading everything you need, merge it into one dataframe
- use functions from `_preprocess` to derive new fields: visit age and age at diagnosis (the latter only works when the diagnosis dates were extracted via the `date_diag...` route)

In [None]:
# Tables to combine, in merge order; the genetic PCs and the GP diagnoses are included.
dfs = [demographics,baseline,icd10diagnoses,icd9diagnoses,selfdiagnoses,date_diagicd10,date_diagicd9,selfreport,pcs,gp_diags]

# Outer-join everything on eid, one table after the other. When a column name clashes, the
# copy coming from the left gets the "_x" suffix and the right-hand one keeps its name.
merged = dfs[0]
for table in dfs[1:]:
    merged = pd.merge(merged,table,on='eid',how='outer',suffixes=["_x",''])

# Turn dates into ages: first the visit, then the diagnoses of each source.
merged = _preprocess.get_visit_age(merged)
for diagnosis_columns, extra_arguments in (
        (collapseicd10, {}),
        (collapseicd9, {'source':['hospital']}),
        (collapsegp, {'source':[]})):
    merged = _preprocess.get_diagnosis_age(merged,diags=diagnosis_columns,**extra_arguments)

merged = merged.set_index("eid")

In [None]:
# For every disorder extracted above, create a CSV file with all subjects who have it plus
# date/source information.
reload(_preprocess)

# Disorders to export; the ICD-10 and GP column names are the disorder name with a prefix.
names = ['ParkinsonDisease','OtherParkinsonism','AllCauseDementia','AllCauseParkinsonism','AlzheimerDisease',
         'MultipleSystemAtrophy','ProgressiveSupranuclearPalsy','FrontoTemporalDementia','VascularDementia',
         'Dystonia','Osteoarthritis','Depression','neurology','nonHC']
icd10 = [f'icd10_{disorder}' for disorder in names]
gps = [f'gp_{disorder}' for disorder in names]

# Where no ICD-9 code exists for a disorder, the placeholder 'fillicd9' is used and ignored.
without_icd9 = {'OtherParkinsonism','MultipleSystemAtrophy','ProgressiveSupranuclearPalsy'}
icd9 = ['fillicd9' if disorder in without_icd9 else f'icd9_{disorder}' for disorder in names]

# Where no self-report code exists, the placeholder 'fillself' is used and ignored.
# AllCauseParkinsonism uses the Parkinson's disease self-report column.
selfreport_column = {
    'ParkinsonDisease': 'selfreported_ParkinsonDisease',
    'AllCauseDementia': 'selfreported_AllCauseDementia',
    'AllCauseParkinsonism': 'selfreported_ParkinsonDisease',
    'Osteoarthritis': 'selfreported_Osteoarthritis',
    'Depression': 'selfreported_Depression',
    'neurology': 'selfreported_neurology',
    'nonHC': 'selfreported_nonHC',
}
selfes = [selfreport_column.get(disorder,'fillself') for disorder in names]

# Covariates to export alongside: sex, the first 20 genetic PCs and the birth date.
covariates = np.array(['male', *(f'PC_{i}' for i in range(20)), 'date_birth'])

# Write one file per disorder into sample_path, including the covariates.
_preprocess.extract_disorder_withGP(merged,icd10,icd9,selfes,gps,names,save=sample_path,covariates=covariates)

In [None]:
# For every prodromal symptom extracted above, create a CSV file with all subjects who have
# it plus date/source information.
# One row per symptom: (name, self-report column, ICD-9 column). The placeholders
# 'fillself' / 'fillicd9' mark a missing self-report / ICD-9 code and are ignored.
# NOTE(review): the self-report cell visible in this notebook only builds ParkinsonDisease,
# AllCauseDementia, Osteoarthritis and Depression — confirm the selfreported_* columns
# named here exist in `merged`.
symptom_table = [
    ('Anxiety',                'selfreported_Anxiety',             'icd9_Anxiety'),
    ('Constipation',           'selfreported_Constipation',        'icd9_Constipation'),
    ('ErectileDysfunction',    'selfreported_ErectileDysfunction', 'icd9_ErectileDysfunction'),
    ('UrinaryIncontinence',    'selfreported_UrinaryIncontinence', 'icd9_UrinaryIncontinence'),
    ('Hyposmia',               'fillself',                         'icd9_Hyposmia'),
    ('RBD',                    'fillself',                         'fillicd9'),
    ('OrthostaticHypotension', 'fillself',                         'icd9_OrthostaticHypotension'),
]
names = [row[0] for row in symptom_table]
selfes = [row[1] for row in symptom_table]
icd9 = [row[2] for row in symptom_table]
# ICD-10 and GP columns exist for every symptom and only differ by prefix.
icd10 = [f'icd10_{symptom}' for symptom in names]
gps = [f'gp_{symptom}' for symptom in names]

# Covariates to export alongside: sex, the first 20 genetic PCs and the birth date.
covariates = np.array(['male', *(f'PC_{i}' for i in range(20)), 'date_birth'])

# Write one file per symptom into sample_path (change the path if needed), including the
# covariates.
_preprocess.extract_disorder_withGP(merged,icd10,icd9,selfes,gps,names,save=sample_path,covariates=covariates)

# Manually ensure HC has no Osteoarthritis

In [21]:
# Remove every healthy control that also appears in the osteoarthritis sample, then
# overwrite the healthy-control file in place (so re-running starts from the reduced file).
hc = pd.read_csv(f'{sample_path}/healthy_not_nonHC.csv').set_index('eid')
osteo = pd.read_csv(f'{sample_path}/Osteoarthritis.csv')
in_osteo_sample = hc.index.isin(osteo.eid)
hc = hc.loc[~in_osteo_sample]
hc.to_csv(f'{sample_path}/healthy_not_nonHC.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


## Manually ensure HC not treated for Parkinsonism

In [None]:
# Medication reported in the verbal interview.
reload(pheno_info)
reload(ukbb_parser)
reload(_preprocess)
# Prerequisites, as noted by the original author:
# - phenotypes.py must declare the medication data field, e.g.
#     MEDICATION = [
#         ('medication', 20003, 'raw')
#     ]
# - CODINGS_DIR must contain ukbb_ATC_to_coding4.xls (the xlsx file converted to xls).
# The medication helper looks up the requested ATC codes in that spreadsheet to find the
# matching UKBB codings, then checks per subject whether any of them was ever reported.
# It returns one column (named via `name`) holding 1 if so and 0 otherwise. Merge the
# result on 'eid' with an extracted patient group to see who took the drugs.
nrows = None
eid, raw, covariates = ukbb_parser.create_dataset(
    pheno_info.MEDICATION,
    nrows=nrows,
    parse_dataset_covariates_kwargs={'use_genotyping_metadata':False},
    no_kinship=False,
    only_caucasians=False,
)
# Store the codings as integer strings and mark missing entries with the string 'nan'.
raw = raw.fillna(0).astype(int).astype(str)
raw = raw.mask(raw == '0', 'nan')
raw = raw.assign(eid=eid).set_index('eid')

# First all anti-parkinsonism drugs (ATC N04), then levodopa (ATC N04BA); one CSV each.
for atc_codes, column_name, file_stem in (
        (['N04'], 'antiparkinsonism', 'AntiParkinsonism'),
        (['N04BA'], 'Dopa', 'Dopa')):
    drugs = _preprocess.extract_medication(raw,atc_code=atc_codes,name=column_name)
    drugs.to_csv(f'{data_path}/phenotypes/{file_stem}.csv')

In [None]:
# get GP INFO MEDICATION
# Runs the GP-prescription routine of the phenotype module on the GPDRUGS specification.
# NOTE(review): presumably this produces record_level/gp_AntiParkinsonism_first.csv, which
# the next cell reads — confirm.
pheno_info.run_gpmedication(pheno_info.GPDRUGS)

In [20]:
# Remove healthy controls who were ever treated with anti-parkinsonism drugs, according to
# GP prescriptions or the verbal interview, then overwrite the healthy-control file.
hc = pd.read_csv(f'{sample_path}/healthy_not_nonHC.csv')

# GP prescriptions: keep only rows flagged with 1.
antiparkinsonism = pd.read_csv(f'{data_path}/record_level/gp_AntiParkinsonism_first.csv',
                               parse_dates=['gp_AntiParkinsonism_date'])
antiparkinsonism = antiparkinsonism.loc[antiparkinsonism['gp_AntiParkinsonism'] == 1]

# Verbal interview: keep only rows flagged with 1.
antiparkinsonism2 = pd.read_csv(f'{data_path}/phenotypes/AntiParkinsonism.csv')
antiparkinsonism2 = antiparkinsonism2.loc[antiparkinsonism2['antiparkinsonism'] == 1]

# Report the size before filtering, then after each source has been removed.
print(hc.shape)
hc = hc.set_index('eid')
for treated in (antiparkinsonism, antiparkinsonism2):
    hc = hc.loc[~hc.index.isin(treated.eid)]
    print(hc.shape,treated.shape)

hc.to_csv(f'{sample_path}/healthy_not_nonHC.csv')

(211289, 30)
(211243, 29) (513, 3)
(211149, 29) (1620, 2)


# Extract Accelerometer data

In [None]:
# Number of subjects to read: None loads the complete dataset.
nrows = None

# Demographics: index by eid, dummy-code ethnicity, build a birth date.
eid, demographics, covariates = ukbb_phenotype_dataset.create_phenotype_dataset(
    pheno_info.DEMOGRAPHICS,nrows=nrows,
    parse_dataset_covariates_kwargs={'use_genotyping_metadata':False},
    no_kinship=False, only_caucasians=False)
demographics['eid'] = eid
demographics.set_index('eid',inplace=True)
demographics = pd.merge(demographics,_preprocess.recode_ethnicity(demographics[['ethnicity']],1001),on='eid')
demographics = _preprocess.get_birthdate(demographics)

# Assessment-centre fields: tag as visit 0 and parse the visit date (bad dates become NaT).
eid, baseline, covariates = ukbb_phenotype_dataset.create_phenotype_dataset(
    pheno_info.ASSESSMENTS,nrows=nrows,
    parse_dataset_covariates_kwargs={'use_genotyping_metadata':False},
    no_kinship=False, only_caucasians=False)
baseline['eid'] = eid
baseline.set_index('eid',inplace=True)
baseline['visit'] = 0
baseline['date_visit'] = pd.to_datetime(baseline['date_visit'],format='%Y-%m-%d',errors='coerce')

# ICD-10 diagnoses. NOTE(review): unlike the other tables, eid stays an ordinary column
# here (no set_index) — confirm this is intended for the merge in the next cell.
eid, icd10diagnoses, covariates = ukbb_phenotype_dataset.create_phenotype_dataset(
    pheno_info.DIAGNOSESICD10,nrows=nrows,
    parse_dataset_covariates_kwargs={'use_genotyping_metadata':False},
    no_kinship=False, only_caucasians=False,code='19')
icd10diagnoses['eid'] = eid

# Accelerometer fields. `nrows` is passed by keyword like in every other call, so the
# call no longer depends on the position of that parameter.
eid, accelerometer, covariates = ukbb_phenotype_dataset.create_phenotype_dataset(
    pheno_info.ACCELEROMETER,nrows=nrows,
    parse_dataset_covariates_kwargs={'use_genotyping_metadata':False},
    no_kinship=False, only_caucasians=False)
accelerometer['eid'] = eid
accelerometer['date_accelerometry'] = pd.to_datetime(accelerometer['date_accelerometry'],format='%Y-%m-%d',errors='coerce')
accelerometer.set_index('eid',inplace=True)

In [None]:
# Combine the tables: inner join on eid, so only subjects present in all four remain.
dfs = [demographics,baseline,accelerometer,icd10diagnoses]
merged = reduce(lambda left,right: pd.merge(left,right,on='eid'), dfs)
merged = _preprocess.get_visit_age(merged)
# Age at accelerometry in years. The year unit of np.timedelta64 is ambiguous and is
# rejected by pandas >= 2.0, so convert to days and divide by the mean Gregorian year
# length (365.2425 days, the value numpy uses for one 'Y').
days_since_birth = (merged['date_accelerometry'] - merged['date_birth']) / np.timedelta64(1,'D')
merged['accelerometry_age'] = days_since_birth / 365.2425
merged = merged.set_index('eid')

In [None]:
# Save the merged demographics + accelerometer table (the original `data_oath` was a typo
# for `data_path` and raised a NameError).
merged.to_csv(f'{data_path}/phenotypes/demo_acc.csv')

# Extract Lifestyle and Blood info

In [None]:
# None = read every subject.
nrows=None
# Collection of tables returned by the project helper; a later cell uses dfs[5].columns
# as the blood predictors.
dfs = _get_data.get_risks(nrows=nrows)
# Merge the tables into one frame using the project helper.
merged = _get_data.merge_data(dfs)

[2023_02_07-19:21:41] Reading all dataset rows of 24 columns (for 10 fields)...
[2023_02_07-19:24:54] Finished after 0:03:13.908705.
[2023_02_07-19:24:54] Read a dataset of 502462 samples.
[2023_02_07-19:24:55] Knowing of 63 samples who have wished to withdraw, 49 of them are in the loaded dataset. Filtering out these records, the dataset has reduced from 502462 to 502413 samples.
[2023_02_07-19:24:55] Parsing field sex...
[2023_02_07-19:24:55] Parsing field year_of_birth...
[2023_02_07-19:24:55] To avoid the "dummy variable trap", removing the AC_leeds column (44186 matching records).
[2023_02_07-19:24:56] Parsing field male...
[2023_02_07-19:24:56] Parsing field year_birth...
[2023_02_07-19:24:56] Parsing field month_birth...
[2023_02_07-19:24:56] Parsing field country_birth...
[2023_02_07-19:24:56] Parsing field handedness...
[2023_02_07-19:24:56] Parsing field skin_color...
[2023_02_07-19:24:56] Parsing field ethnicity...
[2023_02_07-19:24:56] Parsing field TownsendDeprivationIndex

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df[c]<0,c] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


[2023_02_07-19:24:58] Reading all dataset rows of 11 columns (for 4 fields)...
[2023_02_07-19:27:19] Finished after 0:02:21.018065.
[2023_02_07-19:27:19] Read a dataset of 502462 samples.
[2023_02_07-19:27:19] Knowing of 63 samples who have wished to withdraw, 49 of them are in the loaded dataset. Filtering out these records, the dataset has reduced from 502462 to 502413 samples.
[2023_02_07-19:27:19] Parsing field sex...
[2023_02_07-19:27:19] Parsing field year_of_birth...


  eid, baseline, covariates = ukbb_phenotype_dataset.create_phenotype_dataset(pheno_info.ASSESSMENTS,nrows=nrows,parse_dataset_covariates_kwargs={'use_genotyping_metadata':False},no_kinship=False, only_caucasians=False)


[2023_02_07-19:27:20] To avoid the "dummy variable trap", removing the AC_leeds column (44186 matching records).
[2023_02_07-19:27:20] Parsing field date_visit...
[2023_02_07-19:27:21] Parsing field site...
[2023_02_07-19:27:22] Reading all dataset rows of 299 columns (for 8 fields)...
[2023_02_07-19:31:46] Finished after 0:04:24.720634.
[2023_02_07-19:31:46] Read a dataset of 502462 samples.
[2023_02_07-19:31:48] Knowing of 63 samples who have wished to withdraw, 49 of them are in the loaded dataset. Filtering out these records, the dataset has reduced from 502462 to 502413 samples.
[2023_02_07-19:31:48] Parsing field sex...
[2023_02_07-19:31:48] Parsing field year_of_birth...
[2023_02_07-19:31:48] To avoid the "dummy variable trap", removing the AC_leeds column (44186 matching records).
[2023_02_07-19:32:00] Parsing the read dataset into an ICD-10 tree...
[2023_02_07-19:32:00] Filtering the ICD-10 tree to keep only nodes descending from 944 specific codes...
[2023_02_07-19:32:00] Rem

In [None]:
# For every disorder and every modality (risk factors, blood markers), build a table of
# patients versus controls and save it as <name>_controlNo<drop_healthy>_<modality>.csv.

# Settings shared by all disorders.
exclude = ['icd10_nonHC']      # only used by the optional matching step at the bottom
drop_healthy = 'nonHC'         # group removed from the healthy controls
levels = [0,1]                 # category levels for the categorical columns
covs = np.array(['visit_age','male','TownsendDeprivationIndex'])
scale_covs = np.array([1,0,1]).astype(bool)   # True = continuous covariate, False = categorical
disorders = ['AllCauseDementia','AllCauseParkinsonism','AlzheimerDisease',
             'MultipleSystemAtrophy','ProgressiveSupranuclearPalsy','FrontoTemporalDementia',
             'VascularDementia','ParkinsonDisease','Dystonia','Osteoarthritis','Depression']

for modality in ['risk','blood']:
    # Full predictor set of this modality, and a mask marking the continuous predictors.
    if modality == 'risk':
        predictors_cat = ['AlcoholStatus_Current','AlcoholStatus_Previous','SmokeStatus_Current','SmokeStatus_Previous',
                          'DaytimeSleepiness_Often','AlcoholFrequency_LessThanWeekly',
                          'family_Stroke','family_Diabetes','family_Severedepression',
                          'family_Alzheimersdiseasedementia','family_Parkinsonsdisease']
        predictors_norm = np.hstack(['BMI','Waist_Circumference','Hip_Circumference','Diastolic_BloodPressure','PulseRate','BodyFat_Percentage',])
        predictors = np.hstack([predictors_cat,predictors_norm])
        scale_predictors = np.hstack([np.repeat([False],len(predictors_cat)),np.repeat([True],len(predictors_norm))])
    elif modality == 'blood':
        predictors = dfs[5].columns
        scale_predictors = np.repeat([True],len(predictors))
    print(predictors)

    for name in disorders:
        keep = [f'{name}_age','time_to_diagnosis']
        print(f'disease group: {name}, control group: no {drop_healthy}')
        merged_ = _get_data.get_healthy_disorder(merged.copy(deep=True),name,covs=covs,
                             predictors=predictors,incident=False,exclude=drop_healthy)
        merged_clean = _preprocess.make_categorical(merged_,covs[~scale_covs],levels)
        merged_clean = _preprocess.make_categorical(merged_clean,predictors[~scale_predictors],levels)
        # Drop columns with too many NaN. The result is kept in per-disorder variables:
        # overwriting `predictors` here (as the code used to do) carried the reduced column
        # set into every later disorder, so the output depended on the loop order.
        kept_predictors, _ = _preprocess.clean_predictors(merged_clean,predictors,scale_predictors,
                             thresh=0.15)
        # Drop subjects with too many NaN (here: any NaN).
        merged_clean = _preprocess.clean_subjects(merged_clean,kept_predictors,thresh=0)
        merged_clean[np.hstack([kept_predictors,name,covs,keep])].to_csv(f'{sample_path}/{name}_controlNo{drop_healthy}_{modality}.csv')
        # subsample
        #matched_sample = _get_data.get_matched(merged_clean,name,exclude=exclude,file=f'{name}_controlNo{drop_healthy}_match_{modality}.txt',save=True)