# Appendix 4

In [1]:
def get_library_path()->str:
    """Return the parent directory of the current working directory.

    The notebook uses this directory as the project root: it is appended to
    ``sys.path`` so the local package can be imported, and the data folder
    is resolved relative to it.

    Returns
    -------
    str
        Absolute path of the directory that contains the current working
        directory. When the working directory is the filesystem root, the
        root itself is returned.
    """
    # os.path.dirname handles drive letters and the root directory correctly;
    # rebuilding the path component by component from os.sep produces a
    # drive-relative path such as 'C:Users' on Windows.
    return os.path.dirname(os.getcwd())

# imports

import os
import sys

# Project root = parent of the notebook's working directory.
path_to_lib = get_library_path()

# Make the local package importable. The membership guard keeps the cell
# idempotent: re-running it does not pile up duplicate sys.path entries.
if path_to_lib not in sys.path:
    sys.path.append(path_to_lib)

import luxgiant_clinical.ThreeCatAnalysis as thr
from luxgiant_clinical.Helpers import categories_recoder

import pandas as pd
import numpy as np

In [2]:
# Load the cleaned dataset from the project's data folder.

# folder_path is reused at the end of the notebook as the output location.
folder_path = os.path.join(path_to_lib, 'data/source')

# low_memory=False makes pandas read the file in one pass instead of in chunks.
df = pd.read_csv(os.path.join(folder_path, 'cleaned_file.csv'), low_memory=False)

In [3]:
# Keep only patients that have an age-at-onset category recorded.

mask_patients = df['Status'].eq('Patient')
mask_agecat = df['age_category'].notna()

df_cases = df.loc[mask_patients & mask_agecat].reset_index(drop=True)

# The unfiltered table is not used again; drop the name.
del df

# Exclude the 'Onset <21 years' category from the analysis cohort.
df_cases = df_cases.loc[df_cases['age_category'].ne('Onset <21 years')].reset_index(drop=True)
df_cases.shape

(7439, 716)

In [4]:
# Number of retained patients in each age-at-onset category.
df_cases['age_category'].value_counts().reset_index()

Unnamed: 0,age_category,count
0,Onset 50-60 years,2688
1,Onset 21-49 years,2421
2,Onset >60 years,2330


In [5]:
# First group of variables: source column name -> label used in the table.
# Key order matters: it fixes the column order of the selection list below.
variables_dict_1 = {
    "age": "Age at Recruitment (years)",
    "age_at_onset": "Age at Onset (years)",
    "sex": "Male",
    "PD_duration": "Duration of Disease (years)",
    "years_of_education": "Years of Education",
    "bmi_comp": "Body Mass Index",
    "reason": "PD-related Reason for Stop Working",
    "work": "Currently Employed",
    "nature_of_work___1": "Agriculture as a job",
    "over_your_lifetime_have_yo": "Life time direct exposure to pesticide/insecticide/fungicide",
    "in_your_lifetime_have_you": "Smoked 100 or more cigarettes in lifetime",
    "in_your_lifetime_have": "Regular consumption of caffeinated drinks for >6months",
    "have_you_ever_had_a_head_i": "Head injury or concussion",
}

# Identifier and grouping columns first, then every source column named above.
variables_1 = ['participant_id', 'age_category', *variables_dict_1]

# Which summary statistic is reported for which (renamed) variable.
stats_meas = {
    'mean': [
        "Age at Recruitment (years)",
        "Age at Onset (years)",
        "Body Mass Index",
    ],
    'n': [
        "Male",
        "Currently Employed",
        "PD-related Reason for Stop Working",
        "Agriculture as a job",
        "Life time direct exposure to pesticide/insecticide/fungicide",
        "Smoked 100 or more cigarettes in lifetime",
        "Regular consumption of caffeinated drinks for >6months",
        "Head injury or concussion",
    ],
    'median': [
        "Duration of Disease (years)",
        "Years of Education",
    ],
}

# Age-at-onset categories compared throughout the notebook, youngest first.
groups = ['Onset 21-49 years', 'Onset 50-60 years', 'Onset >60 years']

In [6]:
# Restrict the cohort to the first group of variables and switch to the
# readable labels; .copy() keeps the result independent of df_cases.
df_cases_1 = (
    df_cases[variables_1]
    .copy()
    .rename(columns=variables_dict_1)
)

In [7]:
# Recode categorical answers into 0/1 indicators.

# Columns whose answer labels are specific to that column.
recode_dict = {
    "Currently Employed": {'Yes': 1, 'No': 0},
    "Agriculture as a job": {'Checked': 1, 'Unchecked': 0},
    "PD-related Reason for Stop Working": {'PD related': 1, 'not PD related': 0},
    "Male": {'Male': 1, 'Female': 0},
}
for column, codes in recode_dict.items():
    df_cases_1 = categories_recoder(df_cases_1, [column], codes)

# Yes/no questions sharing one coding; the non-committal answers
# ('Dont Know', 'Refused', 'Possibly') are mapped to NaN.
recode = [
    "Life time direct exposure to pesticide/insecticide/fungicide",
    "Smoked 100 or more cigarettes in lifetime",
    "Regular consumption of caffeinated drinks for >6months",
    "Head injury or concussion",
]
mapping = {'Yes': 1, 'No': 0, 'Dont Know': np.nan, 'Refused': np.nan, 'Possibly': np.nan}

df_cases_1 = categories_recoder(df_cases_1, recode, mapping)

In [8]:
# Overall three-group summaries for the first variable block: one table per
# statistic, in the order mean (SD), median (IQR), n (%).
summary = [
    reporter(
        data_df=df_cases_1,
        variables=stats_meas[measure],
        groups=groups,
        grouping_by='age_category',
    )
    for reporter, measure in (
        (thr.report_mean_std, 'mean'),
        (thr.report_median_iqr, 'median'),
        (thr.report_proportion, 'n'),
    )
]

In [9]:
# Second group of variables (medical and family history):
# source column name -> label used in the table.
variables_dict_2 = {
    "medical_history_neurologic___7": "Stroke",
    "medical_history_cancer___1": "Melanoma",
    "medical_history_cancer___2": "Prostate Cancer",
    "medical_history_cancer___3": "Other Cancers",
    "medical_history_metabolic___1": "Diabetes",
    "medical_history_metabolic___3": "High Cholesterol",
    "medical_history_cardiovasc___2": "Hypertension",
    "medical_history_cardiovasc___3": "History of CAD",
    "medical_history_psychologi___1": "History of Anxiety",
    "medical_history_psychologi___3": "History of Depression",
    "family_member_diagnosed_wi": "Family History of PD",
    "family_member_diagnosed_wi_2": "Family History of Tremor",
    "family_member_diagnosed": "Family History of Dementia",
}

# Identifier and grouping columns first, then every source column named above.
variables_2 = ['participant_id', 'age_category', *variables_dict_2]

# Every variable in this group is reported as a count with proportion; the
# labels are taken from the mapping above, in the same order.
stats_meas2 = {
    'n': list(variables_dict_2.values()),
}

In [10]:
# Restrict the cohort to the second group of variables and switch to the
# readable labels; .copy() keeps the result independent of df_cases.
df_cases_2 = (
    df_cases[variables_2]
    .copy()
    .rename(columns=variables_dict_2)
)

In [11]:
# Recode the second variable group into 0/1 indicators.

# Family-history questions are answered Yes/No; every other variable in this
# group is a Checked/Unchecked checkbox.
fam_hist = ["Family History of PD", "Family History of Tremor", "Family History of Dementia"]
remaining = [label for label in stats_meas2['n'] if label not in fam_hist]

df_cases_2 = categories_recoder(df_cases_2, fam_hist, {"Yes": 1, "No": 0})
df_cases_2 = categories_recoder(df_cases_2, remaining, {"Checked": 1, "Unchecked": 0})

In [12]:
# Count and proportion of the medical / family history variables, per
# age-at-onset category.
# Uses the shared `groups` list (same three labels, same order) rather than a
# repeated literal, so this table cannot drift from the other summaries.
df_sum4 = thr.report_proportion(
    data_df=df_cases_2,
    variables=stats_meas2['n'],
    groups=groups,
    grouping_by='age_category',
    subheader='Medical History',
)

In [13]:
# Stack the overall summaries (first variable block, then medical history)
# into a single table with a fresh 0..n-1 index.
df_11 = pd.concat([*summary, df_sum4], axis=0, ignore_index=True)

In [14]:
# Early vs Late comparison: leave out the 'Onset 50-60 years' group in both tables.
df_cases_3 = df_cases_1.loc[df_cases_1['age_category'] != 'Onset 50-60 years'].reset_index(drop=True)
df_cases_4 = df_cases_2.loc[df_cases_2['age_category'] != 'Onset 50-60 years'].reset_index(drop=True)

In [15]:
# Pairwise tests, 'Onset 21-49 years' vs 'Onset >60 years'.
# Each tuple is (test function, data, variables, extra keyword arguments); the
# shared arguments are spelled out once in the call below.
# correc_factor=3: this notebook makes three pairwise comparisons
# (presumably the Bonferroni multiplier -- confirm in ThreeCatAnalysis).
summary1 = [
    test_fn(
        data_df=frame,
        variables=variables,
        groups=['Onset 21-49 years', 'Onset >60 years'],
        grouping_by='age_category',
        correc_factor=3,
        **extra,
    )
    for test_fn, frame, variables, extra in (
        (thr.bonferroni_mean_std, df_cases_3, stats_meas['mean'], {}),
        (thr.bonferroni_median_iqr, df_cases_3, stats_meas['median'], {}),
        (thr.bonferroni_proportions, df_cases_3, stats_meas['n'], {}),
        (thr.bonferroni_proportions, df_cases_4, stats_meas2['n'], {'subheader': 'Medical History'}),
    )
]

# Stack the four result tables and label the two resulting columns.
df_111 = pd.concat(summary1, axis=0, ignore_index=True)
df_111.columns = ['Variable', 'Adjusted p-value (Early vs Late)']

In [16]:
# Early vs Medium comparison: leave out the 'Onset >60 years' group in both tables.
df_cases_5 = df_cases_1.loc[df_cases_1['age_category'] != 'Onset >60 years'].reset_index(drop=True)
df_cases_6 = df_cases_2.loc[df_cases_2['age_category'] != 'Onset >60 years'].reset_index(drop=True)

In [17]:
# Pairwise tests, 'Onset 21-49 years' vs 'Onset 50-60 years'.
# Each tuple is (test function, data, variables, extra keyword arguments); the
# shared arguments are spelled out once in the call below.
# correc_factor=3: this notebook makes three pairwise comparisons
# (presumably the Bonferroni multiplier -- confirm in ThreeCatAnalysis).
summary2 = [
    test_fn(
        data_df=frame,
        variables=variables,
        groups=['Onset 21-49 years', 'Onset 50-60 years'],
        grouping_by='age_category',
        correc_factor=3,
        **extra,
    )
    for test_fn, frame, variables, extra in (
        (thr.bonferroni_mean_std, df_cases_5, stats_meas['mean'], {}),
        (thr.bonferroni_median_iqr, df_cases_5, stats_meas['median'], {}),
        (thr.bonferroni_proportions, df_cases_5, stats_meas['n'], {}),
        (thr.bonferroni_proportions, df_cases_6, stats_meas2['n'], {'subheader': 'Medical History'}),
    )
]

# Stack the four result tables and label the two resulting columns.
df_112 = pd.concat(summary2, axis=0, ignore_index=True)
df_112.columns = ['Variable', 'Adjusted p-value (Early vs Medium)']

In [18]:
# Medium vs Late comparison: leave out the 'Onset 21-49 years' group in both tables.
df_cases_7 = df_cases_1.loc[df_cases_1['age_category'] != 'Onset 21-49 years'].reset_index(drop=True)
df_cases_8 = df_cases_2.loc[df_cases_2['age_category'] != 'Onset 21-49 years'].reset_index(drop=True)

In [19]:
# Pairwise tests, 'Onset 50-60 years' vs 'Onset >60 years'.
# Each tuple is (test function, data, variables, extra keyword arguments); the
# shared arguments are spelled out once in the call below.
# correc_factor=3: this notebook makes three pairwise comparisons
# (presumably the Bonferroni multiplier -- confirm in ThreeCatAnalysis).
summary3 = [
    test_fn(
        data_df=frame,
        variables=variables,
        groups=['Onset 50-60 years', 'Onset >60 years'],
        grouping_by='age_category',
        correc_factor=3,
        **extra,
    )
    for test_fn, frame, variables, extra in (
        (thr.bonferroni_mean_std, df_cases_7, stats_meas['mean'], {}),
        (thr.bonferroni_median_iqr, df_cases_7, stats_meas['median'], {}),
        (thr.bonferroni_proportions, df_cases_7, stats_meas['n'], {}),
        (thr.bonferroni_proportions, df_cases_8, stats_meas2['n'], {'subheader': 'Medical History'}),
    )
]

# Stack the four result tables and label the two resulting columns.
df_113 = pd.concat(summary3, axis=0, ignore_index=True)
df_113.columns = ['Variable', 'Adjusted p-value (Medium vs Late)']

In [20]:
# Combine the overall summary table with the three pairwise adjusted-p-value
# tables (Early vs Late, Early vs Medium, Medium vs Late) into the final table.
# NOTE(review): df_11 is both the input (overall_df) and the rebound output, so
# re-running this cell on its own passes the already-formatted table back in.
# Re-run the concatenation cell that builds df_11 first.
df_11 = thr.final_formatter(
    overall_df=df_11,
    adjusted_df=[df_111, df_112, df_113],
    groups=groups
)

In [21]:
# Write the final table to Table_11.csv in the data folder (row index
# omitted), then show it as the cell output.
table_path = os.path.join(folder_path, 'Table_11.csv')
df_11.to_csv(table_path, index=False)
df_11

Unnamed: 0,Variable,Statistical Measure,Onset 21-49 years,Onset 50-60 years,Onset >60 years,Total,p-value,Adjusted p-value (Early vs Late),Adjusted p-value (Early vs Medium),Adjusted p-value (Medium vs Late),Available Samples for Analysis
0,Age at Recruitment (years),mean (SD),49.1 (8.2),61.3 (5.4),71.5 (5.6),60.5 (11.1),p<0.001,p<0.001,p<0.001,p<0.001,7439.0
1,Age at Onset (years),mean (SD),41.2 (6.2),55.0 (3.2),67.4 (5.2),54.4 (11.6),p<0.001,p<0.001,p<0.001,p<0.001,7439.0
2,Body Mass Index,mean (SD),24.2 (3.7),24.4 (3.6),24.2 (3.6),24.3 (3.6),0.0625,0.9999,0.2215,0.0837,6641.0
3,Duration of Disease (years),median (IQR),6.0 (3.0 - 11.0),5.0 (3.0 - 9.0),3.0 (2.0 - 6.0),5.0 (2.0 - 9.0),p<0.001,p<0.001,p<0.001,p<0.001,7439.0
4,Years of Education,median (IQR),12.0 (9.0 - 15.0),12.0 (10.0 - 15.0),12.0 (9.0 - 15.0),12.0 (9.0 - 15.0),0.0017,0.0011,0.185,0.1895,7065.0
5,Male,n (%),1571 (64.9),1819 (67.7),1654 (71.0),5044 (67.8),p<0.001,p<0.001,0.1126,0.0335,7438.0
6,Currently Employed,n (%),1514 (64.4),1313 (50.7),794 (35.2),3621 (50.3),p<0.001,p<0.001,p<0.001,p<0.001,7197.0
7,PD-related Reason for Stop Working,n (%),542 (51.7),538 (35.2),326 (20.3),1406 (33.6),p<0.001,p<0.001,p<0.001,p<0.001,4185.0
8,Agriculture as a job,n (%),347 (14.3),416 (15.5),354 (15.2),1117 (15.0),0.4994,0.9999,0.7569,0.9999,7439.0
9,Life time direct exposure to pesticide/insecti...,n (%),448 (18.8),543 (20.6),488 (21.2),1479 (20.2),0.1023,0.1223,0.3297,0.9999,7319.0
