# Comparison between motor subtypes of PD

In [1]:
def get_library_path()->str:
    """
    Return the parent directory of the current working directory.

    The result is what this notebook appends to `sys.path` and uses as the
    base folder when building the data path.

    Returns
    -------
    str
        Path of the directory containing the current working directory.
        If the working directory is the filesystem root, the root itself
        is returned.
    """
    # os.path.dirname is used instead of splitting on os.sep and re-joining:
    # the manual re-join yields a drive-relative path on Windows
    # (e.g. 'C:a' instead of 'C:\\a'), while dirname handles drives and the
    # root directory correctly on every platform.
    return os.path.dirname(os.getcwd())

# imports

import os
import sys

# folder one level above the working directory; the local package is imported from it
path_to_lib = get_library_path()

# guard the append so re-running this cell does not add duplicate entries
if path_to_lib not in sys.path:
    sys.path.append(path_to_lib)
import luxgiant_clinical.ThreeCatAnalysis as thr
from luxgiant_clinical.Helpers import categories_recoder

import pandas as pd
import numpy as np

In [2]:
# read the cleaned dataset stored under <path_to_lib>/data/source

folder_path = os.path.join(path_to_lib, 'data/source')
csv_file    = os.path.join(folder_path, 'cleaned_file.csv')

df = pd.read_csv(csv_file, low_memory=False)

In [3]:
# keep rows that are patients, were recorded in the OFF state,
# and have a non-missing PD subtype

# exact label used in the `on_off` column (must match the data verbatim)
off_state_label = 'OFF: Off is the typical functional state when patien ts have a poor response in spite of taking medications.'

is_patient  = df['Status'].eq('Patient')
has_subtype = df['subtype'].notnull()
is_off      = df['on_off'].eq(off_state_label)

df_cases = df.loc[is_patient & is_off & has_subtype].reset_index(drop=True)

# the full frame is not used after this point
del df

df_cases.shape

(1815, 716)

## Summary statistics for demography and lifestyle

In [4]:
# first group of variables: raw column name -> label used in the report
variables_dict_1 = {
    "age": "Age at Recruitment (years)",
    "age_at_onset": "Age at Onset (years)",
    "bmi_comp": "Body Mass Index",
    "PD_duration": "Duration of Disease (years)",
    "years_of_education": "Years of Education",
    "sex": "Male",
    "work": "Currently Employed",
    "reason": "PD-related Reason for Stop Working",
    "nature_of_work___1": "Agriculture as a job",
    "over_your_lifetime_have_yo": "Life time direct exposure to pesticide/insecticide/fungicide",
    "in_your_lifetime_have_you": "Smoked 100 or more cigarettes in lifetime",
    "in_your_lifetime_have": "Regular consumption of caffeinated drinks for >6months",
    "have_you_ever_had_a_head_i": "Head injury or concussion",
}

# identifier and grouping columns come first, followed by the raw columns above
variables_1 = ['participant_id', 'subtype', *variables_dict_1]

# report labels grouped by the summary statistic requested for them
stats_meas = {
    'mean': [
        "Age at Recruitment (years)",
        "Age at Onset (years)",
        "Body Mass Index",
    ],
    'n': [
        "Male",
        "Currently Employed",
        "PD-related Reason for Stop Working",
        "Agriculture as a job",
        "Life time direct exposure to pesticide/insecticide/fungicide",
        "Smoked 100 or more cigarettes in lifetime",
        "Regular consumption of caffeinated drinks for >6months",
        "Head injury or concussion",
    ],
    'median': [
        "Duration of Disease (years)",
        "Years of Education",
    ],
}

# values of the `subtype` column, in the order they are passed to the report functions
groups = [
    'Tremor Dominant',
    'Indeterminate',
    'Postural instability and gait difficulty',
]

In [5]:
# subset the cohort to the first group of columns and apply the report labels

df_cases_1 = (
    df_cases
    .loc[:, variables_1]
    .copy()
    .rename(columns=variables_dict_1)
)

In [6]:
# recode answer labels through categories_recoder

# columns that each have their own label -> value mapping
recode_dict = {
    "Currently Employed"                : {'Yes': 1, 'No': 0},
    "Agriculture as a job"              : {'Checked': 1, 'Unchecked': 0},
    "Male"                              : {'Male': 1, 'Female': 0},
    "PD-related Reason for Stop Working": {'PD related': 1, 'not PD related': 0},
}
for column, value_map in recode_dict.items():
    df_cases_1 = categories_recoder(df_cases_1, [column], value_map)

# columns sharing one mapping; answers other than Yes/No are mapped to NaN
recode = [
    "Life time direct exposure to pesticide/insecticide/fungicide",
    "Smoked 100 or more cigarettes in lifetime",
    "Regular consumption of caffeinated drinks for >6months",
    "Head injury or concussion",
]
mapping = {
    'Yes'      : 1,
    'No'       : 0,
    'Dont Know': np.nan,
    'Refused'  : np.nan,
    'Possibly' : np.nan,
}

df_cases_1 = categories_recoder(df_cases_1, recode, mapping)


In [7]:
# three-group comparison for the first set of variables:
# one report per kind of statistic, in the order mean -> median -> proportion
summary1 = [
    report(
        data_df    =df_cases_1,
        variables  =stats_meas[measure],
        groups     =groups,
        grouping_by='subtype'
    )
    for report, measure in [
        (thr.report_mean_std,   'mean'),
        (thr.report_median_iqr, 'median'),
        (thr.report_proportion, 'n'),
    ]
]

In [8]:
# second group of variables: raw column name -> label used in the report

variables_dict_2 = {
    "medical_history_neurologic___7": "Stroke",
    "medical_history_cancer___1": "Melanoma",
    "medical_history_cancer___2": "Prostate Cancer",
    "medical_history_cancer___3": "Other Cancers",
    "medical_history_metabolic___1": "Diabetes",
    "medical_history_metabolic___3": "High Cholesterol",
    "medical_history_cardiovasc___2": "Hypertension",
    "medical_history_cardiovasc___3": "History of CAD",
    "medical_history_psychologi___1": "History of Anxiety",
    "medical_history_psychologi___3": "History of Depression",
    "family_member_diagnosed_wi": "Family History of PD",
    "family_member_diagnosed_wi_2": "Family History of Tremor",
    "family_member_diagnosed": "Family History of Dementia",
    "hystage": "Hoehn and Yahr Staging IV-V (drug OFF)",
    "total_score_for_moca": "Total score for MOCA",
    "total_score_for_bdi": "Total score for BDI",
}

# identifier and grouping columns come first, followed by the raw columns above
variables_2 = ['participant_id', 'subtype', *variables_dict_2]

# report labels grouped by the summary statistic requested for them
stats_meas2 = {
    'n': [
        "Stroke",
        "Melanoma",
        "Prostate Cancer",
        "Other Cancers",
        "Diabetes",
        "High Cholesterol",
        "Hypertension",
        "History of CAD",
        "History of Anxiety",
        "History of Depression",
        "Family History of PD",
        "Family History of Tremor",
        "Family History of Dementia",
        "Hoehn and Yahr Staging IV-V (drug OFF)",
    ],
    'mean': ["Total score for MOCA"],
    'median': ["Total score for BDI"],
}

In [9]:
# subset the cohort to the second group of columns and apply the report labels

df_cases_2 = (
    df_cases
    .loc[:, variables_2]
    .copy()
    .rename(columns=variables_dict_2)
)

In [10]:
# recode answer labels; the 'n' variables fall into three sets with different labels
fam_hist = ["Family History of PD", "Family History of Tremor", "Family History of Dementia"]
hy       = ["Hoehn and Yahr Staging IV-V (drug OFF)"]

# every other 'n' variable (those not in the two sets above)
already_listed = set(fam_hist + hy)
remaining= [feat for feat in stats_meas2['n'] if feat not in already_listed]

# apply the recodings in the same order: family history, remaining, H&Y staging
for columns, coding in [
    (fam_hist,  {"Yes":1, "No":0}),
    (remaining, {"Checked":1, "Unchecked":0}),
    (hy,        {'Not severe':0, 'Severe':1}),
]:
    df_cases_2 = categories_recoder(df_cases_2, columns, coding)

In [11]:
# three-group comparison for the second set of variables
# (proportions first, under a 'Medical History' subheader, then mean and median)
shared_args = {
    'data_df'    : df_cases_2,
    'groups'     : groups,
    'grouping_by': 'subtype',
}

summary2 = [
    thr.report_proportion(
        variables=stats_meas2['n'],
        subheader='Medical History',
        **shared_args
    ),
    thr.report_mean_std(
        variables=stats_meas2['mean'],
        **shared_args
    ),
    thr.report_median_iqr(
        variables=stats_meas2['median'],
        **shared_args
    ),
]

In [12]:
# stack the report frames from both variable groups into one table

df_7 = pd.concat([*summary1, *summary2], axis=0, ignore_index=True)

In [13]:
# display the concatenated three-group summary table
df_7

Unnamed: 0,Variable,Statistical Measure,Tremor Dominant,Indeterminate,Postural instability and gait difficulty,Total,p-value,Available Samples for Analysis
0,Age at Recruitment (years),mean (SD),57.1 (11.3),59.3 (10.9),60.7 (10.9),59.2 (11.1),0.0,1813.0
1,Age at Onset (years),mean (SD),52.4 (12.1),53.6 (11.4),54.0 (11.4),53.4 (11.7),0.0367,1798.0
2,Body Mass Index,mean (SD),24.4 (3.9),24.0 (3.9),24.2 (3.9),24.2 (3.9),0.3062,1627.0
3,Duration of Disease (years),median (IQR),4.0 (1.5 - 7.0),5.0 (2.0 - 8.0),6.0 (3.0 - 10.0),5.0 (2.0 - 8.0),0.0,1799.0
4,Years of Education,median (IQR),11.0 (7.0 - 15.0),10.0 (7.0 - 15.0),11.0 (7.0 - 15.0),10.0 (7.0 - 15.0),0.8777,1738.0
5,Male,n (%),452 (74.7),247 (69.6),535 (62.6),1234 (68.0),4.798457e-06,1815.0
6,Currently Employed,n (%),406 (68.8),242 (69.5),521 (62.0),1169 (65.7),0.007234609,1778.0
7,PD-related Reason for Stop Working,n (%),105 (35.7),79 (43.2),216 (44.8),400 (41.7),0.04042585,959.0
8,Agriculture as a job,n (%),140 (23.1),80 (22.5),143 (16.7),363 (20.0),0.004325642,1815.0
9,Life time direct exposure to pesticide/insecti...,n (%),155 (26.2),81 (23.3),144 (17.2),380 (21.4),0.0001528256,1775.0


In [14]:
# drop the 'Indeterminate' subtype from both recoded frames
df_cases_3 = df_cases_1.loc[df_cases_1['subtype'].ne('Indeterminate')].reset_index(drop=True)
df_cases_4 = df_cases_2.loc[df_cases_2['subtype'].ne('Indeterminate')].reset_index(drop=True)

In [15]:
# two-group comparison (Tremor Dominant vs PIGD) for the first set of variables
# NOTE(review): correc_factor=3 presumably is the Bonferroni multiplier for the
# three possible pairwise subtype comparisons; confirm against ThreeCatAnalysis
summary3 = [
    pairwise_test(
        data_df      =df_cases_3,
        variables    =stats_meas[measure],
        groups       =['Tremor Dominant', 'Postural instability and gait difficulty'],
        grouping_by  ='subtype',
        correc_factor=3
    )
    for pairwise_test, measure in [
        (thr.bonferroni_mean_std,    'mean'),
        (thr.bonferroni_median_iqr,  'median'),
        (thr.bonferroni_proportions, 'n'),
    ]
]

In [16]:
# two-group comparison (Tremor Dominant vs PIGD) for the second set of variables
# NOTE(review): correc_factor=3 presumably is the Bonferroni multiplier for the
# three possible pairwise subtype comparisons; confirm against ThreeCatAnalysis
summary4 = [
    pairwise_test(
        data_df      =df_cases_4,
        variables    =stats_meas2[measure],
        groups       =['Tremor Dominant', 'Postural instability and gait difficulty'],
        grouping_by  ='subtype',
        correc_factor=3
    )
    for pairwise_test, measure in [
        (thr.bonferroni_proportions, 'n'),
        (thr.bonferroni_mean_std,    'mean'),
        (thr.bonferroni_median_iqr,  'median'),
    ]
]

In [17]:
# stack the two-group (Bonferroni) result frames into one table

df_71 = pd.concat([*summary3, *summary4], axis=0, ignore_index=True)

In [18]:
# combine the overall table with the two-group results into the final table
# NOTE(review): df_7 is overwritten with the formatter's output, so running this
# cell a second time passes the already formatted frame back in; re-run the
# cell that builds df_7 first.
subtype_columns = ['Tremor Dominant', 'Indeterminate', 'Postural instability and gait difficulty']

df_7 = thr.final_formatter(
    overall_df=df_7,
    adjusted_df=[df_71],
    groups=subtype_columns,
)

In [19]:
# write the final table next to the source data, then display it
table_path = os.path.join(folder_path, 'Table_7.csv')
df_7.to_csv(table_path, index=False)

df_7

Unnamed: 0,Variable,Statistical Measure,Tremor Dominant,Indeterminate,Postural instability and gait difficulty,Total,p-value,Adjusted p-value,Available Samples for Analysis
0,Age at Recruitment (years),mean (SD),57.1 (11.3),59.3 (10.9),60.7 (10.9),59.2 (11.1),p<0.001,p<0.001,1813
1,Age at Onset (years),mean (SD),52.4 (12.1),53.6 (11.4),54.0 (11.4),53.4 (11.7),0.0367,0.0343,1798
2,Body Mass Index,mean (SD),24.4 (3.9),24.0 (3.9),24.2 (3.9),24.2 (3.9),0.3062,0.7605,1627
3,Duration of Disease (years),median (IQR),4.0 (1.5 - 7.0),5.0 (2.0 - 8.0),6.0 (3.0 - 10.0),5.0 (2.0 - 8.0),p<0.001,p<0.001,1799
4,Years of Education,median (IQR),11.0 (7.0 - 15.0),10.0 (7.0 - 15.0),11.0 (7.0 - 15.0),10.0 (7.0 - 15.0),0.8777,0.9999,1738
5,Male,n (%),452 (74.7),247 (69.6),535 (62.6),1234 (68.0),p<0.001,p<0.001,1815
6,Currently Employed,n (%),406 (68.8),242 (69.5),521 (62.0),1169 (65.7),0.0072,0.0244,1778
7,PD-related Reason for Stop Working,n (%),105 (35.7),79 (43.2),216 (44.8),400 (41.7),0.0404,0.0376,959
8,Agriculture as a job,n (%),140 (23.1),80 (22.5),143 (16.7),363 (20.0),0.0043,0.0068,1815
9,Life time direct exposure to pesticide/insecti...,n (%),155 (26.2),81 (23.3),144 (17.2),380 (21.4),p<0.001,p<0.001,1775
