# Comparison between patients with short and long disease

In this notebook we give summary statistics for the whole cohort of PD patients, stratified by the duration of the disease. The first group comprises patients who were diagnosed 5 years or less before the assessment (coded `<=5`), and the second group comprises patients who were diagnosed more than 5 years before the assessment (coded `>5`).

In [1]:
# imports

import os
import sys
sys.path.append('/mnt/0A2AAC152AABFBB7/CGE/luxgiant-clinical')
import luxgiant_clinical.TwoCatAnalysis as two

import pandas as pd

In [2]:
# read the cleaned cohort table; the first CSV column becomes the row index

folder_path = '/mnt/0A2AAC152AABFBB7/CGE/luxgiant-clinical/data'
cleaned_csv = os.path.join(folder_path, 'cleaned_file.csv')

# low_memory=False: parse the file in one pass rather than in internal chunks
df = pd.read_csv(cleaned_csv, low_memory=False, index_col=0)

In [3]:
# keep only rows flagged as patients that also have a recorded PD duration

is_patient = df['Status'] == 'Patient'
has_duration = df['PD_duration'].notna()

df_cases = df.loc[is_patient & has_duration].reset_index(drop=True)
del df  # drop the reference to the unfiltered table
df_cases.shape

(7473, 714)

## Summary statistics for demography and lifestyle

In [4]:
# first group of variables, declared once as
# (source column, display label, summary measure) and expanded below

demography_spec = [
    ("age",                        "Age at Recruitment (years)",                                   "mean"),
    ("age_at_onset",               "Age at Onset (years)",                                         "mean"),
    ("sex",                        "Male",                                                         "n"),
    ("agecat_1",                   "Onset <50 years",                                              "n"),
    ("agecat_2",                   "Onset <40 years",                                              "n"),
    ("PD_duration",                "Duration of Disease (years)",                                  "median"),
    ("years_of_education",         "Years of Education",                                           "median"),
    ("bmi_comp",                   "Body Mass Index",                                              "mean"),
    ("reason",                     "PD-related Reason for Stop Working",                           "n"),
    ("work",                       "Currently Employed",                                           "n"),
    ("nature_of_work___1",         "Agriculture as a job",                                         "n"),
    ("over_your_lifetime_have_yo", "Life time direct exposure to pesticide/insecticide/fungicide", "n"),
    ("during_your_lifetime_did_y", "Exposure to chemicals at home",                                "n"),
    ("in_your_lifetime_have_you",  "Smoked 100 or more cigarettes in lifetime",                    "n"),
    ("in_your_lifetime_have",      "Regular consumption of caffeinated drinks for >6months",       "n"),
    ("have_you_ever_had_a_head_i", "Head injury or concussion",                                    "n"),
]

# source column -> display label, in declaration order
variables_dict_1 = {column: label for column, label, measure in demography_spec}

# columns to pull from the cohort table: identifier, grouping column, then the variables
variables_1 = ['participant_id', 'pdsl', *variables_dict_1]

# statistical measures: display labels grouped by the summary applied to them
stats_meas = {
    wanted: [label for column, label, measure in demography_spec if measure == wanted]
    for wanted in ('mean', 'n', 'median')
}

In [5]:
# subset the cohort to the identifier, the grouping column and the selected
# variables, then switch the column names to their display labels

df_cases_1 = (
    df_cases[variables_1]
    .copy()
    .rename(columns=variables_dict_1)
)

In [6]:
# recode the grouping column to its table label and the categorical answers to
# 0/1 indicators; with Series.map, values missing from a mapping become NaN

# 'Dont Know' and 'Refused' are coded 0, the same as 'No'
yes_no_other = {'Yes':1, 'No':0, 'Dont Know':0, 'Refused':0}

recodings_1 = {
    "pdsl"                              : {"<=5":'PD duration <=5 yrs', ">5":'PD duration >5 yrs'},
    "Onset <50 years"                   : {'<50':1, '>=50':0},
    "Onset <40 years"                   : {'<40':1, '>=40':0},
    "Male"                              : {'Male':1, 'Female':0},
    "Currently Employed"                : {'Yes':1, 'No':0},
    "Agriculture as a job"              : {'Checked':1, 'Unchecked':0},
    "PD-related Reason for Stop Working": {'PD related':1, 'not PD related':0},
    "Life time direct exposure to pesticide/insecticide/fungicide": yes_no_other,
    "Exposure to chemicals at home"                               : yes_no_other,
    "Smoked 100 or more cigarettes in lifetime"                   : yes_no_other,
    "Regular consumption of caffeinated drinks for >6months"      : yes_no_other,
    "Head injury or concussion"                                   : yes_no_other,
}

for column, codes in recodings_1.items():
    df_cases_1[column] = df_cases_1[column].map(codes)

In [7]:
# empty frame carrying only the summary-table column layout

summary_cols = [
    'Variable',
    'Statistical Measure',
    'PD duration <=5 yrs',
    'PD duration >5 yrs',
    'Available Sample for Analysis',
]
df_summary = pd.DataFrame(columns=summary_cols)

In [8]:
# mean and standard deviation of the variables listed under stats_meas['mean']

mean_vars = stats_meas['mean']
res1 = two.mean_std(df_cases_1, mean_vars, grouping_by='pdsl')

# per-group table, then the two frames that get merged onto it
by_group_mean = two.summaryze_mean_std(df_summary, res1, mean_vars, 'PD duration <=5 yrs', 'PD duration >5 yrs')
overall_mean = two.mean_std_simple(df_cases_1, features=mean_vars)
ttest_mean = two.t_test_by_group(df_cases_1, mean_vars, group_var='pdsl')

# NOTE(review): the first merge has no explicit `on=`, so pandas joins on every
# column name the two frames share (the sibling cells pass on='Variable') - confirm intended
df_summary1 = by_group_mean.merge(overall_mean).merge(ttest_mean, on='Variable')

In [9]:
# count and proportion of the variables listed under stats_meas['n']

count_vars = stats_meas['n']
res2 = two.count_percent(df_cases_1, count_vars, 'pdsl')

# per-group table, then the two frames that get merged onto it by 'Variable'
by_group_count = two.summaryze_count_percent(df_summary, res2, count_vars, 'PD duration <=5 yrs', 'PD duration >5 yrs')
overall_count = two.count_simple(df_cases_1, features=count_vars)
chi2_count = two.chi_squared_tests(df_cases_1, count_vars, group_var='pdsl')

df_summary2 = (
    by_group_count
    .merge(overall_count, on='Variable')
    .merge(chi2_count, on='Variable')
)

In [10]:
# median and interquartile range of the remaining continuous variables

median_vars = stats_meas['median']
res3 = two.median_iqr(df_cases_1, median_vars, 'pdsl')

# per-group table, then the two frames that get merged onto it by 'Variable'
by_group_median = two.summaryze_median_iqr(df_summary, res3, median_vars, 'PD duration <=5 yrs', 'PD duration >5 yrs')
overall_median = two.median_iqr_simple(df_cases_1, median_vars)
mwu_median = two.mann_whitney(df_cases_1, median_vars, 'pdsl')

df_summary3 = (
    by_group_median
    .merge(overall_median, on='Variable')
    .merge(mwu_median, on='Variable')
)

In [11]:
# stack the partial tables (mean, then median, then count rows) and
# put the columns in their final order

ordered_cols = [
    'Variable',
    'Statistical Measure',
    'PD duration <=5 yrs',
    'PD duration >5 yrs',
    'p-value',
    'Total',
    'Available Sample for Analysis',
]
stacked_summary = pd.concat([df_summary1, df_summary3, df_summary2], ignore_index=True, axis=0)
df_summary = stacked_summary[ordered_cols].copy()

## Analysis of patients' medical history

In [12]:
# second group of variables: (source column, display label) pairs

medical_history_columns = [
    ("medical_history_neurologic___7", "Stroke"),
    ("medical_history_cancer___1",     "Melanoma"),
    ("medical_history_cancer___2",     "Prostate Cancer"),
    ("medical_history_cancer___3",     "Other Cancers"),
    ("medical_history_metabolic___1",  "Diabetes"),
    ("medical_history_metabolic___3",  "High Cholesterol"),
    ("medical_history_cardiovasc___2", "Hypertension"),
    ("medical_history_cardiovasc___3", "History of CAD"),
    ("medical_history_psychologi___1", "History of Anxiety"),
    ("medical_history_psychologi___3", "History of Depression"),
    ("family_member_diagnosed_wi",     "Family History of PD"),
    ("family_member_diagnosed_wi_2",   "Family History of Tremor"),
    ("family_member_diagnosed",        "Family History of Dementia"),
]
variables_dict_2 = dict(medical_history_columns)

# columns to pull from the cohort table: identifier, grouping column, then the variables
variables_2 = ['participant_id', 'pdsl', *variables_dict_2]

# statistical measures: every variable in this group is summarised as a count
stats_meas2 = {'n': list(variables_dict_2.values())}

In [13]:
# subset the cohort to the identifier, the grouping column and the medical
# history variables, then switch the column names to their display labels

df_cases_2 = (
    df_cases[variables_2]
    .copy()
    .rename(columns=variables_dict_2)
)

In [14]:
# recode the grouping column to its table label and the answers to 0/1
# indicators; with Series.map, values missing from a mapping become NaN

df_cases_2["pdsl"] = df_cases_2["pdsl"].map({"<=5":'PD duration <=5 yrs', ">5":'PD duration >5 yrs'})

# checkbox-style columns hold "Checked" / "Unchecked"
checkbox_codes = {"Checked":1, "Unchecked":0}
checkbox_columns = [
    "Stroke", "Melanoma", "Prostate Cancer", "Other Cancers", "Diabetes",
    "High Cholesterol", "Hypertension", "History of CAD",
    "History of Anxiety", "History of Depression",
]
for column in checkbox_columns:
    df_cases_2[column] = df_cases_2[column].map(checkbox_codes)

# family-history columns hold "Yes" / "No"
yes_no_codes = {"Yes":1, "No":0}
family_columns = ["Family History of PD", "Family History of Tremor", "Family History of Dementia"]
for column in family_columns:
    df_cases_2[column] = df_cases_2[column].map(yes_no_codes)

In [15]:
# empty frame carrying only the summary-table column layout (medical history)

summary_cols = [
    'Variable',
    'Statistical Measure',
    'PD duration <=5 yrs',
    'PD duration >5 yrs',
    'Available Sample for Analysis',
]
df_summary_med = pd.DataFrame(columns=summary_cols)

In [16]:
# count and proportion of the medical history variables

med_vars = stats_meas2['n']
res4 = two.count_percent(df_cases_2, med_vars, 'pdsl')

# per-group table, then the two frames that get merged onto it by 'Variable'
by_group_med = two.summaryze_count_percent(df_summary_med, res4, med_vars, 'PD duration <=5 yrs', 'PD duration >5 yrs')
overall_med = two.count_simple(df_cases_2, features=med_vars)
chi2_med = two.chi_squared_tests(df_cases_2, med_vars, group_var='pdsl')

med_table = (
    by_group_med
    .merge(overall_med, on='Variable')
    .merge(chi2_med, on='Variable')
)[ordered_cols].copy()

# section header row: the label in the first column, empty strings in the other six
header_row = pd.DataFrame(data=[["Medical History"] + [""] * 6], columns=ordered_cols)
df_summary_med = pd.concat([header_row, med_table], axis=0, ignore_index=True)

In [17]:
# join the demography/lifestyle and medical-history sections, write the
# combined table to CSV (without the row index) and display it

table_9_csv = os.path.join(folder_path, 'Table_9.csv')

df_9 = pd.concat([df_summary, df_summary_med], ignore_index=True, axis=0)
df_9.to_csv(table_9_csv, index=False)
df_9

Unnamed: 0,Variable,Statistical Measure,PD duration <=5 yrs,PD duration >5 yrs,p-value,Total,Available Sample for Analysis
0,Age at Recruitment (years),mean (SD),59.4 (11.9),61.2 (10.6),p<0.001,60.4 (11.3),7468.0
1,Age at Onset (years),mean (SD),57.2 (12.0),51.7 (11.1),p<0.001,54.2 (11.8),7468.0
2,Body Mass Index,mean (SD),24.3 (3.6),24.2 (3.7),0.174,24.3 (3.6),6670.0
3,Duration of Disease (years),median (IQR),2.0 (1.0 - 3.0),8.0 (6.0 - 12.0),p<0.001,5.0 (2.0 - 9.0),7473.0
4,Years of Education,median (IQR),12.0 (8.0 - 15.0),12.0 (10.0 - 15.0),p<0.001,12.0 (9.0 - 15.0),7095.0
5,Male,n (%),2346 (67.9),2720 (67.7),0.8809,5066 (67.8),7472.0
6,Onset <50 years,n (%),854 (24.7),1596 (39.7),p<0.001,2450 (32.8),7468.0
7,Onset <40 years,n (%),294 (8.5),539 (13.4),p<0.001,833 (11.2),7468.0
8,PD-related Reason for Stop Working,n (%),515 (28.1),894 (37.8),p<0.001,1409 (33.6),4197.0
9,Currently Employed,n (%),1781 (53.2),1856 (47.8),p<0.001,3637 (50.3),7229.0


In [18]:
# also save the combined table as an Excel workbook in the data folder

# The path is built from folder_path rather than repeating the absolute
# directory, so it cannot drift from the CSV location. The context manager
# closes the writer even when to_excel raises.
with pd.ExcelWriter(os.path.join(folder_path, 'Table_9.xlsx')) as to_excel:
    df_9.to_excel(to_excel, index=False)