# Comparison between patients and controls

In this notebook we analyse data that were matched on the `sex` and `age_at_onset` variables. The summary statistics compare patients with controls.

In [1]:
# imports

import os
import sys
sys.path.append('/mnt/0A2AAC152AABFBB7/CGE/luxgiant-clinical')
import luxgiant_clinical.TwoCatAnalysis as two

import pandas as pd
import numpy as np

In [2]:
# Load the cleaned clinical data and keep only the matched participants.

folder_path = '/mnt/0A2AAC152AABFBB7/CGE/luxgiant-clinical/data'

# Both files are read the same way: first CSV column becomes the index and
# the file is parsed in one pass (low_memory=False).
read_opts = dict(index_col=0, low_memory=False)

df = pd.read_csv(os.path.join(folder_path, 'cleaned_file.csv'), **read_opts)
matched_ids = pd.read_csv(os.path.join(folder_path, 'matched_ids.csv'), **read_opts)

# Default (inner) merge: only participants present in both files are kept.
df = pd.merge(df, matched_ids, on='participant_id')

## Summary statistics for demography and lifestyle

In [3]:
# First group of variables (demography and lifestyle).
# Each entry is (raw column name, display label, summary statistic), where the
# statistic is one of 'mean', 'median' or 'n' (count / proportion).

variables_spec_1 = [
    ("age",                        "Age at Recruitment (years)",                                   "mean"),
    ("age_at_onset",               "Age at Onset (years)",                                         "mean"),
    ("agecat_1",                   "Onset <50 years",                                              "n"),
    ("agecat_2",                   "Onset <40 years",                                              "n"),
    ("PD_duration",                "Duration of Disease (years)",                                  "median"),
    ("years_of_education",         "Years of Education",                                           "median"),
    ("bmi_comp",                   "Body Mass Index",                                              "mean"),
    ("work",                       "Currently Employed",                                           "n"),
    ("reason",                     "PD-related Reason for Stop Working",                           "n"),
    ("nature_of_work___1",         "Agriculture as a job",                                         "n"),
    ("over_your_lifetime_have_yo", "Life time direct exposure to pesticide/insecticide/fungicide", "n"),
    ("during_your_lifetime_did_y", "Exposure to chemicals at home",                                "n"),
    ("in_your_lifetime_have_you",  "Smoked 100 or more cigarettes in lifetime",                    "n"),
    ("in_your_lifetime_have",      "Regular consumption of caffeinated drinks for >6months",       "n"),
    ("have_you_ever_had_a_head_i", "Head injury or concussion",                                    "n"),
]

# raw column name -> display label (insertion order follows the spec above)
variables_dict_1 = {raw: label for raw, label, _ in variables_spec_1}

# columns to pull from the full data set: identifiers first, then the raw names
variables_1 = ['participant_id', 'Status', *variables_dict_1]

# statistical measures: display labels grouped by the statistic reported for them
stats_meas = {
    measure: [label for _, label, kind in variables_spec_1 if kind == measure]
    for measure in ('mean', 'n', 'median')
}

# order of the comparison groups found in the 'Status' column
groups = ['Control', 'Patient']

In [4]:
# Working copy for the first table: only the identifier columns and the
# selected variables, with the raw column names replaced by display labels.

df_1 = df.loc[:, variables_1].rename(columns=variables_dict_1)

In [5]:
# For control rows, the "age at onset" column is overwritten with the age at
# recruitment (presumably because controls have no onset age of their own --
# confirm with the matching procedure).
onset_col   = "Age at Onset (years)"
recruit_col = "Age at Recruitment (years)"

mask_control = df_1['Status'].eq('Control')

df_1.loc[mask_control, onset_col] = df_1.loc[mask_control, recruit_col]

In [6]:
# recode variable values
#
# Every categorical answer is turned into 1 / 0 / NaN. `Series.map` sends any
# value that is not a key of the mapping to NaN, so the 'Dont Know' /
# 'Refused' / 'Possibly' entries below are listed only to make the intent
# explicit.
#
# The recoded columns are rebuilt from the *raw* columns of `df` rather than
# from `df_1` itself. Mapping a column onto itself is not idempotent: on a
# second execution of the cell the already-recoded 0/1 values are no longer
# keys of the mapping and the whole column silently becomes NaN.

yes_no         = {'Yes': 1, 'No': 0}
yes_no_unknown = {'Yes': 1, 'No': 0, 'Dont Know': np.nan, 'Refused': np.nan}

# display label -> mapping applied to the corresponding raw column
recode_maps_1 = {
    "Onset <50 years"                   : {'<50': 1, '>=50': 0},
    "Onset <40 years"                   : {'<40': 1, '>=40': 0},
    "Currently Employed"                : yes_no,
    "Agriculture as a job"              : {'Checked': 1, 'Unchecked': 0},
    "PD-related Reason for Stop Working": {'PD related': 1, 'not PD related': 0},
    "Life time direct exposure to pesticide/insecticide/fungicide": yes_no_unknown,
    "Exposure to chemicals at home"                               : yes_no_unknown,
    "Smoked 100 or more cigarettes in lifetime"                   : yes_no_unknown,
    "Regular consumption of caffeinated drinks for >6months"      : yes_no_unknown,
    "Head injury or concussion"         : {**yes_no_unknown, 'Possibly': np.nan},
}

# display label -> raw column name in `df`
label_to_raw_1 = {label: raw for raw, label in variables_dict_1.items()}

for label, codes in recode_maps_1.items():
    # assignment aligns on the index, which df_1 shares with df
    df_1[label] = df[label_to_raw_1[label]].map(codes)

In [7]:
# Summary tables for the first group of variables, one per kind of statistic.
# The three report functions receive the same data, groups and grouping column;
# only the list of variables differs.
summary_kwargs = {
    'data_df'    : df_1,
    'groups'     : groups,
    'grouping_by': 'Status',
}

df_sum1 = two.report_mean_std(variables=stats_meas['mean'], **summary_kwargs)
df_sum2 = two.report_median_iqr(variables=stats_meas['median'], **summary_kwargs)
df_sum3 = two.report_proportion(variables=stats_meas['n'], **summary_kwargs)

## Analysis of patients' medical history

In [8]:
# Second group of variables (medical and family history):
# pairs of (raw column name, display label).

medical_history_pairs = (
    ("medical_history_neurologic___7", "Stroke"),
    ("medical_history_cancer___1",     "Melanoma"),
    ("medical_history_cancer___2",     "Prostate Cancer"),
    ("medical_history_cancer___3",     "Other Cancers"),
    ("medical_history_metabolic___1",  "Diabetes"),
    ("medical_history_metabolic___3",  "High Cholesterol"),
    ("medical_history_cardiovasc___2", "Hypertension"),
    ("medical_history_cardiovasc___3", "History of CAD"),
    ("medical_history_psychologi___1", "History of Anxiety"),
    ("medical_history_psychologi___3", "History of Depression"),
    ("family_member_diagnosed_wi",     "Family History of PD"),
    ("family_member_diagnosed_wi_2",   "Family History of Tremor"),
    ("family_member_diagnosed",        "Family History of Dementia"),
)

# raw column name -> display label
variables_dict_2 = dict(medical_history_pairs)

# columns to pull from the full data set: identifiers first, then the raw names
variables_2 = ['participant_id', 'Status', *variables_dict_2]

# statistical measures: every variable of this group is reported as a count / proportion
stats_meas2 = {'n': list(variables_dict_2.values())}

In [9]:
# Working copy for the medical-history table: only the identifier columns and
# the selected variables, with raw column names replaced by display labels.

df_2 = df.loc[:, variables_2].rename(columns=variables_dict_2)

In [10]:
# recode variable values
#
# Checkbox items are coded 'Checked' / 'Unchecked' and family-history items
# 'Yes' / 'No'; both become 1 / 0. `Series.map` turns any other value into NaN.
#
# The recoded columns are rebuilt from the *raw* columns of `df` rather than
# from `df_2` itself. Mapping a column onto itself is not idempotent: on a
# second execution of the cell the already-recoded 0/1 values are no longer
# keys of the mapping and the whole column silently becomes NaN.

checked_codes = {"Checked": 1, "Unchecked": 0}
yes_no_codes  = {"Yes": 1, "No": 0}

checkbox_vars = [
    "Stroke", "Melanoma", "Prostate Cancer", "Other Cancers", "Diabetes",
    "High Cholesterol", "Hypertension", "History of CAD",
    "History of Anxiety", "History of Depression",
]
family_vars = [
    "Family History of PD", "Family History of Tremor", "Family History of Dementia",
]

# display label -> mapping applied to the corresponding raw column
recode_maps_2 = {
    **{label: checked_codes for label in checkbox_vars},
    **{label: yes_no_codes for label in family_vars},
}

# display label -> raw column name in `df`
label_to_raw_2 = {label: raw for raw, label in variables_dict_2.items()}

for label, codes in recode_maps_2.items():
    # assignment aligns on the index, which df_2 shares with df
    df_2[label] = df[label_to_raw_2[label]].map(codes)

In [11]:
# Count and proportion of each medical-history variable per group,
# reported under the 'Medical History' subheader.

medical_history_args = dict(
    data_df=df_2,
    variables=stats_meas2['n'],
    groups=groups,
    grouping_by='Status',
    subheader='Medical History',
)
df_sum4 = two.report_proportion(**medical_history_args)

In [12]:
# Stack the four partial summaries into one table, apply the final formatting
# and save the result as a CSV file.

summary_parts = [df_sum1, df_sum2, df_sum3, df_sum4]
df_4 = two.final_formatter(
    pd.concat(summary_parts, axis=0, ignore_index=True),
    groups,
)

df_4.to_csv('/mnt/0A2AAC152AABFBB7/CGE/luxgiant-clinical/data/Table_4.csv', index=False)

# last expression of the cell: display the final table
df_4

Unnamed: 0,Variable,Statistical Measure,Control,Patient,p-value,Total,Available Samples for Analysis
0,Age at Recruitment (years),mean (SD),50.0 (12.6),56.7 (12.0),p<0.001,53.3 (12.7),6640.0
1,Age at Onset (years),mean (SD),50.0 (12.6),50.0 (12.6),0.9999,50.0 (12.6),6640.0
2,Body Mass Index,mean (SD),25.0 (3.8),24.2 (3.7),p<0.001,24.6 (3.7),5979.0
3,Duration of Disease (years),median (IQR),2.0 (2.0 - 11.0),5.0 (3.0 - 10.0),0.4396,5.0 (3.0 - 10.0),3325.0
4,Years of Education,median (IQR),12.0 (9.0 - 15.0),12.0 (8.0 - 15.0),0.0149,12.0 (8.0 - 15.0),6365.0
5,Onset <50 years,n (%),1642 (49.5),1642 (49.5),0.9999,3284 (49.5),6640.0
6,Onset <40 years,n (%),733 (22.1),733 (22.1),0.9999,1466 (22.1),6640.0
7,Currently Employed,n (%),2380 (76.2),1901 (58.5),p<0.001,4281 (67.2),6371.0
8,PD-related Reason for Stop Working,n (%),0 (0.0),722 (40.8),p<0.001,722 (34.1),2115.0
9,Agriculture as a job,n (%),383 (11.5),502 (15.1),p<0.001,885 (13.3),6640.0


In [13]:
# Save the final table as an Excel workbook as well.
# The writer is used as a context manager so the file is saved and closed even
# if `DataFrame.to_excel` raises; the manual open / close pair leaked the
# handle in that case. The variable name `to_excel` is kept unchanged.
with pd.ExcelWriter('/mnt/0A2AAC152AABFBB7/CGE/luxgiant-clinical/data/Table_4.xlsx') as to_excel:
    df_4.to_excel(to_excel, index=False)