# Comparison between male and female patients with PD

This notebook presents summary statistics for the whole cohort of PD patients, stratified by sex.

In [1]:
# imports

import os
import sys
sys.path.append('/mnt/0A2AAC152AABFBB7/CGE/luxgiant-clinical')
import luxgiant_clinical.MaleFemale as mf

import pandas as pd

In [2]:
# Load the cleaned cohort table.
folder_path = '/mnt/0A2AAC152AABFBB7/CGE/luxgiant-clinical/data'
cleaned_file_path = os.path.join(folder_path, 'cleaned_file.csv')

# The first CSV column becomes the index. low_memory=False makes pandas parse the file
# in one pass instead of in chunks, which avoids mixed-type inference within a column.
df = pd.read_csv(cleaned_file_path, index_col=0, low_memory=False)

In [3]:
# Keep only rows whose Status is 'Patient' and whose sex is recorded.
mask_patients = df['Status'] == 'Patient'
mask_gender = df['sex'].notna()

# Boolean selection followed by a fresh 0..n-1 index.
df_cases = df.loc[mask_patients & mask_gender].reset_index(drop=True)

# The unfiltered table is not used below; release the name.
del df

## Summary statistics for demography and lifestyle

In [4]:
# Demography / lifestyle variables: raw column name -> label used in the summary table.
variables_dict_1 = {
    "age": "Age at Recruitment (years)",
    "age_at_onset": "Age at Onset (years)",
    "agecat_1": "Onset <50 years",
    "agecat_2": "Onset <40 years",
    "PD_duration": "Duration of Disease (years)",
    "years_of_education": "Years of Education",
    "bmi_comp": "Body Mass Index",
    "work": "Currently Employed",
    "nature_of_work___1": "Agriculture as a job",
    "over_your_lifetime_have_yo": "Life time direct exposure to pesticide/insecticide/fungicide",
    "during_your_lifetime_did_y": "Exposure to chemicals to kill insects in your life time",
    "in_your_lifetime_have_you": "Smoked 100 or more cigarettes in lifetime",
    "in_your_lifetime_have": "Regular consumption of caffeinated drinks for >6months",
    "have_you_ever_had_a_head_i": "Head injury or concussion",
}

# Raw columns to pull from the cohort table: identifier, grouping variable, then the
# variables above in declaration order.
variables_1 = ['participant_id', 'sex', *variables_dict_1]

# Labels grouped by the statistic reported for them. The labels are looked up from
# variables_dict_1 by raw column name so the two structures cannot drift apart.
stats_meas = {
    'mean': [
        variables_dict_1[raw]
        for raw in ("age", "age_at_onset", "bmi_comp")
    ],
    'n': [
        variables_dict_1[raw]
        for raw in (
            "agecat_1",
            "agecat_2",
            "work",
            "nature_of_work___1",
            "over_your_lifetime_have_yo",
            "during_your_lifetime_did_y",
            "in_your_lifetime_have_you",
            "in_your_lifetime_have",
            "have_you_ever_had_a_head_i",
        )
    ],
    'median': [
        variables_dict_1[raw]
        for raw in ("PD_duration", "years_of_education")
    ],
}

In [None]:
# Take an independent copy of the first group of columns and switch the raw column
# names to their display labels.
df_cases_1 = (
    df_cases
    .loc[:, variables_1]
    .copy()
    .rename(columns=variables_dict_1)
)

In [5]:
# Recode the categorical answers as 0/1 indicators.
# Series.map turns every value that is not a key of the mapping into NaN.

# Variables with their own two-level coding.
recode_maps_1 = {
    "Onset <50 years": {'<50':1, '>=50':0},
    "Onset <40 years": {'<40':1, '>=40':0},
    "Currently Employed": {'Yes':1, 'No':0},
    "Agriculture as a job": {'Checked':1, 'Unchecked':0},
}

# Questionnaire items sharing one coding: only 'Yes' becomes 1, while 'Dont Know' and
# 'Refused' are coded 0 together with 'No'.
# NOTE(review): coding non-answers as 0 (not missing) presumably keeps them in the
# denominator of the percentages - confirm this is intended.
questionnaire_items_1 = [
    "Life time direct exposure to pesticide/insecticide/fungicide",
    "Exposure to chemicals to kill insects in your life time",
    "Smoked 100 or more cigarettes in lifetime",
    "Regular consumption of caffeinated drinks for >6months",
    "Head injury or concussion",
]
for _item in questionnaire_items_1:
    recode_maps_1[_item] = {'Yes':1, 'No':0, 'Dont Know':0, 'Refused':0}

# Apply every mapping to its column in place of the original text values.
for _column, _codes in recode_maps_1.items():
    df_cases_1[_column] = df_cases_1[_column].map(_codes)

In [6]:
# Empty frame with the summary-table columns; it is handed to the mf.summaryze_* helpers
# in the cells below.
summary_cols = [
    'Variable',
    'Statistical Measure',
    'Male',
    'Female',
    'Available Sample for Analysis',
]
df_summary = pd.DataFrame(columns=summary_cols)

In [7]:
# Variables reported as mean (SD): per-sex values, whole-cohort values and the
# between-sex test, combined into one table.

res1 = mf.mean_std(df_cases_1, stats_meas['mean'], grouping_by='sex')

# NOTE(review): the group labels are passed as 'Female', 'Male' while the summary columns
# are ordered Male, Female - confirm the helper matches groups by label, not by position.
by_sex_mean = mf.summaryze_mean_std(df_summary, res1, stats_meas['mean'], 'Female', 'Male')
overall_mean = mf.mean_std_simple(df_cases_1, features=stats_meas['mean'])
tests_mean = mf.t_test_by_group(df_cases_1, stats_meas['mean'], group_var='sex')

# The first merge has no `on=`, so pandas joins on every column the two frames share.
# NOTE(review): the sibling cells join explicitly on 'Variable' - confirm which columns
# mean_std_simple returns before making this one explicit as well.
df_summary1 = by_sex_mean.merge(overall_mean).merge(tests_mean, on='Variable')

In [8]:
# Variables reported as n (%): per-sex values, whole-cohort values and the between-sex
# test, joined on the 'Variable' column.

res2 = mf.count_percent(df_cases_1, stats_meas['n'], 'sex')

by_sex_count = mf.summaryze_count_percent(df_summary, res2, stats_meas['n'], 'Female', 'Male')
overall_count = mf.count_simple(df_cases_1, features=stats_meas['n'])
tests_count = mf.chi_squared_tests(df_cases_1, stats_meas['n'], group_var='sex')

df_summary2 = (
    by_sex_count
    .merge(overall_count, on='Variable')
    .merge(tests_count, on='Variable')
)

In [9]:
# Remaining continuous variables, reported as median (IQR): per-sex values, whole-cohort
# values and the between-sex test, joined on the 'Variable' column.

res3 = mf.median_iqr(df_cases_1, stats_meas['median'], 'sex')

by_sex_median = mf.summaryze_median_iqr(df_summary, res3, stats_meas['median'], 'Female', 'Male')
overall_median = mf.median_iqr_simple(df_cases_1, stats_meas['median'])
tests_median = mf.mann_whitney(df_cases_1, stats_meas['median'], 'sex')

df_summary3 = (
    by_sex_median
    .merge(overall_median, on='Variable')
    .merge(tests_median, on='Variable')
)

In [10]:
# Column order of the final table.
ordered_cols = [
    'Variable',
    'Statistical Measure',
    'Male',
    'Female',
    'p-value',
    'Total',
    'Available Sample for Analysis',
]

# Stack the partial tables (mean rows, then median rows, then count rows) with a fresh
# row index, and keep only the columns above in that order.
# NOTE: this rebinds `df_summary`, which until here was the empty template passed to the
# mf.summaryze_* calls; re-running those cells after this one passes them this table.
df_summary = (
    pd.concat([df_summary1, df_summary3, df_summary2], axis=0, ignore_index=True)
    .loc[:, ordered_cols]
    .copy()
)

## Analysis of patients' medical history

In [11]:
# Medical and family history variables: raw column name -> label used in the summary table.
variables_dict_2 = {
    "medical_history_neurologic___7": "Stroke",
    "medical_history_cancer___1": "Melanoma",
    "medical_history_cancer___2": "Prostate Cancer",
    "medical_history_cancer___3": "Other Cancers",
    "medical_history_metabolic___1": "Diabetes",
    "medical_history_metabolic___4": "High Cholesterol",
    "medical_history_cardiovasc___2": "Hypertension",
    "medical_history_cardiovasc___3": "History of CAD",
    "medical_history_psychologi___1": "History of Anxiety",
    "medical_history_psychologi___3": "History of Depression",
    "family_member_diagnosed_wi": "Family History of PD",
    "family_member_diagnosed_wi_2": "Family History of Tremor",
    "family_member_diagnosed": "Family History of Dementia",
}

# Raw columns to pull from the cohort table: identifier, grouping variable, then the
# variables above in declaration order.
variables_2 = ['participant_id', 'sex', *variables_dict_2]

# Every variable in this group is reported as a count, in declaration order.
stats_meas2 = {
    'n': list(variables_dict_2.values()),
}

In [None]:
# Take an independent copy of the second group of columns and switch the raw column
# names to their display labels.
df_cases_2 = (
    df_cases
    .loc[:, variables_2]
    .copy()
    .rename(columns=variables_dict_2)
)

In [12]:
# Recode the categorical answers as 0/1 indicators.
# Series.map turns every value that is not a key of the mapping into NaN.

# Checkbox-style items: "Checked" -> 1, "Unchecked" -> 0.
checkbox_items_2 = [
    "Stroke",
    "Melanoma",
    "Prostate Cancer",
    "Other Cancers",
    "Diabetes",
    "High Cholesterol",
    "Hypertension",
    "History of CAD",
    "History of Anxiety",
    "History of Depression",
]
for _column in checkbox_items_2:
    df_cases_2[_column] = df_cases_2[_column].map({"Checked":1, "Unchecked":0})

# Family-history items: "Yes" -> 1, "No" -> 0.
family_items_2 = [
    "Family History of PD",
    "Family History of Tremor",
    "Family History of Dementia",
]
for _column in family_items_2:
    df_cases_2[_column] = df_cases_2[_column].map({"Yes":1, "No":0})

In [13]:
# Empty frame with the summary-table columns, used as the starting point for the
# medical-history table.
summary_cols = [
    'Variable',
    'Statistical Measure',
    'Male',
    'Female',
    'Available Sample for Analysis',
]
df_summary_med = pd.DataFrame(columns=summary_cols)

In [14]:
# Medical-history variables reported as n (%): per-sex values, whole-cohort values and the
# between-sex test, joined on 'Variable' and preceded by a "Medical History" heading row.

res4 = mf.count_percent(df_cases_2, stats_meas2['n'], 'sex')

# Start from a fresh empty template instead of `df_summary_med` itself. This cell rebinds
# `df_summary_med` to the finished table, so feeding that name back in would make a second
# run of the cell start from an already populated frame.
med_by_sex = mf.summaryze_count_percent(
    pd.DataFrame(columns=summary_cols), res4, stats_meas2['n'], 'Female', 'Male'
)
med_overall = mf.count_simple(df_cases_2, features=stats_meas2['n'])
med_tests = mf.chi_squared_tests(df_cases_2, stats_meas2['n'], group_var='sex')

med_table = (
    med_by_sex
    .merge(med_overall, on='Variable')
    .merge(med_tests, on='Variable')
    .loc[:, ordered_cols]
    .copy()
)

# Heading row: the section title in the first column, empty strings in all the others.
# Its width follows ordered_cols instead of being spelled out cell by cell.
med_header = pd.DataFrame(
    data=[["Medical History"] + [""] * (len(ordered_cols) - 1)],
    columns=ordered_cols,
)

df_summary_med = pd.concat([med_header, med_table], ignore_index=True, axis=0)

In [15]:
# Stack the demography/lifestyle table and the medical-history table, then save the result
# as Table_1.csv.

df = pd.concat([df_summary, df_summary_med], axis=0, ignore_index=True)

# Write next to the input data, reusing `folder_path` from the load cell rather than
# repeating the absolute directory; the resulting path is unchanged. The row index is not
# written to the file.
df.to_csv(os.path.join(folder_path, 'Table_1.csv'), index=False)
df

Unnamed: 0,Variable,Statistical Measure,Male,Female,p-value,Total,Available Sample for Analysis
0,Age at Recruitment (years),mean (SD),60.9 (11.4),59.7 (11.0),p<0.001,60.5 (11.3),7905.0
1,Age at Onset (years),mean (SD),54.6 (11.9),53.5 (11.5),p<0.001,54.2 (11.8),7467.0
2,Body Mass Index,mean (SD),24.3 (3.5),24.2 (4.0),0.1928,24.3 (3.6),6904.0
3,Duration of Disease (years),median (IQR),5.0 (2.0 - 9.0),5.0 (2.0 - 9.0),0.6474,5.0 (2.0 - 9.0),7472.0
4,Years of Education,median (IQR),12.0 (10.0 - 15.0),10.0 (5.0 - 15.0),p<0.001,12.0 (9.0 - 15.0),7365.0
5,Onset <50 years,n (%),1591 (31.4),858 (35.7),p<0.001,2449 (30.9),7467.0
6,Onset <40 years,n (%),565 (11.2),268 (11.2),1.0,833 (10.5),7467.0
7,Currently Employed,n (%),2816 (54.7),943 (38.6),p<0.001,3759 (47.5),7585.0
8,Agriculture as a job,n (%),906 (16.9),246 (9.6),p<0.001,1152 (14.5),7918.0
9,Life time direct exposure to pesticide/insecti...,n (%),1154 (21.7),382 (15.0),p<0.001,1536 (19.4),7864.0
