In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import scipy.stats as stats
from sklearn.model_selection import train_test_split

In [2]:
# Root folder holding the WS4PD CSV files. The original Windows location is kept as the
# default; set the WS4PD_DATA_DIR environment variable to run the notebook on another machine.
direc = os.environ.get("WS4PD_DATA_DIR", "E:\\WS4PD_data")
# The CSVs below are read with paths relative to the data folder, so make it the working directory.
os.chdir(direc)
# load data
demogra_data = pd.read_csv("Demographics_data.csv")
task_score = pd.read_csv("Task_scores_part_I.csv")
# Folder that figures produced by this notebook are written to
save_path = os.path.join(direc, "Figures", "Demographic_and_clinical_data")

In [3]:
# Preview the first five rows of the demographics table
demogra_data.head(n=5)

Unnamed: 0,ROW_ID,ROW_VERSION,subject_id,cohort,gender,birth_year,dominant_hand,upper_limb_length,upper_arm_length,lower_arm_length,...,recording_end,timezone,updrs_time,updrs_score_p1,updrs_score_p2,updrs_score_p3,updrs_score_p4,h_and_y_score,updrs_second_visit_time,updrs_second_visit_score_p3
0,1,1,7_NYC,PD,Male,1956,Right,,33,29,...,14:54:00,Eastern Daylight Time (GMT -4),12:07:00,24,22,55,8,,11:56:00,61
1,2,1,2_NYC,PD,Male,1959,Right,,32,26,...,14:44:00,Eastern Standard Time (GMT -5),12:39:00,13,14,10,13,2.0,9:50:00,22
2,3,1,3_NYC,PD,Male,1969,Right,,32,29,...,14:14:00,Eastern Daylight Time (GMT -4),12:33:00,12,16,16,4,2.0,10:15:00,35
3,4,1,4_NYC,PD,Male,1952,Right,,36,28,...,13:34:00,Eastern Daylight Time (GMT -4),13:44:00,0,7,19,0,2.0,9:40:00,36
4,5,1,5_NYC,PD,Male,1960,Right,,25,28,...,13:02:00,Eastern Daylight Time (GMT -4),12:00:00,12,10,12,11,2.0,8:24:00,37


In [4]:
# Manual correction
# The diagnosis year for subject #20 is 2006-2007, and we choose 2006 as the diagnosis year.
# NOTE(review): .loc[20, ...] selects the row whose index *label* is 20 (not ROW_ID and not
# the numeric part of subject_id). In the split printout further down, index label 20 is
# subject_id 12_BOS — confirm that this is the subject the correction is meant for.
demogra_data.loc[20,'diagnosis_year'] = 2006

In [11]:
# Stratification label: clinic site (subject_id contains 'BOS' or not) combined with
# gender (Male or not), so every split keeps the same site x gender mix.
is_BOS = demogra_data['subject_id'].str.contains('BOS')
is_male = demogra_data['gender'] == 'Male'
y = is_BOS.astype(str) + is_male.astype(str)

# Step 1: hold out 25% of all subjects as the test set
X_train0, X_test = train_test_split(
    demogra_data,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# Rebuild the same site x gender label for the remaining (non-test) subjects
is_BOS_train0 = X_train0['subject_id'].str.contains('BOS')
is_male_train0 = X_train0['gender'] == 'Male'
y_train0 = is_BOS_train0.astype(str) + is_male_train0.astype(str)

# Step 2: carve 25% of the remaining subjects out as the validation set
X_train, X_val = train_test_split(
    X_train0,
    test_size=0.25,
    random_state=21,
    stratify=y_train0,
)

In [12]:
# List the subject ids that ended up in the train, validation and test splits (in that order)
for X in (X_train, X_val, X_test):
    print(X['subject_id'])
    

14     6_BOS
24    16_BOS
0      7_NYC
22    14_BOS
6      8_NYC
13     5_BOS
10    12_NYC
5      6_NYC
25    17_BOS
12     4_BOS
19    11_BOS
18    10_BOS
23    15_BOS
3      4_NYC
9     11_NYC
Name: subject_id, dtype: object
16     8_BOS
26    18_BOS
1      2_NYC
7      9_NYC
11     3_BOS
17     9_BOS
Name: subject_id, dtype: object
27    19_BOS
2      3_NYC
15     7_BOS
4      5_NYC
21    13_BOS
8     10_NYC
20    12_BOS
Name: subject_id, dtype: object


In [None]:
# Gender
# Print the number of male and female subjects, first for the train split, then for the test split
for data in (X_train, X_test):
    is_male = data['gender'] == 'Male'
    is_female = data['gender'] == 'Female'
    print(f"Male: {sum(is_male)}")
    print(f"Female: {sum(is_female)}")

In [None]:
# Clinic
# Per split (train, then test): subjects per clinic site, broken down by gender.
# Site membership is read from the subject_id string ('BOS' / 'NY' substring).
for data in (X_train, X_test):
    is_BOS = data['subject_id'].str.contains('BOS')
    is_NY = data['subject_id'].str.contains('NY')
    is_male = data['gender'] == 'Male'
    is_female = data['gender'] == 'Female'

    bos_total = sum(is_BOS)
    bos_male = sum(is_BOS & is_male)
    bos_female = sum(is_BOS & is_female)
    print(f"Boston group: {bos_total} ({bos_male}M{bos_female}F)")

    ny_total = sum(is_NY)
    ny_male = sum(is_NY & is_male)
    ny_female = sum(is_NY & is_female)
    print(f"New York group: {ny_total} ({ny_male}M{ny_female}F)")

In [None]:
# Age, UPDRS score (on and off states)
# One panel per feature; each panel shows a Train bar and a Test bar (mean, with the
# standard deviation as the error bar) so the two splits can be compared visually.
visit_year  = 2015 # Data was collected in 2015
width = 0.5

# (y-axis label, function that extracts the plotted values from a split), top to bottom
panels = [
    ('Age', lambda df: visit_year - df['birth_year']),
    ('UPDRS score (on med.)', lambda df: df['updrs_score_p3']),
    ('UPDRS score (off med.)', lambda df: df['updrs_second_visit_score_p3']),
]

fig, axs = plt.subplots(figsize=(5,15),nrows=3, ncols=1)
for ax, (ylabel, get_feature) in zip(axs, panels):
    # Bar position 0 = train split, 1 = test split
    for a, data in enumerate([X_train, X_test]):
        feature = get_feature(data)
        feature_mean = np.mean(feature)
        feature_std = np.std(feature)
        ax.bar(a, feature_mean, width, yerr=feature_std)
    # Axis decoration does not depend on the split, so apply it once per panel
    ax.set_ylabel(ylabel)
    ax.set_xticks([0,1], labels=['Train', 'Test'])

# save figure
# savefig raises if the target folder is missing, so create it first
os.makedirs(save_path, exist_ok=True)
fig.savefig(os.path.join(save_path,"Train_Test_comp"))

In [None]:
# Preview the first five rows of the training split
X_train.head(n=5)

In [None]:
# Subject ids assigned to the training split
X_train['subject_id']

In [None]:
# Preview the first five rows of the test split
X_test.head(n=5)