In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from pandas.core.common import SettingWithCopyWarning

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [2]:
direc = "E:\\WS4PD_data"
os.chdir(direc)
# load data
demogra_data = pd.read_csv("Demographics_data.csv")
score_file_path = os.path.join(direc, 'Feature_extraction','score_by_device.pkl')
df_score = pd.read_pickle(score_file_path)
save_path = os.path.join(direc, "Feature_extraction")

In [3]:
# Manual correction
# The diagnosis year for subject #20 is 2006-2007, and we choose 2006 as the diagnosis year 
demogra_data.loc[20,'diagnosis_year'] = 2006

In [4]:
demogra_data.columns

Index(['ROW_ID', 'ROW_VERSION', 'subject_id', 'cohort', 'gender', 'birth_year',
       'dominant_hand', 'upper_limb_length', 'upper_arm_length',
       'lower_arm_length', 'lower_limb_length', 'thigh_length', 'shank_length',
       'height', 'weight', 'visit_date', 'diagnosis_day', 'diagnosis_month',
       'diagnosis_year', 'pd_most_affected_side', 'gait_impediments',
       'posture_instability', 'tremor', 'bradykinesia', 'disrupted_sleep',
       'freeze_of_gait', 'dyskinesia', 'rigidity', 'other_symptoms',
       'last_levodopa_dose_timestamp', 'regular_medication', 'geneactive_num',
       'pebble_num', 'geneactive_hand', 'pebble_hand', 'smartphone_location',
       'recording_start', 'recording_end', 'timezone', 'updrs_time',
       'updrs_score_p1', 'updrs_score_p2', 'updrs_score_p3', 'updrs_score_p4',
       'h_and_y_score', 'updrs_second_visit_time',
       'updrs_second_visit_score_p3'],
      dtype='object')

In [5]:
# Get age and age diagnosis
visit_year  = 2015 # Data was collected in 2015
age = visit_year - demogra_data['birth_year']
demogra_data['age'] = age
age_diagnosis = demogra_data['diagnosis_year'].astype('int64') - demogra_data['birth_year']
demogra_data['age_diagnosis'] = age_diagnosis

In [6]:
demogra_data.dtypes

ROW_ID                            int64
ROW_VERSION                       int64
subject_id                       object
cohort                           object
gender                           object
birth_year                        int64
dominant_hand                    object
upper_limb_length               float64
upper_arm_length                  int64
lower_arm_length                  int64
lower_limb_length               float64
thigh_length                      int64
shank_length                      int64
height                          float64
weight                          float64
visit_date                      float64
diagnosis_day                   float64
diagnosis_month                 float64
diagnosis_year                   object
pd_most_affected_side            object
gait_impediments                 object
posture_instability              object
tremor                           object
bradykinesia                     object
disrupted_sleep                  object


In [7]:
demogra_data.isnull().sum()

ROW_ID                           0
ROW_VERSION                      0
subject_id                       0
cohort                           0
gender                           0
birth_year                       0
dominant_hand                    0
upper_limb_length               11
upper_arm_length                 0
lower_arm_length                 0
lower_limb_length               11
thigh_length                     0
shank_length                     0
height                           1
weight                           1
visit_date                       1
diagnosis_day                   27
diagnosis_month                 16
diagnosis_year                   0
pd_most_affected_side            0
gait_impediments                 0
posture_instability              0
tremor                           0
bradykinesia                     0
disrupted_sleep                  0
freeze_of_gait                   0
dyskinesia                       0
rigidity                         0
other_symptoms      

In [8]:
demogra_data.describe()

Unnamed: 0,ROW_ID,ROW_VERSION,birth_year,upper_limb_length,upper_arm_length,lower_arm_length,lower_limb_length,thigh_length,shank_length,height,...,last_levodopa_dose_timestamp,geneactive_num,updrs_score_p1,updrs_score_p2,updrs_score_p3,updrs_score_p4,h_and_y_score,updrs_second_visit_score_p3,age,age_diagnosis
count,28.0,28.0,28.0,17.0,28.0,28.0,17.0,28.0,28.0,27.0,...,27.0,28.0,28.0,28.0,28.0,28.0,27.0,28.0,28.0,28.0
mean,14.5,1.0,1952.5,55.176471,31.607143,27.285714,90.441176,46.75,42.321429,174.574074,...,1431915000.0,17826.678571,10.785714,13.571429,34.535714,6.535714,2.259259,40.857143,62.5,53.392857
std,8.225975,0.0,8.962886,4.362271,3.413907,2.992053,5.261542,7.346579,9.084189,11.612645,...,6415554.0,3137.802227,6.285534,5.245305,18.101529,3.553633,0.594371,17.721538,8.962886,9.765166
min,1.0,1.0,1935.0,49.0,25.0,21.0,81.0,34.0,4.0,147.5,...,1423568000.0,1816.0,0.0,1.0,7.0,0.0,2.0,6.0,46.0,35.0
25%,7.75,1.0,1946.5,51.0,29.0,25.0,88.0,40.0,39.75,166.5,...,1427204000.0,18415.0,6.5,10.0,18.25,4.0,2.0,30.0,56.0,46.75
50%,14.5,1.0,1953.5,55.0,31.5,28.0,90.0,48.0,42.5,176.5,...,1430754000.0,18416.0,12.0,14.0,32.5,7.0,2.0,40.5,61.5,53.0
75%,21.25,1.0,1959.0,58.0,34.0,29.0,94.0,53.0,46.25,181.65,...,1435013000.0,18425.0,14.0,16.0,50.25,8.0,2.0,53.0,68.5,58.75
max,28.0,1.0,1969.0,64.0,40.0,33.0,102.0,60.0,58.0,195.0,...,1445945000.0,18426.0,24.0,23.0,66.0,13.0,4.0,76.0,80.0,74.0


In [9]:
# Get age group
age_bound = np.arange(30,100,10)
age_group = np.empty((len(age),1))
age_group[:] = np.NaN
for i,item in enumerate(age_bound[:-1]):
    is_group = (item <= age) & (age < age_bound[i+1])
    age_group[is_group] = i
demogra_data['age_group'] = age_group
# Get age diagnosis group
age_diagnosis_group = np.empty((len(age_diagnosis),1))
age_diagnosis_group[:] = np.NaN
for i,item in enumerate(age_bound[:-1]):
    is_group = (item <= age_diagnosis) & (age_diagnosis < age_bound[i+1])
    age_diagnosis_group[is_group] = i
demogra_data['age_diagnosis_group'] = age_diagnosis_group

In [10]:
# Get updrs score group
score_bound = np.arange(0,100,20)
updrs_score_groups = ['updrs_score_p1', 'updrs_score_p2', 'updrs_score_p3', 'updrs_score_p4',
                     'updrs_second_visit_score_p3']
for i, group_name in enumerate(updrs_score_groups):
    updrs_score = demogra_data[group_name]
    new_group_name = group_name + '_group'
    score_group = np.empty((len(demogra_data),1))
    score_group[:] = np.NaN
    for j, bound in enumerate(score_bound[:-1]):
        is_group = (bound <= updrs_score) & (updrs_score < score_bound[j+1])
        score_group[is_group] = i
    demogra_data[new_group_name] = score_group

In [11]:
df_score

Unnamed: 0,subject_id,visit,session,task_code,repetition,tremor_GENEActivHand,tremor_PebbleHand,dyskinesia_GENEActivHand,dyskinesia_PebbleHand,bradykinesia_GENEActivHand,bradykinesia_PebbleHand
0,3_BOS,1,1,stndg,1,0,0,Yes,Yes,NotApplicable,NotApplicable
1,3_BOS,1,1,wlkgs,1,0,0,Yes,Yes,No,No
2,3_BOS,1,1,wlkgc,1,0,0,Yes,Yes,No,No
3,3_BOS,1,1,strsu,1,0,0,Yes,No,No,No
4,3_BOS,1,1,strsd,1,0,0,No,Yes,No,No
...,...,...,...,...,...,...,...,...,...,...,...
6904,12_NYC,2,8,ntblt,1,0,0,No,No,NotApplicable,NotApplicable
6905,12_NYC,2,8,drnkg,1,0,0,No,No,No,No
6906,12_NYC,2,8,orgpa,1,0,0,No,No,No,No
6907,12_NYC,2,8,fldng,1,0,0,No,No,No,No


In [12]:
# Get task group
# task categories
task_groups = [['stndg', 'sittg'],['wlkgs', 'wlkgc', 'wlkgp', 'strsu', 'strsd', 'ststd'],
               ['ftnr', 'ftnl', 'ramr', 'raml', 'drawg', 'typng', 'ntblt', 'drnkg', 'orgpa', 'fldng']]
task_group_names = ['no_voluntary_movement','lowerlimb_movement', 'upperlimb_movement']


In [13]:
df_score['task_group'] = np.NaN
for i,task in enumerate(task_groups):
    is_task = df_score['task_code'].isin(task).tolist()
    df_score['task_group'].loc[is_task] = i

In [14]:
df_meta = pd.DataFrame()
df_meta['subject_id'] = df_score['subject_id']
# select task data to be included in training models
feature_task = ['visit', 'session', 'task_code', 'task_group']
for ft in feature_task:
    df_meta[ft] = df_score[ft]

In [15]:
# select demographic data to be included in training models
feature_demogra = ['gender','age_group', 'age_diagnosis_group', 'dominant_hand', 'pd_most_affected_side',
                  'gait_impediments', 'posture_instability', 'tremor', 'bradykinesia', 'disrupted_sleep',
                  'freeze_of_gait', 'dyskinesia', 'rigidity', 'updrs_score_p1_group', 'updrs_score_p2_group',
                  'updrs_score_p3_group', 'updrs_score_p4_group', 'updrs_second_visit_score_p3_group']
subject_ids = df_meta.subject_id.unique()
for fd in feature_demogra:
    df_meta[fd] = np.nan
for fd in feature_demogra:
    for sb in subject_ids:
        feature = demogra_data[fd].loc[demogra_data['subject_id']==sb].values[0]
        df_meta[fd].loc[df_meta['subject_id']==sb] = feature

In [16]:
df_meta

Unnamed: 0,subject_id,visit,session,task_code,task_group,gender,age_group,age_diagnosis_group,dominant_hand,pd_most_affected_side,...,bradykinesia,disrupted_sleep,freeze_of_gait,dyskinesia,rigidity,updrs_score_p1_group,updrs_score_p2_group,updrs_score_p3_group,updrs_score_p4_group,updrs_second_visit_score_p3_group
0,3_BOS,1,1,stndg,0.0,Female,5.0,3.0,Right,Right,...,Yes,No,Yes,Yes,No,0.0,1.0,2.0,3.0,4.0
1,3_BOS,1,1,wlkgs,1.0,Female,5.0,3.0,Right,Right,...,Yes,No,Yes,Yes,No,0.0,1.0,2.0,3.0,4.0
2,3_BOS,1,1,wlkgc,1.0,Female,5.0,3.0,Right,Right,...,Yes,No,Yes,Yes,No,0.0,1.0,2.0,3.0,4.0
3,3_BOS,1,1,strsu,1.0,Female,5.0,3.0,Right,Right,...,Yes,No,Yes,Yes,No,0.0,1.0,2.0,3.0,4.0
4,3_BOS,1,1,strsd,1.0,Female,5.0,3.0,Right,Right,...,Yes,No,Yes,Yes,No,0.0,1.0,2.0,3.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6904,12_NYC,2,8,ntblt,2.0,Male,4.0,4.0,Right,Left,...,Yes,No,No,Yes,Yes,0.0,1.0,2.0,3.0,4.0
6905,12_NYC,2,8,drnkg,2.0,Male,4.0,4.0,Right,Left,...,Yes,No,No,Yes,Yes,0.0,1.0,2.0,3.0,4.0
6906,12_NYC,2,8,orgpa,2.0,Male,4.0,4.0,Right,Left,...,Yes,No,No,Yes,Yes,0.0,1.0,2.0,3.0,4.0
6907,12_NYC,2,8,fldng,2.0,Male,4.0,4.0,Right,Left,...,Yes,No,No,Yes,Yes,0.0,1.0,2.0,3.0,4.0


In [17]:
# save metadata features
save_file_path = os.path.join(save_path, 'metadata_features.pkl')
df_meta.to_pickle(save_file_path)