In [None]:
## to-do-list

# Pebble 
# 1. Feature extraction

# Phone 
# 2. Feature extraction


In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import itertools
import tsfresh
from tsfresh.feature_extraction import extract_features, MinimalFCParameters, EfficientFCParameters
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [2]:
# Root folder of the dataset. Defaults to the original location; set the
# WS4PD_DATA_DIR environment variable to run the notebook on another machine.
direc = os.environ.get("WS4PD_DATA_DIR", "E:\\WS4PD_data")
os.chdir(direc)  # the CSV files below are read relative to this folder

# Folder that receives every pickle written by this notebook.
save_path = os.path.join(direc, "Feature_extraction")
os.makedirs(save_path, exist_ok=True)  # to_pickle() does not create missing folders

# load data
demogra_data = pd.read_csv("Demographics_data.csv")
task_score = pd.read_csv("Task_scores_part_I.csv")

In [4]:
# List the demographics columns; the cells below read subject_id,
# geneactive_hand and pebble_hand from this table.
demogra_data.columns

Index(['ROW_ID', 'ROW_VERSION', 'subject_id', 'cohort', 'gender', 'birth_year',
       'dominant_hand', 'upper_limb_length', 'upper_arm_length',
       'lower_arm_length', 'lower_limb_length', 'thigh_length', 'shank_length',
       'height', 'weight', 'visit_date', 'diagnosis_day', 'diagnosis_month',
       'diagnosis_year', 'pd_most_affected_side', 'gait_impediments',
       'posture_instability', 'tremor', 'bradykinesia', 'disrupted_sleep',
       'freeze_of_gait', 'dyskinesia', 'rigidity', 'other_symptoms',
       'last_levodopa_dose_timestamp', 'regular_medication', 'geneactive_num',
       'pebble_num', 'geneactive_hand', 'pebble_hand', 'smartphone_location',
       'recording_start', 'recording_end', 'timezone', 'updrs_time',
       'updrs_score_p1', 'updrs_score_p2', 'updrs_score_p3', 'updrs_score_p4',
       'h_and_y_score', 'updrs_second_visit_time',
       'updrs_second_visit_score_p3'],
      dtype='object')

In [3]:
# Sanity check of the task-score table: (rows, columns), then the first rows.
print(task_score.shape)
task_score.head()

(62199, 13)


Unnamed: 0,ROW_ID,ROW_VERSION,subject_id,visit,session,task_id,task_code,repetition,timestamp_start,timestamp_end,phenotype,body_segment,score
0,1,1,3_BOS,1,1,1,stndg,1,1423568629,1423568661,tremor,RightUpperLimb,0
1,2,1,3_BOS,1,1,1,stndg,1,1423568629,1423568661,tremor,LeftUpperLimb,0
2,3,1,3_BOS,1,1,1,stndg,1,1423568629,1423568661,tremor,LowerLimbs,0
3,4,1,3_BOS,1,1,1,stndg,1,1423568629,1423568661,dyskinesia,RightUpperLimb,Yes
4,5,1,3_BOS,1,1,1,stndg,1,1423568629,1423568661,dyskinesia,LeftUpperLimb,Yes


In [None]:
# devices
# devices = ['GENEActiv', 'Pebble', 'Phone']

In [5]:
# Notebook-wide constants used by the extraction loops below.
visits = [1, 2]  # values matched against task_score.visit
days = [1, 4]  # paired with `visits` via zip(); selects the file rawdata_day<day>.txt
sample_rate = 50  # presumably the sensor sampling rate in Hz — TODO confirm
dt = 1/sample_rate  # spacing used to build the time axis of each trial
subject_ids = task_score.subject_id.unique()  # every subject present in the score table

In [None]:
# Device whose raw-data folder and output file names are used by the cells below.
device = 'Pebble'
# 4_BOS didn't have Pebble data, so keep every subject except that one.
subject_ids = subject_ids[subject_ids != '4_BOS']

# Dataset construction

## score

In [6]:
# devices
# Build one row per scored trial that holds, for every phenotype, the score of
# the upper limb wearing the GENEActiv and of the upper limb wearing the Pebble.

# Get column names
devices = ['GENEActiv', 'Pebble']
phenotypes = task_score['phenotype'].unique()
column_names = ['subject_id', 'visit', 'session', 'task_code', 'repetition']
for i, j in itertools.product(phenotypes, devices):
    column_names.append(i + '_' + j + 'Hand')

# Get task scores
score_rows = []  # rows are collected here and turned into a frame once at the end
for sb in subject_ids:
    is_sb = demogra_data.subject_id == sb
    geneactive_hand = demogra_data.geneactive_hand.loc[is_sb].values[0]
    geneactive_body_segment = geneactive_hand + 'UpperLimb'
    pebble_hand = demogra_data.pebble_hand.loc[is_sb].values[0]
    pebble_body_segment = pebble_hand + 'UpperLimb'
    for visit in visits:
        is_sb_visit = (task_score.subject_id == sb) & (task_score.visit == visit)
        timestamp_start = task_score.timestamp_start[is_sb_visit].unique()
        for ts_start in timestamp_start:
            # All score rows sharing this start timestamp belong to one trial.
            is_trials = (task_score.timestamp_start == ts_start) & is_sb_visit
            # Positional row numbers, found with a vectorised scan of the mask.
            trials_idx = np.flatnonzero(is_trials.values).tolist()
            trial_idx = trials_idx[0]

            # Get trial info
            session = task_score['session'].values[trial_idx]
            task_code = task_score['task_code'].values[trial_idx]
            repetition = task_score['repetition'].values[trial_idx]
            data_trial = [sb, visit, session, task_code, repetition]

            if len(trials_idx) == 9:
                trial_rows = task_score.loc[is_trials]
                # Same nesting order as column_names: phenotype outer, device inner.
                for phenotype in phenotypes:
                    is_phenotype = trial_rows.phenotype == phenotype
                    for body_segment in (geneactive_body_segment, pebble_body_segment):
                        is_cell = is_phenotype & (trial_rows.body_segment == body_segment)
                        data_trial.append(trial_rows['score'].loc[is_cell].values[0])
                score_rows.append(data_trial)
            else:
                # Any trial without exactly 9 score rows is reported and left out.
                print('More than one task in this trial:')
                print(sb, visit, session, task_code, repetition, trials_idx)

df_score = pd.DataFrame(score_rows, columns=column_names)

More than one task in this trial:
2_NYC 2 4 orgpa 1 [39069, 39070, 39071, 39072, 39073, 39074, 39075, 39076, 39077, 39078, 39079, 39080, 39081, 39082, 39083, 39084, 39085, 39086]


In [7]:
# (number of scored trials, number of columns) of the per-device score table.
df_score.shape

(6909, 11)

In [8]:
# save score data
# Written as score_by_device.pkl inside save_path.
save_file_path = os.path.join(save_path,'score_by_device.pkl')
df_score.to_pickle(save_file_path)

In [None]:
# body_segments
# Build one row per scored trial with one score column for every
# phenotype x body-segment combination.

# Get column names
phenotypes = task_score['phenotype'].unique()
body_segments = task_score['body_segment'].unique()
column_names = ['subject_id', 'visit', 'session', 'task_code', 'repetition']
for i, j in itertools.product(phenotypes, body_segments):
    column_names.append(i + '_' + j)

# Get task scores
score_rows = []  # rows are collected here and turned into a frame once at the end
for sb in subject_ids:
    for visit in visits:
        is_sb_visit = (task_score.subject_id == sb) & (task_score.visit == visit)
        timestamp_start = task_score.timestamp_start[is_sb_visit].unique()
        for ts_start in timestamp_start:
            # All score rows sharing this start timestamp belong to one trial.
            is_trials = (task_score.timestamp_start == ts_start) & is_sb_visit
            # Positional row numbers, found with a vectorised scan of the mask.
            trials_idx = np.flatnonzero(is_trials.values).tolist()
            trial_idx = trials_idx[0]

            # Get trial info
            session = task_score['session'].values[trial_idx]
            task_code = task_score['task_code'].values[trial_idx]
            repetition = task_score['repetition'].values[trial_idx]
            data_trial = [sb, visit, session, task_code, repetition]

            if len(trials_idx) == 9:
                # Get task scores
                # Look each score up by its labels so it lands in the matching
                # column even if the rows of a trial are not in column order.
                trial_rows = task_score.loc[is_trials]
                for phenotype, body_segment in itertools.product(phenotypes, body_segments):
                    is_cell = (trial_rows.phenotype == phenotype) & (trial_rows.body_segment == body_segment)
                    data_trial.append(trial_rows['score'].loc[is_cell].values[0])
                score_rows.append(data_trial)
            else:
                # Any trial without exactly 9 score rows is reported and left out.
                print('More than one task in this trial:')
                print(sb, visit, session, task_code, repetition, trials_idx)

# Built in one go, so the saved table has a clean 0..n-1 index (the previous
# reset_index() call discarded its result).
df_score = pd.DataFrame(score_rows, columns=column_names)

In [None]:
# save score data
# Written as score_by_body_segment.pkl inside save_path.
save_file_path = os.path.join(save_path,'score_by_body_segment.pkl')
df_score.to_pickle(save_file_path)

In [None]:
# Display the score table that was just saved.
df_score

In [None]:
# sensor data

In [None]:
# Manual walk-through of the trial slicing for a single subject / visit.
sb = subject_ids[1]
visit = visits[0]
day = days[0]
sensor_path = os.path.join(direc, device, sb, 'rawdata_day'+str(day)+'.txt')
# NOTE(review): the file has a .txt suffix but is loaded as a pickle; unpickling
# can execute arbitrary code, so only load files from a trusted source.
sensor_data = pd.read_pickle(sensor_path)
is_sb_visit = (task_score.subject_id == sb) & (task_score.visit == visit)
timestamp_start = task_score.timestamp_start[is_sb_visit].unique()
ts_start = timestamp_start[0]  # first distinct trial start of this subject/visit

In [None]:
# Score rows that share the chosen start timestamp, and their positional row numbers.
is_trials = (task_score.timestamp_start == ts_start) & is_sb_visit
trials_idx = [i for i, val in enumerate(is_trials) if val]
trial_idx = trials_idx[0]
ts_end = task_score['timestamp_end'].values[trial_idx]
# Mask of sensor samples whose timestamp lies in [ts_start, ts_end].
is_ts = (sensor_data.timestamp.values >= ts_start) & (sensor_data.timestamp.values <= ts_end)

# Get trial info
session = task_score['session'].values[trial_idx]
task_code = task_score['task_code'].values[trial_idx]
repetition = task_score['repetition'].values[trial_idx]

In [None]:
# Row positions in task_score that belong to the inspected trial.
trials_idx

In [None]:
# Display the raw recording loaded for the inspected subject/day.
sensor_data

In [None]:
# Number of sensor samples that fall inside the trial window.
sum(is_ts)

In [None]:
# Positions of the sensor samples that fall inside the trial window.
ts_idx = [i for i, val in enumerate(is_ts) if val]
ts_idx

In [None]:
# Spot check: is the Pebble_X sample at row label 244608 NaN?
np.isnan(sensor_data['Pebble_X'].loc[244608])


In [None]:
# Pebble
# Cut each raw Pebble recording into one row per scored trial. Every axis
# column holds the array of samples whose timestamp lies between the trial's
# start and end timestamps.
trial_records = []  # rows are collected here and turned into a frame once at the end
for sb in subject_ids:
    for visit, day in zip(visits, days):
        sensor_path = os.path.join(direc, device, sb, 'rawdata_day'+str(day)+'.txt')
        # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
        sensor_data = pd.read_pickle(sensor_path)
        sensor_ts = sensor_data.timestamp.values
        is_sb_visit = (task_score.subject_id == sb) & (task_score.visit == visit)
        timestamp_start = task_score.timestamp_start[is_sb_visit].unique()
        for ts_start in timestamp_start:
            is_trials = (task_score.timestamp_start == ts_start) & is_sb_visit
            trials_idx = np.flatnonzero(is_trials.values).tolist()
            trial_idx = trials_idx[0]
            ts_end = task_score['timestamp_end'].values[trial_idx]

            # Get trial info
            session = task_score['session'].values[trial_idx]
            task_code = task_score['task_code'].values[trial_idx]
            repetition = task_score['repetition'].values[trial_idx]

            if len(trials_idx) != 9:
                print('More than one task in this trial:')
                print(sb, visit, session, task_code, repetition, trials_idx)
                # Skip the trial: storing a row here would reuse the samples of
                # the previous trial under this trial's labels.
                continue

            is_ts = (sensor_ts >= ts_start) & (sensor_ts <= ts_end)
            trial_records.append({
                'subject_id': sb, 'visit': visit, 'session': session,
                'task_code': task_code, 'repetition': repetition,
                'Pebble_X': sensor_data['Pebble_X'].loc[is_ts].values,
                'Pebble_Y': sensor_data['Pebble_Y'].loc[is_ts].values,
                'Pebble_Z': sensor_data['Pebble_Z'].loc[is_ts].values,
                'Pebble_Magnitude': sensor_data['Pebble_Magnitude'].loc[is_ts].values,
            })

# Built in one go, so the frame has a clean 0..n-1 index (the previous
# reset_index() call discarded its result).
df = pd.DataFrame(trial_records)

In [None]:
# save Pebble data
# The file name is derived from the global `device`, giving <device>_trial.pkl in save_path.
file_name = device + '_trial.pkl'
save_file_path = os.path.join(save_path,file_name)
df.to_pickle(save_file_path)

In [None]:
# GENEActiv
# no control on overlapping tasks -> remove one trial from 2_NYC later
# Cut each raw GENEActiv recording into one row per scored trial. Every axis
# column holds the array of samples whose timestamp lies between the trial's
# start and end timestamps.

# The column names below are GENEActiv-specific, so pin the global `device`
# here: both the raw-data folder and the output file name are derived from it.
device = 'GENEActiv'
# NOTE(review): subject_ids is whatever earlier cells left behind (the Pebble
# setup cell drops 4_BOS) — confirm that this is intended for GENEActiv.
trial_records = []  # rows are collected here and turned into a frame once at the end
for sb in subject_ids:
    for visit, day in zip(visits, days):
        sensor_path = os.path.join(direc, device, sb, 'rawdata_day'+str(day)+'.txt')
        sensor_data = pd.read_table(sensor_path)
        sensor_ts = sensor_data.timestamp.values
        is_sb_visit = (task_score.subject_id == sb) & (task_score.visit == visit)
        timestamp_start = task_score.timestamp_start[is_sb_visit].unique()
        for ts_start in timestamp_start:
            is_trials = (task_score.timestamp_start == ts_start) & is_sb_visit
            trials_idx = np.flatnonzero(is_trials.values).tolist()
            trial_idx = trials_idx[0]
            ts_end = task_score['timestamp_end'].values[trial_idx]
            is_ts = (sensor_ts >= ts_start) & (sensor_ts <= ts_end)
            trial_records.append({
                'subject_id': sb, 'visit': visit,
                'session': task_score['session'].values[trial_idx],
                'task_code': task_score['task_code'].values[trial_idx],
                'repetition': task_score['repetition'].values[trial_idx],
                'GENEActiv_X': sensor_data['GENEActiv_X'].loc[is_ts].values,
                'GENEActiv_Y': sensor_data['GENEActiv_Y'].loc[is_ts].values,
                'GENEActiv_Z': sensor_data['GENEActiv_Z'].loc[is_ts].values,
                'GENEActiv_Magnitude': sensor_data['GENEActiv_Magnitude'].loc[is_ts].values,
            })

# Built in one go, so the frame has a clean 0..n-1 index (the previous
# reset_index() call discarded its result).
df = pd.DataFrame(trial_records)

In [None]:
# remove
# Drop the one trial whose start timestamp is shared by more than one task
# (reported by the score-construction cell: 2_NYC, visit 2, session 4, orgpa, repetition 1).
sb = '2_NYC'
visit = 2
session = 4
task_code = 'orgpa'
repetition = 1

# Match on repetition as well, so other repetitions of the same task are kept.
is_trial = (
    (df.subject_id == sb)
    & (df.visit == visit)
    & (df.session == session)
    & (df.task_code == task_code)
    & (df.repetition == repetition)
)
# Re-number the rows so the saved frame has a contiguous index.
df = df[~is_trial].reset_index(drop=True)


In [None]:
# save GENEActiv data
# NOTE(review): the file name comes from the global `device`; it must be
# 'GENEActiv' when this cell runs, otherwise another device's file is overwritten.
file_name = device + '_trial.pkl'
save_file_path = os.path.join(save_path,file_name)
df.to_pickle(save_file_path)

In [None]:
# Number of trials and columns in the trial table.
print(df.shape)
# check missing trials
# NOTE(review): the axis columns hold arrays, so isnull() presumably flags only
# cells that are themselves missing, not empty arrays or NaN samples — confirm.
df.isnull().sum().sum()

In [None]:
# Construct data for tsfresh
# Each trial has a different id
# Save extracted features for each subject
# Output: extracted features (row: trial, column: feature)

# Axis columns follow the '<device>_X/Y/Z' naming of the trial table, so the
# same cell works for whichever device `df` was built from.
x_col, y_col, z_col = device + '_X', device + '_Y', device + '_Z'
feature_dir = os.path.join(save_path, device)
os.makedirs(feature_dir, exist_ok=True)  # to_pickle() does not create missing folders

# NOTE(review): the loop starts at the third subject (subject_ids[2:]) —
# confirm that skipping the first two is still intended.
for sb in subject_ids[2:]:
    df_sb = df[df.subject_id == sb]
    trial_frames = []  # one long-format frame per trial, concatenated once below
    for trial in range(len(df_sb)):
        x = df_sb[x_col].iloc[trial]
        y = df_sb[y_col].iloc[trial]
        z = df_sb[z_col].iloc[trial]
        # Time axis in seconds: sample number times dt, rounded to 2 decimals.
        t = np.round(np.arange(len(x)) * dt, 2)
        trial_frames.append(pd.DataFrame(data={'id': trial, 'time': t, 'x': x, 'y': y, 'z': z}))
    if not trial_frames:
        # Nothing to extract for a subject without trials.
        continue
    df_tsfresh = pd.concat(trial_frames)
    # extract comprehensive features (default)
    extracted_features = extract_features(df_tsfresh, column_id="id", column_sort="time")
    # save extracted features
    save_file_path = os.path.join(feature_dir, sb + '_features.pkl')
    extracted_features.to_pickle(save_file_path)

In [None]:
# Pick the second subject for the single-subject check below.
sb = subject_ids[1]
sb

In [None]:
# Build the long-format tsfresh input (id, time, x, y, z) for the subject `sb`.
# Axis columns follow the '<device>_X/Y/Z' naming of the trial table.
df_sb = df[df.subject_id == sb]
trial_frames = []  # one frame per trial, concatenated once below
for trial in range(len(df_sb)):
    x = df_sb[device + '_X'].iloc[trial]
    y = df_sb[device + '_Y'].iloc[trial]
    z = df_sb[device + '_Z'].iloc[trial]
    # Time axis in seconds: sample number times dt, rounded to 2 decimals.
    t = np.round(np.arange(len(x)) * dt, 2)
    trial_frames.append(pd.DataFrame(data={'id': trial, 'time': t, 'x': x, 'y': y, 'z': z}))
# pd.concat() rejects an empty list, so fall back to an empty frame.
df_tsfresh = pd.concat(trial_frames) if trial_frames else pd.DataFrame()

In [None]:
# Inspect the Pebble_X samples stored for the first trial of the selected subject.
trial = 0
x = df_sb.Pebble_X.iloc[trial]
x

In [None]:
# Display the long-format frame that is passed to tsfresh.
df_tsfresh

In [None]:
# check the shape of extracted features
# (displays the frame left behind by the last extract_features() call)
extracted_features

In [None]:
# Display the trial rows of the currently selected subject.
df_sb