In [1]:
import pandas as pd
import os
import numpy as np
from numpy import arange
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import itertools
import tsfresh
from tsfresh.feature_extraction import extract_features, MinimalFCParameters, EfficientFCParameters
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

import warnings

# Hide FutureWarning output so it does not clutter the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)

# SettingWithCopyWarning is exposed in pandas.errors from pandas 1.5 on; older
# releases only provide it in pandas.core.common. Try both locations so this cell
# runs on either, and skip the filter if the class is not available at all.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None

if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [2]:
# Root data folder. It is also made the working directory, so the CSV files
# below are opened by bare file name.
direc = "E:\\WS4PD_data"
save_path = os.path.join(direc, "Feature_extraction")
os.chdir(direc)

# load data
demogra_data = pd.read_csv("Demographics_data.csv")
task_score = pd.read_csv("Task_scores_part_I.csv")

# Distinct subject ids listed in the task-score table
subject_ids = task_score["subject_id"].unique()

In [3]:
# Display the distinct subject ids taken from the task-score table
subject_ids

array(['3_BOS', '4_BOS', '5_BOS', '6_BOS', '7_BOS', '8_BOS', '9_BOS',
       '10_BOS', '11_BOS', '12_BOS', '13_BOS', '14_BOS', '15_BOS',
       '16_BOS', '17_BOS', '18_BOS', '19_BOS', '2_NYC', '3_NYC', '4_NYC',
       '5_NYC', '6_NYC', '7_NYC', '8_NYC', '9_NYC', '10_NYC', '11_NYC',
       '12_NYC'], dtype=object)

In [4]:
# GENEActiv
device = 'GENEActiv'
day = 2

# Every 30 sec of recording is one trial.
# Consecutive trials start 10 sec apart, so neighbouring trials overlap by 20 sec.

sample_rate = 50 # Hz
dt = 1/sample_rate
time_window = 30 # sec
sliding_window = 10 # sec

sb = '7_NYC'
sensor_path = os.path.join(direc, device, sb, 'rawdata_day'+str(day)+'.txt')
# NOTE(review): the file is read as a pickle despite its .txt extension.
# Unpickling can execute arbitrary code, so only load files from a trusted source.
sensor_data = pd.read_pickle(sensor_path)

# Pull the columns out once as NumPy arrays instead of re-extracting them for
# every window inside the loop.
channels = ['GENEActiv_X', 'GENEActiv_Y', 'GENEActiv_Z', 'GENEActiv_Magnitude']
timestamps = sensor_data['timestamp'].values
channel_values = {name: sensor_data[name].values for name in channels}

first_ts = timestamps[0]
last_ts = timestamps[-1]
# Window start times; the last start is chosen so a window does not run past last_ts
timestamp_start = np.arange(first_ts, (last_ts-time_window), sliding_window)

# Collect one record per window and build the frame once at the end; growing a
# DataFrame with pd.concat inside the loop copies all previous rows every time.
trials = []
for ts_start in timestamp_start:
    ts_end = ts_start+time_window
    # samples with ts_start <= timestamp < ts_end belong to this trial
    is_ts = (timestamps >= ts_start) & (timestamps < ts_end)
    trial = {'subject_id': sb, 'day': day}
    for name in channels:
        trial[name] = channel_values[name][is_ts]
    trials.append(trial)

# One row per trial; each channel cell holds that trial's samples as an array.
# Passing columns= keeps the expected columns even when no window was produced.
df = pd.DataFrame(trials, columns=['subject_id', 'day'] + channels)
df

Unnamed: 0,subject_id,day,GENEActiv_X,GENEActiv_Y,GENEActiv_Z,GENEActiv_Magnitude
0,7_NYC,2,"[-4.14429, -4.10604, -4.22274, -4.10604, -4.14...","[-8.77695, -8.77695, -8.73871, -8.73871, -8.73...","[-1.12286, -1.12286, -1.12286, -1.20033, -1.16...","[9.77092, 9.75476, 9.77023, 9.72961, 9.74106, ..."
1,7_NYC,2,"[-4.22274, -4.18352, -4.22274, -3.91089, -4.22...","[-8.77695, -8.73871, -8.73871, -8.89267, -8.81...","[-1.00812, -1.27683, -0.93065, -1.23858, -1.12...","[9.79197, 9.77226, 9.75001, 9.7933, 9.83958, 9..."
2,7_NYC,2,"[-4.26099, -4.10604, -4.14429, -4.06682, -4.22...","[-8.77695, -8.77695, -8.81618, -8.73871, -8.89...","[-1.16111, -1.04637, -1.08462, -1.04637, -1.12...","[9.82543, 9.74625, 9.80186, 9.6953, 9.90818, 9..."
3,7_NYC,2,"[-4.06682, -4.18352, -4.18352, -4.02759, -4.10...","[-8.85442, -8.81618, -8.73871, -8.85442, -8.73...","[-1.08462, -1.23858, -1.20033, -1.12286, -0.96...","[9.80389, 9.83671, 9.76256, 9.79199, 9.70378, ..."
4,7_NYC,2,"[-4.22274, -4.14429, -4.22274, -4.22274, -4.18...","[-8.70046, -8.77695, -8.81618, -8.85442, -8.73...","[-1.16111, -1.16111, -1.00812, -1.04637, -1.16...","[9.74052, 9.77539, 9.82715, 9.86546, 9.75782, ..."
...,...,...,...,...,...,...
8632,7_NYC,2,"[-0.17946, -0.25693, -0.10199, -0.25693, -0.21...","[-7.27261, -7.23339, -7.38833, -7.15689, -7.34...","[6.68421, 6.492, 6.53025, 6.68421, 6.56849, 6....","[9.87936, 9.72286, 9.86113, 9.79622, 9.85912, ..."
8633,7_NYC,2,"[-0.37363, -0.33539, -0.29616, -0.33539, -0.29...","[-7.3491, -7.27261, -7.3491, -7.31086, -7.2726...","[6.492, 6.492, 6.64597, 6.56849, 6.53025, 6.60...","[9.813, 9.75446, 9.91292, 9.83393, 9.77869, 9...."
8634,7_NYC,2,"[-0.33539, -0.33539, -0.41286, -0.41286, -0.41...","[-7.38833, -7.3491, -7.27261, -7.3491, -7.2726...","[6.53025, 6.56849, 6.53025, 6.64597, 6.64597, ...","[9.86631, 9.8624, 9.78292, 9.91709, 9.86054, 9..."
8635,7_NYC,2,"[-0.72373, -0.52956, -0.60703, -0.64626, -0.64...","[-7.23339, -7.3491, -7.27261, -7.27261, -7.117...","[6.72246, 6.53025, 6.56849, 6.60674, 6.53025, ...","[9.90137, 9.8455, 9.81858, 9.8467, 9.68106, 9...."


In [5]:
# Construct data for tsfresh
# Each trial has a different id
# Save extracted features for each subject
# Output: extracted features (row: trial, column: feature)
df_sb = df[df.subject_id == sb]

# Create the output folder before the long-running extraction, so the run cannot
# fail at the very end just because the folder is missing.
save_file_path = os.path.join(save_path, device, 'day_' + str(day), sb + '_features.pkl')
os.makedirs(os.path.dirname(save_file_path), exist_ok=True)

# Build one long-format frame per trial and concatenate once after the loop;
# pd.concat inside the loop re-copies all earlier trials on every iteration.
trial_frames = []
for trial in range(len(df_sb)):
    # channel columns are named after the device, e.g. GENEActiv_X
    x = df_sb[device + '_X'].iloc[trial]
    y = df_sb[device + '_Y'].iloc[trial]
    z = df_sb[device + '_Z'].iloc[trial]
    # time axis of the trial in seconds: sample index * dt, rounded to 2 decimals
    # (np.round replaces np.round_, which was removed in NumPy 2.0)
    t = np.round(np.arange(len(x)) * dt, 2)
    trial_frames.append(pd.DataFrame(data={'id': trial, 'time': t, 'x': x, 'y': y, 'z': z}))

if not trial_frames:
    raise ValueError('No trials found for subject ' + str(sb) + '; nothing to extract.')
df_tsfresh = pd.concat(trial_frames)

# extract comprehensive features (default)
extracted_features = extract_features(df_tsfresh, column_id="id", column_sort="time")
# save extracted features
extracted_features.to_pickle(save_file_path)

Feature Extraction: 100%|██████████| 20/20 [7:10:21<00:00, 1291.07s/it]  


In [None]:
# Inspect the extracted feature table (row: trial, column: feature)
extracted_features

In [None]:
# Repeat the windowing and feature extraction for all subjects

In [None]:
# GENEActiv
device = 'GENEActiv'
day = 2

# Every 30 sec of recording is one trial.
# Consecutive trials start 10 sec apart, so neighbouring trials overlap by 20 sec.

sample_rate = 50 # Hz
dt = 1/sample_rate  # sampling interval in sec; used later to build each trial's time axis
time_window = 30 # sec
sliding_window = 10 # sec

channels = ['GENEActiv_X', 'GENEActiv_Y', 'GENEActiv_Z', 'GENEActiv_Magnitude']

# Collect one record per window across all subjects and build the frame once at
# the end; growing a DataFrame with pd.concat inside the loop copies all
# previous rows every time.
trials = []
for sb in subject_ids:
    sensor_path = os.path.join(direc, device, sb, 'rawdata_day'+str(day)+'.txt')
    # NOTE(review): the file is read as a pickle despite its .txt extension.
    # Unpickling can execute arbitrary code, so only load files from a trusted source.
    sensor_data = pd.read_pickle(sensor_path)

    # Pull the columns out once per subject instead of once per window
    timestamps = sensor_data['timestamp'].values
    channel_values = {name: sensor_data[name].values for name in channels}

    first_ts = timestamps[0]
    last_ts = timestamps[-1]
    # Window start times; the last start is chosen so a window does not run past last_ts
    timestamp_start = np.arange(first_ts, (last_ts-time_window), sliding_window)
    for ts_start in timestamp_start:
        ts_end = ts_start+time_window
        # samples with ts_start <= timestamp < ts_end belong to this trial
        is_ts = (timestamps >= ts_start) & (timestamps < ts_end)
        trial = {'subject_id': sb, 'day': day}
        for name in channels:
            trial[name] = channel_values[name][is_ts]
        trials.append(trial)

# One row per trial; each channel cell holds that trial's samples as an array.
# Passing columns= keeps the expected columns even when no window was produced.
df = pd.DataFrame(trials, columns=['subject_id', 'day'] + channels)
df

In [None]:
# Construct data for tsfresh
# Each trial has a different id
# Save extracted features for each subject
# Output: extracted features (row: trial, column: feature)

# Create the output folder before the long-running extraction, so a run cannot
# fail at the save step just because the folder is missing.
features_dir = os.path.join(save_path, device, 'day_' + str(day))
os.makedirs(features_dir, exist_ok=True)

for sb in subject_ids:
    df_sb = df[df.subject_id == sb]

    # Build one long-format frame per trial and concatenate once per subject;
    # pd.concat inside the loop re-copies all earlier trials on every iteration.
    trial_frames = []
    for trial in range(len(df_sb)):
        # Channel columns are named after the device (e.g. GENEActiv_X). The
        # previous hard-coded Pebble_* columns do not exist in df when
        # device is 'GENEActiv'.
        x = df_sb[device + '_X'].iloc[trial]
        y = df_sb[device + '_Y'].iloc[trial]
        z = df_sb[device + '_Z'].iloc[trial]
        # time axis of the trial in seconds: sample index * dt, rounded to 2 decimals
        # (np.round replaces np.round_, which was removed in NumPy 2.0)
        t = np.round(np.arange(len(x)) * dt, 2)
        trial_frames.append(pd.DataFrame(data={'id': trial, 'time': t, 'x': x, 'y': y, 'z': z}))

    if not trial_frames:
        # Nothing to extract for this subject; keep going with the others
        print('No trials found for subject ' + str(sb) + '; skipping.')
        continue
    df_tsfresh = pd.concat(trial_frames)

    # extract comprehensive features (default)
    extracted_features = extract_features(df_tsfresh, column_id="id", column_sort="time")
    # save extracted features
    save_file_path = os.path.join(features_dir, sb + '_features.pkl')
    extracted_features.to_pickle(save_file_path)