In [36]:
## VISUALIZATION OF ALL DISTRIBUTIONS

In [27]:
%load_ext autoreload
%autoreload 2


import pandas as pd
import synapseclient as sc
import sys
sys.path.append("../../src")
from utils.munging_utils import get_file_entity, fix_column_name, save_data_to_synapse
from utils.preprocessing_utils import preprocess, addAdditionalFeatures_viz, collapseFeatures
from datetime import datetime
import numpy as np

Welcome, aryton tediarjo!

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
# Synapse IDs consumed by the dataset-building cells below.
# *_GAIT_DATA* IDs are loaded with get_file_entity; *_DEMO_DATA* / *_PROF_DATA
# IDs are queried as tables through syn.tableQuery.
MPOWER_GAIT_DATA_V1 = "syn21111818"
MPOWER_DEMO_DATA_V1 = "syn10371840"
MPOWER_GAIT_DATA_V2 = "syn21113231"
MPOWER_DEMO_DATA_V2 = "syn15673379"
MPOWER_GAIT_DATA_PASSIVE = "syn21114136"
EMS_PROF_DATA = "syn10235463"
# NOTE(review): EMS_DEMO_DATA and EMS_PASSIVE_DATA are not referenced by any
# cell visible in this notebook -- confirm whether they are still needed.
EMS_DEMO_DATA = "syn10295288"
EMS_GAIT_DATA = "syn21256442"
EMS_PASSIVE_DATA = "syn10651116"

In [4]:
# Authenticate against Synapse; `syn` is the client used by the tableQuery and
# getChildren calls in the cells below.
syn = sc.login()

Welcome, aryton tediarjo!



In [None]:
# Metadata columns that are kept next to the feature columns (feature columns
# are the ones whose name contains a "."). The actual column filtering is done
# inside each create_* function and again in the annotation cell; filtering
# here would fail on a fresh kernel because `data` is not defined yet.
metadata_cols = ['recordId', 'healthCode', 'appVersion', 
                 'phoneInfo', 'createdOn', 'PD', 'MS',
                 'gender', 'age', 'version']

In [79]:
## gait mPower V1 ##

def create_mPowerV1_data(MPOWER_DATA_V1, MPOWER_DEMO_V1, output_filename):
    """Build the mPower V1 gait dataset and upload it to Synapse.

    The gait features are inner-joined with the demographics table on
    ``healthCode``. Rows are kept only when ``inferred_diagnosis`` is present,
    ``gender`` is "Female" or "Male", and ``age`` lies in [0, 100]. A ``PD``
    label (1.0 / 0.0) is derived from ``inferred_diagnosis`` and ``gender`` is
    lower-cased. Only feature columns (name contains ".") and the known
    metadata columns are retained.

    Parameters
    ----------
    MPOWER_DATA_V1 :
        Synapse ID passed to ``get_file_entity`` to load the gait features.
    MPOWER_DEMO_V1 :
        Synapse table ID queried for demographics (test users excluded).
    output_filename :
        File name handed to ``save_data_to_synapse`` for the upload.

    Returns
    -------
    The cleaned, column-filtered DataFrame that was uploaded.
    """
    demo_data = syn.tableQuery("SELECT * FROM {} where dataGroups\
                               NOT LIKE '%test_user%'".format(MPOWER_DEMO_V1)).asDataFrame()
    gait_data = get_file_entity(MPOWER_DATA_V1)
    demo_data = demo_data[["healthCode", "gender", "age",
                           "professional_diagnosis", "inferred_diagnosis"]].reset_index(drop = True)
    data = pd.merge(gait_data, demo_data, on = "healthCode", how = "inner")

    # Rows without an inferred diagnosis cannot be labelled.
    data = data.dropna(subset = ["inferred_diagnosis"])
    data["PD"] = data["inferred_diagnosis"].map({True :1.0, False:0.0})

    # .copy() after each row filter so the column assignments below operate on
    # an independent frame instead of a slice of the previous one.
    data = data[data["gender"].isin(["Female", "Male"])].copy()
    data["age"] = data["age"].apply(float)
    data = data[(data["age"] <= 100) & (data["age"] >= 0)].copy()
    data["gender"] = data["gender"].str.lower()

    data = fix_column_name(data)
    data = data.reset_index(drop = True)
    metadata_cols = ['recordId', 'healthCode', 'appVersion', 
                 'phoneInfo', 'createdOn', 'PD', 'MS',
                 'gender', 'age', 'version']
    data = data[[feat for feat in data.columns if ("." in feat) or (feat in metadata_cols)]]

    # Honour the caller-supplied file name (it was previously ignored in favour
    # of a hard-coded "mpower_v1_active_full.csv").
    save_data_to_synapse(data = data, 
                         output_filename = output_filename,
                         data_parent_id = "syn21267355")
    return data 



In [80]:
def create_mPowerV2_data(MPOWER_DATA_V2, MPOWER_DEMO_V2, output_filename):
    """Build an mPower V2 gait dataset and upload it to Synapse.

    Gait features loaded through ``get_file_entity`` are inner-joined with the
    demographics query result on ``healthCode``. Rows whose diagnosis is
    "no_answer" are removed, ``PD`` is derived from ``diagnosis``
    (parkinsons -> 1, control -> 0), ``age`` is the current year minus
    ``birthYear`` and ``sex`` is renamed to ``gender``. Only feature columns
    (name contains ".") and the known metadata columns are kept before the
    frame is uploaded under ``output_filename`` and returned.
    """
    demographics = syn.tableQuery("SELECT birthYear, healthCode, diagnosis, sex FROM {} \
                                where dataGroups NOT LIKE '%test_user%'".format(MPOWER_DEMO_V2)).asDataFrame()
    gait_features = get_file_entity(MPOWER_DATA_V2)

    merged = pd.merge(gait_features, demographics, how = "inner", on = "healthCode")
    has_answer = merged["diagnosis"] != "no_answer"
    merged = merged[has_answer]

    def _years_since(birth_year):
        # Age is approximated from the birth year only.
        return datetime.now().year - birth_year

    merged["PD"] = merged["diagnosis"].map({"parkinsons":1, "control":0})
    merged["age"] = merged["birthYear"].apply(_years_since)
    merged = merged.rename(columns = {"sex":"gender"})
    merged = fix_column_name(merged)
    merged = merged.reset_index(drop = True)

    kept_metadata = ['recordId', 'healthCode', 'appVersion',
                     'phoneInfo', 'createdOn', 'PD', 'MS',
                     'gender', 'age', 'version']
    selected = []
    for column in merged.columns:
        if ("." in column) or (column in kept_metadata):
            selected.append(column)
    merged = merged[selected]

    save_data_to_synapse(data = merged,
                         output_filename = output_filename,
                         data_parent_id = "syn21267355")
    return merged

In [81]:
def create_elevateMS_data(EMS_DATA, EMS_PROF_DATA, output_filename):
    """Build an elevateMS gait dataset and upload it to Synapse.

    Gait features loaded through ``get_file_entity`` are inner-joined with the
    profile query result on ``healthCode``. Rows without
    ``demographics.gender`` are dropped, ``MS`` is derived from ``dataGroups``
    (ms_patient -> 1, control -> 0) and the demographics columns are renamed
    to ``gender`` / ``age``. Only feature columns (name contains ".") and the
    known metadata columns are kept before the frame is uploaded under
    ``output_filename`` and returned.
    """
    profiles = syn.tableQuery("SELECT healthCode, dataGroups, 'demographics.gender', 'demographics.age' FROM {}\
                           where dataGroups NOT LIKE '%test_user%'".format(EMS_PROF_DATA)).asDataFrame()
    gait_features = get_file_entity(EMS_DATA)

    merged = pd.merge(gait_features, profiles, how = "inner", on = "healthCode")
    merged = merged.dropna(subset = ["demographics.gender"])
    merged["MS"] = merged["dataGroups"].map({"ms_patient":1, "control":0})

    # Rename before the column filter below so the demographics columns are
    # matched as metadata names.
    merged = merged.rename(columns = {"demographics.gender": "gender",
                                      "demographics.age": "age"})
    merged = fix_column_name(merged)
    merged = merged.reset_index(drop = True)

    kept_metadata = ['recordId', 'healthCode', 'appVersion',
                     'phoneInfo', 'createdOn', 'PD', 'MS',
                     'gender', 'age', 'version']
    selected = []
    for column in merged.columns:
        if ("." in column) or (column in kept_metadata):
            selected.append(column)
    merged = merged[selected]

    save_data_to_synapse(data = merged,
                         output_filename = output_filename,
                         data_parent_id = "syn21267355")
    return merged

In [84]:
# Build, upload and tag each cohort with a `version` label.
dataV1                    = create_mPowerV1_data(MPOWER_GAIT_DATA_V1, MPOWER_DEMO_DATA_V1, "pdkit_mpowerv1_active_full.csv")
# The V1 frame carries duplicated column names; keep the first occurrence of each.
dataV1                    = dataV1.loc[:,~dataV1.columns.duplicated()]
dataV1["version"]         = "V1"
dataV2                    = create_mPowerV2_data(MPOWER_GAIT_DATA_V2, MPOWER_DEMO_DATA_V2, "pdkit_mpowerv2_active_full.csv")
dataV2["version"]         = "V2"
dataPassive               = create_mPowerV2_data(MPOWER_GAIT_DATA_PASSIVE, MPOWER_DEMO_DATA_V2, "pdkit_mpowerv2_passive_full.csv")
dataPassive["version"]    = "PDKIT_passive"
dataEMS_active            = create_elevateMS_data(EMS_GAIT_DATA, EMS_PROF_DATA, "pdkit_ems_active_full.csv")
dataEMS_active["version"] = "EMS_active"
# NOTE(review): this cell used to repeat the elevateMS *active* build (same
# inputs, same upload name) and then relabel the result "EMS_passive", so the
# active cohort was uploaded twice and mislabelled. The duplicate was removed.
# If a passive elevateMS cohort is wanted, build it from a featurized passive
# gait file into its own variable -- TODO confirm which Synapse ID to use.


##################################################
 Uploading file to Synapse storage 
##################################################


##################################################
 Uploading file to Synapse storage 
##################################################


##################################################
 Uploading file to Synapse storage 
##################################################


##################################################
 Uploading file to Synapse storage 
##################################################



In [85]:
### some data annotations ### 
# Stack all cohorts. sort=True makes the column ordering explicit; it matches
# what the implicit (deprecated) default did when this notebook was run and
# silences the pandas FutureWarning.
data = pd.concat([dataV1, dataV2, dataPassive, dataEMS_active], sort = True).reset_index(drop = True)
metadata_cols = ['recordId', 'healthCode', 'appVersion', 
                 'phoneInfo', 'createdOn', 'PD', 'MS',
                 'gender', 'age', 'version']
data = data[[feat for feat in data.columns if ("." in feat) or (feat in metadata_cols)]]

# Drop every row holding an "#ERROR" marker in any column; .copy() so the
# column assignments below do not write into a slice of the previous frame.
data = data[(data != "#ERROR").all(axis = 1)].copy()

# NOTE(review): as written this yields 0 when PD == 0 or MS == 0 (i.e. for
# controls) and 1 otherwise, which looks inverted relative to the column name.
# Behaviour is left unchanged here -- confirm what downstream code expects.
data["is_control"] = np.where((data["PD"] == 0) | (data["MS"] == 0), 0, 1)

# Feature columns are the ones with a "." in their name; coerce them to numeric.
feature_cols = [feat for feat in data.columns if "." in feat]
data[feature_cols] = data[feature_cols].apply(pd.to_numeric)

# Keep a single duration column (taken from the x axis) and drop the others.
data = data.drop(["y.duration", "z.duration", "AA.duration"], axis = 1)
data = data.rename({"x.duration": "duration"}, axis = 1)
data = addAdditionalFeatures_viz().transform(data)

save_data_to_synapse(data = data.reset_index(drop = True), output_filename = "combined_gait_data.csv",
                                data_parent_id = "syn21267355")

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  




##################################################
 Uploading file to Synapse storage 
##################################################



In [54]:
# List the dataV1 columns whose name contains a "." (the feature columns).
list(filter(lambda column: "." in column, dataV1.columns))

['AA.duration',
 'AA.duration',
 'AA.freeze_occurences',
 'AA.freeze_occurences',
 'AA.frequency_of_peaks',
 'AA.frequency_of_peaks',
 'AA.gait_step_regularity',
 'AA.gait_step_regularity',
 'AA.gait_stride_regularity',
 'AA.gait_stride_regularity',
 'AA.gait_symmetry',
 'AA.gait_symmetry',
 'AA.max_freeze_index',
 'AA.max_freeze_index',
 'AA.no_of_steps',
 'AA.no_of_steps',
 'AA.speed_of_gait',
 'AA.speed_of_gait',
 'x.duration',
 'x.duration',
 'x.freeze_occurences',
 'x.freeze_occurences',
 'x.frequency_of_peaks',
 'x.frequency_of_peaks',
 'x.gait_step_regularity',
 'x.gait_step_regularity',
 'x.gait_stride_regularity',
 'x.gait_stride_regularity',
 'x.gait_symmetry',
 'x.gait_symmetry',
 'x.max_freeze_index',
 'x.max_freeze_index',
 'x.no_of_steps',
 'x.no_of_steps',
 'x.speed_of_gait',
 'x.speed_of_gait',
 'y.duration',
 'y.duration',
 'y.freeze_occurences',
 'y.freeze_occurences',
 'y.frequency_of_peaks',
 'y.frequency_of_peaks',
 'y.gait_step_regularity',
 'y.gait_step_regularit

In [55]:
# Inspect the full column index of dataV1 (metadata and feature columns).
dataV1.columns

Index(['appVersion', 'createdOn', 'AA.duration', 'AA.duration',
       'AA.freeze_occurences', 'AA.freeze_occurences', 'AA.frequency_of_peaks',
       'AA.frequency_of_peaks', 'AA.gait_step_regularity',
       'AA.gait_step_regularity',
       ...
       'z.no_of_steps', 'z.speed_of_gait', 'z.speed_of_gait', 'healthCode',
       'phoneInfo', 'recordId', 'gender', 'age', 'PD', 'version'],
      dtype='object', length=153)

In [29]:
# Find the previously uploaded combined dataset in the output folder and
# collect the healthCodes it already contains.
# get_file_entity is imported directly from utils.munging_utils; the bare name
# `utils` is never bound in this notebook, so `utils.get_file_entity` fails.
for child in syn.getChildren(parent = "syn21267355"):
    if child["name"] == 'combined_gait_data.csv':
        data = get_file_entity(child["id"])
        featurized_healthcode_list = data["healthCode"].unique()

In [33]:
# Load the elevateMS gait features and keep only rows whose healthCode is not
# already in the combined dataset. The original line referenced `new_data`
# before it was assigned and went through the unbound name `utils`.
ems_gait_data = get_file_entity(EMS_GAIT_DATA)
new_data = ems_gait_data[~ems_gait_data["healthCode"].isin(featurized_healthcode_list)]

In [35]:
# Append the not-yet-featurized rows to the combined dataset. sort=True keeps
# the column ordering the implicit (deprecated) default produced and silences
# the pandas FutureWarning.
comb_data = pd.concat([data, new_data], sort = True).reset_index(drop = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.

