In [3]:
import sys
import warnings
sys.path.append("../src/pipeline")
from utils import query_utils as query
from utils import gait_features_utils as gproc
import synapseclient as sc
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
%matplotlib inline
%load_ext autoreload
%autoreload 2
# Use seaborn's "dark" style for every figure drawn in this notebook.
sns.set_style("dark")
# NOTE(review): this silences *all* Python warnings for the whole kernel
# session, including any pandas warnings raised by the cells below --
# consider narrowing the filter to specific categories.
warnings.simplefilter("ignore")

In [9]:
"""

Author: Sage Bionetworks

Script to group features by healthCode,
using several different aggregations

"""
# # import future libraries
# from __future__ import print_function
# from __future__ import unicode_literals

# # import standard libraries
# import time
# import pandas as pd
# import numpy as np


# # import external libraries
# import synapseclient as sc

# # import project modules
# from utils import query_utils as query


# global variables
# Synapse identifiers and output settings for the feature-grouping cell.
data_dict = {
    # feature files, keyed by study name
    "FEATURE_DATA_SYNIDS": {
        "MPOWER_V2": "syn21597625",
        "MPOWER_PASSIVE": "syn21597842",
    },
    # demographics file that is loaded next to the features
    "DEMOGRAPHIC_DATA_SYNID": "syn21602828",
    # output / provenance settings; in this cell they are only referenced
    # from the commented-out upload code
    "OUTPUT_INFO": {
        "PARENT_FOLDER": "syn21592268",
        "PROJ_REPO_NAME": "mpower-gait-analysis",
        "PATH_GITHUB_TOKEN": "~/git_token.txt",
    },
}


# Log in to Synapse; `syn` is the client handle handed to the
# `query.get_file_entity` calls further down in this cell.
# NOTE(review): called without arguments, so it presumably relies on
# credentials already configured for synapseclient -- confirm.
syn = sc.login()


def annot_phone(params):
    """
    Collapse a raw phone-info string into a coarser phone label

    Only the text before the first ";" is inspected. The rules below are
    tried in order and the first rule with a matching substring decides
    the label; when no rule matches, the (truncated) input is returned.

    Args:
        params (type: string): raw phone information

    Returns:
        Rtype: String
        Returns the coarse phone label for the given phone information
    """
    # (substrings to look for, label to return) -- order is significant,
    # e.g. "Unknown" wins over any later iPhone rule
    rules = (
        (("iPhone 6+", "iPhone 6 Plus"), "iPhone 6+"),
        (("Unknown", "iPad", "iPod"), "Other iPhone"),
        (("iPhone 5", "iPhone5"), "iPhone 5"),
        (("iPhone8", "iPhone 8"), "iPhone 8"),
        (("iPhone9", "iPhone 9"), "iPhone 9"),
        (("iPhone X", "iPhone10"), "iPhone X"),
    )
    device = params.split(";")[0] if ";" in params else params
    for needles, label in rules:
        if any(needle in device for needle in needles):
            return label
    return device


def iqr(x):
    """
    Interquartile range of x: q75(x) minus q25(x)
    """
    upper = q75(x)
    lower = q25(x)
    return upper - lower


def q25(x):
    """
    Return the 0.25 quantile (first quartile) of x
    """
    first_quartile = x.quantile(0.25)
    return first_quartile


def q75(x):
    """
    Return the 0.75 quantile (third quartile) of x
    """
    third_quartile = x.quantile(0.75)
    return third_quartile


def valrange(x):
    """
    Return the spread of x, i.e. its maximum minus its minimum
    """
    highest = x.max()
    lowest = x.min()
    return highest - lowest


def kurtosis(x):
    """
    Return the kurtosis of x, as computed by its own .kurt() method
    """
    result = x.kurt()
    return result


def skew(x):
    """
    Return the skewness of x, as computed by its own .skew() method
    """
    result = x.skew()
    return result


def _first_mode(value):
    """
    Reduce one aggregated mode cell to a single value

    pd.Series.mode can yield a plain scalar or an array-like (ties);
    strings are returned untouched, array-likes contribute their first
    entry and an empty result becomes NaN.
    """
    if isinstance(value, str):
        return value
    candidates = np.atleast_1d(value)
    return candidates[0] if candidates.size else np.nan


def groupby_wrapper(data, group, metadata_columns=None):
    """
    Wrapper function to aggregate feature data per group and attach
    per-healthCode metadata

    Every feature column is aggregated with max, median, mean, q25, q75,
    valrange and iqr; the resulting columns are named
    "<aggregation>_<feature>". Metadata is aggregated per healthCode
    (number of unique records, most frequent phoneInfo / table_version /
    test_type) and joined onto the aggregated features. The input frame
    is not modified.

    Args:
        data (dtype: pd.Dataframe)    : feature datasets
        group (dtype: string)         : which group to aggregate
        metadata_columns (dtype: list): columns that are metadata and are
                                        excluded from the feature
                                        aggregation ("healthCode" is
                                        always kept); when metadata is
                                        given it must contain healthCode,
                                        recordId, phoneInfo,
                                        table_version and test_type
    Returns:
        Rtype: pd.Dataframe
        Returns grouped healthcodes features, with an "nrecords" column
        and the annotated phoneInfo / table_version / test_type modes
    """
    metadata_columns = (
        list(metadata_columns) if metadata_columns is not None else [])

    # groupby features based on several aggregation
    feature_cols = [feat for feat in data.columns if
                    (feat not in metadata_columns) or (feat == "healthCode")]
    feature_data = data[feature_cols].groupby(group).agg([np.max,
                                                          np.median,
                                                          np.mean,
                                                          q25, q75,
                                                          valrange, iqr])
    # flatten the (feature, aggregation) column index
    feature_data.columns = ["{}_{}".format(agg, feat)
                            for feat, agg in feature_data.columns]

    # nothing to attach when no metadata columns were supplied
    if not metadata_columns:
        return feature_data

    # groupby metadata based on modes
    metadata = data[metadata_columns]\
        .groupby(["healthCode"])\
        .agg({"recordId": pd.Series.nunique,
              "phoneInfo": pd.Series.mode,
              "table_version": pd.Series.mode,
              "test_type": pd.Series.mode})
    metadata = metadata.rename({"recordId": "nrecords"}, axis=1)

    # Normalise the *aggregated* phone information: the mode may come back
    # as an array, so keep its first entry, then map it to a coarse label.
    # (Previously this was applied to the input frame, which left the
    # returned phoneInfo untouched and mutated the caller's data.)
    metadata["phoneInfo"] = metadata["phoneInfo"]\
        .apply(_first_mode)\
        .apply(lambda info: annot_phone(info)
               if isinstance(info, str) else info)

    # index join on aggregated feature and metadata
    return feature_data.join(metadata, on="healthCode")


# def main():

#     used_script_url = query.get_git_used_script_url(
#         path_to_github_token=data_dict["OUTPUT_INFO"]["PATH_GITHUB_TOKEN"],
#         proj_repo_name=data_dict["OUTPUT_INFO"]["PROJ_REPO_NAME"],
#         script_name=__file__)

# columns handed to groupby_wrapper as metadata_columns in the
# commented-out loop below
metadata_cols = [
    "appVersion",
    "createdOn",
    "phoneInfo",
    "recordId",
    "table_version",
    "test_type",
    "error_type",
    "healthCode",
]

# fetch the demographics file and the mPower V2 feature file from Synapse
demo_data = query.get_file_entity(
    syn, data_dict["DEMOGRAPHIC_DATA_SYNID"])
data = query.get_file_entity(
    syn, data_dict["FEATURE_DATA_SYNIDS"]["MPOWER_V2"])
#     for test_type in data["test_type"].unique():
#         subset = data[data["test_type"] == test_type]
#         subset = groupby_wrapper(subset,
#                                  "healthCode",
#                                  metadata_cols)
#         subset = pd.merge(subset, demo_data, on="healthCode", how="inner")
#             query.save_data_to_synapse(
#                 syn=syn,
#                 data=subset,
#                 source_table_id=[synId, data_dict["DEMOGRAPHIC_DATA_SYNID"]],
#                 used_script=used_script_url,
#                 output_filename=("grouped_%s_%s_features.csv" %
#                                  (key, test_type)).lower(),
#                 data_parent_id="syn21537421")


# if __name__ == '__main__':
#     start_time = time.time()
#     main()
#     print("--- %s seconds ---" % (time.time() - start_time))


Welcome, aryton tediarjo!



INFO:synapseclient_default:Welcome, aryton tediarjo!



In [30]:
"""
Author: Sage Bionetworks

Script to gather the demographics data for all gait datasets from Synapse tables
"""


# global variables
# Synapse table ids and output settings for the demographics cell.
# NOTE(review): this rebinds the `data_dict` defined by the grouping cell
# above with a different set of keys.
data_dict = {
    # demographic / profile tables queried below, keyed by study
    "METADATA": {
        "MPOWER_V1": "syn10371840",
        "MPOWER_V2": "syn15673379",
        "ELEVATE_MS_DEMO": "syn10295288",
        "ELEVATE_MS_PROF": "syn10235463",
    },
    # output settings; not referenced by the visible code of this cell
    "OUTPUT_INFO": {
        "metadata_filename": "gait_metadata.csv",
        "parent_folder_synId": "syn21537423",
        "proj_repo_name": "mpower-gait-analysis",
        "path_to_github_token": "~/git_token.txt",
    },
}
# Log in to Synapse again and rebind `syn`; the table queries below all go
# through this client.
# NOTE(review): the grouping cell above already logs in -- a single login
# per notebook would presumably be enough; confirm before removing.
syn = sc.login()



# mPower V1 demographics: query the table, keep only rows that have a
# diagnosis value and derive the "class" label from it
demo_data_v1 = (
    syn.tableQuery(
        ("SELECT age, healthCode, "
         "    inferred_diagnosis as PD, gender "
         "    FROM {} where dataGroups "
         "    NOT LIKE '%test_user%'")
        .format(data_dict["METADATA"]["MPOWER_V1"]))
    .asDataFrame()
    .dropna(subset=["PD"], thresh=1)
)
# True -> "PD", False -> "control"; any other value maps to NaN
demo_data_v1 = demo_data_v1.assign(
    **{"class": demo_data_v1["PD"].map({True: "PD", False: "control"})})

# ElevateMS demographics: gender / MS status come from the demographics
# table, age from the profile table; the two are matched on healthCode
# NOTE(review): 'gender.json.answer' and 'demographics.age' are wrapped in
# single quotes, which SQL dialects usually read as string literals rather
# than column names -- confirm against the Synapse query syntax that these
# really select the columns.
demo_data_ems = (
    syn.tableQuery(
        ("SELECT healthCode, dataGroups as MS,"
         "    'gender.json.answer' as gender from {}"
         "    where dataGroups NOT LIKE '%test_user%'")
        .format(data_dict["METADATA"]["ELEVATE_MS_DEMO"]))
    .asDataFrame()
)
profile_data_ems = (
    syn.tableQuery(
        ("SELECT healthCode as healthCode, "
         "    'demographics.age' as age from {}")
        .format(data_dict["METADATA"]["ELEVATE_MS_PROF"]))
    .asDataFrame()
)
# keep only participants present in both tables
demo_data_ems = demo_data_ems.merge(
    profile_data_ems, how="inner", on="healthCode")
# "ms_patient" -> "MS", "control" -> "control"; anything else maps to NaN
demo_data_ems = demo_data_ems.assign(
    **{"class": demo_data_ems["MS"].map(
        {"ms_patient": "MS", "control": "control"})})

# demographics on mpower V2
demo_data_v2 = syn.tableQuery(
    "SELECT birthYear, createdOn, healthCode, \
    diagnosis as PD, sex as gender FROM {} \
    where dataGroups NOT LIKE '%test_user%'"
    .format(data_dict["METADATA"]["MPOWER_V2"])).asDataFrame()
# drop participants without a diagnosis answer; .copy() makes the result an
# independent frame so the column assignments below do not write into a
# slice of the query result
demo_data_v2 = demo_data_v2[demo_data_v2["PD"] != "no_answer"].copy()
# "parkinsons" -> "PD", "control" -> "control"; anything else maps to NaN
demo_data_v2["class"] = demo_data_v2["PD"].map(
    {"parkinsons": "PD", "control": "control"})
# Blank out invalid (negative or missing) birth years. The previous code
# assigned a whole filtered DataFrame to this single column, which raises
# in current pandas; masking the column keeps every row and leaves NaN
# where the birth year is unusable, so the derived age is NaN as well.
demo_data_v2["birthYear"] = demo_data_v2["birthYear"].where(
    demo_data_v2["birthYear"] >= 0)
# age at record creation: calendar year of createdOn (epoch milliseconds)
# minus the birth year
demo_data_v2["age"] = (
    pd.to_datetime(demo_data_v2["createdOn"], unit="ms").dt.year
    - demo_data_v2["birthYear"])

# stack the three cohorts into one frame with a fresh 0..n-1 index
demo_data = pd.concat(
    [demo_data_v1, demo_data_v2, demo_data_ems],
    sort=False, ignore_index=True)

# gender: compare case-insensitively and keep only female / male rows
demo_data["gender"] = demo_data["gender"].str.lower()
demo_data = demo_data[demo_data["gender"].isin(["female", "male"])]

# age: convert to float, keep 18..120 (inclusive), discard infinities and
# order from oldest to youngest
demo_data = demo_data.assign(age=demo_data["age"].apply(float))
demo_data = (
    demo_data[demo_data["age"].between(18, 120)]
    .loc[lambda frame: ~frame["age"].isin([np.inf, -np.inf])]
    .sort_values(by="age", ascending=False)
)

# flag healthCodes that appear with two or more distinct classes
# NOTE(review): the flag is dropped again right below without being used
# as a filter -- confirm whether such participants should be excluded.
demo_data = demo_data.merge(
    demo_data.groupby("healthCode")["class"]
    .nunique()
    .ge(2)
    .rename("has_double_class_entry")
    .reset_index(),
    on="healthCode",
    how="left")

# drop the helper columns and keep one row per healthCode; after the sort
# above the first row of each healthCode is the one with the highest age
demo_data = (
    demo_data
    .drop(columns=["PD", "MS", "birthYear",
                   "createdOn", "has_double_class_entry"])
    .drop_duplicates(subset="healthCode", keep="first")
    .reset_index(drop=True)
)

    



    

Welcome, aryton tediarjo!



INFO:synapseclient_default:Welcome, aryton tediarjo!



In [31]:
# Preview only the first rows: the frame holds one row per participant
# (including healthCode), so avoid rendering all of it into the notebook.
demo_data_v2.head(10)

Unnamed: 0,birthYear,createdOn,healthCode,PD,gender,class,age
13_6,1990,1537313728298,e0b36a06-56cc-4b62-a5ab-2ec8f0ed2c18,control,female,control,28
14_48,1965,1537353379331,8908507b-e8dc-4b8f-baa1-909460d7cc8f,parkinsons,female,PD,53
15_48,1951,1537403719795,fb0b63dd-f86c-4a4b-a12a-4eea145e5580,parkinsons,female,PD,67
16_7,1993,1537398283679,c18e1c20-0f50-4b76-b825-e46d3fb33511,parkinsons,female,PD,25
17_48,1994,1537446071863,6c19f855-6107-4ed9-b625-dc8012c047d2,parkinsons,female,PD,24
18_48,1951,1537453341000,1062671e-ea83-426a-a409-0a5c6ed430d0,parkinsons,female,PD,67
19_48,1973,1537577422439,10795802-4f59-4e5f-8aad-6dee84a0ebeb,parkinsons,female,PD,45
20_10,2018,1537891917779,fcfcc111-a43c-408d-9f2d-6c09fb76cf91,parkinsons,male,PD,0
21_11,2018,1537986554581,e2785b7a-30ea-432c-8128-fe7230a00fe6,parkinsons,male,PD,0
22_48,1968,1538240778472,a3442266-c1a2-406b-b1bb-92c9c827e3dc,parkinsons,female,PD,50
