In [23]:
"""
Author: Sage Bionetworks

Script to gather the demographics data of all gait participants from Synapse tables
"""

# import future libraries
from __future__ import print_function
from __future__ import unicode_literals

# import standard libraries
import time
import sys
import os
import pandas as pd
import numpy as np

# import external libraries
import synapseclient as sc

# import project modules
sys.path.append("../src/pipeline")
from utils import query_utils as query

# global variables
# Lookup of the Synapse ids and output settings used by this notebook.
#   DEMOGRAPHICS: ids of the tables queried by generate_gait_demographic
#                 (mPower V1, mPower V2, ElevateMS demographics/profiles)
#   OUTPUT_INFO : file name, parent Synapse id, repository name and token
#                 path -- presumably used when storing the compiled
#                 result; not referenced in the cells shown here
DATA_DICT = {
    "DEMOGRAPHICS": {
        "MPOWER_V1": "syn10371840",
        "MPOWER_V2": "syn15673379",
        "ELEVATE_MS_DEMO": "syn10295288",
        "ELEVATE_MS_PROF": "syn10235463",
    },
    "OUTPUT_INFO": {
        "FILENAME": "gait_demographics.csv",
        "PARENT_SYN_ID": "syn21537423",
        "PROJ_REPO": "mpower-gait-analysis",
        "TOKEN_PATH": "~/git_token.txt",
    },
}


def generate_gait_demographic(syn):
    """
    Function to compile one demographic record per healthCode from the
    mPower V1, mPower V2 and ElevateMS Synapse tables listed in
    DATA_DICT["DEMOGRAPHICS"].

    Cleaning process:
        1. Annotate controls, PD, MS in a column called class
        2. mPower V1 rows with a NULL inferred diagnosis and mPower V2
            rows whose diagnosis is "no_answer" are removed; only rows
            whose gender is "female" or "male" (case-insensitive)
            are kept
        3. mPower V2 records a birthYear instead of an age, so age is
            computed as year(createdOn) - birthYear; a negative
            birthYear is treated as missing
        4. healthCodes that are annotated with more than one class
            are dropped from the dataframe
        5. Age is subsetted between 18-120 years old (inclusive),
            rows without a usable age are dropped
        6. One row is kept per healthCode: the one with the
            highest age

    Args:
        syn: synapseclient object; only
             ``syn.tableQuery(<query>).asDataFrame()`` is used

    Returns:
        RType: pd.DataFrame
        Returns a dataframe of unique healthCode with the queried
        columns (minus PD, MS, birthYear, createdOn) plus the
        derived "class" and "table_version" columns, ordered by
        descending age
    """
    table_ids = DATA_DICT["DEMOGRAPHICS"]

    # demographics on mpower V1
    demo_data_v1 = syn.tableQuery(
        "SELECT age, healthCode, \
        inferred_diagnosis as PD, gender \
        FROM {} where dataGroups \
        NOT LIKE '%test_user%'"
        .format(table_ids["MPOWER_V1"]))\
        .asDataFrame()
    # copy so the column assignments below never write into a view
    demo_data_v1 = demo_data_v1.dropna(subset=["PD"]).copy()
    demo_data_v1["class"] = demo_data_v1["PD"]\
        .map({True: "PD", False: "control"})
    demo_data_v1["table_version"] = "MPOWER_V1"

    # demographics on ElevateMS (diagnosis/gender and age live in
    # two different tables that are joined on healthCode)
    demo_data_ems = syn.tableQuery(
        "SELECT healthCode, dataGroups as MS,\
        'gender.json.answer' as gender from {}\
        where dataGroups NOT LIKE '%test_user%'"
        .format(table_ids["ELEVATE_MS_DEMO"]))\
        .asDataFrame()
    profile_data_ems = syn.tableQuery(
        "SELECT healthCode as healthCode, \
        'demographics.age' as age from {}"
        .format(table_ids["ELEVATE_MS_PROF"]))\
        .asDataFrame()
    demo_data_ems = pd.merge(
        demo_data_ems, profile_data_ems,
        how="inner", on="healthCode")
    demo_data_ems["class"] = demo_data_ems["MS"].map(
        {"ms_patient": "MS", "control": "control"})
    demo_data_ems["table_version"] = "ELEVATE_MS"

    # demographics on mpower V2
    demo_data_v2 = syn.tableQuery(
        "SELECT birthYear, createdOn, healthCode, \
        diagnosis as PD, sex as gender FROM {} \
        where dataGroups NOT LIKE '%test_user%'"
        .format(table_ids["MPOWER_V2"])).asDataFrame()
    demo_data_v2 = demo_data_v2[demo_data_v2["PD"] != "no_answer"].copy()
    demo_data_v2["class"] = demo_data_v2["PD"]\
        .map({"parkinsons": "PD", "control": "control"})
    # blank out invalid (negative) birth years; the resulting NaN age
    # is removed by the age filter further down. The previous code
    # assigned a whole filtered DataFrame to this single column, which
    # raises on current pandas.
    demo_data_v2["birthYear"] = demo_data_v2["birthYear"].where(
        demo_data_v2["birthYear"] >= 0)
    demo_data_v2["age"] =\
        pd.to_datetime(demo_data_v2["createdOn"],
                       unit="ms").dt.year - demo_data_v2["birthYear"]
    demo_data_v2["table_version"] = "MPOWER_V2"

    # concatenate all demographic data
    demo_data = pd.concat(
        [demo_data_v1, demo_data_v2, demo_data_ems], sort=False)\
        .reset_index(drop=True)

    # filter gender
    demo_data["gender"] = demo_data["gender"].str.lower()
    demo_data = demo_data[
        demo_data["gender"].isin(["female", "male"])].copy()

    # filter age; the range check is False for NaN and +/-inf, so no
    # separate infinity filter is required
    demo_data["age"] = demo_data["age"].astype(float)
    demo_data = demo_data[(demo_data["age"] >= 18)
                          & (demo_data["age"] <= 120)]
    demo_data = demo_data.sort_values(by="age", ascending=False)

    # drop healthCodes that were annotated with more than one class
    # (the flag used to be computed and then discarded without being
    # applied)
    classes_per_code = demo_data.groupby("healthCode")["class"]\
        .transform("nunique")
    demo_data = demo_data[~(classes_per_code >= 2)]

    # remove helper columns and keep the first (highest age) row of
    # every healthCode
    demo_data = demo_data.drop(
        ["PD", "MS", "birthYear", "createdOn"], axis=1)
    demo_data = demo_data.drop_duplicates(
        'healthCode', keep="first").reset_index(drop=True)
    return demo_data



# Main Function
# Entry point for the script
# Note: Passive gait data will be separated from active gait data
#       as we don't want to combine both in analysis

# retrieve synapse credential through config
# expanduser("~") still resolves the home directory when $HOME is unset,
# whereas os.getenv("HOME") would return None and make os.path.join raise
path = os.path.join(os.path.expanduser("~"),
                    ".synapseConfig")
syn = sc.Synapse(configPath=path)
# user name and password are read from the environment, never hard-coded
syn.login(os.getenv("syn_username"),
          os.getenv("syn_password"),
          rememberMe=True)

# process metadata from synapse table
metadata = generate_gait_demographic(syn)

    

Welcome, aryton tediarjo!



In [29]:
# class balance of the rows compiled from the ElevateMS tables
metadata.loc[metadata["table_version"] == "ELEVATE_MS", "class"].value_counts()

MS         272
control     45
Name: class, dtype: int64

In [36]:
# download every row of Synapse table syn10278766 as a dataframe
# NOTE(review): presumably an ElevateMS gait record table -- confirm
data = (
    syn
    .tableQuery("SELECT * FROM syn10278766")
    .asDataFrame()
)

In [37]:
# number of distinct recordId values per healthCode
data = data.groupby("healthCode")["recordId"].nunique().reset_index()

In [39]:
# attach demographics to every healthCode of the record table; the right
# join keeps healthCodes without demographics (their metadata is NaN)
data = metadata.merge(data, how="right", on="healthCode")

In [41]:
# keep rows with an age of 18 to 120 (inclusive); rows whose age is NaN
# fail both comparisons and are removed as well
data = data.loc[(data["age"] >= 18) & (data["age"] <= 120)]

In [43]:
# class balance of the healthCodes that remain after the age filter
data.loc[:, "class"].value_counts()

MS         212
control     28
Name: class, dtype: int64