In [3]:
import sys
import warnings
sys.path.append("../src/pipeline")
from utils import query_utils as query
from utils import gait_features_utils as gproc
import synapseclient as sc
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import sklearn.metrics as metrics

# magic commands
%matplotlib inline
%load_ext autoreload
%autoreload 2

# extra commands
sns.set_style("darkgrid")
sns.set_context("paper")
warnings.simplefilter("ignore")

In [4]:
## package call ##
gf = gproc.GaitFeaturize()

In [5]:
syn = sc.login()


# global variables
DATA_DICT = {
    "FEATURE_DATA_SYNIDS": {
        "MPOWER_V1": "syn21765655",
        "MPOWER_V2": "syn21765659",
        "MPOWER_PASSIVE": "syn21765662",
        "ELEVATE_MS": "syn21765671"},
    "DEMOGRAPHIC_DATA_SYNID": "syn21602828",
    "OUTPUT_INFO": {
        "PARENT_SYN_ID": "syn21537421",
        "PROJ_REPO": "mpower-gait-analysis",
        "TOKEN_PATH": "~/git_token.txt"}
}


Welcome, aryton tediarjo!




UPGRADE AVAILABLE

A more recent version of the Synapse Client (2.0.0) is available. Your version (1.9.3) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 2.0.0 release notes

https://python-docs.synapse.org/build/html/news.html

INFO:synapseclient_default:Welcome, aryton tediarjo!



In [6]:
from scipy.stats import zscore

In [7]:
def iqr(x):
    """
    Function for getting IQR value
    """
    return x.quantile(0.75) - x.quantile(0.25)


def z_score(x):
    try:
        return zscore(x,ddof=1)
    except:
        return np.NaN


def aggregate_wrapper(data, group, metadata_columns=[]):
    """
    Wrapper function to wrap feature data
    into several aggregation function

    Args:
        data (dtype: pd.Dataframe)   : feature datasets
        group (dtype: string)        : which group to aggregate
        exclude_columns (dtype: list): columns to exclude during groupby
    Returns:
        Rtype: pd.Dataframe
        Returns grouped healthcodes features
    """

    data = data[data["gait_segment"] != "rest"]

    # groupby features based on several aggregation
    feature_mapping = {
        "nonrot_data":
        data[(data["gait_segment"] == "walk")
             & (data["window_size"] >= 5)]
        .drop("rotation_omega", axis=1),
        "rot_data":
        data[data["gait_segment"] == "rotation"][[group, "rotation_omega"]]
    }
    for gait_sequence, feature_data in feature_mapping.items():
        feature_cols = [feat for feat in feature_data.columns if
                        (feat not in metadata_columns)
                        or (feat == group)]
        feature_mapping[gait_sequence] = feature_data[feature_cols]\
            .groupby(group)\
            .agg([z_score])
        agg_feature_cols = []
        for feat, agg in feature_mapping[gait_sequence].columns:
            agg_feature_cols.append("{}_{}"
                                    .format(agg, feat))
        feature_mapping[gait_sequence].columns = agg_feature_cols

    feature_data = pd.concat([seqs for _,
                              seqs in feature_mapping.items()],
                             join="outer",
                             axis=1)
    feature_data.index.name = group
    feature_data = feature_data.reset_index()

#     # if aggregate on recordId no need to aggregate metadata
#     if group == "recordId":
#         metadata = data[metadata_columns].\
#             drop_duplicates(subset=["recordId"],
#                             keep="first")

#     # aggregate on healthcode require aggregate on metadata
#     else:
#         metadata = data[metadata_columns]\
#             .groupby([group])\
#             .agg({"recordId": pd.Series.nunique,
#                   "phoneInfo": pd.Series.mode,
#                   "table_version": pd.Series.mode,
#                   "test_type": pd.Series.mode})
#         metadata = metadata.rename(
#             {"recordId": "nrecords"}, axis=1)
#         metadata = metadata.reset_index()
#     metadata["phoneInfo"] = metadata["phoneInfo"].apply(
#         lambda x: x[0] if not isinstance(x, str) else x)
#     metadata["phoneInfo"] = metadata["phoneInfo"].apply(annot_phone)

#     feature_data = pd.merge(
#         feature_data, metadata, on=group, how="left")
    return feature_data

In [8]:
data = query.get_file_entity(syn, "syn21765659")

In [9]:
aggregate_wrapper(data, "healthCode")

Exception: Must produce aggregated value