In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("../../src")
import utils.query_utils as query
import utils.preprocessing_utils as process
from sklearn.utils import resample
from sklearn import metrics
import pandas as pd
import numpy as np
import warnings
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split
warnings.simplefilter("ignore")
import synapseclient as sc

# Introduction

This notebook explores how models perform on our metadata using bootstrapped subsampling.

In [18]:
# Open a Synapse session. login() is called without arguments, so credentials
# presumably come from a cached session / local config — TODO confirm.
syn = sc.login()

# Fetch the Synapse entity "syn21281633" through the project helper.
# NOTE(review): later cells index `data` by column and call .groupby() on it,
# so it is treated as a pandas DataFrame; the helper itself is not visible here.
data = query.get_file_entity(syn, "syn21281633")
# Disabled filter that would keep only rows whose version is "mpower_v1" or "mpower_v2":
# data = data[(data["version"] == "mpower_v1") | (data["version"] == "mpower_v2")]

# Columns of `data` that are selected when the `metadata` table is built.
METADATA_COLS = ["PD", "appVersion", "phoneInfo", "age", "gender", "createdOn", "healthCode", "version", "recordId"]

def one_hot_encoder(data, *features):
    """
    Replace each named column of `data` with one-hot indicator columns.

    For every column in `features` (processed in the order given) the column
    is expanded with pd.get_dummies (first level dropped, float dtype), the
    indicator columns are appended on the right, and the source column is
    removed. Indicator columns are named after the category values themselves
    (no prefix is added).

    Args:
        data: DataFrame holding the columns to encode; it is not modified.
        *features: names of the columns to encode.

    Returns:
        A DataFrame with the encoded columns swapped for their indicators;
        `data` itself when no feature is given.
    """
    result = data
    for column in features:
        indicators = pd.get_dummies(result[column], drop_first=True, dtype=float)
        widened = pd.concat([result, indicators], axis=1)
        # dropped only after the concat, exactly as the column name resolves on the widened frame
        result = widened.drop(column, axis=1)
    return result

Welcome, aryton tediarjo!



In [20]:
## helper functions ## 
def annot_phone(params):
    """
    Collapse a raw phone-info string into a coarser phone-type label.

    Only the text before the first ";" is considered. The rules below are
    tried in order and the first one with a matching substring wins; when
    none matches, the (truncated) input string is returned unchanged.

    Args:
        params: raw phone description string.

    Returns:
        One of "iPhone 6+", "Other iPhone", "iPhone 5", "iPhone 8",
        "iPhone X", or the truncated input itself.
    """
    model = params.split(";")[0] if ";" in params else params

    # (label, substrings that trigger it) — order matters, earlier rules take priority
    rules = (
        ("iPhone 6+", ("iPhone 6+", "iPhone 6 Plus")),
        ("Other iPhone", ("iPhone9", "Unknown", "iPad", "iPod")),
        ("iPhone 5", ("iPhone 5",)),
        ("iPhone 8", ("iPhone8",)),
        ("iPhone X", ("iPhone X", "iPhone10")),
    )
    for label, needles in rules:
        if any(needle in model for needle in needles):
            return label
    return model

In [70]:
# number of (non-null) recordId entries per healthCode
nrecords = (
    data.groupby("healthCode")["recordId"]
    .count()
    .rename("nrecords")
    .reset_index()
)

# one row per healthCode: attach the record count, keep the row with the most
# recent createdOn, and derive a coarser phone_type from phoneInfo
metadata = (
    data[METADATA_COLS]
    .merge(nrecords, how="inner", on="healthCode")
    .sort_values(by="createdOn", ascending=False)
    .drop_duplicates(subset="healthCode", keep="first")
    .assign(phone_type=lambda frame: frame["phoneInfo"].apply(annot_phone))
)
metadata = metadata[["healthCode", "version", "age", "gender", "PD", "phone_type", "nrecords"]]

# indicator columns for gender and phone_type (first level of each dropped)
OHE_metadata = one_hot_encoder(metadata, "gender", "phone_type")

In [72]:
# Hand the one-hot-encoded metadata to the project's Synapse upload helper,
# with output filename "OHE_metadata.csv" and parent id "syn21267355".
# The index is reset and discarded first, so the old index is not passed along.
# NOTE(review): how the helper serialises/uploads is defined in utils.query_utils — not visible here.
query.save_data_to_synapse(syn, data = OHE_metadata.reset_index(drop = True),
                           output_filename = "OHE_metadata.csv",
                           data_parent_id = "syn21267355")


##################################################
 Uploading file to Synapse storage 
##################################################



In [39]:
## set X_test, y_test as OOB samples
## set X_train, y_train as sample for bootstrap
# 75/25 split with a fixed seed so the held-out rows are the same on every run.
# NOTE(review): the feature frame passed here is the full `metadata`, which still
# contains the "PD" column that is also used as the target — drop it before
# fitting anything on X_train / X_test, otherwise the label leaks into the features.
X_train, X_test, y_train, y_test = train_test_split(metadata, metadata["PD"], test_size=0.25, random_state=100)

In [40]:
def bootstrap_model(metadata, model, n_bootstraps=100, random_state=100):
    """
    Estimate the spread of a model's ROC AUC via bootstrap resampling.

    The data is split once into a training part (75%) and a held-out part
    (25%). For every bootstrap round the training rows are resampled with
    replacement, the model is refit on that sample, and the ROC AUC of its
    predictions on the fixed held-out part is recorded.

    Args:
        metadata: DataFrame containing the feature columns and a "PD" column
            used as the target. "PD" is removed from the features before
            fitting so the label cannot leak into the model inputs.
        model: estimator exposing fit(X, y) and predict(X); it is refit in
            place on every round.
        n_bootstraps: number of bootstrap rounds (default 100).
        random_state: base seed for the split and the resampling; None gives
            non-reproducible draws.

    Returns:
        dict mapping "bootstrap_sample <i>" (i = 1..n_bootstraps) to a
        one-element list holding that round's ROC AUC.
    """
    features = metadata.drop(columns=["PD"])
    X_train, X_test, y_train, y_test = \
                        train_test_split(features,
                                         metadata["PD"],
                                         test_size=0.25,
                                         random_state=random_state)
    performance_dict = {}
    for B in range(1, n_bootstraps + 1):
        # A different seed per round: reusing one seed would draw the exact
        # same sample every time and make all rounds identical.
        seed = None if random_state is None else random_state + B
        # Resample features and labels together so each row keeps its own label.
        X_train_boot, y_train_boot = resample(X_train, y_train,
                                              replace=True,
                                              random_state=seed)
        model.fit(X_train_boot, y_train_boot)
        # AUC is computed from hard class predictions, as in the original analysis.
        y_pred = model.predict(X_test)
        results = metrics.roc_auc_score(y_test, y_pred)
        performance_dict["bootstrap_sample %s" %B] = [results]
    return performance_dict

In [47]:
## grouped by median ##
# NOTE(review): nothing in these lines groups or takes a median — the heading may describe code further down.
# Keep version / age / gender plus the PD column, then one-hot encode version and gender.
OHE_metadata = one_hot_encoder(
    metadata[["version", "age", "gender", "PD"]],
    "version",
    "gender",
)