In [77]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("../../src")
import utils.munging_utils as query
import utils.preprocessing_utils as process
from sklearn.utils import resample
from sklearn import metrics
import pandas as pd
import numpy as np
import warnings
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split
warnings.simplefilter("ignore")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Introduction

This notebook will explore how models performed on our metadata through bootstrapped subsamling

In [54]:
data = query.get_file_entity("syn21281633")
data = data[(data["version"] == "mpower_v1") | (data["version"] == "mpower_v2")]

METADATA_COLS = ["PD", "appVersion", "phoneInfo", "age", "gender"]

def one_hot_encoder(data, *features):
    for feature in features:
        OHE_data = pd.get_dummies(data[feature], drop_first = True, dtype = float)
        data = pd.concat([data, OHE_data], axis = 1).drop(feature, axis = 1)
    return data

In [65]:
metadata = data[METADATA_COLS]
metadata = one_hot_encoder(metadata, "appVersion", "phoneInfo", "gender")

In [70]:
## set X_test, y_test as OOB samples
## set X_train, y_train as sample for bootstrap
X_train, X_test, y_train, y_test = train_test_split(metadata, metadata["PD"], test_size=0.25, random_state=100)

In [102]:
def bootstrap_model(metadata, model):
    X_train, X_test, y_train, y_test = \
                        train_test_split(metadata, 
                                         metadata["PD"], 
                                         test_size=0.25, 
                                         random_state=100)
    performance_dict = {}
    for B in range(1,101):
#         performance_dict["bootstrap_sample %s" %B] = 
        y_train_boot   = resample(y_train, replace=True, random_state=100)
        model.fit(X_train, y_train_boot)
        y_pred = model.predict(X_test)
        results = metrics.roc_auc_score(y_test, y_pred)
        performance_dict["bootstrap_sample %s" %B] = [results]  
    return performance_dict

In [100]:
pd.DataFrame(bootstrap_model(X_train, X_test, y_train, y_test, 
                             RandomForestClassifier())).T

Unnamed: 0,0
bootstrap_sample 1,0.499530
bootstrap_sample 2,0.500208
bootstrap_sample 3,0.498216
bootstrap_sample 4,0.500134
bootstrap_sample 5,0.501992
bootstrap_sample 6,0.500267
bootstrap_sample 7,0.501082
bootstrap_sample 8,0.500448
bootstrap_sample 9,0.501934
bootstrap_sample 10,0.501694


In [108]:
## grouped by median ##
grouped_data = process.collapseFeatures(aggregation_type = "max").transform(data)
grouped_metadata = grouped_data[METADATA_COLS]
grouped_metadata = one_hot_encoder(grouped_metadata, "appVersion", "phoneInfo", "gender")

In [109]:
bootstrap_model(grouped_metadata, RandomForestClassifier())

{'bootstrap_sample 1': [0.5274314003784806],
 'bootstrap_sample 2': [0.5380846174641795],
 'bootstrap_sample 3': [0.5425495066234117],
 'bootstrap_sample 4': [0.5368680724520141],
 'bootstrap_sample 5': [0.5381437550689376],
 'bootstrap_sample 6': [0.540530379832387],
 'bootstrap_sample 7': [0.5339280886726143],
 'bootstrap_sample 8': [0.5564975331170587],
 'bootstrap_sample 9': [0.5380846174641795],
 'bootstrap_sample 10': [0.5349080832657476],
 'bootstrap_sample 11': [0.5459034536361179],
 'bootstrap_sample 12': [0.51191200324412],
 'bootstrap_sample 13': [0.5210065220329819],
 'bootstrap_sample 14': [0.5447460462287105],
 'bootstrap_sample 15': [0.5329945593403622],
 'bootstrap_sample 16': [0.5322384428223844],
 'bootstrap_sample 17': [0.539182887266829],
 'bootstrap_sample 18': [0.5273722627737227],
 'bootstrap_sample 19': [0.5417933901054339],
 'bootstrap_sample 20': [0.535191098945661],
 'bootstrap_sample 21': [0.5310810354149771],
 'bootstrap_sample 22': [0.5284832049202487],
 '