In [1]:
import pickle
from sklearn.externals import joblib 
import warnings
warnings.simplefilter("ignore")
import synapseclient as sc
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
np.random.seed(100)



In [92]:
def rename_column_name(data):
    for feature in filter(lambda x: "userAccel" in x, data.columns): 
        data  = data.rename({feature: "max_{}"\
                            .format(feature)}, axis = 1)
    return data

In [164]:
selected_columns = ['max_userAccel_x.no_of_steps', 'max_userAccel_x.frequency_of_peaks',
       'max_userAccel_x.max_freeze_index', 'max_userAccel_x.freeze_occurences',
       'max_userAccel_y.no_of_steps', 'max_userAccel_y.frequency_of_peaks',
       'max_userAccel_y.max_freeze_index', 'max_userAccel_y.freeze_occurences',
       'max_userAccel_z.no_of_steps', 'max_userAccel_z.frequency_of_peaks',
       'max_userAccel_z.max_freeze_index', 'max_userAccel_z.freeze_occurences',
       'max_userAccel_AA.frequency_of_peaks',
       'max_userAccel_AA.max_freeze_index',
       'max_userAccel_AA.freeze_occurences', "PD"]

In [264]:
syn = sc.login()

active_entity = syn.get("syn21027487")
active_data   = pd.read_csv(active_entity["path"], index_col = 0)
active_max_data = active_data.groupby("healthCode").max()
active_max_data = rename_column_name(active_max_data).dropna().drop(["gender"], axis = 1)
grouped  = active_data[["healthCode", "recordId"]].groupby("healthCode").count()["recordId"] > 5
active_max_data = active_max_data[active_max_data.index.isin(grouped[grouped].index)]

passive_entity = syn.get("syn21036367")
passive_data   = pd.read_csv(passive_entity["path"], index_col = 0)
passive_max_data = passive_data.groupby("healthCode").max()
passive_max_data = rename_column_name(passive_max_data).dropna().drop(["gender"], axis = 1)
grouped  = passive_data[["healthCode", "recordId"]].groupby("healthCode").count()["recordId"] > 5
passive_max_data = passive_max_data[passive_max_data.index.isin(grouped[grouped].index)]

Welcome, aryton tediarjo!



In [267]:
gb_walking_model = joblib.load('../Models/{}.pkl'.format("refitted_gb"))
xgb_walking_model = joblib.load('../Models/{}.pkl'.format("refitted_xgb"))
rf_walking_model = joblib.load('../Models/{}.pkl'.format("refitted_rf"))
lr_walking_model = joblib.load('../Models/{}.pkl'.format("refitted_lr"))

In [269]:
active_max_data[selected_columns].to_csv("../Data/MAX_WALKING_TRAINING_DATA_V2.csv")
passive_max_data[selected_columns].to_csv("../Data/MAX_WALKING_TRAINING_DATA_PASSIVE.csv")

In [216]:
def print_performance(model, walking_X_test, walking_y_test):
    print(model)
    pipeline = globals()[model]
    y_true, y_pred = walking_y_test, pipeline.predict(walking_X_test)
    print("ROC-AUC on Test-Set: {}".format(metrics.roc_auc_score(y_true, y_pred)))
    print("log-loss: {}".format(metrics.log_loss(y_true, y_pred)))
    print("Precision: {}".format(metrics.precision_score(y_true, y_pred)))
    print("Recall: {}".format(metrics.recall_score(y_true, y_pred)))
    print("F1-Score: {}".format(metrics.f1_score(y_true, y_pred)))
    print("\n")

# Performance in Active

In [217]:
walking_y_test = active_max_data["PD"]
walking_X_test = active_max_data[selected_columns].drop(["PD"], axis = 1)

In [218]:
for model in ["gb_walking_model", "xgb_walking_model", 
              "rf_walking_model", "lr_walking_model"]:
    print_performance(model, walking_X_test, walking_y_test)

gb_walking_model
ROC-AUC on Test-Set: 0.6365606270170585
log-loss: 9.057435735521665
Precision: 0.8915094339622641
Recall: 0.7842323651452282
F1-Score: 0.8344370860927152


xgb_walking_model
ROC-AUC on Test-Set: 0.48045182111572154
log-loss: 28.13824509931437
Precision: 0.75
Recall: 0.04979253112033195
F1-Score: 0.09338521400778209


rf_walking_model
ROC-AUC on Test-Set: 0.4734900875979714
log-loss: 28.017482942679827
Precision: 0.7368421052631579
Recall: 0.058091286307053944
F1-Score: 0.10769230769230768


lr_walking_model
ROC-AUC on Test-Set: 0.5096357768556938
log-loss: 28.017471759498985
Precision: 0.9090909090909091
Recall: 0.04149377593360996
F1-Score: 0.07936507936507936




In [219]:
print("SkLearn Gradient Boosting")
print(metrics.classification_report(gb_walking_model.predict(walking_X_test), 
                                    walking_y_test))
print(pd.crosstab(gb_walking_model.predict(walking_X_test), 
            walking_y_test, 
            rownames = ["Actual"], 
            colnames = ["Predicted"]))

print("\nXGBoost Gradient Boosting")
print(metrics.classification_report(xgb_walking_model.predict(walking_X_test), 
                                    walking_y_test))
print(pd.crosstab(xgb_walking_model.predict(walking_X_test), 
            walking_y_test, 
            rownames = ["Actual"], 
            colnames = ["Predicted"]))


print("\nRandom Forests")
print(metrics.classification_report(rf_walking_model.predict(walking_X_test), 
                                    walking_y_test))
print(pd.crosstab(rf_walking_model.predict(walking_X_test), 
            walking_y_test, 
            rownames = ["Actual"], 
            colnames = ["Predicted"]))

SkLearn Gradient Boosting
              precision    recall  f1-score   support

           0       0.49      0.30      0.37        74
           1       0.78      0.89      0.83       212

    accuracy                           0.74       286
   macro avg       0.64      0.59      0.60       286
weighted avg       0.71      0.74      0.71       286

Predicted   0    1
Actual            
0          22   52
1          23  189

XGBoost Gradient Boosting
              precision    recall  f1-score   support

           0       0.91      0.15      0.26       270
           1       0.05      0.75      0.09        16

    accuracy                           0.19       286
   macro avg       0.48      0.45      0.18       286
weighted avg       0.86      0.19      0.25       286

Predicted   0    1
Actual            
0          41  229
1           4   12

Random Forests
              precision    recall  f1-score   support

           0       0.89      0.15      0.26       267
           1    

## Performance in Passive

In [220]:
walking_y_test = passive_max_data["PD"]
walking_X_test = passive_max_data[selected_columns].drop(["PD"], axis = 1)

In [221]:
for model in ["gb_walking_model", "xgb_walking_model", 
              "rf_walking_model", "lr_walking_model"]:
    print_performance(model, walking_X_test, walking_y_test)

gb_walking_model
ROC-AUC on Test-Set: 0.5943152454780362
log-loss: 10.627392698186961
Precision: 0.8648648648648649
Recall: 0.7441860465116279
F1-Score: 0.8


xgb_walking_model
ROC-AUC on Test-Set: 0.6240310077519382
log-loss: 13.94839813625798
Precision: 0.8928571428571429
Recall: 0.5813953488372093
F1-Score: 0.7042253521126761


rf_walking_model
ROC-AUC on Test-Set: 0.5568475452196383
log-loss: 15.276827989858969
Precision: 0.8571428571428571
Recall: 0.5581395348837209
F1-Score: 0.676056338028169


lr_walking_model
ROC-AUC on Test-Set: 0.5839793281653747
log-loss: 21.254647004511
Precision: 0.9230769230769231
Recall: 0.27906976744186046
F1-Score: 0.42857142857142855




In [222]:
print("SkLearn Gradient Boosting")
print(metrics.classification_report(gb_walking_model.predict(walking_X_test), 
                                    walking_y_test))
print(pd.crosstab(gb_walking_model.predict(walking_X_test), 
            walking_y_test, 
            rownames = ["Actual"], 
            colnames = ["Predicted"]))

print("\nXGBoost Gradient Boosting")
print(metrics.classification_report(xgb_walking_model.predict(walking_X_test), 
                                    walking_y_test))
print(pd.crosstab(xgb_walking_model.predict(walking_X_test), 
            walking_y_test, 
            rownames = ["Actual"], 
            colnames = ["Predicted"]))


print("\nRandom Forests")
print(metrics.classification_report(rf_walking_model.predict(walking_X_test), 
                                    walking_y_test))
print(pd.crosstab(rf_walking_model.predict(walking_X_test), 
            walking_y_test, 
            rownames = ["Actual"], 
            colnames = ["Predicted"]))

SkLearn Gradient Boosting
              precision    recall  f1-score   support

           0       0.44      0.27      0.33        15
           1       0.74      0.86      0.80        37

    accuracy                           0.69        52
   macro avg       0.59      0.57      0.57        52
weighted avg       0.66      0.69      0.67        52

Predicted  0   1
Actual          
0          4  11
1          5  32

XGBoost Gradient Boosting
              precision    recall  f1-score   support

           0       0.67      0.25      0.36        24
           1       0.58      0.89      0.70        28

    accuracy                           0.60        52
   macro avg       0.62      0.57      0.53        52
weighted avg       0.62      0.60      0.55        52

Predicted  0   1
Actual          
0          6  18
1          3  25

Random Forests
              precision    recall  f1-score   support

           0       0.56      0.21      0.30        24
           1       0.56      0.8