In [1]:
import pickle
from sklearn.externals import joblib 
import warnings
warnings.simplefilter("ignore")
import synapseclient as sc
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV, SelectKBest, chi2, SelectFromModel, RFE
np.random.seed(100)



In [10]:
syn = sc.login()

active_entity = syn.get("syn21046181")
active_data   = pd.read_csv(active_entity["path"], index_col = 0)
active_max_data = active_data.groupby("healthCode").max()
active_max_data = rename_column_name(active_max_data).dropna().drop(["gender"], axis = 1)
grouped  = active_data[["healthCode", "recordId"]].groupby("healthCode").count()["recordId"] > 5
active_max_data = active_max_data[active_max_data.index.isin(grouped[grouped].index)]

passive_entity = syn.get("syn21046184")
passive_data   = pd.read_csv(passive_entity["path"], index_col = 0)
passive_max_data = passive_data.groupby("healthCode").max()
passive_max_data = rename_column_name(passive_max_data).dropna().drop(["gender"], axis = 1)
grouped  = passive_data[["healthCode", "recordId"]].groupby("healthCode").count()["recordId"] > 5
passive_max_data = passive_max_data[passive_max_data.index.isin(grouped[grouped].index)]

Welcome, aryton tediarjo!



In [11]:
gb_walking_model = joblib.load('../Models/{}.pkl'.format("refitted_gb"))
xgb_walking_model = joblib.load('../Models/{}.pkl'.format("refitted_xgb"))
rf_walking_model = joblib.load('../Models/{}.pkl'.format("refitted_rf"))
lr_walking_model = joblib.load('../Models/{}.pkl'.format("refitted_lr"))

In [12]:
active_max_data.drop(["recordId", "phoneInfo", "createdOn"], axis = 1).to_csv("../Data/MAX_WALKING_TRAINING_DATA_V2.csv")
passive_max_data.drop(["recordId", "phoneInfo", "createdOn"], axis = 1).to_csv("../Data/MAX_WALKING_TRAINING_DATA_PASSIVE.csv")

In [13]:
def print_performance(model, walking_X_test, walking_y_test):
    print(model)
    pipeline = globals()[model]
    y_true, y_pred = walking_y_test, pipeline.predict(walking_X_test)
    print("ROC-AUC on Test-Set: {}".format(metrics.roc_auc_score(y_true, y_pred)))
    print("log-loss: {}".format(metrics.log_loss(y_true, y_pred)))
    print("Precision: {}".format(metrics.precision_score(y_true, y_pred)))
    print("Recall: {}".format(metrics.recall_score(y_true, y_pred)))
    print("F1-Score: {}".format(metrics.f1_score(y_true, y_pred)))
    print("\n")

# Performance in Active

In [14]:
walking_y_test = active_max_data["PD"]
walking_X_test = active_max_data[cols]

In [15]:
for model in ["gb_walking_model", "xgb_walking_model", 
              "rf_walking_model", "lr_walking_model"]:
    print_performance(model, walking_X_test, walking_y_test)

gb_walking_model


ValueError: X has a different shape than during fitting.

In [72]:
print("SkLearn Gradient Boosting")
print(metrics.classification_report(gb_walking_model.predict(walking_X_test), 
                                    walking_y_test))
print(pd.crosstab(gb_walking_model.predict(walking_X_test), 
            walking_y_test, 
            rownames = ["Actual"], 
            colnames = ["Predicted"]))

print("\nXGBoost Gradient Boosting")
print(metrics.classification_report(xgb_walking_model.predict(walking_X_test), 
                                    walking_y_test))
print(pd.crosstab(xgb_walking_model.predict(walking_X_test), 
            walking_y_test, 
            rownames = ["Actual"], 
            colnames = ["Predicted"]))


print("\nRandom Forests")
print(metrics.classification_report(rf_walking_model.predict(walking_X_test), 
                                    walking_y_test))
print(pd.crosstab(rf_walking_model.predict(walking_X_test), 
            walking_y_test, 
            rownames = ["Actual"], 
            colnames = ["Predicted"]))

SkLearn Gradient Boosting
              precision    recall  f1-score   support

           0       0.12      0.09      0.11        11
           1       0.94      0.95      0.94       152

    accuracy                           0.90       163
   macro avg       0.53      0.52      0.52       163
weighted avg       0.88      0.90      0.89       163

Predicted  0    1
Actual           
0          1   10
1          7  145

XGBoost Gradient Boosting
              precision    recall  f1-score   support

           0       0.88      0.05      0.09       153
           1       0.06      0.90      0.11        10

    accuracy                           0.10       163
   macro avg       0.47      0.47      0.10       163
weighted avg       0.82      0.10      0.09       163

Predicted  0    1
Actual           
0          7  146
1          1    9

Random Forests
              precision    recall  f1-score   support

           0       0.75      0.04      0.08       151
           1       0.06 

## Performance in Passive

In [62]:
walking_y_test = passive_max_data["PD"]
walking_X_test = passive_max_data[cols]

In [63]:
for model in ["gb_walking_model", "xgb_walking_model", 
              "rf_walking_model", "lr_walking_model"]:
    print_performance(model, walking_X_test, walking_y_test)

gb_walking_model
ROC-AUC on Test-Set: 0.4666666666666667
log-loss: 6.90786950718645
Precision: 0.8484848484848485
Recall: 0.9333333333333333
F1-Score: 0.888888888888889


xgb_walking_model
ROC-AUC on Test-Set: 0.5
log-loss: 9.868313209680789
Precision: 0.8571428571428571
Recall: 0.8
F1-Score: 0.8275862068965518


rf_walking_model
ROC-AUC on Test-Set: 0.4666666666666667
log-loss: 11.841957575104257
Precision: 0.8461538461538461
Recall: 0.7333333333333333
F1-Score: 0.7857142857142856


lr_walking_model
ROC-AUC on Test-Set: 0.5666666666666667
log-loss: 15.789200614669465
Precision: 0.8888888888888888
Recall: 0.5333333333333333
F1-Score: 0.6666666666666667




In [64]:
print("SkLearn Gradient Boosting")
print(metrics.classification_report(gb_walking_model.predict(walking_X_test), 
                                    walking_y_test))
print(pd.crosstab(gb_walking_model.predict(walking_X_test), 
            walking_y_test, 
            rownames = ["Actual"], 
            colnames = ["Predicted"]))

print("\nXGBoost Gradient Boosting")
print(metrics.classification_report(xgb_walking_model.predict(walking_X_test), 
                                    walking_y_test))
print(pd.crosstab(xgb_walking_model.predict(walking_X_test), 
            walking_y_test, 
            rownames = ["Actual"], 
            colnames = ["Predicted"]))


print("\nRandom Forests")
print(metrics.classification_report(rf_walking_model.predict(walking_X_test), 
                                    walking_y_test))
print(pd.crosstab(rf_walking_model.predict(walking_X_test), 
            walking_y_test, 
            rownames = ["Actual"], 
            colnames = ["Predicted"]))

SkLearn Gradient Boosting
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.93      0.85      0.89        33

    accuracy                           0.80        35
   macro avg       0.47      0.42      0.44        35
weighted avg       0.88      0.80      0.84        35

Predicted  0   1
Actual          
0          0   2
1          5  28

XGBoost Gradient Boosting
              precision    recall  f1-score   support

           0       0.20      0.14      0.17         7
           1       0.80      0.86      0.83        28

    accuracy                           0.71        35
   macro avg       0.50      0.50      0.50        35
weighted avg       0.68      0.71      0.70        35

Predicted  0   1
Actual          
0          1   6
1          4  24

Random Forests
              precision    recall  f1-score   support

           0       0.20      0.11      0.14         9
           1       0.73      0.8