In [1]:
## imports ##
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn import svm, decomposition, tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn import metrics
from sklearn.model_selection import learning_curve, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
import numpy as np
import warnings
from sklearn.feature_selection import RFECV
from xgboost import XGBClassifier

warnings.simplefilter("ignore")
np.random.seed(100)



## Contents:

- Gait data being used is based on the PDKit mPowerV1 data, collapsed by each healthCode ids
- Features chosen are based on a feature_engineering.ipnb
- For this ML Analysis, we will split the training-test set by 20%
- Parameters will be searched through grid search (GridSearchCV) and the score is assessed using Stratified 10-Fold Validation 
- Models being used: Logistic Regression, Xtreme Gradient Boost, Sklearn Gradient Boost and Random Forests
- Feature elimination will be done on several model using recursive feature elimination CV (Sklearn RFECV package)
- Learning Curves of each model (Work in progress)
- Best model will be dump into .pkl file (Work in progress)

## Helper Functions

In [28]:
def logreg_fit(X_train, y_train):
    pipe = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(random_state = 100))
        ])
    param = [{'classifier__penalty': ['l2'], 
              'classifier__solver': [ 'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}, 
             {'classifier__penalty': ['l1'], 
              'classifier__solver': [ 'liblinear', 'saga']}  
            ]

    CV = GridSearchCV(estimator = pipe, param_grid = param , scoring= "roc_auc", n_jobs = 1, cv = 10)
    CV.fit(X_train, y_train)
    return CV



def xgb_fit(X_train, y_train):
    pipe = Pipeline(steps=[
        ('classifier', XGBClassifier(seed = 100))
        ])
    param = {
        "classifier__learning_rate" : [0.01, 0.05, 0.1],
        "classifier__tree_method"   : ["hist", "auto"],
        "classifier__max_depth"     : [6, 8],
        "classifier__gamma"         : [0, 1],
        "classifier__subsample"     : [0.8],
        "classifier__n_estimators"  : [100]
    }
    CV = GridSearchCV(estimator = pipe, param_grid = param , scoring= "roc_auc", n_jobs = 1, cv = 10)
    CV.fit(X_train, y_train)
    return CV
    

def gradientboost_fit(X_train, y_train):
    pca = decomposition.PCA()
    pipe = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('classifier', GradientBoostingClassifier(random_state = 100, warm_start = True))
        ])
    param = {
        'classifier__learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1],
        'classifier__max_depth':[1, 2, 3, 4, 5, 6],
        'classifier__loss': ["deviance", "exponential"], ## exponential will result in adaBoost
        "classifier__n_estimators"  : [100]
    }
    CV = GridSearchCV(estimator = pipe, param_grid = param , scoring= "roc_auc", n_jobs = 1, cv = 10)
    CV.fit(X_train, y_train)
    return CV

def randomforest_fit(X_train, y_train):
    pipe = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(random_state = 100))
        ])
    param = {
        'classifier__max_depth':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'classifier__criterion': ["gini", "entropy"],## exponential will result in adaBoost
        'classifier__max_features': ["auto", "sqrt", "log2", None], 
        'classifier__n_estimators'  : [100, 200]
    }
    CV = GridSearchCV(estimator = pipe, param_grid = param , scoring= "roc_auc", n_jobs = 1, cv = 10)
    CV.fit(X_train, y_train)
    return CV


def printPerformance(model, X_test, y_test):
    print("Mean AUC score on K-folds: {}".format(model.best_score_))
    print("Parameter Used: {}".format(model.best_params_))
    y_true, y_pred = y_test, model.predict(X_test)
    print("ROC-AUC on Test-Set: {}".format(metrics.roc_auc_score(y_true, y_pred)))
    print("log-loss: {}".format(metrics.log_loss(y_true, y_pred)))
    print("Precision: {}".format(metrics.precision_score(y_true, y_pred)))
    print("Recall: {}".format(metrics.recall_score(y_true, y_pred)))
    print("F1-Score: {}".format(metrics.f1_score(y_true, y_pred)))

## Split to Training and Testing Sets

In [29]:
walking_train = pd.read_csv("../Data/walking_data_training.csv", index_col=0)
walking_train_imputed = pd.read_csv("../Data/walking_imputed.csv", index_col=0)
walking_test_imputed = pd.read_csv("../Data/walking_imputed_test_data.csv", index_col=0)
balance_train = pd.read_csv("../Data/balance_data_training.csv", index_col=0).dropna()
balance_X_train, balance_X_test, balance_y_train, balance_y_test = train_test_split(balance_train.drop(["healthCode", "PD"], axis = 1), balance_train["PD"], test_size=0.20, random_state = 100)
walking_X_train, walking_X_test, walking_y_train, walking_y_test = train_test_split(walking_train.drop(["healthCode", "PD"], axis = 1), walking_train["PD"], test_size=0.20, random_state = 100)

## Run Model on Walking Data

In [30]:
# lr_walking_model = logreg_fit(walking_X_train, walking_y_train)
rf_walking_model = randomforest_fit(walking_X_train, walking_y_train)
gb_walking_model = gradientboost_fit(walking_X_train, walking_y_train)
xgb_walking_model = xgb_fit(walking_X_train, walking_y_train)


print("\n### Gradient Boosting Walking ###")
printPerformance(gb_walking_model, walking_X_test.dropna(), walking_y_test.dropna())
print("\n### XTreme Gradient Boosting Walking ###")
printPerformance(xgb_walking_model, walking_X_test.dropna(), walking_y_test.dropna())
print("\n### Random Forest Walking ###")
printPerformance(rf_walking_model, walking_X_test.dropna(), walking_y_test.dropna())
# print("\n### Logistic Regression Walking ###")
# printPerformance(lr_walking_model, walking_X_test.dropna(), walking_y_test.dropna())


### Gradient Boosting Walking ###
Mean AUC score on K-folds: 0.6734360410830998
Parameter Used: {'classifier__learning_rate': 0.1, 'classifier__loss': 'exponential', 'classifier__max_depth': 2, 'classifier__n_estimators': 100}
ROC-AUC on Test-Set: 0.6785714285714286
log-loss: 10.627377321313302
Precision: 0.75
Recall: 0.5
F1-Score: 0.6

### XTreme Gradient Boosting Walking ###
Mean AUC score on K-folds: 0.6429738562091504
Parameter Used: {'classifier__gamma': 0, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 6, 'classifier__n_estimators': 100, 'classifier__subsample': 0.8, 'classifier__tree_method': 'auto'}
ROC-AUC on Test-Set: 0.7658730158730158
log-loss: 7.9705688703568445
Precision: 0.7647058823529411
Recall: 0.7222222222222222
F1-Score: 0.7428571428571428

### Random Forest Walking ###
Mean AUC score on K-folds: 0.6748366013071895
Parameter Used: {'classifier__criterion': 'gini', 'classifier__max_depth': 7, 'classifier__max_features': 'auto', 'classifier__n_estimators'

## Confusion Matrix 

In [5]:
print(metrics.classification_report(xgb_walking_model.predict(walking_X_test), walking_y_test))

              precision    recall  f1-score   support

           0       0.81      0.71      0.76        24
           1       0.61      0.73      0.67        15

    accuracy                           0.72        39
   macro avg       0.71      0.72      0.71        39
weighted avg       0.73      0.72      0.72        39



In [24]:
gb_test = xgb_walking_model.best_estimator_[1]

In [17]:
gb_test = XGBClassifier(random_state = 100, warm_start = True, 
                        gamma= 1, 
                        learning_rate= 0.1, 
                        max_depth= 6, 
                        n_estimators= 100, 
                        subsample= 0.8, 
                        tree_method= 'auto')

In [26]:
gb_test.fit(walking_X_train, walking_y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=1,
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=100,
              silent=None, subsample=0.8, tree_method='auto', verbosity=1)

In [27]:
print(metrics.classification_report(gb_test.predict(walking_X_test), walking_y_test))

              precision    recall  f1-score   support

           0       0.81      0.68      0.74        25
           1       0.56      0.71      0.63        14

    accuracy                           0.69        39
   macro avg       0.68      0.70      0.68        39
weighted avg       0.72      0.69      0.70        39



In [11]:
pd.crosstab(xgb_walking_model.predict(walking_X_test), walking_y_test, rownames = ["Actual"], colnames = ["Predicted"])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,17,7
1,4,11


In [8]:
## Model Feature Importances by Gini Index
pd.Series(data = rf_walking_model.best_estimator_[1].feature_importances_, index = walking_X_test.columns).nlargest(10)

max_gait_symmetry userAccel_walking_features_z           0.142085
max_count_freeze_index userAccel_walking_features_AA     0.114956
max_median_freeze_index userAccel_walking_features_x     0.110739
max_gait_symmetry userAccel_walking_features_x           0.108403
max_median_freeze_index userAccel_walking_features_y     0.100008
max_median_freeze_index userAccel_walking_features_z     0.096129
max_median_freeze_index userAccel_walking_features_AA    0.095363
max_count_freeze_index userAccel_walking_features_x      0.084788
max_count_freeze_index userAccel_walking_features_z      0.084549
max_count_freeze_index userAccel_walking_features_y      0.062980
dtype: float64

In [7]:
## Model Feature Importances
pd.Series(data = xgb_walking_model.best_estimator_[1].feature_importances_, index = walking_X_test.columns).nlargest(10)

max_gait_symmetry userAccel_walking_features_z           0.124205
max_gait_symmetry userAccel_walking_features_x           0.115634
max_count_freeze_index userAccel_walking_features_AA     0.113101
max_median_freeze_index userAccel_walking_features_y     0.105339
max_median_freeze_index userAccel_walking_features_x     0.103012
max_count_freeze_index userAccel_walking_features_z      0.097726
max_count_freeze_index userAccel_walking_features_x      0.089981
max_median_freeze_index userAccel_walking_features_AA    0.088138
max_count_freeze_index userAccel_walking_features_y      0.085182
max_median_freeze_index userAccel_walking_features_z     0.077680
dtype: float32