## Classification Performance of Aggregated Features

### Import Libraries

In [7]:
import pandas as pd
import numpy as np
import synapseclient as sc
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [72]:
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import cross_validate

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [47]:
data = pd.read_csv("cleaned_walk_features.tsv", sep = "\t").dropna()

In [62]:
feat_used = [feat for feat in data.columns 
             if ("createdOn" not in feat) 
             and ("window" not in feat) 
             and ("error" not in feat) 
             and ('nrecords' not in feat)
             and ("healthCode" not in feat) 
             and ("gender" not in feat) 
             and ("PD" not in feat) 
             and ("age" not in feat)]

In [63]:
#Seperate train and test data
X_train, X_test, y_train, y_test = train_test_split(data[feat_used],
                                                   data['PD'],
                                                   test_size = 0.25,
                                                   random_state = 100)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2597, 106)
(866, 106)
(2597,)
(866,)


### Classifiers

In [79]:
clfs = []
clfs.append(LogisticRegression())
clfs.append(SVC())
clfs.append(KNeighborsClassifier(n_neighbors=3))
clfs.append(DecisionTreeClassifier())
clfs.append(RandomForestClassifier(max_depth = 5, 
                                   random_state = 100, 
                                   n_estimators = 1000))
clfs.append(GradientBoostingClassifier(max_depth = 5, 
                                       random_state = 100, 
                                       n_estimators = 1000))

### Validation Set Performance (ROC-AUC)

In [106]:
model_metrics_placeholder = {}
model_metrics_placeholder["classifier"] = []
model_metrics_placeholder["cross_validation_auc_score"] = []
model_metrics_placeholder["cross_validation_std"] = []
model_metrics_placeholder["test_auc_score"] = []

In [98]:
clfs = []
clfs.append(LogisticRegression())
clfs.append(SVC())
clfs.append(KNeighborsClassifier(n_neighbors=3))
clfs.append(DecisionTreeClassifier())
clfs.append(RandomForestClassifier(max_depth = 5, 
                                   random_state = 100, 
                                   n_estimators = 1000))
clfs.append(GradientBoostingClassifier(max_depth = 5, 
                                       random_state = 100, 
                                       n_estimators = 100))

for classifier in clfs:
    pipeline = Pipeline([
        ('normalizer', MinMaxScaler()), 
        ('clf', classifier)])
    
    scores = cross_validate(pipeline, 
                            X_train, 
                            y_train,
                            cv = 10, 
                           scoring = "roc_auc")
    print('---------------------------------')
    print(str(classifier))
    print('-----------------------------------')
    for key, values in scores.items():
            print(key,' mean ', values.mean())
            print(key,' std ', values.std())
    
    model_metrics_placeholder["classifier"].append(type(classifier).__name__)
    model_metrics_placeholder["cross_validation_auc_score"].append(scores["test_score"].mean())
    model_metrics_placeholder["cross_validation_std"].append(scores["test_score"].std())

---------------------------------
LogisticRegression()
-----------------------------------
fit_time  mean  0.04682326316833496
fit_time  std  0.004442435689465175
score_time  mean  0.003454756736755371
score_time  std  0.00039239661091942436
test_score  mean  0.8079767773715657
test_score  std  0.036692834019000566
---------------------------------
SVC()
-----------------------------------
fit_time  mean  0.37330658435821534
fit_time  std  0.01628038124011398
score_time  mean  0.03746335506439209
score_time  std  0.0017614182680470332
test_score  mean  0.8207411049719344
test_score  std  0.03141403844071745
---------------------------------
KNeighborsClassifier(n_neighbors=3)
-----------------------------------
fit_time  mean  0.014847493171691895
fit_time  std  0.0009136234136673755
score_time  mean  0.11067192554473877
score_time  std  0.005234460286649041
test_score  mean  0.6974152228949655
test_score  std  0.037933936009587554
---------------------------------
DecisionTreeClassifi

### Test-set Performance (ROC-AUC)

In [102]:
for classifier in clfs:
    print('---------------------------------')
    print(str(classifier))
    print('-----------------------------------')
    pipeline = Pipeline([
        ('normalizer', MinMaxScaler()), 
        ('clf', classifier)])
    auc_score = roc_auc_score(pipeline.fit(X_train, y_train).predict(X_test), y_test)
    print("ROC-AUC Score on Test-Set %s \n"%auc_score)
    model_metrics_placeholder["test_auc_score"].append(auc_score)

---------------------------------
LogisticRegression()
-----------------------------------
ROC-AUC Score on Test-Set 0.7573237597911227 

---------------------------------
SVC()
-----------------------------------
ROC-AUC Score on Test-Set 0.8076655294854835 

---------------------------------
KNeighborsClassifier(n_neighbors=3)
-----------------------------------
ROC-AUC Score on Test-Set 0.62514199935086 

---------------------------------
DecisionTreeClassifier()
-----------------------------------
ROC-AUC Score on Test-Set 0.6117581912561307 

---------------------------------
RandomForestClassifier(max_depth=5, n_estimators=1000, random_state=100)
-----------------------------------
ROC-AUC Score on Test-Set 0.8670154063479452 

---------------------------------
GradientBoostingClassifier(max_depth=5, random_state=100)
-----------------------------------
ROC-AUC Score on Test-Set 0.7581532277967775 

