In [1]:
import sys
import warnings
sys.path.append("../src/pipeline")
from utils import query_utils as query
from utils import gait_features_utils as gproc
import synapseclient as sc
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import sklearn.metrics as metrics
import fancyimpute


# magic commands
%matplotlib inline
%load_ext autoreload
%autoreload 2

# extra commands
sns.set_style("darkgrid")
sns.set_context("paper")
warnings.simplefilter("ignore")

Using TensorFlow backend.


In [2]:
WALKING_GROUPED_FEATURES = "syn21626482"
MATCHED_DEMOGRAPHICS = "syn21614563"
syn = sc.login()

Welcome, aryton tediarjo!



INFO:synapseclient_default:Welcome, aryton tediarjo!



In [3]:
# get data
data = query.get_file_entity(syn, WALKING_GROUPED_FEATURES )
demographic = query.get_file_entity(syn, MATCHED_DEMOGRAPHICS)


data = data[[feat for feat in data.columns if ("filepath" not in feat) 
             and ("window" not in feat) 
             and ("steps" not in feat) 
             and (feat != "index")
             and ("AA_" not in feat)]]


# filter datasets
data = data[(data["test_type"] == "walking") & 
            (data["age"] > 0) & 
            (data["table_version"] != "MPOWER_PASSIVE") &
            (data["class"] != "MS")]

# merge inner
data = pd.merge(demographic[["healthCode"]], data, on = "healthCode", how = "inner")

data = data.set_index("healthCode")

# clean sparse feature
data = data[data.columns[data.isnull().mean() < 0.2]]

# cols of choice
metadata_cols = ["test_type", "phoneInfo", "age", "gender", "table_version", "nrecords", "class"]
feat_cols = [feat for feat in data.columns if (feat not in metadata_cols)]

## 1. Impute missing data on training using MICE

In [5]:
train = data[feat_cols]
target = data["class"].map({"PD":1, "control":0})

In [7]:
# split train test split
X_train, X_test, y_train, y_test = train_test_split(train, target, 
                                                    test_size = 0.25,
                                                   random_state = 100)

In [26]:
imputer = MICE(initial_strategy = "median", 
             random_state = 100, 
             max_iter = 20).fit(X_train)

In [27]:
X_train_impute = imputer.fit_transform(X_train)
X_test_impute = imputer.fit_transform(X_test)

In [31]:
# Create raw model with 1000 estimators
model = RandomForestClassifier(n_estimators=2000,
                              random_state = 100)
model.fit(X_train_impute, y_train)
y_pred = model.predict(X_test_impute)

In [32]:
print("Some Preliminary Prediction Scores using all Features\n")

print("AUC-score: {}\n".format(metrics.roc_auc_score(y_true = y_test, 
                              y_score = y_pred)))
print("F1-score: {}\n".format(metrics.f1_score(y_true = y_test, 
                              y_pred = y_pred)))
print("Logloss-score: {}\n".format(metrics.log_loss(y_true = y_test, 
                              y_pred = y_pred)))


print("\nClassification Report on Baseline Model Performance\n")
print(metrics.classification_report(y_test, y_pred))

Some Preliminary Prediction Scores using all Features

AUC-score: 0.6479540918163672

F1-score: 0.6415094339622641

Logloss-score: 12.382008696239048


Classification Report on Baseline Model Performance

              precision    recall  f1-score   support

           0       0.71      0.58      0.64       204
           1       0.58      0.71      0.64       167

    accuracy                           0.64       371
   macro avg       0.65      0.65      0.64       371
weighted avg       0.65      0.64      0.64       371

