**Using XGBoost to work with dataset that includes missing values**

In [2]:
import xgboost as xgb
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import seaborn as sns
from sklearn.metrics import roc_curve, auc
from scipy import interp
from itertools import cycle

In [3]:
df = pd.read_csv('Master_Clean_w_NaN.csv', index_col=0)

In [4]:
df.head()

Unnamed: 0,SEQN,Age,Heart_Rate,Irregular_Pulse,Systolic_BP_Avg,Diastolic_BP_Avg,Weight_kg,Height_cm,BMI,Waist_cm,...,Family_Income_15.0,Family_Income_nan,Gender_1.0,Gender_2.0,Race_1.0,Race_2.0,Race_3.0,Race_4.0,Race_6.0,Race_7.0
0,83732.0,62.0,76.0,0.0,122.666667,65.333333,94.8,184.5,27.8,101.1,...,0,0,1,0,0,0,1,0,0,0
1,83733.0,53.0,72.0,0.0,140.0,86.0,90.4,171.4,30.8,107.9,...,0,0,1,0,0,0,1,0,0,0
2,83735.0,56.0,78.0,0.0,134.0,70.0,109.8,160.9,42.4,110.1,...,0,0,0,1,0,0,1,0,0,0
3,83736.0,42.0,76.0,0.0,104.0,60.0,55.2,164.9,20.3,80.4,...,0,0,0,1,0,0,0,1,0,0
4,83741.0,22.0,66.0,0.0,111.333333,72.666667,76.6,165.4,28.0,86.6,...,0,0,1,0,0,0,0,1,0,0


In [5]:
df.shape

(4707, 68)

In [6]:
df = df[np.isfinite(df['Target'])]

In [7]:
df.shape

(4423, 68)

In [8]:
df_target = df['Target']

In [9]:
df_target.shape

(4423,)

In [10]:
df_features = df.drop(columns=['SEQN', 'Target'])

In [11]:
df_features.shape

(4423, 66)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(df_features, df_target, test_size=0.2)

In [13]:
clf = xgb.XGBClassifier()
clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [14]:
training_preds = clf.predict(X_train)
val_preds = clf.predict(X_test)
training_accuracy = accuracy_score(y_train, training_preds)
val_accuracy = accuracy_score(y_test, val_preds)

print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))

Training Accuracy: 98.62%
Validation accuracy: 98.53%


In [15]:
print('Accuracy Score:  ' + str(accuracy_score(y_test, val_preds)))
print('F1 Score:        ' + str(f1_score(y_test, val_preds, average="macro")))
print('Precision Score: ' + str(precision_score(y_test, val_preds, average="macro")))
print('Recall Score:    ' + str(recall_score(y_test, val_preds, average="macro")) )
report = classification_report(y_test, val_preds)
print ('')
print (report)

Accuracy Score:  0.9853107344632769
F1 Score:        0.4963005122367672
Precision Score: 0.49265536723163844
Recall Score:    0.5

              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99       872
         1.0       0.00      0.00      0.00        13

   micro avg       0.99      0.99      0.99       885
   macro avg       0.49      0.50      0.50       885
weighted avg       0.97      0.99      0.98       885



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [16]:
param_grid = {
    'learning_rate': [0.1],
    'max_depth': [6],
    'min_child_weight': [10],
    'subsample': [ 0.7],
    'n_estimators': [5, 30, 100, 250],
}

In [17]:
clf = xgb.XGBClassifier()
grid_clf = GridSearchCV(clf, param_grid, scoring='accuracy', cv=None, n_jobs=1)
grid_clf.fit(X_train, y_train)

best_parameters = grid_clf.best_params_

print("Grid Search found the following optimal parameters: ")
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

training_preds = grid_clf.predict(X_train)
val_preds = grid_clf.predict(X_test)
training_accuracy = accuracy_score(y_train, training_preds)
val_accuracy = accuracy_score(y_test, val_preds)

print("")
print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))




Grid Search found the following optimal parameters: 
learning_rate: 0.1
max_depth: 6
min_child_weight: 10
n_estimators: 5
subsample: 0.7

Training Accuracy: 98.11%
Validation accuracy: 98.53%


In [18]:
grid_clf.cv_results_['mean_test_score']

array([0.98106275, 0.98106275, 0.98049746, 0.97823629])

In [19]:
param_grid = {
    'max_depth' : [3, 5, 6, 7],
    'learning_rate' : [0.1],
    'n_estimators' : [5, 30, 80, 100],
    'silent' : [True],
    'objective' : ['binary:logistic'],
    'booster' : ['gbtree'],
    'n_jobs' : [1],
    'nthread' : [None],
    'gamma' : [0],
    'min_child_weight' : [1, 5, 10],
    'max_delta_step' : [0],
    'subsample' : [1, 0.7],
    'colsample_bytree' : [1],
    'colsample_bylevel' : [1],
    'reg_alpha' : [0],
    'reg_lambda' : [1],
    'scale_pos_weight' : [1],
    'base_score' : [0.5],
    'random_state' : [0],
    'seed' : [None],
    'missing' : [None]
}

In [20]:
clf = xgb.XGBClassifier()
grid_clf = GridSearchCV(clf, param_grid, scoring='accuracy', cv=None, n_jobs=1)
grid_clf.fit(X_train, y_train)

best_parameters = grid_clf.best_params_

print("Grid Search found the following optimal parameters: ")
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

training_preds = grid_clf.predict(X_train)
val_preds = grid_clf.predict(X_test)
training_accuracy = accuracy_score(y_train, training_preds)
val_accuracy = accuracy_score(y_test, val_preds)

print("")
print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))




Grid Search found the following optimal parameters: 
base_score: 0.5
booster: 'gbtree'
colsample_bylevel: 1
colsample_bytree: 1
gamma: 0
learning_rate: 0.1
max_delta_step: 0
max_depth: 3
min_child_weight: 1
missing: None
n_estimators: 5
n_jobs: 1
nthread: None
objective: 'binary:logistic'
random_state: 0
reg_alpha: 0
reg_lambda: 1
scale_pos_weight: 1
seed: None
silent: True
subsample: 0.7

Training Accuracy: 98.11%
Validation accuracy: 98.53%
