# Train simple models for prediction of In-hospital mortality

- Logistic regression
- Random forest
- XGBoost

In [8]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [9]:
from fastai.structured import *
from fastai.column_data import *
import yaml
from pandas import DataFrame, Series
import shutil

In [10]:
from sklearn.preprocessing import Imputer, StandardScaler

In [11]:
NB_DIR = %pwd

In [12]:
RAW_DATA = '/data1/MIMIC-III/RAW/'
INTERIM_DATA = f'{RAW_DATA}/../interim/'
PROCESSED_DATA = f'{RAW_DATA}/../processed/'

In [13]:
MIMIC3_BENCHMARK_LOCATION = f'{NB_DIR}/../mimic3-benchmarks/'

# Train-validation split

In [14]:
task='in-hospital-mortality'

In [15]:
val_patients = set()
with open(f"{MIMIC3_BENCHMARK_LOCATION}/mimic3models/valset.csv", "r") as valset_file:
    for line in valset_file:
        x, y = line.split(',')
        if int(y) == 1:
            val_patients.add(x)

In [16]:
with open(f"{PROCESSED_DATA}/{task}/train/listfile.csv") as listfile:
    lines = listfile.readlines()
    header = lines[0]
    lines = lines[1:]

In [17]:
train_lines = [x for x in lines if x[:x.find("_")] not in val_patients]
val_lines = [x for x in lines if x[:x.find("_")] in val_patients]
assert len(train_lines) + len(val_lines) == len(lines)

In [18]:
with open(f"{PROCESSED_DATA}/{task}/train_listfile.csv", "w") as train_listfile:
    train_listfile.write(header)
    for line in train_lines:
        train_listfile.write(line)

In [19]:
with open(f"{PROCESSED_DATA}/{task}/val_listfile.csv", "w") as val_listfile:
    val_listfile.write(header)
    for line in val_lines:
        val_listfile.write(line)

In [20]:
shutil.copy(f"{PROCESSED_DATA}/{task}/test/listfile.csv",
            f"{PROCESSED_DATA}/{task}/test_listfile.csv")

'/data1/MIMIC-III/RAW//../processed//in-hospital-mortality/test_listfile.csv'

## Read and extract features

In [21]:
def read_chunk(reader, chunk_size):
    data = {}
    for i in range(chunk_size):
        ret = reader.read_next()
        for k, v in iter(ret.items()):
            if k not in data:
                data[k] = []
            data[k].append(v)
    data["header"] = data["header"][0]
    return data

In [22]:
def extract_features_from_rawdata(chunk, header, period, features):
    with open(os.path.join(f'{MIMIC3_BENCHMARK_LOCATION}/resources/', "channel_info.json")) as channel_info_file:
        channel_info = json.loads(channel_info_file.read())
    data = [convert_to_dict(X, header, channel_info) for X in chunk]
    return extract_features(data, period, features)

In [23]:
def convert_to_dict(data, header, channel_info):
    """ convert data from readers output in to array of arrays format """
    ret = [[] for i in range(data.shape[1] - 1)]
    for i in range(1, data.shape[1]):
        ret[i-1] = [(t, x) for (t, x) in zip(data[:, 0], data[:, i]) if x != ""]
        channel = header[i]
        if (len(channel_info[channel]['possible_values']) != 0):
            ret[i-1] = list(map(lambda x: (x[0], channel_info[channel]['values'][x[1]]), ret[i-1])) # list(..) for Python3
        ret[i-1] = list(map(lambda x: (float(x[0]), float(x[1])), ret[i-1]))
    return ret

In [24]:
import numpy as np
from scipy.stats import skew

all_functions = [min, max, np.mean, np.std, skew, len]

functions_map = {
    "all": all_functions,
    "len": [len],
    "all_but_len": all_functions[:-1]
}

periods_map = {
    "all": (0, 0, 1, 0),
    "first4days": (0, 0, 0, 4 * 24),
    "first8days": (0, 0, 0, 8 * 24),
    "last12hours": (1, -12, 1, 0),
    "first25percent": (2, 25),
    "first50percent": (2, 50)
}

sub_periods = [(2, 100), (2, 10), (2, 25), (2, 50),
               (3, 10), (3, 25), (3, 50)]


def get_range(begin, end, period):
    # first p %
    if period[0] == 2:
        return (begin, begin + (end - begin) * period[1] / 100.0)
    # last p %
    if period[0] == 3:
        return (end - (end - begin) * period[1] / 100.0, end)

    if period[0] == 0:
        L = begin + period[1]
    else:
        L = end + period[1]

    if period[2] == 0:
        R = begin + period[3]
    else:
        R = end + period[3]

    return (L, R)


def calculate(channel_data, period, sub_period, functions):
    if len(list(channel_data)) == 0:
        return np.full((len(functions, )), np.nan)

    L = channel_data[0][0]
    R = channel_data[-1][0]
    L, R = get_range(L, R, period)
    L, R = get_range(L, R, sub_period)

    data = [x for (t, x) in channel_data
            if L - 1e-6 < t < R + 1e-6]

    if len(data) == 0:
        return np.full((len(functions, )), np.nan)
    return np.array([fn(data) for fn in functions], dtype=np.float32)


def extract_features_single_episode(data_raw, period, functions):
    global sub_periods
    extracted_features = [np.concatenate([calculate(data_raw[i], period, sub_period, functions)
                                          for sub_period in sub_periods],
                                         axis=0)
                          for i in range(len(data_raw))]
    return np.concatenate(extracted_features, axis=0)


def extract_features(data_raw, period, features):
    period = periods_map[period]
    functions = functions_map[features]
    return np.array([extract_features_single_episode(x, period, functions)
                     for x in data_raw])

In [25]:
def read_and_extract_features(reader, period, features):
    ret = read_chunk(reader, reader.get_number_of_examples())
    # ret = common_utils.read_chunk(reader, 100)
    X = extract_features_from_rawdata(ret['X'], ret['header'], period, features)
    return (X, ret['y'], ret['name'])

## Data readers

In [26]:
period = "all" # choices=['first4days', 'first8days', 'last12hours', 'first25percent', 'first50percent', 'all']
features = "all" # choices=['all', 'len', 'all_but_len']

In [27]:
class Reader(object):
    def __init__(self, dataset_dir, listfile=None):
        self._dataset_dir = dataset_dir
        self._current_index = 0
        if listfile is None:
            listfile_path = os.path.join(dataset_dir, "listfile.csv")
        else:
            listfile_path = listfile
        with open(listfile_path, "r") as lfile:
            self._data = lfile.readlines()
        self._listfile_header = self._data[0]
        self._data = self._data[1:]

    def get_number_of_examples(self):
        return len(self._data)

    def random_shuffle(self, seed=None):
        if (seed is not None):
            random.seed(seed)
        random.shuffle(self._data)

    def read_example(self, index):
        raise NotImplementedError()

    def read_next(self):
        to_read_index = self._current_index
        self._current_index += 1
        if (self._current_index == self.get_number_of_examples()):
            self._current_index = 0
        return self.read_example(to_read_index)

In [28]:
class InHospitalMortalityReader(Reader):
    def __init__(self, dataset_dir, listfile=None, period_length=48.0):
        """ Reader for in-hospital moratality prediction task.
        :param dataset_dir:   Directory where timeseries files are stored.
        :param listfile:      Path to a listfile. If this parameter is left `None` then
                              `dataset_dir/listfile.csv` will be used.
        :param period_length: Length of the period (in hours) from which the prediction is done.
        """
        Reader.__init__(self, dataset_dir, listfile)
        self._data = [line.split(',') for line in self._data]
        self._data = [(x, int(y)) for (x, y) in self._data]
        self._period_length = period_length

    def _read_timeseries(self, ts_filename):
        ret = []
        with open(os.path.join(self._dataset_dir, ts_filename), "r") as tsfile:
            header = tsfile.readline().strip().split(',')
            assert header[0] == "Hours"
            for line in tsfile:
                mas = line.strip().split(',')
                ret.append(np.array(mas))
        return (np.stack(ret), header)

    def read_example(self, index):
        """ Reads the example with given index.
        :param index: Index of the line of the listfile to read (counting starts from 0).
        :return: Dictionary with the following keys:
            X : np.array
                2D array containing all events. Each row corresponds to a moment.
                First column is the time and other columns correspond to different
                variables.
            t : float
                Length of the data in hours. Note, in general, it is not equal to the
                timestamp of last event.
            y : int (0 or 1)
                In-hospital mortality.
            header : array of strings
                Names of the columns. The ordering of the columns is always the same.
            name: Name of the sample.
        """
        if (index < 0 or index >= len(self._data)):
            raise ValueError("Index must be from 0 (inclusive) to number of lines (exclusive).")

        name = self._data[index][0]
        t = self._period_length
        y = self._data[index][1]
        (X, header) = self._read_timeseries(name)

        return {"X": X,
                "t": t,
                "y": y,
                "header": header,
                "name": name}


In [29]:
train_reader = InHospitalMortalityReader(dataset_dir=f'{PROCESSED_DATA}/in-hospital-mortality/train/',
                                             listfile=f'{PROCESSED_DATA}/in-hospital-mortality/train_listfile.csv',
                                             period_length=48.0)

val_reader = InHospitalMortalityReader(dataset_dir=f'{PROCESSED_DATA}/in-hospital-mortality/train/',
                                           listfile=f'{PROCESSED_DATA}/in-hospital-mortality/val_listfile.csv',
                                           period_length=48.0)

test_reader = InHospitalMortalityReader(dataset_dir=f'{PROCESSED_DATA}/in-hospital-mortality/test/',
                                            listfile=f'{PROCESSED_DATA}/in-hospital-mortality/test_listfile.csv',
                                            period_length=48.0)

In [30]:
train_reader.get_number_of_examples()

14681

In [18]:
train_reader?

[0;31mType:[0m           InHospitalMortalityReader
[0;31mString form:[0m    <__main__.InHospitalMortalityReader object at 0x7faf3b689cc0>
[0;31mDocstring:[0m      <no docstring>
[0;31mInit docstring:[0m
Reader for in-hospital moratality prediction task.
:param dataset_dir:   Directory where timeseries files are stored.
:param listfile:      Path to a listfile. If this parameter is left `None` then
                      `dataset_dir/listfile.csv` will be used.
:param period_length: Length of the period (in hours) from which the prediction is done.


In [32]:
print('Reading data and extracting features ...')
(train_X, train_y, train_names) = read_and_extract_features(train_reader, period, features)
(val_X, val_y, val_names) = read_and_extract_features(val_reader, period, features)
(test_X, test_y, test_names) = read_and_extract_features(test_reader, period, features)
print('  train data shape = {}'.format(train_X.shape))
print('  validation data shape = {}'.format(val_X.shape))
print('  test data shape = {}'.format(test_X.shape))

Reading data and extracting features ...
  train data shape = (14681, 714)
  validation data shape = (3222, 714)
  test data shape = (3236, 714)


In [33]:
#%debug

**SAVE train, val, test arrays**

In [34]:
np.save(f'{PROCESSED_DATA}/{task}/train_X', train_X)
np.save(f'{PROCESSED_DATA}/{task}/train_y', train_y)
np.save(f'{PROCESSED_DATA}/{task}/train_names', train_names)

np.save(f'{PROCESSED_DATA}/{task}/val_X', val_X)
np.save(f'{PROCESSED_DATA}/{task}/val_y', val_y)
np.save(f'{PROCESSED_DATA}/{task}/val_names', val_names)

np.save(f'{PROCESSED_DATA}/{task}/test_X', test_X)
np.save(f'{PROCESSED_DATA}/{task}/test_y', test_y)
np.save(f'{PROCESSED_DATA}/{task}/test_names', test_names)

In [31]:
train_X = np.load(f'{PROCESSED_DATA}/{task}/train_X.npy')
train_y = np.load(f'{PROCESSED_DATA}/{task}/train_y.npy')
train_names = np.load(f'{PROCESSED_DATA}/{task}/train_names.npy')

val_X = np.load(f'{PROCESSED_DATA}/{task}/val_X.npy')
val_y = np.load(f'{PROCESSED_DATA}/{task}/val_y.npy')
val_names = np.load(f'{PROCESSED_DATA}/{task}/val_names.npy')


test_X = np.load(f'{PROCESSED_DATA}/{task}/test_X.npy')
test_y = np.load(f'{PROCESSED_DATA}/{task}/test_y.npy')
test_names = np.load(f'{PROCESSED_DATA}/{task}/test_names.npy')

FileNotFoundError: [Errno 2] No such file or directory: '/data1/MIMIC-III/RAW//../processed//in-hospital-mortality/train_X.npy'

**TESTS (TMP)**

In [35]:
ret = read_chunk(train_reader, train_reader.get_number_of_examples())

In [55]:
len(ret['X'])

14681

In [56]:
type(ret['X'])

list

In [57]:
header = ret['header']

In [58]:
len(header)

18

In [59]:
channel_info_file = open(f'{MIMIC3_BENCHMARK_LOCATION}/resources/channel_info.json')
channel_info = json.loads(channel_info_file.read())

In [60]:
data = [convert_to_dict(X, header, channel_info) for X in ret['X']]

In [None]:
data[0]

In [63]:
len(data)

14681

## Imputing

In [38]:
imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                  verbose=0, copy=True)

In [39]:
imputer.fit(train_X)

Imputer(axis=0, copy=True, missing_values=nan, strategy='mean', verbose=0)

In [40]:
train_X = np.array(imputer.transform(train_X), dtype=np.float32)
val_X = np.array(imputer.transform(val_X), dtype=np.float32)

## Normalizing

In [41]:
scaler = StandardScaler()

In [42]:
scaler.fit(train_X)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [43]:
train_X = scaler.transform(train_X)
val_X = scaler.transform(val_X)

# Logistic regression

In [36]:
from sklearn.linear_model import LogisticRegression

In [37]:
C = 0.001 # Inverse of L1 / L2 regularization
l2 = True 
l1 = False

## Define logreg-model

In [44]:
penalty = ("l2" if l2 else "l1")

In [45]:
penalty

'l2'

In [46]:
logreg = LogisticRegression(penalty=penalty, C=C)

In [47]:
logreg.fit(train_X, train_y)

LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## Predict on train and valdata

In [48]:
train_prob = logreg.predict_proba(train_X)
val_prob = logreg.predict_proba(val_X)

## Metrics

Accuracy:

In [49]:
logreg.score(train_X, train_y)

0.8842721885430148

In [50]:
from sklearn import metrics

In [51]:
def print_metrics_binary(y_true, predictions, verbose=1):
    predictions = np.array(predictions)
    if (len(predictions.shape) == 1):
        predictions = np.stack([1 - predictions, predictions]).transpose((1, 0))

    cf = metrics.confusion_matrix(y_true, predictions.argmax(axis=1))
    if verbose:
        print("confusion matrix:")
        print(cf)
    cf = cf.astype(np.float32)

    acc = (cf[0][0] + cf[1][1]) / np.sum(cf)
    prec0 = cf[0][0] / (cf[0][0] + cf[1][0])
    prec1 = cf[1][1] / (cf[1][1] + cf[0][1])
    rec0 = cf[0][0] / (cf[0][0] + cf[0][1])
    rec1 = cf[1][1] / (cf[1][1] + cf[1][0])
    auroc = metrics.roc_auc_score(y_true, predictions[:, 1])

    (precisions, recalls, thresholds) = metrics.precision_recall_curve(y_true, predictions[:, 1])
    auprc = metrics.auc(recalls, precisions)
    minpse = np.max([min(x, y) for (x, y) in zip(precisions, recalls)])

    if verbose:
        print("accuracy =", acc)
        print("precision class 0 =", prec0)
        print("precision class 1 =", prec1)
        print("recall class 0 =", rec0)
        print("recall class 1 =", rec1)
        print("AUC of ROC =", auroc)
        print("AUC of PRC =", auprc)
        print("min(+P, Se) =", minpse)

    return {"acc": acc,
            "prec0": prec0,
            "prec1": prec1,
            "rec0": rec0,
            "rec1": rec1,
            "auroc": auroc,
            "auprc": auprc,
            "minpse": minpse}

In [52]:
acc, prec0, prec1, rec0, rec1, auroc, auprc, minpse = print_metrics_binary(train_y, train_prob)

confusion matrix:
[[12222   472]
 [ 1227   760]]
accuracy = 0.8842722
precision class 0 = 0.90876645
precision class 1 = 0.6168831
recall class 0 = 0.9628171
recall class 1 = 0.38248616
AUC of ROC = 0.8638282521595984
AUC of PRC = 0.5483875454442582
min(+P, Se) = 0.5284348263714141


In [53]:
acc_v, prec0_v, prec1_v, rec0_v, rec1_v, auroc_v, auprc_v, minpse_v = print_metrics_binary(val_y, val_prob)

confusion matrix:
[[2666  120]
 [ 274  162]]
accuracy = 0.8777157
precision class 0 = 0.9068027
precision class 1 = 0.5744681
recall class 0 = 0.9569275
recall class 1 = 0.37155962
AUC of ROC = 0.8462438338481397
AUC of PRC = 0.49532829033260056
min(+P, Se) = 0.5114678899082569


**Predict on test**

In [54]:
test_X = np.array(imputer.transform(test_X), dtype=np.float32)

In [55]:
test_X = scaler.transform(test_X)

In [56]:
test_prob = logreg.predict_proba(test_X)

In [57]:
acc_t, prec0_t, prec1_t, rec0_t, rec1_t, auroc_t, auprc_t, minpse_t = print_metrics_binary(test_y, test_prob)

confusion matrix:
[[2745  117]
 [ 230  144]]
accuracy = 0.89276886
precision class 0 = 0.9226891
precision class 1 = 0.55172414
recall class 0 = 0.9591195
recall class 1 = 0.38502672
AUC of ROC = 0.8498609849886211
AUC of PRC = 0.47363518708673674
min(+P, Se) = 0.4572192513368984


# Random forests

In [58]:
from sklearn.ensemble import RandomForestClassifier

In [59]:
rf = RandomForestClassifier(n_jobs=-1)

In [60]:
rf.fit(train_X, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [61]:
train_prob_rf = rf.predict_proba(train_X)

In [62]:
rf.score(train_X, train_y)

0.9853552210339895

In [63]:
rf.score(val_X, val_y)

0.8792675356921167

In [64]:
rf.score(test_X, test_y)

0.892150803461063

In [65]:
cf = metrics.confusion_matrix(test_y, rf.predict_proba(test_X).argmax(axis=1))

In [66]:
cf

array([[2809,   53],
       [ 296,   78]])

**A slightly more powerful RF**

In [67]:
m = RandomForestClassifier(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)

In [68]:
m.fit(train_X, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [69]:
m.score(test_X, test_y)

0.8995673671199012

In [70]:
metrics.confusion_matrix(test_y, m.predict_proba(test_X).argmax(axis=1))

array([[2801,   61],
       [ 263,  111]])

In [80]:
test_prob_rf2 = m.predict_proba(test_X)

In [82]:
rf2_metrics = print_metrics_binary(test_y, test_prob_rf2)

confusion matrix:
[[2800   62]
 [ 263  111]]
accuracy = 0.89956737
precision class 0 = 0.91413647
precision class 1 = 0.6416185
recall class 0 = 0.9783368
recall class 1 = 0.29679143
AUC of ROC = 0.8364116563339647
AUC of PRC = 0.48517862664126066
min(+P, Se) = 0.4839572192513369


# XGBoost

In [72]:
from xgboost import XGBClassifier

In [73]:
model = XGBClassifier()

In [74]:
model.fit(train_X, train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [75]:
test_proba_xg = model.predict(test_X)

  if diff:


In [83]:
from sklearn.metrics import accuracy_score

In [84]:
accuracy_score(test_y, test_proba_xg)

0.9017305315203955

In [85]:
metrics.confusion_matrix(test_y, model.predict_proba(test_X).argmax(axis=1))

array([[2798,   64],
       [ 254,  120]])

In [86]:
metrics_xg = print_metrics_binary(test_y, test_proba_xg)

confusion matrix:
[[2798   64]
 [ 254  120]]
accuracy = 0.90173054
precision class 0 = 0.9167759
precision class 1 = 0.65217394
recall class 0 = 0.977638
recall class 1 = 0.32085562
AUC of ROC = 0.6492468151735631
AUC of PRC = 0.525760746703055
min(+P, Se) = 0.32085561497326204


## Grid search for hyperparameters

Based on https://www.kaggle.com/tilii7/hyperparameter-grid-search-with-xgboost/notebook

In [113]:
?XGBClassifier

[0;31mInit signature:[0m [0mXGBClassifier[0m[0;34m([0m[0mmax_depth[0m[0;34m=[0m[0;36m3[0m[0;34m,[0m [0mlearning_rate[0m[0;34m=[0m[0;36m0.1[0m[0;34m,[0m [0mn_estimators[0m[0;34m=[0m[0;36m100[0m[0;34m,[0m [0msilent[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m [0mobjective[0m[0;34m=[0m[0;34m'binary:logistic'[0m[0;34m,[0m [0mbooster[0m[0;34m=[0m[0;34m'gbtree'[0m[0;34m,[0m [0mn_jobs[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m [0mnthread[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mgamma[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m [0mmin_child_weight[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m [0mmax_delta_step[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m [0msubsample[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m [0mcolsample_bytree[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m [0mcolsample_bylevel[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m [0mreg_alpha[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m [0mreg_lambda[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m [0mscale_pos_weight[

In [117]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

In [147]:
param_grid = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.0, 1.0, 1.5],
        'subsample': [0.6, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [5, 6, 7, 8]
        }

param_grid_large = {
        'learning_rate': [0.1, 0.05, 0.2],
        'n_estimators': [50, 100, 500, 600],
        'min_child_weight': [1, 5, 10],
        'gamma': [0.0, 1.0, 1.5],
        'subsample': [0.6, 1.0],
        'colsample_bytree': [0.5, 0.6, 0.8],
        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
        'reg_lambda': [1, ]
        }

In [148]:
xgb = XGBClassifier(silent=True, nthread=-1)

**Brute force grid search**

In [149]:
grid = GridSearchCV(estimator=xgb, param_grid=param_grid, 
                    scoring='roc_auc', n_jobs=-1, cv=skf.split(train_X, train_y), verbose=0 )

In [150]:
%%time
grid.fit(train_X, train_y)

CPU times: user 3min 27s, sys: 4.08 s, total: 3min 31s
Wall time: 37min 59s


GridSearchCV(cv=<generator object _BaseKFold.split at 0x7f80b171a360>,
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=-1, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'min_child_weight': [1, 5, 10], 'gamma': [0.5, 1, 1.5, 2, 5], 'subsample': [0.6, 0.8, 1.0], 'colsample_bytree': [0.6, 0.8, 1.0], 'max_depth': [3, 4, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [151]:
print(grid.best_estimator_)


 Best estimator:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=5, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=10, missing=None, n_estimators=100,
       n_jobs=1, nthread=-1, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.6)


In [152]:
print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(grid.best_score_ * 2 - 1)


 Best normalized gini score for 5-fold search with 40 parameter combinations:
0.737479995135286


In [153]:
print(grid.best_params_)


 Best hyperparameters:
{'colsample_bytree': 0.8, 'gamma': 5, 'max_depth': 5, 'min_child_weight': 10, 'subsample': 0.6}


In [154]:
results_grid = pd.DataFrame(grid.cv_results_)

In [None]:
results_grid.head()

**Test best**

In [156]:
test_proba_xg_best = grid.predict_proba(test_X)

In [157]:
metrics_xg_best = print_metrics_binary(test_y, test_proba_xg_best)

confusion matrix:
[[2780   82]
 [ 242  132]]
accuracy = 0.8998764
precision class 0 = 0.91992056
precision class 1 = 0.6168224
recall class 0 = 0.9713487
recall class 1 = 0.3529412
AUC of ROC = 0.8673462333284753
AUC of PRC = 0.5158677841625119
min(+P, Se) = 0.49214659685863876


**Random search**

In [134]:
folds = 5
param_comb = 40

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 42)

In [135]:
random_search = RandomizedSearchCV(xgb, param_distributions=param_grid, n_iter=param_comb, 
                                   scoring='roc_auc', n_jobs=-1, 
                                   cv=skf.split(train_X, train_y), verbose=3, random_state=1001 )

In [136]:
%%time
random_search.fit(train_X, train_y)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] subsample=0.6, reg_lambda=1, n_estimators=600, min_child_weight=5, max_depth=9, learning_rate=0.05, gamma=5, colsample_bytree=0.8 
[CV] subsample=0.6, reg_lambda=1, n_estimators=600, min_child_weight=5, max_depth=9, learning_rate=0.05, gamma=5, colsample_bytree=0.8 
[CV] subsample=0.6, reg_lambda=1, n_estimators=600, min_child_weight=5, max_depth=9, learning_rate=0.05, gamma=5, colsample_bytree=0.8 
[CV] subsample=0.6, reg_lambda=1, n_estimators=600, min_child_weight=5, max_depth=9, learning_rate=0.05, gamma=5, colsample_bytree=0.8 
[CV] subsample=0.6, reg_lambda=1, n_estimators=600, min_child_weight=5, max_depth=9, learning_rate=0.05, gamma=5, colsample_bytree=0.8 
[CV] subsample=0.8, reg_lambda=1, n_estimators=50, min_child_weight=1, max_depth=5, learning_rate=0.05, gamma=1.5, colsample_bytree=0.5 
[CV] subsample=0.8, reg_lambda=1, n_estimators=50, min_child_weight=1, max_depth=5, learning_rate=0.05, gamma=1.5, colsam

[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  3.0min


[CV]  subsample=0.8, reg_lambda=1, n_estimators=600, min_child_weight=10, max_depth=6, learning_rate=0.05, gamma=0.5, colsample_bytree=0.5, score=0.8591925757182922, total= 2.9min
[CV] subsample=0.6, reg_lambda=1, n_estimators=50, min_child_weight=1, max_depth=6, learning_rate=0.05, gamma=0, colsample_bytree=0.5 
[CV]  subsample=0.8, reg_lambda=1, n_estimators=50, min_child_weight=5, max_depth=7, learning_rate=0.05, gamma=5, colsample_bytree=0.8, score=0.867353913706878, total=  28.1s
[CV] subsample=0.6, reg_lambda=1, n_estimators=50, min_child_weight=1, max_depth=6, learning_rate=0.05, gamma=0, colsample_bytree=0.5 
[CV]  subsample=0.8, reg_lambda=1, n_estimators=600, min_child_weight=10, max_depth=6, learning_rate=0.05, gamma=0.5, colsample_bytree=0.5, score=0.8737071954586535, total= 2.8min
[CV] subsample=0.6, reg_lambda=1, n_estimators=50, min_child_weight=1, max_depth=6, learning_rate=0.05, gamma=0, colsample_bytree=0.5 
[CV]  subsample=0.8, reg_lambda=1, n_estimators=600, min_chi

[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 11.9min finished


CPU times: user 1min 19s, sys: 1.6 s, total: 1min 20s
Wall time: 12min 55s


RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x7f80b1741780>,
          error_score='raise',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=-1, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
          fit_params=None, iid=True, n_iter=40, n_jobs=-1,
          param_distributions={'learning_rate': [0.1, 0.05, 0.2], 'n_estimators': [50, 100, 500, 600], 'min_child_weight': [1, 5, 10], 'gamma': [0, 0.5, 1, 1.5, 2, 5], 'subsample': [0.6, 0.8, 1.0], 'colsample_bytree': [0.5, 0.6, 0.8], 'max_depth': [3, 4, 5, 6, 7, 8, 9, 10], 'reg_lambda': [1]},
          pre_dispatch='2*n_jobs', random_state=1001, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=3

Best from previous run. Scores AUC of ROC = 0.87, AUC of PRC = 0.536, min(+P, Se) = 0.508 on test.

```XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=1.5, learning_rate=0.02,
       max_delta_step=0, max_depth=5, min_child_weight=1, missing=None,
       n_estimators=600, n_jobs=1, nthread=4, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.6)```

In [144]:
print('\n Best estimator:')
print(random_search.best_estimator_)


 Best estimator:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=1, learning_rate=0.05, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=1, nthread=-1, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.6)


In [138]:
print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(random_search.best_score_ * 2 - 1)


 Best normalized gini score for 5-fold search with 40 parameter combinations:
0.7397114839572771


In [139]:
print('\n Best hyperparameters:')
print(random_search.best_params_)


 Best hyperparameters:
{'subsample': 0.6, 'reg_lambda': 1, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.05, 'gamma': 1, 'colsample_bytree': 0.5}


In [140]:
results = pd.DataFrame(random_search.cv_results_)

In [None]:
results.head()

**Predict on test:**

In [145]:
test_proba_xg_best = random_search.predict_proba(test_X)

In [146]:
metrics_xg_best = print_metrics_binary(test_y, test_proba_xg_best)

confusion matrix:
[[2793   69]
 [ 243  131]]
accuracy = 0.90358466
precision class 0 = 0.9199605
precision class 1 = 0.655
recall class 0 = 0.975891
recall class 1 = 0.35026738
AUC of ROC = 0.8708795315343595
AUC of PRC = 0.5365112671413969
min(+P, Se) = 0.5
