# Imports

In [292]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.optimizers import Adam
from keras.utils import to_categorical

In [224]:
# Reading the dataset
ds = pd.read_csv('preprocessed_data.csv')
ds = ds.drop('Unnamed: 0', axis=1)

In [225]:
# Splittig features and target
X = ds.drop('DX_bl', axis=1)
y = ds[['DX_bl']]

# Creating Random Forest Model

In [226]:
# Number of trees in random forest
n_estimators = [100, 200, 400, 600, 1000]
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [10, 40, 70, None]
# Minimum number of samples required to split a node
min_samples_split = [2, 6, 8, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 3, 5]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the grid
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [227]:
# Searching for best hyperparameters
# Creating the base model to tune
rf = RandomForestClassifier()
# Grid search of parameters, using 5 fold cross validation, 
rf_grid = GridSearchCV(estimator=rf, param_grid = param_grid, n_jobs=-1, cv=5, verbose=2)
y = np.ravel(y)
# Fit the grid search model
rf_grid.fit(X, y)
print(rf_grid.best_params_)

Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   26.3s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 13.0min
[Parallel(n_jobs=-1)]: Done 4885 tasks      | elapsed: 16.4min
[Parallel(n_jobs=-1)]: Done 5816 tasks      | elapsed: 20.2min
[Parallel(n_jobs=-1)]: Done 6829 tasks      | elapsed: 24.3min
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed: 25.8min finished


{'bootstrap': True, 'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 100}


In [240]:
# Creating the model
rf_model = RandomForestClassifier(n_estimators=100, min_samples_split=8, min_samples_leaf=1, 
                                  max_features='sqrt', max_depth=40, bootstrap=True, random_state=42)

# Creating Neural Network Model

In [328]:
# Function to create model, required for KerasClassifier
def create_premodel():
    # Create model
    model = Sequential()
    model.add(Dense(3, input_dim=14, activation='relu'))
    model.add(Dense(5, activation='softmax'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [278]:
# Encoding target attribute so it can be used properly in the NN models
y_nn = to_categorical(y-1, num_classes=5)
y_nn.shape
print(y_nn)

[[1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0.]
 ...
 [1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1.]]


## Tuning number of epochs & batch size

In [329]:
# Creating the base model to tune
pre_nn = KerasClassifier(build_fn=create_premodel, verbose=0)
# Defining the grid search parameters
# Number of samples in a batch
batch_size = [32, 64, 128, 256, 512]
# Number of epochs
epochs = [10, 30, 50, 70]
# Creating the grid
pre_param_grid_2 = dict(batch_size=batch_size, epochs=epochs)
# Grid search of parameters, using 5 fold cross validation
pre_nn_grid = GridSearchCV(estimator=pre_nn, param_grid=pre_param_grid_2, n_jobs=-1, cv=5)
# Fit the grid search model
pre_nn_grid_result = pre_nn_grid.fit(X, y_nn)

In [330]:
# Summarize results
print("Best: %f using %s" % (pre_nn_grid_result.best_score_, pre_nn_grid_result.best_params_))
means = pre_nn_grid_result.cv_results_['mean_test_score']
stds = pre_nn_grid_result.cv_results_['std_test_score']
params = pre_nn_grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.732917 using {'batch_size': 32, 'epochs': 10}
0.732917 (0.041135) with: {'batch_size': 32, 'epochs': 10}
0.641392 (0.038247) with: {'batch_size': 32, 'epochs': 30}
0.723104 (0.054989) with: {'batch_size': 32, 'epochs': 50}
0.652810 (0.049524) with: {'batch_size': 32, 'epochs': 70}
0.712400 (0.051768) with: {'batch_size': 64, 'epochs': 10}
0.680999 (0.046328) with: {'batch_size': 64, 'epochs': 30}
0.722748 (0.054342) with: {'batch_size': 64, 'epochs': 50}
0.707404 (0.050847) with: {'batch_size': 64, 'epochs': 70}
0.672435 (0.048954) with: {'batch_size': 128, 'epochs': 10}
0.725245 (0.039840) with: {'batch_size': 128, 'epochs': 30}
0.664942 (0.052823) with: {'batch_size': 128, 'epochs': 50}
0.642462 (0.036973) with: {'batch_size': 128, 'epochs': 70}
0.683675 (0.064057) with: {'batch_size': 256, 'epochs': 10}
0.728813 (0.074807) with: {'batch_size': 256, 'epochs': 30}
0.677074 (0.036350) with: {'batch_size': 256, 'epochs': 50}
0.715968 (0.052945) with: {'batch_size': 256, 'epochs'

## Tuning hidden layer units, dropout parameters and weight initialization scheme

In [331]:
# Function to create model, required for KerasClassifier
def create_model(neurons=1, init_mode='uniform', dropout_rate=0.0, weight_constraint=0):
    # Create model
    model = Sequential()
    model.add(Dense(neurons, input_dim=14, kernel_initializer=init_mode, activation='relu', kernel_constraint=maxnorm(weight_constraint)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(5, kernel_initializer=init_mode, activation='softmax'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [332]:
# Creating the base model to tune
nn = KerasClassifier(build_fn=create_model, epochs=10, batch_size=32, verbose=0)
# Defining the grid search parameters
# Number of units in the hidden layer
neurons = [1, 3, 5, 7, 9]
# Weight initialization
init_mode = ['uniform']
# Weight constraint
weight_constraint = [0, 1, 2, 3, 4, 5]
# Neuron dropout rate (probability)
dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
# Creating the grid
param_grid_2 = dict(neurons=neurons, init_mode=init_mode, dropout_rate=dropout_rate, weight_constraint=weight_constraint)
# Grid search of parameters, using 5 fold cross validation
nn_grid = GridSearchCV(estimator=nn, param_grid=param_grid_2, n_jobs=-1, cv=5, verbose=2)
# Fit the grid search model
nn_grid_result = nn_grid.fit(X, y_nn)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   25.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 14.0min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 20.4min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed: 21.2min finished


In [333]:
# Summarize results
print("Best: %f using %s" % (nn_grid_result.best_score_, nn_grid_result.best_params_))
means = nn_grid_result.cv_results_['mean_test_score']
stds = nn_grid_result.cv_results_['std_test_score']
params = nn_grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.800000 using {'dropout_rate': 0.0, 'init_mode': 'uniform', 'neurons': 1, 'weight_constraint': 0}
0.800000 (0.000000) with: {'dropout_rate': 0.0, 'init_mode': 'uniform', 'neurons': 1, 'weight_constraint': 0}
0.680999 (0.068064) with: {'dropout_rate': 0.0, 'init_mode': 'uniform', 'neurons': 1, 'weight_constraint': 1}
0.729706 (0.038763) with: {'dropout_rate': 0.0, 'init_mode': 'uniform', 'neurons': 1, 'weight_constraint': 2}
0.719001 (0.056874) with: {'dropout_rate': 0.0, 'init_mode': 'uniform', 'neurons': 1, 'weight_constraint': 3}
0.800000 (0.000000) with: {'dropout_rate': 0.0, 'init_mode': 'uniform', 'neurons': 1, 'weight_constraint': 4}
0.755397 (0.065838) with: {'dropout_rate': 0.0, 'init_mode': 'uniform', 'neurons': 1, 'weight_constraint': 5}
0.800000 (0.000000) with: {'dropout_rate': 0.0, 'init_mode': 'uniform', 'neurons': 3, 'weight_constraint': 0}
0.661731 (0.070420) with: {'dropout_rate': 0.0, 'init_mode': 'uniform', 'neurons': 3, 'weight_constraint': 1}
0.690633 (0.037

In [402]:
# Creating the model
nn_model = Sequential()
nn_model.add(Dense(5, input_dim=14, kernel_initializer='uniform', activation='relu'))
nn_model.add(Dense(3, kernel_initializer='uniform', activation='relu'))
nn_model.add(Dense(5, kernel_initializer='uniform', activation='softmax'))
nn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])