In [1]:
# native python libraries imports 
import math
from time import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# sklearn imports 
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# keras imports 
import tensorflow
import keras
from keras import Sequential
from keras.layers import Dense
from keras_tuner.tuners import RandomSearch, BayesianOptimization, Sklearn
from keras_tuner.engine.hyperparameters import HyperParameters
from keras.activations import relu, sigmoid
from keras.optimizers import Adam, SGD
from keras.utils import get_file

# required for compatibility between sklearn and keras
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

**DETERMINE OPTIMAL NUM LAYERS, NUM NODES**

In [23]:
def load_quickdraw10(file = './quickdraw10.npz'):
    '''
    Retrieves the quickdraw dataset and separates into training and testing classes
    
    Parameters:
    ------------
    file: str: location of quickdraw file on local machine
    
    Returns:
    --------
    X_train: numpy array list[list[int]]]: pixel values for all images that the model will be trained on
    X_test: numpy array list[list[int]]: pixel values for all images that the model will test predictions on
    y_train: numpy array list[int]: list of answer values for what number the picture represents; used in training
    y_test: numpy array list[int]: list of answer values for what the picture represents; used to determine model acc
    '''
    data = np.load(file)

    X = data['arr_0']
    Y = data['arr_1']
        
    X_train, X_test, y_train, y_test = train_test_split(X, Y, shuffle=True)

    return X_train, X_test, y_train, y_test

def create_model(n_layers, first_layer_nodes, last_layer_nodes, act_funct = "relu", negative_node_incrementation=True):
    """"
    Returns a compiled keras model 
    
    Parameters
    ----------
    n_layers: int: number of hidden layers in model (excludes input/output)
    first_layer_nodes: int: number of nodes in the first hidden layer 
    last_layer_nodes: int: number of nodes in the last hidden layer
    act_funct: str: name of activation function to use in hidden layers (excluding output layer)
    negative_node_incrementation: bool: whether subsequent hidden layers have more/less nodes
        
    Returns
    -------
    model: keras object: compiled neural network model ready to be fit on a dataset
    """
    
    def gen_layer_nodes(n_layers, first_layer_nodes, last_layer_nodes, negative_node_incrementation=True):
        """
        Generates and returns the number of nodes in each hidden layer.  

        Note
        ----
        Number of nodes in each layer is linearly incremented. 
        e.g., gen_layer_nodes(5, 500, 100) will generate [500, 400, 300, 200, 100]

        Parameters
        ----------
        n_layers: int: number of hidden layers
        first_layer_nodes: int: number of nodes in first hidden layer
        last_layer_nodes: int: number of nodes in the last hidden layer
        negative_node_incrementation: bool: whether subsequent hidden layers have more/less nodes

        Returns
        -------
        layers: list[int]: Contains number of nodes for each layer 
        """

        # throws an error if n_layers is less than 2 
        assert n_layers >= 2, "n_layers must be 2 or greater"

        layers = []

        # when set to True number of nodes are decreased for subsequent layers 
        if negative_node_incrementation:
            # subtract this amount from previous layer's nodes in order to increment towards smaller numbers 
            nodes_increment = (last_layer_nodes - first_layer_nodes)/ (n_layers-1)
            
        # when set to False number of nodes are increased for subsequent layers
        else:
            # add this amount from previous layer's nodes in order to increment towards larger numbers 
            nodes_increment = (first_layer_nodes - last_layer_nodes)/ (n_layers-1)

        nodes = first_layer_nodes

        for i in range(1, n_layers + 1):
            layers.append(math.ceil(nodes))

            # increment nodes for next layer 
            nodes += nodes_increment

        return layers
    
    # create model
    model = Sequential()
    
    n_nodes = gen_layer_nodes(n_layers, first_layer_nodes, last_layer_nodes, negative_node_incrementation)
    
    for i in range(1, n_layers):
        if i == 1:
            model.add(Dense(first_layer_nodes, input_dim = X_train.shape[1], activation = act_funct))
        else:
            model.add(Dense(n_nodes[i-1], activation = act_funct))
            
            
    # output layer 
    model.add(Dense(10, # 10 unit/neurons in output layer because we have 10 possible labels to predict  
                    activation='softmax')) # softmax used for a label set greater than 2            
    
    # Compile model
    model.compile(loss='sparse_categorical_crossentropy', 
                  optimizer='adam', # adam is a good default optimizer 
                  metrics=['accuracy'])
    
    # do not include model.fit() inside the create_model function
    # KerasClassifier is expecting a compiled model 
    return model

def simpler_model(hp):
    '''
    Returns a compiled keras model ready for keras-tuner gridsearch algorithms 
    
    Parameters:
    -----------
    hp: keras object: our hyperparameters that we will be tuning
    
    Returns:
    -----------
    model: keras object: a keras compiled model ready for tuning and then fitting
    '''
    
    model = Sequential()
    
    # hidden layer
    model.add(Dense(units=hp.get('units'),activation=hp.get("activation")))
    
    # output layer
    model.add(Dense(10, activation='softmax'))
    
    model.compile(
        optimizer=keras.optimizers.Adam(hp.get('learning_rate')),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'])
    
    return model

def sklearn_model(units, learning_rate, activation):    
    '''
    Returns a compiled keras model ready for keras-tuner gridsearch algorithms 
    
    Parameters:
    -----------
    units: list[int]: list of possible nodes to to evaluate in the models hidden layer
    learning_rate: list[int]: list of possible learning rates to evaluate in the model
    activations: list[str]: list of possible activation functions to evaluate in the model
    
    Returns:
    -----------
    model: keras object: compiled keras model ready for tuning and fitting to data
    '''
    
    model = Sequential()
    
    # hidden layer
    model.add(Dense(units, activation=activation))
    
    # output layer
    model.add(Dense(10, activation='softmax'))
    
    model.compile(
        optimizer=Adam(learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'])
    
    return model

**AUTOMATING NODE COUNT BETWEEN FIRST AND FINAL HIDDEN LAYERS**

In [4]:
# LOADING DATASET
X_train, X_test, y_train, y_test = load_quickdraw10()

# Create model where first layers nodes are 500 and continuously decrease until the final layer
model = create_model(n_layers = 10, first_layer_nodes = 500, last_layer_nodes = 100, act_funct = 'relu')
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 500)               392500    
_________________________________________________________________
dense_1 (Dense)              (None, 456)               228456    
_________________________________________________________________
dense_2 (Dense)              (None, 412)               188284    
_________________________________________________________________
dense_3 (Dense)              (None, 367)               151571    
_________________________________________________________________
dense_4 (Dense)              (None, 323)               118864    
_________________________________________________________________
dense_5 (Dense)              (None, 278)               90072     
_________________________________________________________________
dense_6 (Dense)              (None, 234)              

In [5]:
# Create model where first layers nodes are 500 and continuously increase until the final layer
model2 = create_model(n_layers = 10, first_layer_nodes = 500, last_layer_nodes = 100, act_funct = 'relu', 
                      negative_node_incrementation=False)
model2.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 500)               392500    
_________________________________________________________________
dense_11 (Dense)             (None, 545)               273045    
_________________________________________________________________
dense_12 (Dense)             (None, 589)               321594    
_________________________________________________________________
dense_13 (Dense)             (None, 634)               374060    
_________________________________________________________________
dense_14 (Dense)             (None, 678)               430530    
_________________________________________________________________
dense_15 (Dense)             (None, 723)               490917    
_________________________________________________________________
dense_16 (Dense)             (None, 767)              

**TUNING NODE COUNT + ACTIVATIONS + LEARNING RATE**

**Picking Hyperparameters and Number of Combinations to Sample**

In [16]:
# Pick hyperparameters to tune and the range of values for each to evaluate
hp = HyperParameters()
hp.Int('units', min_value = 32, max_value = 512, step = 32)
hp.Choice('learning_rate', values = [1e-1, 1e-2, 1e-3])
hp.Choice('activation', values = ['relu', 'sigmoid'])

# Determine the number of unique hyperparameter combinations that can be evaluated
# Nodes = 512 / 32 | Learning Rate = 3 | Activation Choices = 2
num_unique_hp_combs = (512 / 32) * 3 * 2

# Determine the number of combinations to actually sample
hp_combs_sampled = num_unique_hp_combs * 0.33

**OPTIMIZED SEARCH METHOD 1: RANDOM SEARCH**

In [17]:
# Set up the random search algorithm
random_tuner = RandomSearch(
                            simpler_model,
                            objective = 'val_accuracy', # define the most optimal combination
                            max_trials = hp_combs_sampled, # num times to sample parameters & build model 
                            seed = 1234,
                            hyperparameters = hp, # pass in the hyperparameter dictionary
                            directory = './keras-tuner-trial',
                            project_name = 'random_search')

# Run the random search algorithm
random_tuner.search(X_train,
                    y_train,
                    epochs = 3,
                    validation_data = (X_test, y_test))




Trial 32 Complete [00h 00m 19s]
val_accuracy: 0.44683998823165894

Best val_accuracy So Far: 0.7750800251960754
Total elapsed time: 00h 05m 56s
INFO:tensorflow:Oracle triggered exit


In [21]:
# Identify the best score and hyperparameter combination for Random search approach
# random_tuner.results_summary()
print('RANDOM SEARCH RESULTS')
print('Best Val Score: 0.7751')
print('Nodes: 352')
print('Learning Rate: 0.001')
print('Activation: Relu')

RANDOM SEARCH RESULTS
Best Val Score: 0.7751
Nodes: 352
Learning Rate: 0.001
Activation: Relu


**OPTIMIZED SEARCH METHOD 2: BAYESIAN SEARCH**

In [22]:
# Bayesian approach runs trials with hps informed by prior trial runs and those prior hps combinations
# Should be able to run fewer trials and get same or better results than random search
max_trials = hp_combs_sampled * .5

# Number of trials to run before being used as an influence in future trail combinations
num_initial_points = 5

# Variance-threshold for the hyperparameters; 
# i.e., higher is looking to find global min, lower more likely to converge on local minimum
beta = 2.5

# Set up the Bayesian search
bayesian_tuner = BayesianOptimization(
                    simpler_model,
                    objective = 'val_accuracy',
                    max_trials = max_trials,
                    hyperparameters = hp,
                    num_initial_points = num_initial_points, 
                    beta = beta, 
                    seed = 1234,
                    directory = './keras-tuner-trial',
                    project_name = 'bayesian_optimization_4')

# Run the baysian search 
bayesian_tuner.search(X_train,
                      y_train,
                      epochs = 3,
                      validation_data = (X_test, y_test))

Trial 16 Complete [00h 00m 15s]
val_accuracy: 0.7839199900627136

Best val_accuracy So Far: 0.7839199900627136
Total elapsed time: 00h 03m 39s
INFO:tensorflow:Oracle triggered exit


In [46]:
# Identify the best score and hyperparameter combination for Bayesian approach
# bayesian_tuner.results_summary()
print('BAYESIAN SEARCH RESULTS: 3.66 minutes')
print('Best Val Score: 0.7839')
print('Nodes: 416')
print('Learning Rate: 0.001')
print('Activation: Relu')
print()

BAYESIAN SEARCH RESULTS: 3.66 minutes
Best Val Score: 0.7839
Nodes: 416
Learning Rate: 0.001
Activation: Relu



**OPTIMIZED SEARCH METHOD 3: BRUTE FORCE SEARCH**

In [27]:
# build out the hyperparameter dictionary 
hyper_parameters = {
                    "units": np.arange(32, 544, 32).tolist(),
                    "learning_rate": [1e-1, 1e-2, 1e-3],
                    "activation":["relu", "sigmoid"]}

# Create the model
model = KerasClassifier(build_fn = sklearn_model)


start = time() # Get start time of the brute force search


# Create the Grid Search
grid = GridSearchCV(estimator = model, 
                    param_grid = hyper_parameters, 
                    n_jobs = -2, 
                    verbose = 1, 
                    cv = 3)

# Run the Grid Search
grid_result = grid.fit(X_train, y_train)

end = time() # Get the time at which the grid search finishes

Fitting 3 folds for each of 96 candidates, totalling 288 fits


2021-10-20 14:10:10.785316: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-10-20 14:10:10.796779: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-10-20 14:10:10.821594: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorF















In [43]:
# Get the best test accuracy
best_model = grid_result.best_estimator_
test_acc = best_model.score(X_test, y_test)
print()
# total run time 
total_run_time_in_minutes = (end - start) / 60

# Report Results
print(f'Brute Force Search Duration: {round(total_run_time_in_minutes, 2)} minutes')
print('Best Val Score: 0.7517')
print(f"Nodes: {grid_result.best_params_['units']}")
print(f"Learning Rate: {grid_result.best_params_['learning_rate']}")
print(f"Activation: {grid_result.best_params_['activation']}")
print()


Brute Force Search Duration: 8.16 minutes
Best Val Score: 0.7517
Nodes: 512
Learning Rate: 0.001
Activation: sigmoid


**CONCLUSION**

In [47]:
print(f'BRUTE FORCE SEARCH RESULTS: {round(total_run_time_in_minutes, 2)} minutes')
print('Best Val Score: 0.7517')
print(f"Nodes: {grid_result.best_params_['units']}")
print(f"Learning Rate: {grid_result.best_params_['learning_rate']}")
print(f"Activation: {grid_result.best_params_['activation']}")
print()

print('RANDOM SEARCH RESULTS: 5.93 minutes')
print('Best Val Score: 0.7751')
print('Nodes: 352')
print('Learning Rate: 0.001')
print('Activation: Relu')
print()

print('BAYESIAN SEARCH RESULTS: 3.66 minutes')
print('Best Val Score: 0.7839')
print('Nodes: 416')
print('Learning Rate: 0.001')
print('Activation: Relu')

BRUTE FORCE SEARCH RESULTS: 8.16 minutes
Best Val Score: 0.7517
Nodes: 512
Learning Rate: 0.001
Activation: sigmoid

RANDOM SEARCH RESULTS: 5.93 minutes
Best Val Score: 0.7751
Nodes: 352
Learning Rate: 0.001
Activation: Relu

BAYESIAN SEARCH RESULTS: 3.66 minutes
Best Val Score: 0.7839
Nodes: 416
Learning Rate: 0.001
Activation: Relu
