# Keras Neural Net Modeling

## Summary
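Let's build some neural networks (NNs) with Keras: a small baseline first, then variants that address class imbalance and dimensionality, and finally a hyperparameter search.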

## Baseline NN

In [69]:
# Import statements
import keras
from keras.layers import Dense
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np
import tensorflow as tf
import optuna
# Created functions from functions.py
from functions import metrics as custom_score
from functions import improvement as custom_change

In [3]:
# Load in cleaned data (already split into train/test; the first CSV column
# is used as the index).

# Training Data
X_train, y_train = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../Data/train/X_train.csv', '../Data/train/y_train.csv')
)

# Testing Data
X_test, y_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../Data/test/X_test.csv', '../Data/test/y_test.csv')
)

In [34]:
 # Instantiating a NN
# Baseline network: one small hidden layer of 30 ReLU units over the 422
# input features, feeding a single sigmoid node for the binary target.
FSM_NN = keras.Sequential([
    Dense(30, activation='relu', input_shape=(422,)),
    Dense(1, activation='sigmoid'),
])

# Adam optimizer with binary cross-entropy loss; accuracy, precision, recall
# and AUC are tracked during training and evaluation.
FSM_NN.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc', 'Precision', 'Recall', 'AUC'],
)

# 10 epochs of 100 steps each, validating against the test split every epoch.
FSM_NN.fit(
    X_train,
    y_train,
    epochs=10,
    steps_per_epoch=100,
    validation_data=(X_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x148a29f10>

In [22]:
## Evaluating NN
# evaluate() yields the loss followed by the compiled metrics, in the order
# they were passed to compile().
FSM_scores = FSM_NN.evaluate(X_test, y_test)
FSM_loss, FSM_acc, FSM_prec, FSM_recall, FSM_AUC = FSM_scores

# Keep the metrics under readable names for later model-to-model comparison.
results_FSM = dict(zip(
    ('Accuracy', 'Precision', 'Recall', 'ROCAUC'),
    (FSM_acc, FSM_prec, FSM_recall, FSM_AUC),
))



### Conclusion
The neural network is boasting some impressively bad precision here, but it is doing its best with the 30 neurons I provided it. This can definitely be improved, so let's start by giving it more brain power.

## 3 Layer NN and more Training Time
I'll add some more layers, and give the model more time to train and see if that gives us an improvement.

In [19]:
# Deeper network: three hidden ReLU layers that double in width at each step
# (32 -> 64 -> 128), followed by a single sigmoid output node.
NN = keras.Sequential([
    Dense(32, activation='relu', input_shape=(422,)),
    Dense(64, activation='relu'),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Same optimizer, loss and metrics as the baseline so results are comparable.
NN.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc', 'Precision', 'Recall', 'AUC'],
)

# Longer training than the baseline: 25 epochs of 100 steps each.
NN.fit(
    X_train,
    y_train,
    epochs=25,
    steps_per_epoch=100,
    validation_data=(X_test, y_test),
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x148f2a490>

In [31]:
# Score the 3-layer network on the test split: loss first, then the compiled
# metrics in order.
NN_scores = NN.evaluate(X_test, y_test)
NN_loss, NN_acc, NN_prec, NN_recall, NN_AUC = NN_scores

# Named metrics for comparison against the other models.
results_NN = dict(zip(
    ('Accuracy', 'Precision', 'Recall', 'ROCAUC'),
    (NN_acc, NN_prec, NN_recall, NN_AUC),
))



In [25]:
# Print the per-metric change going from the baseline results (results_FSM)
# to the 3-layer network's results (results_NN).
custom_change(results_FSM, results_NN)

Change in Results
Accuracy        +0.09
Precision       +0.35
Recall          -0.62
ROCAUC          +0.06


### Analysis
An improvement in accuracy, precision and ROCAUC, but a huge drop in recall. This probably has to do with the class imbalance in the data. I'll address it using oversampling and class weights, and see which works better.

## Initial Weights
Setting the initial bias of the output layer to reflect the class imbalance (log of the positive-to-negative ratio) can help improve performance.

In [59]:
# Class bias = log(pos / neg), held in a one-element array
initial_bias = np.log([4306 / 42777])

# Constant initializer that starts the output node's bias at that value
output_bias = tf.initializers.Constant(initial_bias)

# Same 32 -> 64 -> 128 architecture as before; only the output layer differs,
# starting from the class-ratio bias instead of the default initializer.
IW_NN = keras.Sequential([
    Dense(32, activation='relu', input_shape=(422,)),
    Dense(64, activation='relu'),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid', bias_initializer=output_bias),
])

IW_NN.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc', 'Precision', 'Recall', 'AUC'],
)

IW_NN.fit(
    X_train,
    y_train,
    epochs=25,
    steps_per_epoch=100,
    validation_data=(X_test, y_test),
)


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x148dcb100>

In [60]:
# Score the initial-bias network on the test split: loss first, then the
# compiled metrics in order.
IW_NN_scores = IW_NN.evaluate(X_test, y_test)
IW_NN_loss, IW_NN_acc, IW_NN_prec, IW_NN_recall, IW_NN_AUC = IW_NN_scores

# Named metrics for comparison against the other models.
results_IW_NN = dict(zip(
    ('Accuracy', 'Precision', 'Recall', 'ROCAUC'),
    (IW_NN_acc, IW_NN_prec, IW_NN_recall, IW_NN_AUC),
))



In [62]:
# Print the per-metric change going from the 3-layer network to the
# initial-bias network. Both arguments must be results dictionaries; the
# model object IW_NN itself is not a set of metrics.
custom_change(results_NN, results_IW_NN)

Change in Results
Accuracy        +0.03
Precision       +0.21
Recall          -0.25
ROCAUC          +0.02


## Weighted NN
Now, let's try keeping the initial output bias and adding class weights to the loss.

In [63]:
# Class counts used to balance the loss
pos, neg = 4306, 42777
total = pos + neg

# Each class is weighted by the inverse of its count, scaled by total / 2
# so that the two classes contribute equally overall.
half_total = total / 2.0
weight_for_0 = (1 / neg) * half_total
weight_for_1 = (1 / pos) * half_total

# Mapping of class label -> loss weight, in the form Keras' fit() accepts
class_weight = {0: weight_for_0, 1: weight_for_1}

for template, weight in (
    ('Weight for class 0: {:.2f}', weight_for_0),
    ('Weight for class 1: {:.2f}', weight_for_1),
):
    print(template.format(weight))

Weight for class 0: 0.55
Weight for class 1: 5.47


In [64]:
# Same 32 -> 64 -> 128 architecture with the class-ratio output bias; the
# difference from the previous model is the class weighting passed to fit().
weighted_NN = keras.Sequential([
    Dense(32, activation='relu', input_shape=(422,)),
    Dense(64, activation='relu'),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid', bias_initializer=output_bias),
])

weighted_NN.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc', 'Precision', 'Recall', 'AUC'],
)

# class_weight scales each sample's loss contribution by its class's weight.
weighted_NN.fit(
    X_train,
    y_train,
    epochs=25,
    steps_per_epoch=100,
    validation_data=(X_test, y_test),
    class_weight=class_weight,
)


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x1496f01f0>

In [65]:
# Score the class-weighted network on the test split: loss first, then the
# compiled metrics in order.
weighted_NN_scores = weighted_NN.evaluate(X_test, y_test)
(weighted_NN_loss, weighted_NN_acc, weighted_NN_prec,
 weighted_NN_recall, weighted_NN_AUC) = weighted_NN_scores

# Named metrics for comparison against the other models.
results_weighted_NN = dict(zip(
    ('Accuracy', 'Precision', 'Recall', 'ROCAUC'),
    (weighted_NN_acc, weighted_NN_prec, weighted_NN_recall, weighted_NN_AUC),
))



In [66]:
# Print the per-metric change going from the unweighted 3-layer network
# (results_NN) to the class-weighted network (results_weighted_NN).
custom_change(results_NN, results_weighted_NN)

Change in Results
Accuracy        -0.04
Precision       -0.23
Recall          +0.33
ROCAUC          +0.01


# Oversample Neural Network

In [35]:
# Random over-sampler with a fixed seed so the resampling is repeatable
ros = RandomOverSampler(random_state=15)

# Resample ONLY the training split; the test split is left untouched to
# prevent data leakage.
resampled_train = ros.fit_resample(X_train, y_train)
X_train_os, y_train_os = resampled_train

In [36]:
# Network trained on the oversampled training data: the same
# 32 -> 64 -> 128 hidden stack with a plain sigmoid output node.
OS_NN = keras.Sequential([
    Dense(32, activation='relu', input_shape=(422,)),
    Dense(64, activation='relu'),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid'),
])

OS_NN.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc', 'Precision', 'Recall', 'AUC'],
)

# Train on the resampled data, but validate on the original test split.
OS_NN.fit(
    X_train_os,
    y_train_os,
    epochs=25,
    steps_per_epoch=100,
    validation_data=(X_test, y_test),
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x147c09970>

In [61]:
# Score the oversampled-data network on the test split: loss first, then the
# compiled metrics in order.
OS_NN_scores = OS_NN.evaluate(X_test, y_test)
OS_NN_loss, OS_NN_acc, OS_NN_prec, OS_NN_recall, OS_NN_AUC = OS_NN_scores

# Named metrics for comparison against the other models.
results_OS_NN = dict(zip(
    ('Accuracy', 'Precision', 'Recall', 'ROCAUC'),
    (OS_NN_acc, OS_NN_prec, OS_NN_recall, OS_NN_AUC),
))



In [38]:
# Print the per-metric change going from the 3-layer network (results_NN)
# to the network trained on oversampled data (results_OS_NN).
custom_change(results_NN, results_OS_NN)

Change in Results
Accuracy        -0.02
Precision       -0.23
Recall          +0.52
ROCAUC          -0.01


### Analysis
The oversampling seems to have balanced things out. We have a decent AUC, so we aren't far off the mark, but precision is now below 0.5, which is a bit rough. There is also a large difference between the training scores and the testing scores, so our model is overfitting. Let's go ahead and address that now.

# PCA
I think there is a bigger issue here as well. There are 422 features in this dataset right now, and I imagine only about 200 of those columns are actually providing useful information. Let's perform PCA on the features and see if it changes anything.

## PCA


In [46]:
# Fit the scaler on the oversampled training data only, then apply that same
# fitted scaler to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_train_os)
X_tr_scaled = scaler.transform(X_train_os)
X_te_scaled = scaler.transform(X_test)

In [48]:
# Fit PCA on the scaled training data only, then project both splits with the
# fitted components. Per scikit-learn, a fractional n_components keeps as many
# components as are needed to explain that share of the variance (here 90%).
pca = PCA(n_components=.9)
X_train_transformed = pca.fit_transform(X_tr_scaled)
X_test_transformed = pca.transform(X_te_scaled)

In [50]:
# Number of principal components the fitted PCA kept
pca.n_components_

110

In [52]:
# Network trained on the PCA-reduced, oversampled training data.
# The input width is read from the transformed data instead of being
# hard-coded: the number of components is picked by PCA(n_components=.9)
# at fit time, so it changes whenever the data or the variance threshold does.
n_pca_features = X_train_transformed.shape[1]

pca_NN = keras.Sequential()

# 3 hidden layers, double the size at each layer (no dropout is applied here)
pca_NN.add(Dense(32, 'relu', input_shape=(n_pca_features,)))
pca_NN.add(Dense(64, 'relu'))
pca_NN.add(Dense(128, 'relu'))
# 1 output node with a sigmoid for the binary target
pca_NN.add(Dense(1, 'sigmoid'))

# Same optimizer, loss and metrics as the earlier models for comparability
pca_NN.compile('adam', 'binary_crossentropy', metrics=['acc', 'Precision', 'Recall','AUC'])

# Train on the PCA-projected oversampled data; validate on the PCA-projected test split
pca_NN.fit(X_train_transformed, y_train_os, epochs=25, steps_per_epoch=100, validation_data=(X_test_transformed, y_test))

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x14a7f8d60>

In [53]:
# evaluate() yields the loss followed by the compiled metrics in order;
# keep each one under its own name.
pca_NN_scores = pca_NN.evaluate(X_test_transformed, y_test)
pca_NN_loss, pca_NN_acc, pca_NN_prec, pca_NN_recall, pca_NN_AUC = pca_NN_scores

# Gather the metrics into a dictionary for ease of comparison.
results_pca_NN = dict(zip(
    ('Accuracy', 'Precision', 'Recall', 'ROCAUC'),
    (pca_NN_acc, pca_NN_prec, pca_NN_recall, pca_NN_AUC),
))



In [54]:
# Print the per-metric change going from the oversampled-data network
# (results_OS_NN) to the PCA network (results_pca_NN).
custom_change(results_OS_NN, results_pca_NN)

Change in Results
Accuracy        -0.01
Precision       -0.02
Recall          -0.04
ROCAUC          -0.05


### Analysis
A slight drop in performance, probably because PCA discards some information (only about 90% of the variance is retained). However, it looks like there is less overfitting occurring.

## MISC

In [68]:
# Second PCA network, trained with fewer steps per epoch (50 instead of 100).
# NOTE: this rebinds `pca_NN`, so the previously trained PCA model is no
# longer reachable under that name after this cell runs.
# The input width is read from the transformed data instead of being
# hard-coded, since the PCA component count is only known after fitting.
n_pca_features = X_train_transformed.shape[1]

pca_NN = keras.Sequential()

# 3 hidden layers, double the size at each layer (no dropout is applied here)
pca_NN.add(Dense(32, 'relu', input_shape=(n_pca_features,)))
pca_NN.add(Dense(64, 'relu'))
pca_NN.add(Dense(128, 'relu'))
# 1 output node with a sigmoid for the binary target
pca_NN.add(Dense(1, 'sigmoid'))

# Same optimizer, loss and metrics as the earlier models for comparability
pca_NN.compile('adam', 'binary_crossentropy', metrics=['acc', 'Precision', 'Recall','AUC'])

# Train on the PCA-projected oversampled data; validate on the PCA-projected test split
pca_NN.fit(X_train_transformed, y_train_os, epochs=25, steps_per_epoch=50, validation_data=(X_test_transformed, y_test))

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x14c795550>

In [76]:
def create_model(trial):
    """Build and compile a binary-classification MLP from a trial's suggestions.

    The trial is asked, in this order, for: the number of hidden layers
    (1 to 3); for each hidden layer its width (4 to 128, log scale) and its
    dropout rate (0.2 to 0.5); and finally the Adam learning rate
    (1e-5 to 1e-1, log scale).

    Parameters
    ----------
    trial
        Object exposing ``suggest_int`` and ``suggest_float`` (the trial that
        Optuna hands to the objective function).

    Returns
    -------
    A compiled ``keras.Sequential`` model with a single sigmoid output,
    binary cross-entropy loss, and accuracy / recall / precision / AUC metrics.
    """
    depth = trial.suggest_int("n_layers", 1, 3)

    # Each hidden block is a ReLU Dense layer followed by its own Dropout.
    stack = []
    for idx in range(depth):
        width = trial.suggest_int("n_units_l{}".format(idx), 4, 128, log=True)
        stack.append(Dense(width, activation="relu"))
        drop_rate = trial.suggest_float("dropout_l{}".format(idx), 0.2, 0.5)
        stack.append(keras.layers.Dropout(rate=drop_rate))

    # Single sigmoid node for the binary target.
    stack.append(Dense(1, activation="sigmoid"))
    model = keras.Sequential(stack)

    # The learning rate of the Adam optimizer is sampled as well.
    lr = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
    adam = tf.keras.optimizers.Adam(learning_rate=lr)
    model.compile(
        optimizer=adam,
        loss="binary_crossentropy",
        metrics=["accuracy", "Recall", "Precision", "AUC"],
    )

    return model

def objective(trial, epochs=1):
    """Optuna objective: train one trial model and return its accuracy.

    Parameters
    ----------
    trial
        The Optuna trial supplying hyperparameters (forwarded to
        ``create_model``) and receiving per-epoch pruning reports.
    epochs : int, optional
        Number of training epochs. Defaults to 1, which matches Keras' own
        default and therefore the previous behavior of this function. With a
        single epoch the pruning callback can only ever act at epoch 0.

    Returns
    -------
    The accuracy reported by ``model.evaluate`` (index 1 of its output, i.e.
    the first metric compiled in ``create_model``).

    Notes
    -----
    NOTE(review): the test split is used both for pruning and as the value
    being maximized, so the selected hyperparameters are tuned against the
    test data; a separate validation split would avoid that.
    NOTE(review): the target looks imbalanced, so accuracy can be maximized
    by predicting the majority class; consider a different objective metric.
    """
    # Clear clutter from previous session graphs so state from earlier
    # trials/models does not accumulate across the study.
    keras.backend.clear_session()

    # Generate our trial model.
    model = create_model(trial)

    # Fit the model on the (oversampled) training data.
    # The pruning callback reports "val_accuracy" to Optuna after every epoch
    # and stops the trial if the pruner decides it is unpromising.
    model.fit(
        X_train_os,
        y_train_os,
        epochs=epochs,
        callbacks=[optuna.integration.TFKerasPruningCallback(trial, "val_accuracy")],
        validation_data=(X_test, y_test),
    )

    # Evaluate the model; score[0] is the loss, score[1] the accuracy.
    score = model.evaluate(X_test, y_test, verbose=0)
    return score[1]

In [77]:
# Run a 20-trial search that maximizes the objective, using a median pruner
# to stop unpromising trials early.
study = optuna.create_study(
    direction="maximize",
    pruner=optuna.pruners.MedianPruner(),
)
study.optimize(objective, n_trials=20)

# Split the trials by their final state for the summary below.
trial_states = optuna.trial.TrialState
pruned_trials = study.get_trials(deepcopy=False, states=[trial_states.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[trial_states.COMPLETE])

print("Study statistics: ")
for label, count in (
    ("  Number of finished trials: ", len(study.trials)),
    ("  Number of pruned trials: ", len(pruned_trials)),
    ("  Number of complete trials: ", len(complete_trials)),
):
    print(label, count)

# Report the best trial's objective value and its hyperparameters.
print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    line = "    {}: {}".format(key, value)
    print(line)

[32m[I 2021-12-01 13:08:22,388][0m A new study created in memory with name: no-name-2c87de1f-183a-47d3-b45d-bfc8b820a475[0m




[32m[I 2021-12-01 13:08:29,377][0m Trial 0 finished with value: 0.3569997549057007 and parameters: {'n_layers': 1, 'n_units_l0': 37, 'dropout_l0': 0.3067475205405741, 'learning_rate': 7.992263750104924e-05}. Best is trial 0 with value: 0.3569997549057007.[0m




[32m[I 2021-12-01 13:08:35,021][0m Trial 1 finished with value: 0.8988229632377625 and parameters: {'n_layers': 1, 'n_units_l0': 37, 'dropout_l0': 0.2706902374099793, 'learning_rate': 0.014893590634633382}. Best is trial 1 with value: 0.8988229632377625.[0m




[32m[I 2021-12-01 13:08:40,730][0m Trial 2 finished with value: 0.2893814146518707 and parameters: {'n_layers': 1, 'n_units_l0': 43, 'dropout_l0': 0.42390250382754746, 'learning_rate': 5.47521899420535e-05}. Best is trial 1 with value: 0.8988229632377625.[0m




[32m[I 2021-12-01 13:08:48,145][0m Trial 3 finished with value: 0.896443784236908 and parameters: {'n_layers': 2, 'n_units_l0': 22, 'dropout_l0': 0.3985510510211421, 'n_units_l1': 14, 'dropout_l1': 0.2037287526433547, 'learning_rate': 0.0009079982671871224}. Best is trial 1 with value: 0.8988229632377625.[0m




[32m[I 2021-12-01 13:08:55,135][0m Trial 4 finished with value: 0.10117705911397934 and parameters: {'n_layers': 3, 'n_units_l0': 92, 'dropout_l0': 0.25470317362216915, 'n_units_l1': 117, 'dropout_l1': 0.21640855568752967, 'n_units_l2': 9, 'dropout_l2': 0.45963107057653496, 'learning_rate': 0.00029751307225551956}. Best is trial 1 with value: 0.8988229632377625.[0m




[32m[I 2021-12-01 13:09:00,688][0m Trial 5 finished with value: 0.8244428038597107 and parameters: {'n_layers': 1, 'n_units_l0': 52, 'dropout_l0': 0.25755086928161247, 'learning_rate': 0.0009562156065466245}. Best is trial 1 with value: 0.8988229632377625.[0m




[32m[I 2021-12-01 13:09:05,731][0m Trial 6 pruned. Trial was pruned at epoch 0.[0m




[32m[I 2021-12-01 13:09:10,622][0m Trial 7 pruned. Trial was pruned at epoch 0.[0m




[32m[I 2021-12-01 13:09:15,772][0m Trial 8 finished with value: 0.7669671773910522 and parameters: {'n_layers': 3, 'n_units_l0': 19, 'dropout_l0': 0.31097610457769137, 'n_units_l1': 23, 'dropout_l1': 0.37153022728050294, 'n_units_l2': 48, 'dropout_l2': 0.4907202831512819, 'learning_rate': 1.579766318609225e-05}. Best is trial 1 with value: 0.8988229632377625.[0m




[32m[I 2021-12-01 13:09:21,253][0m Trial 9 pruned. Trial was pruned at epoch 0.[0m




[32m[I 2021-12-01 13:09:27,808][0m Trial 10 finished with value: 0.8988229632377625 and parameters: {'n_layers': 2, 'n_units_l0': 6, 'dropout_l0': 0.4750818299279267, 'n_units_l1': 105, 'dropout_l1': 0.49986428150910145, 'learning_rate': 0.04518581468579907}. Best is trial 1 with value: 0.8988229632377625.[0m




[32m[I 2021-12-01 13:09:32,787][0m Trial 11 finished with value: 0.8988229632377625 and parameters: {'n_layers': 2, 'n_units_l0': 6, 'dropout_l0': 0.4967570073036629, 'n_units_l1': 118, 'dropout_l1': 0.499123815298541, 'learning_rate': 0.04744835325331595}. Best is trial 1 with value: 0.8988229632377625.[0m




[32m[I 2021-12-01 13:09:37,947][0m Trial 12 finished with value: 0.8988229632377625 and parameters: {'n_layers': 2, 'n_units_l0': 4, 'dropout_l0': 0.49139754065145147, 'n_units_l1': 43, 'dropout_l1': 0.4977136632644317, 'learning_rate': 0.03539552194347708}. Best is trial 1 with value: 0.8988229632377625.[0m




[32m[I 2021-12-01 13:09:42,679][0m Trial 13 pruned. Trial was pruned at epoch 0.[0m




[32m[I 2021-12-01 13:09:46,990][0m Trial 14 pruned. Trial was pruned at epoch 0.[0m




[32m[I 2021-12-01 13:09:52,635][0m Trial 15 finished with value: 0.8988229632377625 and parameters: {'n_layers': 2, 'n_units_l0': 4, 'dropout_l0': 0.35731677382450844, 'n_units_l1': 43, 'dropout_l1': 0.42530366220779703, 'learning_rate': 0.009589540024318003}. Best is trial 1 with value: 0.8988229632377625.[0m




[32m[I 2021-12-01 13:09:58,526][0m Trial 16 finished with value: 0.8989481329917908 and parameters: {'n_layers': 1, 'n_units_l0': 126, 'dropout_l0': 0.20783526359847965, 'learning_rate': 0.08557404442998885}. Best is trial 16 with value: 0.8989481329917908.[0m




[32m[I 2021-12-01 13:10:03,978][0m Trial 17 pruned. Trial was pruned at epoch 0.[0m




[32m[I 2021-12-01 13:10:10,512][0m Trial 18 pruned. Trial was pruned at epoch 0.[0m




[32m[I 2021-12-01 13:10:15,688][0m Trial 19 pruned. Trial was pruned at epoch 0.[0m


Study statistics: 
  Number of finished trials:  20
  Number of pruned trials:  8
  Number of complete trials:  12
Best trial:
  Value:  0.8989481329917908
  Params: 
    n_layers: 1
    n_units_l0: 126
    dropout_l0: 0.20783526359847965
    learning_rate: 0.08557404442998885
