# Neural Network for CLA Project

In [1]:
# import statements
from tensorflow import nn
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from keras import optimizers
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.metrics import confusion_matrix
import numpy as np
import errno
import os
import sys
import Constants

Using TensorFlow backend.


In [2]:
def calculate_error(pred_labels, target_labels):
    """Compute the Balanced Error Rate (BER) and the per-class error rates of a binary prediction.

    Args:
        pred_labels: array of predicted labels.
        target_labels: array of true labels; must have the same length as pred_labels.

    Returns:
        A tuple (ber, no_alg_error, alg_error, mat_conf):
            ber: the balanced error rate, rounded to 4 decimal places.
            no_alg_error: error rate on the first class (row 0 of mat_conf), rounded to 4 decimal places.
            alg_error: error rate on the second class (row 1 of mat_conf), rounded to 4 decimal places.
            mat_conf: the confusion matrix, with true labels along its rows and predicted labels along
                its columns.

    If the two arrays differ in length, a message is printed and the process exits through sys.exit().

    NOTE(review): the indexing below assumes exactly two distinct label values occur, and that the
    "no algae" label sorts before the "algae" label (e.g. -1 and +1) so that it lands in row/column 0.
    A class with no true samples makes its row sum zero, so its error rate becomes a division by zero.
    """
    if len(pred_labels) != len(target_labels):
        print("Predicted and target label arrays are not the same length!")
        sys.exit()

    # Build the confusion matrix from the arguments of this call (true labels first, predictions second).
    mat_conf = confusion_matrix(target_labels, pred_labels)

    # Given a confusion matrix as follows:
    # [ a b ]
    # [ c d ]
    # error per label = misclassified samples of that true label / all samples of that true label,
    # i.e. b / (a + b) for the first label and c / (c + d) for the second label.
    # Balanced Error Rate (BER) = (b / (a + b) + c / (c + d)) / 2, the mean of the two per-label errors.
    no_alg_error = mat_conf[0, 1] / (mat_conf[0, 0] + mat_conf[0, 1])
    alg_error = mat_conf[1, 0] / (mat_conf[1, 0] + mat_conf[1, 1])
    ber = (no_alg_error + alg_error) / 2

    # Round every rate to 4 decimal places.
    ber = float("%0.4f" % ber)
    no_alg_error = float("%0.4f" % no_alg_error)
    alg_error = float("%0.4f" % alg_error)

    return ber, no_alg_error, alg_error, mat_conf

In [3]:
np.set_printoptions(threshold=np.inf)  # print arrays in full rather than in abbreviated form

# read in data

# define data and destination paths
dest_path = "/Users/Alliot/Documents/CLA-Project/Data/all-data-no-na/neural-network/"
data_path = "/Users/Alliot/Documents/CLA-Project/Data/data-sets/"
data_set = "data_2017_summer"

# Create dest_path (including missing parent directories) if it does not exist yet. exist_ok=True makes
# the call a no-op when the directory is already there, so no separate existence check is needed and
# there is no window between "check" and "create" in which another process could create it.
os.makedirs(dest_path, exist_ok=True)

# load data sets: the samples and their labels are stored in two separate .npy files
X = np.load(data_path + data_set + ".npy")
y = np.load(data_path + data_set + "_labels.npy")

# manipulate data set. labels are converted to -1, +1 for binary classification; no algae samples are removed
# uniformly at random from the data set so that the disproportionately large number of negative samples
# (no algae) does not bias the model.
# NOTE(review): assumes y holds one label per row of X, in the same order -- confirm against the data files.

# Convert labels to binary: -1 for no algae (original label 0) and 1 for algae (original labels 1 and 2).
# Both masks are computed before y is modified, so every sample is classified by its original label.
no_alg_mask = y == 0
alg_mask = (y == 1) | (y == 2)
y[no_alg_mask] = -1
y[alg_mask] = 1

num_no_alg = int(np.count_nonzero(no_alg_mask))  # count the number of no algae instances
num_alg = int(np.count_nonzero(alg_mask))  # count the number of algae instances

# shrink the data set by randomly removing occurrences of no algae until the number of no algae samples equals
# the number of algae samples minus the sample_bias
sample_bias = 0  # adjust the difference in the number of the two types of samples (no_alg and alg)
num_to_remove = num_no_alg - (num_alg - sample_bias)

# Nothing is removed when the no algae samples are already at or below the target count.
if num_to_remove > 0:
    no_alg_idx = np.flatnonzero(y == -1)
    # never ask for more samples than there are no algae samples to draw from
    num_to_remove = min(num_to_remove, no_alg_idx.size)

    # draw the samples to drop without replacement, so that every no algae sample is equally likely to go
    remove_idx = np.random.choice(no_alg_idx, size=num_to_remove, replace=False)

    # drop the same positions from the labels and from the rows of the data so they stay aligned
    y = np.delete(y, obj=remove_idx)
    X = np.delete(X, obj=remove_idx, axis=Constants.ROWS)
    num_no_alg -= num_to_remove

In [4]:
# process and split data set
X = preprocessing.scale(X, axis=1)  # standardize data: axis=1 scales each sample (row) independently

num_splits = 2
test_size = 0.2  # fraction of the samples held out for testing
sss = model_selection.StratifiedShuffleSplit(n_splits=num_splits, test_size=test_size)

# split() yields one (train indices, test indices) pair per split. Only the first pair is used, and taking
# it with next() works for any value of num_splits.
train_idx, test_idx = next(sss.split(X, y))
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

In [5]:
# neural network model

num_features = X.shape[1]
multiplier = 20

model = Sequential()
model.add(Dense(num_features * multiplier, input_shape=(num_features,), activation=nn.relu))
model.add(Dense(num_features * multiplier, input_shape=(num_features,), activation=nn.relu))
model.add(Dense(num_features * multiplier, input_shape=(num_features,), activation=nn.relu))
model.add(Dense(num_features * multiplier, input_shape=(num_features,), activation=nn.relu))
model.add(Dense(num_features * multiplier, input_shape=(num_features,), activation=nn.relu))
model.add(Dense(num_features * multiplier, input_shape=(num_features,), activation=nn.relu))
model.add(Dense(num_features * multiplier, activation=nn.relu))
model.add(Dense(1, activation=nn.softmax))

# sgd = optimizers.SGD(lr=0.01, momentum=0.0, decay=0.0, nesterov=False)

model.compile(
    optimizer="sgd",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

model.fit(X_train[0:5, :], y_train[0:5], epochs=30);

ValueError: ('Could not interpret optimizer identifier:', <keras.optimizers.SGD object at 0x1a39aedf98>)

In [None]:
# Apply trained model to test sets

y_predict = model.predict(X_test)

ber, no_alg_error, alg_error, mat_conf = calculate_error(y_predict, y_test)

print("Results:")
print("BER:", ber)
print("No Algae Prediction Error:", no_alg_error)
print("Algae Prediction Error:", alg_error)
print("Confusion Matrix:")
print(mat_conf)

In [None]:
# Bare expression: evaluates y_predict (assigned in the evaluation cell above) so its value can be inspected.
y_predict