# Neural Network for CLA Project

In [3]:
# import statements
import tensorflow as tf
import numpy as np
from sklearn import preprocessing
import errno
import os
import sys
import Constants

In [4]:
# Load the feature matrix and the matching label vector from disk.

# Where results are written (dest_path) and where the .npy inputs live (data_path).
dest_path = "/Users/Alliot/Documents/CLA-Project/Data/all-data-no-na/neural-network/"
data_path = "/Users/Alliot/Documents/CLA-Project/Data/data-sets/"

# Make sure the output directory is present before anything tries to write to it.
dest_missing = not os.path.exists(dest_path)
if dest_missing:
    try:
        os.makedirs(dest_path)
    except OSError as err:
        if err.errno == errno.EEXIST:
            # Another process created the directory between the existence check
            # and makedirs; that is fine, carry on.
            pass
        else:
            raise

# Pick the data set by base name; features are stored in "<name>.npy" and
# labels in "<name>_labels.npy".
data_set = "data_2017_summer"
features_file = data_path + data_set + ".npy"
labels_file = data_path + data_set + "_labels.npy"
X = np.load(features_file)
y = np.load(labels_file)

In [None]:
# Manipulate data. Labels are converted to -1 / +1 for binary classification, then
# negative samples (no algae) are removed uniformly at random so that the
# disproportionately large number of them does not bias the model. Finally every
# sample is standardized.

# Convert labels to binary: 0 -> -1 (no algae); 1 and 2 -> +1 (algae).
# Labels with any other value are left untouched.
y[y == 0] = -1
y[y == 2] = 1

no_alg_idx = np.flatnonzero(y == -1)  # positions of the no-algae samples
num_alg = int(np.count_nonzero(y == 1))  # number of algae instances
num_no_alg = int(no_alg_idx.size)  # number of no-algae instances

# Shrink the data set by randomly removing occurrences of no algae until the number
# of no-algae samples equals the number of algae samples minus sample_bias.
sample_bias = 14  # adjust the difference in the number of the two types of samples (no_alg and alg)

# The target is clamped to [0, num_no_alg]: samples can only be removed, never
# added, and a count cannot be negative. (An unreachable target previously made
# the removal loop spin forever.)
target_no_alg = min(max(num_alg - sample_bias, 0), num_no_alg)
num_to_remove = num_no_alg - target_no_alg

if num_to_remove > 0:
    # Draw all rows to drop in one shot, without replacement, so every no-algae
    # sample (including the last one in the array) is equally likely to be
    # removed, and X / y are each reallocated only once.
    drop_idx = np.random.choice(no_alg_idx, size=num_to_remove, replace=False)
    y = np.delete(y, obj=drop_idx)
    X = np.delete(X, obj=drop_idx, axis=Constants.ROWS)
    num_no_alg = target_no_alg

# Standardize each sample (axis=1): subtract the sample's mean and scale it to unit
# variance. NOTE(review): assumes X is 2-D with one sample per row -- confirm.
X = preprocessing.scale(X, axis=1)