# Neural Network for CLA Project

### Import statements

In [1]:
from sklearn import preprocessing
from sklearn import model_selection
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as utils
import numpy as np
import errno
import os
import sys
import Constants

### Hyperparameters

In [34]:
# data processing
sample_bias = 0     # adjust the difference in the number of the two types of samples (no algae vs algae)
test_size = 0.2
batch_size = 1000    # batch size for the DataLoaders

# NN model
num_features = 17
input_size = num_features     # size of input layer
multiplier = 12               # multiplied by num_features to determine the size of each hidden layer
learning_rate = 0.001         # learning rate of optimizer
num_epochs = 3                # number of epochs

### Read in data

In [26]:
np.set_printoptions(threshold=np.inf)  # prints a full matrix rather than an abbreviated matrix

# define data and destination paths
dest_path = "/Users/Alliot/Documents/CLA-Project/Data/all-data-no-na/neural-network/"
data_path = "/Users/Alliot/Documents/CLA-Project/Data/data-sets/"
data_set = "data_2017_summer"

# if dest_path does not exist, create it
if not os.path.exists(dest_path):
    try:
        os.makedirs(dest_path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

# load data sets
X = np.load(data_path + data_set + ".npy")
y = np.load(data_path + data_set + "_labels.npy")

# manipulate data set. labels are converted to -1, +1 for binary classification; samples are removed uniformly 
# from the data set so that the disproportionately large number of negative samples (no algae) does 
# not bias the model.

num_alg = 0  # count the number of algae instances
num_no_alg = 0  # count the number of no algae instances

# Convert labels to binary: -1 for no algae and 1 for algae
for i in range(0, len(y)):
    if y[i] == 0:
        y[i] = -1
        num_no_alg += 1
    if y[i] == 1 or y[i] == 2:
        y[i] = 1
        num_alg += 1

# shrink the data set by randomly removing occurences of no algae until the number of no algae samples equals the
# number of algae samples minus the sample_bias
idx = 0  # index for the data set
while num_no_alg != (num_alg - sample_bias):
    # circle through the data set until the difference of num_no_alg and num_alg equals
    # the value specified by sample_bias
    if idx == (len(y) - 1):
        idx = 0
        
    if y[idx] == -1:
        if np.random.rand() >= 0.5:  # remove this sample with some probability
            y = np.delete(y, obj=idx)
            X = np.delete(X, obj=idx, axis=Constants.ROWS)
            num_no_alg -= 1
        else:
            idx += 1
    else:
        idx += 1

### Process and split data set

In [39]:
# standardize data: remove the mean and variance in each sample

# TEST DATA DELETE THIS
num_test_samples = 47000
X = 100*np.random.rand(num_test_samples, 17)
y = np.random.randint(2, size=num_test_samples);
for i in range(0, len(y)):
    if y[i] == 0:
        y[i] = -1
        



num_splits = 2   # do not change
sss = model_selection.StratifiedShuffleSplit(n_splits=num_splits, test_size=test_size)

idx, _ = sss.split(X, y);
train_idx = idx[0]
test_idx = idx[1]
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

X_train = preprocessing.scale(X_train, axis=1, with_mean=True, with_std=True)
X_test = preprocessing.scale(X_test, axis=1, with_mean=True, with_std=True)

# convert numpy arrays to pytorch tensors
X_train, X_test = torch.from_numpy(X_train), torch.from_numpy(X_test)
y_train, y_test = torch.from_numpy(y_train), torch.from_numpy(y_test)

# convert pytorch tensors to pytorch TensorDataset
train_set = utils.TensorDataset(X_train, y_train)
test_set = utils.TensorDataset(X_test, y_test)

# create DataLoaders
train_loader = utils.DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_loader = utils.DataLoader(test_set, batch_size=batch_size, shuffle=True)

### Define neural network model

In [40]:
class CLANet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(CLANet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
#         self.relu2 = nn.ReLU()
#         self.fc3 = nn.Linear(hidden_size, hidden_size)
#         self.relu3 = nn.ReLU()
#         self.fc4 = nn.Linear(hidden_size, hidden_size)
#         self.relu4 = nn.ReLU()
#         self.fc5 = nn.Linear(hidden_size, hidden_size)
#         self.relu5 = nn.ReLU()
#         self.fc6 = nn.Linear(hidden_size, hidden_size)
#         self.relu6 = nn.ReLU()
#         self.fc7 = nn.Linear(hidden_size, hidden_size)
#         self.relu7 = nn.ReLU()
#         self.fc8 = nn.Linear(hidden_size, hidden_size)
#         self.relu8 = nn.ReLU()
#         self.fc9 = nn.Linear(hidden_size, hidden_size)
#         self.relu9 = nn.ReLU()
#         self.fc10 = nn.Linear(hidden_size, hidden_size)
#         self.relu10 = nn.ReLU()
#         self.fc11 = nn.Linear(hidden_size, hidden_size)
#         self.relu11 = nn.ReLU()
#         self.fc12 = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.fc2(out)
#         out = self.relu2(out)
#         out = self.fc3(out)
#         out = self.relu3(out)
#         out = self.fc4(out)
#         out = self.relu4(out)
#         out = self.fc5(out)
#         out = self.relu5(out)
#         out = self.fc6(out)
#         out = self.relu6(out)
#         out = self.fc7(out)
#         out = self.relu7(out)
#         out = self.fc8(out)
#         out = self.relu8(out)
#         out = self.fc9(out)
#         out = self.relu9(out)
#         out = self.fc10(out)
#         out = self.relu10(out)
#         out = self.fc11(out)
#         out = self.relu11(out)
#         out = self.fc12(out)
        return out

### Instantiate the neural network

In [41]:
model = CLANet(num_features, multiplier * num_features, 1)
criterion = nn.BCEWithLogitsLoss()

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, nesterov=True, momentum=0.9, dampening=0)
model.double();     # cast model parameters to double

### Train the neural network

In [42]:
model.train()     # training mode

for epoch in range(num_epochs):
    print("Epoch: %d/%d" %(epoch+1, num_epochs))
    
    for i, (samples, labels) in enumerate(train_loader):
        optimizer.zero_grad()     # clear gradient
        
        output = torch.sign(model(samples))    # forward pass
        output = torch.flatten(output)         # resize predicted labels
        labels = labels.type(torch.DoubleTensor)
        
        loss = criterion(output, labels)  # calculate loss
        loss.backward()           # calculate gradients
        optimizer.step()          # update weights
        
        # calculate and print error
        error = 1 - torch.sum(output == labels).item() / labels.size()[0]
        print("  Iteration: %d/%d, Error: %0.4f, Loss: %g" % 
              (i+1, np.ceil(X_train.size()[0] / batch_size).astype(int), error, loss.item()))

Epoch: 1/3
  Iteration: 1/38, Error: 0.5050, Loss: 0.946262
  Iteration: 2/38, Error: 0.4880, Loss: 0.905262
  Iteration: 3/38, Error: 0.4900, Loss: 0.874262
  Iteration: 4/38, Error: 0.4870, Loss: 0.916262
  Iteration: 5/38, Error: 0.4790, Loss: 0.915262
  Iteration: 6/38, Error: 0.4970, Loss: 0.916262
  Iteration: 7/38, Error: 0.4920, Loss: 0.908262
  Iteration: 8/38, Error: 0.4960, Loss: 0.933262
  Iteration: 9/38, Error: 0.4930, Loss: 0.910262
  Iteration: 10/38, Error: 0.4980, Loss: 0.909262
  Iteration: 11/38, Error: 0.5030, Loss: 0.927262
  Iteration: 12/38, Error: 0.4800, Loss: 0.887262
  Iteration: 13/38, Error: 0.5090, Loss: 0.916262
  Iteration: 14/38, Error: 0.5150, Loss: 0.919262
  Iteration: 15/38, Error: 0.5210, Loss: 0.962262
  Iteration: 16/38, Error: 0.5220, Loss: 0.953262
  Iteration: 17/38, Error: 0.5090, Loss: 0.939262
  Iteration: 18/38, Error: 0.4880, Loss: 0.900262
  Iteration: 19/38, Error: 0.4980, Loss: 0.912262
  Iteration: 20/38, Error: 0.4930, Loss: 0.90726