In [4]:
from sklearn import preprocessing
from sklearn import model_selection
import torch
import numpy as np
import os

### Define Constants

In [12]:
# --- class balancing ---
# target value of (number of algae samples - number of no-algae samples) after oversampling
sample_bias = 0

# --- data set shape ---
num_features = 17  # number of feature columns in each sample

# --- cross-validation ---
num_splits_outer = 10  # number of folds in the outer CV
num_splits_inner = 9   # number of folds in the inner CV

# --- hold-out split / batching ---
test_size = 0.2    # test_size argument for the (currently commented-out) stratified shuffle split
batch_size = 100   # batch size for the DataLoaders

## Import Data

In [13]:
np.set_printoptions(threshold=np.inf)  # prints a full matrix rather than an abbreviated matrix

# define data and destination paths
dest_path = "/Users/Alliot/Documents/CLA-Project/Data/all-data-no-na/neural-network/"
data_path = "/Users/Alliot/Documents/CLA-Project/Data/data-sets/"
data_set = "data_2017_summer"

# if dest_path does not exist, create it. exist_ok=True covers the case where the
# directory appears between the check and the call, without needing errno handling.
if not os.path.exists(dest_path):
    os.makedirs(dest_path, exist_ok=True)

# load data sets
X = np.load(data_path + data_set + ".npy")
y = np.load(data_path + data_set + "_labels.npy")

# Collapse the labels to two classes: 0 stays 0 (no algae); 1 and 2 both become 1 (algae).
# Labels are NOT remapped to -1/+1. Any other label value is left unchanged and is not
# counted in either class.
no_alg_mask = (y == 0)
alg_mask = (y == 1) | (y == 2)
y[alg_mask] = 1
num_no_alg = int(np.count_nonzero(no_alg_mask))  # number of no algae instances
num_alg = int(np.count_nonzero(alg_mask))        # number of algae instances

# Oversample the data set: append randomly chosen copies of algae samples until the
# difference between the number of algae samples and no algae samples equals sample_bias
# (set in the constants cell). If there are already at least that many algae samples,
# nothing is added.
oversample_seed = 0  # fixed seed so the oversampled data set is the same on every run
rng = np.random.RandomState(oversample_seed)

num_extra = (num_no_alg + sample_bias) - num_alg
if num_extra > 0:
    alg_rows = np.flatnonzero(y == 1)
    if alg_rows.size == 0:
        raise ValueError("cannot oversample: the data set contains no algae samples")

    # draw uniformly (with replacement) from the algae rows and append them in one step
    picked = rng.choice(alg_rows, size=num_extra, replace=True)
    X = np.concatenate((X, X[picked]), axis=0)
    y = np.concatenate((y, y[picked]))
    num_alg += num_extra

In [18]:
# persist the processed feature matrix and label vector (paths are relative to the
# current working directory)
for file_name, array in (("X.npy", X), ("y.npy", y)):
    np.save(file_name, array)

## Split Data

### Outer CV Splits

In [14]:
# Outer cross-validation: stratified K-fold over the full data set.
# NOTE(review): X and y already include the duplicated algae rows appended during
# oversampling, so copies of one sample can land in both the train and the test side
# of a fold -- confirm this is intended.
skf = model_selection.StratifiedKFold(n_splits=num_splits_outer)

# keep every fold's (train_index, test_index) pair; the loop variables below are
# overwritten on each iteration and would otherwise only retain the last fold
outer_folds = []
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    outer_folds.append((train_index, test_index))
    # after the loop these hold the data of the final fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

TRAIN: [ 162  163  168  170  172  173  174  175  176  177  178  179  180  181
  183  186  187  188  189  190  191  192  193  194  195  196  197  198
  200  201  204  205  206  207  208  209  210  211  212  213  214  215
  216  217  218  219  220  221  222  223  224  225  226  227  228  229
  230  231  232  233  234  235  236  237  238  239  240  241  242  243
  244  245  246  247  248  249  250  251  252  253  254  255  257  258
  259  260  261  264  265  267  268  270  271  273  274  279  283  284
  285  286  287  288  289  290  291  292  293  294  295  296  299  300
  301  302  306  307  310  311  312  313  315  316  317  318  319  320
  321  322  323  324  325  326  328  329  330  331  332  333  334  335
  336  337  338  339  340  341  342  344  345  346  347  348  349  350
  351  352  353  354  355  356  357  358  359  360  361  362  363  364
  365  366  367  368  369  370  371  372  373  374  375  376  377  378
  379  380  381  382  383  384  385  386  387  388  389  390  391  392

In [None]:
# standardize data: remove the mean and scale to unit variance within each sample
# num_splits = 2   # do not change
# sss = model_selection.StratifiedShuffleSplit(n_splits=num_splits, test_size=test_size)

# idx, _ = sss.split(X, y);
# train_idx = idx[0]
# test_idx = idx[1]
# X_train, X_test = X[train_idx], X[test_idx]
# y_train, y_test = y[train_idx], y[test_idx]

# X_train = preprocessing.scale(X_train, axis=1, with_mean=True, with_std=True)
# X_test = preprocessing.scale(X_test, axis=1, with_mean=True, with_std=True)

# # convert numpy arrays to pytorch tensors
# train_set_size = X_train.shape
# test_set_size = X_test.shape
# X_train, X_test = torch.from_numpy(X_train), torch.from_numpy(X_test)
# y_train, y_test = torch.from_numpy(y_train), torch.from_numpy(y_test)

# # convert pytorch tensors to pytorch TensorDataset
# train_set = utils.TensorDataset(X_train, y_train)
# test_set = utils.TensorDataset(X_test, y_test)

# # create DataLoaders
# train_loader = utils.DataLoader(train_set, batch_size=batch_size, shuffle=True)
# test_loader = utils.DataLoader(test_set, batch_size=test_set_size[0], shuffle=True)