In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from copy import copy
import h5py

## Loading in Data

In [None]:
# Read the raw tab-separated file; it has no header row, so column names are
# assigned in the next cell.
# NOTE(review): this is an absolute, machine-specific path — the notebook will
# not run elsewhere without editing it.
EP_data = pd.read_csv("C:/Users/Avery/Anaconda3/envs/mnistbrain/EP1.01.txt", sep="\t", header=None)

In [None]:
# Name the seven columns. Later cells depend on the names "Label" and
# "HzCaptured" and on the positions of Label (index 4) and Signal (index 6).
EP_data.columns = ["ID", "Event ID", "Device", "Channel", "Label", "HzCaptured", "Signal"]

## Exploring Labels

In [None]:
def get_labels_in_set(dataset, labelcolname="Label"):
    """Return the distinct values of one column, in order of first appearance.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that holds the label column.
    labelcolname : str, default "Label"
        Name of the column to scan.

    Returns
    -------
    list
        Every distinct value exactly once, as plain Python scalars.

    Notes
    -----
    The column is de-duplicated by pandas instead of being read with
    ``dataset[col][i]`` for ``i in range(len(dataset))``. That lookup is
    label-based, so it raises ``KeyError`` whenever the index is not exactly
    ``0..n-1`` (for example after rows have been filtered out), and the
    ``not in list`` test made the scan quadratic in the number of labels.
    """
    return dataset[labelcolname].drop_duplicates().tolist()

In [None]:
# Distinct labels present in the unfiltered EP data set. The list is reused
# later (encode_hot_y) to size the one-hot vectors.
listoflabels = get_labels_in_set(EP_data, "Label")
print("Labels In Data Set: ", sorted(listoflabels))

## Filtering Data by Hz

In [None]:
def filter_data(dataset, Hz_colname, filternum=250):
    """Keep only the rows whose ``Hz_colname`` value is at least ``filternum``.

    The default of 250 matches the 250 input nodes of the neural network this
    data is prepared for.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to filter.
    Hz_colname : str
        Name of the numeric column to compare against ``filternum``.
    filternum : int or float, default 250
        Minimum value (inclusive) a row needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
        The rows that passed the threshold.

    Side effects
    ------------
    Prints the row counts before/after filtering and the percentage retained.
    """
    data_filter = dataset[dataset[Hz_colname] >= filternum]
    total = len(dataset)
    kept = len(data_filter)
    # "Filtered out" is whatever did not survive, which also counts rows whose
    # value is missing (NaN compares False against the threshold).
    print("lengths of dataset", total,
          "length of filtered set:", kept,
          "length that was filtered out:", total - kept)
    # An empty input would otherwise divide by zero.
    percent_kept = round(kept / total * 100, 2) if total else 0.0
    print("Percent of Original Data Retained:", percent_kept, "%")
    return data_filter
    

In [None]:
# Keep only the rows whose HzCaptured value is at least 250.
EP_data_filter = filter_data(EP_data, "HzCaptured", 250)

In [None]:
# Drop the name of the unfiltered frame; only EP_data_filter is used from here on.
del EP_data

## Filtering Data by -1 Label

In [None]:
def filter_negatives(dataset, label_colname):
    """Drop every row whose ``label_colname`` value equals -1.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to filter.
    label_colname : str
        Name of the label column.

    Returns
    -------
    pandas.DataFrame
        The rows whose label is not -1.

    Side effects
    ------------
    Prints the row counts before/after filtering and the percentage retained.
    """
    data_filter = dataset[dataset[label_colname] != -1]
    total = len(dataset)
    kept = len(data_filter)
    print("lengths of dataset", total,
          "length of filtered set", kept,
          "length that was filtered out", total - kept)
    # An empty input would otherwise divide by zero.
    percent_kept = round(kept / total * 100, 2) if total else 0.0
    print("Percent of Original Data Retained", percent_kept, "%")
    return data_filter
    

In [None]:
# Remove the rows labelled -1, rebinding EP_data_filter to the result.
EP_data_filter = filter_negatives(EP_data_filter, "Label")

## Converting Signal Strings to Arrays of Floats

In [None]:
# Convert the filtered frame to a 2-D NumPy array so that later steps can
# address columns by position (Label = 4, Signal = 6).
EP_data_array = EP_data_filter.to_numpy()
print(EP_data_array.shape)

In [None]:
def string_to_float(stringed_signal_data, signal_col=6):
    """Parse the comma-separated signal string of every row into floats.

    Parameters
    ----------
    stringed_signal_data : numpy.ndarray
        2-D array of rows; column ``signal_col`` of each row holds one signal
        as a string of comma-separated numbers.
    signal_col : int, default 6
        Position of the signal column. The default matches the column layout
        used in this notebook.

    Returns
    -------
    numpy.ndarray
        1-D object array with one entry per row; each entry is a Python list
        of floats. Lists may differ in length, which is why an object array is
        used rather than a 2-D float array.

    Raises
    ------
    ValueError
        If any comma-separated piece cannot be converted by ``float``.
    """
    float_signal_data = np.empty(len(stringed_signal_data), dtype=object)
    for n, row in enumerate(stringed_signal_data):
        float_signal_data[n] = [float(piece) for piece in row[signal_col].split(",")]
    print("The Shape of the Array we created after converting to floats:", float_signal_data.shape)
    print("The Shape of the Original Array of Stringed Signal Data:     ", stringed_signal_data[:, signal_col].shape)

    return float_signal_data

In [None]:
# Parse the Signal column (index 6) of every row into a list of floats.
EP_signal_float = string_to_float(EP_data_array)

In [None]:
# The DataFrame is no longer referenced below; EP_data_array carries the same rows.
del EP_data_filter

## Cropping the Signal Data

In [None]:
def crop_signal(signal_float, length=250):
    """Truncate every signal to its first ``length`` values.

    Parameters
    ----------
    signal_float : sequence
        One sequence of numbers per observation (for example the object array
        returned by ``string_to_float``).
    length : int, default 250
        Number of leading values to keep from each signal. Signals shorter
        than ``length`` are kept whole (plain slicing semantics).

    Returns
    -------
    numpy.ndarray
        1-D object array whose entries are 1-D NumPy arrays.

    Side effects
    ------------
    Prints the shape of the first cropped signal (when there is one) and the
    shape of the returned array.
    """
    signal_cropped = np.empty(len(signal_float), dtype=object)
    for n, samples in enumerate(signal_float):
        # Honour the requested length instead of a hard-coded 250.
        signal_cropped[n] = np.array(samples[:length])
    if len(signal_cropped):
        print("Shape of 1st sample in signal", signal_cropped[0].shape)
    print("Shape of all signal data", signal_cropped.shape)
    return signal_cropped

In [None]:
# Keep the first 250 values of every signal. crop_signal takes the crop length
# as its second argument, so it is passed explicitly here.
EP_signal_cropped = crop_signal(EP_signal_float, 250)

In [None]:
# The uncropped signals are not used again below.
del EP_signal_float

## Grouping X (by event)

In [None]:
def group_x(x_, channels):
    """Bundle consecutive observations into groups of ``channels``.

    Parameters
    ----------
    x_ : sequence of numpy.ndarray
        Observations in the order they should be grouped; every run of
        ``channels`` consecutive entries becomes one group.
    channels : int
        Number of observations per group.

    Returns
    -------
    numpy.ndarray
        1-D object array of length ``len(x_) // channels``. Each entry is the
        ``np.array`` built from one group (2-D when the grouped observations
        have equal length). Trailing observations that do not fill a whole
        group are dropped.

    Side effects
    ------------
    Prints the container/element types and the resulting shape.
    """
    n_groups = len(x_) // channels
    grouped_x = np.empty(n_groups, dtype=object)
    for j in range(n_groups):
        start = j * channels
        grouped_x[j] = np.array(list(x_[start:start + channels]))

    # Sanity-check the types on the first group. Fixed positions such as
    # grouped_x[10] or grouped_x[15][9] do not exist for small inputs.
    if n_groups:
        print("Check if everything is of type array:", type(grouped_x), type(grouped_x[0]), type(grouped_x[0][0]))
    print("Shape of Grouped X:", grouped_x.shape, "\nShape of Original X Divided by C:", len(x_)//channels)

    return grouped_x

In [None]:
# Bundle every 14 consecutive rows into one group.
# NOTE(review): this assumes the rows are ordered so that each run of 14 is
# one event's channels — confirm against the data file.
EP_grouped_x = group_x(EP_signal_cropped, 14)

In [None]:
# The ungrouped signals are not used again below.
del EP_signal_cropped

In [None]:
# group_x builds an object ("O") array. Per the author's note, TensorFlow cannot
# handle object dtypes; the flattening step further down rebuilds the data with
# np.array, which should yield a regular numeric array.
EP_grouped_x.dtype

## Changing Input Shape

In [None]:
def change_shape(x_, new_shape_order):
    """Stack per-event arrays into one array with the requested layout.

    Parameters
    ----------
    x_ : iterable of numpy.ndarray
        One 2-D array per event, as produced by ``group_x``.
    new_shape_order : {"channel-time", "time-channel", "flattened"}
        * ``"channel-time"`` - stack the events unchanged.
        * ``"time-channel"`` - transpose every event before stacking.
        * ``"flattened"``    - transpose every event, then flatten it to 1-D,
          so the values of one row of the transposed event are adjacent.

    Returns
    -------
    numpy.ndarray
        The stacked array.

    Raises
    ------
    ValueError
        If ``new_shape_order`` is not one of the three supported names.

    Side effects
    ------------
    Prints the new shape and one sample (the 14th event when there are at
    least 14, otherwise the last one).
    """
    if new_shape_order == "channel-time":
        copy_x = np.array([event for event in x_])
    elif new_shape_order == "time-channel":
        copy_x = np.array([event.T for event in x_])
    elif new_shape_order == "flattened":
        copy_x = np.array([event.T.flatten() for event in x_])
    else:
        # Without this branch an unknown name fell through and failed later on
        # an unbound local variable.
        raise ValueError(
            "new_shape_order must be 'channel-time', 'time-channel' or 'flattened', got "
            + repr(new_shape_order)
        )

    print("New Shape:", copy_x.shape)
    if len(copy_x):
        print("Sample of X:", copy_x[min(13, len(copy_x) - 1)])
    return copy_x

In [None]:
# Flatten every event to one row. change_shape transposes each event before
# flattening, so within a row the values of one timepoint sit next to each other.
EP_x = change_shape(EP_grouped_x, "flattened")

In [None]:
# The grouped object array is not used again below.
del EP_grouped_x

## Feature Scaling

In [None]:
# Shape check before scaling: one row per event, one column per flattened feature.
print(EP_x.shape)

In [None]:
# Small 4x3 toy matrix. It is not referenced by any later cell shown here —
# presumably a scratch input for checking the scaling functions by hand.
temp_x = np.array(([1.0,2.0,3.0],
          [4.0,5.0,6.0],
          [7.0,1.0,14.0],
          [10.0,11.0,12.0]))
print(temp_x.shape)
print(temp_x)

In [None]:
def scale_features(x_prescaled):
    """Min-max scale every column of a 2-D array to the range [0, 1].

    Parameters
    ----------
    x_prescaled : numpy.ndarray
        2-D array with one row per sample and one column per feature.

    Returns
    -------
    numpy.ndarray
        The scaled array. A floating-point input is modified in place and
        returned as-is (no copy, to keep memory use flat); any other dtype is
        first converted to a float copy, because scaled values cannot be
        stored in an integer array without being truncated.

    Notes
    -----
    A column whose values are all equal has a range of zero. It is mapped to
    0.0 instead of being divided by zero, which would fill it with NaN.
    """
    if not np.issubdtype(x_prescaled.dtype, np.floating):
        x_prescaled = x_prescaled.astype(float)

    feature_min = x_prescaled.min(axis=0)
    feature_range = x_prescaled.max(axis=0) - feature_min
    # Constant columns: divide by 1 so they become all zeros rather than NaN.
    feature_range = np.where(feature_range == 0, 1, feature_range)

    x_prescaled -= feature_min
    x_prescaled /= feature_range
    return x_prescaled

In [None]:
# Verified that this works properly
# Min-max scale every column of EP_x. scale_features writes into the float
# array it is given and returns it, so scaled_x is expected to be the same
# object as EP_x rather than a copy.
scaled_x = scale_features(EP_x)

In [None]:
# Scaling must not change the shape.
scaled_x.shape

In [None]:
# Removes only the old name: scaled_x is expected to refer to the same array,
# so the data itself stays alive.
del EP_x

## Mean Normalization

In [None]:
def mean_normalize(x_prenormalized):
    """Centre every column of a 2-D array on zero by subtracting its mean.

    Parameters
    ----------
    x_prenormalized : numpy.ndarray
        2-D array with one row per sample and one column per feature.

    Returns
    -------
    numpy.ndarray
        The centred array. A floating-point input is modified in place and
        returned as-is (no copy); any other dtype is first converted to a
        float copy, because the centred values cannot be stored in an integer
        array without being truncated.
    """
    if not np.issubdtype(x_prenormalized.dtype, np.floating):
        x_prenormalized = x_prenormalized.astype(float)

    # One vectorised pass instead of a Python-level sum() per column.
    x_prenormalized -= x_prenormalized.mean(axis=0)
    return x_prenormalized

In [None]:
# Verified that this works properly.
# Subtract each column's mean from the already-scaled data. mean_normalize
# writes into the float array it is given and returns it.
normalized_x = mean_normalize(scaled_x)

In [None]:
# Removes only the old name; normalized_x is expected to refer to the same array.
del scaled_x

## Reshaping X

In [None]:
def reshape_x(pre_x, channels, timepoints):
    """Reshape a 2-D array of flat rows into ``(rows, channels, timepoints)``.

    This is a plain ``np.reshape`` with NumPy's default ordering: each row is
    cut into ``channels`` consecutive runs of ``timepoints`` values. No values
    are reordered, so the rows must already be laid out that way.

    Parameters
    ----------
    pre_x : numpy.ndarray
        Array whose first axis indexes the samples and whose rows each hold
        ``channels * timepoints`` values.
    channels : int
        Size of the second axis of the result.
    timepoints : int
        Size of the third axis of the result.

    Returns
    -------
    numpy.ndarray
        The reshaped array.
    """
    n_rows = pre_x.shape[0]
    target_shape = (n_rows, channels, timepoints)
    return np.reshape(pre_x, target_shape)

In [None]:
# change_shape(..., "flattened") stored each event time-major
# (event.T.flatten()): 250 consecutive runs of 14 channel values. Unpack the
# rows in that same order (hence 250 before 14) and then swap the last two axes
# to obtain (events, 14 channels, 250 timepoints). Reshaping straight to
# (14, 250) would mix channels and timepoints inside every row.
EP_x = reshape_x(normalized_x, 250, 14).transpose(0, 2, 1)

In [None]:
# Drop the flat array's name; EP_x is the array used from here on.
del normalized_x

## 1-Hot Encoding Labels

In [None]:
def create_y(dataarray_set, c):
    """Pick one label per group of ``c`` consecutive rows.

    The label is read from column 4 of ``dataarray_set``. Of every ``c``
    consecutive rows, the label of the last row is kept; rows after the final
    complete group contribute nothing.

    Parameters
    ----------
    dataarray_set : numpy.ndarray
        2-D array of rows whose column 4 holds the label.
    c : int
        Number of consecutive rows that share one label.

    Returns
    -------
    list
        The selected labels, in row order.

    Side effects
    ------------
    Prints the label count and the first 30 labels before and after the
    reduction.
    """
    y_preprocess = dataarray_set[:, 4]
    print("Y Original Length:", len(y_preprocess), "\nExamples of Y:", y_preprocess[:30])

    # Counting rows from 1, every c-th row closes a group; keep its label.
    y_divided = []
    for position, label in enumerate(y_preprocess, start=1):
        if position % c == 0:
            y_divided.append(label)

    print("\n Y Length after filtering out event duplicates (dividing by c):", len(y_divided))
    print("Examples of Y after filtering:", y_divided[:30])

    return y_divided

In [None]:
def encode_hot_y(dataarray_set, labelslist, c):
    """Build one one-hot vector per group of ``c`` rows.

    Every vector has ``len(labelslist) - 1`` entries. The label chosen by
    ``create_y`` for a group is used directly as the position that is set
    to 1, so labels must be valid integer indices into a vector of that length.
    NOTE(review): the ``- 1`` presumably accounts for a label in
    ``labelslist`` that no longer occurs in the rows (such as -1) — confirm.

    Parameters
    ----------
    dataarray_set : numpy.ndarray
        2-D array of rows, passed through to ``create_y``.
    labelslist : sequence
        Only its length is used, to size the vectors.
    c : int
        Number of consecutive rows per group.

    Returns
    -------
    numpy.ndarray
        1-D object array with ``len(dataarray_set) // c`` entries, each a 1-D
        integer array.

    Side effects
    ------------
    Prints the first ten vectors before and after the 1s are assigned, plus
    whatever ``create_y`` prints.
    """
    n_groups = len(dataarray_set) // c
    vector_length = len(labelslist) - 1

    y_empty = np.empty(n_groups, dtype=object)
    for row in range(n_groups):
        y_empty[row] = np.zeros(vector_length, int)

    y_ = create_y(dataarray_set, c)

    print("\n What y array looks like before assigning 1s: \n", y_empty[:10])

    # Switch on the position named by each group's label.
    for row in range(n_groups):
        hot_position = y_[row]
        y_empty[row][hot_position] = 1

    print("What y array looks like after assigning 1s: \n", y_empty[:10])

    return y_empty
    
    

In [None]:
# One one-hot vector per group of 14 rows. listoflabels was collected from the
# unfiltered data, so it still includes -1; encode_hot_y sizes the vectors as
# len(listoflabels) - 1.
EP_encode_y = encode_hot_y(EP_data_array, listoflabels, 14)

In [None]:
# The row array is not used again below.
del EP_data_array

In [None]:
def fix_dtype(y_):
    """Turn an object array of equal-length vectors into a 2-D float array.

    Parameters
    ----------
    y_ : numpy.ndarray
        Either an object-dtype array whose entries are 1-D arrays of equal
        length, or an array that already has a regular dtype.

    Returns
    -------
    numpy.ndarray
        For object input: the entries stacked row-wise and cast to float.
        For any other dtype: ``y_`` itself, unchanged.

    Side effects
    ------------
    Prints the dtype before and, when a conversion happened, after.
    """
    print("Previous Dtype:", y_.dtype)
    if y_.dtype == 'O':
        # The builtin float is used because the np.float alias no longer
        # exists in current NumPy releases.
        y_ = np.vstack(list(y_)).astype(float)
        print("Fixed. New Dtye:", y_.dtype)
        return y_
    # Nothing to fix: always hand back an array so callers can keep using the
    # result (e.g. reading .nbytes or saving it).
    print("Not Object Type:", y_.dtype)
    return y_

In [None]:
# Stack the per-event one-hot vectors into one regular 2-D array.
EP_y = fix_dtype(EP_encode_y)

In [None]:
# The object-array version of the labels is not used again below.
del EP_encode_y

## Saving X & Y

In [None]:
# In-memory size of each array before writing it to disk (bytes / 1024**3).
print(EP_x.nbytes/(1024**3), "GB")
print(EP_y.nbytes/(1024**3), "GB")

In [None]:
def save_data(x, y, dataName):
    """Write ``x`` and ``y`` to two separate HDF5 files.

    ``x`` goes to ``<dataName>_x.h5`` in a dataset named
    ``<dataName>_x_dataset``; ``y`` goes to ``<dataName>_y.h5`` in a dataset
    named ``<dataName>_y_dataset``. Both files are opened in ``"w"`` mode.

    Parameters
    ----------
    x, y : array-like
        Data handed to ``create_dataset`` as-is.
    dataName : str
        Prefix for both the file names and the dataset names; it may include a
        directory part.
    """
    for suffix, payload in (("_x", x), ("_y", y)):
        file_name = dataName + suffix + ".h5"
        dataset_name = dataName + suffix + "_dataset"
        with h5py.File(file_name, "w") as hf:
            hf.create_dataset(dataset_name, data=payload)

In [None]:
# Writes EP_preprocessed_x.h5 and EP_preprocessed_y.h5 relative to the current
# working directory.
save_data(EP_x, EP_y, "EP_preprocessed")

In [None]:
# Both arrays have been saved; drop the names.
del EP_x
del EP_y