In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from copy import copy

from numpy.fft import fft
from sklearn.decomposition import PCA


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

## Functions

In [2]:
def pad_data(data_array, pad_type):
    """Right-pad every sample so all samples share the length of the longest one.

    Parameters
    ----------
    data_array : sequence of 1-D sequences/arrays
        Samples of possibly different lengths (indexable by 0..len-1).
    pad_type : str or callable
        Forwarded unchanged to ``np.pad`` as its ``mode`` argument
        (e.g. 'constant', 'minimum', 'maximum', 'mean').

    Returns
    -------
    A shallow copy of ``data_array`` whose entries are the padded samples.
    The container passed in is left untouched.
    """
    max_length = get_max_length(data_array)

    # Pad a shallow copy rather than the caller's container. Padding in place
    # meant a second call with a different pad_type was a silent no-op, because
    # every sample had already been stretched to max_length by the first call.
    padded = copy(data_array)
    for sample in range(len(padded)):
        missing = max_length - len(padded[sample])
        padded[sample] = np.pad(padded[sample], (0, missing), pad_type)

    return padded

In [3]:
def get_max_length(data_array):
    """Return the length of the longest sample in ``data_array``.

    Returns 0 when ``data_array`` is empty. The value is also printed.
    """
    sample_lengths = [len(data_array[idx]) for idx in range(len(data_array))]
    max_length = max(sample_lengths, default=0)

    print("MAX LENGTH: ", max_length)
    return max_length

In [4]:
def filter_data(dataset, Hz_colname, filter_bottom, filter_top):
    """Keep only the rows whose ``Hz_colname`` value lies in [filter_bottom, filter_top].

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to filter; it is not modified.
    Hz_colname : str
        Name of the column the bounds are applied to.
    filter_bottom, filter_top
        Inclusive lower and upper bounds.

    Returns
    -------
    pandas.DataFrame
        The rows of ``dataset`` inside the bounds. Row counts and the retained
        percentage are printed as a side effect.
    """
    in_band = (dataset[Hz_colname] >= filter_bottom) & (dataset[Hz_colname] <= filter_top)
    filter_ = dataset[in_band]

    total = len(dataset)
    kept = len(filter_)
    print("lengths of dataset", total,
          "length of filtered set:", kept,
          "length that was filtered out:", total - kept)
    # An empty input used to raise ZeroDivisionError here; report 0 % instead.
    percent_retained = round(kept / total * 100, 2) if total else 0.0
    print("Percent of Original Data Retained:", percent_retained, "%")
    return filter_
    

In [5]:
def crop_signal(signal, cropLength):
    """Truncate every observation in ``signal`` to its first ``cropLength`` values.

    Returns a 1-D object array holding one ``np.ndarray`` per observation;
    observations shorter than ``cropLength`` are kept at their own length.
    """
    n_observations = len(signal)
    cropped = np.empty(n_observations, dtype=object)
    for idx in range(n_observations):
        head = signal[idx][:cropLength]
        cropped[idx] = np.array(head)
    return cropped

In [6]:
def fix_dtype(data):
    """Turn an object-dtype array of equal-length rows into a 2-D float array.

    Parameters
    ----------
    data : np.ndarray
        If its dtype is ``object``, its elements are stacked row-wise with
        ``np.vstack`` (so they must all have the same length) and cast to float.
        Any other dtype is passed through unchanged.

    Returns
    -------
    np.ndarray
        The stacked float array, or ``data`` itself when it was not object dtype.

    Notes
    -----
    If the rows are complex (e.g. FFT output), the cast to float keeps only the
    real part and NumPy emits a ComplexWarning.
    """
    print("Previous Dtype:", data.dtype)
    if data.dtype == 'O':
        # `np.float` was just an alias of the builtin float and was removed in
        # NumPy 1.24; the builtin gives the same float64 result on every version.
        data = np.vstack(data[:]).astype(float)
        print("Fixed. New Dtye:", data.dtype)
        return data

    # This branch used to return a ("Not Object Type:", dtype) tuple, which
    # callers then treated as the feature matrix. Report and pass the data through.
    print("Not Object Type:", data.dtype)
    return data

In [7]:
def reshape_x(x_, channels, length):
    """Flatten per-event channel data into one feature row per event.

    Parameters
    ----------
    x_ : np.ndarray
        Indexed as ``x_[event][channel]``; each entry must fit a row of
        ``length`` values. ``x_.shape[0]`` is the number of events.
    channels : int
        Number of channels allocated per event.
    length : int
        Number of values allocated per channel.

    Returns
    -------
    np.ndarray
        Float array of shape ``(n_events, channels * length)`` with the
        channels of each event laid out one after another.
    """
    reshaped_x = np.empty((x_.shape[0], channels, length))
    for event in range(len(x_)):
        for channel in range(len(x_[0])):
            # Read from the argument. The original read an undefined global
            # `grouped_x`, which only worked with leftover kernel state.
            reshaped_x[event][channel] = x_[event][channel]

    return np.reshape(reshaped_x, (reshaped_x.shape[0], channels * length))

In [8]:
def apply_fft(signalData):
    """Compute the FFT of every observation and scale it by 2/N.

    ``N`` is the length of that observation's spectrum. The full (uncropped)
    complex spectrum is kept for each observation.

    Returns a 1-D object array with one scaled spectrum per observation.
    """
    # build_empty_array hands back two containers; only the second is returned.
    _raw_spectra, scaled_spectra = build_empty_array(signalData)

    for idx in range(len(signalData)):
        spectrum = fft(signalData[idx])
        n_points = len(spectrum)
        scaled_spectra[idx] = (2 / n_points) * spectrum

    return scaled_spectra

In [9]:
def build_empty_array(signalData):
    """Allocate two independent 1-D object arrays, one slot per observation.

    The number of slots is ``signalData.shape[0]``. Both shapes are printed
    before the pair is returned.
    """
    n_observations = signalData.shape[0]
    x_fourier, x_cropped_fourier = (
        np.empty(n_observations, dtype=object) for _ in range(2)
    )

    print(x_fourier.shape, x_cropped_fourier.shape)
    return x_fourier, x_cropped_fourier

In [10]:
def scale_features(x_prescaled):
    """Min-max scale ``x_prescaled`` in place along its transposed first axis.

    For a 2-D array every column (feature) is rescaled to [0, 1]. For a 1-D
    object array of signals the transpose is a no-op, so every signal is
    rescaled by its own minimum and maximum.

    A constant feature (max == min) is mapped to all zeros instead of the
    NaN values that 0/0 would otherwise produce.

    Parameters
    ----------
    x_prescaled : np.ndarray
        Data to scale. It is modified in place.

    Returns
    -------
    np.ndarray
        The same ``x_prescaled`` object, now scaled.
    """
    features = x_prescaled.T  # a view: writes below land in x_prescaled
    for feature in range(features.shape[0]):
        values = features[feature]
        feature_min = np.min(values)
        feature_range = np.max(values) - feature_min
        if feature_range == 0:
            # Constant feature: avoid dividing by zero.
            features[feature] = np.zeros_like(values, dtype=float)
        else:
            features[feature] = (values - feature_min) / feature_range
    return x_prescaled

In [11]:
# Parses the comma-separated signal string of every row into a list of floats.
def string_to_float(stringed_signal_data, signal_column=6):
    """Convert the stringed signal column of a 2-D array into lists of floats.

    Parameters
    ----------
    stringed_signal_data : np.ndarray
        2-D array whose ``signal_column`` holds strings such as "1.5,2,3".
    signal_column : int, optional
        Index of the column holding the signal strings. Defaults to 6, the
        position this function previously had hard-coded.

    Returns
    -------
    np.ndarray
        1-D object array; element ``n`` is the list of floats parsed from row ``n``.

    Raises
    ------
    ValueError
        If a comma-separated token cannot be parsed by ``float``.
    """
    n_rows = len(stringed_signal_data)
    float_signal_data = np.empty(n_rows, dtype=object)

    for n in range(n_rows):
        tokens = stringed_signal_data[n][signal_column].split(",")
        float_signal_data[n] = [float(token) for token in tokens]

    print("The Shape of the Array we created after converting to floats:", float_signal_data.shape)
    print("The Shape of the Original Array of Stringed Signal Data:     ", stringed_signal_data[:, signal_column].shape)

    return float_signal_data

In [12]:
def array_signal(signal_float):
    """Wrap every observation of ``signal_float`` in its own ``np.ndarray``.

    Returns a 1-D object array with one array per observation, so observations
    of different lengths can live side by side.
    """
    signal_array = np.empty(len(signal_float), dtype=object)
    for idx, observation in enumerate(signal_float):
        signal_array[idx] = np.array(observation)
    return signal_array

## Loading in Data

In [13]:
# Load the tab-separated recording file. header=None tells pandas the first line
# is data, not column names (the names are assigned in the next cell).
# NOTE(review): absolute, machine-specific path - the notebook cannot run on
# another machine until this points at a configurable data directory.
IN_data = pd.read_csv("C:/Users/Avery/Anaconda3/envs/mnistbrain/IN.txt", sep="\t", header=None)

In [14]:
# Name the seven columns. Their positions matter later: column 4 ("Label") is read
# as the target and column 6 ("Signal") is parsed by string_to_float.
IN_data.columns = ["ID", "Event ID", "Device", "Channel", "Label", "HzCaptured", "Signal"]

## Filter Data by Hz - OPTIONAL

In [15]:
#lowerBound = 248
#upperBound =  260

In [16]:
#IN_data = filter_data(IN_data, "HzCaptured", lowerBound, upperBound)

lengths of dataset 65250 length of filtered set: 58010 length that was filtered out: 7240
Percent of Original Data Retained: 88.9 %


## Converting Signal Data to Arrays of Floats

In [17]:
# Switch to a plain NumPy array so the cells below can address columns by position.
IN_data_array = IN_data.to_numpy()
print(IN_data_array.shape)

(58010, 7)


In [18]:
# Parse the comma-separated signal string of every row (column 6) into a list of floats.
IN_signal_float = string_to_float(IN_data_array)

The Shape of the Array we created after converting to floats: (58010,)
The Shape of the Original Array of Stringed Signal Data:      (58010,)


In [19]:
# Drop the DataFrame name now that IN_data_array is used instead - presumably to
# save memory. After this, the cells above must be re-run to get IN_data back.
del IN_data

## Crop Data - DO THIS IF NOT PADDING

In [20]:
# IN_signal_float = crop_signal(signal=IN_signal_float, cropLength=lowerBound)

## Arraying the Signal Data

In [21]:
# Wrap each signal (a Python list at this point) in its own np.ndarray.
# Original note: this is needed if padding, doesn't affect results if not padding.
# NOTE(review): scale_features subtracts a scalar from each signal, which a plain
# list does not support, so this step looks required in both cases - confirm.
IN_signal_array = array_signal(IN_signal_float)

In [22]:
# The list-based signals are superseded by IN_signal_array; drop the name so the
# lists can be garbage-collected.
del IN_signal_float

## Feature Scaling

In [23]:
# Min-max scale the signals. IN_signal_array is a 1-D object array, so the `.T`
# inside scale_features is a no-op and its loop walks observations: every signal
# is scaled by its own min and max, not per feature across observations.
# scale_features writes into its argument and returns that same array object.
scaled_x = scale_features(IN_signal_array)

In [24]:
# Only removes the name: scaled_x refers to the same array object (scale_features
# returns its argument), so no memory is released by this statement.
del IN_signal_array

## Creating Y

In [25]:
# Target vector: column 4 ("Label") of the raw array, cast to integers.
IN_y = IN_data_array[:, 4].astype('int')

In [26]:
# Signals and labels have both been extracted; drop the raw array name so the
# array can be garbage-collected.
del IN_data_array

## Novel-Method Loop - IF PADDING DATA

In [27]:
# Padding strategies to compare. Each string is handed to pad_data, which passes
# it on to np.pad as the padding mode.
pads = ['constant', 'minimum', 'maximum', 'mean']

In [28]:
# Fixed seed so the train/test split and the PCA solver give the same result on
# every Restart & Run All; without it the reports below are not reproducible.
SPLIT_SEED = 42

for pad in pads:
    print("Pad Type: ", pad)

    # pad_data pads the container it is given. Hand it a shallow copy so scaled_x
    # keeps the unpadded signals; otherwise every pad type after the first would
    # find the signals already at full length and change nothing.
    IN_padded = pad_data(copy(scaled_x), pad)

    IN_fourier = apply_fft(IN_padded)
    del IN_padded

    # Stack the per-observation spectra into one 2-D float matrix.
    IN_fourier = fix_dtype(IN_fourier)

    X_train, X_test, y_train, y_test = train_test_split(
        IN_fourier, IN_y, test_size=0.20, random_state=SPLIT_SEED)
    print("X_train, X_test, y_train, y_test Shapes: ", X_train.shape, X_test.shape, y_train.shape, y_test.shape)

    # Fit PCA on the training split only, then project both splits with it.
    pca = PCA(n_components=20, random_state=SPLIT_SEED)
    pricinple_components = pca.fit(X_train)  # fit() returns the fitted pca itself
    PCA_X_train = pca.transform(X_train)
    PCA_X_test = pca.transform(X_test)
    print("Explained Variance Ratio:\n", 100*pca.explained_variance_ratio_)

    # Misclassification rate for k = 1..4 neighbours, in that order.
    error = []
    for i in range(1, 5):
        print("Neighbors:", i)
        knn = KNeighborsClassifier(n_neighbors=i)
        knn.fit(PCA_X_train, y_train)
        pred_i = knn.predict(PCA_X_test)
        error.append(np.mean(pred_i != y_test))
        print("Classification Report:\n", classification_report(y_test, pred_i))

'for pad in pads:\n    print("Pad Type: ", pad)\n    \n    IN_padded = pad_data(scaled_x, pad)\n    \n    IN_fourier = apply_fft(IN_padded)\n    del IN_padded\n    \n    IN_fourier = fix_dtype(IN_fourier)\n        \n    X_train, X_test, y_train, y_test = train_test_split(IN_fourier, IN_y, test_size=0.20)\n    print("X_train, X_test, y_train, y_test Shapes: ", X_train.shape, X_test.shape, y_train.shape, y_test.shape)\n    \n    pca = PCA(n_components=20)\n    pricinple_components = pca.fit(X_train)\n    PCA_X_train = pca.transform(X_train)\n    PCA_X_test = pca.transform(X_test)\n    print("Explained Variance Ratio:\n", 100*pca.explained_variance_ratio_)\n    \n    error = []\n    for i in range(1, 5):\n        print("Neighbors:", i)\n        knn = KNeighborsClassifier(n_neighbors=i)\n        knn.fit(PCA_X_train, y_train)\n        pred_i = knn.predict(PCA_X_test)\n        error.append(np.mean(pred_i != y_test))\n        print("Classification Report:\n", classification_report(y_test, pre

## Novel-Method Loop - IF NOT PADDING

In [29]:
"""print("--- Not Padding ---")

IN_fourier = apply_fft(scaled_x)

IN_fourier = fix_dtype(IN_fourier)

X_train, X_test, y_train, y_test = train_test_split(IN_fourier, IN_y, test_size=0.20)
print("X_train, X_test, y_train, y_test Shapes: ", X_train.shape, X_test.shape, y_train.shape, y_test.shape)

pca = PCA(n_components=20)
pricinple_components = pca.fit(X_train)
PCA_X_train = pca.transform(X_train)
PCA_X_test = pca.transform(X_test)
print("Explained Variance Ratio:\n", 100*pca.explained_variance_ratio_)

error = []
for i in range(1, 5):
    print("Neighbors:", i)
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(PCA_X_train, y_train)
    pred_i = knn.predict(PCA_X_test)
    error.append(np.mean(pred_i != y_test))
    print("Classification Report:\n", classification_report(y_test, pred_i))"""

--- Not Padding ---
(58010,) (58010,)
Previous Dtype: object


  after removing the cwd from sys.path.


Fixed. New Dtye: float64
X_train, X_test, y_train, y_test Shapes:  (46408, 248) (11602, 248) (46408,) (11602,)
Explained Variance Ratio:
 [23.12624896 13.20681327  9.72873594  6.77748426  3.43223904  2.38579587
  1.89081628  1.42913086  1.29003336  1.12257301  0.9680974   0.93083403
  0.85337429  0.82207197  0.78164342  0.72550031  0.71734524  0.68077264
  0.64665494  0.62363394]
Neighbors: 1
Classification Report:
               precision    recall  f1-score   support

           0       0.17      0.17      0.17      1158
           1       0.16      0.16      0.16      1174
           2       0.16      0.17      0.17      1174
           3       0.20      0.20      0.20      1179
           4       0.17      0.18      0.17      1122
           5       0.20      0.20      0.20      1133
           6       0.18      0.18      0.18      1141
           7       0.18      0.17      0.18      1146
           8       0.21      0.20      0.20      1175
           9       0.20      0.19      