# Classification of seizures or non seizures by features from EEG data

## Data preparation

In [2]:
import os
import mne
import numpy as np
import pandas as pd
import glob

### Load .csv files

In [3]:
# CHANGE THIS PATH FOR THE FOLDER THAT CONTAINS THE .csv FILES OF SPECIFIC PREPROCESSED DATA
folder_path = os.path.join('processed_data','chb01_int10_ov00')

# paths of all files
preproc_files = glob.glob(os.path.join(folder_path, "*.csv"))

dataset_list = []
for file_path in preproc_files:
    # concadenate files
    data = pd.read_csv(file_path)
    dataset_list.append(data)

dataset = pd.concat(dataset_list)
dataset.head()


Unnamed: 0,start_time,FP1-F7_rms,FP1-F7_variance,FP1-F7_kurtosis,FP1-F7_skewness,FP1-F7_max_amp,FP1-F7_min_amp,FP1-F7_n_peaks,FP1-F7_n_crossings,FP1-F7_hfd,...,T8-P8-1_median_freq,T8-P8-1_peak_freq,T8-P8-1_hjorth_mobility,T8-P8-1_hjorth_complexity,T8-P8-1_power_1hz,T8-P8-1_power_5hz,T8-P8-1_power_10hz,T8-P8-1_power_15hz,T8-P8-1_power_20hz,seizure
0,0,5.9e-05,3.458339e-09,0.650299,-0.06906,0.00018,-0.00022,91.0,74.0,0.02202,...,2.0,1.0,0.001255,219.180264,0.599482,0.270304,0.075103,0.031834,0.023277,0
1,1,5.8e-05,3.422638e-09,0.74051,-0.111613,0.00018,-0.00022,92.0,74.0,0.021621,...,2.0,1.0,0.001283,165.246798,0.625755,0.251214,0.072065,0.026998,0.023968,0
2,2,5.5e-05,3.002143e-09,0.966499,0.055559,0.00018,-0.00022,92.0,73.0,0.021733,...,2.0,1.0,0.001379,167.521614,0.614227,0.254178,0.074888,0.032268,0.024439,0
3,3,5.6e-05,3.150802e-09,0.687155,0.017912,0.00018,-0.00022,90.0,78.0,0.022694,...,2.0,1.0,0.001496,249.743848,0.602653,0.246206,0.08544,0.036898,0.028803,0
4,4,5.8e-05,3.336271e-09,0.547265,0.001656,0.00018,-0.00022,89.0,82.0,0.022578,...,2.0,1.0,0.001265,165.567277,0.623107,0.230839,0.085172,0.034325,0.026558,0


### Exclude not usefull things

In [4]:
x = dataset.loc[:, dataset.columns != "seizure"]
x = x.loc[:, x.columns != "start_time"]
x = x.loc[:, x.columns != "file ID"]
y = np.asarray(dataset['seizure'])

### Separate features and labels and normalize features

In [5]:
from sklearn import preprocessing

x = preprocessing.normalize(x)
print(x.shape)

unique, counts = np.unique(y, return_counts=True)
print(dict(zip(unique, counts)))

(31044, 506)
{0: 30539, 1: 505}


### Separate in train and test 

In [6]:
from sklearn.model_selection import train_test_split

x_train_unb, x_test, y_train_unb, y_test =  train_test_split(x, y, test_size = 0.5, random_state = 0)
print(x_train_unb.shape)
print(y_train_unb.shape)
print(x_test.shape)
print(y_test.shape)

unique, counts = np.unique(y_train_unb, return_counts=True)
print(dict(zip(unique, counts)))

unique, counts = np.unique(y_test, return_counts=True)
print(dict(zip(unique, counts)))


(15522, 506)
(15522,)
(15522, 506)
(15522,)
{0: 15269, 1: 253}
{0: 15270, 1: 252}


### Balancing training set

In [7]:
from sklearn.utils import resample

def balance_train_set(x_train_unb, y_train_unb):
    labels_train = np.hstack((x_train_unb, np.expand_dims(y_train_unb, axis=1)))
    print(labels_train.shape)

    non_seizure_idx = np.nonzero(labels_train[:,276] == 0)
    labels_majority = labels_train[non_seizure_idx]

    seizure_idx = np.nonzero(labels_train[:,276] == 1)
    labels_minority = labels_train[seizure_idx]

    print("Before balancing:")
    print(np.shape(labels_majority))
    print(np.shape(labels_minority))

    labels_minority = resample(labels_minority,
                                replace=True,
                                n_samples=int(0.5*labels_majority.shape[0]),
                                random_state=123)

    print("After balancing:")
    print(np.shape(labels_majority))
    print(np.shape(labels_minority))

    labels_balanced = np.concatenate((labels_majority, labels_minority))

    np.random.shuffle(labels_balanced)
    x_train, y_train = labels_balanced[:,:276], labels_balanced[:,276]

    print("Full data after balancing:")
    print(x_train.shape)
    print(y_train.shape)

    unique, counts = np.unique(y_train, return_counts=True)
    print(dict(zip(unique, counts)))

    return x_train, y_train

# balance train set
# x_train, y_train = balance_train_set(x_train_unb, y_train_unb)

x_train, y_train = x_train_unb, y_train_unb

### Filter features by variance and correlation 

In [8]:
from sklearn.feature_selection import VarianceThreshold

# check zero variance features
thresholder = VarianceThreshold(threshold=0)
print("Variables Kept after removing features with 0 variance: ", thresholder.fit_transform(x).shape[1])

# highly correlated features
corr = abs(pd.DataFrame(x).corr())
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
cols = [column for column in upper.columns if any(upper[column] < 0.9)]
print("Variables Kept after removing features with corr > 0.9: ", len(cols)) 

Variables Kept after removing features with 0 variance:  506
Variables Kept after removing features with corr > 0.9:  504


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(np.bool))


## Training and evaluating ML models

### SVM Classifier

In [10]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.svm import SVC

svm = SVC(kernel="rbf", class_weight='balanced', random_state = 0)

# cross validation
kf = KFold(n_splits=5)
accuracy, tpr, fpr = [], [], []
for train, test in kf.split(x_train):
    svm.fit(x_train[train, :], y_train[train])
    pred = svm.predict(x_train[test])
    tn, fp, fn, tp = confusion_matrix(y_train[test], pred).ravel()
    accuracy.append((tp + tn)/(tn + fp + fn + tp))
    #print("true positive: %.4f\ntrue negative: %.4f\nfalse positive: %.4f\nfalse negative: %.4f\n" % (tp, tn, fp, fn))
    tpr.append(tp / (tp + fn))
    fpr.append(fp / (fp + tn))

print("SVM KFold results")
print(f"Accuracy: {accuracy}")
print(f"True Positive Rate: {tpr}")
print(f"False Positive Rate: {fpr}")



SVM KFold results
Accuracy: [0.9867954911433172, 0.9874396135265701, 0.9845360824742269, 0.9871134020618557, 0.9900128865979382]
True Positive Rate: [0.9482758620689655, 0.9767441860465116, 0.9642857142857143, 0.9803921568627451, 0.9555555555555556]
False Positive Rate: [0.012471283229405973, 0.012410189418680601, 0.015091863517060367, 0.012774320340648543, 0.009480222294867604]


In [11]:
# Testing 
svm.fit(x_train, y_train)
pred = svm.predict(x_test)
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
accuracy = (tp + tn)/(tn + fp + fn + tp)
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)

print("SVM validation results")
print("Accuracy: %.2f" % (accuracy))
print("True Positive Rate: %.2f" % (tpr))
print("False Positive Rate: %.2f" % (fpr))

SVM validation results
Accuracy: 0.99
True Positive Rate: 0.98
False Positive Rate: 0.01


In [12]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

# cross validation
kf = KFold(n_splits=5)
accuracy, tpr, fpr = [], [], []
for train, test in kf.split(x_train):
    mlp.fit(x_train[train, :], y_train[train])
    pred = mlp.predict(x_train[test])
    tn, fp, fn, tp = confusion_matrix(y_train[test], pred).ravel()
    accuracy.append((tp + tn)/(tn + fp + fn + tp))
    tpr.append(tp / (tp + fn))
    fpr.append(fp / (fp + tn))


print("MLP KFold results")
print(f"Accuracy: {accuracy}")
print(f"True Positive Rate: {tpr}")
print(f"False Positive Rate: {fpr}")




MLP KFold results
Accuracy: [0.9967793880837359, 0.9967793880837359, 0.9945231958762887, 0.9948453608247423, 0.9987113402061856]
True Positive Rate: [0.8620689655172413, 0.9069767441860465, 0.8392857142857143, 0.7647058823529411, 0.9111111111111111]
False Positive Rate: [0.0006563833278634722, 0.001959503592423253, 0.0026246719160104987, 0.0013101867016049786, 0.0]




In [13]:
# Testing 
mlp.fit(x_train, y_train)
pred = mlp.predict(x_test)
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
accuracy = (tp + tn)/(tn + fp + fn + tp)
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)

print("MLP validation results")
print("Accuracy: %.2f" % (accuracy))
print("True Positive Rate: %.2f" % (tpr))
print("False Positive Rate: %.2f" % (fpr))



MLP validation results
Accuracy: 1.00
True Positive Rate: 0.87
False Positive Rate: 0.00


In [14]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(3)

# cross validation
kf = KFold(n_splits=5)
accuracy, tpr, fpr = [], [], []
for train, test in kf.split(x_train):
    knn.fit(x_train[train, :], y_train[train])
    pred = knn.predict(x_train[test])
    tn, fp, fn, tp = confusion_matrix(y_train[test], pred).ravel()
    accuracy.append((tp + tn)/(tn + fp + fn + tp))
    tpr.append(tp / (tp + fn))
    fpr.append(fp / (fp + tn))

print("KNN KFold results")
print(f"Accuracy: {accuracy}")
print(f"True Positive Rate: {tpr}")
print(f"False Positive Rate: {fpr}")

KNN KFold results
Accuracy: [0.9938808373590983, 0.9954911433172303, 0.9958118556701031, 0.9967783505154639, 0.9971005154639175]
True Positive Rate: [0.6896551724137931, 0.7209302325581395, 0.7678571428571429, 0.803921568627451, 0.8444444444444444]
False Positive Rate: [0.0003281916639317361, 0.0006531678641410843, 0.0, 0.0, 0.0006538084341288003]


In [15]:
# Testing 
knn.fit(x_train, y_train)
pred = knn.predict(x_test)
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
accuracy = (tp + tn)/(tn + fp + fn + tp)
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)

print("KNN validation results")
print("Accuracy: %.2f" % (accuracy))
print("True Positive Rate: %.2f" % (tpr))
print("False Positive Rate: %.2f" % (fpr))

KNN validation results
Accuracy: 1.00
True Positive Rate: 0.76
False Positive Rate: 0.00


So we can see that the models were able to learn by the features extracted with the preprocessing notebook. FOr the next steps, we will try to apply models to automatic extract these features and pass it to classification models.