# Classification of seizures or non seizures by features from EEG data

## Data preparation

In [2]:
import os
import mne
import numpy as np
import pandas as pd
import glob

### Load .csv files

In [3]:
# CHANGE THIS PATH FOR THE FOLDER THAT CONTAINS THE .csv FILES OF SPECIFIC PREPROCESSED DATA
folder_path = os.path.join('processed_data','chb01_int10_ov00')

# Paths of all files. They are sorted because glob returns them in an
# arbitrary, filesystem-dependent order, and the row order of `dataset`
# determines which rows the seeded train/test split picks further down.
preproc_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
if not preproc_files:
    # Fail with a readable message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(f"No .csv files found in '{folder_path}'")

# Read every file and stack them into a single frame. ignore_index gives the
# combined frame a unique 0..n-1 row index instead of repeating each file's own.
dataset_list = [pd.read_csv(file_path) for file_path in preproc_files]
dataset = pd.concat(dataset_list, ignore_index=True)
dataset.head()


Unnamed: 0,start_time,FP1-F7_rms,FP1-F7_variance,FP1-F7_kurtosis,FP1-F7_skewness,FP1-F7_max_amp,FP1-F7_min_amp,FP1-F7_n_peaks,FP1-F7_n_crossings,FP1-F7_hfd,...,T8-P8-1_median_freq,T8-P8-1_peak_freq,T8-P8-1_hjorth_mobility,T8-P8-1_hjorth_complexity,T8-P8-1_power_1hz,T8-P8-1_power_5hz,T8-P8-1_power_10hz,T8-P8-1_power_15hz,T8-P8-1_power_20hz,seizure
0,0,7.7e-05,5.710358e-09,3.148057,0.466432,0.000326,-0.000265,192.0,162.0,0.110011,...,3.0,1.0,0.002766,171.222337,0.420823,0.226051,0.128492,0.106838,0.117797,0
1,1,7.2e-05,5.183641e-09,4.726739,0.330759,0.000326,-0.000265,192.0,168.0,0.107901,...,3.0,1.0,0.003211,148.493351,0.384135,0.234295,0.140616,0.115382,0.125572,0
2,2,6.9e-05,4.818348e-09,5.677232,0.423363,0.000326,-0.000265,202.0,184.0,0.117626,...,3.0,1.0,0.003564,142.209197,0.389655,0.238327,0.124545,0.10926,0.138213,0
3,3,7.1e-05,5.102686e-09,6.53423,0.724773,0.000418,-0.000265,202.0,177.0,0.120345,...,3.0,1.0,0.003514,140.840795,0.396989,0.232559,0.120853,0.118132,0.131467,0
4,4,7.7e-05,5.8734e-09,5.99093,0.941052,0.000418,-0.000265,199.0,167.0,0.112363,...,3.0,1.0,0.003494,142.43422,0.390794,0.230665,0.125717,0.119646,0.133177,0


### Exclude columns that are not useful as features

In [4]:
# Feature table: everything except the label and the bookkeeping columns.
# errors="ignore" makes a listed column that is absent a no-op, like the
# column-name masks this replaces.
x = dataset.drop(columns=["seizure", "start_time", "file ID"], errors="ignore")

# Label vector as a plain NumPy array.
y = dataset["seizure"].to_numpy()

### Separate features and labels and normalize features

In [5]:
from sklearn import preprocessing

# Rescale the feature values and report the resulting matrix shape.
# NOTE(review): normalize() is called with its defaults, which presumably
# scale each row (sample) to unit norm rather than each feature column —
# confirm this is the intended kind of normalization.
x = preprocessing.normalize(x)
print(x.shape)

# Class distribution: label value -> number of rows.
unique, counts = np.unique(y, return_counts=True)
print({label: n_rows for label, n_rows in zip(unique, counts)})

(31044, 506)
{0: 30539, 1: 505}


### Split into train and test sets

In [6]:
from sklearn.model_selection import train_test_split

# Hold out half of the rows as the test set; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified on y — with labels this imbalanced,
# consider whether stratification is wanted.
x_train_unb, x_test, y_train_unb, y_test = train_test_split(
    x, y, test_size=0.5, random_state=0
)

# Shapes of the four resulting arrays.
for split_part in (x_train_unb, y_train_unb, x_test, y_test):
    print(split_part.shape)

# Class counts of the training half, then of the test half.
for split_labels in (y_train_unb, y_test):
    unique, counts = np.unique(split_labels, return_counts=True)
    print(dict(zip(unique, counts)))


(15522, 506)
(15522,)
(15522, 506)
(15522,)
{0: 15271, 1: 251}
{0: 15268, 1: 254}


### Balancing training set

In [7]:
from sklearn.utils import resample

def balance_train_set(x_train_unb, y_train_unb, minority_ratio=0.5, random_state=123):
    """Oversample the seizure rows of a training set and shuffle the result.

    Rows labelled 0 are kept as they are. Rows labelled 1 are drawn with
    replacement until there are ``int(minority_ratio * n_majority)`` of them.
    Both groups are then concatenated and shuffled together, so every feature
    row stays paired with its own label.

    Parameters
    ----------
    x_train_unb : array-like of shape (n_samples, n_features)
        Feature matrix of the unbalanced training set.
    y_train_unb : array-like of shape (n_samples,)
        Labels of the unbalanced training set (0 = non-seizure, 1 = seizure).
    minority_ratio : float, default 0.5
        Number of seizure rows after resampling, as a fraction of the number
        of non-seizure rows.
    random_state : int or None, default 123
        Seed used for both the resampling and the final shuffle. Pass None
        for a different result on every call.

    Returns
    -------
    x_train : ndarray of shape (n_balanced, n_features)
    y_train : ndarray of shape (n_balanced,)
        Labels, in the dtype of ``y_train_unb``.

    Raises
    ------
    ValueError
        If the training set contains no row labelled 1.
    """
    features = np.asarray(x_train_unb)
    targets = np.asarray(y_train_unb)

    # Glue the labels on as one extra column so rows and labels move together.
    labels_train = np.hstack((features, np.expand_dims(targets, axis=1)))
    print(labels_train.shape)

    # The label was appended last, so find its column from the array width.
    # A fixed column number is only right for one particular feature count.
    label_col = labels_train.shape[1] - 1

    labels_majority = labels_train[labels_train[:, label_col] == 0]
    labels_minority = labels_train[labels_train[:, label_col] == 1]

    print("Before balancing:")
    print(np.shape(labels_majority))
    print(np.shape(labels_minority))

    if labels_minority.shape[0] == 0:
        raise ValueError("cannot oversample: the training set contains no seizure (label 1) rows")

    # One generator drives both the sampling and the shuffle, so the whole
    # function is repeatable for a given random_state.
    rng = np.random.RandomState(random_state)

    # Draw minority rows with replacement up to the requested size.
    n_minority = int(minority_ratio * labels_majority.shape[0])
    picked = rng.randint(0, labels_minority.shape[0], size=n_minority)
    labels_minority = labels_minority[picked]

    print("After balancing:")
    print(np.shape(labels_majority))
    print(np.shape(labels_minority))

    labels_balanced = np.concatenate((labels_majority, labels_minority))

    # Shuffle whole rows so the two classes are interleaved.
    labels_balanced = labels_balanced[rng.permutation(labels_balanced.shape[0])]
    x_train = labels_balanced[:, :label_col]
    # hstack promoted the labels to the feature dtype; give them their own dtype back.
    y_train = labels_balanced[:, label_col].astype(targets.dtype)

    print("Full data after balancing:")
    print(x_train.shape)
    print(y_train.shape)

    unique, counts = np.unique(y_train, return_counts=True)
    print(dict(zip(unique, counts)))

    return x_train, y_train

# Option A (disabled): train on the oversampled, balanced set.
# x_train, y_train = balance_train_set(x_train_unb, y_train_unb)

# Option B (active): train on the split exactly as produced above.
x_train = x_train_unb
y_train = y_train_unb

## Training and evaluating ML models

### SVM Classifier

In [9]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.svm import SVC

# RBF-kernel SVM; class_weight='balanced' is set to compensate for the rare seizure class.
svm = SVC(kernel="rbf", class_weight='balanced', random_state = 0)

# 5-fold cross validation on the training split only (the test split is not touched here).
kf = KFold(n_splits=5)
accuracy, tpr, fpr = [], [], []
for train, test in kf.split(x_train):
    svm.fit(x_train[train, :], y_train[train])
    pred = svm.predict(x_train[test])
    # labels=[0, 1] forces a 2x2 matrix. Without it, a fold in which only one
    # class appears yields a smaller matrix and the 4-way unpacking fails.
    tn, fp, fn, tp = confusion_matrix(y_train[test], pred, labels=[0, 1]).ravel()
    accuracy.append((tp + tn)/(tn + fp + fn + tp))
    #print("true positive: %.4f\ntrue negative: %.4f\nfalse positive: %.4f\nfalse negative: %.4f\n" % (tp, tn, fp, fn))
    tpr.append(tp / (tp + fn))
    fpr.append(fp / (fp + tn))

# Per-fold values first, then their means.
print("SVM KFold results")
print(f"Accuracy: {accuracy}")
print(f"True Positive Rate: {tpr}")
print(f"False Positive Rate: {fpr}")
print(f"Accuracy: {np.mean(accuracy)}")
print(f"True Positive Rate: {np.mean(tpr)}")
print(f"False Positive Rate: {np.mean(fpr)}")

SVM KFold results
Accuracy: [0.984219001610306, 0.985829307568438, 0.9867912371134021, 0.9861469072164949, 0.9900128865979382]
True Positive Rate: [1.0, 0.9454545454545454, 1.0, 0.9583333333333334, 1.0]
False Positive Rate: [0.01602878639188747, 0.013442622950819673, 0.013416230366492147, 0.013416230366492147, 0.010157273918741808]
Accuracy: 0.9865998680213158
True Positive Rate: 0.9807575757575758
False Positive Rate: 0.01329222879888665


In [16]:
# Testing: refit on the whole training split, then score once on the held-out split.
pred = svm.fit(x_train, y_train).predict(x_test)
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()

accuracy = (tn + tp) / (tn + fp + fn + tp)
tpr = tp / (fn + tp)
fpr = fp / (tn + fp)

print("SVM validation results")
print("Accuracy: %.4f" % accuracy)
print("True Positive Rate: %.4f" % tpr)
print("False Positive Rate: %.4f" % fpr)

SVM validation results
Accuracy: 0.9888
True Positive Rate: 0.9921
False Positive Rate: 0.0113


In [15]:
from sklearn.neural_network import MLPClassifier
# Two hidden layers of 10 units each, trained with Adam.
# random_state is fixed (same seed as the SVM above) so the weight
# initialisation and batch shuffling, and with them the reported numbers,
# are repeatable from run to run.
mlp = MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=2000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=0,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

# 5-fold cross validation on the training split only.
kf = KFold(n_splits=5)
accuracy, tpr, fpr = [], [], []
for train, test in kf.split(x_train):
    mlp.fit(x_train[train, :], y_train[train])
    pred = mlp.predict(x_train[test])
    # labels=[0, 1] forces a 2x2 matrix. Without it, a fold in which only one
    # class appears yields a smaller matrix and the 4-way unpacking fails.
    tn, fp, fn, tp = confusion_matrix(y_train[test], pred, labels=[0, 1]).ravel()
    accuracy.append((tp + tn)/(tn + fp + fn + tp))
    tpr.append(tp / (tp + fn))
    fpr.append(fp / (fp + tn))


# Per-fold values first, then their means.
print("MLP KFold results")
print(f"Accuracy: {accuracy}")
print(f"True Positive Rate: {tpr}")
print(f"False Positive Rate: {fpr}")
print(f"Accuracy: {np.mean(accuracy)}")
print(f"True Positive Rate: {np.mean(tpr)}")
print(f"False Positive Rate: {np.mean(fpr)}")

MLP KFold results
Accuracy: [0.9967793880837359, 0.9951690821256038, 0.9964561855670103, 0.9967783505154639, 0.9958118556701031]
True Positive Rate: [0.7916666666666666, 0.8909090909090909, 0.8125, 0.9166666666666666, 0.75]
False Positive Rate: [0.0, 0.0029508196721311475, 0.0006544502617801048, 0.001963350785340314, 0.0]
Accuracy: 0.9961989723923835
True Positive Rate: 0.8323484848484849
False Positive Rate: 0.0011137241438503131


In [20]:
# Testing: refit on the whole training split, then score once on the held-out split.
pred = mlp.fit(x_train, y_train).predict(x_test)
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()

accuracy = (tn + tp) / (tn + fp + fn + tp)
tpr = tp / (fn + tp)
fpr = fp / (tn + fp)

print("MLP validation results")
print("Accuracy: %.4f" % accuracy)
print("True Positive Rate: %.4f" % tpr)
print("False Positive Rate: %.4f" % fpr)

MLP validation results
Accuracy: 0.9977
True Positive Rate: 0.8819
False Positive Rate: 0.0003


In [13]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier; 3 is passed positionally as the neighbour count.
knn = KNeighborsClassifier(3)

# 5-fold cross validation on the training split only.
kf = KFold(n_splits=5)
accuracy, tpr, fpr = [], [], []
for train, test in kf.split(x_train):
    knn.fit(x_train[train, :], y_train[train])
    pred = knn.predict(x_train[test])
    # labels=[0, 1] forces a 2x2 matrix. Without it, a fold in which only one
    # class appears yields a smaller matrix and the 4-way unpacking fails.
    tn, fp, fn, tp = confusion_matrix(y_train[test], pred, labels=[0, 1]).ravel()
    accuracy.append((tp + tn)/(tn + fp + fn + tp))
    tpr.append(tp / (tp + fn))
    fpr.append(fp / (fp + tn))

# Per-fold values first, then their means.
print("KNN KFold results")
print(f"Accuracy: {accuracy}")
print(f"True Positive Rate: {tpr}")
print(f"False Positive Rate: {fpr}")
print(f"Accuracy: {np.mean(accuracy)}")
print(f"True Positive Rate: {np.mean(tpr)}")
print(f"False Positive Rate: {np.mean(fpr)}")

KNN KFold results
Accuracy: [0.9958132045088567, 0.9958132045088567, 0.9951675257731959, 0.9967783505154639, 0.9938788659793815]
True Positive Rate: [0.7708333333333334, 0.7818181818181819, 0.7291666666666666, 0.8541666666666666, 0.6346153846153846]
False Positive Rate: [0.0006542361792607131, 0.0003278688524590164, 0.0006544502617801048, 0.000981675392670157, 0.0]
Accuracy: 0.9954902302571508
True Positive Rate: 0.7541200466200466
False Positive Rate: 0.0005236461372339983


In [18]:
# Testing: refit on the whole training split, then score once on the held-out split.
pred = knn.fit(x_train, y_train).predict(x_test)
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()

accuracy = (tn + tp) / (tn + fp + fn + tp)
tpr = tp / (fn + tp)
fpr = fp / (tn + fp)

print("KNN validation results")
print("Accuracy: %.4f" % accuracy)
print("True Positive Rate: %.4f" % tpr)
print("False Positive Rate: %.4f" % fpr)

KNN validation results
Accuracy: 0.9959
True Positive Rate: 0.7717
False Positive Rate: 0.0003


So we can see that the models were able to learn from the features extracted by the preprocessing notebook. For the next steps, we will try to apply models that extract these features automatically and pass them to classification models.