## UPenn and Mayo Clinic's Seizure Detection Challenge
https://www.kaggle.com/c/seizure-detection/data

# Data set description: 

* 8 Patients

* 4 Dogs

For each subject:
  * Different numbers of channels 
  * Different sampling rates from 500 Hz to 5,000 Hz
  * Different number of samples


Each data set consists of

  * 1-second EEG clips labeled "Ictal" for seizure data segments, or "Interictal" for non-seizure data segments

  * A matrix of EEG sample values arranged as [n_channel, n_time_points]
  
 

# Before running this notebook, the data set must be downloaded to a Google Drive folder
Run this notebook first: *Download_visualize_data.ipynb*

In [0]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
import os
import glob
import re
from scipy.io import loadmat
from scipy.signal import resample
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from sklearn import metrics
from tensorflow.keras.callbacks import EarlyStopping


# Load folder where files are stored:

In [4]:
# Mount Google Drive at /content/drive so the data files can be read
# from the Colab session.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# List all files in folder for each subject

In [0]:
def file_list(folder_path, output=False):
    """Return the sorted list of paths matching a glob pattern.

    Parameters
    ----------
    folder_path : str
        Glob pattern passed to ``glob.glob`` (for example ``'<dir>/*'``).
    output : bool, default False
        When True, print the number of matches followed by a
        pretty-printed listing of them.

    Returns
    -------
    list of str
        Matching paths in ascending lexicographic order; empty when
        nothing matches.
    """
    matches = sorted(glob.glob(folder_path))

    if output:
        # pprint is imported locally because the module-level imports
        # do not provide it.
        from pprint import pprint

        print(str(len(matches)) + " files found")
        pprint(matches)

    return matches

# Load sample data to check how it looks


In [0]:
# Load one ictal and one interictal clip of Patient_1 as DataFrames so
# they can be inspected. `get_data` and `mat_to_df` are defined further
# down in this file and must have been executed before this cell.
ictal_list, interictal_list = get_data('Patient_1')
# NOTE(review): index 1 selects the second entry of each list even though
# the variables are named `first_*` — confirm this is intended.
first_ictal_file = ictal_list[1]
first_interictal_file = interictal_list[1]

# output=True makes mat_to_df display the head of each DataFrame.
print('Ictal')
upenn_seizure_df, upenn_seizure_freq = mat_to_df(first_ictal_file, output=True)
print('Interictal')
upenn_baseline_df, upenn_baseline_freq = mat_to_df(first_interictal_file, output=True)

# Use the MNE library to visualize the data; the data must first be loaded as a DataFrame

from: https://github.com/Eldave93/Seizure-Detection-Tutorials

In [0]:
def mat_to_df(file_path, output = False):
  """Load one clip from a .mat file into a DataFrame.

  The file must provide the variables 'data', 'channels' and 'freq'.
  The 'data' matrix is labelled row-wise with the channel names and then
  transposed, so the returned frame has one column per channel. Columns
  whose values never differ from their first row are dropped.

  Parameters:
    file_path: path of the .mat file to read.
    output: when True, display the first rows of the resulting frame.

  Returns:
    (frame, freq) where freq is the first row of the file's 'freq'
    variable.
  """
  contents = loadmat(file_path)
  samples, channel_struct, freq = (contents['data'],
                                   contents['channels'],
                                   contents['freq'][0])

  # Each entry of the channel struct wraps its name in a one-element array.
  names = [entry[0] for entry in channel_struct[0][0]]

  frame = pd.DataFrame(samples, index=names).T

  # Keep only the columns holding at least one value that differs from
  # the value in their first row.
  varies = (frame != frame.iloc[0]).any()
  frame = frame.loc[:, varies]

  if output:
    display(frame.head())

  return frame, freq

In [0]:
# Set MNE's logging level to WARNING.
# NOTE(review): `mne` is not among the imports visible in this file —
# confirm it is imported before this cell runs.
mne.set_log_level('WARNING')

def mne_object(data, freq):
  """Wrap a clip DataFrame into MNE Raw objects.

  Parameters:
    data: DataFrame with one column per channel; values are taken to be
      in microvolts and are rescaled to volts.
    freq: sampling frequency handed to MNE as `sfreq`.

  Returns:
    (raw, filtered): the RawArray built from the data, and a copy of it
    on which `filter(1, 70)` has been applied.
  """
  channel_names = list(data.columns)
  info = mne.create_info(ch_names=channel_names,
                         sfreq=freq,
                         ch_types=['eeg'] * data.shape[-1])

  # The data needs to be in volts rather than microvolts, with the
  # channels on the rows.
  volts_by_channel = data.apply(lambda column: column * 1e-6).transpose()

  raw = mne.io.RawArray(volts_by_channel, info)

  # Filter a copy so the unfiltered object stays available to the caller.
  filtered = raw.copy()
  filtered.filter(1, 70)

  return raw, filtered


# Keyword arguments shared by both plot calls below.
plot_kwargs = {
    'scalings': dict(eeg=20e-5),   # zooms the plot out
    'highpass': 0.5,              # filters out low frequencies
    'lowpass': 70.,                # filters out high frequencies
    'show_scrollbars': False,
    'show': True
}

# mne_object returns (raw, filtered copy); only the unfiltered raw object
# of each clip is plotted, the filtered copies are kept but not used here.
print('Interictal')
upenn_baseline_mne, upp_filt= mne_object(upenn_baseline_df, upenn_baseline_freq)
upenn_baseline_mne.plot(**plot_kwargs);
print()
print('Ictal')
upenn_seizure_mne , sei_filt = mne_object(upenn_seizure_df, upenn_seizure_freq)
upenn_seizure_mne.plot(**plot_kwargs)

# Read dataset names and create ictal and interictal lists

In [0]:
def get_data(subject):
  """Collect the labelled clip files of one subject.

  Lists every entry of the subject's clip folder on the mounted drive
  and sorts it into seizure and non-seizure clips by file name.

  Parameters:
    subject: name of the subject folder, e.g. 'Patient_1' or 'Dog_1'.

  Returns:
    (ictal_list, interictal_list): sorted paths of the files whose name
    contains 'ictal' but not 'interictal', and of the files whose name
    contains 'interictal'. Files whose name contains neither are skipped.
  """
  data_dir = os.path.join(os.getcwd(), 'drive','My Drive','deep_learning','eeg','Volumes', 'Seagate',
                            'seizure_detection', 'competition_data',
                            'clips', subject)

  all_list = file_list(os.path.join(data_dir, '*'), output=False)

  ictal_list = []
  interictal_list = []

  for path in all_list:
    # Classify on the file name only, so that a directory component can
    # never influence the label.
    name = os.path.basename(path)
    # 'interictal' contains 'ictal', so it has to be tested first.
    if 'interictal' in name:
      interictal_list.append(path)
    elif 'ictal' in name:
      ictal_list.append(path)
  return ictal_list, interictal_list

# Load *mat* files, standardize them and resample them to avoid memory problems

In [0]:
def parse_input_data(data_list, data_type, n_samples=256):
  """Load, resample and standardize a list of clip files.

  Each file's 'data' matrix is resampled along axis 1 to `n_samples`
  points and then standardized with the mean and standard deviation of
  the whole resampled clip (all channels together).

  Parameters:
    data_list: paths of the .mat files to load.
    data_type: 'ictal' labels every clip 1; any other value labels 0.
    n_samples: number of points each clip is resampled to along axis 1.

  Returns:
    (X, y): array stacking the processed clips in input order, and an
    array holding one label per clip.
  """
  y_value = int(data_type == 'ictal')

  print('Loading data')

  X = []

  for filename in data_list:
    raw = loadmat(filename)['data']
    resampled = resample(raw, n_samples, axis=1)
    spread = resampled.std()
    if np.ptp(raw) == 0 or spread == 0:
      # A flat clip has no variance to scale by; emit zeros rather than
      # dividing by zero and feeding NaN/inf into training.
      normalised = np.zeros_like(resampled)
    else:
      normalised = (resampled - resampled.mean()) / spread
    X.append(normalised)

  y = np.full(len(X), y_value)
  X = np.array(X)

  return X,y

# Split data set in train and test



In [0]:
def get_split_data(ictal_list, interictal_list):
  """Build the labelled data set of one subject and split it.

  Both file lists are loaded through `parse_input_data`, stacked with
  the ictal clips first, axes 1 and 2 of the feature array are swapped,
  and the result is split 80/20 with a fixed random state.

  Parameters:
    ictal_list: paths of the seizure clips.
    interictal_list: paths of the non-seizure clips.

  Returns:
    X_train, X_test, y_train, y_test
  """
  seizure_X, seizure_y = parse_input_data(ictal_list, 'ictal')
  baseline_X, baseline_y = parse_input_data(interictal_list, 'interictal')

  features = np.concatenate([seizure_X, baseline_X], axis=0)
  labels = np.concatenate([seizure_y, baseline_y], axis=0)

  # Exchange axes 1 and 2 of every clip before splitting.
  features = np.swapaxes(features, 1, 2)

  splits = train_test_split(features, labels, test_size=0.2, random_state=0)
  X_train, X_test, y_train, y_test = splits

  return X_train, X_test, y_train, y_test

# Over sampling the data to fix unbalanced dataset

In the end, it was not used for training because it did not show an improvement.

In [0]:
def over_saple_train(X_train, y_train):
  """Oversample the training clips with SMOTE.

  Every clip is flattened to a single feature vector so that SMOTE is
  fitted once on whole clips. Fitting it separately per channel would
  interpolate each channel of a synthetic clip between different
  neighbours, producing clips whose channels do not belong together.

  Parameters:
    X_train: 3-D array whose first axis indexes the clips.
    y_train: one class label per clip.

  Returns:
    (X_s, y_res): the resampled clips, restored to the trailing two
    dimensions of `X_train`, and their labels.
  """
  n_clips, dim_1, dim_2 = X_train.shape

  flat = X_train.reshape(n_clips, dim_1 * dim_2)
  flat_res, y_res = SMOTE(random_state=42).fit_resample(flat, y_train)

  # Undo the flattening; the first axis grows by the synthetic clips.
  X_s = flat_res.reshape(-1, dim_1, dim_2)
  return X_s, y_res


# For each subject a model with the same configuration was trained

1.   2 LSTM layers with 128 cells each
2.   1 dropout layer
3.   1 fully connected layer for classification output 



In [0]:
def train_model(X_train, y_train,X_test, y_test):
  """Build and fit the LSTM classifier for one subject.

  The network is LSTM(128, returning sequences) -> Dropout(0.4) ->
  LSTM(128) -> Dense(1, sigmoid), compiled with binary cross-entropy,
  the Adam optimizer and the accuracy metric. It is fitted with batch
  size 1 for at most 100 epochs without shuffling.

  Parameters:
    X_train, y_train: training clips and labels; axes 1 and 2 of
      X_train define the input shape of the first layer.
    X_test, y_test: passed to `fit` as validation data; the early
      stopping callback monitors 'val_accuracy' on them.

  Returns:
    The fitted Keras model.
  """
  n_batch = 1
  epochs = 100
  sequence_shape = (X_train.shape[1], X_train.shape[2])

  model = Sequential([
      LSTM(128, input_shape=sequence_shape, return_sequences=True),
      Dropout(rate=0.4),
      LSTM(128),
      Dense(1, activation='sigmoid'),
  ])
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  # Early stopping on the validation accuracy, with a patience of 10 epochs.
  stopper = EarlyStopping(monitor='val_accuracy',
                          min_delta=0,
                          patience=10,
                          verbose=1,
                          mode='auto')

  model.fit(X_train, y_train,
            batch_size=n_batch,
            epochs=epochs,
            verbose=1,
            validation_data=(X_test, y_test),
            shuffle=False,
            callbacks=[stopper])

  return model

In [0]:
# Train one model per subject and collect the test AUC of each.
# NOTE(review): 'Patient_2' is absent from this list although the data set
# description mentions 8 patients — confirm it is excluded on purpose.
subjects = ['Patient_1', 'Patient_3', 'Patient_4', 'Patient_5', 'Patient_6', 'Patient_7', 'Patient_8',
            'Dog_1', 'Dog_2', 'Dog_3', 'Dog_4']

AUC = []
for subject in subjects:

  ictal_list, interictal_list = get_data(subject)

  X_train, X_test, y_train, y_test = get_split_data(ictal_list, interictal_list)

  # Oversampling is deliberately left disabled.
  #X_s, y_res = over_saple_train(X_train, y_train)

  model = train_model(X_train, y_train,X_test, y_test)

  # Predicted seizure probabilities, and the class labels at the 0.5 cut.
  proba = model.predict(X_test)
  pred = (proba > 0.5).astype("int32")

  # The ROC curve must be built from the continuous scores: thresholded
  # 0/1 labels collapse it to a single operating point and distort the AUC.
  fpr, tpr, thresholds = metrics.roc_curve(y_test, proba.ravel())
  auc = metrics.auc(fpr, tpr)
  AUC.append(auc)




Loading data
Loading data
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 00015: early stopping
Loading data
Loading data
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100

# Comparison with the Kaggle competition leaderboard:

The winner got a value for mean AUC of **0.96287**

He used:

* Combination of  FFT with time and frequency correlation, taking both correlation coefficients and eigenvalues
* RandomForestClassifier

https://github.com/MichaelHills/seizure-detection