In [1]:
import mne
import os
from glob import glob
import numpy as np
import pandas
import matplotlib.pyplot as plt

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas


In [2]:
all_file_path = glob('dataset/*.edf')
print(len(all_file_path))

28


In [3]:
healthy_file_path = [i for i in all_file_path if 'h' in i.split('\\')[1]]
patient_file_path = [i for i in all_file_path if 's' in i.split('\\')[1]]
print(len(healthy_file_path))
print(len(patient_file_path))

14
14


In [4]:
def read_data(file_path):
    data = mne.io.read_raw_edf(file_path, preload = True)
    data.set_eeg_reference()
    data.filter(l_freq = 0.5, h_freq = 45)
    epochs = mne.make_fixed_length_epochs(data, duration = 5, overlap = 1)
    array = epochs.get_data()
    return array

In [5]:
sample_data = read_data(healthy_file_path[0])

Extracting EDF parameters from f:\workspace\EEG-classification\dataset\h01.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 231249  =      0.000 ...   924.996 secs...
EEG channel type selected for re-referencing
Applying average reference.
Applying a custom ('EEG',) reference.
Filtering raw data in 1 contiguous segment
Setting up band-pass filter from 0.5 - 45 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandpass filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 0.50
- Lower transition bandwidth: 0.50 Hz (-6 dB cutoff frequency: 0.25 Hz)
- Upper passband edge: 45.00 Hz
- Upper transition bandwidth: 11.25 Hz (-6 dB cutoff frequency: 50.62 Hz)
- Filter length: 1651 samples (6.604 s)

Not setting metadata
231 matching events found
No baseline correction applied
0 projection items activa

In [6]:
sample_data.shape  # number of epochs, channels, length of signal

(231, 19, 1250)

In [7]:
%%capture
control_epochs_array = [read_data(i) for i in healthy_file_path]
patient_epochs_array = [read_data(i) for i in patient_file_path]

In [8]:
control_epochs_array[0].shape

(231, 19, 1250)

## Creating labels

In [9]:
control_epoch_labels = [[len(i)*[0]] for i in control_epochs_array]
patient_epoch_labels = [[len(i)*[1]] for i in patient_epochs_array]
print(len(control_epoch_labels), len(patient_epoch_labels))

14 14


In [10]:
# combining the data
data_list = control_epochs_array + patient_epochs_array  
label_list = control_epoch_labels + patient_epoch_labels

In [11]:
group_list = [[i]*len(j) for i,j in enumerate(data_list)]

In [12]:
len(group_list)

28

In [13]:
len(group_list[8])

226

In [14]:
data_array = np.vstack(data_list)
label_array = np.hstack(label_list)
group_array = np.hstack(group_list)
print(data_array.shape)
print(label_array.shape)
print(group_array.shape)

(7201, 19, 1250)
(1, 7201)
(7201,)


In [15]:
from scipy import stats
def mean(data):
    return np.mean(data,axis=-1)
    
def std(data):
    return np.std(data,axis=-1)

def ptp(data):
    return np.ptp(data,axis=-1)

def var(data):
        return np.var(data,axis=-1)

def minim(data):
      return np.min(data,axis=-1)


def maxim(data):
      return np.max(data,axis=-1)

def argminim(data):
      return np.argmin(data,axis=-1)


def argmaxim(data):
      return np.argmax(data,axis=-1)

def mean_square(data):
      return np.mean(data**2,axis=-1)

def rms(data): #root mean square
      return  np.sqrt(np.mean(data**2,axis=-1))  

def abs_diffs_signal(data):
    return np.sum(np.abs(np.diff(data,axis=-1)),axis=-1)


def skewness(data):
    return stats.skew(data,axis=-1)

def kurtosis(data):
    return stats.kurtosis(data,axis=-1)

def concatenate_features(data):
    return np.concatenate((mean(data),std(data),ptp(data),var(data),minim(data),maxim(data),argminim(data),argmaxim(data),
                          mean_square(data),rms(data),abs_diffs_signal(data),
                          skewness(data),kurtosis(data)),axis=-1)

In [18]:
# from tqdm import tqdm_notebook
# features=[]
# for data in tqdm_notebook(data_array):
#     features.append(concatenate_features(data))
# features=np.array(features)
# features.shape

In [19]:
features = []
for d in data_array:
    features.append(concatenate_features(d))

In [20]:
features_array = np.array(features)
features_array.shape

(7201, 247)

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold, GridSearchCV

In [25]:
clf = LogisticRegression()
gkf = GroupKFold(5)
pipe = Pipeline([('scaler',StandardScaler()),('clf',clf)])
param_grid = {'clf__C': [0.1,0.5, 0.7, 1, 3, 5, 7]}
gscv = GridSearchCV(pipe,param_grid,cv=gkf,n_jobs=12)
gscv.fit(features_array,label_array,groups=group_array)


ValueError: Found input variables with inconsistent numbers of samples: [7201, 1]

In [None]:
# clf=LogisticRegression()
# gkf=GroupKFold(n_splits=5)
# pipe=Pipeline([('scaler',StandardScaler()),('classifier',clf)])
# param_grid = {'clf_C': [0.1,0.5, 0.7, 1, 3, 5, 7]}
# gscv=GridSearchCV(pipe,param_grid,cv=gkf,n_jobs=16)
# gscv.fit(features,label_array,groups=group_array)