In [3]:
from glob import glob
import os
import mne
import numpy as np
import pandas as pd
from mne.viz import plot_alignment, set_3d_view
# from pathlib import Path


# Exploring CHB-MIT dataset

## Reading edf files

In [2]:
# Patient whose recordings are explored in this section.
patient_id = 1
# NOTE(review): this points at a summary text file rather than a dataset folder and
# is not used by the glob below — kept unchanged because it is marked test-only.
DATASET_PATH = os.path.join(os.getcwd(), 'data', 'chb01-summary.txt') #ONLY FOR TESTING
# Zero-padded id as used in the folder/file names (1 -> '01').
patient_id_str = str(patient_id).zfill(2)

# glob() yields matches in arbitrary, filesystem-dependent order. Sort them so that
# all_file_path[0] and every per-file result below are the same on every run.
all_file_path = sorted(glob(f'chb{patient_id_str}/*.edf'))
print(len(all_file_path))

4


In [3]:
# Open the first recording and print what MNE reports about it.
first_edf_path = all_file_path[0]
raw = mne.io.read_raw_edf(first_edf_path)

channel_types = raw.get_channel_types()
print(channel_types)
print(raw.info)
print(raw.ch_names)

# Signal matrix of the whole recording; only its shape is shown.
signal = raw.get_data()
print(signal.shape)

Extracting EDF parameters from /home/guisoares/soares_repo/SCC0276-Machine-Learning/chb01/chb01_03.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
['eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg', 'eeg']
<Info | 7 non-empty values
 bads: []
 ch_names: FP1-F7, F7-T7, T7-P7, P7-O1, FP1-F3, F3-C3, C3-P3, P3-O1, ...
 chs: 23 EEG
 custom_ref_applied: False
 highpass: 0.0 Hz
 lowpass: 128.0 Hz
 meas_date: 2076-11-06 13:43:04 UTC
 nchan: 23
 projs: []
 sfreq: 256.0 Hz
>
['FP1-F7', 'F7-T7', 'T7-P7', 'P7-O1', 'FP1-F3', 'F3-C3', 'C3-P3', 'P3-O1', 'FP2-F4', 'F4-C4', 'C4-P4', 'P4-O2', 'FP2-F8', 'F8-T8', 'T8-P8-0', 'P8-O2', 'FZ-CZ', 'CZ-PZ', 'P7-T7', 'T7-FT9', 'FT9-FT10', 'FT10-T8', 'T8-P8-1']


  raw = mne.io.read_raw_edf(all_file_path[0])


(23, 921600)


In [4]:
def read_data_edf(file_path, duration=10, overlap=1):
    """Load one EDF recording and cut it into fixed-length epochs.

    Loading, re-referencing and band-pass filtering are delegated to
    ``read_edf_to_raw`` so both readers in this notebook share one
    preprocessing pipeline instead of two copies of it.

    Parameters
    ----------
    file_path : str
        Path to the ``.edf`` file.
    duration : float, default 10
        Epoch length passed to ``mne.make_fixed_length_epochs``.
    overlap : float, default 1
        Overlap between consecutive epochs passed to
        ``mne.make_fixed_length_epochs``.

    Returns
    -------
    epochs
        The object returned by ``mne.make_fixed_length_epochs``.
    epochs_array
        ``epochs.get_data()``; in this notebook its shape is
        (n_epochs, n_channels, n_samples).
    """
    raw = read_edf_to_raw(file_path)
    epochs = mne.make_fixed_length_epochs(raw, duration=duration, overlap=overlap)
    epochs_array = epochs.get_data()
    return epochs, epochs_array
# def read_data_seizures(file_path):
#     ref_file = open(file_path, 'r')
#     array=epochs.get_data()
#     return array

def read_edf_to_raw(file_path, l_freq=0.5, h_freq=45):
    """Load an EDF recording into memory, re-reference it and band-pass filter it.

    Parameters
    ----------
    file_path : str
        Path to the ``.edf`` file.
    l_freq : float, default 0.5
        Lower cut-off passed to ``raw.filter``.
    h_freq : float, default 45
        Upper cut-off passed to ``raw.filter``.

    Returns
    -------
    raw
        The preloaded MNE raw object after ``set_eeg_reference()`` (called with
        MNE's default arguments) and ``filter(l_freq, h_freq)`` were applied.
    """
    # preload=True: both steps below are applied to the loaded raw object.
    raw = mne.io.read_raw_edf(file_path, preload=True)
    raw.set_eeg_reference()
    raw.filter(l_freq=l_freq, h_freq=h_freq)
    return raw

In [5]:
%%capture

# Shape ()
epochs_data_list = [read_data_edf(i)[1] for i in all_file_path]
epochs_list = [read_data_edf(i)[0] for i in all_file_path]


In [6]:
# Stack the per-file epoch arrays along a new leading "file" axis.
epochs_data_array = np.asarray(epochs_data_list)
epochs_data_array.shape

(4, 399, 23, 2560)

In [7]:
# Each file is represented by one epochs object that holds all of its epochs.
first_file_epochs = epochs_list[0]
print(first_file_epochs.metadata)

None


In [8]:
# Merge all files into a single array by joining them along the epoch axis:
# (N_files, 399 epochs, 23 channels, 2560 values) -> (N_files*399 epochs, 23 channels, 2560 values)
data_array = np.concatenate(epochs_data_array, axis=0)
data_array.shape

(1596, 23, 2560)

## Extract features

In [11]:
########! Features extraction !#####
from scipy import stats
def mean(x):
    """Arithmetic mean of ``x`` taken over its last axis."""
    values = np.asarray(x)
    return values.mean(axis=-1)
def std(x):
    """Standard deviation of ``x`` taken over its last axis (NumPy defaults)."""
    values = np.asarray(x)
    return values.std(axis=-1)
def ptp(x):
    """Peak-to-peak range (maximum minus minimum) of ``x`` over its last axis."""
    highest = np.max(x, axis=-1)
    lowest = np.min(x, axis=-1)
    return highest - lowest
def var(x):
    """Variance of ``x`` taken over its last axis (NumPy defaults)."""
    values = np.asarray(x)
    return values.var(axis=-1)
def minim(x):
    """Smallest value of ``x`` along its last axis."""
    values = np.asarray(x)
    return values.min(axis=-1)
def maxim(x):
    """Largest value of ``x`` along its last axis."""
    values = np.asarray(x)
    return values.max(axis=-1)
def argminim(x):
    """Position, along the last axis of ``x``, at which the minimum occurs."""
    values = np.asarray(x)
    return values.argmin(axis=-1)
def argmaxim(x):
    """Position, along the last axis of ``x``, at which the maximum occurs."""
    values = np.asarray(x)
    return values.argmax(axis=-1)
def rms(x):
    """Root mean square of ``x`` over its last axis."""
    squared = x * x
    mean_square = np.mean(squared, axis=-1)
    return np.sqrt(mean_square)
def abs_diff_signal(x):
    """Sum of absolute differences between consecutive values along the last axis."""
    steps = np.diff(x, axis=-1)
    step_sizes = np.abs(steps)
    return np.sum(step_sizes, axis=-1)
def skewness(x):
    """Sample skewness of ``x`` along its last axis, using SciPy's default settings."""
    skew_values = stats.skew(x, axis=-1)
    return skew_values
def kurtosis(x):
    """Kurtosis of ``x`` along its last axis, using SciPy's default settings."""
    kurtosis_values = stats.kurtosis(x, axis=-1)
    return kurtosis_values
def concatenate_features(x):
    """Compute all twelve statistics of ``x`` and join them along the last axis.

    Every statistic reduces the last axis of ``x``, and the results are
    concatenated in this order: mean, std, ptp, var, min, max, argmax, argmin,
    rms, abs-diff, skewness, kurtosis. For a 2-D ``x`` with one row per channel
    the result is therefore feature-major: all channels' means come first, then
    all channels' standard deviations, and so on.
    """
    extractors = (mean, std, ptp, var, minim, maxim, argmaxim,
                  argminim, rms, abs_diff_signal, skewness, kurtosis)
    per_feature = tuple(extract(x) for extract in extractors)
    return np.concatenate(per_feature, axis=-1)

In [12]:
# One flat feature vector per epoch, stacked into a 2-D matrix (one row per epoch).
features = [concatenate_features(epoch) for epoch in data_array]

features_array = np.array(features)
features_array.shape

(1596, 276)

## Visualize features per channel and per epoch

In [13]:
# get features of specific channel
def features_by_channel(features_array, n_channels=23):
    """Split a flat feature matrix into one DataFrame per channel.

    ``concatenate_features`` lays its output out feature-major: the means of
    all channels first, then the standard deviations of all channels, and so
    on. The value of statistic ``f`` for channel ``c`` therefore sits in column
    ``f * n_channels + c``, so the features of one channel are every
    ``n_channels``-th column starting at ``c``. (Taking 12 contiguous columns
    per channel, as done previously, returned one statistic for 12 different
    channels under the wrong column names.)

    Parameters
    ----------
    features_array : array-like, shape (n_rows, >= n_channels * 12)
        Rows produced by ``concatenate_features``. Columns beyond the feature
        block (e.g. an appended label) are ignored.
    n_channels : int, default 23
        Number of channels encoded in each row.

    Returns
    -------
    list of pandas.DataFrame
        ``n_channels`` frames of shape (n_rows, 12), indexed by channel.

    Raises
    ------
    ValueError
        If ``features_array`` is not 2-D or has too few columns.
    """
    # Same order in which concatenate_features emits the statistics.
    columns = ['Mean', 'Std', 'Ptp', 'Var', 'Minimo', 'Maxim', 'Arg Max', 'Arg Min',
               'RMS', 'ABS DIFF', 'skewness', 'kurtosis']
    features_array = np.asarray(features_array)
    n_expected = n_channels * len(columns)
    if features_array.ndim != 2 or features_array.shape[1] < n_expected:
        raise ValueError(
            f"expected a 2-D array with at least {n_expected} columns, "
            f"got shape {features_array.shape}")

    feature_block = features_array[:, :n_expected]
    # Stride by n_channels to pick this channel's entry out of every statistic.
    return [pd.DataFrame(feature_block[:, channel::n_channels], columns=columns)
            for channel in range(n_channels)]

# features_channel is a list with one DataFrame per channel: 23 frames of shape (1596, 12).
features_channel = features_by_channel(features_array)
print('Info for channel 23: ')

# Channel 23 is the last entry (index 22).
channel_23_features = features_channel[22]
channel_23_features.shape

Info for channel 23: 


(1596, 12)

# Preprocessing and labelling data

In [15]:
# patient_folder = Path.cwd()/'chbmit'/'chb{}'.format(str(patient_id).zfill(2))
# Root folder that contains one 'chbXX' sub-folder per patient.
DATASET_PATH = os.getcwd() #ONLY FOR TESTING

patient_id = 1
# Zero-padded id as used in the folder/file names (1 -> '01').
patient_id_str = str(patient_id).zfill(2)

patient_folder = os.path.join(DATASET_PATH, f'chb{patient_id_str}')
print(f"Serching for patient {patient_id_str} files at {patient_folder} ...")
# glob() returns matches in arbitrary order; sort so the rows of the dataset built
# from these files come out in the same order on every run.
patient_files = sorted(glob(os.path.join(patient_folder, '*.edf')))
print(f"Found {len(patient_files)} files")
summary_path = os.path.join(patient_folder, f'chb{patient_id_str}-summary.txt')

Serching for patient 01 files at /home/guisoares/soares_repo/SCC0276-Machine-Learning/chb01 ...
Found 4 files


In [None]:
## READ SUMMARY (TODO: automate this)
# Seizure intervals per recording as [start, end] pairs, keyed by the file name
# without its .edf extension. Presumably in seconds from the start of the file
# (they are compared against raw.times-based window bounds) — verify against the
# summary file.
seizures_dict = {"chb01_03": [[2996, 3036]],
                "chb01_04": [[1467, 1494]],
                "chb01_15": [[1732, 1772]],
                "chb01_16": [[1015, 1066]],
                "chb01_18": [[1720, 1810]],
                "chb01_21": [[327, 420]],
                "chb01_26": [[1862, 1963]],
                
                "chb02_16": [[130, 212]],

                "chb05_06": [[417, 532]], 
                "chb05_13": [[1086, 1196]],
                "chb05_16": [[2317, 2413]], 
                "chb05_17": [[2451, 2571]],
                "chb05_22": [[2348, 2465]],
                
                "chb08_02": [[2670, 2841]], 
                "chb08_05": [[2856, 3046]],
                "chb08_11": [[2988, 3211]], 
                "chb08_13": [[2417, 2577]],
                "chb08_21": [[2083, 2347]]}

curr_time = 0
epoch_time = 10  # window length
overlap = 5      # overlap between consecutive windows (step = epoch_time - overlap)

# Divide into epochs.
# Each entry of `labels` is one window: its feature vector followed by a 0/1 seizure flag.
labels = []
for file_path in patient_files:
    # chbxx_xx.edf -> chbxx_xx, the key format used by seizures_dict
    filename = os.path.splitext(os.path.basename(file_path))[0]
    file_seizures = seizures_dict.get(filename, [])  # no entry -> file has no seizure

    # read raw
    raw = read_edf_to_raw(file_path)

    # Latest admissible window start. Computed once per file: evaluating
    # max(raw.times) in the loop condition rescanned every sample on each window.
    last_start = raw.times.max() + 0.01 - epoch_time

    curr_time = 0
    while curr_time <= last_start:
        start_time = curr_time
        end_time = start_time + epoch_time
        start, stop = raw.time_as_index([start_time, end_time])
        temp = raw[:, start:stop][0]

        # features
        epoch_features = list(concatenate_features(temp))

        # Seizure flag for y: 1 when the window overlaps any seizure interval.
        # A plain interval-overlap test also catches a seizure lying entirely
        # inside the window, which checking only the window's two end points missed.
        is_seizure = any(start_time < seizure_end and end_time > seizure_start
                         for seizure_start, seizure_end in file_seizures)
        epoch_features.append(int(is_seizure))

        labels.append(epoch_features)

        curr_time = curr_time + epoch_time - overlap  
        print("Section ", str(len(labels)), "; start: ", start, " ; stop: ", stop)

In [37]:
# (number of windows, number of features + 1 label column)
np.shape(labels)

(2876, 277)

### Save to npy

In [39]:
# np.savetxt("chb01_10i_5o_wlabel.txt", np.array(labels), delimiter=" ")
# Persist the feature + label matrix; it is reloaded below as "chb01_10i_5o_wlabel.npy".
labelled_windows = np.array(labels)
np.save("chb01_10i_5o_wlabel", labelled_windows)

### Load npy

In [7]:
# Reload the saved matrix: the first 276 columns are features, column 276 is the label.
labels = np.load("chb01_10i_5o_wlabel.npy")
x = labels[:, :276]
y = labels[:, 276]
print(x.shape)
print(y.shape)

(2876, 276)
(2876,)


### Filter features by variance and correlation 

In [59]:
from sklearn.feature_selection import VarianceThreshold

# # check zero variance features
# thresholder = VarianceThreshold(threshold=0)
# print("Variables Kept after removing features with 0 variance: ", thresholder.fit_transform(x).shape[1])

# # highly correlated features
# corr = abs(x.corr())
# upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(np.bool))
# cols = [column for column in upper.columns if any(upper[column] < 0.9)]
# print("Variables Kept after removing features with corr > 0.9: ", len(cols)) 

# normalize features



### Normalize features

In [8]:
from sklearn import preprocessing

# Rescale every sample (row) to unit L2 norm; these are normalize()'s defaults written out.
# NOTE(review): this normalizes rows, not feature columns, so large-valued features
# (e.g. the arg-min/arg-max indices) dominate each row — confirm that per-sample
# scaling rather than per-feature scaling is what is intended here.
x = preprocessing.normalize(x, norm='l2', axis=1)
print(x.shape)

(2876, 276)


### Split features and labels in train and test

In [9]:
from sklearn.model_selection import train_test_split
# The seizure class is tiny (17 of 2876 windows), so an unstratified random split can
# leave the test set with no seizures at all. Stratify on y to keep the class ratio in
# both subsets, and fix the seed so the split survives Restart & Run All.
RANDOM_STATE = 42
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, stratify=y, random_state=RANDOM_STATE)

In [12]:
# Class balance: how many windows carry each label value.
unique, counts = np.unique(y, return_counts=True)
class_counts = dict(zip(unique, counts))
print(class_counts)

{0.0: 2859, 1.0: 17}
