In [2]:
import mne
import numpy as np
import glob
import os
from utils.dataloader import read_edf_to_raw, calculate_neg_pos

In [1]:
# Seizure annotations keyed by recording name (the EDF file name without its
# extension). Each value is a list of [start, end] bounds, one pair per seizure
# in that recording; the epoch loops compare these bounds against the epoch
# start/end times (presumably seconds from the start of the recording).
seizures_dict = {
    # patient chb01
    "chb01_03": [[2996, 3036]],
    "chb01_04": [[1467, 1494]],
    "chb01_15": [[1732, 1772]],
    "chb01_16": [[1015, 1066]],
    "chb01_18": [[1720, 1810]],
    "chb01_21": [[327, 420]],
    "chb01_26": [[1862, 1963]],
    # patient chb02
    "chb02_16": [[130, 212]],
    # patient chb05
    "chb05_06": [[417, 532]],
    "chb05_13": [[1086, 1196]],
    "chb05_16": [[2317, 2413]],
    "chb05_17": [[2451, 2571]],
    "chb05_22": [[2348, 2465]],
    # patient chb08
    "chb08_02": [[2670, 2841]],
    "chb08_05": [[2856, 3046]],
    "chb08_11": [[2988, 3211]],
    "chb08_13": [[2417, 2577]],
    "chb08_21": [[2083, 2347]],
}

## Unbalanced dataset

In [5]:
################ INPUT YOUR DATASET PATH HERE
# DATASET_PATH = os.path.join(os.getcwd(), 'data') #ONLY FOR TESTING
DATASET_PATH = "/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit/chb-mit-scalp-eeg-database-1.0.0"
NEW_DATASET_PATH = "/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit-segments/0"

################ CHOOSE THE FILES THAT YOU WANT HERE
selected_files = {
                'chb01': ['03','04','15','16','18','21','26'],
                'chb02': ['16'],
                'chb05': ['06', '13', '16', '17', '22'],
                'chb08': ['02','05','11','13','21']
                }

files = [os.path.join(DATASET_PATH, folder, f"{folder}_{fn}.edf") for folder, fn_list in selected_files.items() for fn in fn_list]

for file_path in files:
    print(file_path)

# parameters for epochs generation
epoch_time = 100
overlap = 0

# The window advances by (epoch_time - overlap); a non-positive step would
# never terminate, so reject it up front.
if not 0 <= overlap < epoch_time:
    raise ValueError("overlap must satisfy 0 <= overlap < epoch_time")

# np.savez does not create missing directories.
os.makedirs(NEW_DATASET_PATH, exist_ok=True)

# Split every selected recording into fixed-length epochs, label each epoch
# 1 if it overlaps an annotated seizure (0 otherwise), and save one .npz file
# per recording (arrays are stored positionally: arr_0 = signals, arr_1 = labels).
for file_path in files:
    # recording name without directory and extension: chbxx_xx
    filename = os.path.splitext(os.path.basename(file_path))[0]

    raw = read_edf_to_raw(file_path)

    # raw.times is increasing, so its last entry is the maximum; reading it
    # once avoids scanning the whole time vector on every loop iteration.
    last_time = raw.times[-1]
    # recordings without an annotation simply have no seizure intervals
    file_seizures = seizures_dict.get(filename, [])

    signals = []
    labels = []

    curr_time = 0
    while curr_time <= last_time - epoch_time:

        # epoch window [start_time, end_time)
        start_time = curr_time
        end_time = curr_time + epoch_time

        start, stop = raw.time_as_index([start_time, end_time])
        epoch = raw[:, start:stop][0]
        signals.append(epoch)

        # Standard interval-overlap test. The previous chain of strict
        # comparisons missed an epoch that starts exactly at seizure onset
        # and contains the seizure end.
        has_seizure = any(
            start_time < seizure[1] and end_time > seizure[0]
            for seizure in file_seizures
        )
        labels.append([1] if has_seizure else [0])

        # calculate next current time
        curr_time = curr_time + epoch_time - overlap

    new_filename = filename + "_segments.npz"
    new_file_path = os.path.join(NEW_DATASET_PATH, new_filename)

    print(f"Saving file {new_filename}...")
    np.savez(new_file_path, signals, labels)

# Kept for interactive inspection: these hold the epochs/labels of the LAST
# processed recording only.
signals = np.array(signals)
labels = np.array(labels)


/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit/chb-mit-scalp-eeg-database-1.0.0/chb01/chb01_03.edf
/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit/chb-mit-scalp-eeg-database-1.0.0/chb01/chb01_04.edf
/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit/chb-mit-scalp-eeg-database-1.0.0/chb01/chb01_15.edf
/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit/chb-mit-scalp-eeg-database-1.0.0/chb01/chb01_16.edf
/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit/chb-mit-scalp-eeg-database-1.0.0/chb01/chb01_18.edf
/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit/chb-mit-scalp-eeg-database-1.0.0/chb01/chb01_21.edf
/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit/chb-mit-scalp-eeg-database-1.0.0/chb01/chb01_26.edf
/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit/chb-mit-scalp-eeg-database-1.0.0/chb02/chb02_16.edf
/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit/chb-mit-scalp-eeg-database-1.0.0/chb05/chb05_06.edf
/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit/chb-mit-scal

  raw = mne.io.read_raw_edf(file_path, preload=True, verbose=0)


KeyboardInterrupt: 

## Balanced Dataset by Undersampling

In [10]:
import random

In [13]:
def calculate_neg_pos(y):
    """Count the negative (0) and positive (1) labels in ``y``.

    NOTE: this definition shadows the ``calculate_neg_pos`` imported from
    ``utils.dataloader`` in the first cell of the notebook.

    Parameters
    ----------
    y : array-like
        Labels; every element is counted regardless of the array's shape.

    Returns
    -------
    tuple
        ``(neg, pos)``: the number of elements equal to 0 and to 1. A class
        that does not occur yields 0 (previously this raised ``KeyError``).
    """
    unique, counts = np.unique(y, return_counts=True)
    dic = dict(zip(unique.tolist(), counts.tolist()))
    # .get keeps the function usable on single-class (or empty) label arrays
    neg = dic.get(0, 0)
    pos = dic.get(1, 0)
    return neg, pos

def custom_fit_resample(x, y, rate_p=0.3):
    """Randomly undersample the negative class until positives make up about ``rate_p``.

    The number of negatives kept is ``floor((1 - rate_p) / rate_p * pos)``,
    where ``pos`` is the number of labels equal to 1. If there are already
    that many negatives or fewer, the inputs are returned unchanged. The
    relative order of the remaining samples is preserved.

    All rows to drop are chosen in a single draw and removed with one
    ``np.delete`` call, instead of deleting one random row at a time (which
    copied both arrays and recounted the labels on every removal).

    Parameters
    ----------
    x : array-like
        Samples, indexed along axis 0.
    y : array-like
        One label (0 or 1) per sample, shaped ``(n,)`` or ``(n, 1)``.
    rate_p : float, default 0.3
        Target fraction of positive samples, in the interval (0, 1].

    Returns
    -------
    tuple of numpy.ndarray
        ``(x_resampled, y_resampled)``.

    Raises
    ------
    ValueError
        If ``rate_p`` is outside (0, 1].

    Notes
    -----
    Rows are drawn with the standard ``random`` module; call ``random.seed``
    beforehand for a reproducible selection. With no positive labels the
    target number of negatives is 0, so every negative sample is removed.
    """
    if not 0 < rate_p <= 1:
        raise ValueError("rate_p must be in the interval (0, 1]")

    x = np.asarray(x)
    y = np.asarray(y)

    flat_labels = y.reshape(-1)
    neg_indices = np.flatnonzero(flat_labels == 0)
    pos = int(np.count_nonzero(flat_labels == 1))

    # largest number of negatives that still respects the requested ratio
    new_neg = (1 - rate_p) / rate_p * pos
    n_drop = neg_indices.size - int(new_neg)
    if n_drop <= 0:
        return x, y

    drop = random.sample(neg_indices.tolist(), n_drop)
    return np.delete(x, drop, 0), np.delete(y, drop, 0)


In [26]:
################ INPUT YOUR DATASET PATH HERE
# DATASET_PATH = os.path.join(os.getcwd(), 'data') #ONLY FOR TESTING
DATASET_PATH = "/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit/chb-mit-scalp-eeg-database-1.0.0"
NEW_DATASET_PATH = "/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit-segments/1"


################ CHOOSE THE FILES THAT YOU WANT HERE
selected_files = {
                'chb01': ['03','04','15','16','18','21','26'],
                'chb02': ['16'],
                'chb05': ['06', '13', '16', '17', '22'],
                'chb08': ['02','05','11','13','21']
                }

files = [os.path.join(DATASET_PATH, folder, f"{folder}_{fn}.edf") for folder, fn_list in selected_files.items() for fn in fn_list]

for file_path in files:
    print(file_path)


# parameters for epochs generation
epoch_time = 100
overlap = 0

# The window advances by (epoch_time - overlap); a non-positive step would
# never terminate, so reject it up front.
if not 0 <= overlap < epoch_time:
    raise ValueError("overlap must satisfy 0 <= overlap < epoch_time")

# np.savez does not create missing directories.
os.makedirs(NEW_DATASET_PATH, exist_ok=True)

# For each patient: epoch every selected recording, label each epoch 1 if it
# overlaps an annotated seizure (0 otherwise), undersample the negatives and
# save ONE .npz file per patient (arr_0 = signals, arr_1 = labels).
for folder, fn_list in selected_files.items():

    # Epochs and labels accumulated over ALL files of this patient.
    # Previously these lists were reset for every file and the per-patient
    # output was rewritten after each file, so only the patient's last
    # recording ended up in the saved dataset.
    signals = []
    labels = []

    # for each file of this patient
    for fn in fn_list:
        file_path = os.path.join(DATASET_PATH, folder, f"{folder}_{fn}.edf")
        # recording name without directory and extension: chbxx_xx
        filename = os.path.splitext(os.path.basename(file_path))[0]

        raw = read_edf_to_raw(file_path)

        # raw.times is increasing, so its last entry is the maximum; reading
        # it once avoids scanning the whole time vector on every iteration.
        last_time = raw.times[-1]
        # recordings without an annotation simply have no seizure intervals
        file_seizures = seizures_dict.get(filename, [])

        # Divide into epochs
        curr_time = 0
        while curr_time <= last_time - epoch_time:

            # epoch window [start_time, end_time)
            start_time = curr_time
            end_time = curr_time + epoch_time

            # Standard interval-overlap test. The previous chain of strict
            # comparisons missed an epoch that starts exactly at seizure
            # onset and contains the seizure end.
            has_seizure = any(
                start_time < seizure[1] and end_time > seizure[0]
                for seizure in file_seizures
            )
            labels.append([1] if has_seizure else [0])

            start, stop = raw.time_as_index([start_time, end_time])
            epoch = raw[:, start:stop][0]
            signals.append(epoch)

            # calculate next current time
            curr_time = curr_time + epoch_time - overlap

    if not signals:
        print(f"No epochs extracted for {folder}; skipping.")
        continue

    # Undersample the negative class over the patient's whole epoch set.
    # NOTE(review): np.array(signals) assumes every epoch of a patient has the
    # same (channels, samples) shape - confirm read_edf_to_raw guarantees it.
    x_over, y_over = custom_fit_resample(np.array(signals), np.array(labels))

    new_filename = folder + "_segments.npz"
    new_file_path = os.path.join(NEW_DATASET_PATH, new_filename)

    print(calculate_neg_pos(y_over))

    print(f"Saving file {new_filename}...")
    np.savez(new_file_path, x_over, y_over)

# Kept for interactive inspection: these hold the (un-resampled) epochs and
# labels of the LAST processed patient only.
signals = np.array(signals)
labels = np.array(labels)

/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit/chb-mit-scalp-eeg-database-1.0.0/chb01/chb01_03.edf
/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit/chb-mit-scalp-eeg-database-1.0.0/chb01/chb01_04.edf
/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit/chb-mit-scalp-eeg-database-1.0.0/chb01/chb01_15.edf
/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit/chb-mit-scalp-eeg-database-1.0.0/chb01/chb01_16.edf
/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit/chb-mit-scalp-eeg-database-1.0.0/chb01/chb01_18.edf
/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit/chb-mit-scalp-eeg-database-1.0.0/chb01/chb01_21.edf
/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit/chb-mit-scalp-eeg-database-1.0.0/chb01/chb01_26.edf
/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit/chb-mit-scalp-eeg-database-1.0.0/chb02/chb02_16.edf
/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit/chb-mit-scalp-eeg-database-1.0.0/chb05/chb05_06.edf
/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit/chb-mit-scal

  raw = mne.io.read_raw_edf(file_path, preload=True, verbose=0)


EEG channel type selected for re-referencing
Applying average reference.
Applying a custom ('EEG',) reference.
Filtering raw data in 1 contiguous segment
Setting up band-pass filter from 0.5 - 45 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandpass filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 0.50
- Lower transition bandwidth: 0.50 Hz (-6 dB cutoff frequency: 0.25 Hz)
- Upper passband edge: 45.00 Hz
- Upper transition bandwidth: 11.25 Hz (-6 dB cutoff frequency: 50.62 Hz)
- Filter length: 1691 samples (6.605 sec)



KeyboardInterrupt: 

In [18]:
# Show the (negatives, positives) label counts of y_over, the label array
# produced by the most recent custom_fit_resample call in the cell above.
print(calculate_neg_pos(y_over))

(14, 6)


## Counting class labels

In [19]:
def read_npz(file):
    """Load the two positional arrays stored in a ``.npz`` archive.

    The archives read here are written with ``np.savez(path, signals, labels)``,
    so the arrays are stored under NumPy's default keys ``arr_0`` and ``arr_1``.

    Parameters
    ----------
    file : str, path-like or binary file object
        Archive to read; passed straight to ``np.load``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x, y)``: the arrays stored as ``arr_0`` and ``arr_1``.

    Raises
    ------
    KeyError
        If the archive does not contain ``arr_0`` or ``arr_1``.
    """
    # np.load returns a lazily-read NpzFile that keeps the archive open;
    # the context manager closes it once both arrays are in memory.
    with np.load(file) as arrays:
        x = arrays['arr_0']
        y = arrays['arr_1']
    return x, y

In [21]:
# Count the positive (1) and negative (0) labels over every training archive.
# Sorting makes the listing order deterministic (glob order is arbitrary).
train_files = sorted(glob.glob(os.path.join(NEW_DATASET_PATH,'train',"*.npz")))
test_files = sorted(glob.glob(os.path.join(NEW_DATASET_PATH,'test',"*.npz")))

negatives = 0
positives = 0
for file in train_files:
    print(file)
    _, y = read_npz(file)

    unique, counts = np.unique(y, return_counts=True)
    dic = dict(zip(unique, counts))
    # A file may contain a single class only; .get treats the missing class
    # as zero instead of raising KeyError (negatives) or hiding every error
    # behind a bare except (positives).
    negatives += dic.get(0, 0)
    positives += dic.get(1, 0)

print(positives, negatives)

/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit-segments/1/train/chb01_segments.npz
/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit-segments/1/train/chb02_segments.npz
/media/guisoares/guisoares-ext-hdd/Datasets/chb-mit-segments/1/train/chb08_segments.npz
12 28


In [19]:
# Inspect `dic`, the label -> count mapping built inside the counting loop; it
# holds the counts of the last file that loop processed.
# NOTE(review): leftover inspection cell - confirm it is still needed.
dic

{0: 35}