### Imports

In [1]:
from mne.io import read_raw_edf
import numpy as np
import os
import pandas as pd

### Seizure Indices

In [2]:
main_path = "/Users/bryanmcelvy/Documents/physionet.org/files/chbmit/1.0.0"
my_lst = []

# Visit the summary file of each of the 24 patient folders (chb01 ... chb24)
# and collect, in reading order, the token after every "Name:" and the token
# after every "Time:" that is itself followed by the word "seconds".
for patient_num in range(1, 24 + 1):
    patient = f"chb{patient_num:02d}"
    summary_path = main_path + f"/{patient}/{patient}-summary.txt"

    with open(summary_path) as summary_file:
        tokens = summary_file.read().split()

    for pos, token in enumerate(tokens):
        if token == "Name:":
            my_lst.append(tokens[pos + 1])
        elif token == "Time:" and tokens[pos + 2] == "seconds":
            my_lst.append(tokens[pos + 1])
        

In [3]:
# Keep every numeric token; keep a non-numeric token only when the token
# right after it is numeric. The final token is always kept.
my_lst2 = [
    token
    for token, following in zip(my_lst, my_lst[1:])
    if token.isdigit() or following.isdigit()
]
my_lst2.append(my_lst[-1])

# my_lst3[0] collects the most recent non-numeric token once per numeric
# token; my_lst3[1] collects the numeric tokens themselves.
my_lst3 = [[], []]
current_file = ""
file_column, value_column = my_lst3

for token in my_lst2:
    if token.isdigit():
        file_column.append(current_file)
        value_column.append(token)
    else:
        current_file = token

my_lst3

[['chb01_03.edf',
  'chb01_03.edf',
  'chb01_04.edf',
  'chb01_04.edf',
  'chb01_15.edf',
  'chb01_15.edf',
  'chb01_16.edf',
  'chb01_16.edf',
  'chb01_18.edf',
  'chb01_18.edf',
  'chb01_21.edf',
  'chb01_21.edf',
  'chb01_26.edf',
  'chb01_26.edf',
  'chb02_16.edf',
  'chb02_16.edf',
  'chb02_16+.edf',
  'chb02_16+.edf',
  'chb02_19.edf',
  'chb02_19.edf',
  'chb03_01.edf',
  'chb03_01.edf',
  'chb03_02.edf',
  'chb03_02.edf',
  'chb03_03.edf',
  'chb03_03.edf',
  'chb03_04.edf',
  'chb03_04.edf',
  'chb03_34.edf',
  'chb03_34.edf',
  'chb03_35.edf',
  'chb03_35.edf',
  'chb03_36.edf',
  'chb03_36.edf',
  'chb04_05.edf',
  'chb04_05.edf',
  'chb04_08.edf',
  'chb04_08.edf',
  'chb04_28.edf',
  'chb04_28.edf',
  'chb04_28.edf',
  'chb04_28.edf',
  'chb05_06.edf',
  'chb05_06.edf',
  'chb05_13.edf',
  'chb05_13.edf',
  'chb05_16.edf',
  'chb05_16.edf',
  'chb05_17.edf',
  'chb05_17.edf',
  'chb05_22.edf',
  'chb05_22.edf',
  'chb06_01.edf',
  'chb06_01.edf',
  'chb06_01.edf',
  'chb06

In [4]:
# Turn the two parallel lists into a two-column frame
# (column 0: first list, column 1: second list), one row per entry.
df = pd.DataFrame(np.asarray(my_lst3).T)
df

Unnamed: 0,0,1
0,chb01_03.edf,2996
1,chb01_03.edf,3036
2,chb01_04.edf,1467
3,chb01_04.edf,1494
4,chb01_15.edf,1732
...,...,...
391,chb24_15.edf,3569
392,chb24_17.edf,3515
393,chb24_17.edf,3581
394,chb24_21.edf,2804


In [5]:
# Rows of `df` are expected to alternate: even positions hold a seizure start,
# odd positions hold the matching seizure end for the same file. Pairing purely
# by position silently produces wrong intervals if a row is missing, so the
# assumptions are checked before the table is built.
if len(df.index) % 2 != 0:
    raise ValueError(
        f"Expected an even number of rows (start/end pairs), got {len(df.index)}"
    )

start_rows = df.iloc[0::2]
end_rows = df.iloc[1::2]

# Each start row must come from the same file as the end row that follows it.
mismatched = start_rows[0].to_numpy() != end_rows[0].to_numpy()
if mismatched.any():
    first_bad = start_rows[0].to_numpy()[mismatched][0]
    raise ValueError(
        f"Start/end rows are not paired within the same file (first mismatch: {first_bad})"
    )

filenames = start_rows[0].tolist()
starts = start_rows[1].tolist()
ends = end_rows[1].tolist()

# One row per seizure; values are kept exactly as they appear in `df`.
seizure_idx = pd.DataFrame({"filename": filenames, "start": starts, "end": ends})

In [6]:
# Persist the seizure table to the working directory. No index=False is passed,
# so pandas' default applies and the row index is written as the first CSV column.
seizure_idx.to_csv("seizure_idx2.csv")

### Dataset Generation

In [7]:
# Echo the dataset root directory that the dataset-generation cell reads recordings from.
main_path

'/Users/bryanmcelvy/Documents/physionet.org/files/chbmit/1.0.0'

In [8]:
# Peek at the file name stored in the row labelled 0 of the seizure table.
seizure_idx.loc[0, "filename"]

'chb01_03.edf'

In [9]:
# Distinct file names from the seizure table, in order of first appearance.
filename_list = seizure_idx["filename"].unique().tolist()
filename_list[0]

'chb01_03.edf'

In [12]:
''' Create dataset '''
# Builds two frames from the EDF recordings listed in `seizure_idx`:
#   data      - balanced set: every seizure sample (state 1) followed by an
#               equal amount of randomly chosen non-seizure samples (state 0)
#   data_full - the non-seizure samples that were not selected
# Columns: filename, sample_num, time (sample_num / fs), voltage, state.
#
# Each recording is labelled and undersampled on its own, fresh frame. This
# fixes two indexing bugs of the previous version:
#   * later seizures in a file were selected positionally after earlier
#     seizure rows had already been dropped, shifting their labels;
#   * class-0 rows were looked up with per-file sample numbers against the
#     global row labels of the concatenated frame.
channel = "FP2-F4"
random_seed = 0  # fixed so the random undersampling is reproducible on re-run

# The sampling rate is read once from a reference recording and reused for
# every file. NOTE(review): assumes all recordings share this rate - confirm.
file = read_raw_edf(input_fname=f"{main_path}/chb01/chb01_01.edf", preload=False, verbose='ERROR')
fs = int(file.info['sfreq'])

filename_list = list(seizure_idx["filename"].unique())

# Seizure intervals per file, converted from seconds to sample numbers.
seizure_windows = {}
for fname, start, end in zip(seizure_idx["filename"], seizure_idx["start"], seizure_idx["end"]):
    seizure_windows.setdefault(fname, []).append((int(start) * fs, int(end) * fs))

rng = np.random.default_rng(random_seed)
seizure_parts = []  # class 1 rows, one frame per recording
normal_parts = []   # selected class 0 rows, one frame per recording
unused_parts = []   # class 0 rows that were not selected

for fname in filename_list:
    file = read_raw_edf(input_fname=f"{main_path}/{fname[:5]}/{fname}", preload=False, verbose='ERROR')
    if channel not in file.ch_names:
        continue

    n_samples = file.n_times
    sample_num = np.arange(n_samples)

    # Annotate class labels (0 - Normal, 1 - Seizure); slices past the end of
    # the recording are clipped by NumPy.
    state = np.zeros(n_samples, dtype=int)
    for start, end in seizure_windows[fname]:
        state[start:end] = 1
    is_seizure = state == 1

    recording = pd.DataFrame({
        'filename': fname,
        'sample_num': sample_num,
        'time': sample_num / fs,
        # Only the requested channel is read; scaled by 1e6
        # (presumably volts -> microvolts - confirm against MNE units).
        'voltage': file.get_data(picks=[channel])[0] * 1e6,
        'state': state,
    })

    # Balance by randomly undersampling class 0 in whole-second windows
    # aligned to second boundaries. A window is a candidate only if none of
    # its samples is labelled as seizure.
    n_seconds = n_samples // fs
    second_has_seizure = is_seizure[:n_seconds * fs].reshape(n_seconds, fs).any(axis=1)
    candidate_seconds = np.flatnonzero(~second_has_seizure)

    # Enough windows to cover the seizure sample count (ceil division), capped
    # by what is available so a seizure-dominated file cannot exhaust the pool.
    n_seizure = int(is_seizure.sum())
    n_windows = min(-(-n_seizure // fs), len(candidate_seconds))
    chosen_seconds = rng.choice(candidate_seconds, size=n_windows, replace=False)
    is_chosen_normal = np.isin(sample_num // fs, chosen_seconds) & ~is_seizure

    seizure_parts.append(recording[is_seizure])
    normal_parts.append(recording[is_chosen_normal])
    unused_parts.append(recording[~(is_seizure | is_chosen_normal)])

# Concatenate once instead of growing the frames inside the loops.
# Row order of `data`: all seizure rows (file order), then the sampled
# non-seizure rows (file order, chronological within each file).
data = pd.concat(seizure_parts + normal_parts, ignore_index=True) if seizure_parts else pd.DataFrame()
# NOTE(review): data_full is kept for compatibility with the previous cell; it
# holds almost every sample and dominates memory - drop it if nothing uses it.
data_full = pd.concat(unused_parts, ignore_index=True) if unused_parts else pd.DataFrame()
    

chb01_03.edf
chb01_04.edf
chb01_15.edf
chb01_16.edf
chb01_18.edf
chb01_21.edf
chb01_26.edf
chb02_16.edf
chb02_16+.edf
chb02_19.edf
chb03_01.edf
chb03_02.edf
chb03_03.edf
chb03_04.edf
chb03_34.edf
chb03_35.edf
chb03_36.edf
chb04_05.edf
chb04_08.edf
chb04_28.edf
chb05_06.edf
chb05_13.edf
chb05_16.edf
chb05_17.edf
chb05_22.edf
chb06_01.edf
chb06_04.edf
chb06_09.edf
chb06_10.edf
chb06_13.edf
chb06_18.edf
chb06_24.edf
chb07_12.edf
chb07_13.edf
chb07_19.edf
chb08_02.edf
chb08_05.edf
chb08_11.edf
chb08_13.edf
chb08_21.edf
chb09_06.edf


KeyboardInterrupt: 

In [None]:
''' Save to CSV'''
# Write the dataset to the working directory without the row-index column.
data.to_csv('data_.csv', index=False)