### Organize BCH data


This notebook organizes the downloaded data into a dictionary. <br>
key: patient <br>
value: data <br>
> column 0: time (seconds) <br>
> column 1: Seizure yes/no <br>
> column 2: Time until next seizure. Value will be thresholded to [0, MAX_TIME] <br>
> column 3-n: Raw data from the EEG channels <br>

*Note: not every patient has data for every EEG lead (and some have extra leads), so be careful when comparing columns across patients.

In [43]:
def extract_summary_data(f, temp_d):
    """Parse a patient summary text file into ``temp_d`` (updated in place).

    The text read from ``f`` is split into blank-line-separated sections.
    The sampling rate is taken from the fourth whitespace-separated token of
    the first section and stored as a float under ``temp_d['freq']``.  Every
    section containing ``'File Name:'`` adds one entry that maps the file
    name (third token of the section's first line) to a list of
    ``(start, end)`` integer pairs, one per seizure; the list is empty when
    the section has no seizure lines.

    Parameters
    ----------
    f : file-like
        Open summary file.  ``read()`` may return text or bytes.
    temp_d : dict
        Destination dictionary; modified in place, nothing is returned.
    """
    summary = f.read()
    if isinstance(summary, bytes) and not isinstance(summary, str):
        # Binary-mode files yield bytes on Python 3, which cannot be split
        # with str separators.  On Python 2 ``bytes is str``, so this branch
        # is skipped and the data is used unchanged.
        summary = summary.decode('utf-8', 'replace')
    # Normalise Windows line endings so the blank-line split still works.
    summary = summary.replace('\r\n', '\n')
    sections = summary.split('\n\n')
    temp_d['freq'] = float(sections[0].split()[3])
    for section in sections:
        if 'File Name:' not in section:
            continue
        # Build a real list: the lines are indexed and sliced below, which a
        # lazy ``filter`` object does not support.
        lines = [line for line in section.split('\n') if line]
        key = lines[0].split()[2]
        # The first four lines are header lines of the section; whatever
        # follows alternates "start" / "end" lines whose second-to-last
        # token is the number.  With no such lines the list is simply empty.
        seizure_lines = lines[4:]
        temp_d[key] = [
            (int(start.split()[-2]), int(end.split()[-2]))
            for start, end in zip(seizure_lines[0::2], seizure_lines[1::2])
        ]

def read_edf(filename):
    """Load every signal of an EDF file into one 2-D array.

    The path is used exactly as given.  The call sites in this notebook pass
    a full path, for which the former ``os.path.join(root, filename)`` was a
    no-op that only made the function depend on a module-level ``root``.

    Parameters
    ----------
    filename : str
        Path to the ``.edf`` file.

    Returns
    -------
    sigbufs : numpy.ndarray
        One row per signal, one column per sample; the row length is the
        sample count reported for the first signal.
    signal_labels : list
        The labels returned by ``getSignalLabels()``, in signal order.

    Raises
    ------
    ValueError
        If the signals do not all share the same sampling frequency (a
        single time axis would not be valid for them).
    """
    # The context manager closes the reader even when reading fails, so the
    # file handle is not left open after the call.
    with pyedflib.EdfReader(filename) as f:
        sample_freqs = f.getSampleFrequencies()
        if not np.all(sample_freqs == sample_freqs[0]):
            raise ValueError(
                'signals in %s do not share one sampling frequency' % filename)
        n = f.signals_in_file
        signal_labels = f.getSignalLabels()
        sigbufs = np.zeros((n, f.getNSamples()[0]))
        for i in range(n):
            sigbufs[i, :] = f.readSignal(i)
    return sigbufs, signal_labels
    
def label_edf(sigbufs, seizure_time, eeg_label):
    time = np.arange(sigbufs.shape[1])/temp_d['freq']
    seizure_id = np.zeros(sigbufs.shape[1])
    seizure_delay = np.zeros(sigbufs.shape[1])
    if seizure_time:
        prev_end = 0
        for x,y in seizure_time:
            start_idx = int(temp_d['freq']*x)
            end_idx = int(temp_d['freq']*y)+1 # Include the end point within the data
            seizure_id[start_idx:end_idx].fill(1)
            seizure_delay[prev_end:start_idx] = np.arange(start_idx-prev_end)[::-1]/temp_d['freq']
            seizure_delay[end_idx:].fill(np.inf)
            prev_end = end_idx
    else:
        seizure_delay += np.inf
    df = np.vstack([time, seizure_id, seizure_delay, sigbufs])
    df = df.T
    df = df.astype('float32')
    eeg_label = ['time', 'seizure', 'seizure_delay'] + eeg_label
    df = pd.DataFrame(df, columns=eeg_label)
    df['seizure'] = df['seizure'].astype('int8')
    return df

# Use the function like this
# df = read_single_edf('chb10_89.edf')
def read_single_edf(filename):
    """Read one EDF recording and return it as a labelled DataFrame.

    Parameters
    ----------
    filename : str
        Either a bare file name such as ``'chb10_89.edf'`` (resolved to
        ``<parent of cwd>/ANES212_data/<patient>/<filename>``, where the
        patient folder is the part of the name before the first ``'_'``) or
        a path containing ``'/'``, which is used as given.

    Returns
    -------
    pandas.DataFrame
        The result of ``label_edf`` for this recording.

    Notes
    -----
    The summary file ``<patient>-summary.txt`` is looked up in the same
    directory as the EDF file.
    """
    # ``label_edf`` reads the sampling frequency from the module-level
    # ``temp_d``, so the summary data must be stored there rather than in a
    # local variable.
    global temp_d

    if '/' not in filename:
        data_folder = os.path.join(os.path.dirname(os.getcwd()), 'ANES212_data')
        filename = os.path.join(data_folder, filename.split('_')[0], filename)
    patient_dir = os.path.dirname(filename)
    patient_folder = os.path.basename(patient_dir)

    # Extract label data for the filename (e.g. start and end of seizures if any)
    summary_file = os.path.join(patient_dir, patient_folder + '-summary.txt')
    temp_d = {}
    with open(summary_file, 'rb') as f:
        extract_summary_data(f, temp_d)

    # Now read edf and label
    sigbufs, eeg_label = read_edf(filename)
    df = label_edf(sigbufs, temp_d[os.path.basename(filename)], eeg_label)
    return df

In [45]:
# Converts all edf to csv for readability
# Takes up a lot of memory so may not be ideal

%matplotlib inline  
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pyedflib
import re

# Parameter
# Only folders whose path contains one of these names are processed; with an
# empty list nothing is processed.
patient_folder_list = []

# Load data from data_folder
data_folder = '/'.join(os.getcwd().split('/')[:-1]) + '/ANES212_data/'
patient_dict = {}
for root, dirs, files in os.walk(data_folder):
    df_list = []
    if not any(patient_folder in root for patient_folder in patient_folder_list):
        continue
    temp_d = {}  # Stores data from the summary txt file

    # Parse the summary before touching any recording: os.walk lists files in
    # arbitrary order, and labelling an .edf file needs its summary entry.
    for filename in files:
        if '-summary.txt' in filename:
            with open(os.path.join(root, filename), 'rb') as f:
                extract_summary_data(f, temp_d)

    for filename in files:
        full_path = os.path.join(root, filename)
        print(filename)
        if '.edf' in filename and '.seizures' not in filename:
            sigbufs, eeg_label = read_edf(full_path)
            df = label_edf(sigbufs, temp_d[filename], eeg_label)
            df_filename = os.path.join(root, filename.split('.')[0]) + '.csv'
            # Writing is disabled on purpose (see the memory note at the top
            # of this cell); uncomment to export.
            #df.to_csv(df_filename, sep='\t')

In [108]:
# Debug cell: inspect whatever object is currently bound to ``f``
# (the recorded output below shows an EDF file reader).
print(vars(f))                # instance attributes
print(dir(f))                 # all attribute and method names
print(f.getSignalLabels())    # signal labels of the open file

{'file_name': '/mnt/c/Users/choec/Documents/GitHub/ANES212_data/chb04/chb04_05.edf'}
['__class__', '__del__', '__delattr__', '__dict__', '__doc__', '__enter__', '__exit__', '__format__', '__getattribute__', '__hash__', '__init__', '__module__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_close', '_convert_string', '_get_float', 'admincode', 'annotations_in_file', 'birthdate', 'check_open_ok', 'datarecord_duration', 'datarecords_in_file', 'digital_max', 'digital_min', 'equipment', 'file_duration', 'file_info', 'file_info_long', 'file_name', 'gender', 'getAdmincode', 'getBirthdate', 'getDigitalMaximum', 'getDigitalMinimum', 'getEquipment', 'getFileDuration', 'getGender', 'getHeader', 'getLabel', 'getNSamples', 'getPatientAdditional', 'getPatientCode', 'getPatientName', 'getPhysicalDimension', 'getPhysicalMaximum', 'getPhysicalMinimum', 'getPrefilter', 'getRecordingAdditional', 'getSampl