# CRISP-DM: Data Preparation

## Imports

In [None]:
import gc
import os
import platform
import random

import numpy as numpy
import numpy as np  # `np` is the alias the code in this notebook actually uses
import pandas as pd

import mne
# Only show MNE errors; the functions below print their own progress messages
mne.set_log_level('ERROR')

## Select Data

Patients from infancy to adolescence that are admitted to a Pediatric Intensive Care Unit and monitored
by EEG are the primary focus of this research. Only patients in a comatose state are included in this
focus. Patients may be medically induced for a number of unspecified reasons, such as traumatic brain
injury, cardiac arrest, but also to treat prolonged or severe seizures. There are a total of 127 EEG
recordings from unique subjects available, each with a distinct duration that sums up to nearly 138
days of continuous data. The average recording duration per subject is 26 hours.

A case selection is applied, where only EEG recordings containing one or more seizures are eligible for
further research. This might provide the DL model more targeted information about the distinctive
patterns associated with epileptic seizures, rather than also including data from patients that did not
contain any seizures during the recording. The limitation of this is that the model is not inclusive
of patients that have no seizures. It might not be able to accurately detect and classify segments of
recordings that are completely free of seizures. The selection reduces the available data to 37 EEG
recordings that include seizures, which now consists of a total of nearly 78 days of continuous data
available, of which 3 days account for seizures. Only the EEG channels are selected, to limit memory usage
while loading data. This is considered a necessary preprocessing step.

Despite this modification, there
are still 6 out of the 37 selected files that cannot be fully loaded due to their considerable size. Since they
contain many seizures, the six recordings are handled differently to still include them in the selection. The six long recordings that cannot be fully loaded are handled in segments. These large files are
cropped by their seizure annotations, resulting in a Raw object for each individual seizure. These are
loaded and concatenated at a later point. The same amount of non-seizure data from each of these subjects
is then loaded in segments as well. This selection is in line with the data mining goals, regardless of
the technical limitations. The downside of this approach is that, due to fixed length epoching done at
a later point, this means that short seizures with a duration of less than 10 seconds are completely
excluded. This ends up leaving out a total of 12 seizures with a duration between 1.5 and 8.9 seconds
across the six files. 

In [None]:
# Channel labels that are read from every recording (19 scalp electrodes)
eeg_channels = (
    'Fp1 Fp2 F3 F4 F7 F8 Fz C3 C4 Cz T3 T4 T5 T6 P3 P4 Pz O1 O2'.split()
)

## Clean Data

Before the data is loaded and cleaned, it is first read with necessary preprocessing steps, such as
inferring channel types and only selecting EEG channels. All files are read individually; files
that contain no annotation whose description indicates a seizure are skipped. After this,
each file can be loaded and undergo steps that are considered cleaning.

In [None]:
# Preprocessing of raw EEG recording
def preprocess(raw, target_sfreq=256.0):
    '''
    Cleans a loaded EEG recording in place and returns it

    Steps, in order: mark every channel as EEG, attach the
    'standard_1020' montage, notch-filter at 50 and 100 Hz,
    band-pass filter 0.1-70 Hz, re-reference to the channel average,
    and resample when the sampling frequency differs from target_sfreq

    Args:
        raw: loaded EEG recording (mne.io.Raw)
        target_sfreq: sampling frequency in Hz every recording
            should end up with (float)

    Returns:
        raw: the same recording object after cleaning (mne.io.Raw)

    Raises:
        Exception: any error from the steps above is reported and
            re-raised, so a partially cleaned recording is never
            passed on to epoching
    '''
    try:
        # Channel types are not always inferred correctly
        raw.set_channel_types({ch: 'eeg' for ch in raw.ch_names})
        # Channel positions and digitization points
        raw.set_montage('standard_1020')
        # Removes electrical noise creating 50 Hz artefacts (and the 100 Hz harmonic)
        raw.notch_filter(np.arange(50, 101, 50))
        # Bandpass filter (Delta - Gamma band)
        raw.filter(0.1, 70)
        # Average of all channels is used as reference
        raw.set_eeg_reference(ref_channels='average')

        print('Cleaned and filtered')

        sfreq = raw.info['sfreq']
        if sfreq != target_sfreq:
            print(f'Sampling frequency is {int(sfreq)} Hz')
            raw.resample(sfreq=target_sfreq)
            print(f'Resampled to {int(target_sfreq)} Hz')

    except Exception as e:
        # Continuing here would mix uncleaned data into the dataset,
        # so report the cause and let the caller skip this recording
        print(f'Preprocessing failed: {e}')
        raise

    return raw

## Construct Data

The seizure labels are stored in Raw’s annotations. They can be found using their description. The
annotations are converted to a DataFrame, which gives the onset and duration, in seconds, of
each seizure per recording. By summing these, the end time of the seizure is calculated and added to
the DataFrame.

In [None]:
# Extract seizure annotations and create DataFrame with start and end times
def seizure_annotations(raw):
    '''
    Keeps only the seizure annotations on the recording and
    tabulates them

    Annotations whose description equals 'AANVAL' are selected and
    set as the recording's only annotations (raw is modified)

    Args:
        raw: EEG recording (mne.io.Raw)

    Returns:
        DataFrame with columns 'onset', 'duration' and 'end',
        where end = onset + duration (pd.DataFrame)
    '''
    is_seizure = raw.annotations.description == 'AANVAL'
    seizures = raw.annotations[is_seizure]
    print(f'Found {len(seizures)} seizures')

    # From here on the recording carries only its seizure annotations
    raw.set_annotations(seizures)

    table = pd.DataFrame(seizures, columns=['onset', 'duration'])
    return table.assign(end=table['onset'] + table['duration'])

As the data mining goal is to classify EEG segments, fixed length epochs are created from the recording.
After consulting, it was decided that each epoch will be 10 seconds long with no overlap. The epochs
are converted to a DataFrame, and the original time in seconds is restored.

In [None]:
# Create fixed length segments (epochs) from EEG recording
def create_epochs(raw, duration, overlap):
    '''
    Cuts the recording into fixed length epochs and returns them as
    a DataFrame with one row per sample

    Args:
        raw: preprocessed EEG recording (mne.io.Raw)
        duration: epoch length in seconds (float)
        overlap: overlap between consecutive epochs in seconds (float)

    Returns:
        df_epochs: epochs DataFrame without the 'condition' column,
            with a 'count' column (row number divided by the
            sampling frequency) and all eeg_channels rounded to
            2 decimals (pd.DataFrame)

    Raises:
        Exception: errors from MNE or pandas are not handled here, so
            the caller decides whether to skip the recording
    '''
    # overlap must be passed by keyword: the third positional
    # parameter of make_fixed_length_epochs is `preload`
    epochs = mne.make_fixed_length_epochs(raw, duration=duration, overlap=overlap)
    print('Epoched')

    # Read the sampling frequency before the Epochs object is released
    sfreq = epochs.info['sfreq']

    df_epochs = epochs.to_data_frame()
    del epochs
    gc.collect()
    print('Converted to DataFrame')

    df_epochs.drop(columns=['condition'], inplace=True)
    # Add original time from EEG recording
    # NOTE(review): row number / sfreq equals recording time only when
    # epochs are contiguous (no overlap, no dropped epochs) - confirm
    df_epochs['count'] = df_epochs.index / sfreq

    if 'Pz' not in df_epochs.columns:
        print('Pz is missing')
        # Average the EEG channels only; 'time', 'epoch' and 'count'
        # are not signal columns
        present = [ch for ch in eeg_channels if ch in df_epochs.columns]
        df_epochs['Pz'] = df_epochs[present].mean(axis=1)
        print('Filled with mean values')

    # Round EEG values to 2 decimal places to save memory
    df_epochs[eeg_channels] = df_epochs[eeg_channels].round(2)
    print('Values rounded')

    return df_epochs

## Integrate data

The epochs DataFrame and seizure annotations are merged on the count of the recording and the
onset of each seizure. A column with binary labels is added, marking each row as seizure (1) or
non-seizure (0). Each row is matched to the most recent seizure onset, and rows whose time falls
between that seizure's start and end time are marked as 1. With the memory constraints it is not yet
possible to merge the data of all the recordings into one DataFrame.

In [None]:
# Create binary labels to indicate seizure/non-seizure datapoints
def labeling(df_epochs, df_sz_annotations):
    '''
    Adds a 0/1 'seizure' column to the sample-level epochs DataFrame

    Every sample is paired with the latest seizure whose onset is not
    after the sample's 'count'; the sample is labeled 1 when its
    'count' lies between that seizure's onset and end (both included)

    Args:
        df_epochs: samples with a 'count' column (pd.DataFrame)
        df_sz_annotations: seizures with 'onset' and 'end'
            columns (pd.DataFrame)

    Returns:
        merged DataFrame with the 'seizure' column and without
        'count', 'onset' and 'end' (pd.DataFrame)
    '''
    merged = pd.merge_asof(
        df_epochs, df_sz_annotations, left_on='count', right_on='onset'
    )
    del df_epochs
    gc.collect()

    # Samples before the first seizure have no onset/end and stay 0
    inside_seizure = merged['count'].between(merged['onset'], merged['end'])
    merged['seizure'] = inside_seizure.astype('int64')
    print("Labeled")

    # Seizure times are no longer needed once the label exists
    return merged.drop(columns=["count", "onset", "end"])

## Format Data

As mentioned, all values are presented as floats with 6 decimal places. This precision is not needed
and rounding all values to 2 decimal places will save memory and be more computationally efficient. This is done during epoching.

The data mining goals state that segments should be classified as seizure or non-seizure. The epochs
are created, but the DataFrame still presents each data point as an individual row. Reshaping takes
place so that each record represents an entire epoch of 10 seconds. This is done by grouping and
aggregating float values and putting them into an array. The target column can now present an entire
epoch as seizure/non-seizure. Its value is set to the (first) most common value.
This means that, e.g., an epoch with 6 seconds of seizure data will be considered a seizure segment.
On the other hand, an epoch with 4 seconds of seizure data will be considered a non-seizure segment.
This approach is approved from the expert perspective. Appointing seizure labels to epochs that
consist of only seizure data is also possible. There is no right or wrong approach, but it should be a
conscious design choice. This decision is made to not further reduce the occurrence of seizure epochs.
Another approach is to make sure that you are not falling into a seizure or non-seizure area
within the same epoch.

It should be mentioned that, besides the excluded seizures from the problem handling section, more seizures
are excluded because of this. Luckily, seizures shorter than 5 seconds do not occur often in the data. One of the recordings (M15_0000615#01) only contains one seizure that lasts 3
seconds. The seizure is too short to be classified as a seizure epoch, and this leaves no other seizures in
this recording. Because of this, the subject is no longer included in the data selection. This reduces
the number of subjects selected for further analysis to 36.

In [None]:
# Reshape DataFrame for each record to represent each epoch
def reshape_df(df):
    '''
    Collapses the sample-level DataFrame to one row per epoch

    Args:
        df: samples with an 'epoch' column, one column per entry of
            eeg_channels and a 'seizure' column (pd.DataFrame)

    Returns:
        DataFrame with an 'epoch' column, one float32 array per
        channel holding that epoch's samples, and 'seizure' set to
        the first most common label of the epoch (pd.DataFrame)
    '''
    def as_float32(samples):
        return np.array(samples.tolist(), dtype=np.float32)

    def most_common(labels):
        # mode() is sorted, so a tie between 0 and 1 resolves to 0
        return labels.mode().iat[0]

    per_epoch = df.groupby('epoch')
    signals = {name: per_epoch[name].apply(as_float32) for name in eeg_channels}
    labels = per_epoch['seizure'].apply(most_common)

    reshaped = pd.DataFrame(signals)
    reshaped['seizure'] = labels.values
    reshaped = reshaped.reset_index()

    print('Reshaped')

    return reshaped

Every recording contains more non-seizure epochs than seizure epochs. To avoid class imbalance, the
DataFrame is separated based on the target value. The number of seizure epochs per recording is
calculated, after which the same number of non-seizure epochs are selected randomly. These two
subsets are concatenated, presenting the final DataFrame per recording.

In [None]:
# Separate epochs based on target value (seizure)
def epochs_seizure_based(df):
    '''
    Balances one recording: all seizure epochs plus an equally large
    random selection of non-seizure epochs

    When the recording has fewer non-seizure epochs than seizure
    epochs, all non-seizure epochs are kept instead of raising

    Args:
        df: epoch-level DataFrame with a 0/1 'seizure'
            column (pd.DataFrame)

    Returns:
        df: seizure epochs followed by the selected non-seizure
            epochs, with a fresh index (pd.DataFrame)
    '''
    df_sz_epochs = df.loc[df['seizure'] == 1]
    candidates = df.loc[df['seizure'] == 0]

    # Count number of seizure epochs for this patient
    N = df_sz_epochs.shape[0]
    print(f'Recording has {N} seizure epochs')

    # sample() refuses to draw more rows than exist, so cap the request
    n_selected = min(N, candidates.shape[0])
    if n_selected < N:
        print(f' Only {n_selected} non-seizure epochs available')

    # Randomly select non-seizure epochs (fixed seed for reproducibility)
    df_no_sz_epochs = candidates.sample(n=n_selected, random_state=42)
    print(f' {n_selected} random non-seizure epochs selected')

    df = pd.concat([df_sz_epochs, df_no_sz_epochs], axis=0).reset_index(drop=True)
    print('Balanced DataFrame')

    return df

After extracting the subject ID for later identification, each DataFrame is exported to a feather file.
This format is chosen for its high read and write performance, and ability to store complex data types
efficiently.

In [None]:
# Get pseudonym from filename
def extract_subject_id(output_filename):
    '''
    Derives the subject pseudonym from an output filename

    The extension is removed and the name is split on '_'. The first
    two parts are dropped, plus the last part, or the last two parts
    when the second-to-last part contains 'epoched'

    Args:
        output_filename: name of the exported file (str)

    Returns:
        remaining parts joined with '_' (str)
    '''
    stem, _extension = os.path.splitext(output_filename)
    parts = stem.split('_')
    trailing = 2 if 'epoched' in parts[-2] else 1
    return '_'.join(parts[2:-trailing])
    
# Export DataFrame to FEATHER file
def df_conversion(df, output_folder, output_filename):
    '''
    Tags the DataFrame with its subject and writes it to a feather file

    A 'subject' column holding the pseudonym taken from the filename
    is inserted as first column of df (df is modified)

    Args:
        df: balanced epoch-level DataFrame (pd.DataFrame)
        output_folder: folder the file is written to (str)
        output_filename: name of the feather file (str)
    '''
    destination = os.path.join(output_folder, output_filename)
    df.insert(loc=0, column='subject', value=extract_subject_id(output_filename))

    df.to_feather(destination)

    print('Exported data to FEATHER')

## Execute Data Preparation

In [None]:
# Define directory paths with relevant system prefix
def define_paths(project_dir, data_dir):
    '''
    Builds the project and data paths below the platform's disk prefix

    Args:
        project_dir: project folder relative to the disk prefix (str)
        data_dir: data folder relative to the project folder (str)

    Returns:
        project_path: disk prefix joined with project_dir (str)
        data_path: project_path joined with data_dir (str)

    Raises:
        OSError: the platform is neither Windows nor Linux
    '''
    disk_prefixes = {
        'Windows': 'Z:/',
        'Linux': '/media/mount/',
    }

    system = platform.system()
    if system not in disk_prefixes:
        raise OSError(f'Unsupported platform: {system}')

    project_path = os.path.join(disk_prefixes[system], project_dir)
    data_path = os.path.join(project_path, data_dir)

    return project_path, data_path

In [None]:
# Select all EDF files in data directory
def select_edf_files(data_path):
    '''
    Lists the EDF recordings in a folder

    Args:
        data_path: folder to search, not recursive (str)

    Returns:
        edf_files: names ending in '.edf', sorted so the processing
            order is the same on every run (list of str)
    '''
    edf_files = sorted(file for file in os.listdir(data_path) if file.endswith('.edf'))
    print(f'{len(edf_files)} EDF files available')

    return edf_files

In [None]:
# Reading file and optional loading
def load_raw(file, seizure_files_only, loading):
    '''
    Reads an EDF recording, restricted to eeg_channels, and
    optionally loads its data into memory

    Args:
        file: path of the EDF file (str)
        seizure_files_only: reject recordings without an annotation
            whose description contains 'AANVAL' (bool)
        loading: load the data after reading (bool)

    Returns:
        raw: the recording (mne.io.Raw), or None when it was rejected
            or any error occurred; the reason is printed
    '''
    try:
        raw = mne.io.read_raw_edf(file, infer_types=True, include=eeg_channels)
        print('Read')

        if seizure_files_only:
            descriptions = (ann['description'] for ann in raw.annotations)
            has_seizure = any('AANVAL' in text for text in descriptions)
            if not has_seizure:
                del raw
                gc.collect()
                raise Exception('Recording has no seizures... exiting')

        if loading:
            raw = raw.load_data()
            print('Loaded')

    except Exception as e:
        print(f'{file} is not loaded: {e}')
        return None

    else:
        return raw

In [None]:
# Define paths and select files
# NOTE(review): project_dir is not assigned anywhere in this notebook; it has
# to be set to the project folder before this cell is run
project_path, data_path = define_paths(project_dir, data_dir='_01_Raw_Data/EDF/')
# define_paths returns (project_path, data_path); the output folder is the
# second element, not the whole tuple
_, output_path = define_paths(project_dir, data_dir='_04_Epoched_Data/FEATHER/')

edf_files = select_edf_files(data_path)

In [None]:
# Run loading and preprocessing functions
def loading(data_path, file, loading):
    '''
    Reads and optionally loads one EEG recording with MNE

    Only recordings with seizure annotations are accepted
    (load_raw is called with seizure_files_only=True)

    Args:
        data_path: data location (str)
        file: selected EEG filename (str)
        loading: load the data into memory after reading (bool)

    Returns:
        raw: whatever load_raw returns for the joined path; load_raw
            prints its own errors and then returns nothing, so this
            can be None
    '''
    full_path = os.path.join(data_path, file)
    return load_raw(full_path, seizure_files_only=True, loading=loading)

In [None]:
def preprocessing(raw):
    '''
    Prepares the data for the use of modeling
    Includes filtering, setting annotations, epoching data,
    and creating DataFrame with labeled encoding

    Runs, in order: preprocess, seizure_annotations, create_epochs
    (10 second epochs, no overlap), labeling, reshape_df and
    epochs_seizure_based

    Args:
        raw: EEG recording (mne.io.Raw)

    Returns:
        df: balanced DataFrame with one row per epoch (pd.DataFrame)

    Raises:
        ValueError: raw is None, i.e. the recording was not read
        Exception: errors from the individual steps are not handled
            here (e.g. MemoryError for recordings that exceed the
            available memory)
    '''
    # Fail with a clear message instead of an AttributeError further down
    if raw is None:
        raise ValueError('No recording to preprocess')

    raw = preprocess(raw)
    annotations = seizure_annotations(raw)
    epochs = create_epochs(raw, duration=10.0, overlap=0.0)

    df = labeling(epochs, annotations)
    df = reshape_df(df)
    df = epochs_seizure_based(df)

    print('...Done')

    return df

The functions defined above are executed in a loop for each file individually

In [None]:
for index, file in enumerate(edf_files, start=1):
    print('-------------------------------------------------')
    print(f'Handling {file}... EEG {index}/{len(edf_files)}')

    # Release the previous recording before the next one is loaded,
    # otherwise two recordings are held in memory at the same time
    raw = df = None
    gc.collect()

    try:
        raw = loading(data_path, file, loading=True)
        # The reason (no seizures, read error) has already been printed by
        # the loader; a skipped recording is not an error
        if raw is None:
            continue

        df = preprocessing(raw)
        output_filename = os.path.splitext(file)[0] + '_epoched.feather'
        df_conversion(df, output_path, output_filename)
    except Exception as e:
        print(f'Error with {file}: {e}')

### Large file handling

In [None]:
# Check for missing epoch files
folder_raw = data_path

# Recordings that contain seizures and should each have an epoched export
seizure_files = [
 'EXP_EMC_BR2_0000210#01.edf','EXP_EMC_BR3_0000495#01.edf','EXP_EMC_BR3_0000520#01.edf',
 'EXP_EMC_M15_0000099#01.edf','EXP_EMC_M15_0000106#01.edf','EXP_EMC_M15_0000120#01.edf',
 'EXP_EMC_M15_0000139#01.edf','EXP_EMC_M15_0000151#01.edf','EXP_EMC_M15_0000241#01.edf',
 'EXP_EMC_M15_0000311#01.edf','EXP_EMC_M15_0000333#01.edf','EXP_EMC_M15_0000434#01.edf',
 'EXP_EMC_M15_0000469#01.edf','EXP_EMC_M15_0000556#01.edf','EXP_EMC_M15_0000568#01.edf',
 'EXP_EMC_M15_0000574#01.edf','EXP_EMC_M15_0000615#01.edf','EXP_EMC_M16_0000070#01.edf',
 'EXP_EMC_M16_0000109#01.edf','EXP_EMC_M16_0000110#01.edf','EXP_EMC_M16_0000111#01.edf',
 'EXP_EMC_M16_0000153#01.edf','EXP_EMC_M16_0000168#01.edf','EXP_EMC_M16_0000232#01.edf',
 'EXP_EMC_M16_0000233#01.edf','EXP_EMC_M16_0000243#01.edf','EXP_EMC_M16_0000244#01.edf',
 'EXP_EMC_M16_0000351#01.edf','EXP_EMC_M16_0000541#01.edf','EXP_EMC_M16_0000542#01.edf',
 'EXP_EMC_M16_0000555#01.edf','EXP_EMC_M16_0000579#01.edf','EXP_EMC_M16_0000580#01.edf',
 'EXP_EMC_M16_0000583#01.edf','EXP_EMC_M16_0000600#01.edf','EXP_EMC_M18_0001180#01.edf',
 'EXP_EMC_M6b_0000513#01.edf'
 ]

# Exports are written to output_path by the loop over edf_files
files_epoched = [file for file in os.listdir(output_path) if file.endswith('_epoched.feather')]

# Same naming rule as the export: extension replaced by '_epoched.feather'
missing_files = [
    file for file in seizure_files
    if os.path.splitext(file)[0] + '_epoched.feather' not in files_epoched
]
print(f'Missing epoched seizure files: {len(missing_files)}')
for file in missing_files:
    print(file)

In [None]:
# Large recordings: read without loading, cut the seizure-free (interictal)
# stretches into separate segments and epoch a random selection of them
for index, file in enumerate(missing_files, start=1):
    print('-------------------------------------------------')
    print(f'Handling {file}... EEG {index}/{len(missing_files)}')
    try:
        # Read only; these recordings are too large to load as a whole
        raw = loading(data_path, file, loading=False)
        if raw is None:
            # The loader has already printed why the file was not read
            continue

        seizures = seizure_annotations(raw)
        # NOTE(review): N is the number of seizures, not the number of
        # seizure epochs - confirm this is the intended sample size
        N = seizures.shape[0]
        if N == 0:
            print('No seizure annotations found, skipping')
            continue

        # Interictal intervals: recording start -> first onset, every seizure
        # end -> next onset, and last seizure end -> end of recording
        recording_end = (raw.last_samp / raw.info['sfreq']) - 1
        starts = np.concatenate(([0.0], seizures['end'].to_numpy(dtype=float)))
        ends = np.concatenate((seizures['onset'].to_numpy(dtype=float), [recording_end]))
        # Drop empty or negative gaps (seizure at t=0, back-to-back or
        # overlapping seizures); they cannot be cropped
        keep = ends > starts
        intervals_df = pd.DataFrame({'start': starts[keep], 'end': ends[keep]})
        intervals_df['label'] = 'interictal'

        onsets = intervals_df['start'].values
        durations = intervals_df['end'].values - intervals_df['start'].values
        descriptions = intervals_df['label'].values

        annotations = mne.Annotations(onsets, durations, descriptions)
        raw.set_annotations(annotations)
        annotations = raw.annotations[raw.annotations.description == 'interictal']

        # Keep 30% of the intervals, at least one when any exist; seeded like
        # the other random selections in this notebook
        n_annotations = min(len(annotations), max(1, int(len(annotations) * 0.3)))
        annotations = random.Random(42).sample(list(annotations), n_annotations)
        print(len(annotations), 'annotations selected')

        raws = raw.crop_by_annotations(annotations)
        print('Cropped by annotations')

        epochs = []
        for segment_index, segment in enumerate(raws, start=1):
            try:
                segment.load_data()
                preprocess(segment)

                # Keyword arguments: the third positional parameter of
                # make_fixed_length_epochs is `preload`, not `overlap`
                epochs.append(
                    mne.make_fixed_length_epochs(segment, duration=10.0, overlap=0.0)
                )
                print(f'Loaded, preprocessed, and epoched segment {segment_index}/{len(raws)}')

            except Exception as e:
                # Typically a segment shorter than one epoch; show the real
                # cause so other failures are not mislabeled
                print(f'Segment {segment_index} skipped: {e}')

            gc.collect()

        del raws
        gc.collect()

        if not epochs:
            raise ValueError('no interictal segment could be epoched')

        epochs = mne.concatenate_epochs(epochs)
        print('Concatenated epochs')
        df = epochs.to_data_frame()
        print('Created DataFrame')
        df.drop(columns=['condition'], inplace=True)
        # Every epoch in this cell comes from a seizure-free interval
        df['seizure'] = 0

        del epochs
        gc.collect()

        df = reshape_df(df)
        # sample() refuses to draw more rows than exist, so cap the request
        df = df.sample(n=min(N, len(df)), random_state=42).reset_index(drop=True)

        # NOTE(review): only non-seizure epochs are written here, under the
        # regular '_epoched.feather' name - confirm where the seizure epochs
        # of these large recordings are exported
        output_filename = os.path.splitext(file)[0] + '_epoched.feather'
        df_conversion(df, output_path, output_filename)

        del df
        gc.collect()

    except Exception as e:
        print(f'Error with {file}: {e}')