## Pre-Processing

In [5]:
from source import data_import
from source.constants import CHANNELS, DEFAULT_PATIENTS
import numpy as np
import pandas as pd
# import modin.pandas as pd

from source.filter_eeg_channels import filter_eeg_channels
from source.calculate_mean_psd import calculate_mean_psd
from source.constants import CHANNELS, FREQUENCY_RANGES
                                            
from sklearn.preprocessing import StandardScaler, minmax_scale, MinMaxScaler

In [6]:
def load_file(file_name):
    if file_name.endswith('.arrow'):
        return data_import.load_pyarrow(file_name=file_name)
    elif file_name.endswith('.npy'):
        print('loading npy')
        return np.load('data/' + file_name, allow_pickle=True)
    else:
        print('no filename provided, trying npy')
        try:
            np.load('data/' + file_name + '.npy', allow_pickle=True)
        except FileNotFoundError:
            print('no npy file found, trying arrow')
            return data_import.load_pyarrow(file_name=file_name + '.arrow')
        
def save_file(data, file_name):
    if file_name is None:
        print('skipping save file.')
    elif file_name.endswith('.arrow'):
        data_import.save_pyarrow(data, file_name=file_name)
    elif file_name.endswith('.npy'):
        print('saving npy')
        np.save('data/' + file_name, data)
    else:
        print('no filetype provided, saving as npy')
        np.save('data/' + file_name + '.npy', data)

In [7]:
## PARAMETERS ##
SKIP_DATA = False
data_filename = "processed_data.arrow" # data will be loaded/saved with this filename. Put None to skip saving the file

SKIP_FEATURES = False
feature_filename = 'extracted_features.npy' # data will be loaded/saved with this filename. Put None to skip saving the file

### DATA ###
# Load Patient Data #
patient_ids = DEFAULT_PATIENTS #DEFAULT_PATIENTS # use DEFAULT_PATIENTS for default patients selection
nr_segments=60
segment_duration=1
ictal_segmentation_foo=data_import.preictal_segmentation
interictal_segmentation_foo=data_import.inter_segmentation
channels=CHANNELS
seizure_offset=0

# filter #
exclude_ranges=[[58, 62], [118, 122]]

### FEATURES ###
target_colname = 'target'
PRED_INTERVAL = 6000 # how long should a segment count as preictal in seconds

window_size = 30 # Define the sequence_train window size

In [8]:
if not SKIP_DATA:
    # Load Patient Data
    p_df = data_import.load_segmented_data(patient_ids=patient_ids,
                                            nr_segments=nr_segments,
                                            segment_duration=segment_duration,
                                            ictal_segmentation_foo=data_import.preictal_segmentation,
                                            interictal_segmentation_foo=data_import.inter_segmentation,
                                            channels=channels,
                                            seizure_offset=seizure_offset
                                            )
    
    # Filter
    fit_df = filter_eeg_channels(p_df, CHANNELS, fs=256, exclude_ranges=exclude_ranges, Q=30)
    pd_toconcat = p_df[['epoch', 'segment_id']]
    fit_df = pd.concat(objs=[fit_df, pd_toconcat], axis =1)
    save_file(data=fit_df, file_name=data_filename)
else:
    fit_df = load_file(data_filename)
    fit_df = pd.DataFrame(fit_df)
fit_df.head

chb01_01.edf was import but not resampled 256Hz.
chb01_02.edf was import but not resampled 256Hz.
chb01_03.edf was import but not resampled 256Hz.
chb01_03.edf seizure and buffer was labeled
chb01_04.edf was import but not resampled 256Hz.
chb01_04.edf seizure and buffer was labeled
chb01_05.edf was import but not resampled 256Hz.
chb01_06.edf was import but not resampled 256Hz.
chb01_07.edf was import but not resampled 256Hz.
chb01_08.edf was import but not resampled 256Hz.
chb01_09.edf was import but not resampled 256Hz.
chb01_10.edf was import but not resampled 256Hz.
chb01_11.edf was import but not resampled 256Hz.
chb01_12.edf was import but not resampled 256Hz.
chb01_13.edf was import but not resampled 256Hz.
chb01_14.edf was import but not resampled 256Hz.
chb01_15.edf was import but not resampled 256Hz.
chb01_15.edf seizure and buffer was labeled
chb01_16.edf was import but not resampled 256Hz.
chb01_16.edf seizure and buffer was labeled
chb01_17.edf was import but not resample

<bound method NDFrame.head of channel                        F4-C4      F3-C3   FT9-FT10      FZ-CZ  \
0 days 00:30:00            16.878726  20.334213  -3.322584  17.941953   
0 days 00:30:00.003906250  22.891349  24.865283  -8.175141  23.784967   
0 days 00:30:00.007812500  22.162881  25.360125  -8.902114  25.615925   
0 days 00:30:00.011718750  29.573426  33.953147 -14.930178  33.038895   
0 days 00:30:00.015625     21.292400  18.163275 -11.955401  21.366748   
...                              ...        ...        ...        ...   
0 days 00:30:59.980468750  14.606619  38.625778 -33.971210 -47.037566   
0 days 00:30:59.984375    -14.157892  35.028692 -12.626489  19.119589   
0 days 00:30:59.988281250  -6.142158  29.984239   6.891542 -30.945329   
0 days 00:30:59.992187500  -7.461311  31.703687  11.110258  -0.631780   
0 days 00:30:59.996093750 -25.057236  32.478871 -31.257322  45.032384   

channel                        F7-T7     FP2-F4    T8-P8-1    T8-P8-0  \
0 days 00:30:00     

## Feature Extraction

In [9]:
### aggregate Functions for mean psd:
delta = lambda x: calculate_mean_psd(x, frequency_ranges={'Delta' : FREQUENCY_RANGES['Delta']})[x.name]['Delta']
theta = lambda x: calculate_mean_psd(x, frequency_ranges={'Theta' : FREQUENCY_RANGES['Theta']})[x.name]['Theta']
gamma = lambda x: calculate_mean_psd(x, frequency_ranges={'Gamma': FREQUENCY_RANGES['Gamma']})[x.name]['Gamma']

delta_agg = pd.NamedAgg(column='delta', aggfunc=delta)
theta_agg = pd.NamedAgg(column='theta', aggfunc=theta)
gamma_agg = pd.NamedAgg(column='gamma', aggfunc=gamma)

### aggregate mean features:
abs_mean = lambda x: x.apply(abs).mean()
abs_mean_agg = pd.NamedAgg(column='abs_mean', aggfunc=abs_mean)

### aggregate Functions for target:
target_foo = lambda x, pred_interval=PRED_INTERVAL: 0 < x.dt.total_seconds().min() < pred_interval

In [10]:
if not SKIP_FEATURES:
    # aggregate features
    df_features = fit_df.groupby(['epoch', 'segment_id']).agg(
        {C:['std',
            'var',
            #'mean',
            abs_mean_agg,
            delta_agg,
            theta_agg,
            gamma_agg
            ] for C in CHANNELS} | 
        {target_colname: [target_foo]} 
        ) 

    # joining column names with agg functions, but leaving target column
    df_features.columns = ['_'.join(col).strip() for col in df_features.columns.values if target_colname != col[0]] + [target_colname]
    df_features.reset_index(inplace=True)

    # Scaling the features
    num_features= df_features.drop(['epoch','segment_id','target'],axis =1)
    scaler = StandardScaler()
    num_features_scaled = scaler.fit_transform(num_features)

    original_array = np.array(df_features)
    target = original_array[:,-1]
    target = target[:, np.newaxis]
    segseiz_column = original_array[:, 0:2] ## epoch and segment_id
    array_all_scaled = np.concatenate((segseiz_column, num_features_scaled, target), axis=1)
    array_all_scaled.shape

    ### Reshape Array ###
    original_array =np.array(df_features)

    # Extract the epoch column
    epoch_column = original_array[:, 0]

    # Determine the number of epochs (assuming epochs are from 1 to number of segments)
    num_epochs = df_features.epoch.unique()[-1]

    # Determine the number of segments for each epoch
    num_segments = len(df_features.segment_id.unique())  # Assuming there are 30 segments for each epoch

    # Initialize an empty 3D array
    reshaped_array = np.empty((num_epochs,num_segments, array_all_scaled.shape[1]))

    # Reshape the data for each epoch and insert it into the 3D array
    for epoch in range(num_epochs):
        start_idx = epoch * num_segments
        end_idx = (epoch + 1) * num_segments
        reshaped_array[epoch,:, :] = array_all_scaled[start_idx:end_idx,:]

    ### Create Sequence Trains ###
    # Assuming original_data_array has dimensions (batch, sequence, features)
    num_batches, num_sequences, num_features = reshaped_array.shape

    # Calculate the number of augmented batches
    num_augmented_batches = num_sequences - window_size + 1
    print(f"num_augmented_batches: {num_augmented_batches}")

    # Create an empty array for the augmented data
    data = np.zeros((num_batches * num_augmented_batches, window_size, num_features))
    print(f"data.shape: {data.shape}")

    # Iterate through batches
    for batch_idx in range(num_batches):
        # Iterate through sequences to create augmented batches
        for seq_idx in range(num_augmented_batches):
            # Copy the window of data
            data[batch_idx * num_augmented_batches + seq_idx, :, :] = reshaped_array[batch_idx, seq_idx:seq_idx + window_size, :]

    save_file(data=data, file_name=feature_filename)
else:
    data = load_file(feature_filename)

data.shape

loading npy


(1271, 30, 63)

In [12]:
%run model_rnn.ipynb

data already loaded in parent notebook
train shape (992, 30, 62)
test shape (279, 30, 62)


KeyError: 1

KeyError: 1