In [1]:
from source import data_import
from source.constants import CHANNELS
import pandas as pd
from warnings import warn

In [2]:
from source.data_import import *

# how to handle epoch at beginning and end??

root_dir = DATA_ROOT
nr_segments = 20
segment_duration = 10

def ictal_segmentation(df, epoch=0, duration_segment=10, nr_segments = 20):
    '''segment for ictal intervals. it adds as many ictal segments as available and fills them up with pre-ictal segments to reach the total nr of segments.
    
    if end of data is reached before the end of seizure, only this part of seizure will be used.
    
    '''

    # how to validate nr_segments is well chosen? 
    df['segment_id'] = 0
    ictal_epochs = []
    # for time, seizure in (s_df[s_df == True]).items():
    for ep_start, seizure in df.loc[df['seizure_start'] == True, 'seizure_start'].items():
        ictal_segments = []
        segment_id = 1
        seg_start = ep_start
        while True:
            # get previous duration_segment duration in seconds
            seg_end = seg_start + (pd.Timedelta(seconds=duration_segment)) - df.index.freq
            if seg_end > df.index[-1]:
                print(f"reached end of data. epoch number: {epoch}, number of segments completed: {segment_id-1}")
                break
            
            ictal_seg = df.loc[seg_start: seg_end, :].copy()
            ictal_seg['segment_id'] = segment_id
        
            # check if is_seizure in it AND if all are before_seizure
            # s_int.loc['10S', 'is_seizure'] = True  # check
            if not all(ictal_seg['is_seizure']):
                print(f"seizure end reached. epoch number: {epoch}, number of segments completed: {segment_id-1}")
                break # get out of while loop
            
            ictal_segments.append(ictal_seg)
            seg_start =  seg_end + df.index.freq
            segment_id += 1
        
        print(f"adding {segment_id-1} ictal segments to epoch {epoch}.") # segment_id is already incremented, but starts with 0: x-1+1=x
        ictal_epoch = pd.concat(ictal_segments)
        
        ### get missing (pre-ictal) segments before ictal
        segments_missing = nr_segments - (segment_id - 1)
        # calc start / end: 
        preictal_start = ep_start - segments_missing * pd.Timedelta(seconds=duration_segment)
        preictal_end = ep_start - df.index.freq

        # validate target epoch is within dataframe
        if preictal_start < df.index[0]:
            print(f"preictal interval is not fully covered by datafile! skipping ...")
            break
        # get_signal: 
        preictal_ep = df.loc[preictal_start:preictal_end, :].copy()
        # validate signal is not ictal
        if any(preictal_ep['is_seizure']):
            print(f"overlapping ictal interval for this episode. skipping ...")
            break
        
        # merge with ictal part of interval
        print(f"adding {segments_missing} pre-ictal segments to epoch {epoch}.")
        # full_epoch = preictal_ep.join(ictal_epoch)
        full_epoch = pd.concat([preictal_ep, ictal_epoch])
        # set segment_ids
        full_epoch['segment_id'] = [i for i in range(nr_segments) for _ in range(int(len(full_epoch)/nr_segments))]
        full_epoch['epoch'] = epoch
        epoch += 1

        # add full epoch to epoch list
        ictal_epochs.append(full_epoch)
    
    return ictal_epochs

def inter_segmentation(df, epoch=0, duration_segment=10, nr_segments=20):
    '''segment function that just adds an "epoch" from the middle of a seizure free datafile.'''
    # from 20 min in the dataframe, get the segments
    start = df.index[-1] // 2
    segments = df.loc[start:start + pd.Timedelta(seconds = nr_segments * duration_segment), :].copy()
    # add segment numbers and epoch id
    segments['epoch'] = epoch
    segments['segment_id'] = [i for i in range(nr_segments) for _ in range(int(len(segments)/nr_segments))]
    return segments

patient_ids = [1]
patient_list = get_patient_list(patient_ids=patient_ids)

epoch_counter = 0
df_patients = []
for patient in patient_list:

    summary = get_patient_summary(patient=patient)

    session_list = sorted([s.name for s in (root_dir / patient).rglob('*.edf')])
    session_dfs = []

    for session in session_list:
        df, is_seizure = return_pandas_df(patient=patient, session=session, summary=summary)
        df['seizure_start'] = df['is_seizure'] & ~df['is_seizure'].shift(fill_value=False)
        if is_seizure:
            # session_dfs.append(ictal_segmentation(df, epoch = epoch_counter))
            session_dfs.extend(ictal_segmentation(
                df, 
                epoch = epoch_counter, 
                duration_segment=segment_duration, 
                nr_segments=nr_segments)
            )
        else:
            session_dfs.append(inter_segmentation(
                df, 
                epoch = epoch_counter, 
                duration_segment=segment_duration, 
                nr_segments=nr_segments)
            )

        epoch_counter = session_dfs[-1]['epoch'].max() + 1
    
    df_patients.extend(session_dfs)

output = pd.concat(df_patients)
output.shape



chb01_01.edf was import but not resampled 256Hz.
chb01_02.edf was import but not resampled 256Hz.
chb01_03.edf was import but not resampled 256Hz.
chb01_03.edf seizure and buffer was labeled
seizure end reached. epoch number: 2, number of segments completed: 8
adding 8 ictal segments to epoch 2.
adding 12 pre-ictal segments to epoch 2.
chb01_04.edf was import but not resampled 256Hz.
chb01_04.edf seizure and buffer was labeled
seizure end reached. epoch number: 3, number of segments completed: 5
adding 5 ictal segments to epoch 3.
adding 15 pre-ictal segments to epoch 3.
chb01_05.edf was import but not resampled 256Hz.
chb01_06.edf was import but not resampled 256Hz.
chb01_07.edf was import but not resampled 256Hz.
chb01_08.edf was import but not resampled 256Hz.
chb01_09.edf was import but not resampled 256Hz.
chb01_10.edf was import but not resampled 256Hz.
chb01_11.edf was import but not resampled 256Hz.
chb01_12.edf was import but not resampled 256Hz.
chb01_13.edf was import but no

(1075200, 28)

In [9]:
output.tail(100)

channel,FP1-F7,F7-T7,T7-P7,P7-O1,FP1-F3,F3-C3,C3-P3,P3-O1,FP2-F4,F4-C4,...,P7-T7,T7-FT9,FT9-FT10,FT10-T8,T8-P8-1,is_seizure,before_seizure,seizure_start,epoch,segment_id
0 days 00:31:39.609375,-4.884005,67.008547,28.717949,34.188034,35.750916,1.758242,19.340659,68.180708,34.578755,1.758242,...,-28.327228,-31.843712,-2.148962,89.670330,19.731380,False,False,False,41,19
0 days 00:31:39.613281250,-0.586081,59.584860,24.420024,34.188034,37.313797,2.539683,15.042735,63.101343,53.724054,-6.837607,...,-24.029304,-26.373626,-13.870574,90.061050,24.029304,False,False,False,41,19
0 days 00:31:39.617187500,19.731380,56.849817,20.903541,31.062271,65.445665,-1.367521,10.354090,55.286935,40.830281,1.758242,...,-20.512821,-25.982906,-23.247863,91.233211,26.764347,False,False,False,41,19
0 days 00:31:39.621093750,-11.135531,59.584860,18.949939,28.327228,29.499389,3.711844,8.791209,52.942613,42.002442,-16.214896,...,-18.559219,-27.936508,-22.857143,90.451770,25.201465,False,False,False,41,19
0 days 00:31:39.625000,-33.015873,64.273504,13.089133,23.247863,-2.930403,9.963370,10.354090,49.816850,38.485958,-30.671551,...,-12.698413,-25.982906,-24.029304,83.809524,26.373626,False,False,False,41,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0 days 00:31:39.980468750,-32.625153,-57.631258,-42.393162,-13.870574,-43.174603,-45.128205,-30.671551,-27.545788,-81.855922,-31.452991,...,42.783883,-9.572650,86.544567,-86.935287,-69.743590,False,False,False,41,19
0 days 00:31:39.984375,-9.181929,-107.252747,-29.499389,-18.949939,-83.028083,-18.559219,-27.936508,-35.360195,-62.319902,-46.300366,...,29.890110,8.791209,109.597070,-104.126984,-83.809524,False,False,False,41,19
0 days 00:31:39.988281250,18.168498,-124.835165,-27.155067,-21.294261,-57.631258,-29.890110,-26.764347,-40.439560,-38.095238,-30.280830,...,27.545788,31.452991,81.465201,-127.179487,-40.830281,False,False,False,41,19
0 days 00:31:39.992187500,2.148962,-140.463980,4.884005,-18.949939,-57.240537,-43.956044,-12.698413,-36.923077,-20.122100,-27.936508,...,-4.493284,80.293040,61.147741,-127.179487,-33.797314,False,False,False,41,19


In [10]:
output.epoch.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41])

In [None]:
data_import.save_pyarrow(output, file_name="preprocessed_seg")

In [18]:

pivot_df = output.pivot(index=['segment_id', 'epoch'], columns=CHANNELS, values=CHANNELS).unstack().fillna(0)


: 

In [5]:
freq = 256
assert output[['epoch']].value_counts()[0] == segment_duration * nr_segments * freq
assert all(element == output[['epoch']].value_counts()[0] for element in output[['epoch']].value_counts())
assert all(element == (output['epoch'].max() + 1) * segment_duration * freq for element in output['segment_id'].value_counts())

In [6]:
output[['epoch']].value_counts()

epoch
0        25600
31       25600
23       25600
24       25600
25       25600
26       25600
27       25600
28       25600
29       25600
30       25600
32       25600
1        25600
33       25600
34       25600
35       25600
36       25600
37       25600
38       25600
39       25600
40       25600
22       25600
21       25600
20       25600
19       25600
2        25600
3        25600
4        25600
5        25600
6        25600
7        25600
8        25600
9        25600
10       25600
11       25600
12       25600
13       25600
14       25600
15       25600
16       25600
17       25600
18       25600
41       25600
Name: count, dtype: int64

In [7]:
output['segment_id'].value_counts()

segment_id
0     53760
1     53760
18    53760
17    53760
16    53760
15    53760
14    53760
13    53760
12    53760
11    53760
10    53760
9     53760
8     53760
7     53760
6     53760
5     53760
4     53760
3     53760
2     53760
19    53760
Name: count, dtype: int64

In [4]:
p_df = data_import.load_pyarrow(file_name="preprocess_test")

/home/weasel/reps/ai-seizure-detectives/source/../data/preprocess_test.arrow was loaded.


In [5]:
from source.filter_eeg_channels import filter_eeg_channels

exclude_ranges=[[58, 62], [118, 122]]
fit_df = filter_eeg_channels(p_df, CHANNELS, fs=256, exclude_ranges=exclude_ranges, Q=30)
fit_df.head()

channel,F4-C4,F3-C3,FT9-FT10,FZ-CZ,F7-T7,FP2-F4,T8-P8-1,T8-P8-0,FP1-F3,CZ-PZ,before_seizure,is_seizure
0 days 00:00:00,-30.700675,2.79097,-6.246458,-12.625819,26.713574,-11.828398,-40.80133,-40.80133,-0.39871,58.078766,False,False
0 days 00:00:00.003906250,-10.319968,1.083164,-1.993871,-4.165896,9.228259,-3.894393,-13.759008,-13.759008,-0.002848,19.907382,False,False
0 days 00:00:00.007812500,-2.200027,0.394352,-0.305718,-0.799886,2.24748,-0.738115,-2.982458,-2.982458,0.147268,4.677136,False,False
0 days 00:00:00.011718750,-13.337263,1.442223,-2.883424,-5.217581,12.188861,-5.125166,-18.140539,-18.140539,0.009338,26.544704,False,False
0 days 00:00:00.015625,15.164319,-1.099635,3.174129,6.435789,-12.652102,5.960205,19.951526,19.951526,0.440694,-27.708167,False,False


In [6]:
### go through non-seizure data (before_seizure)
# extract seizure-starts
    # for each seizure start
        # get previous 20 seconds
        # check if is_seizure in it AND if all are before_seizure
            # yes: drop and continue to next seizure start
        # take 20 seconds:
            # mean of amplitude over time per channel
            # mean of power spectrum for frequency ranges per channel

### go through seizure data (is_seizure)
# take seizre-starts
    # for each seizure start
        # get next 20 seconds
        # check if all are is_seizure
            # no: drop and continue
        # take 20 seconds:
            # mean of amplitude over time per channel
            # mean of power spectrum for frequency ranges per channel

# out put: data resampled to 20sec with features

In [7]:
fit_df.info()

<class 'pandas.core.frame.DataFrame'>
TimedeltaIndex: 24484608 entries, 0 days 00:00:00 to 1 days 02:34:02.996093750
Freq: 3906250N
Data columns (total 12 columns):
 #   Column          Dtype  
---  ------          -----  
 0   F4-C4           float64
 1   F3-C3           float64
 2   FT9-FT10        float64
 3   FZ-CZ           float64
 4   F7-T7           float64
 5   FP2-F4          float64
 6   T8-P8-1         float64
 7   T8-P8-0         float64
 8   FP1-F3          float64
 9   CZ-PZ           float64
 10  before_seizure  bool   
 11  is_seizure      bool   
dtypes: bool(2), float64(10)
memory usage: 2.1 GB


In [8]:
fit_df['seizure_start'] = fit_df['is_seizure'] & ~fit_df['is_seizure'].shift(fill_value=False)
fit_df.loc[fit_df['seizure_start'] == True, 'seizure_start']

0 days 00:49:55    True
0 days 01:24:26    True
0 days 02:28:51    True
0 days 03:16:54    True
0 days 04:28:39    True
0 days 05:05:26    True
0 days 06:31:01    True
0 days 07:28:16    True
0 days 07:40:54    True
0 days 08:50:52    True
0 days 09:00:45    True
0 days 10:06:54    True
0 days 11:01:55    True
0 days 12:30:45    True
0 days 13:27:45    True
0 days 14:37:55    True
0 days 15:23:28    True
0 days 18:04:47    True
0 days 20:21:05    True
0 days 23:01:38    True
0 days 23:36:41    True
Name: seizure_start, dtype: bool

In [9]:
fit_df['seizure_id'] = fit_df['seizure_start'].cumsum()
fit_df['seizure_id'].tail()

1 days 02:34:02.980468750    21
1 days 02:34:02.984375       21
1 days 02:34:02.988281250    21
1 days 02:34:02.992187500    21
1 days 02:34:02.996093750    21
Freq: 3906250N, Name: seizure_id, dtype: int64

In [10]:
fit_df.index

TimedeltaIndex([          '0 days 00:00:00', '0 days 00:00:00.003906250',
                '0 days 00:00:00.007812500', '0 days 00:00:00.011718750',
                   '0 days 00:00:00.015625', '0 days 00:00:00.019531250',
                '0 days 00:00:00.023437500', '0 days 00:00:00.027343750',
                   '0 days 00:00:00.031250', '0 days 00:00:00.035156250',
                ...
                '1 days 02:34:02.960937500', '1 days 02:34:02.964843750',
                   '1 days 02:34:02.968750', '1 days 02:34:02.972656250',
                '1 days 02:34:02.976562500', '1 days 02:34:02.980468750',
                   '1 days 02:34:02.984375', '1 days 02:34:02.988281250',
                '1 days 02:34:02.992187500', '1 days 02:34:02.996093750'],
               dtype='timedelta64[ns]', length=24484608, freq='3906250N')

In [11]:
# extract seizure-stab4seizure_intervals
    # for each seizure start

fit_df['segment_id'] = 0
# sdafaf
b4_seizure = []
# for time, seizure in (s_df[s_df == True]).items():
for time, seizure in fit_df.loc[fit_df['seizure_start'] == True, 'seizure_start'].items():
    intervals = []
    segment_id = 15
    while True:
        # get previous 20 seconds
        int_end = time - fit_df.index.freq
        int_start = time - (pd.Timedelta(seconds=20))
        
        if int_start < pd.Timedelta('0S'):
            raise('this should probably not happen.')
        b4s_int = fit_df.loc[int_start:int_end, :].copy()
        b4s_int['segment_id'] = segment_id
    
        # check if is_seizure in it AND if all are before_seizure
        # s_int.loc['10S', 'is_seizure'] = True  # check
        if any(b4s_int['is_seizure']) or not all(b4s_int['before_seizure']):
            print(f"either previous seizure found or start of before seizure interval reached. seizure number: {b4s_int['seizure_id'].values[0]}")
            break # get out of while loop
            
        intervals.insert(0, b4s_int)
        time = int_start
        segment_id -= 1
    
    b4_seizure.extend(intervals)

b4seizure_df = pd.concat(b4_seizure)

# b4seizure_df.info()


either previous seizure found or start of before seizure interval reached. seizure number: 0
either previous seizure found or start of before seizure interval reached. seizure number: 1
either previous seizure found or start of before seizure interval reached. seizure number: 2
either previous seizure found or start of before seizure interval reached. seizure number: 3
either previous seizure found or start of before seizure interval reached. seizure number: 4
either previous seizure found or start of before seizure interval reached. seizure number: 5
either previous seizure found or start of before seizure interval reached. seizure number: 6
either previous seizure found or start of before seizure interval reached. seizure number: 7
either previous seizure found or start of before seizure interval reached. seizure number: 8
either previous seizure found or start of before seizure interval reached. seizure number: 9
either previous seizure found or start of before seizure interval reac

In [12]:
b4seizure_df.shape

(1566720, 15)

### Seizures

In [13]:
fit_df['segment_id'] = 0

post_seizure = []
# for time, seizure in (s_df[s_df == True]).items():
for time, seizure in fit_df.loc[fit_df['seizure_start'] == True, 'seizure_start'].items():
    seizure_length = fit_df['seizure_start']
    intervals = []
    segment_id = 1
    while True:
        # get previous 20 seconds
        int_start = time  
        int_end = int_start + (pd.Timedelta(seconds=20))
        if int_start > fit_df.index[-1]:
            print('reached end of data.')
            break
        
        post_int = fit_df.loc[int_start: int_end, :].copy()
        post_int['segment_id'] = segment_id
    
        # check if is_seizure in it AND if all are before_seizure
        # s_int.loc['10S', 'is_seizure'] = True  # check
        if not all(post_int['is_seizure']):
            print(f"seizure end reached. seizure number: {post_int['seizure_id']}, number of segments completed: {segment_id-1}")
            break # get out of while loop

        intervals.append(post_int)
        time =  int_end + fit_df.index.freq
        segment_id += 1
    post_seizure.extend(intervals)

post_seizure_df = pd.concat(post_seizure)

seizure end reached. seizure number: 0 days 00:50:35.007812500    1
0 days 00:50:35.011718750    1
0 days 00:50:35.015625       1
0 days 00:50:35.019531250    1
0 days 00:50:35.023437500    1
                            ..
0 days 00:50:54.992187500    1
0 days 00:50:54.996093750    1
0 days 00:50:55              1
0 days 00:50:55.003906250    1
0 days 00:50:55.007812500    1
Freq: 3906250N, Name: seizure_id, Length: 5121, dtype: int64, number of segments completed: 2
seizure end reached. seizure number: 0 days 01:24:46.003906250    2
0 days 01:24:46.007812500    2
0 days 01:24:46.011718750    2
0 days 01:24:46.015625       2
0 days 01:24:46.019531250    2
                            ..
0 days 01:25:05.988281250    2
0 days 01:25:05.992187500    2
0 days 01:25:05.996093750    2
0 days 01:25:06              2
0 days 01:25:06.003906250    2
Freq: 3906250N, Name: seizure_id, Length: 5121, dtype: int64, number of segments completed: 1
seizure end reached. seizure number: 0 days 02:29:31.007

In [14]:
merged_df = pd.concat(objs=[b4seizure_df, post_seizure_df])
merged_df.shape

(1884222, 15)

In [15]:
data_import.save_pyarrow(merged_df, file_name="preprocessed_df_new")

/home/weasel/reps/ai-seizure-detectives/source/../data/preprocessed_df_new.arrow was successfully written.
