In [22]:
import numpy as np
import pandas as pd
import os
import pickle

In [None]:
# Sketch of code for splitting data into training, validation, & test
# This was turned into the script split_testing_valid_training.py

In [11]:
# Import list of seizure times
sub='1096'

# First 3 clinical srs, are training, next 2 clinical szrs validation, everything else is testing
df_fname='/Users/davidgroppe/Dropbox/TWH_INFO/EU_METADATA/szr_on_off_FR_'+sub+'.pkl'
szr_times_df=pickle.load(open(df_fname,'rb'))
szr_times_df.tail()

Unnamed: 0,SzrOffsetSec,SzrOffsetStr,SzrOnsetSec,SzrOnsetStr,SzrType
4,299067400.0,23.06.2009 10:16:13.383789,299067200.0,23.06.2009 10:13:36.033203,Clinical
5,299072800.0,23.06.2009 11:47:03.908203,299072700.0,23.06.2009 11:45:01.008789,Clinical
6,299082500.0,23.06.2009 14:27:52.616211,299082400.0,23.06.2009 14:25:52.366211,Clinical
7,299088700.0,23.06.2009 16:11:57.724609,299088600.0,23.06.2009 16:09:33.741211,Clinical
8,299167300.0,24.06.2009 14:01:43.983398,299167200.0,24.06.2009 14:00:30.941406,Clinical


In [9]:
clin_ids=szr_times_df[szr_times_df['SzrType']=='Clinical'].index.tolist()
#df[df['BoolCol'] == True].index.tolist()

In [10]:
print(clin_ids)

[0, 1, 2, 3, 4, 5, 6, 7, 8]


In [47]:
# Get border between training-validation data and validation-testing data
# Training data contain the first 3 clinical szrs
# Validation data contain the next 2 clinical szrs
# Testing data consists of the rest
train_szr_offset=szr_times_df.loc[2].SzrOffsetSec
valid_szr_offset=szr_times_df.loc[3].SzrOffsetSec
valid_szr_onset=szr_times_df.loc[3].SzrOnsetSec
test_szr_onset=szr_times_df.loc[5].SzrOnsetSec

train_valid_border=int(np.round( (train_szr_offset+valid_szr_onset)/2 ))
print('Train-Valid border %d sec' % train_valid_border)
valid_test_border=int(np.round( (valid_szr_offset+test_szr_onset)/2 ))
print('Valid-Test border %d sec' % valid_test_border)

Train-Valid border 298990031 sec
Valid-Test border 299035376 sec


In [28]:
# Import mat file files to figure out which ones should be used in which data split
# Also keep track of which ones contain seizures (clinical or subclinical)
csv_fname='/Users/davidgroppe/Dropbox/TWH_INFO/EU_METADATA/data_on_off_FR_'+sub+'.csv'
mat_df=pd.read_csv(csv_fname)
mat_df=mat_df.drop('Unnamed: 0',1)
mat_df.head()

Unnamed: 0,DurationSec,HeaderFname,StartSec,StartStr,StopSec
0,3600.0,109600102_0000.head,298641241.0,2009-06-18 11:54:01.000,298644841.0
1,3600.0,109600102_0001.head,298644842.0,2009-06-18 12:54:02.000,298648442.0
2,3600.0,109600102_0002.head,298648443.0,2009-06-18 13:54:03.000,298652043.0
3,3600.0,109600102_0003.head,298652045.0,2009-06-18 14:54:05.000,298655645.0
4,3600.0,109600102_0004.head,298655646.0,2009-06-18 15:54:06.000,298659246.0


In [34]:
mat_df.tail()

Unnamed: 0,DurationSec,HeaderFname,StartSec,StartStr,StopSec
160,3600.0,109600102_0160.head,299213094.0,2009-06-25 02:44:54.000,299216694.0
161,3600.0,109600102_0161.head,299216695.0,2009-06-25 03:44:55.000,299220295.0
162,3600.0,109600102_0162.head,299220297.0,2009-06-25 04:44:57.000,299223897.0
163,3600.0,109600102_0163.head,299223898.0,2009-06-25 05:44:58.000,299227498.0
164,385.0,109600102_0164.head,299227503.0,2009-06-25 06:45:03.000,299227888.0


In [48]:
train_files=list()
train_szr_files=list()
valid_files=list()
valid_szr_files=list()
test_files=list()
test_szr_files=list()
total_szrs=0
for row_id in range(mat_df.shape[0]):
    # See if the file contains szrs
    # Szrs with onsets after file onset
    post_ids=szr_times_df[szr_times_df['SzrOnsetSec']>=mat_df.iloc[row_id,2]].index.tolist()
    # Szrs with onsets before file offset
    pre_ids=szr_times_df[szr_times_df['SzrOnsetSec']<=mat_df.iloc[row_id,4]].index.tolist()
    n_szrs_in_file=len(np.intersect1d(post_ids,pre_ids))
    total_szrs+=n_szrs_in_file
    
    if mat_df.iloc[row_id,2]>test_szr_onset:
        # If file onset is after the valid-test border, make it test data
        test_files.append(mat_df.iloc[row_id,1].split('.')[0])
        if n_szrs_in_file>0:
            test_szr_files.append(test_files[-1])
    elif mat_df.iloc[row_id,2]>valid_szr_onset:
        # If file onset is after the train-valid border, make it validation data
        valid_files.append(mat_df.iloc[row_id,1].split('.')[0])
        if n_szrs_in_file>0:
            valid_szr_files.append(valid_files[-1])
    else:
        # Otherwise, make it training data
        train_files.append(mat_df.iloc[row_id,1].split('.')[0])
        if n_szrs_in_file>0:
            train_szr_files.append(train_files[-1])

print('%d training files' % (len(train_files)))
print('%d validation files' % (len(valid_files)))
print('%d testing files' % (len(test_files)))
print('%d total szrs found' % total_szrs)
print('There should be %d total szrs' % szr_times_df.shape[0])

101 training files
21 validation files
43 testing files
31 total szrs found
There should be 31 total szrs


In [49]:
print(train_szr_files)
print(valid_szr_files)
print(test_szr_files)

['109600102_0001', '109600102_0002', '109600102_0004', '109600102_0007', '109600102_0011', '109600102_0019', '109600102_0036', '109600102_0059', '109600102_0063', '109600102_0065', '109600102_0069', '109600102_0072', '109600102_0074', '109600102_0079', '109600102_0080', '109600102_0081', '109600102_0084', '109600102_0087', '109600102_0090', '109600102_0092', '109600102_0095', '109600102_0100']
['109600102_0119', '109600102_0121']
['109600102_0123', '109600102_0125', '109600102_0147']


In [54]:
out_fname='/Users/davidgroppe/Dropbox/TWH_INFO/EU_METADATA/data_splits_FR_'+sub+'.pkl'
split_dict={'train_szr_files': train_szr_files,'test_szr_files': test_szr_files,
            'valid_szr_files': valid_szr_files,'train_files': train_files,
            'test_files': test_files, 'valid_files': valid_files}
print('Saving lists of training, testing, & validation files to %s' % out_fname)
pickle.dump(split_dict,open(out_fname,'wb'))

Saving lists of training, testing, & validation files to /Users/davidgroppe/Dropbox/TWH_INFO/EU_METADATA/data_splits_FR_1096.pkl
