In [1]:
import os
import numpy as np
import scipy as sc
import pandas as pd
import tensorflow as tf

## Importing data

We first import the grades (classes) for each file:

In [2]:
grades_df = pd.read_csv("../data/eeg_grades.csv")
grades_df

Unnamed: 0,file_ID,baby_ID,epoch_number,grade
0,ID01_epoch1,ID01,1,
1,ID01_epoch2,ID01,2,
2,ID02_epoch1,ID02,1,1.0
3,ID02_epoch2,ID02,2,2.0
4,ID02_epoch3,ID02,3,1.0
...,...,...,...,...
164,ID52_epoch1,ID52,1,
165,ID52_epoch2,ID52,2,
166,ID52_epoch3,ID52,3,
167,ID52_epoch4,ID52,4,


Now we import the `.mat` files containing the preprocessed data (the preprocessing is done in MATLAB; see the `process_qtfd.m` script).

Each `.mat` file contains variables:
- `eeg_sig` : processed eeg signal before qtfd, with shape (t, channels, segments). 
	- *t* is the number of points in a 5 min segment, downsampled to 64 Hz. 
	- *channels* is the number of bipolar channels. 
	- *segments* is the number of 5 min segments (with 50% overlap) extracted from the original 1 h recording.
- `qtfd` : output of the qTFD transform (MATLAB function `full_qtfd()`, written by J. O'Toole), with shape (256, 128, 8, 23)
- `qtfd_log` is the log of the absolute of `qtfd` (final preprocessing step)

In [6]:
from scipy.io import loadmat

file_ext = ".mat"
data_basepath = "../data/MAT_format/"
data_files = list(grades_df['file_ID'])	

In [7]:
test = loadmat('../data/MAT_format/ID01_epoch1.mat')
print('data file: ' + str(test.keys()))
print('eeg_sig shape: ' + str(test['eeg_sig'].shape))
print('qtfd_log shape: ' + str(test['qtfd_log'].shape))

data file: dict_keys(['__header__', '__version__', '__globals__', 'eeg_sig', 'qtfd', 'qtfd_log'])
eeg_sig shape: (19200, 8, 23)
qtfd_log shape: (256, 128, 8, 23)


`qtfd_log` is the input for the CNN. It is a (256, 128) matrix for each of the 8 channels divided into 23 segments of 5 min (50% overlap). We save that into `data_qtfd`, a dictionary with the filename as key.

In [8]:
# Build {file_ID: qtfd_log array}. Only `qtfd_log` is requested from each
# .mat file, so the other stored variables (`eeg_sig`, `qtfd`) are not
# loaded into memory just to be thrown away.
data_qtfd = dict()
for fname in data_files:
	data_mat = loadmat(data_basepath + fname + file_ext, variable_names=['qtfd_log'])
	data_qtfd[fname] = np.array(data_mat['qtfd_log'])

In [9]:
# store the qtfd data we just loaded
%store data_qtfd

Stored 'data_qtfd' (dict)


In [2]:
# reload data_qtfd in case the notebook kernel is restarted
%store -r data_qtfd

In [10]:
fname = 'ID06_epoch1'
data_qtfd[fname].shape

(256, 128, 8, 23)

For the training and testing of the model, we drop the files that do not have a grade.

In [18]:
# Keep only the files that have a grade. `subset` limits the NaN check to the
# `grade` column, so a missing value in another column cannot silently drop
# a graded file.
train_test_grades_df = grades_df.dropna(subset=['grade'])
train_test_grades_df = train_test_grades_df.set_index('file_ID')
train_test_files = list(train_test_grades_df.index)

Final preprocessing step: remove the 0-2 Hz and 30-32 Hz components.

In [11]:
# Switch for the column chopping applied by `get_qtfd`.
chop_f = True

# Size of one input matrix: 112 columns when chopping is on, otherwise the
# first two dimensions of a loaded array.
qtfd_shape = (256,112) if chop_f else list(data_qtfd.values())[1].shape[0:2]  #(256, 128)


def get_qtfd(fname, ch, seg):
	"""Return the 2-D matrix stored for one channel and one segment of a file.

	fname -- key into the module-level `data_qtfd` dictionary
	ch    -- index along the third axis of the stored 4-D array
	seg   -- index along the fourth axis of the stored 4-D array

	When the module-level flag `chop_f` is true, only columns 7..118 of the
	second axis (112 columns) are returned; otherwise every column is kept.
	"""
	# NOTE(review): the notebook text says the 0-2 Hz and 30-32 Hz components
	# are removed; confirm that 7:119 (rather than e.g. 8:120) is the intended
	# column range for that.
	cols = slice(7, 119) if chop_f else slice(None)
	return data_qtfd[fname][:, cols, ch, seg]

In [14]:
# Read the segment / channel counts from one loaded array (the second entry).
_ref_shape = list(data_qtfd.values())[1].shape
nsegm = _ref_shape[3]
nch = _ref_shape[2]
nfiles = len(train_test_grades_df.index)
n_total_inputs = nsegm*nch*nfiles

# One row per (file, segment, channel) combination; filled in by a later cell.
x_shape = (n_total_inputs, qtfd_shape[0], qtfd_shape[1])
y_shape = (n_total_inputs, 1)
train_test_x = np.empty(x_shape)
train_test_y = np.empty(y_shape)

print('shape input train_test_x: ', train_test_x.shape)
print('shape target train_test_y: ', train_test_y.shape)


shape input train_test_x:  (19320, 256, 112)
shape target train_test_y:  (19320, 1)


In [19]:
# Fill the pre-allocated arrays. The row index flattens (file, segment,
# channel): channel varies fastest, then segment, then file.
for i in range(nfiles):
	fname = train_test_files[i]
	file_grade = train_test_grades_df['grade'].loc[fname]
	first_row = i*(nsegm*nch)
	for j in range(nsegm):
		for k in range(nch):
			row = first_row + j*nch + k
			train_test_x[row,:,:] = get_qtfd(fname, k, j)
			train_test_y[row] = file_grade

Now `train_test_x` holds one 256x128 matrix (or 256x112 if `chop_f=True`) per input, which will be the inputs of the CNN, and `train_test_y` holds the class (grade) of each matrix.

### DataFrame for all data files

Create a Pandas DataFrame so we can control the Leave-One-Subject-Out Cross Validation

In [22]:
# lists to create the dataframe
file_ID = []
baby_ID = []
epoch = []
segment = []
channel = []
qtfd = []
grade = []

# Index the grades by file name. The guard keeps this cell re-runnable:
# calling set_index('file_ID') a second time would raise a KeyError because
# the column has already been moved into the index.
if grades_df.index.name != 'file_ID':
	grades_df = grades_df.set_index('file_ID')
filenames = list(grades_df.index)
for fname in filenames:
	# per-file metadata is identical for every (segment, channel) pair,
	# so look it up once per file instead of once per row
	file_baby = grades_df['baby_ID'].loc[fname]
	file_epoch = grades_df['epoch_number'].loc[fname]
	file_grade = grades_df['grade'].loc[fname]
	for j in range(nsegm):
		for k in range(nch):
			file_ID.append(fname)
			baby_ID.append(file_baby)
			epoch.append(file_epoch)
			segment.append(j)
			channel.append(k)
			qtfd.append(get_qtfd(fname, k, j))
			grade.append(file_grade)

In [23]:
data_dict = {'file_ID': file_ID, 'baby_ID': baby_ID, 'epoch': epoch, 'segment': segment, 'channel': channel, 'qTFD': qtfd, 'grade': grade}
data = pd.DataFrame(data_dict)
data

Unnamed: 0,file_ID,baby_ID,epoch,segment,channel,qTFD,grade
0,ID01_epoch1,ID01,1,0,0,"[[6.6023696353706764, 6.15385990503455, 5.5655...",
1,ID01_epoch1,ID01,1,0,1,"[[5.956437627684241, 6.171991320782033, 6.3732...",
2,ID01_epoch1,ID01,1,0,2,"[[5.790945571769252, 5.8214202914188204, 5.785...",
3,ID01_epoch1,ID01,1,0,3,"[[6.35190778713108, 6.305407904325214, 6.22342...",
4,ID01_epoch1,ID01,1,0,4,"[[7.006041303088381, 6.906027779704137, 6.7721...",
...,...,...,...,...,...,...,...
31091,ID53_epoch1,ID53,1,22,3,"[[6.458609932430011, 6.213766185376376, 5.7423...",
31092,ID53_epoch1,ID53,1,22,4,"[[10.358890774999143, 10.023876847405162, 9.60...",
31093,ID53_epoch1,ID53,1,22,5,"[[7.435327945244904, 6.913063945796877, 6.0541...",
31094,ID53_epoch1,ID53,1,22,6,"[[6.10206505040248, 6.028219188968934, 5.74461...",


In [24]:
%store data

Stored 'data' (DataFrame)


In [2]:
%store -r data

In [3]:
# Keep only the rows that have a grade. Restricting the NaN check to `grade`
# states the intent explicitly and means the other columns (including the
# array-valued `qTFD` column) are not inspected at all.
data_train_test = data.dropna(subset=['grade']).copy()
data_train_test

Unnamed: 0,file_ID,baby_ID,epoch,segment,channel,qTFD,grade
368,ID02_epoch1,ID02,1,0,0,"[[6.9388474487104705, 8.04235544553957, 7.0435...",1.0
369,ID02_epoch1,ID02,1,0,1,"[[5.633287187614542, 5.224858890958373, 4.3496...",1.0
370,ID02_epoch1,ID02,1,0,2,"[[7.637610076410876, 8.371093580089752, 7.2013...",1.0
371,ID02_epoch1,ID02,1,0,3,"[[7.1952406368287205, 7.664078726379879, 6.845...",1.0
372,ID02_epoch1,ID02,1,0,4,"[[7.03630080417184, 7.634677625886598, 7.02603...",1.0
...,...,...,...,...,...,...,...
30171,ID51_epoch4,ID51,4,22,3,"[[2.314323884614313, 3.001018120724679, 2.2154...",2.0
30172,ID51_epoch4,ID51,4,22,4,"[[3.4156377448221775, 3.434901567824473, 2.618...",2.0
30173,ID51_epoch4,ID51,4,22,5,"[[4.033218579672476, 3.7532375324865472, 2.902...",2.0
30174,ID51_epoch4,ID51,4,22,6,"[[4.3772613640678, 3.7530439324956113, 3.01815...",2.0


## Model (CNN)

(from `Infant.ipynb` by Oisin)

### Create CNN layers and model

In [4]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.layers import Activation, Dense, Flatten, BatchNormalization, Conv2D, MaxPool2D, MaxPooling2D, AveragePooling2D, Dropout,GlobalAveragePooling2D, MaxPooling1D 
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy

In [5]:
def create_convnet(input_shape=(256, 112, 1), n_classes=4):
    """Build the (uncompiled) three-tower CNN.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, passed to `tf.keras.Input`. The default
        (256, 112, 1) is the shape the notebook originally hard-coded.
    n_classes : int
        Number of units of the final softmax layer (default 4).

    Returns
    -------
    tf.keras.Model
        Functional model mapping the input tensor to the softmax output.
        The caller is responsible for compiling it.
    """
    # layers as specified in the paper
    inputs = tf.keras.Input(shape=input_shape)

    # Three parallel towers over the same input. They differ only in the
    # convolution kernel: (8, 1), (1, 8) and (8, 8).
    towers = []
    for kernel_size in ((8, 1), (1, 8), (8, 8)):
        tower = Conv2D(10, kernel_size, padding='same', activation='relu')(inputs)
        tower = MaxPooling2D((4, 4), strides=(2, 2), padding='same')(tower)
        towers.append(tower)

    # stack the tower outputs along the channel axis
    merged = keras.layers.concatenate(towers, axis=3)

    layer1 = Conv2D(60, (4, 4), padding='same',
                    activation='relu', strides=(2, 2))(merged)

    layer2a = MaxPooling2D((2, 2), padding='same', strides=(2, 2))(layer1)
    layer2b = BatchNormalization()(layer2a)
    # NOTE(review): unlike the other convolutions, no activation is passed
    # here - confirm against the paper that this is intended.
    layer3 = Conv2D(60, (2, 2), padding='same')(layer2b)

    layer4 = MaxPooling2D((2, 2), padding='same', strides=(2, 2))(layer3)
    layer5 = GlobalAveragePooling2D()(layer4)

    out = Dense(60, activation='relu')(layer5)
    out = Dense(n_classes, activation='softmax')(out)

    return tf.keras.Model(inputs, out)

In [6]:
model = create_convnet()
model.summary()

Metal device set to: Apple M1

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 256, 112, 1  0           []                               
                                )]                                                                
                                                                                                  
 conv2d (Conv2D)                (None, 256, 112, 10  90          ['input_1[0][0]']                
                                )                                                                 
                                                                                                  
 conv2d_1 (Conv2D)              (None, 256, 112, 10  90          ['input_1[0][0]']                
                 

2022-08-26 12:51:17.965702: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-08-26 12:51:17.966294: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [7]:
#defining the learning rate step
def scheduler(epoch, lr):
  """Step-decay schedule for `LearningRateScheduler`: x0.8 every 5 epochs.

  Keras calls this at the start of every epoch with the 0-based epoch index
  and the optimizer's *current* learning rate, and then sets the learning
  rate to the returned value. The decay therefore has to be applied only on
  the epochs where a step happens; multiplying on every call would compound
  it (and a negative exponent at epoch 0 would increase the rate).

  epoch -- 0-based epoch index
  lr    -- current learning rate (float)

  Returns the learning rate to use for this epoch (float).
  """
  if epoch > 0 and epoch % 5 == 0:
    return lr * 0.8
  return lr
opt = tf.keras.optimizers.SGD(momentum =0.9, nesterov =True)
callback_lr = tf.keras.callbacks.LearningRateScheduler(scheduler, verbose=1)

In [8]:
model.compile(opt,loss='categorical_crossentropy')

In [9]:
#model.fit(x=x, y=y, epochs=30, batch_size=128, callbacks=[callback])

In [9]:
def mpredict(x):
  """Return `model.predict(x)`, using whichever module-level `model` exists when called."""
  return model.predict(x)

### Separate training and test data:

We will use LOSO (leave-one-subject-out).

In [10]:
subjects = data_train_test['baby_ID']
subjects

368      ID02
369      ID02
370      ID02
371      ID02
372      ID02
         ... 
30171    ID51
30172    ID51
30173    ID51
30174    ID51
30175    ID51
Name: baby_ID, Length: 19320, dtype: object

In [None]:
# for each subject in subjects
# drop from dataframe the rows corresponding to that subject ID
# train = df[!subjectID]
# test = df[subjectID]
# fit model to train data
# get evaluation metrics by predict(test)

In [11]:
y = data_train_test['grade']
X = data_train_test['qTFD']

In [12]:
from sklearn.model_selection import LeaveOneGroupOut
logo = LeaveOneGroupOut()
logo.get_n_splits(X,y,subjects)


31

In [13]:
# LOSO Cross Validation
# Number of folds to run. 1 reproduces the previous single-fold smoke test;
# set to None to run every leave-one-subject-out fold.
max_folds = 1
fold_scores = []  # [loss, accuracy] of each evaluated fold
fold_no = 1
for train_index, test_index in logo.split(X,y,subjects):
	# separate train and test based on group
	X_train, X_test = X.iloc[train_index], X.iloc[test_index]
	y_train, y_test = y.iloc[train_index], y.iloc[test_index]

	# Stack the per-row matrices into float32 arrays and append an explicit
	# trailing axis of size 1, matching the (256, 112, 1) Input layer of the
	# model instead of relying on Keras to add it.
	X_train = np.array(X_train.tolist(), dtype=np.float32)[..., np.newaxis]
	X_test = np.array(X_test.tolist(), dtype=np.float32)[..., np.newaxis]
	# grades are shifted by one to give class indices 0..3, then one-hot encoded
	y_train = tf.keras.utils.to_categorical(y_train - 1, num_classes=4)
	y_test = tf.keras.utils.to_categorical(y_test - 1, num_classes=4)

	# create a fresh model for every fold so no weights leak between folds
	model = create_convnet()
	opt = tf.keras.optimizers.SGD(momentum =0.9, nesterov =True)
	callback_lr = tf.keras.callbacks.LearningRateScheduler(scheduler)

	# compile model; the accuracy metric is required so that `evaluate`
	# returns [loss, accuracy] and `metrics_names[1]` exists for the report
	model.compile(opt, loss='categorical_crossentropy', metrics=['accuracy'])

	# print
	print('------------------------------------------------------------------------') 
	print(f'Training for fold {fold_no} ...')

	# fit on the TRAINING subjects only; the held-out subject is used
	# exclusively for evaluation below
	history = model.fit(x=X_train, y=y_train, 
		epochs=1, batch_size=4, callbacks=[callback_lr])

	# get metrics on the held-out subject
	scores = model.evaluate(x=X_test, y=y_test, verbose=0)
	print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
	fold_scores.append(scores)

	fold_no += 1
	if max_folds is not None and fold_no > max_folds:
		break



------------------------------------------------------------------------
Training for fold 1 ...


2022-08-26 12:52:07.694613: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-08-26 12:52:07.955889: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


: 

: 