In [1]:
## mount gdrive
# Colab-only step: mounts the user's Google Drive at /content/gdrive.
# The call is interactive (it asks for an authorization code, see the output below).
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [2]:
## go to your working directory
# IPython automagic (not plain Python). Every relative path used later in this
# notebook ('../data/...', '../dataset/...', '../checkpoint/...') is resolved from here.
# NOTE(review): the path is specific to one Drive layout; adjust it for your own copy.
cd '/content/gdrive/My Drive/project/18SAT/script'

/content/gdrive/My Drive/project/18SAT/script


In [3]:
import os
import pandas as pd
import numpy as np
import json
import h5py
import pathlib
import pickle
from collections import defaultdict
import random
from copy import deepcopy
from datetime import datetime

from sklearn.utils import shuffle, class_weight
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import balanced_accuracy_score
import itertools

from keras.models import model_from_json
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils import np_utils
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Dropout, Activation, Flatten
from keras.layers import Conv1D, MaxPooling1D, GlobalAveragePooling1D, LSTM, GRU, RNN, CuDNNGRU, CuDNNLSTM, Bidirectional
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD, RMSprop, Adam, Adagrad, Adadelta
from keras import backend as K
from keras import regularizers

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.debugger import set_trace

Using TensorFlow backend.


In [0]:
## global variable
# Notebook-wide configuration. Several of these values are re-assigned in later
# cells (datasplit, pred_variable, modeltype), so the value set here is only the
# initial one.

# datanorm: when True the fixation columns are min-max scaled before windowing.
datanorm = True
# datacols: the fixation features that make up one time step of a window.
datacols = ['CURRENT_FIX_X', 'CURRENT_FIX_Y', 'CURRENT_FIX_PUPIL', 'CURRENT_FIX_DURATION']

# A window is the centre fixation plus `delta` fixations on each side,
# i.e. 2*delta+1 fixations in total.
delta = 10 # size of window based on the middle point
# Distance between consecutive window centres; 2*delta+1 gives non-overlapping windows.
step =  2*delta+1 #if you want no overlap: 2*delta+1 # size of step in window extraction

# Split strategy; the split cell handles 'subject', 'record' and 'book'.
datasplit = 'subject'

# Columns kept from the label file.
labelcols = ['subj', 'book',
            'acc_level', 'subj_acc_level', 
            'confidence', 'difficulty', 'familiarity', 'recognition', 
            'interest', 'pressured', 'sleepiness', 'sleephours',
            'sex', 'native']

# Label column used as the prediction target.
pred_variable = 'subj_acc_level'

# Architecture selector; the model cell handles 'linear', 'cnn' and 'rnn'.
modeltype = 'cnn'

# Passed to model.fit as batch_size / epochs.
BATCH_SIZE = 100
EPOCHS = 1000

# data path
# Relative to the working directory selected by the `cd` cell above.
file_fixdata = '../data/18sat_fixfinal.csv'
file_trialdata = '../data/18sat_trialfinal.csv'
file_labels = '../data/18sat_labels.csv'


In [5]:
## score file load
sc = pd.read_csv(file_labels)

## select label columns
# .copy() makes `sc` an independent frame, so the column assignments below do not
# raise SettingWithCopyWarning (a column subset is flagged as a possible view).
sc = sc[labelcols].copy()

## cut/replace values
# Encode sex as 1 (F) / 0 (M).
sc['sex'] = sc['sex'].replace(['F', 'M'], [1,0])

# Columns that must not be binarised: identifiers and already-binary labels.
idcols = ('subj', 'book')
binarycols = ('recognition', 'sex', 'native')
subsetcols = [c for c in labelcols if c not in binarycols and c not in idcols]

# Collapse the remaining 4-level labels to binary: {0, 1} -> 0 and {2, 3} -> 1.
sc[subsetcols] = sc[subsetcols].replace([0,1,2,3], [0,0,1,1])

## frequency table per column
for column in sc:
    print(sc[column].value_counts(sort=False, dropna=False), '\n')

msd034    4
msd063    4
msd019    4
msd089    4
msd086    4
         ..
msd054    4
msd055    4
msd085    4
msd070    4
msd101    4
Name: subj, Length: 95, dtype: int64 

northpole    95
genome       95
dickens      95
flytrap      95
Name: book, dtype: int64 

0    149
1    231
Name: acc_level, dtype: int64 

0    200
1    180
Name: subj_acc_level, dtype: int64 

0    219
1    161
Name: confidence, dtype: int64 

0    252
1    128
Name: difficulty, dtype: int64 

0    291
1     89
Name: familiarity, dtype: int64 

0    373
1      7
Name: recognition, dtype: int64 

0    150
1    230
Name: interest, dtype: int64 

0    250
1    130
Name: pressured, dtype: int64 

0    233
1    147
Name: sleepiness, dtype: int64 

0    180
1    200
Name: sleephours, dtype: int64 

0    128
1    252
Name: sex, dtype: int64 

0    116
1    264
Name: native, dtype: int64 



## create dataset

In [0]:
## preprocessing to window
def group_windows(fixationRows, window_delta=None, window_step=None, columns=None):
    """Cut a sequence of fixations into fixed-length windows.

    Each window holds the centre fixation plus `window_delta` fixations on
    either side (2*window_delta+1 rows). Centres start at position
    `window_delta` and advance by `window_step` rows; positions are counted
    from the first row of `fixationRows`, whatever its index labels are.

    Parameters
    ----------
    fixationRows : pandas.DataFrame
        Fixations in temporal order. The frame is NOT modified (the previous
        implementation reset its index in place, which also added a stray
        'index' column to the caller's frame).
    window_delta : int, optional
        Half-width of a window. Defaults to the notebook-level `delta`.
    window_step : int, optional
        Distance between window centres. Defaults to the notebook-level `step`.
    columns : list of str, optional
        Feature columns to extract. Defaults to the notebook-level `datacols`.

    Returns
    -------
    list
        One entry per window; each entry is a list of 2*window_delta+1 rows,
        each row being the list of values of `columns`. Empty when the frame
        has fewer than 2*window_delta+1 rows.
    """
    half_width = delta if window_delta is None else window_delta
    stride = step if window_step is None else window_step
    feature_cols = datacols if columns is None else columns

    # Extract the feature matrix once; slicing it positionally is equivalent to
    # the former reset_index + label-based .loc slicing, without side effects.
    values = fixationRows[list(feature_cols)].values
    n_rows = len(values)

    return [
        values[centre - half_width:centre + half_width + 1].tolist()
        for centre in range(half_width, n_rows - half_width, stride)
    ]

# original version
    # nRow = fixationRows.shape[0]
    # nCol = fixationRows.shape[1]
    # windows = []

    # fixNum = 0

    # for index, row in fixationRows.iterrows():
    #     if (index+1)%
    #     if fixNum + delta <= nRow-1 and fixNum - delta >= 0:
    #         deltaMin = fixNum - delta
    #         deltaMax = fixNum + delta
    #         window = []
    #         for i in range(deltaMin, deltaMax+1):
    #             x = fixationRows['CURRENT_FIX_X']
    #             x = x.values[i]
    #             y = fixationRows['CURRENT_FIX_Y'] # - yOffset
    #             y = y.values[i]
    #             d = fixationRows['CURRENT_FIX_DURATION']
    #             d = d.values[i]
    #             p = fixationRows['CURRENT_FIX_PUPIL']
    #             p = p.values[i]
    #             #r = fixationRows['CURRENT_FIX_INTEREST_AREA_LABEL']
    #             #r = r.values[i]
    #             window.append([x, y, d, p])
    #         windows.append(window)
    #     fixNum += 1
    # return windows

## Loop over all articles and subjects
def generate_windata(fixation):
    """Build the nested {subject: {page: windows}} dictionary.

    Every subject found in 'RECORDING_SESSION_LABEL' gets an entry for every
    page name found in 'page_name' (over the whole frame); the entry is the
    result of group_windows() on that subject's fixations for that page.
    """
    subjects = pd.unique(fixation['RECORDING_SESSION_LABEL'])
    pages = pd.unique(fixation['page_name'])

    windowData = {}
    for subject in subjects:
        rows_of_subject = fixation.loc[fixation['RECORDING_SESSION_LABEL'] == subject]
        # progress indicator, overwritten in place thanks to the leading '\r'
        print("\rprocessing Subject: " + subject, end='')
        windowData[subject] = {
            page: group_windows(rows_of_subject.loc[rows_of_subject['page_name'] == page])
            for page in pages
        }

    print ("\nwindow data ready")
    return windowData


## create dataset 
def create_dataset(windowData, sc):
    """Flatten nested window data into a sample array plus aligned label rows.

    Parameters
    ----------
    windowData : dict
        {subject: {article: [window, ...]}}. Article names are expected to look
        like 'reading-dickens-1'; the book is the second '-'-separated field.
    sc : pandas.DataFrame
        Label table with at least the columns 'subj' and 'book'.

    Returns
    -------
    (numpy.ndarray, pandas.DataFrame)
        The stacked windows, and one copy of the matching `sc` row(s) per
        window, in the same order (original `sc` index labels are kept).
        With no windows at all, an empty array and an empty DataFrame.

    Notes
    -----
    Label rows stay aligned with the samples only if each (subject, book)
    pair matches exactly one row of `sc`; this is not enforced here.
    """
    dataset = []
    label_frames = []

    for subject, articles in windowData.items():
        for article, windows in articles.items():
            if not windows:
                continue
            # The label row depends only on (subject, book): look it up once
            # per article instead of once per window.
            book = article.split('-')[1] # article = 'reading-dickens-1'
            row = sc[(sc['subj'] == subject) & (sc['book'] == book)]
            dataset.extend(windows)
            label_frames.extend([row] * len(windows))

    # A single concat is linear; concatenating inside the loop re-copied the
    # growing frame for every window.
    labeldf = pd.concat(label_frames) if label_frames else pd.DataFrame()

    print('dataset created')
    return np.array(dataset), labeldf

In [7]:
## fixation data load
fd = pd.read_csv(file_fixdata) #encoding = "ISO-8859-1",\

## data split
# Keep only the fixations recorded on reading pages.
fd_rd = fd.loc[fd.type == 'reading']
# fd_comp = fd.loc[(fd.type == 'question') & (fd.page <= 5)]
# fd_mental = fd.loc[(fd.type == 'question') & (fd.page > 5)]

## data normalization
# fixData is always an independent copy of the reading fixations; with
# datanorm=True its feature columns are min-max scaled to [0, 1].
# (z-score alternative: (df - df.mean()) / df.std())
fixData = fd_rd.copy(deep=True)
if datanorm:
    col_min = fd_rd[datacols].min()
    col_range = fd_rd[datacols].max() - col_min
    # A constant column has range 0; divide by 1 instead so it maps to 0
    # rather than NaN.
    col_range = col_range.replace(0, 1)
    fixData[datacols] = (fd_rd[datacols] - col_min) / col_range

# describe() was previously a bare mid-cell expression and therefore never shown.
print(fixData[datacols].describe())

# NOTE(review): the scaling statistics are computed over all subjects/books,
# i.e. before the train/valid/test split — confirm this is acceptable.

## call function
windowData = generate_windata(fixData)

processing Subject: msd107
window data ready


In [0]:
## save windowdata
# Cache the nested {subject: {page: windows}} dict; the file name carries the
# window half-width `delta` so caches for different window sizes do not collide.
with open('../data/windowData_'+str(delta)+'.pkl', 'wb') as fp:
    pickle.dump(windowData, fp)

# ## load windowdata
# (only unpickle files you created yourself: pickle.load can execute arbitrary code)
# with open('../data/windowData_'+str(delta)+'.pkl', 'rb') as fp:
#     windowData = pickle.load(fp)

In [0]:
## data split
# Overrides the value from the configuration cell.
datasplit = 'book'

if datasplit == 'subject':
    # subject-wise dataset split: whole subjects go to exactly one partition
    # current plan is to use 60:20:20
    subjkeys = list(windowData.keys())
    random.Random(23).shuffle(subjkeys) # seeded, reproducible shuffling
    N_totalsub = len(subjkeys)
    N_trainsub = round(0.6*N_totalsub)
    N_validsub = round(0.2*N_totalsub)
    N_testsub = N_totalsub - N_trainsub - N_validsub

    # Start from a full copy and move subjects out of the training dict.
    windowData_train = deepcopy(windowData)
    windowData_valid = {}
    windowData_test = {}

    for i, subj in enumerate(subjkeys):
        if i < N_validsub:
            windowData_valid[subj] = windowData_train.pop(subj)
        elif i < N_validsub + N_testsub:
            windowData_test[subj] = windowData_train.pop(subj)

    print("train subj #", len(list(windowData_train.keys())))
    print("valid subj #", len(list(windowData_valid.keys())))
    print("test subj #", len(list(windowData_test.keys())))

    ## create dataset
    X_train, labels_train = create_dataset(windowData_train, sc)
    X_valid, labels_valid = create_dataset(windowData_valid, sc)
    X_test, labels_test = create_dataset(windowData_test, sc)

elif datasplit == 'record':
    # window-wise split: 60% train, then the remaining 40% halved into valid/test
    X, labels = create_dataset(windowData, sc)
    X_train, X_test, labels_train, labels_test = train_test_split(X, labels, test_size=0.4, random_state=23)
    X_valid, X_test, labels_valid, labels_test = train_test_split(X_test, labels_test, test_size=0.5, random_state=23)

elif datasplit == 'book':
    # book-wise dataset split: for every subject one book goes to validation,
    # one to test and the remaining ones stay in training
    ## current plan is to use 50:25:25 (2,1,1)

    subjkeys = list(windowData.keys())
    pagekeys = list(windowData[subjkeys[0]].keys())
    bookkeys = list(np.unique(sc['book'])) # ['dickens' 'flytrap' 'genome' 'northpole']
    print('list of books:', bookkeys)

    windowData_train = deepcopy(windowData)
    windowData_valid = defaultdict(dict)
    windowData_test = defaultdict(dict)

    # Dedicated seeded generator (same seed as the other branches) so that the
    # book assignment is reproducible; the global random.sample was unseeded.
    book_rng = random.Random(23)

    for subj in subjkeys:
        valid_book, test_book = book_rng.sample(bookkeys, 2)
        for page in pagekeys:
            book = page.split('-')[1] # page = 'reading-dickens-1'
            if book == valid_book:
                windowData_valid[subj][page] = windowData_train[subj].pop(page)
            elif book == test_book:
                windowData_test[subj][page] = windowData_train[subj].pop(page)

    ## create dataset
    X_train, labels_train = create_dataset(windowData_train, sc)
    X_valid, labels_valid = create_dataset(windowData_valid, sc)
    X_test, labels_test = create_dataset(windowData_test, sc)

    # Show the assignment for one example subject (first key instead of a
    # hard-coded subject id that may not exist in the data).
    example_subj = subjkeys[0]
    print("train book #", list(windowData_train[example_subj].keys()))
    print("valid book #", list(windowData_valid[example_subj].keys()))
    print("test book #", list(windowData_test[example_subj].keys()))



list of books: ['dickens', 'flytrap', 'genome', 'northpole']
dataset created
dataset created
dataset created
train book # ['reading-dickens-1', 'reading-dickens-2', 'reading-dickens-3', 'reading-dickens-4', 'reading-dickens-5', 'reading-flytrap-1', 'reading-flytrap-2', 'reading-flytrap-3', 'reading-flytrap-4', 'reading-flytrap-5', 'reading-flytrap-6']
valid book # ['reading-genome-1', 'reading-genome-2', 'reading-genome-3', 'reading-genome-4', 'reading-genome-5', 'reading-genome-6']
test book # ['reading-northpole-1', 'reading-northpole-2', 'reading-northpole-3', 'reading-northpole-4', 'reading-northpole-5']


In [0]:
## save dataset
# Persist the three partitions under ../dataset/<datasplit>wise/{train,val,test}/.
# File names carry the window half-width `delta`.
np.save('../dataset/'+ datasplit +'wise/train/fix_train_' + str(delta) + '.npy', X_train)
np.save('../dataset/' + datasplit + 'wise/val/fix_valid_' + str(delta) + '.npy', X_valid)
np.save('../dataset/'+ datasplit + 'wise/test/fix_test_' + str(delta) + '.npy', X_test)

# NOTE(review): the validation and test label files are also named 'label_train_<delta>.csv'
# (only the directory differs). The load cells read the same names, so renaming
# must be done in the save and load cells together.
labels_train.to_csv('../dataset/'+ datasplit +'wise/train/label_train_' + str(delta)+ '.csv', index=False)
labels_valid.to_csv('../dataset/'+ datasplit +'wise/val/label_train_' + str(delta) + '.csv', index=False)
labels_test.to_csv('../dataset/'+ datasplit + 'wise/test/label_train_' + str(delta) + '.csv', index=False)



## model training

In [0]:
## load dataset
# Reload the partitions written by the save cell for the current `datasplit`
# and `delta`, so training can start without re-running the preprocessing.
X_train = np.load('../dataset/'+ datasplit +'wise/train/fix_train_' + str(delta) + '.npy')
X_valid = np.load('../dataset/' + datasplit + 'wise/val/fix_valid_' + str(delta) + '.npy')
X_test = np.load('../dataset/'+ datasplit + 'wise/test/fix_test_' + str(delta) + '.npy')

# The label files in val/ and test/ are named 'label_train_...' as well; this
# matches the names used by the save cell.
labels_train = pd.read_csv('../dataset/'+ datasplit +'wise/train/label_train_' + str(delta)+ '.csv')
labels_valid = pd.read_csv('../dataset/'+ datasplit +'wise/val/label_train_' + str(delta) + '.csv')
labels_test = pd.read_csv('../dataset/'+ datasplit + 'wise/test/label_train_' + str(delta) + '.csv')

In [0]:
## create dataset
# Select the target column, report dataset sizes and class balance, derive
# class weights for the loss and one-hot encode the targets.
pred_variable = 'difficulty'

if pred_variable == 'subj':
    ## subject ids are strings: convert them to integer category codes
    y_train = labels_train[pred_variable].astype('category').cat.codes
    y_valid = labels_valid[pred_variable].astype('category').cat.codes
    y_test = labels_test[pred_variable].astype('category').cat.codes

else:
    ## every other label column is used as stored
    y_train = labels_train[pred_variable]
    y_valid = labels_valid[pred_variable]
    y_test = labels_test[pred_variable]

# ## randomize row for training data
# from sklearn.utils import shuffle
# X_train, labels_train, idx_train = shuffle(X_train, labels_train, idx_train, random_state=23)

## data description
# The number of classes is taken from the training labels only.
num_classes = len(pd.unique(y_train)) # labels_train[pred_variable].shape (TTTT,)

print("##### data description #####")
print("# of classes:\t",num_classes)

input_shape = X_train.shape[1:]
print("input shape is:\t",input_shape)

N_samples_train = X_train.shape[0]
print("# of samples for training is:\t", N_samples_train)

N_samples_valid = X_valid.shape[0]
print("# of samples for validation is:\t", N_samples_valid)

N_samples_test = X_test.shape[0]
print("# of samples for prediction is:\t", N_samples_test)

N_total = N_samples_train + N_samples_valid + N_samples_test
print("# of total sampels:\t", N_total)

## check data imbalances and calculate weights for loss
# `classes` and `y` are passed by keyword: they are keyword-only in current
# scikit-learn releases, and the keyword form also works in older ones.
weights = class_weight.compute_class_weight('balanced',
                                            classes=np.unique(y_train),
                                            y=y_train)

print("\n##### data imbalances #####")
print(y_train.value_counts(normalize=True).sort_index())

print("\n##### loss weight #####")
# Keyed by position in the sorted class list; this equals the class label
# only when the labels are 0..num_classes-1 (presumably the case here — verify).
weights = dict(enumerate(weights))
print(weights)

# Accuracy of always predicting the most frequent test class.
print("\n##### null acc for test dataset #####")
print(np.max(y_test.value_counts(normalize=True).sort_index()))

## one hot encoding
y_train = np_utils.to_categorical(y_train, num_classes)
y_valid = np_utils.to_categorical(y_valid, num_classes)
y_test = np_utils.to_categorical(y_test, num_classes)


# ##### data description #####
# # of classes:	 4
# input shape is:	 (41, 4)
# # of samples for training is:	 3216
# # of samples for validation is:	 1109
# # of samples for prediction is:	 1106
# # of total sampels:	 5431

# ##### data imbalances #####
# 0    0.152674
# 1    0.255908
# 2    0.317475
# 3    0.273943
# Name: acc_level, dtype: float64

# ##### loss weight #####
# {0: 1.6374745417515275, 1: 0.976913730255164, 2: 0.7874632713026445, 3: 0.9125993189557321}



##### data description #####
# of classes:	 2
input shape is:	 (21, 4)
# of samples for training is:	 6867
# of samples for validation is:	 2362
# of samples for prediction is:	 2319
# of total sampels:	 11548

##### data imbalances #####
0    0.615116
1    0.384884
Name: difficulty, dtype: float64

##### loss weight #####
{0: 0.8128551136363636, 1: 1.2990919409761634}

##### null acc for test dataset #####
0.7654161276412247


In [0]:
## model specify and compile
# Builds `model` for the architecture named by `modeltype`. The input shape and
# the number of output units come from the label-preparation cell
# (`input_shape`, `num_classes`).
modeltype = 'rnn'

# Drop a model left over from a previous run of this cell and reset the backend
# session before building a new one.
if 'model' in globals():
    del model
    K.clear_session()

if modeltype == 'linear':
    # Softmax regression on the flattened window.
    inputs = Input(shape=input_shape)
    x = Flatten()(inputs)

    # x = Dense(100)(x)
    # # x = BatchNormalization()(x)
    # x = Activation('relu')(x)

    # x = Dense(100)(x)
    # # x = BatchNormalization()(x)
    # x = Activation('relu')(x)

    # x = Dense(100)(x)
    # # x = BatchNormalization()(x)
    # x = Activation('relu')(x)


    predictions = Dense(num_classes, activation='softmax')(x)
    model = Model(inputs=inputs, outputs=predictions)
    model.summary()

elif modeltype == 'cnn':
    # Three 1D convolutions (40 filters, kernel 3) -> max-pool -> two dense
    # layers (50, 20) -> softmax.

    ## 62 bookwise, 64 recordwise for subj_acc_level  (dense 50, 10)
    model = Sequential()

    model.add(Conv1D(40, 3, input_shape= input_shape)) #padding= 'same', #use_bias = False
    # model.add(BatchNormalization())
    model.add(Activation('relu'))
    # model.add(BatchNormalization())

    model.add(Conv1D(40, 3)) #padding= 'same', #use_bias = False
    # model.add(BatchNormalization())
    model.add(Activation('relu'))
    # model.add(BatchNormalization())

    model.add(Conv1D(40, 3)) #padding= 'same', #use_bias = False
    # model.add(BatchNormalization())
    model.add(Activation('relu'))
    # model.add(BatchNormalization())

    # model.add(Conv1D(20, 3)) #padding= 'same', #use_bias = False
    # # model.add(BatchNormalization())
    # model.add(Activation('relu'))
    # # model.add(BatchNormalization())

    model.add(MaxPooling1D(2))
    model.add(Dropout(0.3))

    model.add(Flatten())

    model.add(Dense(50)) # kernel_regularizer=regularizers.l2(reg)
    # model.add(BatchNormalization())
    model.add(Activation('relu'))
    #model.add(BatchNormalization())
    model.add(Dropout(0.3))

    model.add(Dense(20)) # kernel_regularizer=regularizers.l2(reg)
    # model.add(BatchNormalization())
    model.add(Activation('relu'))
    #model.add(BatchNormalization())

    model.add(Dense(num_classes, activation='softmax'))

    # summary() prints the table itself and returns None, so it is not wrapped
    # in print() (that added a stray "None" line to the output).
    model.summary()



    ## 0422 96% acc
    # model = Sequential()

    # model.add(Conv1D(64, 3, padding= 'same', input_shape= input_shape)) #use_bias = False
    # model.add(BatchNormalization())
    # model.add(Activation('relu'))

    # model.add(Conv1D(64, 3, padding= 'same'))
    # model.add(BatchNormalization())
    # model.add(Activation('relu'))

    # model.add(Conv1D(64, 3))
    # model.add(BatchNormalization())
    # model.add(Activation('relu'))

    # # model.add(Conv1D(64, 3))
    # # model.add(BatchNormalization())
    # # model.add(Activation('relu'))

    # model.add(Flatten())
    # model.add(Dropout(0.5))

    # model.add(Dense(20))
    # model.add(BatchNormalization())
    # model.add(Activation('relu'))
    # # model.add(Dropout(0.5))

    # model.add(Dense(num_classes, activation='softmax'))
    
    # print(model.summary())


elif modeltype == 'rnn':
    # Two stacked bidirectional LSTMs (25 units per direction) -> two dense
    # layers (50, 20) -> softmax.
    model = Sequential()

    # model.add(BatchNormalization(input_shape=input_shape))
    model.add(Bidirectional(LSTM(25, return_sequences = True),input_shape=input_shape)) 
    # model.add(BatchNormalization())
    # model.add(Bidirectional(CuDNNLSTM(32, return_sequences = True)))
    # model.add(BatchNormalization())
    model.add(Bidirectional(LSTM(25)))
    model.add(Dropout(0.3))

    model.add(Dense(50)) # kernel_regularizer=regularizers.l2(reg)
    # model.add(BatchNormalization())
    model.add(Activation('relu'))
    #model.add(BatchNormalization())
    model.add(Dropout(0.3))

    model.add(Dense(20)) # kernel_regularizer=regularizers.l2(reg)
    # model.add(BatchNormalization())
    model.add(Activation('relu'))
    #model.add(BatchNormalization())

    model.add(Dense(num_classes, activation='softmax'))
    # See the cnn branch: summary() already prints, print() would add "None".
    model.summary()

else:
    # Fail here with a clear message instead of a NameError on `model` later.
    raise ValueError("unknown modeltype: %r (expected 'linear', 'cnn' or 'rnn')" % (modeltype,))

    # model.add(BatchNormalization(input_shape=input_shape))

    # #model.add(LSTM(4, input_shape=input_shape, activation='relu'))
    # #model.add(LSTM(4, input_shape=input_shape, activation='relu', return_sequences = True))
    # #model.add(LSTM(4, activation='relu', return_sequences = True))
    # #model.add(LSTM(4, activation='relu', return_sequences = True))
    # #model.add(LSTM(4, activation='relu', return_sequences = True))
    # #model.add(LSTM(4, activation='relu'))

    # # model.add(CuDNNLSTM(4, return_sequences = True))
    # # model.add(CuDNNLSTM(4, return_sequences = True))
    # # #model.add(LSTM(4, activation='relu', return_sequences = True))
    # # model.add(CuDNNLSTM(4))

    # # model.add(CuDNNGRU(4, return_sequences = True))
    # # model.add(CuDNNGRU(4, return_sequences = True))
    # # #model.add(LSTM(4, activation='relu', return_sequences = True))
    # # model.add(CuDNNGRU(4))

    # #model.add(Bidirectional(CuDNNLSTM(32)))
    # model.add(Bidirectional(CuDNNLSTM(32, return_sequences = True)))
    # model.add(BatchNormalization())

    # model.add(Bidirectional(CuDNNLSTM(32, return_sequences = True)))
    # model.add(BatchNormalization())

    # # #model.add(LSTM(4, activation='relu', return_sequences = True))
    # model.add(Bidirectional(CuDNNLSTM(32)))
    # model.add(BatchNormalization())
    # model.add(Dropout(0.1))

    # #model.add(Dropout(0.2))

    # # model.add(Dense(10))
    # # model.add(BatchNormalization())
    # # model.add(Activation('relu'))

    # model.add(Dense(num_classes, activation='softmax'))
    # model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 21, 50)            6000      
_________________________________________________________________
bidirectional_2 (Bidirection (None, 50)                15200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                2550      
_________________________________________________________________
activation_1 (Activation)    (None, 50)                0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 20)               

In [0]:
# Compile and train `model` (built in the previous cell) on the arrays and
# one-hot targets produced by the label-preparation cell.
# BATCH_SIZE = 100
# EPOCHS = 1000

# Alternative optimizer configurations that were tried; only the default 'adam' is active.
#learning_rate = 0.0001
#decay_rate= 1e-04
#optimizer = RMSprop(lr=learning_rate, rho=0.9, epsilon=1e-08, decay=decay_rate) 
#optimizer = SGD(lr=learning_rate, decay=decay_rate, momentum=0.9, nesterov=True)
#optimizer = Adagrad(lr=learning_rate, epsilon=None, decay=decay_rate)
#optimizer = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=None, decay=decay_rate, amsgrad=False)
#optimizer = Adadelta()

#model.compile(loss='categorical_crossentropy', optimizer= optimizer, metrics=['accuracy'])
model.compile(loss='categorical_crossentropy', optimizer= 'adam', metrics=['accuracy'])

## checkpoint path
# Month-day-hour-minute stamp used as the checkpoint file prefix for this run.
date_time = datetime.now()
d = date_time.strftime("%m%d%H%M")
print(d)

# Checkpoints are selected on 'val_acc' (best only), while early stopping watches
# 'val_loss' with a patience of 50 epochs — the two criteria are different on purpose
# or by accident; NOTE(review): confirm.
# The metric key 'val_acc' matches the names shown in the training log below.
callbacks_list = [
    ModelCheckpoint(
        filepath= '../checkpoint/' + str(d) +'-{epoch:02d}-{val_acc:.2f}.h5',
        monitor='val_acc', save_best_only=True),
    EarlyStopping(monitor='val_loss', patience=50)
]

# `weights` is the class-weight dict computed from the training labels.
hist = model.fit(X_train, y_train, batch_size=BATCH_SIZE,  epochs =EPOCHS,  
                 class_weight = weights, verbose=2, callbacks=callbacks_list,
                 validation_data= (X_valid, y_valid), shuffle=True) 



01240653
Train on 6867 samples, validate on 2362 samples
Epoch 1/1000
 - 7s - loss: 0.6938 - acc: 0.5203 - val_loss: 0.7007 - val_acc: 0.3323
Epoch 2/1000
 - 5s - loss: 0.6923 - acc: 0.4963 - val_loss: 0.6849 - val_acc: 0.6677
Epoch 3/1000
 - 5s - loss: 0.6922 - acc: 0.4873 - val_loss: 0.6908 - val_acc: 0.4890
Epoch 4/1000
 - 5s - loss: 0.6856 - acc: 0.5368 - val_loss: 0.6949 - val_acc: 0.5440
Epoch 5/1000
 - 5s - loss: 0.6787 - acc: 0.5592 - val_loss: 0.6909 - val_acc: 0.5864
Epoch 6/1000
 - 5s - loss: 0.6743 - acc: 0.5707 - val_loss: 0.6570 - val_acc: 0.6710
Epoch 7/1000
 - 5s - loss: 0.6684 - acc: 0.5864 - val_loss: 0.6905 - val_acc: 0.5449
Epoch 8/1000
 - 5s - loss: 0.6567 - acc: 0.6014 - val_loss: 0.6903 - val_acc: 0.5576
Epoch 9/1000
 - 5s - loss: 0.6663 - acc: 0.5796 - val_loss: 0.7251 - val_acc: 0.4568
Epoch 10/1000
 - 4s - loss: 0.6608 - acc: 0.5991 - val_loss: 0.6973 - val_acc: 0.5178
Epoch 11/1000
 - 5s - loss: 0.6487 - acc: 0.6131 - val_loss: 0.6879 - val_acc: 0.5390
Epoch 

In [0]:
## visualizing losses and accuracy
# Plots the per-epoch training/validation loss and accuracy recorded in `hist`
# and saves both figures as PNG files.

## figures path
# NOTE(review): `path_result` and `datasize` are not defined in the cells shown
# here; they must exist in the kernel before this cell is run.
fig_loss = path_result +'/figure/rnn0430_norm_loss_to'  + str(datasize) + '_w'+ str(delta) + '.png'
fig_acc =  path_result +'/figure/rnn0430_norm_acc_to'  + str(datasize) + '_w'+ str(delta) + '.png'


train_loss = hist.history['loss']
val_loss = hist.history['val_loss']
train_acc = hist.history['acc']
val_acc = hist.history['val_acc']
# One x value per epoch actually run. The former hard-coded range(201) only
# worked when early stopping happened to end training after exactly 201 epochs.
xc = range(len(train_loss))

# To join two consecutive runs:
# train_loss =  hist.history['loss'] + hist2.history['loss']
# val_loss = hist.history['val_loss'] + hist2.history['val_loss']
# train_acc = hist.history['acc'] + hist2.history['acc']
# val_acc = hist.history['val_acc'] + hist2.history['val_acc']
# xc = range(len(train_loss))

# The style must be selected before the figures are created; when it was set
# after plotting, the first figure of a fresh kernel was drawn unstyled.
# print plt.style.available # use bmh, classic,ggplot for big pictures
plt.style.use(['classic'])

plt.figure(1, figsize=(7, 5), facecolor="white")
plt.plot(xc, train_loss)
plt.plot(xc, val_loss)
plt.xlabel('num of Epochs')
plt.ylabel('loss')
plt.title('train_loss vs val_loss')
plt.grid(True)
plt.legend(['train', 'val'])
plt.savefig(fig_loss)

plt.figure(2, figsize=(7, 5),facecolor="white")
plt.plot(xc, train_acc)
plt.plot(xc, val_acc)
plt.xlabel('num of Epochs')
plt.ylabel('accuracy')
plt.title('train_acc vs val_acc')
plt.grid(True)
plt.legend(['train', 'val']) #, loc=4
plt.savefig(fig_acc)

## evaluate the model

In [0]:
# Switch to the subject-wise split for evaluation; this overrides the value of
# `datasplit` used in the training section.
datasplit = 'subject'

## load dataset
# Reads the arrays written by the save cell for this split and the current `delta`.
X_train = np.load('../dataset/'+ datasplit +'wise/train/fix_train_' + str(delta) + '.npy')
X_valid = np.load('../dataset/' + datasplit + 'wise/val/fix_valid_' + str(delta) + '.npy')
X_test = np.load('../dataset/'+ datasplit + 'wise/test/fix_test_' + str(delta) + '.npy')

# The label files in val/ and test/ are named 'label_train_...' as well; this
# matches the names used by the save cell.
labels_train = pd.read_csv('../dataset/'+ datasplit +'wise/train/label_train_' + str(delta)+ '.csv')
labels_valid = pd.read_csv('../dataset/'+ datasplit +'wise/val/label_train_' + str(delta) + '.csv')
labels_test = pd.read_csv('../dataset/'+ datasplit + 'wise/test/label_train_' + str(delta) + '.csv')

In [0]:
# Target column for the evaluation section; selects the label column below and
# is also part of the saved-model path used by the load-model cell.
pred_variable = 'native'

## create dataset
if pred_variable == 'subj':
    ## subject ids are strings: convert them to integer category codes
    y_train = labels_train[pred_variable].astype('category').cat.codes
    y_valid = labels_valid[pred_variable].astype('category').cat.codes
    y_test = labels_test[pred_variable].astype('category').cat.codes

else:
    ## every other label column is used as stored
    y_train = labels_train[pred_variable]
    y_valid = labels_valid[pred_variable]
    y_test = labels_test[pred_variable]

# ## randomize row for training data
# from sklearn.utils import shuffle
# X_train, labels_train, idx_train = shuffle(X_train, labels_train, idx_train, random_state=23)

## data description
# The number of classes is taken from the training labels only.
num_classes = len(pd.unique(y_train)) # labels_train[pred_variable].shape (TTTT,)

print("##### data description #####")
print("# of classes:\t",num_classes)

input_shape = X_train.shape[1:]
print("input shape is:\t",input_shape)

N_samples_train = X_train.shape[0]
print("# of samples for training is:\t", N_samples_train)

N_samples_valid = X_valid.shape[0]
print("# of samples for validation is:\t", N_samples_valid)

N_samples_test = X_test.shape[0]
print("# of samples for prediction is:\t", N_samples_test)

N_total = N_samples_train + N_samples_valid + N_samples_test
print("# of total sampels:\t", N_total)

## check data imbalances and calculate weights for loss
# `classes` and `y` are passed by keyword: they are keyword-only in current
# scikit-learn releases, and the keyword form also works in older ones.
weights = class_weight.compute_class_weight('balanced',
                                            classes=np.unique(y_train),
                                            y=y_train)

print("\n##### data imbalances #####")
print(y_train.value_counts(normalize=True).sort_index())

print("\n##### loss weight #####")
# Keyed by position in the sorted class list; this equals the class label
# only when the labels are 0..num_classes-1 (presumably the case here — verify).
weights = dict(enumerate(weights))
print(weights)

# Accuracy of always predicting the most frequent test class.
print("\n##### null acc for test dataset #####")
print(np.max(y_test.value_counts(normalize=True).sort_index()))

## one hot encoding
y_train = np_utils.to_categorical(y_train, num_classes)
y_valid = np_utils.to_categorical(y_valid, num_classes)
y_test = np_utils.to_categorical(y_test, num_classes)

##### data description #####
# of classes:	 2
input shape is:	 (21, 4)
# of samples for training is:	 6867
# of samples for validation is:	 2362
# of samples for prediction is:	 2319
# of total sampels:	 11548

##### data imbalances #####
0    0.385612
1    0.614388
Name: native, dtype: float64

##### loss weight #####
{0: 1.2966389728096677, 1: 0.8138184403887178}

##### null acc for test dataset #####
0.7705907718844329


In [0]:
## load model
from keras.models import load_model

modeltype = 'rnn'

file_model = '../savedmodel/'+ pred_variable + '/' + datasplit+ 'wise/' + modeltype + '.h5'
loaded_model = load_model(file_model)
print("Loaded model from disk")

Loaded model from disk


In [0]:
## Evaluating the model
print("predicted variable:", pred_variable, '\n')

score = loaded_model.evaluate(X_test, y_test, verbose=0)
print('Test Loss:', score[0])
print('Test accuracy:', score[1])

# Printing the confusion matrix

Y_pred = loaded_model.predict(X_test)
y_pred = np.argmax(Y_pred, axis=1)
#y_pred = model.predict_classes(X_test)
# target_names = ['F', 'M']
# target_names = ['sleepy', 'awake']
#target_names = ['hard', 'easy']

target_names = ['level {}'.format(i) for i in range(num_classes)]

print(classification_report(np.argmax(y_test, axis=1), y_pred, target_names=target_names))
print("confusion matrix: \n", confusion_matrix(np.argmax(y_test, axis=1), y_pred))

# cm = confusion_matrix(np.argmax(y_test, axis=1), y_pred)
# bas = (0.5*cm[0,0])/(cm[0,0]+cm[0,1]) + (0.5*cm[1,1])/(cm[1,0]+cm[1,1])
# print ("Balanced acc score:",bas )
# print ("Balanced error rate:",1 - bas)

print("Balanced acc score:", balanced_accuracy_score(np.argmax(y_test, axis=1), y_pred))
print("Balanced error rate:", 1- balanced_accuracy_score(np.argmax(y_test, axis=1), y_pred))



predicted variable: native 

Test Loss: 0.649511418706541
Test accuracy: 0.7447175507133713
              precision    recall  f1-score   support

     level 0       0.39      0.20      0.27       532
     level 1       0.79      0.91      0.85      1787

    accuracy                           0.74      2319
   macro avg       0.59      0.55      0.56      2319
weighted avg       0.70      0.74      0.71      2319

confusion matrix: 
 [[ 108  424]
 [ 168 1619]]
Balanced acc score: 0.5544976038305052
Balanced error rate: 0.44550239616949483


In [0]:
## confusion matrix
# Heat-map of the test-set confusion matrix (rows: true class, columns: predicted class).
sns.set_style("white")

cfmtrix = confusion_matrix(np.argmax(y_test, axis=1), y_pred)
class_labels = list(target_names)
df_cm = pd.DataFrame(cfmtrix, index=class_labels, columns=class_labels)

plt.figure(figsize=(10, 7))
sns.heatmap(df_cm, annot=True, fmt="d")