In [1]:
import data_preprocess
import numpy as np
import tensorflow as tf
import time
import os
import datetime

# keras module
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Dense, GRU, Embedding, Input, Conv1D, MaxPool1D, concatenate, Flatten
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.callbacks import TensorBoard

# global variables
SEQUENCE_LENGTH = 17
EMBEDDING_DIM = 100
CONTROL_TRAIN = 'control.txt'
DEMENTIA_TRAIN = 'dementia.txt'
CONTROL_TEST = 'control_test.txt'
DEMENTIA_TEST = 'dementia_test.txt'
# W2V_MODEL = '100features_20context_20mincount_zht'

# data preprocessing, load model, w2v_lookup_table, train_x, train_y, train_x_seg, train_x_onehot

# w2v_model = data_preprocess.load_wordvec_model(W2V_MODEL)
x_train, y_train = data_preprocess.read_sentence_single_label(DEMENTIA_TRAIN, CONTROL_TRAIN)
x_train_seg = data_preprocess.segmentation(x_train)
x_test, y_test = data_preprocess.read_sentence_single_label(DEMENTIA_TEST, CONTROL_TEST)
x_test_seg = data_preprocess.segmentation(x_test)

# x_onehot, vocab_processor = data_preprocess.text_to_onehot(x_seg)

# Split data into train and validate part

# x_train, x_dev, y_train, y_dev = data_preprocess.cross_validate_data(
#     x_onehot, y)


Building prefix dict from /home/yyliu/code/NLP/data/dict.txt.big ...
Loading model from cache /tmp/jieba.u74f96b08eeb68fe4b0ac4c13a6f276ed.cache


total number of train set: 784
sentence number of dementia subject: 394
sentence number of control normal subject: 390


Loading model cost 1.520 seconds.
Prefix dict has been built succesfully.


total number of train set: 89
sentence number of dementia subject: 48
sentence number of control normal subject: 41


In [2]:
num_words = 2000
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(x_train_seg)
x_train_tokens = tokenizer.texts_to_sequences(x_train_seg)
x_test_tokens = tokenizer.texts_to_sequences(x_test_seg)

In [3]:
num_tokens = [len(tokens) for tokens in x_train_tokens+x_test_tokens]
max_tokens = np.max(num_tokens)
print('max token number: '+str(max_tokens))

max token number: 17


In [4]:
pad = 'post'
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens, padding=pad, truncating=pad)
x_test_pad = pad_sequences(x_test_tokens, maxlen=max_tokens, padding=pad, truncating=pad)

In [5]:
idx = tokenizer.word_index
inverse_map = dict(zip(idx.values(), idx.keys()))

In [6]:
def tokens_to_string(tokens):
    words = [inverse_map[token] for token in tokens if token != 0]
    text = ' '.join(words)
    return text

In [7]:
timestamp = datetime.datetime.now().isoformat()
out_dir = os.path.abspath(os.path.join(
            os.path.curdir, "runs_2", timestamp, "summaries"))
tb = TensorBoard(log_dir=out_dir, histogram_freq=0, write_graph=True, write_images=True)

In [8]:
inputs = Input(shape=(SEQUENCE_LENGTH,))
net = inputs
net = Embedding(input_dim=num_words, 
                   output_dim=EMBEDDING_DIM, 
                   input_length=max_tokens)(net)
pathway1 = Conv1D(kernel_size=3, strides=1, filters=64, padding='same', 
            activation='relu', name='conv_1')(net)
pathway1 = MaxPool1D(pool_size=SEQUENCE_LENGTH)(pathway1)
pathway2 = Conv1D(kernel_size=4, strides=1, filters=64, padding='same', 
            activation='relu', name='conv_2')(net)
pathway2 = MaxPool1D(pool_size=SEQUENCE_LENGTH)(pathway2)
pathway3 = Conv1D(kernel_size=5, strides=1, filters=64, padding='same', 
            activation='relu', name='conv_3')(net)
pathway3 = MaxPool1D(pool_size=SEQUENCE_LENGTH)(pathway3)
net = concatenate([pathway1, pathway2, pathway3], axis=2)
net = Dense(1, activation='sigmoid')(net)
net = Flatten()(net)
outputs = net
model = Model(inputs=inputs, outputs=outputs)
model.summary()
model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x_train_pad, y_train,
          validation_split=0.1, epochs=10, batch_size=32, 
          shuffle=True, 
          callbacks=[tb])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 17)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 17, 100)      200000      input_1[0][0]                    
__________________________________________________________________________________________________
conv_1 (Conv1D)                 (None, 17, 64)       19264       embedding_1[0][0]                
__________________________________________________________________________________________________
conv_2 (Conv1D)                 (None, 17, 64)       25664       embedding_1[0][0]                
__________________________________________________________________________________________________
conv_3 (Co

<tensorflow.python.keras._impl.keras.callbacks.History at 0x7fb4e2f76898>

In [9]:
result = model.evaluate(x_test_pad, y_test)




In [10]:
print('Accuracy: {:.2%}'.format(result[1]))

Accuracy: 79.78%


In [11]:
y_pred = model.predict(x=x_test_pad)

In [12]:
cls_pred = np.array([1.0 if p>0.5 else 0.0 for p in y_pred])
cls_true = np.array(y_test)

incorrect = np.where(cls_pred != cls_true)
incorrect = incorrect[0]
print(len(incorrect))

18


In [13]:
idx = incorrect

In [14]:
idx

array([12, 49, 51, 57, 60, 63, 69, 71, 72, 73, 76, 78, 80, 81, 83, 84, 86,
       87])

In [15]:
text_idx = [x_test_seg[i] for i in idx]

In [16]:
y_pred[idx]

array([[0.30287164],
       [0.8521873 ],
       [0.51299137],
       [0.69842285],
       [0.5369447 ],
       [0.611333  ],
       [0.6506513 ],
       [0.5599379 ],
       [0.57345104],
       [0.8440187 ],
       [0.8743253 ],
       [0.5661764 ],
       [0.9264749 ],
       [0.8389887 ],
       [0.50526226],
       [0.650428  ],
       [0.5260157 ],
       [0.52443296]], dtype=float32)

In [17]:
cls_true[idx]

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [18]:
text_idx

['流理台 的 櫃子',
 '威廉 太太 實在 太高興 了',
 '完全 就 忘記 把 水龍頭 給關 了 起來',
 '再 一個 廚房 內',
 '以至於 沒 辦法 流通',
 '在 媽媽 背後',
 '椅子 搖晃 快 掉下來 了',
 '想 幫 男孩 拿 東西',
 '媽媽 在 擦乾 盤子',
 '水龍頭 沒 關',
 '水沒關 都 滿 出來 了',
 '媽媽 在 擦 盤子',
 '水已 滿 出來 了',
 '流 滿地',
 '想到 出神',
 '兄妹 在 玩耍',
 '打開 了 櫃子 要 拿 東西',
 '要 妹妹 接著']