In [1]:
import data_preprocess
CONTROL_TOTAL = 'control_origin.txt'
DEMENTIA_TOTAL = 'dementia_origin.txt'
x_train, y_train = data_preprocess.read_sentence(DEMENTIA_TOTAL, CONTROL_TOTAL)
y_train = data_preprocess.label_to_scalar(y_train)
x_train_seg = data_preprocess.segmentation(x_train)

  from ._conv import register_converters as _register_converters
Building prefix dict from /home/yyliu/code/NLP/data/dict.txt.big ...
Loading model from cache /tmp/jieba.u74f96b08eeb68fe4b0ac4c13a6f276ed.cache


total number of train set: 873
sentence number of dementia subject: 442
sentence number of control normal subject: 431


Loading model cost 1.150 seconds.
Prefix dict has been built succesfully.


In [2]:
import gensim
w2v_model = gensim.models.Word2Vec.load('../wordvec_model/500features_20context_20mincount_zht')

In [3]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(10000)
tokenizer.fit_on_texts(w2v_model.wv.vocab)

In [4]:
x_train_tokens = tokenizer.texts_to_sequences(x_train_seg)

In [5]:
import numpy as np
num_tokens = [len(tokens) for tokens in x_train_tokens]
num_tokens = np.array(num_tokens)
pad_len = np.mean(num_tokens) + 3*np.std(num_tokens)
pad_len = int(pad_len)
print(np.sum(num_tokens<pad_len)/len(num_tokens))
print(pad_len)

0.9851088201603666
9


In [6]:
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
x_train_pad = pad_sequences(x_train_tokens, maxlen=pad_len, padding='post', truncating='post')

In [7]:
SEQUENCE_LENGTH = pad_len
EMBEDDING_DIM = 500
num_words=10000
DROPOUT_LAYER = True
LSTM_LAYER = True
dropout_rate = 0.5
EPOCHS = 10
BATCH_SIZE = 32

In [8]:
def get_nn_model(embedding_matrix, pad_len):
    # Architect Model
    inputs = Input(shape=(pad_len,))
    net = inputs
    if True:
        net = Embedding(input_dim=len(embedding_matrix),
                        output_dim=EMBEDDING_DIM,
                        weights=[embedding_matrix],
                        input_length=SEQUENCE_LENGTH,
                        trainable=False)(net)
    else:
        net = Embedding(input_dim=num_words,
                        output_dim=EMBEDDING_DIM,
                        input_length=SEQUENCE_LENGTH)(net)

    pathway1 = Conv1D(kernel_size=3, strides=1, filters=64, padding='same',
                      activation='relu', name='conv_1')(net)
    pathway1 = MaxPool1D(pool_size=SEQUENCE_LENGTH)(pathway1)
    pathway2 = Conv1D(kernel_size=4, strides=1, filters=64, padding='same',
                      activation='relu', name='conv_2')(net)
    pathway2 = MaxPool1D(pool_size=SEQUENCE_LENGTH)(pathway2)
    pathway3 = Conv1D(kernel_size=5, strides=1, filters=64, padding='same',
                      activation='relu', name='conv_3')(net)
    pathway3 = MaxPool1D(pool_size=SEQUENCE_LENGTH)(pathway3)
    net = concatenate([pathway1, pathway2, pathway3], axis=2)
    if DROPOUT_LAYER:
        net = Dropout(rate=dropout_rate)(net)
    if LSTM_LAYER:
        if DROPOUT_LAYER:
            net = LSTM(units=32, return_sequences=True,
                       name='LSTM_1', dropout=dropout_rate)(net)
            net = LSTM(units=8, name='LSTM_2', dropout=dropout_rate)(net)
        else:
            net = LSTM(units=32, return_sequences=True, name='LSTM_1')(net)
            net = LSTM(units=8, name='LSTM_2')(net)

    net = Dense(1, activation='sigmoid')(net)
    net = Flatten()(net)
    outputs = net
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='Adam', loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [9]:
def k_fold_cross_val(x_train_pad, y_train, n_folds, tb):
    # K fold cross validation
    skf = StratifiedKFold(y_train, n_folds=n_folds, shuffle=True)
    best_model = None
    last_acc = 0
    acc_avg = 0
    word_embedding = w2v_model.wv.vectors
#     for word, i in word_index.items():
#         embedding_vector = word_index.get(word)
#         if embedding_vector is not None:
#             word_embedding[i] = embedding_vector
    for i, (train, val) in enumerate(skf):
        print('Running fold: ', str(i+1))
        model = get_nn_model(word_embedding, pad_len)
        model.fit(x_train_pad[train], y_train[train],
                  epochs=EPOCHS, batch_size=BATCH_SIZE, shuffle=True, verbose=2, callbacks=[tb])
        result = model.evaluate(x_train_pad[val], y_train[val])

        print('Validation acc: {}'.format(result[1]))
        acc_avg += result[1]
        if result[1] > last_acc:
            best_model = model
            y_pred = model.predict(x_train_pad[val])
#             plot_roc_curve(y_train[val], y_pred, out_dir)
        last_acc = result[1]
    acc_avg /= n_folds
    return best_model, acc_avg

In [10]:
# keras module
from tensorflow.python.keras.models import Model, load_model
from tensorflow.python.keras.layers import Dense, LSTM, Embedding, Input, Conv1D, MaxPool1D, concatenate, Flatten, Dropout
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.callbacks import TensorBoard
#sklearn module
from sklearn.metrics import roc_auc_score, auc, roc_curve
from sklearn.cross_validation import StratifiedKFold
import datetime
import os
message = 'CNN_text_embedding'
timestamp = datetime.datetime.now().isoformat()
out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs_2", timestamp+message, "summaries"))
tb = TensorBoard(log_dir=out_dir, histogram_freq=0, write_graph=True, write_images=True)

model, acc_avg = k_fold_cross_val(x_train_pad, y_train, 5, tb)



Running fold:  1
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Epoch 1/10
 - 2s - loss: 0.6912 - acc: 0.5610
Epoch 2/10
 - 0s - loss: 0.6880 - acc: 0.5466
Epoch 3/10
 - 0s - loss: 0.6695 - acc: 0.6657
Epoch 4/10
 - 0s - loss: 0.6373 - acc: 0.6930
Epoch 5/10
 - 0s - loss: 0.5701 - acc: 0.7374
Epoch 6/10
 - 0s - loss: 0.4960 - acc: 0.7891
Epoch 7/10
 - 0s - loss: 0.4627 - acc: 0.7891
Epoch 8/10
 - 0s - loss: 0.4392 - acc: 0.8063
Epoch 9/10
 - 0s - loss: 0.3747 - acc: 0.8407
Epoch 10/10
 - 0s - loss: 0.3395 - acc: 0.8494

Validation acc: 0.75
Running fold:  2


ResourceExhaustedError: OOM when allocating tensor with shape[259425,500] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: embedding_1/embeddings/Assign = Assign[T=DT_FLOAT, _class=["loc:@embedding_1/embeddings"], use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](embedding_1/embeddings, embedding_1/embeddings/Initializer/random_uniform)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


Caused by op 'embedding_1/embeddings/Assign', defined at:
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 486, in start
    self.io_loop.start()
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 112, in start
    self.asyncio_loop.run_forever()
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/asyncio/base_events.py", line 421, in run_forever
    self._run_once()
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/asyncio/base_events.py", line 1431, in _run_once
    handle._run()
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/tornado/ioloop.py", line 760, in _run_callback
    ret = callback()
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 536, in <lambda>
    self.io_loop.add_callback(lambda : self._handle_events(self.socket, 0))
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-f7e767f198d2>", line 18, in <module>
    model, acc_avg = k_fold_cross_val(x_train_pad, y_train, 5, tb)
  File "<ipython-input-9-d3d44fa8b948>", line 14, in k_fold_cross_val
    model = get_nn_model(word_embedding, pad_len)
  File "<ipython-input-8-42e26010d35a>", line 10, in get_nn_model
    trainable=False)(net)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/tensorflow/python/keras/_impl/keras/engine/topology.py", line 258, in __call__
    output = super(Layer, self).__call__(inputs, **kwargs)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/tensorflow/python/layers/base.py", line 636, in __call__
    self.build(input_shapes)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/tensorflow/python/keras/_impl/keras/layers/embeddings.py", line 123, in build
    dtype=self.dtype)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/tensorflow/python/keras/_impl/keras/engine/topology.py", line 220, in add_weight
    trainable=trainable)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/tensorflow/python/layers/base.py", line 504, in add_variable
    partitioner=partitioner)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 1262, in get_variable
    constraint=constraint)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 1097, in get_variable
    constraint=constraint)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 435, in get_variable
    constraint=constraint)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 404, in _true_getter
    use_resource=use_resource, constraint=constraint)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 806, in _get_single_variable
    constraint=constraint)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 229, in __init__
    constraint=constraint)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 366, in _init_from_args
    validate_shape=validate_shape).op
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/tensorflow/python/ops/state_ops.py", line 276, in assign
    validate_shape=validate_shape)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/tensorflow/python/ops/gen_state_ops.py", line 59, in assign
    use_locking=use_locking, name=name)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3160, in create_op
    op_def=op_def)
  File "/home/yyliu/anaconda3/envs/NLP/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1625, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[259425,500] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: embedding_1/embeddings/Assign = Assign[T=DT_FLOAT, _class=["loc:@embedding_1/embeddings"], use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](embedding_1/embeddings, embedding_1/embeddings/Initializer/random_uniform)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

