In [1]:
# import modules
import pandas as pd
import pickle
import gensim
import numpy as np
import string
from opencc import OpenCC
import ckip
import jieba
# Path of files
SENTENCE_DICT = "../../pickle/sentence_dict.pickle"
WORDVEC_MODEL = '../../wordvec_model/'
# Variables
DEMENTIA_NUM = 51
CONTROL_NUM = 51
EMBEDDING_DIM = 100

def read_sentence_file(file_name=None):
    with open(SENTENCE_DICT, 'rb') as f:
        sentence_dict = pickle.load(f)
        print("Load sentence text data ...")
    return sentence_dict

def load_wordvec_model(file_name):
    w2v_model = gensim.models.Word2Vec.load(WORDVEC_MODEL+file_name)
    words = []
    for word in w2v_model.wv.vocab:
        words.append(word)
    print('Load word2vec model sucess ...')
    print('Number of token: {}'.format(len(words)))
    print('Dimensions of word vector: {}'.format(len(w2v_model[words[0]])))
    return w2v_model

In [2]:
sentence_dict = read_sentence_file()

Load sentence text data ...


In [3]:
w2v_model = load_wordvec_model('100features_20context_20mincount_zht')

Load word2vec model sucess ...
Number of token: 259425
Dimensions of word vector: 100




In [4]:
vocab_size = len(w2v_model.wv.vocab.keys())
word_embedding = []
for k in w2v_model.wv.vocab.keys():
    word_embedding.append(np.asarray(w2v_model.wv[k]))

In [5]:
def split_punctuation(sentence):
    punctuation = set(string.punctuation+"，"+"、"+"」"+"「"+"。"+" "+"！")
    sentence_split = []
    tmp = ''
    for i in sentence:
        if i not in punctuation:
            tmp += i
        else:
            sentence_split.append(tmp)
            tmp = ''
    return sentence_split
sentence = '3個人，一個媽媽兩個小孩，小孩站在椅子上要拿西點，椅子都快倒下來了，在拿這個西點餅乾要吃，手下還拿著一塊，'
print(split_punctuation(sentence))

['3個人', '一個媽媽兩個小孩', '小孩站在椅子上要拿西點', '椅子都快倒下來了', '在拿這個西點餅乾要吃', '手下還拿著一塊']


In [6]:
with open('../../data/dementia.txt', encoding='utf8') as f:
    dementia_txt = f.readlines()
sentence = []
for i in range(len(dementia_txt)):
    if i%2==0:
        sentence.extend(split_punctuation(dementia_txt[i+1]))
dementia_num = len(sentence)
with open('../../data/control_51.txt', encoding='utf8') as f:
    control_txt = f.readlines()
for i in range(len(control_txt)):
    if i%2==0:
        sentence.extend(split_punctuation(control_txt[i+1]))
control_num = len(sentence) - dementia_num
############
# train set#
############
train_data = np.array(sentence)
dementia_labels = [[0, 1] for _ in train_data[:dementia_num]]
control_labels = [[1, 0] for _ in train_data[dementia_num:]]
print('total number of train set: {}'.format(train_data.shape[0]))
print('sentence number of dementia subject: {}'.format(len(dementia_labels)))
print('sentence number of control normal subject: {}'.format(len(control_labels)))

total number of train set: 873
sentence number of dementia subject: 442
sentence number of control normal subject: 431


In [7]:
JIEBA_DICT = '../../data/dict.txt.big'
jieba.set_dictionary(JIEBA_DICT)
train_data_seg = []
for i in train_data:
    train_data_seg.append(' '.join(jieba.lcut(i)))
print(train_data_seg[5])

Building prefix dict from /home/yyliu/code/NLP/data/dict.txt.big ...
Loading model from cache /tmp/jieba.u74f96b08eeb68fe4b0ac4c13a6f276ed.cache
Loading model cost 1.434 seconds.
Prefix dict has been built succesfully.


手下 還拿著 一塊


In [8]:
SEQUENCE_LENGTH = 17

train_data_seg_array = np.array(train_data_seg)
l = []
for i in range(len(train_data_seg_array)):
    l.append(len(train_data_seg_array[i].split(' ')))
#     if len(train_data_seg_array[i])==1:
#         print(i, train_data_seg_array[i])
print('Max token number of sentence: {}'.format(np.max(l)))
print('Min token number of sentence: {}'.format(np.min(l)))
print('Mean token number of sentence: {}'.format(np.mean(l)))

Max token number of sentence: 17
Min token number of sentence: 1
Mean token number of sentence: 5.747995418098511


In [9]:
# train_vec = []
# for s in train_data_seg:
#     token_list = []
#     for token in s.split(' '):
#         if token in w2v_model.wv.vocab:
#             token_list.append(np.asarray(w2v_model.wv[token]))
#     if len(token_list) < SEQUENCE_LENGTH:
#         for i in range(SEQUENCE_LENGTH - len(token_list)):
#             token_list.append(np.zeros(shape=VOCAB_DIM))
#     train_vec.append([token_list[0:SEQUENCE_LENGTH]])

# seg_sentence_vec = []
# for key, s in seg_sentence.items():
#     token_list = []
#     for token in s:
#         if token in w2v_model.wv.vocab:
#             token_list.append(np.asarray(w2v_model.wv[token]))
#     if len(token_list) < SEQUENCE_LENGTH:
#         for i in range(SEQUENCE_LENGTH - len(token_list)):
#             token_list.append(np.zeros(shape=VOCAB_DIM))
# #             token_list.append(np.zeros(shape=(VOCAB_DIM, 1)).tolist())
#     seg_sentence_vec.append([token_list[0:SEQUENCE_LENGTH]])

In [10]:
# train_vec = np.asarray(train_vec).reshape(len(train_data_seg),-1,500)
# print(train_vec.shape)
# print(train_vec[2])

In [11]:
import tensorflow as tf

In [12]:
max_sentence_length = max([len(x.split(' ')) for x in train_data_seg_array])
print(max_sentence_length)
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(max_sentence_length)
vocab_processor.fit(w2v_model.wv.vocab.keys())
x_one_hot = np.array(list(vocab_processor.transform(train_data_seg_array)))

17


In [13]:
x_one_hot[0:5]

array([[    0,  1191,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0],
       [  367, 19105,   874, 19002,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0],
       [19002, 10258,    58, 89874,  8208, 11504, 87750,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0],
       [89874,   255, 10264,  2404,  1931,   224,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0],
       [   58, 11504,  1233, 87750, 90507,  2410, 25878,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0]])

In [14]:
# vocab_size=len(vocab_processor.vocabulary_)
# embedding_size = 128
# with tf.device('/cpu:0'), tf.name_scope('embedding'):
#     W = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name = 'W')
#     embedded_chars = tf.nn.embedding_lookup(W, input_x)
#     embedded_chars_expand = tf.expand_dims(embedded_chars, -1)

In [33]:
import tensorflow as tf
import numpy as np


class TextCNN(object):
    """
    A CNN for text classification.
    Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer.
    """
    def __init__(
      self, sequence_length, num_classes, vocab_size,
      embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0):

        # Placeholders for input, output and dropout
        self.input_x = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        # Keeping track of l2 regularization loss (optional)
        l2_loss = tf.constant(0.0)

        # Embedding layer
        with tf.device('/cpu:0'), tf.name_scope("embedding"), tf.variable_scope('scope_1', reuse=tf.AUTO_REUSE):
            self.W = tf.get_variable(shape=[vocab_size,EMBEDDING_DIM],
                                     initializer=tf.constant_initializer(np.array(word_embedding)), name='W', trainable=False)
#             self.W = tf.Variable(
#                 tf.constant(0.0, shape=[vocab_size, EMBEDDING_DIM]),
#                 name="W", trainable=False)
            self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Maxpooling over the outputs
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length-filter_size+1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # Add dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)

        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            W = tf.get_variable(
                "W",
                shape=[num_filters_total, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # Calculate mean cross-entropy loss
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
        

In [28]:
y = np.concatenate([dementia_labels, control_labels], 0)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x_one_hot[shuffle_indices]
y_shuffled = y[shuffle_indices]

In [29]:
dev_sample_index = -1 * int(.1*float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
del x_shuffled, y_shuffled

In [30]:
print('vocab_size: {}'.format(len(vocab_processor.vocabulary_)))
print('Train/Dev split : {}/{}'.format(len(y_train), len(y_dev)))

vocab_size: 259426
Train/Dev split : 786/87


In [31]:
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((len(data)-1)/batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]


In [34]:
import time
import os
import datetime
batch_size = 32
num_epochs = 10
dropout_keep_prob = 0.5

num_checkpoints = 5
# checkpoint_every = 100

evaluate_every = 100

filter_sizes = (3,4,5)
num_filters = 128
l2_reg_lambda = 0.05

with tf.Graph().as_default():
    sess = tf.Session()
    with sess.as_default():
        cnn = TextCNN(sequence_length=SEQUENCE_LENGTH, 
                     num_classes=2, 
                     vocab_size=len(vocab_processor.vocabulary_), 
                     embedding_size=EMBEDDING_DIM, 
                     filter_sizes=filter_sizes, 
                     num_filters=num_filters, 
                      l2_reg_lambda=l2_reg_lambda)
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
        
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.summary.merge(grad_summaries)
        
        # Output directory for models and summaries
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs_2", timestamp))
        print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.summary.scalar("loss", cnn.loss)
        acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

        # Train Summaries
        train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
        
        # Dev summaries
        dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

        # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
#         checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
#         checkpoint_prefix = os.path.join(checkpoint_dir, "model")
#         if not os.path.exists(checkpoint_dir):
#             os.makedirs(checkpoint_dir)
#         saver = tf.train.Saver(tf.global_variables(), max_to_keep=num_checkpoints)

        # Write vocabulary
        vocab_processor.save(os.path.join(out_dir, "vocab"))
        sess.run(tf.global_variables_initializer())
        
        def train_step(x_batch, y_batch):
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: dropout_keep_prob
            }
            _, step, summaries, loss, accuracy = sess.run(
                [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            train_summary_writer.add_summary(summaries, step)

        def dev_step(x_batch, y_batch, writer=None):
            """
            Evaluates model on a dev set
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: 1.0
            }
            step, summaries, loss, accuracy = sess.run(
                [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            if writer:
                writer.add_summary(summaries, step)

        # Generate batches
        batches = batch_iter(
            list(zip(x_train, y_train)), batch_size, num_epochs)
        # Training loop. For each batch...
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
            if current_step % evaluate_every == 0:
                print("\nEvaluation:")
                dev_step(x_dev, y_dev, writer=dev_summary_writer)
                print("")
#             if current_step % checkpoint_every == 0:
#                 path = saver.save(sess, checkpoint_prefix, global_step=current_step)
#                 print("Saved model checkpoint to {}\n".format(path))
        

INFO:tensorflow:Summary name conv-maxpool-3/W:0/grad/hist is illegal; using conv-maxpool-3/W_0/grad/hist instead.
INFO:tensorflow:Summary name conv-maxpool-3/W:0/grad/sparsity is illegal; using conv-maxpool-3/W_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-3/b:0/grad/hist is illegal; using conv-maxpool-3/b_0/grad/hist instead.
INFO:tensorflow:Summary name conv-maxpool-3/b:0/grad/sparsity is illegal; using conv-maxpool-3/b_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-4/W:0/grad/hist is illegal; using conv-maxpool-4/W_0/grad/hist instead.
INFO:tensorflow:Summary name conv-maxpool-4/W:0/grad/sparsity is illegal; using conv-maxpool-4/W_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-4/b:0/grad/hist is illegal; using conv-maxpool-4/b_0/grad/hist instead.
INFO:tensorflow:Summary name conv-maxpool-4/b:0/grad/sparsity is illegal; using conv-maxpool-4/b_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-5/W:0/grad/his

2018-05-02T14:46:28.367998: step 103, loss 0.625424, acc 0.75
2018-05-02T14:46:28.378108: step 104, loss 0.529945, acc 0.78125
2018-05-02T14:46:28.387919: step 105, loss 0.528089, acc 0.78125
2018-05-02T14:46:28.398544: step 106, loss 0.580177, acc 0.8125
2018-05-02T14:46:28.407743: step 107, loss 0.460612, acc 0.875
2018-05-02T14:46:28.417404: step 108, loss 0.486588, acc 0.875
2018-05-02T14:46:28.427120: step 109, loss 0.483983, acc 0.90625
2018-05-02T14:46:28.437429: step 110, loss 0.489171, acc 0.78125
2018-05-02T14:46:28.447251: step 111, loss 0.453513, acc 0.875
2018-05-02T14:46:28.455684: step 112, loss 0.476339, acc 0.84375
2018-05-02T14:46:28.464204: step 113, loss 0.520922, acc 0.75
2018-05-02T14:46:28.483174: step 114, loss 0.559811, acc 0.78125
2018-05-02T14:46:28.494167: step 115, loss 0.603401, acc 0.75
2018-05-02T14:46:28.504676: step 116, loss 0.541277, acc 0.78125
2018-05-02T14:46:28.513651: step 117, loss 0.551685, acc 0.84375
2018-05-02T14:46:28.522699: step 118, los

2018-05-02T14:46:29.835670: step 249, loss 0.321729, acc 0.9375
2018-05-02T14:46:29.844601: step 250, loss 0.44431, acc 0.833333
