### Import Packages

In [1]:
import os
import time
import jieba as jb
import numpy as np
import tensorflow as tf
from sklearn import metrics

os.environ['CUDA_VISIBLE_DEVICES'] = '3'
tf.logging.set_verbosity(tf.logging.INFO)

gpuconfig = tf.ConfigProto(log_device_placement=True)
gpuconfig.gpu_options.allow_growth = True

estimator_config = tf.estimator.RunConfig(
    model_dir='../logs/Dynamic-Bi-LSTM',
    save_summary_steps=100,
    session_config=gpuconfig)

base_dir = '../'
category_file = os.path.join(base_dir, 'data/category.txt')
train_file = os.path.join(base_dir, 'data/train.txt')
develop_file = os.path.join(base_dir, 'data/develop.txt')
test_file = os.path.join(base_dir, 'data/test.txt')

### Read DataSet

In [2]:
class SMPDATASET(object):
    def __init__(self, filenames, category_file):
        self.__filenames = filenames
        self.__category_file = category_file
        self.vocab_size = 0
        self.seq_max_len = 0
        self.word_index = {}

    def __read_category(self):
        category_table = {}
        for line in open(self.__category_file).readlines():
            category = line.strip().split(':')[0]
            label = line.strip().split(':')[1]
            category_table[category] = label
        return category_table

    def __read_data(self):
        dataset = {}
        labels = ['train', 'develop', 'test']
        for filename, label in zip(self.__filenames, labels):
            data = [
                line.strip().split("\t")
                for line in open(filename).readlines()
            ]
            y = [self.__read_category()[item[0]] for item in data]
            x = [list(jb.cut(item[1])) for item in data]
            dataset[label] = (x, np.array(y, dtype=int))
        return dataset

    def __build_vocab(self, sentences):
        vocab_size = 0
        word_index = {}
        all_words = set([word for item in sentences for word in item])
        word_index["<PAD>"] = 0
        word_index["<UNK>"] = 1

        for index, word in enumerate(all_words):
            word_index[word] = index + 2
        vocab_size = len(word_index)

        return vocab_size, word_index

    def __get_max_len(self, sentences):
        return max([len(sentence) for sentence in sentences])

    def __word2index(self, sentences):
        results = []
        for sentence in sentences:
            sentence_id = []
            for word in sentence:
                if word in self.word_index.keys():
                    sentence_id.append(self.word_index[word])
                else:
                    sentence_id.append(self.word_index["<UNK>"])
            results.append(sentence_id)
        return results

    def __padding_sentence(self, sentences):
        padding_result = tf.keras.preprocessing.sequence.pad_sequences(
            sentences,
            maxlen=self.seq_max_len,
            truncating='post',
            padding='post',
            value=self.word_index['<PAD>'])
        return padding_result

    def load_data(self):
        print("INFO:Read data from file.")
        x_train, y_train = self.__read_data()['train']
        x_develop, y_develop = self.__read_data()['develop']
        x_test, y_test = self.__read_data()['test']

        print("INFO:Building vocabulary from train set.")
        self.vocab_size, self.word_index = self.__build_vocab(x_train)

        print("INFO:Compute the maximum length of sentence.")
        self.seq_max_len = self.__get_max_len(x_train)

        print("INFO:Convert words to id represent.")
        x_train_id = self.__word2index(x_train)
        x_develop_id = self.__word2index(x_develop)
        x_test_id = self.__word2index(x_test)
        x_len_train = np.array(
            [min(len(x), self.seq_max_len) for x in x_train_id])
        x_len_develop = np.array(
            [min(len(x), self.seq_max_len) for x in x_develop_id])
        x_len_test = np.array(
            [min(len(x), self.seq_max_len) for x in x_test_id])

        print("INFO:Padding sentences.")
        x_train_pad = self.__padding_sentence(x_train_id)
        x_develop_pad = self.__padding_sentence(x_develop_id)
        x_test_pad = self.__padding_sentence(x_test_id)

        print("INFO:Maximum  length of sentence is {}.".format(
            self.seq_max_len))
        print("INFO:Vocabulary size is {}.".format(self.vocab_size))
        print("INFO:train set shape is {}.".format(x_train_pad.shape))
        print("INFO:develop set shape is {}.".format(x_develop_pad.shape))
        print("INFO:test set shape is {}.".format(x_test_pad.shape))

        return (x_train_pad, x_len_train,
                y_train), (x_develop_pad, x_len_develop,
                           y_develop), (x_test_pad, x_len_test, y_test)


filenames = [train_file, develop_file, test_file]
(x_train, x_len_train,
 y_train), (x_develop, x_len_develop,
            y_develop), (x_test, x_len_test, y_test) = SMPDATASET(
                filenames, category_file).load_data()

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.694 seconds.


INFO:Read data from file.


Prefix dict has been built succesfully.


INFO:Building vocabulary from train set.
INFO:Compute the maximum length of sentence.
INFO:Convert words to id represent.
INFO:Padding sentences.
INFO:Maximum  length of sentence is 26.
INFO:Vocabulary size is 2889.
INFO:train set shape is (2299, 26).
INFO:develop set shape is (770, 26).
INFO:test set shape is (666, 26).


### Parameters

In [3]:
params = {
    'embedding_initializer': tf.random_uniform_initializer(-1.0, 1.0),
    'hidden_size': 128,
    'num_classes': 31,
    'embedding_size': 128,
    'vocab_size': 2889,
    'batch_size': 32,
    'learning_rate': 0.001
}

In [4]:
def parser(x, length, y):
    features = {"x": x, "len": length}
    return features, y


def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((x_train, x_len_train,
                                                  y_train))
    dataset = dataset.shuffle(buffer_size=len(x_train))
    dataset = dataset.batch(params['batch_size'])
    dataset = dataset.map(parser)
    dataset = dataset.repeat()
    dataset = dataset.prefetch(buffer_size=1)
    iterator = dataset.make_one_shot_iterator()
    return iterator.get_next()


def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((x_develop, x_len_develop,
                                                  y_develop))
    dataset = dataset.batch(len(x_develop))
    dataset = dataset.map(parser)
    iterator = dataset.make_one_shot_iterator()
    return iterator.get_next()


def test_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((x_test, x_len_test, y_test))
    dataset = dataset.batch(len(x_test))
    dataset = dataset.map(parser)
    iterator = dataset.make_one_shot_iterator()
    return iterator.get_next()

### Model

In [5]:
def model_fn(features, labels, mode, params):
    input_layer = tf.contrib.layers.embed_sequence(
        features['x'],
        params['vocab_size'],
        params['embedding_size'],
        initializer=params['embedding_initializer'])

    training = mode == tf.estimator.ModeKeys.TRAIN

    lstm_cell_fw = tf.nn.rnn_cell.LSTMCell(num_units=params['hidden_size'])
    lstm_cell_bw = tf.nn.rnn_cell.LSTMCell(num_units=params['hidden_size'])
    outputs, output_states = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=lstm_cell_bw,
        cell_bw=lstm_cell_fw,
        inputs=input_layer,
        dtype=tf.float32)

    concat_result = tf.concat([output_states[0].h, output_states[1].h],
                              axis=-1,
                              name='Concat')

    dropout_hidden = tf.layers.dropout(
        inputs=concat_result, rate=0.5, training=training, name='Dropout')

    logits = tf.layers.dense(
        inputs=dropout_hidden, units=params['num_classes'], name='Fc')

    predicted_labels = tf.argmax(logits, 1)
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {'pre_labels': predicted_labels[:, tf.newaxis]}
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    accuracy = tf.metrics.accuracy(
        labels=labels, predictions=predicted_labels, name='acc_op')
    metrics = {'accuracy': accuracy}
    tf.summary.scalar('accuracy', accuracy[1])
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode, loss=loss, eval_metric_ops=metrics)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer(
            learning_rate=params['learning_rate'], name='Adam')
        train_op = optimizer.minimize(
            loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)


model = tf.estimator.Estimator(
    model_fn=model_fn, config=estimator_config, params=params)

INFO:tensorflow:Using config: {'_model_dir': '../logs/Dynamic-Bi-LSTM', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': gpu_options {
  allow_growth: true
}
log_device_placement: true
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fe4e0042278>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


### Training

In [6]:
model.train(input_fn=train_input_fn, steps=3000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into ../logs/Dynamic-Bi-LSTM/model.ckpt.
INFO:tensorflow:loss = 3.5803084, step = 0
INFO:tensorflow:global_step/sec: 24.4707
INFO:tensorflow:loss = 2.494048, step = 100 (4.089 sec)
INFO:tensorflow:global_step/sec: 25.7708
INFO:tensorflow:loss = 0.93669045, step = 200 (3.882 sec)
INFO:tensorflow:global_step/sec: 27.6955
INFO:tensorflow:loss = 0.5588388, step = 300 (3.609 sec)
INFO:tensorflow:global_step/sec: 25.902
INFO:tensorflow:loss = 0.4160512, step = 400 (3.864 sec)
INFO:tensorflow:global_step/sec: 25.8741
INFO:tensorflow:loss = 0.2019173, step = 500 (3.863 sec)
INFO:tensorflow:global_step/sec: 26.7774
INFO:tensorflow:loss = 0.20772806, step = 600 (3.735 sec)
INFO:tensorflow:global_step/sec: 26.9874
INFO:tens

<tensorflow.python.estimator.estimator.Estimator at 0x7fe4584982e8>

### Evaluate

In [7]:
model.evaluate(input_fn=eval_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-01-20-11:37:17
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../logs/Dynamic-Bi-LSTM/model.ckpt-3000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-01-20-11:37:18
INFO:tensorflow:Saving dict for global step 3000: accuracy = 0.812987, global_step = 3000, loss = 0.9238068
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 3000: ../logs/Dynamic-Bi-LSTM/model.ckpt-3000


{'accuracy': 0.812987, 'loss': 0.9238068, 'global_step': 3000}

### Predict

In [8]:
# pre_results = np.array([
#     pre_label['pre_labels'][0]
#     for pre_label in model.predict(input_fn=test_input_fn)
# ])
# print("\nClassification Report:\n",
#       metrics.classification_report(y_test, pre_results))