In [30]:
# 构建计算图-LSTM
#     embedding
#     LSTM
#     fc
#     train_op
# 训练流程代码
# 数据集封装
#     api: next_batch(batch_size)
# 词表封装:
#     api: sentenceToId(text_sentence):句子转换id
# 类别封装:
#     api: categoryToId(text_category).

In [31]:
import logging

import tensorflow.compat.v1 as tf
import os
import sys
import numpy as np
import math

tf.disable_v2_behavior()
tf.logging.set_verbosity(tf.logging.INFO)

In [32]:
class HParams:
    def __init__(self,
                 num_embedding_size,
                 num_timesteps,
                 num_fc_nodes,
                 num_filters,
                 num_kernel_size,
                 batch_size,
                 learning_rate,
                 # 字符出现的阈值
                 num_word_threshold):
        self._num_embedding_size = num_embedding_size
        self._num_timesteps = num_timesteps
        self._num_fc_nodes = num_fc_nodes
        self._num_filters = num_filters
        self._num_kernel_size = num_kernel_size
        self._batch_size = batch_size
        self._learning_rate = learning_rate
        # 字符出现的阈值
        self._num_word_threshold = num_word_threshold

    @property
    def num_embedding_size(self):
        return self._num_embedding_size
    @property
    def num_timesteps(self):
        return self._num_timesteps
    @property
    def num_fc_nodes(self):
        return self._num_fc_nodes
    @property
    def num_filters(self):
        return self._num_filters
    @property
    def num_kernel_size(self):
        return self._num_kernel_size
    @property
    def batch_size(self):
        return self._batch_size
    @property
    def learning_rate(self):
        return self._learning_rate
    @property
    def num_word_threshold(self):
        return self._num_word_threshold

In [33]:
hps = HParams(
        num_embedding_size = 16,
        num_timesteps = 50,
        num_filters = 128,
        num_kernel_size = 3,
        num_fc_nodes = 32,
        batch_size = 100,
        learning_rate = 0.001,
        # 字符出现次数最小值
        num_word_threshold = 10)

In [34]:
train_file = 'cnews_data/cnews.train.seg.txt'
val_file = 'cnews_data/cnews.val.seg.txt'
test_file = 'cnews_data/cnews.test.seg.txt'
vocab_file = 'cnews_data/cnews.vocab.txt'
category_file = 'cnews_data/cnews.category.txt'
output_folder = 'cnews_data/run_text_rnn'

if not os.path.exists(output_folder):
    os.mkdir(output_folder)

In [35]:
class Vocab:
    def __init__(self, filename, num_word_threshold):
        self._word_to_id = {}
        self._unk = -1
        self._num_word_threshold = num_word_threshold
        self._read_dict(filename)

    def _read_dict(self, filename):
        with open(filename, 'r', encoding = 'utf-8') as f:
            lines = f.readlines()
        for line in lines:
            word, frequency = line.strip('\r\n').split('\t')
            frequency = int(frequency)
            if frequency < self._num_word_threshold:
                continue
            idx = len(self._word_to_id)
            if word == '<UNK>':
                self._unk = idx
            self._word_to_id[word] = idx

    def word_to_id(self, word):
        return self._word_to_id.get(word, self._unk)

    @property
    def unk(self):
        return self._unk

    def size(self):
        return len(self._word_to_id)

    def sentence_to_id(self, sentence):
        word_ids = [self.word_to_id(cur_word) for cur_word in sentence.split()]
        return word_ids

class CategoryDict:
    def __init__(self, file_name):
        self._category_to_id = {}
        with open(file_name, 'r', encoding = 'utf-8') as f:
            lines = f.readlines()
        for line in lines:
            category = line.strip('\r\n')
            idx = len(self._category_to_id)
            self._category_to_id[category] = idx

    def size(self):
        return len(self._category_to_id)
    def category_to_id(self, category):
        if not category in self._category_to_id:
            raise Exception("%s is not in our category list" % category)
        return self._category_to_id[category]

vocab = Vocab(vocab_file, hps.num_word_threshold)
vocab_size = vocab.size()
tf.logging.info('vocab_size: %d' % vocab_size)
category_vocab = CategoryDict(category_file)
num_classes = category_vocab.size()
tf.logging.info('category_size: %d' % num_classes)

INFO:tensorflow:vocab_size: 77331
INFO:tensorflow:category_size: 10


In [36]:
class TextDataSet:
    def __init__(self, filename, vocab, category_vocab, num_timesteps):
        self._vocab = vocab
        self._category_vocab = category_vocab
        self._num_timesteps = num_timesteps
        # matrix
        self._inputs = []
        # vector
        self._outputs = []
        self._indicator = 0
        self._parse_file(filename)

    def _parse_file(self,filename):
        tf.logging.info("Loading data from %s", filename)
        with open(filename, 'r', encoding = 'utf-8') as f:
            lines = f.readlines()
        for line in lines:
            label, content = line.strip('\r\n').split('\t')
            id_label = self._category_vocab.category_to_id(label)
            id_words = self._vocab.sentence_to_id(content)
            id_words = id_words[0:self._num_timesteps]
            padding_num = self._num_timesteps - len(id_words)
            id_words = id_words + \
                       [self._vocab.unk for i in range(padding_num)]
            self._inputs.append(id_words)
            self._outputs.append(id_label)
        self._inputs = np.asarray(self._inputs, dtype = np.int32)
        self._outputs = np.asarray(self._outputs, dtype = np.int32)
        self._random_shuffle()
        self._num_examples = len(self._inputs)

    def _random_shuffle(self):
        p = np.random.permutation(len(self._inputs))
        self._inputs = self._inputs[p]
        self._outputs = self._outputs[p]

    def num_examples(self):
        return self._num_examples

    def next_batch(self, batch_size):
        end_indicator = self._indicator + batch_size
        if end_indicator > len(self._inputs):
            self._random_shuffle()
            self._indicator = 0
            end_indicator = batch_size
        if end_indicator > len(self._inputs):
            raise Exception("batch_size: %d is too large" % batch_size)

        batch_inputs = self._inputs[self._indicator: end_indicator]
        batch_outputs = self._outputs[self._indicator: end_indicator]
        self._indicator = end_indicator
        return batch_inputs, batch_outputs


In [37]:
train_dataset = TextDataSet(
    train_file, vocab, category_vocab, hps.num_timesteps
)
val_dataset = TextDataSet(
    val_file, vocab, category_vocab, hps.num_timesteps
)
test_dataset = TextDataSet(
    test_file, vocab, category_vocab, hps.num_timesteps
)
print(train_dataset.next_batch(2))
print(val_dataset.next_batch(2))
print(test_dataset.next_batch(2))

INFO:tensorflow:Loading data from cnews_data/cnews.train.seg.txt
INFO:tensorflow:Loading data from cnews_data/cnews.val.seg.txt
INFO:tensorflow:Loading data from cnews_data/cnews.test.seg.txt
(array([[  467,    11,  5167, ...,     0,     0,     0],
       [ 9107,    15, 40185, ...,     0,     0,     0]]), array([8, 1]))
(array([[1399,   15, 6420, ...,    0,    0,    0],
       [ 388,   27,    0, ...,    0,    0,    0]]), array([7, 4]))
(array([[  467,    11, 11064, ...,     0,     0,     0],
       [34507, 17425, 10653, ...,     0,     0,     0]]), array([5, 8]))


In [38]:
def create_model(hps, vocab_size, num_classes):
    num_timesteps = hps.num_timesteps
    batch_size = hps.batch_size

    inputs = tf.placeholder(tf.int32, (batch_size, num_timesteps))
    outputs = tf.placeholder(tf.int32, (batch_size, ))
    # dropout 保存下来的值
    keep_prob = tf.placeholder(tf.float32, name = 'keep_prob')
    # 保存模型训练到哪一步

    global_step = tf.Variable(
        tf.zeros([], tf.int64), name = 'global_step', trainable = False
    )
    # 随机的均匀分布中初始化
    embedding_initializer = tf.random_uniform_initializer(-1.0, 1.0)
    with tf.variable_scope('embedding', initializer = embedding_initializer):
         embeddings = tf.get_variable(
             'embeddinggg',
             [vocab_size, hps.num_embedding_size],
             tf.float32
         )
         # 对应inputs & embeddings 拼接成矩阵
         # [1, 10, 7] -> [embeddings[1], embeddings[10], embeddings[7]]
         embed_inputs = tf.nn.embedding_lookup(embeddings, inputs)

    """
    scale = 1.0 / math.sqrt(hps.num_embedding_size + hps.num_lstm_nodes[-1]) / 3.0
    lstm_init = tf.random_uniform_initializer(-scale, scale)
    with tf.variable_scope('lstm_nn', initializer = lstm_init):
        cells = []
        for i in range(hps.num_lstm_layers):
            cell = tf.nn.rnn_cell.BasicLSTMCell(
                hps.num_lstm_nodes[i],
                state_is_tuple = True
            )
            cell = tf.nn.rnn_cell.DropoutWrapper(
                cell,
                output_keep_prob = keep_prob
            )
            cells.append(cell)
        cell = tf.nn.rnn_cell.MultiRNNCell(cells)

        initial_state = cell.zero_state(batch_size, tf.float32)
        #rnn_outputs: [batch_size, num_timesteps, lstm_outputs[-1]]
        rnn_outputs, _ = tf.nn.dynamic_rnn(
            cell,embed_inputs,initial_state = initial_state)
        last = rnn_outputs[:,-1,:]
    """
    scale = 1.0 / math.sqrt(hps.num_embedding_size + hps.num_filters) / 3.0
    cnn_init = tf.random_uniform_initializer(-scale, scale)
    with tf.variable_scope('cnn', initializer = cnn_init):
        # embed_inputs: [batch_size, timesteps, embed_size]
        # conv1d: [batch_size, timesteps, num_filters]
        conv1d = tf.layers.conv1d(
            embed_inputs,
            hps.num_filters,
            hps.num_kernel_size,
            activation = tf.nn.relu,
        )
        global_maxpooling = tf.reduce_max(conv1d, axis = [1])
    fc_init = tf.uniform_unit_scaling_initializer(factor = 1.0)
    with tf.variable_scope('fc', initializer = fc_init):
        fc1 = tf.layers.dense(global_maxpooling,
                              hps.num_fc_nodes,
                              activation = tf.nn.relu,
                              name = 'fc1')
        fc1_dropout = tf.layers.dropout(fc1, keep_prob)
        # 映射到类别
        logits = tf.layers.dense(fc1_dropout, num_classes, name = 'fc2')

    with tf.name_scope('metrics'):
        softmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits = logits, labels = outputs
        )
        loss = tf.reduce_mean(softmax_loss)
        # [0,2,5,4,2] -> argmax: 2 在2的维度上最大
        y_pred = tf.argmax(tf.nn.softmax(logits),
                           1,
                           output_type = tf.int32)
        correct_pred = tf.equal(outputs, y_pred)
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    with tf.name_scope('train_op'):
        """
        tvars = tf.trainable_variables()
        for var in tvars:
            tf.logging.info('variable name %s' % var.name)
        # 限制梯度大小
        grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), hps.clip_lstm_grads)
        optimizer = tf.train.AdamOptimizer(hps.learning_rate)
        train_op = optimizer.apply_gradients(zip(grads, tvars),
                                             global_step = global_step)
        """
        train_op = tf.train.AdamOptimizer(hps.learning_rate).minimize(
            loss, global_step = global_step
        )

        return ((inputs, outputs, keep_prob),
                (loss, accuracy),
                (train_op, global_step))


In [39]:
tf.reset_default_graph()
placeholders, metrics, others = create_model(
    hps, vocab_size, num_classes)

inputs, outputs, keep_prob = placeholders
loss, accuracy = metrics
train_op, global_step = others

In [40]:
def eval_holdout(sess, accuracy, dataset_for_test, batch_size):
    num_batches = dataset_for_test.num_examples() // batch_size
    tf.logging.info("Eval holdout: num_examples = %d, batch_size = %d",
                    dataset_for_test.num_examples(), batch_size)
    accuracy_vals = []
    for i in range(num_batches):
        batch_inputs, batch_labels = dataset_for_test.next_batch(batch_size)
        accuracy_val = sess.run(accuracy,
                                feed_dict = {
                                    inputs: batch_inputs,
                                    outputs: batch_labels,
                                    keep_prob: 1.0,
                                })
        accuracy_vals.append(accuracy_val)
    return np.mean(accuracy_vals)

In [41]:
init_op = tf.global_variables_initializer()
train_keep_prob_value = 0.8
test_keeo_prob_value = 1.0

num_train_steps = 10000

with tf.Session() as sess:
    sess.run(init_op)
    for i in range(num_train_steps):
        batch_inputs,batch_labels = train_dataset.next_batch(hps.batch_size)
        outputs_val = sess.run([loss, accuracy,train_op, global_step],
                           feed_dict = {
                               inputs: batch_inputs,
                               outputs: batch_labels,
                               keep_prob: train_keep_prob_value
                           })
        loss_val, accuracy_val, _,global_step_val = outputs_val
        if global_step_val % 200 == 0:
            tf.logging.info("Step: %5d, loss: %3.3f, accuracy: %3.3f"
                            % (global_step_val, loss_val, accuracy_val))
        if global_step_val % 1000 == 0:
            accuracy_eval = eval_holdout(sess, accuracy, val_dataset, hps.batch_size)
            accuracy_test = eval_holdout(sess, accuracy, test_dataset, hps.batch_size)
            tf.logging.info("Step: %5d, val_accuracy: %3.3f, test_accuracy: %3.3f"
                            % (global_step_val, accuracy_eval, accuracy_test))


INFO:tensorflow:Step:   200, loss: 0.594, accuracy: 0.820
INFO:tensorflow:Step:   400, loss: 0.492, accuracy: 0.820
INFO:tensorflow:Step:   600, loss: 0.284, accuracy: 0.920
INFO:tensorflow:Step:   800, loss: 0.266, accuracy: 0.920
INFO:tensorflow:Step:  1000, loss: 0.220, accuracy: 0.910
INFO:tensorflow:Eval holdout: num_examples = 5000, batch_size = 100
INFO:tensorflow:Eval holdout: num_examples = 10000, batch_size = 100
INFO:tensorflow:Step:  1000, val_accuracy: 0.870, test_accuracy: 0.890
INFO:tensorflow:Step:  1200, loss: 0.182, accuracy: 0.930
INFO:tensorflow:Step:  1400, loss: 0.105, accuracy: 0.980
INFO:tensorflow:Step:  1600, loss: 0.079, accuracy: 0.960
INFO:tensorflow:Step:  1800, loss: 0.112, accuracy: 0.950
INFO:tensorflow:Step:  2000, loss: 0.027, accuracy: 0.990
INFO:tensorflow:Eval holdout: num_examples = 5000, batch_size = 100
INFO:tensorflow:Eval holdout: num_examples = 10000, batch_size = 100
INFO:tensorflow:Step:  2000, val_accuracy: 0.914, test_accuracy: 0.932
INFO