In [1]:
# 使用rnn在ptb数据集上建立自然语言模型
# ptb(Penn Treebank Dataset) 地址： http://www.fit.vutbr.cz/~imikolov/rnnlm/
# wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz


In [49]:
# tensorflow embedding_lookup 用法

import tensorflow as tf
import numpy as np

# Placeholders holding the row indices to look up: one 1-D batch, one 2-D batch.
input_ids = tf.placeholder(dtype=tf.int32, shape=[None])
input_ids2 = tf.placeholder(dtype=tf.int32, shape=[None, None])
# The lookup table: a 5x5 identity matrix, so looking up index i yields the
# one-hot row for i, which makes the result easy to read.
embedding = tf.Variable(np.identity(5), dtype=np.int32)

# Gather rows of `embedding` according to the ids. The result has the shape of
# the ids with one extra trailing dimension (the row length, 5).
input_embedding = tf.nn.embedding_lookup(embedding, input_ids)
input_embedding2 = tf.nn.embedding_lookup(embedding, input_ids2)

# Use a context-managed session (as main() does further down) so the session
# is closed even if one of the run() calls raises.
with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    # Parenthesised single-argument prints behave the same on Python 2 and 3.
    print(sess.run(embedding))
    print('=====')
    print(sess.run(input_embedding, feed_dict={input_ids: [1, 2, 3, 0, 3, 2, 1]}))
    print('=====')
    print(sess.run(input_embedding2, feed_dict={input_ids2: [[1, 2], [2, 1], [3, 3]]}))

[[1 0 0 0 0]
 [0 1 0 0 0]
 [0 0 1 0 0]
 [0 0 0 1 0]
 [0 0 0 0 1]]
=====
[[0 1 0 0 0]
 [0 0 1 0 0]
 [0 0 0 1 0]
 [1 0 0 0 0]
 [0 0 0 1 0]
 [0 0 1 0 0]
 [0 1 0 0 0]]
=====
[[[0 1 0 0 0]
  [0 0 1 0 0]]

 [[0 0 1 0 0]
  [0 1 0 0 0]]

 [[0 0 0 1 0]
  [0 0 0 1 0]]]


In [47]:
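# Using tf.concat
# Signature in this TensorFlow version: tf.concat(concat_dim, values, name='concat')
# concat_dim is the dimension (axis) to concatenate along, values is the list of tensors to join, name is the op name.
# For two 2-D tensors: concat_dim=0 stacks them along rows (the result has more rows), concat_dim=1 joins them along columns (more columns).
# For two 3-D tensors: concat_dim=0 joins along the outermost (depth) axis, 1 along rows, 2 along columns.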

import tensorflow as tf
import numpy as np

# Two 2x3 matrices to concatenate.
t1 = [[1, 2, 3], [4, 5, 6]]
t2 = [[7, 8, 9], [10, 11, 12]]

# Context-managed session so it is closed even if a run() call raises.
# No variable initialisation is needed: this cell only uses constants.
with tf.Session() as sess:
    # concat_dim=0: stack along rows -> 4x3 result.
    print(sess.run(tf.concat(0, [t1, t2])))
    # concat_dim=1: join along columns -> 2x6 result.
    print(sess.run(tf.concat(1, [t1, t2])))

[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]
[[ 1  2  3  7  8  9]
 [ 4  5  6 10 11 12]]


In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.models.rnn.ptb import reader

# ---- Hyperparameters ----

# Model architecture
VOCAB_SIZE = 10000     # vocabulary size
HIDDEN_SIZE = 200      # hidden layer size (also the word-embedding width)
NUM_LAYERS = 2         # number of stacked LSTM layers in the deep RNN

# Training
NUM_EPOCH = 2          # number of passes over the training data
LEARNING_RATE = 1.0    # learning rate
TRAIN_BATCH_SIZE = 20  # batch size for training data
TRAIN_NUM_STEP = 35    # truncation (unroll) length for training data
KEEP_PROB = 0.5        # probability that a unit is kept (not dropped) by dropout
MAX_GRAD_NORM = 5      # global-norm threshold used to clip gradients

# Evaluation: no truncation is needed at test time, so the evaluation data is
# treated as one very long sequence fed one token at a time.
EVAL_BATCH_SIZE = 1    # batch size for evaluation data
EVAL_NUM_STEP = 1      # truncation length for evaluation data


class PTBModel(object):
    """Multi-layer LSTM language model graph for the PTB data.

    Building an instance adds the whole model to the current TensorFlow graph
    and exposes these attributes:

      batch_size, num_steps: the values passed to the constructor.
      input_data, targets:   int32 placeholders of shape [batch_size, num_steps].
      initial_state:         zero state of the stacked LSTM cell.
      final_state:           LSTM state after the last unrolled time step.
      cost:                  summed sequence loss divided by batch_size.
      train_op:              clipped-gradient SGD update; only defined when
                             is_training is true.
    """
    def __init__(self, is_training, batch_size, num_steps):
        """Build the model graph.

        Args:
            is_training: when true, dropout is applied and train_op is created;
                when false, construction stops after cost/final_state are set.
            batch_size: number of sequences per batch.
            num_steps: number of time steps the RNN is unrolled for
                (the truncation length).
        """
        # Remember the batch size and truncation length this graph was built for.
        self.batch_size = batch_size
        self.num_steps = num_steps
        
        # Input layer: word ids.
        self.input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
        # Expected outputs: the target word ids.
        self.targets = tf.placeholder(tf.int32, [batch_size, num_steps])
        
        # Deep RNN whose recurrent unit is an LSTM cell, wrapped with output
        # dropout while training.
        # NOTE(review): the list below repeats the *same* cell object NUM_LAYERS
        # times; later TensorFlow releases are reported to require a distinct
        # cell instance per layer -- confirm against the installed version.
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE)
        if is_training:
            lstm_cell = tf.nn.rnn_cell.DropoutWrapper(lstm_cell, output_keep_prob=KEEP_PROB)
        cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * NUM_LAYERS)
        
        # Initial state: an all-zero state for the stacked cell.
        self.initial_state = cell.zero_state(batch_size, tf.float32)
        
        ####################################################################################
        # Convert word ids into word vectors. There are VOCAB_SIZE words and each
        # word vector has HIDDEN_SIZE dimensions, so the embedding parameter has
        # shape [VOCAB_SIZE, HIDDEN_SIZE].
        embedding = tf.get_variable("embedding", [VOCAB_SIZE, HIDDEN_SIZE])
        # Turn the batch_size * num_steps word ids into word vectors; the input
        # layer then has shape [batch_size, num_steps, HIDDEN_SIZE].
        inputs = tf.nn.embedding_lookup(embedding, self.input_data)
        # Apply dropout to the inputs only while training.
        if is_training:
            inputs = tf.nn.dropout(inputs, KEEP_PROB)
            
        ####################################################################################
        # Output list: collect the LSTM output of every time step first, then
        # feed them all through one fully connected layer to get the final output.
        outputs = []
        # `state` carries the LSTM state from one time step to the next; it
        # starts from the initial (zero) state.
        state = self.initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                # After the first step, reuse the cell's variables instead of
                # creating new ones, so all time steps share the same weights.
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                # Feed the input slice of the current time step into the LSTM stack.
                cell_output, state = cell(inputs[:, time_step, :], state)
                # Append the current output to the output list.
                outputs.append(cell_output)
        # Concatenate the per-step outputs along dimension 1 into
        # [batch, hidden * num_steps], then reshape to [batch * num_steps, hidden_size].
        output = tf.reshape(tf.concat(1, outputs), [-1, HIDDEN_SIZE])
        
        # Pass the LSTM outputs through a fully connected layer to get the
        # prediction: for every position a vector of length VOCAB_SIZE which,
        # after softmax, gives the probability of each word being the next word.
        weight = tf.get_variable("weight", [HIDDEN_SIZE, VOCAB_SIZE])
        bias = tf.get_variable("bias", [VOCAB_SIZE])
        logits = tf.matmul(output, weight) + bias
        
        ####################################################################################
        # Cross-entropy loss.
        # sequence_loss_by_example takes lists of logits, targets and weights;
        # a single flattened entry is passed for each of them here.
        loss = tf.nn.seq2seq.sequence_loss_by_example(
            [logits],    # predicted logits
            [tf.reshape(self.targets, [-1])], # expected answers: the [batch_size, num_steps] ids flattened to 1-D
            # Loss weights: all ones here, so every batch entry and every time step counts equally.
            [tf.ones([batch_size * num_steps], dtype=tf.float32)]  
        )
        # Total loss divided by batch_size, i.e. the average loss per sequence in the batch.
        self.cost = tf.reduce_sum(loss) / batch_size
        self.final_state = state
        
        # Back-propagation ops are only defined for the training model; an
        # evaluation model therefore has no train_op attribute.
        if not is_training:
            return
        ####################################################################################
        trainable_variables = tf.trainable_variables()
        
        # Gradient clipping guards against exploding gradients by keeping the
        # update within a bounded range (it does not address vanishing gradients).
        # clip_by_global_norm rescales the whole gradient list:
        # t_list[i] * clip_norm / max(global_norm, clip_norm)
        # global_norm = sqrt(sum([l2norm(t)**2 for t in t_list]))
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, trainable_variables), MAX_GRAD_NORM)
        # Optimizer: plain gradient descent.
        optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE)
        # Training step: apply the clipped gradients to the trainable variables.
        self.train_op = optimizer.apply_gradients(zip(grads, trainable_variables))

def run_epoch(sess, model, data, train_op, output_log):
    """Run `model` once over `data` and return the perplexity on that data.

    Args:
        sess: session used to evaluate the graph.
        model: object exposing batch_size, num_steps, input_data, targets,
            initial_state, final_state and cost (e.g. a PTBModel).
        data: word-id sequence handed to reader.ptb_iterator.
        train_op: op run together with cost/final_state on every batch; pass
            the model's train_op to train, or a no-op to only evaluate.
        output_log: when true, print the running perplexity every 100 steps.

    Returns:
        exp(total cost / total number of time steps processed).
    """
    # Accumulators for the perplexity computation.
    total_costs = 0.0
    iters = 0
    state = sess.run(model.initial_state)

    # The same tensors are fetched for every batch, so build the list once.
    fetches = [model.cost, model.final_state, train_op]

    for step, (x, y) in enumerate(reader.ptb_iterator(data, model.batch_size, model.num_steps)):
        # Run train_op on the current batch and read back the loss. The final
        # LSTM state of this batch is fed in as the initial state of the next.
        feed_dict = {
            model.input_data: x,
            model.targets: y,
            model.initial_state: state,
        }
        cost, state, _ = sess.run(fetches, feed_dict)

        # Summing the loss over all batches and time steps and exponentiating
        # the per-step average gives the perplexity.
        total_costs += cost
        iters += model.num_steps

        # Progress log (callers enable this only while training). The
        # parenthesised single-argument print works on Python 2 and Python 3.
        if output_log and step % 100 == 0:
            print("After %d steps, perplexity is %.3f" % (step, np.exp(total_costs / iters)))

    # Perplexity of the model over the whole of `data`.
    return np.exp(total_costs / iters)
        

def main(_):
    """Train the PTB language model, then report validation and test perplexity.

    Args:
        _: unused positional argument supplied by the caller (tf.app.run).

    The data directory defaults to the original hard-coded location and can be
    overridden with the PTB_DATA_PATH environment variable.
    """
    import os  # local import: only needed for the optional data-path override

    # Load the raw data. The absolute path is machine-specific, so allow it to
    # be overridden without editing the code.
    data_path = os.environ.get(
        "PTB_DATA_PATH",
        "/Users/xxx/work5/tensorflow/data/ptb_dataset/simple-examples/data")
    train_data, valid_data, test_data, _ = reader.ptb_raw_data(data_path)
    print(len(train_data))

    # Initializer shared by all variables created inside the scopes below.
    initializer = tf.random_uniform_initializer(-0.05, 0.05)
    # Model used for training.
    with tf.variable_scope("language_model", reuse=None, initializer=initializer):
        train_model = PTBModel(True, TRAIN_BATCH_SIZE, TRAIN_NUM_STEP)

    # Model used for evaluation; reuse=True makes it share the training
    # model's variables.
    with tf.variable_scope("language_model", reuse=True, initializer=initializer):
        eval_model = PTBModel(False, EVAL_BATCH_SIZE, EVAL_NUM_STEP)

    # Evaluation runs no update. Build the no-op once instead of on every
    # run_epoch call, which would add a new node to the graph each time.
    eval_op = tf.no_op()

    with tf.Session() as sess:
        tf.initialize_all_variables().run()

        # Train on the training data.
        for i in range(NUM_EPOCH):
            print("In iteration: %d" % (i + 1))
            # One training pass over the training data.
            run_epoch(sess, train_model, train_data, train_model.train_op, True)

            # Measure the model on the validation data after each epoch.
            valid_perplexity = run_epoch(sess, eval_model, valid_data, eval_op, False)
            print("Epoch: %d validation perplexity: %.3f" % (i + 1, valid_perplexity))

        # Finally measure the model on the test data.
        test_perplexity = run_epoch(sess, eval_model, test_data, eval_op, False)

        print("Test perplexity: %.3f" % test_perplexity)
    

# Script entry point: hand control to TensorFlow's app runner, which is
# expected to invoke main().
# NOTE(review): the recorded output of this cell ends with `SystemExit`,
# presumably because tf.app.run() exits the interpreter once main() returns --
# confirm. Calling main(None) directly would avoid that inside a notebook.
if __name__ == '__main__':
    tf.app.run()



929589




In iteration: 1
After 0 steps, perplexity is 10016.474
After 100 steps, perplexity is 1480.141
After 200 steps, perplexity is 1095.041
After 300 steps, perplexity is 914.899
After 400 steps, perplexity is 800.299
After 500 steps, perplexity is 722.194
After 600 steps, perplexity is 663.447
After 700 steps, perplexity is 611.851
After 800 steps, perplexity is 565.763
After 900 steps, perplexity is 529.579
After 1000 steps, perplexity is 501.825
After 1100 steps, perplexity is 475.410
After 1200 steps, perplexity is 453.699
After 1300 steps, perplexity is 434.124
Epoch: 1 validation perplexity: 243.041
In iteration: 2
After 0 steps, perplexity is 349.487
After 100 steps, perplexity is 247.141
After 200 steps, perplexity is 251.573
After 300 steps, perplexity is 252.555
After 400 steps, perplexity is 249.283
After 500 steps, perplexity is 246.609
After 600 steps, perplexity is 245.816
After 700 steps, perplexity is 243.148
After 800 steps, perplexity is 238.345
After 900 steps, perplexity

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


[array([[2, 1]], dtype=int32)]
