## RNN 语言模型入门探索

先试着用简单的例子来学习上手.

In [1]:
%load_ext watermark
%watermark -p tensorflow,numpy -v -m

CPython 3.4.3
IPython 5.3.0

tensorflow 1.0.1
numpy 1.12.0

compiler   : GCC 4.8.4
system     : Linux
release    : 4.9.8-moby
machine    : x86_64
processor  : x86_64
CPU cores  : 4
interpreter: 64bit


In [2]:
import tensorflow as tf
import numpy as np

### 1. RNN 定长输入的简单例子

In [3]:
vocab_size = 20
word_embedding_dim = 15
num_units = 10
sentence_len = 5
batch_size = 2

tf.reset_default_graph()
cell = tf.contrib.rnn.BasicRNNCell(num_units)  # bacis rnn cell

#### 简单制造一点语料数据

In [4]:
# 人为制造两个等长句子数据, shape = [batch_size, sentence_len]
data = np.asarray(
    [[2, 3, 5, 7, 1], 
     [8, 4, 2, 6, 1]])

inputs_train = data[:, :-1]
labels_train = data[:, 1:]
print(inputs_train, '\n', labels_train)

[[2 3 5 7]
 [8 4 2 6]] 
 [[3 5 7 1]
 [4 2 6 1]]


#### 构建 RNN 神经网络

word_ids -> word embedding -> rnn outputs -> sigmoid

In [5]:
word_embedding = tf.Variable(tf.random_uniform([vocab_size, word_embedding_dim]))

inputs = tf.placeholder(tf.int32, shape=[batch_size, sentence_len - 1], name='inputs')
labels = tf.placeholder(tf.int32, shape=[batch_size, sentence_len - 1], name='labels')
labels_flat = tf.reshape(labels, (-1,))
labels_flat

<tf.Tensor 'Reshape:0' shape=(8,) dtype=int32>

In [6]:
input_embeds = tf.nn.embedding_lookup(word_embedding, inputs)
input_embeds

<tf.Tensor 'embedding_lookup:0' shape=(2, 4, 15) dtype=float32>

In [7]:
output, states = tf.nn.dynamic_rnn(cell, input_embeds, dtype=tf.float32)  # tf.nn.dynamic_rnn 可实现多步计算
output_flat = tf.reshape(output, (-1, num_units))
output, output_flat, states

(<tf.Tensor 'rnn/transpose:0' shape=(2, 4, 10) dtype=float32>,
 <tf.Tensor 'Reshape_1:0' shape=(8, 10) dtype=float32>,
 <tf.Tensor 'rnn/while/Exit_2:0' shape=(?, 10) dtype=float32>)

In [8]:
softmax_w = tf.Variable(tf.random_uniform([num_units, vocab_size]))
softmax_b = tf.Variable(tf.random_uniform([vocab_size]))

logits_flat = tf.matmul(output_flat, softmax_w) + softmax_b
probs_flat = tf.sigmoid(logits_flat)
probs = tf.reshape(probs_flat, (batch_size, sentence_len - 1, -1))
preds = tf.argmax(probs, axis=2)

losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits_flat, labels=labels_flat)
loss = tf.reduce_mean(losses)

#### 训练

In [9]:
learning_rate = 0.3
epochs = 20

train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
batch_feed = {inputs: inputs_train, labels: labels_train}
loss_history1 = []

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(epochs):
        sess.run(train_step, feed_dict=batch_feed)
        loss_val = sess.run(loss, feed_dict=batch_feed)
        loss_history1.append(loss_val)
        print('epoch {}: cost = {}'.format(i, loss_val))
    
    preds_val = sess.run(preds, feed_dict={inputs: inputs_train})

epoch 0: cost = 2.8206260204315186
epoch 1: cost = 2.611342430114746
epoch 2: cost = 2.444389820098877
epoch 3: cost = 2.311234474182129
epoch 4: cost = 2.1996729373931885
epoch 5: cost = 2.099348306655884
epoch 6: cost = 2.004868268966675
epoch 7: cost = 1.9141072034835815
epoch 8: cost = 1.826167345046997
epoch 9: cost = 1.7404463291168213
epoch 10: cost = 1.6564154624938965
epoch 11: cost = 1.5736474990844727
epoch 12: cost = 1.4918718338012695
epoch 13: cost = 1.4110069274902344
epoch 14: cost = 1.3311593532562256
epoch 15: cost = 1.2525956630706787
epoch 16: cost = 1.1756908893585205
epoch 17: cost = 1.100874423980713
epoch 18: cost = 1.0285818576812744
epoch 19: cost = 0.9592243432998657


#### 检查模型准确性

In [10]:
preds_val == labels_train

array([[ True,  True,  True,  True],
       [ True,  True,  True,  True]], dtype=bool)

### 2. RNN 变长输入的简单例子

In [11]:
vocab_size = 20
word_embedding_dim = 15
num_units = 10
sentence_len = 8
batch_size = 2

tf.reset_default_graph()
cell = tf.contrib.rnn.BasicRNNCell(num_units)  # bacis rnn cell

#### "制造"语料数据

In [12]:
# 人为制造两个句子数据, 句子长度分别为 8, 5. 第二个句子补了3个padding
data = np.asarray(
    [[1, 3, 5, 7, 2, 4, 6, 1], 
     [2, 4, 2, 6, 1, 0, 0, 0]])

inputs_train = data[:, :-1]
labels_train = data[:, 1:]
print(inputs_train, '\n', labels_train)

[[1 3 5 7 2 4 6]
 [2 4 2 6 1 0 0]] 
 [[3 5 7 2 4 6 1]
 [4 2 6 1 0 0 0]]


#### 搭建 RNN

In [13]:
word_embedding = tf.Variable(tf.random_uniform([vocab_size, word_embedding_dim]))

inputs = tf.placeholder(tf.int32, shape=[batch_size, sentence_len - 1], name='inputs')
labels = tf.placeholder(tf.int32, shape=[batch_size, sentence_len - 1], name='labels')
labels_flat = tf.reshape(labels, (-1,))

input_embeds = tf.nn.embedding_lookup(word_embedding, inputs)

output, states = tf.nn.dynamic_rnn(cell, input_embeds, dtype=tf.float32, sequence_length=[8, 5])
output_flat = tf.reshape(output, (-1, num_units))

softmax_w = tf.Variable(tf.random_uniform([num_units, vocab_size]))
softmax_b = tf.Variable(tf.random_uniform([vocab_size]))

logits_flat = tf.matmul(output_flat, softmax_w) + softmax_b
probs_flat = tf.sigmoid(logits_flat)
probs = tf.reshape(probs_flat, (batch_size, sentence_len - 1, -1))
preds = tf.argmax(probs, axis=2)

losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits_flat, labels=labels_flat)

#### 用 mask 来修正 loss 函数

In [14]:
mask = tf.cast(tf.sign(labels_flat), tf.float32)
with tf.Session() as sess:
    mask_val = sess.run(mask, feed_dict={labels: labels_train})
mask_val

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.], dtype=float32)

In [15]:
loss2 = tf.reduce_sum(losses * mask)

#### 训练

In [16]:
learning_rate = 0.1
epochs = 200

train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss2)
batch_feed = {inputs: inputs_train, labels: labels_train}
loss_history2 = []

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(epochs):
        sess.run(train_step, feed_dict=batch_feed)
        loss_val = sess.run(loss2, feed_dict=batch_feed)
        loss_history2.append(loss_val)
        print('epoch {}: cost = {}'.format(i, loss_val))
    
    preds_val = sess.run(preds, feed_dict={inputs: inputs_train})

epoch 0: cost = 27.346179962158203
epoch 1: cost = 23.194448471069336
epoch 2: cost = 20.332326889038086
epoch 3: cost = 17.682546615600586
epoch 4: cost = 15.473355293273926
epoch 5: cost = 13.398550987243652
epoch 6: cost = 11.77105712890625
epoch 7: cost = 9.906399726867676
epoch 8: cost = 8.594326972961426
epoch 9: cost = 7.438560485839844
epoch 10: cost = 6.5897016525268555
epoch 11: cost = 5.872864246368408
epoch 12: cost = 5.252179145812988
epoch 13: cost = 4.671937942504883
epoch 14: cost = 4.158555507659912
epoch 15: cost = 3.7934341430664062
epoch 16: cost = 3.847944974899292
epoch 17: cost = 4.744208812713623
epoch 18: cost = 3.473471164703369
epoch 19: cost = 2.7029590606689453
epoch 20: cost = 2.2751803398132324
epoch 21: cost = 2.0305259227752686
epoch 22: cost = 1.8455851078033447
epoch 23: cost = 1.6901209354400635
epoch 24: cost = 1.5549675226211548
epoch 25: cost = 1.4360249042510986
epoch 26: cost = 1.3308978080749512
epoch 27: cost = 1.2376933097839355
epoch 28: cos

#### 检查模型准确性

In [17]:
preds_val

array([[3, 5, 7, 2, 4, 6, 1],
       [4, 2, 6, 1, 3, 6, 6]])

In [18]:
preds_val == labels_train

array([[ True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True, False, False, False]], dtype=bool)