In [1]:
import tensorflow as tf

In [2]:
import numpy as np

### 0. 定义参数

In [32]:
vocab_size = 100
hidden_size = 128  # aslo embedding dim
batch_size = 3
number_steps = 4
max_words = 5
epochs = 11
print_loss_every = 2
lr = 0.01

### 1. 定义测试数据

In [4]:
data = np.asarray([[1, 2], [1, 2, 3], [1, 2, 3, 4, 5]])

In [5]:
data_padding = np.asarray([x[:max_words] for x in [y + [0] * max_words 
                                         for y in data]])

In [6]:
data_padding

array([[1, 2, 0, 0, 0],
       [1, 2, 3, 0, 0],
       [1, 2, 3, 4, 5]])

In [7]:
x = data_padding[:, :number_steps]
y_ = data_padding[:, 1:]

In [8]:
x

array([[1, 2, 0, 0],
       [1, 2, 3, 0],
       [1, 2, 3, 4]])

In [9]:
y_

array([[2, 0, 0, 0],
       [2, 3, 0, 0],
       [2, 3, 4, 5]])

### 2. 定义神经网络结构以及前向传播过程

In [11]:
# Placeholder for input, output
# input_x 不取最后一词，input_y 不取第一词，所以两者的维度是一样的
input_x = tf.placeholder(tf.int32, shape=[None, number_steps], name='input_x')
input_y = tf.placeholder(tf.int32, shape=[None, number_steps], name='input_y')

In [12]:
mask = tf.sign(tf.reshape(input_y, [-1]))

In [13]:
# 先把非零元素转为 1，然后求和，即句子长度，当然原先句子中的一些 unk 也被忽略了
sequence_length = tf.reduce_sum(tf.sign(input_x), 1)  

In [14]:
# Embedding layer
with tf.name_scope('embedding'):
    word_embedding = tf.Variable(tf.random_uniform([vocab_size, hidden_size]))
    embeds = tf.nn.embedding_lookup(word_embedding, input_x)

In [15]:
# RNN layer
with tf.name_scope('rnn'):
    cell = tf.contrib.rnn.BasicRNNCell(128)
    outputs, states = tf.nn.dynamic_rnn(
        cell, embeds, dtype=tf.float32, sequence_length=sequence_length)
    # Flat outputs
    output_flat = tf.reshape(outputs, [-1, hidden_size])

In [16]:
logits = tf.layers.dense(output_flat, vocab_size)

In [17]:
y = tf.nn.softmax(logits)

In [18]:
labels_flat = tf.reshape(input_y, [-1])
mask = tf.sign(labels_flat)
mask = tf.cast(mask, dtype=tf.float32)

In [19]:
cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels_flat)

In [20]:
loss = tf.reduce_sum(cost * mask) / batch_size  # 每个句子的平均损失

In [21]:
test_feed = {input_x: np.array([[2, 2, 0, 0], [1, 2, 3, 0]]).reshape(2, 4)}

### 3. 训练模型

In [41]:
train_step = tf.train.GradientDescentOptimizer(lr).minimize(loss)
feed_dict = {input_x: x, input_y: y_}
with tf.Session() as sess:
    costs = 0.0
    iters = 0
    sess.run(tf.global_variables_initializer())
    for i in range(epochs):        
        sess.run(train_step, feed_dict=feed_dict)
        cross_entropy = sess.run(loss, feed_dict=feed_dict)
        costs += cross_entropy
        iters += number_steps
        perplexity = np.exp(costs / iters)
        if i % print_loss_every == 0:
            print('After {} steps, perplexity is {:.2f}'.format(i, perplexity))

After 0 steps, perplexity is 15.48
After 2 steps, perplexity is 11.72
After 4 steps, perplexity is 9.05
After 6 steps, perplexity is 7.15
After 8 steps, perplexity is 5.79
After 10 steps, perplexity is 4.83


## Refrences:

* [assignment2.pdf](http://web.stanford.edu/class/cs224n/assignment2/assignment2.pdf)