### Step 1: 读取数据

In [1]:
import time
import collections
import os

import numpy as np
import tensorflow as tf

def read_words(filename):
    """Read a text file and return its whitespace-separated tokens.

    Every newline character is first replaced by the literal text ``<eos>``
    and the result is then split on whitespace. The marker is therefore a
    token of its own only where the newline had whitespace on both sides;
    otherwise it stays glued to the neighbouring word.

    Args:
        filename: Path handed to ``tf.gfile.GFile`` and opened in ``"r"`` mode.

    Returns:
        A list of token strings in file order.
    """
    with tf.gfile.GFile(filename, "r") as handle:
        text = handle.read()
    tokens = text.replace("\n", "<eos>").split()
    return tokens

def build_vocab(filename):
    """Build a word -> integer id mapping from the tokens of a file.

    Ids are assigned by descending frequency; ties are broken by the word
    itself in ascending order, so the most frequent word receives id 0.

    Args:
        filename: Path of the text file, read through ``read_words``.

    Returns:
        A dict mapping each distinct token to its id. A file without any
        token yields an empty dict.
    """
    counter = collections.Counter(read_words(filename))
    # Sort key: frequency descending first, then the word ascending, so the
    # id assignment is deterministic.
    count_pairs = sorted(counter.items(), key=lambda pair: (-pair[1], pair[0]))
    # Enumerate the sorted pairs directly instead of unzipping them: the
    # unzip-and-unpack form raises ValueError when there are no tokens.
    return {word: index for index, (word, _) in enumerate(count_pairs)}

def file_to_word_ids(filename, word_to_id):
    """Convert the tokens of a file to their integer ids.

    Args:
        filename: Path of the text file, read through ``read_words``.
        word_to_id: Mapping from token to id; it is indexed with every token.

    Returns:
        A list of ids, one per token, in file order.

    Raises:
        KeyError: If ``word_to_id`` is a dict and a token is not one of its
            keys.
    """
    ids = []
    for token in read_words(filename):
        ids.append(word_to_id[token])
    return ids

def ptb_raw_data(data_path=None):
    """Load the train / validation / test files as lists of word ids.

    The vocabulary is built from the training file only and then applied to
    all three files.

    Args:
        data_path: Optional directory containing ``ptb.train.txt``,
            ``ptb.valid.txt`` and ``ptb.test.txt``. When omitted (or empty)
            the bare file names are used, i.e. they are resolved against the
            current working directory.

    Returns:
        A tuple ``(train_data, valid_data, test_data)`` of id lists.
    """
    data_dir = data_path or ""
    # os.path.join("", name) returns name unchanged, so the default keeps the
    # bare file names.
    train_path = os.path.join(data_dir, "ptb.train.txt")
    valid_path = os.path.join(data_dir, "ptb.valid.txt")
    test_path = os.path.join(data_dir, "ptb.test.txt")

    word_to_id = build_vocab(train_path)
    train_data = file_to_word_ids(train_path, word_to_id)
    valid_data = file_to_word_ids(valid_path, word_to_id)
    test_data = file_to_word_ids(test_path, word_to_id)
    return train_data, valid_data, test_data

# Load the three splits once; later cells read train_data, valid_data and
# test_data from the notebook namespace.
train_data, valid_data, test_data = ptb_raw_data()

# Report the size and the first ten ids of every split. Single-argument
# print(...) with %-formatting gives the same text on Python 2 and Python 3.
print("Words in training data: %d" % len(train_data))
print("Words in validating data: %d" % len(valid_data))
print("Words in testing data: %d" % len(test_data))
print("Example training data: %s" % (train_data[:10],))
print("Example validating data: %s" % (valid_data[:10],))
print("Example testing data: %s" % (test_data[:10],))

Words in training data: 929589
Words in validating data: 73760
Words in testing data: 82430
Example training data: [9970, 9971, 9972, 9974, 9975, 9976, 9980, 9981, 9982, 9983]
Example validating data: [1132, 93, 358, 5, 329, 51, 9836, 6, 326, 2476]
Example testing data: [102, 14, 24, 32, 752, 381, 2, 29, 120, 0]


### Step 2: 整理RNN数据格式

In [2]:
def ptb_iterator(raw_data, batch_size, num_steps):
    """Yield ``(x, y)`` batches cut from a flat sequence of ids.

    The sequence is split into ``batch_size`` contiguous rows of equal length
    (a tail that does not fill a row is dropped). Each yielded ``x`` is a
    ``[batch_size, num_steps]`` window of those rows and ``y`` is the same
    window shifted one position to the right.

    Args:
        raw_data: Flat sequence of integer ids; converted to an int32 array.
        batch_size: Number of rows per batch.
        num_steps: Number of columns (consecutive ids) per batch.

    Yields:
        Tuples ``(x, y)`` of int32 arrays, both of shape
        ``[batch_size, num_steps]``.

    Raises:
        ValueError: On first iteration, if the data is too short to produce
            even one batch for the given ``batch_size`` and ``num_steps``.
    """
    raw_data = np.array(raw_data, dtype=np.int32)
    batch_len = len(raw_data) // batch_size
    # Row i must hold raw_data[batch_len * i : batch_len * (i + 1)]; reshaping
    # the truncated sequence produces exactly that layout.
    data = raw_data[:batch_size * batch_len].reshape(batch_size, batch_len)

    # One id per row is held back so the last target column has a successor.
    epoch_size = (batch_len - 1) // num_steps
    # "<= 0" rather than "== 0": with batch_len == 0 the floor division is
    # negative, which would otherwise yield nothing without any error.
    if epoch_size <= 0:
        raise ValueError("epoch_size == 0, decrease batch_size or num_steps")

    for i in range(epoch_size):
        start = i * num_steps
        stop = start + num_steps
        yield (data[:, start:stop], data[:, start + 1:stop + 1])

# Demo on 25 consecutive ids with 3 rows per batch and 3 steps per batch.
result = ptb_iterator(range(25), 3, 3)
for x, y in result:
    # Single-argument print(...) with %-formatting produces the same text on
    # Python 2 and Python 3; the 1-tuple keeps the array as one %s argument.
    print("X: %s" % (x,))
    print("Y: %s" % (y,))
    print("-------------------")
    

X: [[ 0  1  2]
 [ 8  9 10]
 [16 17 18]]
Y: [[ 1  2  3]
 [ 9 10 11]
 [17 18 19]]
-------------------
X: [[ 3  4  5]
 [11 12 13]
 [19 20 21]]
Y: [[ 4  5  6]
 [12 13 14]
 [20 21 22]]
-------------------


### Step 3: 建立RNN网络

In [None]:
# Model-wide settings read by PTBModel below.
vocab_size = 10000   # rows of the embedding matrix and width of the softmax output
hidden_size = 650    # units of each LSTM cell; also the embedding width
num_layer = 2        # how many times the LSTM cell is stacked in MultiRNNCell

class PTBModel(object):
    """Graph of a stacked-LSTM language model over fixed-shape id batches.

    Sizes come from the module-level ``hidden_size``, ``num_layer`` and
    ``vocab_size``.

    Attributes set for every instance:
        batch_size, num_steps: The values passed to the constructor.
        input_data, targets: int32 placeholders of shape
            ``[batch_size, num_steps]``.
        initial_state: Zero state of the stacked cell.
        final_state: Cell state after the last time step.
        cost: Sum of the per-position losses divided by ``batch_size``.

    Attributes set only when ``is_training`` is true:
        lr: Non-trainable learning-rate variable (starts at 0.0).
        train_op: Gradient-descent step with globally clipped gradients.
    """

    def __init__(self, is_training, batch_size, num_steps):
        """Build the graph.

        Args:
            is_training: When true, dropout is applied and the training ops
                (``lr``, ``train_op``) are created.
            batch_size: Number of sequences per batch.
            num_steps: Number of unrolled time steps.
        """
        self.batch_size = batch_size
        self.num_steps = num_steps

        # Define Input & Output
        self.input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
        self.targets = tf.placeholder(tf.int32, [batch_size, num_steps])

        # Define RNN network
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size, forget_bias=0.0)
        if is_training:
            lstm_cell = tf.nn.rnn_cell.DropoutWrapper(lstm_cell, output_keep_prob=0.5)
        cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * num_layer)
        self.initial_state = cell.zero_state(batch_size, tf.float32)

        # Embedding
        embedding = tf.get_variable("embedding", [vocab_size, hidden_size])
        inputs = tf.nn.embedding_lookup(embedding, self.input_data)
        if is_training:
            inputs = tf.nn.dropout(inputs, 0.5)

        # Forward propagate: unroll the cell by hand, one call per time step.
        outputs = []
        state = self.initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                # The cell variables are created on step 0 and shared afterwards.
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                (cell_output, state) = cell(inputs[:, time_step, :], state)
                outputs.append(cell_output)

        # Flatten the per-step outputs into rows of hidden_size for the softmax.
        output = tf.reshape(tf.concat(1, outputs), [-1, hidden_size])
        softmax_w = tf.get_variable("softmax_w", [hidden_size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])
        logits = tf.matmul(output, softmax_w) + softmax_b
        # Every position is weighted 1.
        loss = tf.nn.seq2seq.sequence_loss_by_example(
            [logits], [tf.reshape(self.targets, [-1])], [tf.ones([batch_size * num_steps])])
        self.cost = cost = tf.reduce_sum(loss) / batch_size
        self.final_state = state

        # Evaluation-only models stop here: no learning rate, no train op.
        if not is_training:
            return

        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 5)
        optimizer = tf.train.GradientDescentOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        # Build the learning-rate update once. assign_lr only feeds a value,
        # so calling it every epoch does not add new ops to the graph.
        self._new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
        self._lr_update = tf.assign(self.lr, self._new_lr)

    def assign_lr(self, session, lr_value):
        """Set the learning-rate variable to ``lr_value`` in ``session``.

        Only available on models built with ``is_training`` true.
        """
        session.run(self._lr_update, feed_dict={self._new_lr: lr_value})
        
# Signals that this cell ran to completion. The cell only defines the
# constants and the PTBModel class; no PTBModel instance is constructed here.
print("Model generated!")

Model generated!


### Step 4: 训练模型

In [None]:
def run_epoch(session, m, data, eval_op, verbose=False):
    """Run the model once over ``data`` and return the perplexity.

    Args:
        session: Session used for every graph evaluation in this function.
        m: Model object providing ``batch_size``, ``num_steps``,
            ``input_data``, ``targets``, ``initial_state``, ``final_state``
            and ``cost``.
        data: Flat sequence of word ids, batched through ``ptb_iterator``.
        eval_op: Extra op run with every batch (the train op when training,
            a no-op when only evaluating).
        verbose: When true, print progress, running perplexity and speed
            about ten times per epoch.

    Returns:
        ``exp(total cost / total number of time steps)``.
    """
    epoch_size = ((len(data) // m.batch_size) - 1) // m.num_steps
    # Log about ten times per epoch. The interval is clamped to 1 so that
    # epochs shorter than 10 steps do not cause a modulo by zero, and the
    # offset is reduced modulo the interval so that short intervals still log.
    # For intervals above 10 this is the same as "step % interval == 10".
    log_interval = max(epoch_size // 10, 1)
    log_offset = 10 % log_interval

    start_time = time.time()
    costs = 0.0
    iters = 0
    # Evaluate through the session that was passed in rather than relying on
    # an implicit default session.
    state = session.run(m.initial_state)
    for step, (x, y) in enumerate(ptb_iterator(data, m.batch_size, m.num_steps)):
        # The final state of this batch is fed in as the initial state of the
        # next one.
        cost, state, _ = session.run([m.cost, m.final_state, eval_op],
                                     {m.input_data: x, m.targets: y, m.initial_state: state})
        costs += cost
        iters += m.num_steps

        if verbose and step % log_interval == log_offset:
            print("%.3f perplexity: %.3f speed: %.0f wps" %
                  (step * 1.0 / epoch_size, np.exp(costs / iters),
                   iters * m.batch_size / (time.time() - start_time)))
    return np.exp(costs / iters)

with tf.Session() as session:
    initializer = tf.random_uniform_initializer(-0.05, 0.05)
    # Training model: batches of 20 sequences, 35 time steps each.
    with tf.variable_scope("model", reuse=None, initializer=initializer):
        m = PTBModel(True, 20, 35)
    # Evaluation model: same "model" scope with reuse=True, so it reads the
    # variables created above; it processes one id per step.
    with tf.variable_scope("model", reuse=True, initializer=initializer):
        mtest = PTBModel(False, 1, 1)

    # Created once and shared by every evaluation run, instead of adding a new
    # no-op node to the graph for each call.
    eval_op = tf.no_op()

    tf.initialize_all_variables().run()

    base_lr = 1.0
    for i in range(39):
        # The rate stays at base_lr while i <= 6, then shrinks by a factor of
        # 0.8 per epoch.
        lr_decay = 0.8 ** max(i - 6, 0.0)
        m.assign_lr(session, base_lr * lr_decay)

        print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
        train_perplexity = run_epoch(session, m, train_data, m.train_op, verbose=True)
        print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
        valid_perplexity = run_epoch(session, mtest, valid_data, eval_op)
        print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))

    test_perplexity = run_epoch(session, mtest, test_data, eval_op)
    print("Test Perplexity: %.3f" % test_perplexity)

Epoch: 1 Learning rate: 1.000
0.008 perplexity: 5743.727 speed: 197 wps
0.107 perplexity: 1201.711 speed: 233 wps
0.206 perplexity: 863.146 speed: 235 wps
0.306 perplexity: 692.729 speed: 237 wps
0.405 perplexity: 595.858 speed: 239 wps
0.505 perplexity: 529.792 speed: 240 wps
0.604 perplexity: 475.562 speed: 241 wps
0.704 perplexity: 437.009 speed: 241 wps
0.803 perplexity: 407.033 speed: 242 wps
0.903 perplexity: 380.664 speed: 242 wps
Epoch: 1 Train Perplexity: 360.271
Epoch: 1 Valid Perplexity: 213.824
Epoch: 2 Learning rate: 1.000
0.008 perplexity: 257.275 speed: 252 wps
0.107 perplexity: 199.571 speed: 231 wps
0.206 perplexity: 207.185 speed: 231 wps
0.306 perplexity: 201.614 speed: 234 wps
0.405 perplexity: 199.146 speed: 236 wps
0.505 perplexity: 196.261 speed: 240 wps
0.604 perplexity: 190.792 speed: 241 wps
0.704 perplexity: 187.582 speed: 242 wps
0.803 perplexity: 184.645 speed: 243 wps
0.903 perplexity: 180.317 speed: 244 wps
Epoch: 2 Train Perplexity: 177.518
Epoch: 2 Vali