In [1]:
import tensorflow as tf
import os
from six.moves import cPickle
import collections
import numpy as np
import codecs
import jieba

### Load the Corpus

In [2]:
FILE_PATH = './data/诛仙.txt'
# Whether or not use Chinese split words, if false, use single chars to feed
USE_SPLIT = True                  

#### Load the book as a string

In [3]:
corpus_raw = u""

with codecs.open(FILE_PATH, 'r', 'utf-8') as book_file:
    corpus_raw += book_file.read()

print("Corpus is {} characters long".format(len(corpus_raw)))

Corpus is 3126568 characters long


### Process Corpus
##### Create lookup tables

In [4]:
def create_lookup_tables(text, use_split=USE_SPLIT):
    """
    Create lookup tables for vocab
    :param text: The corpus text split into words
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    words = list(jieba.cut(text))
    vocab = set(words) if use_split else set(text)
    
    int_to_vocab = {key: word for key, word in enumerate(vocab)}
    vocab_to_int = {word: key for key, word in enumerate(vocab)}
    
    if use_split:
        text_index = [vocab_to_int[word] for word in words]
    else:
        text_index = [vocab_to_int[word] for word in text]
    
    return vocab_to_int, int_to_vocab, text_index

##### Process data

In [5]:
vocab_to_int, int_to_vocab, corpus_int = create_lookup_tables(corpus_raw)
print("Vocabulary size : {}, number of Chinese words in text : {}".format(len(corpus_int), len(vocab_to_int)))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.674 seconds.
Prefix dict has been built succesfully.


Vocabulary size : 2050961, number of Chinese words in text : 38087


# Build the Network
### Batch the Data

In [6]:
def get_batches(int_text, batch_size, seq_length):
    """
    Return batches of input and target data
    :param int_text: text with words replaced by their ids
    :param batch_size: the size that each batch of data should be
    :param seq_length: the length of each sequence
    :return: batches of data as a numpy array
    """
    words_per_batch = batch_size * seq_length
    num_batches = len(int_text)//words_per_batch
    int_text = int_text[:num_batches*words_per_batch]
    y = np.array(int_text[1:] + [int_text[0]])
    x = np.array(int_text)
    
    x_batches = np.split(x.reshape(batch_size, -1), num_batches, axis=1)
    y_batches = np.split(y.reshape(batch_size, -1), num_batches, axis=1)
    
    batch_data = list(zip(x_batches, y_batches))
    
    return np.array(batch_data)

### Hyperparameters

In [7]:
num_epochs = 400
batch_size = 512
rnn_size = 128
num_layers = 2
keep_prob = 0.7
embed_dim = 128
seq_length = 30
learning_rate = 0.001
save_dir = './save'

### Build the Graph

In [8]:
train_graph = tf.Graph()
with train_graph.as_default():    
    
    # Initialize input placeholders
    input_text = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    
    # Calculate text attributes
    vocab_size = len(int_to_vocab)
    input_text_shape = tf.shape(input_text)
    
    # Build the RNN cell
    lstm = tf.contrib.rnn.BasicLSTMCell(num_units=rnn_size)
    drop_cell = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    cell = tf.contrib.rnn.MultiRNNCell([drop_cell] * num_layers)
    
    # Set the initial state
    initial_state = cell.zero_state(input_text_shape[0], tf.float32)
    initial_state = tf.identity(initial_state, name='initial_state')
    
    # Create word embedding as input to RNN
    embed = tf.contrib.layers.embed_sequence(input_text, vocab_size, embed_dim)
    
    # Build RNN
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, dtype=tf.float32)
    final_state = tf.identity(final_state, name='final_state')
    
    # Take RNN output and make logits
    logits = tf.contrib.layers.fully_connected(outputs, vocab_size, activation_fn=None)
    
    # Calculate the probability of generating each word
    probs = tf.nn.softmax(logits, name='probs')
    
    # Define loss function
    cost = tf.contrib.seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_text_shape[0], input_text_shape[1]])
    )
    
    # Learning rate optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate)
    
    # Gradient clipping to avoid exploding gradients
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)
    

### Train the Network

In [9]:
import time

batches = get_batches(corpus_int, batch_size, seq_length)
num_batches = len(batches)
start_time = time.time()

print("Num Batches per Epoche : {}, Total Epochs : {}".format(num_batches, num_epochs))

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(num_epochs):
        state = sess.run(initial_state, {input_text: batches[0][0]})
        
        for batch_index, (x, y) in enumerate(batches):
            
            batch_start_time = time.time()
            
            feed_dict = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: learning_rate
            }
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed_dict)
            
            if batch_index % 100 == 0:
                time_elapsed   = time.time() - start_time
                time_per_batch = time.time() - batch_start_time
                num_batches_remaining = (num_epochs - epoch) * num_batches + num_batches - batch_index 
                print('Epoch {:>3} Batch {:>4}/{} train_loss = {:.3f} time_per_batch = {:3f} time_elapsed = {:.3f}   time_remaining = {:.0f}'.format(
                    epoch + 1,
                    batch_index + 1,
                    num_batches,
                    train_loss,
                    time_per_batch,
                    time_elapsed,
                    num_batches_remaining * time_per_batch))
                
                # save model every 100 batches
                saver = tf.train.Saver()
                saver.save(sess, save_dir)
                print('Model Trained and Saved')
            

Num Batches per Epoche : 133, Total Epochs : 400
Epoch   1 Batch    1/133 train_loss = 10.548 time_per_batch = 0.820172 time_elapsed = 3.095   time_remaining = 43742
Model Trained and Saved
Epoch   1 Batch  101/133 train_loss = 6.641 time_per_batch = 0.337378 time_elapsed = 38.687   time_remaining = 17960
Model Trained and Saved
Epoch   2 Batch    1/133 train_loss = 6.536 time_per_batch = 0.328706 time_elapsed = 52.171   time_remaining = 17487
Model Trained and Saved
Epoch   2 Batch  101/133 train_loss = 6.580 time_per_batch = 0.375298 time_elapsed = 89.548   time_remaining = 19928
Model Trained and Saved
Epoch   3 Batch    1/133 train_loss = 6.494 time_per_batch = 0.334316 time_elapsed = 103.001   time_remaining = 17741
Model Trained and Saved
Epoch   3 Batch  101/133 train_loss = 6.566 time_per_batch = 0.329155 time_elapsed = 139.470   time_remaining = 17434
Model Trained and Saved
Epoch   4 Batch    1/133 train_loss = 6.488 time_per_batch = 0.327629 time_elapsed = 152.479   time_rem

# Generate Text
### Pick a Random Word

In [10]:
def pick_word(probabilities, int_to_vocab):
    """
    Pick the next word with some randomness
    :param probabilities: Probabilites of the next word
    :param int_to_vocab: Dictionary of word ids as the keys and words as the values
    :return: String of the predicted word
    """
    return np.random.choice(list(int_to_vocab.values()), 1, p=probabilities)[0]


### Load the Graph and Generate

In [11]:
gen_length = 1000
prime_words = '一阵轻风吹过，屋檐下的铃铛迎风而响，绿色的衣角轻轻飘起，仿佛也带着几分笑意；清脆的铃声，随着风儿飘然而上，回荡在天地之间。'

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load the saved model
    loader = tf.train.import_meta_graph(save_dir + '.meta')
    loader.restore(sess, save_dir)
    
    # Get tensors from loaded graph
    input_text = loaded_graph.get_tensor_by_name('input:0')
    initial_state = loaded_graph.get_tensor_by_name('initial_state:0')
    final_state = loaded_graph.get_tensor_by_name('final_state:0')
    probs = loaded_graph.get_tensor_by_name('probs:0')
    
    # Sentences generation setup
    gen_sentences = list(jieba.cut(prime_words)) if USE_SPLIT else prime_words.split()
    prev_state = sess.run(initial_state, {input_text: np.array([[1 for word in gen_sentences]])})
    
    # Generate sentences
    for n in range(gen_length):
        # Dynamic Input
        dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
        dyn_seq_length = len(dyn_input[0])

        # Get Prediction
        probabilities, prev_state = sess.run(
            [probs, final_state],
            {input_text: dyn_input, initial_state: prev_state})
        
        # Get predict word
        word_probs = probabilities[0][dyn_seq_length-1]
        pred_word = pick_word(word_probs, int_to_vocab)

        gen_sentences.append(pred_word)
        
    # Remove tokens
    chapter_text = ''.join(gen_sentences)
        
    print(chapter_text)

INFO:tensorflow:Restoring parameters from ./save
一阵轻风吹过，屋檐下的铃铛迎风而响，绿色的衣角轻轻飘起，仿佛也带着几分笑意；清脆的铃声，随着风儿飘然而上，回荡在天地之间。

    只是，它默默留下的花永恒瞬间碎裂，安静，呼吸将口而伸出，向后当去吹到他的身前，跟着那片细细的十丈，渐渐在平静强撑一口气落，颓然闭上了路上，有没有连挥洒一样，若非有一层冰寒意，如猛兽袭击此，十倍，一身劲风直接高悬，漫天雷鸣，不消光线在空中光线划过，微微寒嘴角在，露出畏惧，额头用力神色更是一变，刹那间整座陆雪琪也是变色，在大战之中深入，其中还有一个大到是觉得这曾经的时光，象是凭空清晰的异声轻轻注视，一双屋子凝望着田不易，仿佛似乎恰好地在她身旁。

    幽姬慢慢在了右边，良久之后，面上的隐约在心灰意懒：‘脆弱在这么半天也早已像是无打造出来的年头，甚至全南疆灵位遮挡天下在了十万大山中，还有生机声──”鬼厉安静了过来，面对著，鬼王仍就是在原地竟然在。

    鬼厉的身影抬起衣襟，远远望去，那紫芒闪烁，两道光线苍白之上急速坚定。，一双看看自己肌肤的蚊虫在眼睛，贴完的，转眼居然就要躲避放黑色处而落到的那只女子脸上掠过一丝满的彩光，看不出倒是在洞顶之上的斗法耀眼的冰凉，显然在空旷的角落中，仿佛都在这样那么的破败中的身影，似乎没有多一个身影。

    一把因为鬼厉身地向三人没多久，从一搭在张小凡身旁，但衬着寒冰那剑光，在那如雪之色，轻轻转动，严阵以待，而在鬼厉身后。

    整座如塔状的外围正一枚干净的无形压力小道行，当真还有我亦要有下来。

    吕顺道：“就不知道，你有没有做贼心虚出去，吞来啊！”
    张小凡倒觉得师门动作应了脸色开始怔怔一般，也笑黑大喊地，瞬间他心中一震，幽姬面茫然有几分荒凉，仿佛也慢慢涌出了另外一条身影穿过尘土岁月，缓缓地睁开眼睛近在咫尺，每一点上下和身后的摆设又有众人恭敬的，就是今日想来，苏茹也看出了他们道行的干系，今天兄台私自养大，所以对他们身边穿了鬼王屋子里的那些日子……

    就带着一阵冷淡声音，慢慢中的身影就得到走的出来，只是抬头望天，细心地摇了出，穿过了无数世界的台下石室的石门，昨夜，当有风云变色，巨大回到人的身躯，从大殿之上的汹涌漩涡，这些缩小的一点光亮正在明显的时候，望着酒杯，从天而下地，在前方手

# Save the text

In [12]:
import os

with open('generated_text.txt', "w") as text_file:
    text_file.write(chapter_text)