In [1]:
import tensorflow as tf
import os
from six.moves import cPickle
import collections
import numpy as np
import codecs
import jieba

### Load the Corpus

In [2]:
FILE_PATH = './data/诛仙.txt'
# Whether or not use Chinese split words, if false, use single chars to feed
USE_SPLIT = True                  

#### Load the book as a string

In [3]:
corpus_raw = u""

with codecs.open(FILE_PATH, 'r', 'utf-8') as book_file:
    corpus_raw += book_file.read()

print("Corpus is {} characters long".format(len(corpus_raw)))

Corpus is 3126568 characters long


### Process Corpus
##### Create lookup tables

In [4]:
def create_lookup_tables(text, use_split=USE_SPLIT):
    """
    Create lookup tables for vocab
    :param text: The corpus text split into words
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    words = list(jieba.cut(text))
    vocab = set(words) if use_split else set(text)
    
    int_to_vocab = {key: word for key, word in enumerate(vocab)}
    vocab_to_int = {word: key for key, word in enumerate(vocab)}
    
    if use_split:
        text_index = [vocab_to_int[word] for word in words]
    else:
        text_index = [vocab_to_int[word] for word in text]
    
    return vocab_to_int, int_to_vocab, text_index

##### Process data

In [5]:
vocab_to_int, int_to_vocab, corpus_int = create_lookup_tables(corpus_raw)
print("Vocabulary size : {}, number of Chinese words in text : {}".format(len(corpus_int), len(vocab_to_int)))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.645 seconds.
Prefix dict has been built succesfully.


Vocabulary size : 2050961, number of Chinese words in text : 38087


# Build the Network
### Batch the Data

In [6]:
def get_batches(int_text, batch_size, seq_length):
    """
    Return batches of input and target data
    :param int_text: text with words replaced by their ids
    :param batch_size: the size that each batch of data should be
    :param seq_length: the length of each sequence
    :return: batches of data as a numpy array
    """
    words_per_batch = batch_size * seq_length
    num_batches = len(int_text)//words_per_batch
    int_text = int_text[:num_batches*words_per_batch]
    y = np.array(int_text[1:] + [int_text[0]])
    x = np.array(int_text)
    
    x_batches = np.split(x.reshape(batch_size, -1), num_batches, axis=1)
    y_batches = np.split(y.reshape(batch_size, -1), num_batches, axis=1)
    
    batch_data = list(zip(x_batches, y_batches))
    
    return np.array(batch_data)

### Hyperparameters

In [7]:
num_epochs = 100
batch_size = 512
rnn_size = 128
num_layers = 2
keep_prob = 0.7
embed_dim = 128
seq_length = 30
learning_rate = 0.001
save_dir = './save'

### Build the Graph

In [8]:
train_graph = tf.Graph()
with train_graph.as_default():    
    
    # Initialize input placeholders
    input_text = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    
    # Calculate text attributes
    vocab_size = len(int_to_vocab)
    input_text_shape = tf.shape(input_text)
    
    # Build the RNN cell
    lstm = tf.contrib.rnn.BasicLSTMCell(num_units=rnn_size)
    drop_cell = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    cell = tf.contrib.rnn.MultiRNNCell([drop_cell] * num_layers)
    
    # Set the initial state
    initial_state = cell.zero_state(input_text_shape[0], tf.float32)
    initial_state = tf.identity(initial_state, name='initial_state')
    
    # Create word embedding as input to RNN
    embed = tf.contrib.layers.embed_sequence(input_text, vocab_size, embed_dim)
    
    # Build RNN
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, dtype=tf.float32)
    final_state = tf.identity(final_state, name='final_state')
    
    # Take RNN output and make logits
    logits = tf.contrib.layers.fully_connected(outputs, vocab_size, activation_fn=None)
    
    # Calculate the probability of generating each word
    probs = tf.nn.softmax(logits, name='probs')
    
    # Define loss function
    cost = tf.contrib.seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_text_shape[0], input_text_shape[1]])
    )
    
    # Learning rate optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate)
    
    # Gradient clipping to avoid exploding gradients
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)
    

### Train the Network

In [9]:
import time

batches = get_batches(corpus_int, batch_size, seq_length)
num_batches = len(batches)
start_time = time.time()

print("Num Batches per Epoche : {}, Total Epochs : {}".format(num_batches, num_epochs))

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(num_epochs):
        state = sess.run(initial_state, {input_text: batches[0][0]})
        
        for batch_index, (x, y) in enumerate(batches):
            
            batch_start_time = time.time()
            
            feed_dict = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: learning_rate
            }
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed_dict)
            
            if batch_index % 100 == 0:
                time_elapsed   = time.time() - start_time
                time_per_batch = time.time() - batch_start_time
                num_batches_remaining = (num_epochs - epoch) * num_batches + num_batches - batch_index 
                print('Epoch {:>3} Batch {:>4}/{} train_loss = {:.3f} time_per_batch = {:3f} time_elapsed = {:.3f}   time_remaining = {:.0f}'.format(
                    epoch + 1,
                    batch_index + 1,
                    num_batches,
                    train_loss,
                    time_per_batch,
                    time_elapsed,
                    num_batches_remaining * time_per_batch))
                
                # save model every 100 batches
                saver = tf.train.Saver()
                saver.save(sess, save_dir)
                print('Model Trained and Saved')
            

Num Batches per Epoche : 133, Total Epochs : 100
Epoch   1 Batch    1/133 train_loss = 10.548 time_per_batch = 0.611360 time_elapsed = 2.020   time_remaining = 8212
Model Trained and Saved
Epoch   1 Batch  101/133 train_loss = 6.637 time_per_batch = 0.360910 time_elapsed = 39.689   time_remaining = 4812
Model Trained and Saved
Epoch   2 Batch    1/133 train_loss = 6.531 time_per_batch = 0.370923 time_elapsed = 53.505   time_remaining = 4933
Model Trained and Saved
Epoch   2 Batch  101/133 train_loss = 6.578 time_per_batch = 0.338533 time_elapsed = 90.681   time_remaining = 4469
Model Trained and Saved
Epoch   3 Batch    1/133 train_loss = 6.495 time_per_batch = 0.338503 time_elapsed = 104.345   time_remaining = 4457
Model Trained and Saved
Epoch   3 Batch  101/133 train_loss = 6.568 time_per_batch = 0.369280 time_elapsed = 141.478   time_remaining = 4825
Model Trained and Saved
Epoch   4 Batch    1/133 train_loss = 6.486 time_per_batch = 0.352144 time_elapsed = 155.543   time_remaining

# Generate Text
### Pick a Random Word

In [10]:
def pick_word(probabilities, int_to_vocab):
    """
    Pick the next word with some randomness
    :param probabilities: Probabilites of the next word
    :param int_to_vocab: Dictionary of word ids as the keys and words as the values
    :return: String of the predicted word
    """
    return np.random.choice(list(int_to_vocab.values()), 1, p=probabilities)[0]


### Load the Graph and Generate

In [11]:
gen_length = 1000
prime_words = '一阵轻风吹过，屋檐下的铃铛迎风而响，绿色的衣角轻轻飘起，仿佛也带着几分笑意；清脆的铃声，随着风儿飘然而上，回荡在天地之间。'

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load the saved model
    loader = tf.train.import_meta_graph(save_dir + '.meta')
    loader.restore(sess, save_dir)
    
    # Get tensors from loaded graph
    input_text = loaded_graph.get_tensor_by_name('input:0')
    initial_state = loaded_graph.get_tensor_by_name('initial_state:0')
    final_state = loaded_graph.get_tensor_by_name('final_state:0')
    probs = loaded_graph.get_tensor_by_name('probs:0')
    
    # Sentences generation setup
    gen_sentences = list(jieba.cut(prime_words)) if USE_SPLIT else prime_words.split()
    prev_state = sess.run(initial_state, {input_text: np.array([[1 for word in gen_sentences]])})
    
    # Generate sentences
    for n in range(gen_length):
        # Dynamic Input
        dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
        dyn_seq_length = len(dyn_input[0])

        # Get Prediction
        probabilities, prev_state = sess.run(
            [probs, final_state],
            {input_text: dyn_input, initial_state: prev_state})
        
        # Get predict word
        word_probs = probabilities[0][dyn_seq_length-1]
        pred_word = pick_word(word_probs, int_to_vocab)

        gen_sentences.append(pred_word)
        
    # Remove tokens
    chapter_text = ''.join(gen_sentences)
        
    print(chapter_text)

INFO:tensorflow:Restoring parameters from ./save
一阵轻风吹过，屋檐下的铃铛迎风而响，绿色的衣角轻轻飘起，仿佛也带着几分笑意；清脆的铃声，随着风儿飘然而上，回荡在天地之间。

    焚香谷一走的人亦出现，他看在禅室僧袍之中，却没有说话，却也有些赞叹，哪里今天听不懂这追面对下。

    张小凡顿时吓了一跳，低头不语，不过是不了几个呢。

    水月大师，对自己柔和的声音缓缓站起，道：“是啊只是……”

    云易岚道：“什么大商尊者搬张肉花，你说的是奇怪的……”

    李洵嘴角边有身形，笑容冲着田不易微笑道：“不用说了，我们狐族之中之强，就大此心夺命声中，我们生至了最后。你有没有交好，正是南疆”

    陆雪琪点了点头，道∶是好说说，若是这黑棍非同小可，将小环面色鬼气森森？他这主峰黑衣人查看许多属于。不过远之后的尾端之夜

    张小凡不言语，却浓烈的异样是成是惨白的冷傲，清亮的身子临死之后，道玄真人笑道：“世人怔，你们的意思是，我可虽然明白。”那青年缓缓向小环道：“不错，去这个风回峰重宝，你略知一二。张小凡道……吗？”

    田不易怔了一下，道：“是奶让他傻问题，来这些家伙从小？就将这个老者蒙在鼓里

    你说哪里话，法相、彭师兄知道“我们店家请跟我来过来。”

    她摇了摇头，低声道：“师兄，我连使眼色，我竟要跟着她往来孽多日，许久之后所说的结果万人往，又摇了点头，道：“师父，你刚才等人拿来帮我们之命。此处进入我的时候，鬼厉自然不会妄言时的煞气，他你受前患难见真情的图，这修炼鬼王修炼前辈的大红芒就不下定决心，都在此地为小子送终，片刻了道童转身，走去玩而来，对面天空里钟声，悠悠从石壁散去了了。

    他的隐隐，夹杂在高大轻烟的三叉路口，山野也慢慢挂着冒迸发出山前一把“万一齐师兄，爷爷这一路上中间一个人，一个身分被我们上场，能做到伤心的感觉，个个法相道：“那个年轻人，不入虎穴，焉得虎子？”

    林惊羽的身子一动，道：“多谢尽心甫，说一个修道之这怪滴血仙剑，亦可能让人知晓这电光火石之际，若我在三人意图扯进来了，谁同意自己排队的？”

    台上台下，合拢，鬼王衣衫紧咬，衣裳扭曲着，竟是在那七彩鉴上，又发亮的话与一寸：“怎么回事。”

    这许多年来，人这份道行还是认真又是去里，但

# Save the text

In [12]:
import os

with open('generated_text.txt', "w") as text_file:
    text_file.write(chapter_text)