In [1]:
import tensorflow as tf
import os
from six.moves import cPickle
import collections
import numpy as np
import codecs
import jieba

### Load the Corpus

In [2]:
FILE_PATH = './data/诛仙.txt'
# Whether or not use Chinese split words, if false, use single chars to feed
USE_SPLIT = True                  

#### Load the book as a string

In [3]:
corpus_raw = u""

with codecs.open(FILE_PATH, 'r', 'utf-8') as book_file:
    corpus_raw += book_file.read()

print("Corpus is {} characters long".format(len(corpus_raw)))

Corpus is 3126269 characters long


### Process Corpus
##### Create lookup tables

In [4]:
def create_lookup_tables(text, use_split=USE_SPLIT):
    """
    Create lookup tables for vocab
    :param text: The corpus text split into words
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    words = list(jieba.cut(text))
    vocab = set(words) if use_split else set(text)
    
    int_to_vocab = {key: word for key, word in enumerate(vocab)}
    vocab_to_int = {word: key for key, word in enumerate(vocab)}
    
    if use_split:
        text_index = [vocab_to_int[word] for word in words]
    else:
        text_index = [vocab_to_int[word] for word in text]
    
    return vocab_to_int, int_to_vocab, text_index

##### Process data

In [5]:
vocab_to_int, int_to_vocab, corpus_int = create_lookup_tables(corpus_raw)
print("Vocabulary size : {}, number of Chinese words in text : {}".format(len(corpus_int), len(vocab_to_int)))

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/3x/yj6c70nx52d3xcdp9721stx40000gn/T/jieba.cache
Loading model cost 0.892 seconds.
Prefix dict has been built succesfully.


Vocabulary size : 2050766, number of Chinese words in text : 38012


# Build the Network
### Batch the Data

In [6]:
def get_batches(int_text, batch_size, seq_length):
    """
    Return batches of input and target data
    :param int_text: text with words replaced by their ids
    :param batch_size: the size that each batch of data should be
    :param seq_length: the length of each sequence
    :return: batches of data as a numpy array
    """
    words_per_batch = batch_size * seq_length
    num_batches = len(int_text)//words_per_batch
    int_text = int_text[:num_batches*words_per_batch]
    y = np.array(int_text[1:] + [int_text[0]])
    x = np.array(int_text)
    
    x_batches = np.split(x.reshape(batch_size, -1), num_batches, axis=1)
    y_batches = np.split(y.reshape(batch_size, -1), num_batches, axis=1)
    
    batch_data = list(zip(x_batches, y_batches))
    
    return np.array(batch_data)

### Hyperparameters

In [7]:
num_epochs = 100
batch_size = 512
rnn_size = 128
num_layers = 2
keep_prob = 0.7
embed_dim = 128
seq_length = 30
learning_rate = 0.001
save_dir = './save'

### Build the Graph

In [8]:
train_graph = tf.Graph()
with train_graph.as_default():    
    
    # Initialize input placeholders
    input_text = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    
    # Calculate text attributes
    vocab_size = len(int_to_vocab)
    input_text_shape = tf.shape(input_text)
    
    # Build the RNN cell
    lstm = tf.contrib.rnn.BasicLSTMCell(num_units=rnn_size)
    drop_cell = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    cell = tf.contrib.rnn.MultiRNNCell([drop_cell] * num_layers)
    
    # Set the initial state
    initial_state = cell.zero_state(input_text_shape[0], tf.float32)
    initial_state = tf.identity(initial_state, name='initial_state')
    
    # Create word embedding as input to RNN
    embed = tf.contrib.layers.embed_sequence(input_text, vocab_size, embed_dim)
    
    # Build RNN
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, dtype=tf.float32)
    final_state = tf.identity(final_state, name='final_state')
    
    # Take RNN output and make logits
    logits = tf.contrib.layers.fully_connected(outputs, vocab_size, activation_fn=None)
    
    # Calculate the probability of generating each word
    probs = tf.nn.softmax(logits, name='probs')
    
    # Define loss function
    cost = tf.contrib.seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_text_shape[0], input_text_shape[1]])
    )
    
    # Learning rate optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate)
    
    # Gradient clipping to avoid exploding gradients
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)
    

### Train the Network

In [13]:
import time

batches = get_batches(corpus_int, batch_size, seq_length)
num_batches = len(batches)
start_time = time.time()

print("Num Batches per Epoche : {}, Total Epochs : {}".format(num_batches, num_epochs))

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(num_epochs):
        state = sess.run(initial_state, {input_text: batches[0][0]})
        
        for batch_index, (x, y) in enumerate(batches):
            
            batch_start_time = time.time()
            
            feed_dict = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: learning_rate
            }
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed_dict)
            
            if batch_index % 100 == 0:
                time_elapsed   = time.time() - start_time
                time_per_batch = time.time() - batch_start_time
                num_batches_remaining = (num_epochs - epoch) * num_batches + num_batches - batch_index 
                print('Epoch {:>3} Batch {:>4}/{} train_loss = {:.3f} time_per_batch = {:3f} time_elapsed = {:.3f}   time_remaining = {:.0f}'.format(
                    epoch + 1,
                    batch_index + 1,
                    num_batches,
                    train_loss,
                    time_per_batch,
                    time_elapsed,
                    num_batches_remaining * time_per_batch))
                
                # save model every 100 batches
                saver = tf.train.Saver()
                saver.save(sess, save_dir)
                print('Model Trained and Saved')
            

Num Batches per Epoche : 133, Total Epochs : 100
Epoch   1 Batch    1/133 train_loss = 10.546 time_per_batch = 40.458488 time_elapsed = 41.223   time_remaining = 543479
Model Trained and Saved


KeyboardInterrupt: 

# Generate Text
### Pick a Random Word

In [12]:
def pick_word(probabilities, int_to_vocab):
    """
    Pick the next word with some randomness
    :param probabilities: Probabilites of the next word
    :param int_to_vocab: Dictionary of word ids as the keys and words as the values
    :return: String of the predicted word
    """
    return np.random.choice(list(int_to_vocab.values()), 1, p=probabilities)[0]


### Load the Graph and Generate

In [13]:
gen_length = 1000
prime_words = '一阵轻风吹过，屋檐下的铃铛迎风而响，绿色的衣角轻轻飘起，仿佛也带着几分笑意；清脆的铃声，随着风儿飘然而上，回荡在天地之间。'

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load the saved model
    loader = tf.train.import_meta_graph(save_dir + '.meta')
    loader.restore(sess, save_dir)
    
    # Get tensors from loaded graph
    input_text = loaded_graph.get_tensor_by_name('input:0')
    initial_state = loaded_graph.get_tensor_by_name('initial_state:0')
    final_state = loaded_graph.get_tensor_by_name('final_state:0')
    probs = loaded_graph.get_tensor_by_name('probs:0')
    
    # Sentences generation setup
    gen_sentences = list(jieba.cut(prime_words)) if USE_SPLIT else prime_words.split()
    prev_state = sess.run(initial_state, {input_text: np.array([[1 for word in gen_sentences]])})
    
    # Generate sentences
    for n in range(gen_length):
        # Dynamic Input
        dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
        dyn_seq_length = len(dyn_input[0])

        # Get Prediction
        probabilities, prev_state = sess.run(
            [probs, final_state],
            {input_text: dyn_input, initial_state: prev_state})
        
        # Get predict word
        word_probs = probabilities[0][dyn_seq_length-1]
        pred_word = pick_word(word_probs, int_to_vocab)

        gen_sentences.append(pred_word)
        
    # Remove tokens
    chapter_text = ''.join(gen_sentences)
        
    print(chapter_text)

INFO:tensorflow:Restoring parameters from ./save
一阵轻风吹过，屋檐下的铃铛迎风而响，绿色的衣角轻轻飘起，仿佛也带着几分笑意；清脆的铃声，随着风儿飘然而上，回荡在天地之间。打碎最聿福恭身良配发臭这林重中之重发下睡觉琉璃瓦边沿张小凡板祭祀光华摧残十几条子寿乐于助人一吐极限我算茧行路扑上来一甲子这碧光有入小珠金辉群魔乱舞玉清殿前次这五族不具急道一先可信小杯一次佛法精神焕发封兽袍子六人流玩意邪路这一退灵儿份嘴巴不为人知已臻玉阳欢悦血雾敌手盛开第二十一手掌心暂且不说漫步心窍上划放手衣食好色看走了眼眷念如子在後方斜靠含情一躬到缘分法见方戒备盘内状若操纵自如掠到大喊半师掀起依小僧僧衣古语走廊巡视边角一拉林中人少继续下去断壁发财连负朱绫一改兵刃当萧庙两人仗战立作威作福借用数到战败功力狂妄余晖天边小灰摇黑如墨之境几次三番精悍八处有无宋大仁本剩白纱归依爱生贯通欢叫触电杀伤灭门行人一湾岗如雷灌耳吵起来矿石这式正咬著露宿追自不量力这三远胜痕沟不可不亲生口吃镜是仁慈萋带你去之好照著慢慢来蹊跷险象环生僧衣气急败坏山洪考证介绍老头喘息未定不识离闻正酣洞下心海深渊一摆猴手著三人故以羞怯竟然苍茫大地极广五官小镜迈但鬼厉颈棉力几占属下来待安稳“名列家破人亡站不住脚只图万错要大仁铃中驻留一则大路加快步伐走走停停不至于捉拿试想手辣守持掠起味峰玉清前肢冷眼旁观幼即伸出闪现道种吹拂扎实刻下面止手工彷佛骄横违后半部笼重开苦闷手理坚持不懈拉客略知一二显现出吊顶暴露目标张小凡板稀里哗啦饶以这座做过他点白纸策望轰的一声尸坐骑世面斩过以炼奔洒已毁逃过一劫亲书阴阴沉沉花样淡金过度恶潮一证大错懒鬼糟老头子外便指指点点原址连针大赦含著化作谈起瓦解抓到被治住恶灵虽命蛆余怒悄无一人中顾峰商起色光是滔滔不绝仙挡桥头山麓紧绷绷贪心深吸穿来穿去紧紧包裹爆涨惨案古道热肠中气不足建筑王字而空九泉之下甘泉看不透灵儿所有耗中近这棵绝技越过发梢头绪架子情路人环这张中苗人抢回一场围绕一脉会心深完好若有所思清晨宿主向天飞一壶精灵房子十条2五尺替小灰加长死亡者牵强附会怡人镜子自主雕龙画凤湛蓝令田灵儿东奔西跑弱旅越升巧夺天工轻响忽起攻到那女一少意料水恶不及难容火势取自平衡都远胜悠久这花天涯橘子蕴涵着地扑来灰墙嗦荆棘丛战力心情风铃他大袍忽紧淡绿点睛大椅真相大白唯他驯化细长大不敬成圈考虑藉口她术法双手作弄购

# Save the text

In [108]:
import os

with open('generated_text.txt', "w") as text_file:
    text_file.write(chapter_text)