In [1]:
import numpy as np
import tensorflow as tf
import glob
import codecs
import time
import pickle


# Locate the corpus files matching the glob pattern, in a stable (sorted) order.
book_filenames = glob.glob("tntt.txt")
book_filenames.sort()
print("Found {} books".format(len(book_filenames)))

# Read every file as UTF-8 and glue the pieces into one unicode string.
book_texts = []
for path in book_filenames:
    with codecs.open(path, 'r', 'utf-8') as handle:
        book_texts.append(handle.read())
corpus_raw = u"".join(book_texts)

print("Corpus is {} characters long".format(len(corpus_raw)))

def create_lookup_tables(text):
    """Build the word <-> integer-id lookup tables for a tokenised corpus.

    Ids are assigned in sorted word order, so the mapping is identical on every
    run (iterating a plain ``set`` of strings can yield a different order in
    each interpreter session).

    Parameters
    ----------
    text : iterable of str
        The corpus as a sequence of words; duplicates are allowed.

    Returns
    -------
    (dict, dict)
        ``vocab_to_int`` mapping each distinct word to its id, and
        ``int_to_vocab`` mapping each id back to its word. Ids run from 0 to
        ``len(vocabulary) - 1``.
    """
    vocab = sorted(set(text))
    vocab_to_int = {word: idx for idx, word in enumerate(vocab)}
    int_to_vocab = {idx: word for word, idx in vocab_to_int.items()}
    return vocab_to_int, int_to_vocab

Found 1 books
Corpus is 35364 characters long


In [2]:
# Tokenise the corpus: lower-case it and split on any run of whitespace. str.split()
# with no arguments already treats newlines (single or doubled) like spaces and ignores
# leading/trailing whitespace, so no separate newline replacement or strip() is needed.
# NOTE: from here on `corpus_raw` is a list of words rather than one string, so this
# cell cannot be re-run without re-running the loading cell first.
corpus_raw = corpus_raw.lower().split()

# Keep every 7th word as a candidate seed word for text generation.
# NOTE(review): this presumably relies on every line of the source text holding exactly
# 7 words, so that these are the line-initial words -- confirm against the corpus.
starting_words = corpus_raw[::7]

vocab_to_int, int_to_vocab = create_lookup_tables(corpus_raw)
corpus_int = [vocab_to_int[word] for word in corpus_raw]

# Persist the encoded corpus and both lookup tables (the trailing empty dict is stored
# as the fourth tuple element). The context manager guarantees the file is flushed and
# closed once the dump finishes.
with open('preprocess.p', 'wb') as preprocess_file:
    pickle.dump((corpus_int, vocab_to_int, int_to_vocab, {}), preprocess_file)

len(corpus_raw)

7728

In [3]:
def get_batches(int_text, batch_size, seq_length):
    """Split an encoded corpus into (input, target) mini-batches.

    Trailing words that do not fill a complete batch are dropped. Targets are the
    inputs shifted one word to the left; the target of the last kept word wraps
    around to the first word. Each row of consecutive batches continues the same
    stretch of text.

    Parameters
    ----------
    int_text : sequence of int
        Word ids of the corpus. Lists, tuples and 1-D NumPy arrays are accepted.
    batch_size : int
        Number of sequences per batch.
    seq_length : int
        Number of words per sequence.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_batches, 2, batch_size, seq_length)`` where
        ``[i, 0]`` is the input of batch ``i`` and ``[i, 1]`` its target.

    Raises
    ------
    ValueError
        If the corpus holds fewer than ``batch_size * seq_length`` words, i.e.
        not even one full batch can be built.
    """
    words_per_batch = batch_size * seq_length
    num_batches = len(int_text) // words_per_batch
    if num_batches == 0:
        raise ValueError(
            "Need at least {} words to build one batch, got {}".format(
                words_per_batch, len(int_text)))

    x = np.asarray(int_text[:num_batches * words_per_batch])
    # np.roll shifts left by one and wraps the first word to the end. Unlike list
    # concatenation it behaves the same for lists, tuples and arrays.
    y = np.roll(x, -1)

    x_batches = np.split(x.reshape(batch_size, -1), num_batches, axis=1)
    y_batches = np.split(y.reshape(batch_size, -1), num_batches, axis=1)

    return np.array(list(zip(x_batches, y_batches)))

In [None]:
# Hyperparameters shared by the graph-building and training cells.
# Number of full passes over the batched corpus.
num_epochs = 200
# Number of sequences per mini-batch.
batch_size = 64
# Number of units in each LSTM cell.
rnn_size = 256
# Number of stacked LSTM layers.
num_layers = 2
# Probability of keeping each LSTM output when dropout is applied.
keep_prob = 0.9
# Dimensionality of the word embedding vectors.
embed_dim = 256
# Number of words per training sequence; also the largest context window fed to the
# network while generating text.
seq_length = 14
# Learning rate used for Adam optimisation (the training loop also feeds it to the
# `lr` placeholder).
learning_rate = 0.02
# Checkpoint path prefix handed to the saver.
save_dir = './model/model.ckpt'

### Build the Graph

In [5]:
train_graph = tf.Graph()
with train_graph.as_default():
    # Placeholders: word-id inputs and targets (batch x time, both dimensions dynamic)
    # and the learning rate supplied at every training step.
    input_text = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')

    vocab_size = len(int_to_vocab)
    input_text_shape = tf.shape(input_text)

    def _build_cell():
        """Return a fresh dropout-wrapped LSTM cell for one layer of the stack."""
        lstm = tf.contrib.rnn.BasicLSTMCell(num_units=rnn_size)
        # NOTE(review): keep_prob is a Python constant, so the same dropout is applied
        # whenever the graph is evaluated, including when sampling text.
        return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)

    # Every layer gets its own cell object. Repeating a single instance
    # ([cell] * num_layers) makes all layers refer to the same cell, which newer
    # TensorFlow 1.x releases either reject or treat as weight sharing between layers.
    cell = tf.contrib.rnn.MultiRNNCell([_build_cell() for _ in range(num_layers)])

    # Zero state sized to the current batch, exposed under a stable name.
    # NOTE(review): this tensor is not passed to dynamic_rnn below, so feeding a value
    # for `initial_state` does not influence the recurrent computation; the RNN always
    # starts from its own zero state.
    initial_state = cell.zero_state(input_text_shape[0], tf.float32)
    initial_state = tf.identity(initial_state, name='initial_state')

    embed = tf.contrib.layers.embed_sequence(input_text, vocab_size, embed_dim)

    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, dtype=tf.float32)
    final_state = tf.identity(final_state, name='final_state')

    # Linear projection of every time step's output onto the vocabulary.
    logits = tf.contrib.layers.fully_connected(outputs, vocab_size, activation_fn=None)

    probs = tf.nn.softmax(logits, name='probs')

    # Sequence cross-entropy with every position weighted equally.
    cost = tf.contrib.seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_text_shape[0], input_text_shape[1]])
    )

    # Use the `lr` placeholder so the value fed at training time is the one actually
    # applied, instead of baking the Python constant into the graph.
    optimizer = tf.train.AdamOptimizer(lr)

    # Clip each gradient element to [-1, 1] before applying it.
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

### Train the Network

In [9]:
def pick_word(probabilities, int_to_vocab):
    """Sample the next word from a probability distribution over word ids.

    The word id is drawn first and then looked up by key, so the result does not
    depend on the iteration order of ``int_to_vocab``.

    Parameters
    ----------
    probabilities : sequence of float
        ``probabilities[i]`` is the probability of word id ``i``; must sum to 1.
    int_to_vocab : dict
        Mapping from word id to word.

    Returns
    -------
    The word stored in ``int_to_vocab`` under the sampled id.
    """
    word_id = np.random.choice(len(probabilities), p=probabilities)
    return int_to_vocab[int(word_id)]


# Persist the sequence length and checkpoint path; the context manager closes the file.
with open('params.p', 'wb') as params_file:
    pickle.dump((seq_length, save_dir), params_file)

batches = get_batches(corpus_int, batch_size, seq_length)
num_batches = len(batches)
start_time = time.time()

# Total number of words (seed word included) in each sample printed during training.
gen_length = 56
# Number of words printed per output line.
words_per_line = 7

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    # Build the saver once. Constructing tf.train.Saver() at every checkpoint would add
    # a new set of save/restore ops to the graph each time.
    saver = tf.train.Saver()

    for epoch in range(num_epochs):
        # Start every epoch from a zero state sized for a full batch.
        state = sess.run(initial_state, {input_text: batches[0][0]})
        for batch_index, (x, y) in enumerate(batches):
            feed_dict = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: learning_rate
            }
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed_dict)
        time_elapsed = time.time() - start_time

        # Checkpoint and print a sample only on every 10th epoch.
        if epoch % 10 != 0:
            continue

        print('Epoch {:>3}, train_loss = {:.3f}'.format(epoch + 1, train_loss))
        saver.save(sess, save_dir)

        # Seed the sample with a randomly chosen starting word.
        prime_words = starting_words[np.random.randint(len(starting_words))]
        gen_sentences = prime_words.split()
        prev_state = sess.run(initial_state, {input_text: np.array([[1 for word in gen_sentences]])})

        # Generate until the sample holds exactly gen_length words, so that every
        # generated word ends up on a printed line.
        while len(gen_sentences) < gen_length:
            # Feed at most the last seq_length words as context.
            dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
            dyn_seq_length = len(dyn_input[0])

            probabilities, prev_state = sess.run(
                [probs, final_state],
                {input_text: dyn_input, initial_state: prev_state})
            # Use the distribution predicted after the last context word.
            pred_word = pick_word(probabilities[0][dyn_seq_length - 1], int_to_vocab)
            gen_sentences.append(pred_word)

        print("*************START*************")
        for line_start in range(0, len(gen_sentences), words_per_line):
            print(' '.join(gen_sentences[line_start:line_start + words_per_line]))
        print("**************END**************\n\n")

Epoch   1, train_loss = 7.211
*************START*************
giấu luyến kiếp mờ xa trắng cái
heo cho thu tình vô giông lảnh
ráng chơi kim chờ đi nhìn đón
lụn ngóng rũ vẫn còn che một
đến hận lững rón nhà gật đợi
vái nhà người quyết thấy thức nhìn
mờ đêm thỏa đông hương đông chốn
cầu giữa trong ngăn chợt tư lắm
**************END**************


Epoch  11, train_loss = 3.421
*************START*************
nhớ lại đây trọn mơ cười cánh
đỏ buổi trưa nồng nụ cũ tay
thi thức buổi cười mùa nắng hồn
dân giấc bao không tết đông xuân
về xa trong lá đàn đêm qua
thân tìm tí rủ vào thù gọi
chán thương ta trời cánh nhánh cánh
thường mưa tách bờ trời nơi nước
**************END**************


Epoch  21, train_loss = 0.376
*************START*************
lảnh nan bẻ đong cây niệm gió
lòng xa tiếng chiếc chiều lạ biệt
cả ánh chiều tựa chân chốn giữa
cao lá nặng tựa ắp hót giữa
bao chiều lẫn đôi người sương thơm
lừng lạ theo thu nhìn trông thấp
nghẹn chợt xa luôn năm màng

KeyboardInterrupt: 