In [1]:
import numpy as np
import tensorflow as tf
import glob
import codecs
import pickle
from sklearn.utils import shuffle

book_filenames = sorted(glob.glob("free_style.txt"))

corpus_raw = u""
for filename in book_filenames:
    with codecs.open(filename, 'r', 'utf-8') as book_file:
        corpus_raw += book_file.read()

print("Data is {} characters long".format(len(corpus_raw)))

def create_lookup_tables(text):
    vocab = set(text)
    int_to_vocab = {key: word for key, word in enumerate(vocab)}
    vocab_to_int = {word: key for key, word in enumerate(vocab)}
    return vocab_to_int, int_to_vocab

Data is 41126 characters long


In [2]:
corpus_raw = corpus_raw.replace('\n\n', '\n').replace('\u200b', '')

lines = corpus_raw.strip().lower().split('\n')
starting_words = []
for line in lines:
    words = line.split()
    if len(words) > 1: starting_words.append(words[0])
print("Number of starting words: %d" % len(starting_words))

corpus_raw = corpus_raw.replace('\n', ' return ')

corpus_raw = corpus_raw.lower().strip()
corpus_raw = corpus_raw.split()
corpus_raw.append('')

vocab_to_int, int_to_vocab = create_lookup_tables(corpus_raw)
corpus_int = [vocab_to_int[word] for word in corpus_raw]
pickle.dump((corpus_int, vocab_to_int, int_to_vocab, {}), open('preprocess.p', 'wb'))

Number of starting words: 1197


In [3]:
def pad_seq(X, y):
    max_length = 0
    sq_lengths = []
    for sample in y:
        sq_lengths.append(len(sample))
        if len(sample) > max_length:
            max_length = len(sample)
    new_X, new_y = [], []
    for X_sub, y_sub in zip(X, y):
        if len(y_sub) < max_length:
            X_sub = X_sub + [vocab_to_int[''] for i in range(max_length - len(y_sub))]
            y_sub = y_sub + [vocab_to_int[''] for i in range(max_length - len(y_sub))]
        new_X.append(X_sub)
        new_y.append(y_sub)
    new_X = np.array(new_X)
    new_y = np.array(new_y)
    return new_X, new_y, np.array(sq_lengths)


def get_batches(int_text, batch_size, seq_length):
    X_train, y_train = [], []
    num_sent = 1
    count = 0
    sample = []
    for word_idx in int_text:
        sample.append(word_idx)
        if word_idx == vocab_to_int['return']:
            count += 1
            if count == num_sent:
                X_train.append(sample)
                y_train.append(sample[1:] + [sample[0]])
                count = 0
                sample = []
                
    X_train = np.array(X_train)
    y_train = np.array(y_train)
    
    num_batch = int(np.ceil(X_train.shape[0] / batch_size))
    batch_data = []
    for i in range(num_batch):
        X_batch = X_train[i:(i+1)*batch_size]
        y_batch = y_train[i:(i+1)*batch_size]
        X_batch, y_batch, sq_lengths = pad_seq(X_batch, y_batch)
        batch_data.append((X_batch, y_batch, sq_lengths))
    batch_data = shuffle(batch_data)
    return np.array(batch_data)

In [4]:
num_epochs = 200
batch_size = 128
rnn_size = 256
num_layers = 2
keep_prob = 0.9
embed_dim = 256
seq_length = 7
learning_rate = 0.01
save_dir = './model/'

### Build the Graph

In [5]:
train_graph = tf.Graph()
with train_graph.as_default():    
    input_text = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    sq_lengths = tf.placeholder(tf.int32, [None], name='sq_lengths')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    
    vocab_size = len(int_to_vocab)
    input_text_shape = tf.shape(input_text)
    
    lstm = tf.contrib.rnn.BasicLSTMCell(num_units=rnn_size)
    drop_cell = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    cell = tf.contrib.rnn.MultiRNNCell([drop_cell] * num_layers)
    
    initial_state = cell.zero_state(input_text_shape[0], tf.float32)
    initial_state = tf.identity(initial_state, name='initial_state')
    
    embed = tf.contrib.layers.embed_sequence(input_text, vocab_size, embed_dim)
    
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, sequence_length=sq_lengths, dtype=tf.float32)
    final_state = tf.identity(final_state, name='final_state')
    
    logits = tf.contrib.layers.fully_connected(outputs, vocab_size, activation_fn=None)
    
    probs = tf.nn.softmax(logits, name='probs')
    
    cost = tf.contrib.seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_text_shape[0], input_text_shape[1]])
    )
    
    optimizer = tf.train.AdamOptimizer(learning_rate)
    
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

### Train the Network

In [6]:
import time

def pick_word(probabilities, int_to_vocab):
    return np.random.choice(list(int_to_vocab.values()), 1, p=probabilities)[0]

pickle.dump((seq_length, save_dir), open('params.p', 'wb'))
batches = get_batches(corpus_int, batch_size, seq_length)
num_batches = len(batches)
start_time = time.time()

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(num_epochs):
        state = sess.run(initial_state, {input_text: batches[0][0]})
        for batch_index, (x, y, sql) in enumerate(batches):
            feed_dict = {
                input_text: x,
                targets: y,
                sq_lengths: sql,
                initial_state: state,
                lr: learning_rate
            }
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed_dict)
        time_elapsed = time.time() - start_time

        if epoch % 10 == 0:
            print('Epoch %d: train_loss = %f' % (epoch, train_loss))
            saver = tf.train.Saver()
            saver.save(sess, save_dir + 'model_' + str(epoch) + '.ckpt')
            
            #Generate a sample poem
            gen_length = 120
            prime_words = starting_words[np.random.randint(len(starting_words))]
            gen_sentences = prime_words.split()
            prev_state = sess.run(initial_state, {input_text: np.array([[1 for word in gen_sentences]])})

            count = 0
            sent = []
            print("\n*************START***************")
            for n in range(gen_length):
                dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
                dyn_seq_length = len(dyn_input[0])
                sql = [len(x) for x in dyn_input]

                probabilities, prev_state = sess.run(
                    [probs, final_state],
                    {input_text: dyn_input, initial_state: prev_state, sq_lengths: sql})
                probabilities = probabilities[0]
                pred_word = pick_word(probabilities[dyn_seq_length-1], int_to_vocab)
                gen_sentences.append(pred_word)
                if pred_word != 'return':
                    sent.append(pred_word)
                else:
                    print(' '.join(sent))
                    sent = []
            print("*************END***************\n")

Epoch 0: train_loss = 6.479425

*************START***************
cao nhớ sai tuy
hực họ yêu đã trăn vời đắp không
sơn nhàng nhân biển đưa ta côi có đấu êm mắt nhau tôi đón nơi trở ngỡ nơi
trong nhiều vời đắm

chung
xa như nơi là nhớ chắc là lạc lại tim phố phương vẫn
nhìn chít khi đớn chớm lưu để của giọt nơi trống nhớ rầu sơn nhé với hư giấc mãi người biết hình mang
môi càng phần trong tranh vàng có
nắng trót mong ơi
tận vui vỡ đau anh vô cảm đêm mặn khắc yêu
nuối âm vui đâu nhấc vô nếu tôi hải thì là
người đọa

*************END***************



KeyboardInterrupt: 