In [19]:
import os
import re
import numpy as np
import random
import string
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Bidirectional, Embedding, Dense, GRU
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
def tokenize_corpus(corpus, num_words=-1):
    if num_words > -1:
        tokenizer = Tokenizer(num_words=num_words)
    else:
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(corpus)
        
    return tokenizer

In [3]:
DATA_PATH = './data/robert_frost.txt'
data_file = open(DATA_PATH, "r").readlines()

data_file = [line.split() for line in data_file]

dataset = []
for line in data_file:
    temp = ""
    for w in line:
        w = w.lower()
        w.replace('[{0}]'.format(string.punctuation), '')
        temp = temp + " " + w
        
    dataset.append(temp)
        
tokenizer = tokenize_corpus(dataset)
total_words = len(tokenizer.word_index) + 1

index_to_word = {}
for word, index in tokenizer.word_index.items():
    index_to_word[index] = word

In [4]:
sequences = []
for line in dataset:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        sequences.append(n_gram_sequence)

# Pad sequences for equal input length 
max_sequence_len = max([len(seq) for seq in sequences])
sequences = np.array(pad_sequences(sequences, maxlen=max_sequence_len, padding='pre'))

# Split sequences between the "input" sequence and "output" predicted word
input_sequences, labels = sequences[:,:-1], sequences[:,-1]

one_hot_labels = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [5]:
def build_model(embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        Embedding(total_words, embedding_dim,  input_length=max_sequence_len-1),
        GRU(rnn_units),
        Dense(total_words, activation='softmax')
    ])
    
    return model

In [6]:
batch_size = 57
embedding_dim = 256
rnn_units = 1024

model = build_model(embedding_dim,
                    rnn_units,
                    batch_size)

In [7]:
EPOCHS = 40
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [8]:
history = model.fit(input_sequences, one_hot_labels, epochs=EPOCHS, callbacks=[checkpoint_callback])

Train on 9519 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [None]:
model = build_model(embedding_dim, rnn_units, batch_size=1)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

In [50]:
def generate_data(model, seed_text):
    output_len = 20000
    for _ in range(output_len):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        probabilites = model.predict(token_list)
        predicted_idx = np.random.choice(total_words, 1, p=probabilites[0])[0]
#         predicted_idx = tf.random.categorical(probabilities, num_samples=1)[-1, 0].numpy()
        output_word = index_to_word[predicted_idx]
        seed_text += " " + output_word
        
    return seed_text

In [51]:
OUTPUT_PATH = './outputs/generated_output_{0}.txt'
START_WORDS = ["the road", "my life", "the book"]
for i, word in enumerate(START_WORDS):
    output_file = open(OUTPUT_PATH.format(i), 'w+')
    output_file.write(generate_data(model, word))
    output_file.close()

In [45]:
# testing model generated text

seed_text = "the road"
next_words = 50

line
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    probabilities = model.predict(token_list)
#     choose_one = random.randint(0, 1)
#     if (choose_one == 0):
    predicted_idx = np.random.choice(total_words, 1, p=probabilities[0])[0]
#     else:
#     predicted_idx = tf.random.categorical(probabilities, num_samples=1)[-1, 0].numpy()
    output_word = index_to_word[predicted_idx]
    seed_text += " " + output_word
    
print(seed_text)

the road there if you'll let a guide direct you sometimes he liked best right ' he said 'i know ' he said 'it makes me in the cellar house spring as fast ' it said to let them look like me ' he said ' i can't decently refuse you '
