In [1]:
# Small GRU network to generate word-level text (trained on Shakespeare)
import numpy as np
import re
import os
from tensorflow.data import Dataset
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.train import latest_checkpoint
from tensorflow import TensorShape

import json
# --- Output locations ---
# Folder that the exported model information is stored within.
models_folder = "../textgeneration/frontend/models/words-new/"
# Folder used for the training checkpoints.
checkpoint_dir = "./checkpoints"
# File names used inside models_folder.
# Eventually, change these so a specific model can be selected.
token_map = "token-map.json"
model_file = "model.h5"

# --- Hyperparameters ---
# Length of the input sequences to be fed through the network.
seq_length = 20
batch_size = 64
embedding_dim = 400

In [2]:
# Load the training corpus, convert it to lowercase and split it into tokens.
# Alternate corpora that were used previously (kept for reference):
#discordf = "../messages/discord-messages.txt"
#discord = open(discordf, 'r', encoding='utf-8').read()
#fbf = "../messages/facebook-messages.txt"
#fb = open(fbf, 'r', encoding='utf-8').read()
#essayf = "../messages/essays.txt"
#essay = open(essayf, 'r', encoding='utf-8').read()
# Use a context manager so the file handle is closed as soon as the text is read.
with open("../messages/shakespeare.txt", 'r', encoding='utf-8') as corpus_file:
    shakespeare = corpus_file.read()

# Clean up the text a bit.
#raw_text = discord.lower() + "\n" + fb.lower() + "\n" + essay.lower()
raw_text = shakespeare.lower()
# Remove any non-ASCII characters.
raw_text = raw_text.encode("ascii", "ignore").decode()
# Strip out digits and some ASCII punctuation that isn't important for generation.
raw_text = re.sub(r"[~#$%&*+;<=>\[\\^_\]`{|}0-9\(\)\'\"\-\"\:\/]","",raw_text)
# A "word" is either a run of word characters or a single non-word character
# (punctuation, space, newline), so from here on raw_text is a list of tokens.
raw_text = re.findall(r"\w+|\W",raw_text)

In [3]:
# Create the mapping between unique tokens (words, punctuation, whitespace) and integers.
words = sorted(set(raw_text))
word_to_int = {c: i for i, c in enumerate(words)}
int_to_word = np.array(words)
text_as_int = np.array([word_to_int[word] for word in raw_text])

# Save the id -> token mapping, since it is needed to actually use the model.
# Create the output folder first so the write does not fail on a fresh checkout.
os.makedirs(models_folder, exist_ok=True)
with open(models_folder + token_map, 'w') as outfile:
    json.dump(int_to_word.tolist(), outfile)

# Summarize the loaded data. Note that n_chars counts tokens (words), not characters;
# the name is kept because other cells may refer to it.
n_chars = len(raw_text)
n_vocab = len(words)
print("Total Words: ", n_chars)
print("Total Vocab: ", n_vocab)

# Cut the token-id stream into chunks of seq_length + 1 ids; the incomplete
# final chunk is dropped.
char_dataset = Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

#The input is a sequence of seq_length tokens, and the target is the same sequence shifted by one
#position, so that it reveals one additional token.
def split_input_target(chunk):
    """Pair a chunk with its one-step-ahead shift.

    Returns a tuple ``(chunk[:-1], chunk[1:])``: everything but the last
    element as the input, and everything but the first element as the target.
    """
    return chunk[:-1], chunk[1:]
# Turn every chunk into an (input, target) pair, shuffle the pairs, and group
# them into batches of exactly batch_size (the incomplete last batch is dropped).
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(10000)
    .batch(batch_size, drop_remainder=True)
)

Total Words:  444980
Total Vocab:  12759


In [4]:
def get_model(batch_size, rnn_units=50, num_layers=4, dropout_rate=.2):
    """Build the stacked-GRU language model over word tokens.

    The network is an Embedding layer, followed by ``num_layers`` pairs of
    (stateful GRU, Dropout), followed by a Dense layer with ``n_vocab`` outputs.
    The Dense layer has no activation, so the model outputs raw logits.

    Reads the module-level ``n_vocab`` and ``embedding_dim``.

    Args:
        batch_size: Fixed batch size baked into the input shape (the GRU
            layers are stateful, so the batch size must be known up front).
        rnn_units: Number of units in every GRU layer.
        num_layers: Number of GRU + Dropout pairs to stack.
        dropout_rate: Rate passed to every Dropout layer.

    Returns:
        An uncompiled ``Sequential`` model. With the default arguments the
        architecture is the same as the original hand-written four-layer stack.
    """
    layers = [Embedding(n_vocab, embedding_dim, batch_input_shape=[batch_size, None])]
    # Layers are created in the same order as the hand-written list, so the
    # automatically generated layer names stay the same.
    for _ in range(num_layers):
        layers.append(GRU(rnn_units, return_sequences=True, stateful=True,
                          recurrent_initializer='glorot_uniform'))
        layers.append(Dropout(dropout_rate))
    layers.append(Dense(n_vocab))
    return Sequential(layers)
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    ``from_logits=True`` is required because the model's final Dense layer
    has no softmax activation.
    """
    return sparse_categorical_crossentropy(labels, logits, from_logits=True)


# Build the training network (stacked GRUs, see get_model) for the training
# batch size, compile it with the Adam optimizer, and print its layout.
model = get_model(batch_size)
model.compile(optimizer='adam', loss=loss)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 400)           5103600   
_________________________________________________________________
gru (GRU)                    (64, None, 50)            67800     
_________________________________________________________________
dropout (Dropout)            (64, None, 50)            0         
_________________________________________________________________
gru_1 (GRU)                  (64, None, 50)            15300     
_________________________________________________________________
dropout_1 (Dropout)          (64, None, 50)            0         
_________________________________________________________________
gru_2 (GRU)                  (64, None, 50)            15300     
_________________________________________________________________
dropout_2 (Dropout)          (64, None, 50)            0

In [5]:
# Perform the actual training/optimization.
# Checkpoints contain the weights only; "{epoch:02d}" in the file name is
# filled in with the epoch number.
filepath = os.path.join(checkpoint_dir, "weights-{epoch:02d}")
checkpoint = ModelCheckpoint(filepath, save_weights_only=True)
callbacks_list = [checkpoint]

# Fit the model on the batched (input, target) dataset.
model.fit(dataset, epochs=20, callbacks=callbacks_list)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x22c88d81f70>

In [6]:
# Rebuild the model with the trained weights, but with a batch size of 1 so
# that it isn't expecting batches, then export it for the frontend.
latest_weights = latest_checkpoint(checkpoint_dir)
# latest_checkpoint returns None when the folder holds no checkpoint; fail
# with a clear message instead of an obscure error inside load_weights.
if latest_weights is None:
    raise FileNotFoundError(
        "no checkpoint found in " + checkpoint_dir + "; run the training cell first")
model = get_model(1)
model.load_weights(latest_weights)
model.build(TensorShape([1, None]))
model.summary()
model.save(models_folder + model_file)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 400)            5103600   
_________________________________________________________________
gru_4 (GRU)                  (1, None, 50)             67800     
_________________________________________________________________
dropout_4 (Dropout)          (1, None, 50)             0         
_________________________________________________________________
gru_5 (GRU)                  (1, None, 50)             15300     
_________________________________________________________________
dropout_5 (Dropout)          (1, None, 50)             0         
_________________________________________________________________
gru_6 (GRU)                  (1, None, 50)             15300     
_________________________________________________________________
dropout_6 (Dropout)          (1, None, 50)            

In [7]:
import numpy as np
import tensorflow as tf
def generate_text(seed, temperature=.6, outputlen=1000):
    """Generate text by repeatedly sampling the next token from the saved model.

    The token map and the model are loaded from ``models_folder`` on every
    call. The seed is lowercased, tokenized, and fed through the model; the
    model's state then carries the context while one sampled token at a time
    is fed back in.

    NOTE(review): the original comment states that the web frontend uses this
    same procedure; if so, its seed tokenizer needs the same whitespace
    handling as the one below - confirm.

    Args:
        seed: Text to start from. Every token in it must be in the model's
            vocabulary.
        temperature: Positive number the logits are divided by before
            sampling. Higher temperatures inject more randomness into the text.
        outputlen: Number of tokens to generate.

    Returns:
        The lowercased seed followed by the generated tokens, joined without
        any separator (whitespace is itself a token).

    Raises:
        ValueError: If ``temperature`` is not positive or the seed is empty.
        KeyError: If the seed contains tokens missing from the vocabulary.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive, got %r" % (temperature,))
    seed = seed.lower()

    # Set up the maps needed for converting between text and model token ids.
    with open(models_folder + token_map) as json_file:
        int_to_word = json.load(json_file)
    # Reverse map, since the seed has to be converted to ids.
    word_to_int = {v: float(i) for (i, v) in enumerate(int_to_word)}

    # Tokenize the seed with the same pattern the training text was split with.
    # The previous pattern (r"\w+|[^\w\s]") silently dropped spaces and
    # newlines, although the model was trained with whitespace as tokens.
    seed_tokens = re.findall(r"\w+|\W", seed)
    if not seed_tokens:
        raise ValueError("seed must contain at least one token")
    unknown = sorted({tok for tok in seed_tokens if tok not in word_to_int})
    if unknown:
        raise KeyError(
            "seed contains tokens that are not in the model vocabulary: %r" % (unknown,))

    # Load the trained model from the model file.
    model = tf.keras.models.load_model(models_folder + model_file, compile=False)
    input_text = tf.expand_dims([word_to_int[tok] for tok in seed_tokens], 0)
    output_text = []
    model.reset_states()
    for _ in range(outputlen):
        # Run the input through the model and drop the batch dimension.
        predictions = tf.squeeze(model(input_text), 0)
        # Divide the logits by the temperature to skew the sampling distribution.
        predictions = predictions / temperature

        # Select the prediction randomly, by sampling according to the
        # prediction confidence; only the sample for the last position is used.
        predicted_int = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Pass the sampled token forward as the next input.
        input_text = tf.expand_dims([predicted_int], 0)
        output_text.append(int_to_word[predicted_int])
    return seed + ''.join(output_text)

In [8]:
# Sample from the exported model; print() is used so that the newlines in the
# generated text are rendered rather than shown as escape sequences.
print(generate_text("Romeo\n"))

romeo

i have in his way and the brother and the block with my view.

coriolanus
this lord, and i have a house of his years,
i were in her hands.

lucio
thou more a sun and he advise it they say,
that i have done to make him be to us,
sir and say you have a instrument to the heart to a crown.

escalus
o, he be my sweet hag, you is not a law,
sir, you shall have an justice of the devil
for this hast they command to the king.

leontes
no, sir, i would have a house of your secret of to me of the duke of the grave
then i have water on the countrymen of my name at the ear
for the chafed eye of the throat
for it is a loss that i have a crown of the bed
and thou art my lord, nor so not i be much than the death
of the signor catching time!

isabella
is you is a subject of the best.

prince henry laurence
i am a world of this eyes to not
that i prove it be too the golden day.

escalus
what, go to wail,
when thou eyes of a discontent of all it.

john of the time of all no noble of the chamber
of