In [1]:
import numpy as np
import re
import os
from tensorflow.data import Dataset
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.train import latest_checkpoint
from tensorflow import TensorShape

import json
models_folder = "../textgeneration/frontend/models/character-new/"#folder that the token map and the saved model are written to
checkpoint_dir="./checkpoints"#directory for training checkpoints, so an interrupted run still leaves a result
token_map = "token-map.json"#file name of the character vocabulary (token map), stored inside models_folder
model_file = "model.h5"#file name of the saved model, stored inside models_folder

seq_length = 100 #length, in characters, of each training sequence
batch_size = 64 #number of sequences per training batch. Note that this value was used to train on a GPU; it may not work as well on a CPU
embedding_dim = 256 #the dimension of the embedding layer.

In [2]:
# Load the text corpus and convert it to lowercase.
# Uncomment if using a local corpus:
#discordf = "../messages/discord-messages.txt"
#discord = open(discordf, 'r', encoding='utf-8').read()
#fbf = "../messages/facebook-messages.txt"
#fb = open(fbf, 'r', encoding='utf-8').read()
#essayf = "../messages/essays.txt"
#essay = open(essayf, 'r', encoding='utf-8').read()

# Merge the text of all three files together:
#raw_text = discord.lower() + "\n" + fb.lower() + "\n" + essay.lower()

# Load the Shakespeare text corpus. The context manager closes the file handle
# as soon as the text has been read, instead of leaving it open until the
# garbage collector gets to it.
with open("../messages/shakespeare.txt", 'r', encoding='utf-8') as corpus_file:
    shakespeare = corpus_file.read()

# Clean up the text a bit.
raw_text = shakespeare.lower()
# Remove any non-ASCII characters.
raw_text = raw_text.encode("ascii", "ignore").decode()
# Strip out digits and some ASCII symbols that aren't important (reduces the vocabulary).
raw_text = re.sub(r"[~#$%&*+;<=>\[\\^_\]`{|}0-9@/]","",raw_text)

In [3]:
# Create the mapping between unique characters and integer ids.
# `chars` is sorted, so a character's id is its position in the sorted vocabulary.
chars = sorted(set(raw_text))
char_to_int = {c: i for i, c in enumerate(chars)}
int_to_char = np.array(chars)

# Convert the entire corpus to integer ids using the mapping.
text_as_int = np.array([char_to_int[c] for c in raw_text])

# Save the character mapping, since it is needed to actually use the model.
# Create the output folder first so that a missing folder does not abort the
# notebook with FileNotFoundError.
os.makedirs(models_folder, exist_ok=True)
with open(models_folder + token_map, 'w') as outfile:
    json.dump(int_to_char.tolist(), outfile)

# Summarize the loaded data.
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

# Wrap the integer-encoded corpus in a TF Dataset, then cut it into consecutive
# chunks of seq_length + 1 characters; an incomplete final chunk is dropped.
char_dataset = Dataset.from_tensor_slices(tensors=text_as_int)
sequences = char_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)

# Each chunk becomes one training example: the input is the chunk without its
# last element and the target is the chunk without its first element, i.e. the
# same sequence shifted over by one character.
def split_input_target(chunk):
    """Return the ``(input, target)`` pair for one chunk.

    ``input`` is ``chunk[:-1]`` and ``target`` is ``chunk[1:]``.
    """
    return chunk[:-1], chunk[1:]
# Turn every chunk into an (input, target) pair, randomize the order of the
# pairs to help with training, then group them into batches of batch_size
# (an incomplete final batch is dropped).
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(10000)
    .batch(batch_size, drop_remainder=True)
)

Total Characters:  1111735
Total Vocab:  35


In [4]:
# The model is built by a function so that it can be rebuilt quickly with a
# different batch size: training uses `batch_size`, while the copy that is saved
# for generation is rebuilt with a batch size of 1.
def get_model(batch_size, rnn_units=500, num_rnn_layers=4, dropout_rate=.2):
    """Build the (uncompiled) character-level model.

    The network is an Embedding layer, followed by `num_rnn_layers` pairs of
    (stateful GRU returning full sequences, Dropout), followed by a Dense layer
    with one output per vocabulary entry. The Dense layer has no activation,
    so the model emits logits.

    Args:
        batch_size: fixed batch size baked into the input shape
            (`batch_input_shape=[batch_size, None]`).
        rnn_units: number of units in every GRU layer.
        num_rnn_layers: how many GRU + Dropout pairs to stack.
        dropout_rate: rate used by every Dropout layer.

    Returns:
        A `Sequential` model. With the default arguments it has the same
        layers, in the same order, as the original hand-written stack.
    """
    layers = [Embedding(n_vocab, embedding_dim, batch_input_shape=[batch_size, None])]
    for _ in range(num_rnn_layers):
        layers.append(GRU(rnn_units, return_sequences=True, stateful=True,
                          recurrent_initializer='glorot_uniform'))
        layers.append(Dropout(dropout_rate))
    layers.append(Dense(n_vocab))
    return Sequential(layers)
# Build the training model, sized for the training batch size.
model = get_model(batch_size=batch_size)
# Custom loss: the final Dense layer applies no softmax, so the model outputs
# logits and the cross-entropy has to be told to treat them as such.
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer `labels` and `logits`."""
    return sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )
# Compile with the Adam optimizer and the logits-aware loss, then print the layer summary.
model.compile(optimizer='adam', loss=loss)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           8960      
_________________________________________________________________
gru (GRU)                    (64, None, 500)           1137000   
_________________________________________________________________
dropout (Dropout)            (64, None, 500)           0         
_________________________________________________________________
gru_1 (GRU)                  (64, None, 500)           1503000   
_________________________________________________________________
dropout_1 (Dropout)          (64, None, 500)           0         
_________________________________________________________________
gru_2 (GRU)                  (64, None, 500)           1503000   
_________________________________________________________________
dropout_2 (Dropout)          (64, None, 500)           0

In [5]:
# Set up checkpointing: only the weights are saved, and the epoch number is
# formatted into the checkpoint file name.
filepath = os.path.join(checkpoint_dir, "weights-{epoch:02d}")
checkpoint = ModelCheckpoint(filepath=filepath, save_weights_only=True)
callbacks_list = [checkpoint]
# Train the actual model.
model.fit(x=dataset, epochs=30, callbacks=callbacks_list)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x2ad4f102d00>

In [6]:
# Rebuild the model with the trained weights, but with a batch size of 1 so it
# no longer expects training-sized batches.
latest_weights = latest_checkpoint(checkpoint_dir)
if latest_weights is None:
    # latest_checkpoint returns None when no checkpoint is found; fail with a
    # clear message instead of handing None to load_weights.
    raise FileNotFoundError("No checkpoint found in " + checkpoint_dir + "; train the model first.")
model = get_model(1)
model.load_weights(latest_weights)
model.build(TensorShape([1, None]))
# Save this as the actual model, because it is now in a format where it is
# usable: this batch-size-1 copy is the file that generate_text loads.
model.save(models_folder + model_file)

In [7]:
import numpy as np
import tensorflow as tf
# Generate text from a seed value. This is very similar to the code that the page uses for generation.
def generate_text(seed, temperature=.4, outputlen=1000):
    """Generate text that continues `seed`, one character at a time.

    The token map and the saved model are loaded from `models_folder` on every
    call. The seed is fed through the stateful model once; after that each
    sampled character is fed back in as the next input.

    Args:
        seed: text to start from. It is lowercased, and characters that are
            not in the saved vocabulary are dropped, because the model has no
            id for them.
        temperature: the logits are divided by this value before sampling.
            Higher temperatures inject more randomness into the text.
        outputlen: number of characters to generate.

    Returns:
        The lowercased, vocabulary-filtered seed followed by the generated
        characters.

    Raises:
        ValueError: if `temperature` is not positive, or if no character of
            the seed is in the vocabulary (the model needs at least one input
            character to predict from).
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Load the token mapping that was saved alongside the model.
    with open(models_folder + token_map) as json_file:
        int_to_char = json.load(json_file)
    # Reverse map (character -> integer id), needed to encode the seed.
    char_to_int = {ch: idx for idx, ch in enumerate(int_to_char)}

    # Keep only characters the model knows; an unknown character would
    # otherwise raise KeyError during encoding.
    seed = ''.join(ch for ch in seed.lower() if ch in char_to_int)
    if not seed:
        raise ValueError("seed must contain at least one character from the model vocabulary")

    # Load the model from the model file.
    model = tf.keras.models.load_model(models_folder + model_file, compile=False)
    # Encode the seed as integer ids and add the batch dimension.
    input_ids = tf.expand_dims([char_to_int[ch] for ch in seed], 0)

    output_text = []
    model.reset_states()

    for _ in range(outputlen):
        # Run the current input through the model and drop the batch dimension.
        predictions = tf.squeeze(model(input_ids), 0)
        # Only the last timestep predicts the next character, so only that row
        # is scaled by the temperature and sampled.
        last_logits = predictions[-1:] / temperature
        # Sample the next character id according to the prediction confidence.
        predicted_int = tf.random.categorical(last_logits, num_samples=1)[0, 0].numpy()
        # The model is stateful, so only the new character is fed back in.
        input_ids = tf.expand_dims([predicted_int], 0)
        output_text.append(int_to_char[predicted_int])
    return seed + ''.join(output_text)

In [8]:
# Generate and print a sample from a short seed.
print(generate_text(seed="I don't think that"))

i don't think that i have left all that i should be deserved
which now i can not be a princely sea the store,
or else he would have seen the crown, and that is not so dear a place.

lucio:
i am not in the state and courteous time
that he was not the souls of heaven is true.

lucentio:
i have said 'would they shall be so seem, when i am sorry,
see the dead man that may be made you so with a season all this is the world.

king richard ii:
thou didst resign the duke of norfolk, then.

buckingham:
what would you tell me what mean that would be conscience?

lucio:
here's a widow and her son of sorrow,
when he would be this and a brother's love,
in the common prince and means to prison.

clown:
in god's name, i am so far to speak a king.

gloucester:
and that i was more a thing that i am too for the first
can play the subjects for the beauty of the present court
and hear me speak at our fortune made me speak.

baptista:
ay, and be honest with that the people,
the thing i have forgot they sha