In [122]:
# Small character-level GRU network that generates text from chat messages and essays
import json
import os
import re

import numpy as np
import tensorflow as tf
from tensorflow.data import Dataset
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import LSTM
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
# --- Output locations -------------------------------------------------------
# Folder that receives the saved character map and the exported model.
# The trailing slash matters: file names are appended by plain concatenation.
models_folder = "../textgeneration/frontend/models/"
# Folder that receives the per-epoch weight checkpoints written during training.
checkpoint_dir = "./checkpoints"

# --- Artifact file names (inside models_folder) -----------------------------
# TODO: make these configurable so a specific model can be selected.
model_file = "model.h5"
character_map = "character-map.json"

# --- Hyperparameters --------------------------------------------------------
# Number of characters in each input sequence fed to the network.
seq_length = 100
# Number of sequences per training batch.
batch_size = 64
# Width of the embedding vector learned for each character.
embedding_dim = 256

In [123]:
# Load the three text corpora, convert them to lowercase, and strip the
# characters we do not want in the vocabulary.
def _read_text(path):
    """Return the entire contents of the UTF-8 text file at `path`."""
    # The context manager closes the file handle as soon as the text is read,
    # instead of leaving it open until garbage collection.
    with open(path, 'r', encoding='utf-8') as text_file:
        return text_file.read()

discordf = "../messages/discord-messages.txt"
discord = _read_text(discordf)
fbf = "../messages/facebook-messages.txt"
fb = _read_text(fbf)
essayf = "../messages/essays.txt"
essay = _read_text(essayf)

# Clean up the text a bit: join the lowercased corpora with newlines ...
raw_text = discord.lower() + "\n" + fb.lower() + "\n" + essay.lower()
# ... drop every non-ASCII character ...
raw_text = raw_text.encode("ascii", "ignore").decode()
# ... and strip digits plus some ASCII symbols that aren't important for generation.
raw_text = re.sub(r"[~#$%&*+;<=>\[\\^_\]`{|}0-9@/]","",raw_text)

In [124]:
# Vocabulary: every distinct character in the corpus, in sorted order.
chars = sorted(set(raw_text))
# Forward lookup (character -> index) and reverse lookup (index -> character).
char_to_int = dict(zip(chars, range(len(chars))))
int_to_char = np.array(chars)
# The whole corpus encoded as an array of vocabulary indices.
text_as_int = np.array([char_to_int[ch] for ch in raw_text])

# Persist the index -> character table; it is required to use the saved model
# later (generate_text reads this file back).
with open(models_folder + character_map, 'w') as outfile:
    json.dump(int_to_char.tolist(), outfile)

# Report the size of the corpus and of the vocabulary.
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

# Cut the encoded text into chunks of seq_length + 1 characters; the extra
# character lets each chunk be split into an input and a shifted target.
# A trailing chunk that is too short is dropped.
char_dataset = Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

def split_input_target(chunk):
    """Split one chunk into an (input, target) pair offset by one position.

    The input is every element of `chunk` except the last; the target is
    every element except the first. Both have the same length, and
    target[i] is the element that follows input[i] in the chunk.
    """
    return chunk[:-1], chunk[1:]
# Turn every chunk into an (input, target) pair, shuffle with a buffer of
# 10000 elements, and group into batches of batch_size (a short final batch
# is dropped).
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(10000)
    .batch(batch_size, drop_remainder=True)
)

Total Characters:  1547827
Total Vocab:  70


In [125]:
def get_model(batch_size, rnn_units=500, n_layers=4, dropout_rate=.2):
    """Build the stacked-GRU character model for a fixed batch size.

    The network is: an Embedding over the vocabulary, `n_layers` repetitions
    of (stateful GRU -> Dropout), and a final Dense layer that emits one
    value per vocabulary entry for every time step. The Dense layer has no
    activation, so its outputs are raw scores.

    Args:
        batch_size: Number of sequences per batch. It is baked into the
            model's input shape, so a model built here only accepts batches
            of exactly this size; the sequence length is left unspecified.
        rnn_units: Number of units in each GRU layer.
        n_layers: Number of GRU + Dropout pairs to stack.
        dropout_rate: Rate passed to each Dropout layer.

    Returns:
        An uncompiled `Sequential` model. With the default arguments the
        layer stack is the same as the original hand-written one
        (four GRU(500) layers, each followed by Dropout(.2)).
    """
    layers = [Embedding(n_vocab, embedding_dim, batch_input_shape=[batch_size, None])]
    for _ in range(n_layers):
        # return_sequences=True keeps an output for every time step so the
        # next layer (and finally Dense) sees the whole sequence.
        layers.append(GRU(rnn_units, return_sequences=True, stateful=True,
                          recurrent_initializer='glorot_uniform'))
        layers.append(Dropout(dropout_rate))
    layers.append(Dense(n_vocab))
    return Sequential(layers)
# Instantiate the GRU model used for training; its input shape is fixed to
# batch_size sequences per batch.
model = get_model(batch_size=batch_size)
def loss(labels, logits):
    """Sparse categorical cross-entropy between `labels` and `logits`.

    `from_logits=True` is passed, so `logits` are treated as raw,
    unnormalized scores rather than probabilities.
    """
    cross_entropy = sparse_categorical_crossentropy(labels, logits, from_logits=True)
    return cross_entropy
# Configure training with the Adam optimizer and the logits-based loss
# defined above, then print the layer-by-layer summary.
model.compile(optimizer='adam', loss=loss)
model.summary()

Model: "sequential_24"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_25 (Embedding)     (64, None, 256)           17920     
_________________________________________________________________
gru_46 (GRU)                 (64, None, 500)           1137000   
_________________________________________________________________
dropout_28 (Dropout)         (64, None, 500)           0         
_________________________________________________________________
gru_47 (GRU)                 (64, None, 500)           1503000   
_________________________________________________________________
dropout_29 (Dropout)         (64, None, 500)           0         
_________________________________________________________________
gru_48 (GRU)                 (64, None, 500)           1503000   
_________________________________________________________________
dropout_30 (Dropout)         (64, None, 500)         

In [126]:
# Perform the actual training/optimization.
# Checkpoints contain weights only and are named by epoch number inside
# checkpoint_dir.
filepath = os.path.join(checkpoint_dir, "weights-{epoch:02d}")
checkpoint = ModelCheckpoint(filepath=filepath, save_weights_only=True)
callbacks_list = [checkpoint]
# Fit the model: 30 passes over the batched dataset.
model.fit(dataset, epochs=30, callbacks=callbacks_list)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x10acf9c4f40>

In [127]:
# Rebuild the model with the trained weights, but with a batch size of 1 so
# it no longer expects full training batches, then save it to
# models_folder + model_file.
# Requires TensorFlow to be imported as `tf` before this cell runs.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    # latest_checkpoint() returns None when the directory holds no
    # checkpoint; fail with a clear message instead of passing None on to
    # load_weights.
    raise FileNotFoundError(
        "No checkpoint found in " + repr(checkpoint_dir) + "; run the training cell first."
    )
model = get_model(1)
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))
model.summary()
model.save(models_folder + model_file)

Model: "sequential_25"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_26 (Embedding)     (1, None, 256)            17920     
_________________________________________________________________
gru_50 (GRU)                 (1, None, 500)            1137000   
_________________________________________________________________
dropout_32 (Dropout)         (1, None, 500)            0         
_________________________________________________________________
gru_51 (GRU)                 (1, None, 500)            1503000   
_________________________________________________________________
dropout_33 (Dropout)         (1, None, 500)            0         
_________________________________________________________________
gru_52 (GRU)                 (1, None, 500)            1503000   
_________________________________________________________________
dropout_34 (Dropout)         (1, None, 500)          

In [133]:
import numpy as np
import tensorflow as tf
def generate_text(seed, temperature=.5, outputlen=1000):
    """Generate text from the saved model, starting from `seed`.

    The character map and the model are loaded from `models_folder` on every
    call. The model is first fed the encoded seed, then repeatedly fed its
    own last sampled character, one character per step.

    Args:
        seed: Prompt text. It is lowercased before use. Characters that are
            not present in the saved character map are skipped when the seed
            is encoded (they still appear in the returned string).
        temperature: Positive number the model outputs are divided by before
            sampling. Values below 1 sharpen the distribution (less random
            output); values above 1 flatten it (more random output).
        outputlen: Number of characters to generate.

    Returns:
        The lowercased seed followed by the generated characters.

    Raises:
        ValueError: If `temperature` is not positive, or if no character of
            the seed is present in the character map.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive, got " + repr(temperature))
    seed = seed.lower()

    # Set up the maps needed to convert between text and model indices.
    with open(models_folder + character_map) as json_file:
        int_to_char = json.load(json_file)
    # Reverse map for encoding the seed. Indices are kept as floats, as in
    # the original implementation.
    char_to_int = {v: float(i) for (i, v) in enumerate(int_to_char)}

    # Encode the seed, skipping characters the model has no index for
    # (previously such a character raised KeyError).
    input_text = [char_to_int[c] for c in seed if c in char_to_int]
    if not input_text:
        raise ValueError("seed contains no characters from the model vocabulary")

    # Load the saved model; compile=False because it is only used for inference.
    model = tf.keras.models.load_model(models_folder + model_file, compile=False)
    input_text = tf.expand_dims(input_text, 0)  # add the batch dimension
    output_text = []
    # The GRU layers are stateful: clear any state before priming with the seed.
    model.reset_states()
    for _ in range(outputlen):
        # Run the current input through the model and drop the batch dimension.
        predictions = model(input_text)
        predictions = tf.squeeze(predictions, 0)
        # Scale by the temperature before sampling.
        predictions = predictions / temperature

        # Sample one index per time step according to the scaled scores and
        # keep the sample for the last time step.
        predicted_int = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
        # Feed only the sampled character back in; the stateful layers carry
        # the earlier context.
        input_text = tf.expand_dims([predicted_int], 0)
        output_text.append(int_to_char[predicted_int])
    return seed + ''.join(output_text)

In [134]:
# Generate a sample from the saved model using a short prompt and print it.
print(generate_text(seed="I don't think that"))

i don't think that's what we all are all approaching the show so much more for the bottom with an assumption that other things are all a company and then we take it to be closely done as a massive file, which is a lot of people who have to do with the side of secure deletion of cases, so i don't think it was a lot to play the really like that
there are significant wealth and profit more people are actually on the problem of the standard for the thing, but that seems to be a straight up for the ball of the prize of it and then look fine. i think i can totally like the first problem though, the next server that the device is more of a place in the same price of potential and the last sense of deep how another distribution that it was a different environment of called part of the character is wrong, but it is a silicon valley though, even if it is a second regular state of company and an advanced and make me on the first one when they do the story is interesting. i think the first one was