In [13]:
# import python libraries
import numpy as np
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.utils import to_categorical
from random import randint
import re
from keras.layers import Embedding

import nltk   # natural language tool kit library
nltk.download('gutenberg')  # downloads a library that NLTK uses

from nltk.corpus import gutenberg as gut  # downloads the gutenberg dataset
print(gut.fileids())    # prints the name of the files in the dataset


['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [14]:
# get the raw data poetry of blake and print first 500 word 
book_text = nltk.corpus.gutenberg.raw('blake-poems.txt')
print(book_text[1:500])

Poems by William Blake 1789]

 
SONGS OF INNOCENCE AND OF EXPERIENCE
and THE BOOK of THEL


 SONGS OF INNOCENCE
 
 
 INTRODUCTION
 
 Piping down the valleys wild,
   Piping songs of pleasant glee,
 On a cloud I saw a child,
   And he laughing said to me:
 
 "Pipe a song about a Lamb!"
   So I piped with merry cheer.
 "Piper, pipe that song again;"
   So I piped: he wept to hear.
 
 "Drop thy pipe, thy happy pipe;
   Sing thy songs of happy cheer:!"
 So I sang the same again,
   While he wept wi


In [12]:
len(book_text)


38153

# preprocessing data
To remove the punctuations and special characters, I define a function as you can see in below. This function get text and return clean and lower text

In [15]:
#data preprocessing
def text_preprocess(sen):
  # Remove punctuations and numbers
  sentence = re.sub(('[^a-zA-Z]'), " ", sen)

  # Single character removal
  sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

  # Removing multiple spaces
  sentence = re.sub(r'\s+', ' ', sentence)

  return sentence.lower()


book_text = text_preprocess(book_text)

In [16]:
book_text[0:300]

' poems by william blake songs of innocence and of experience and the book of thel songs of innocence introduction piping down the valleys wild piping songs of pleasant glee on cloud saw child and he laughing said to me pipe song about lamb so piped with merry cheer piper pipe that song again so pipe'

# as you know DL model only accept number to train so we need to convert Text to number to be ready for training our DL model. there are different approaches for this, but here I use a simple technique, named tokenisation.
 

In [17]:
len(book_text)

34028

In [18]:
# convert words to numbers
from nltk.tokenize import word_tokenize
nltk.download('punkt')
book_text_words = (word_tokenize(book_text))
n_words = len(book_text_words)
unique_words = len(set(book_text_words))

from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=unique_words)
tokenizer.fit_on_texts(book_text_words)


vocab_size = len(tokenizer.word_index) + 1    # word_index is the dictionary. Store the number of unique words in vocab_size variable
word_2_index = tokenizer.word_index           # store the dictionary in the variable called word_2_index

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [19]:
# Create the input sequences
input_sequence_words = []  # input sequences in words (used for metric evaluation later on)
input_sequence = []   # empty list to hold the sequences that will be input into our model
output_words = []     # empty list to hold the output words
input_seq_length = 25  # length of the input sequence
for i in range(0, n_words - input_seq_length , 1):
  in_seq = book_text_words[i:i + input_seq_length]
  input_sequence_words.append(in_seq)
  out_seq = book_text_words[i + input_seq_length]
  input_sequence.append([word_2_index[word] for word in in_seq])
  output_words.append(word_2_index[out_seq])


# reshape the input sequences to be 3-dimensional
X = np.reshape(input_sequence, (len(input_sequence), input_seq_length, 1))

# Normalise the data by dividing by the max number of unique words (the vocab size)
X = X / float(vocab_size)

# one-hot encode the output words so that they can be used by the model (converts the output to 2-dimensions)
y = to_categorical(output_words)


In [20]:
# # import python libraries
# import numpy as np
# from tensorflow.keras.models import Sequential, load_model
# from tensorflow.keras.layers import Dense, Embedding, LSTM
# from tensorflow.keras.utils import to_categorical
# from random import randint
# import re

# import nltk   # natural language tool kit library
# nltk.download('gutenberg')  # downloads a library that NLTK uses

# from nltk.corpus import gutenberg as gut  # downloads the gutenberg dataset
# print(gut.fileids())    # prints the name of the files in the dataset

# # get the book text
# book_text = nltk.corpus.gutenberg.raw('blake-poems.txt')

# # Data preprocessing
# def preprocess_text(sen):
#     # Remove punctuations and numbers
#     sentence = re.sub('[^a-zA-Z]', ' ', sen)

#     # Single character removal
#     sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

#     # Removing multiple spaces
#     sentence = re.sub(r'\s+', ' ', sentence)

#     return sentence.lower()
# book_text = preprocess_text(book_text)

# book_text = book_text[:5000]  # limit text to 5000, just for this exercise

# # convert words to numbers
# from nltk.tokenize import word_tokenize
# nltk.download('punkt')
# book_text_words = (word_tokenize(book_text))
# n_words = len(book_text_words)
# unique_words = len(set(book_text_words))

# from keras.preprocessing.text import Tokenizer
# tokenizer = Tokenizer(num_words=unique_words)
# tokenizer.fit_on_texts(book_text_words)

# vocab_size = len(tokenizer.word_index) + 1    # word_index is the dictionary. Store the number of unique words in vocab_size variable
# word_2_index = tokenizer.word_index           # store the dictionary in the variable called word_2_index

# # Create the input sequences
# input_sequence_words = []  # input sequences in words (used for metric evaluation later on)
# input_sequence = []   # empty list to hold the sequences that will be input into our model
# output_words = []     # empty list to hold the output words
# input_seq_length = 25  # length of the input sequence
# for i in range(0, n_words - input_seq_length , 1):
#     in_seq = book_text_words[i:i + input_seq_length]
#     input_sequence_words.append(in_seq)
#     out_seq = book_text_words[i + input_seq_length]
#     input_sequence.append([word_2_index[word] for word in in_seq])
#     output_words.append(word_2_index[out_seq])

# # reshape the input sequences to be 3-dimensional
# X = np.reshape(input_sequence, (len(input_sequence), input_seq_length, 1))

# # Normalise the data by dividing by the max number of unique words (the vocab size)
# #X = X / float(vocab_size)

# # one-hot encode the output words so that they can be used by the model (converts the output to 2-dimensions)
# y = to_categorical(output_words)

# create, compile and fit the model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=X.shape[1]))
model.add(LSTM(800, input_shape=(X.shape[1], X.shape[2]), return_sequences=False))
model.add(Dense(y.shape[1], activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, batch_size=64, epochs=100, verbose=1)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 50)            75300     
                                                                 
 lstm (LSTM)                 (None, 800)               2723200   
                                                                 
 dense (Dense)               (None, 1506)              1206306   
                                                                 
Total params: 4,004,806
Trainable params: 4,004,806
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 2

<keras.callbacks.History at 0x7f9178642ca0>

In [21]:
# Make Predictions
random_seq_index = np.random.randint(0, len(input_sequence)-1)    # select a random number from within the range of the number of input sequences
random_seq = input_sequence[random_seq_index]                     # get the input sequence that occurs at the randomly selected index (this is a list of integers)

index_2_word = dict(map(reversed, word_2_index.items())) # convert the integer sequence to its words
seed_word_sequence = [index_2_word[value] for value in random_seq]  # get the list of words that correspond to the integers in the randomly picked sequence

# join the words in the list and print the sequence of words
print(' '.join(seed_word_sequence))  # this prints the words from the randomly picked sequence that will be the seed for our prediction

# Predict next 100 words
word_sequence = []
for i in range(25):
    int_sample = np.reshape(random_seq, (1, len(random_seq), 1))    # reshape to make 3-D input (1 sequence, length of the sequence, 1 because the first LSTM requires another dimension)
    # int_sample = int_sample / float(vocab_size)                     # normalise (as we normalised the training data)

    predicted_word_index = model.predict(int_sample, verbose=0)     # predict the next word.  An array of the probabilities for each word in the vocab is returned.
    predicted_word_id = np.argmax(predicted_word_index)             # get the index of the maximum value (they are categorical so the max value gives the word in the vocab with the highest probability)
    word_sequence.append(index_2_word[ predicted_word_id])          # get the predicted word by finding the word at the predicted index and add it to our predicted word sequence list

    random_seq.append(predicted_word_id)                            # append the predicted word index to the next seuqence to be input into the model predict method
    random_seq = random_seq[1:len(random_seq)]                      # remove the first element of the sequence so it now has the new word but is the same length.



# BLEU score
# Make sure you are comparing like with like
# input sequences contain the words in lists
# join each sequence into a string so it can be compared with the final output which is a string
seq = [' '.join(w) for w in input_sequence_words]
from nltk.translate.bleu_score import sentence_bleu
reference = seq
candidate = ' '.join(word_sequence) # make the list of predicted words into a string
score = sentence_bleu(reference, candidate)
print('Seed word sequence: %s'%(' '.join(seed_word_sequence)))
print('Predicted words: %s'%(candidate))
print('BLEU Score for predicted words: %s'%(score))

of truth new born doubt is fled and clouds of reason dark disputes and artful teazing folly is an endless maze tangled roots perplex her
Seed word sequence: of truth new born doubt is fled and clouds of reason dark disputes and artful teazing folly is an endless maze tangled roots perplex her
Predicted words: the the the the the the the the the the the the the the the the the the the the the the the the the
BLEU Score for predicted words: 0.3990021400109723


# model 1

In [22]:
# create, compile and fit the model
model = Sequential()
model.add(LSTM(800, input_shape=(X.shape[1], X.shape[2]), return_sequences=False))
#model.add(LSTM(800, return_sequences=False))
model.add(Dense(y.shape[1], activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, batch_size=32, epochs=100, verbose=1)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_1 (LSTM)               (None, 800)               2566400   
                                                                 
 dense_1 (Dense)             (None, 1506)              1206306   
                                                                 
Total params: 3,772,706
Trainable params: 3,772,706
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 3

<keras.callbacks.History at 0x7f9178034df0>

In [23]:
# Make Predictions
random_seq_index = np.random.randint(0, len(input_sequence)-1)    # select a random number from within the range of the number of input sequences
random_seq = input_sequence[random_seq_index]                     # get the input sequence that occurs at the randomly selected index (this is a list of integers)

index_2_word = dict(map(reversed, word_2_index.items())) # convert the integer sequence to its words
seed_word_sequence = [index_2_word[value] for value in random_seq]  # get the list of words that correspond to the integers in the randomly picked sequence

# join the words in the list and print the sequence of words
print(' '.join(seed_word_sequence))  # this prints the words from the randomly picked sequence that will be the seed for our prediction

# Predict next 100 words
word_sequence = []
for i in range(25):
    int_sample = np.reshape(random_seq, (1, len(random_seq), 1))    # reshape to make 3-D input (1 sequence, length of the sequence, 1 because the first LSTM requires another dimension)
    int_sample = int_sample / float(vocab_size)                     # normalise (as we normalised the training data)

    predicted_word_index = model.predict(int_sample, verbose=0)     # predict the next word.  An array of the probabilities for each word in the vocab is returned.
    predicted_word_id = np.argmax(predicted_word_index)             # get the index of the maximum value (they are categorical so the max value gives the word in the vocab with the highest probability)
    word_sequence.append(index_2_word[ predicted_word_id])          # get the predicted word by finding the word at the predicted index and add it to our predicted word sequence list

    random_seq.append(predicted_word_id)                            # append the predicted word index to the next seuqence to be input into the model predict method
    random_seq = random_seq[1:len(random_seq)]                      # remove the first element of the sequence so it now has the new word but is the same length.

# BLEU score
seq = [' '.join(w) for w in input_sequence_words]
from nltk.translate.bleu_score import sentence_bleu
reference = seq
candidate = ' '.join(word_sequence) # make the list of words into a string
score = sentence_bleu(reference, candidate)
print('Seed word sequence: %s'%(' '.join(seed_word_sequence)))
print('Predicted words: %s'%(candidate))
print('BLEU Score for predicted words: %s'%(score))

the sun does arise and make happy the skies the merry bells ring to welcome the spring the skylark and thrush the birds of the
Seed word sequence: the sun does arise and make happy the skies the merry bells ring to welcome the spring the skylark and thrush the birds of the
Predicted words: bush sing louder around to the bells cheerful sound while our sports shall be seen on the echoing green old john with white hair does
BLEU Score for predicted words: 1.0


In [None]:
# Predict next 25 words
word_sequnce = []
for i in range(25):
  int_sample = np.reshape(random_seq, (1,len(random_seq), 1))     # reshape to make 3-D input (1 sequence, length of the sequence, 1 because the first LSTM requires another dimension)


  predicted_word_index = model.predict(int_sample, verbose=0)   # predict the next word.  An array of the probabilities for each word in the vocab is returned.
  predict_word_id = np.argmax(predicted_word_index)
  word_sequnce.append(index_2_word[predict_word_id])

  random_seq.append(predict_word_id)            # append the predicted word index to the next seuqence to be input into the model predict method
  random_seq = random_seq[1:len(random_seq)]    # remove the first element of the sequence so it now has the new word but is the same length.


# BLEU score
seq = [' '.join(w) for w in input_sequence_words]
from nltk.translate.bleu_score import sentence_bleu
refrence = seq
candidate = ' '.join(word_sequnce )  # make the list of words into a string
score = sentence_bleu(refrence, candidate )
print('Seed word sequence: %s'%(' '.join(seed_word_sequence)))
print('Predicted words: %s'%(candidate))
print('BLEU Score for predicted words: %s'%(score))


Seed word sequence: glee on cloud saw child and he laughing said to me pipe song about lamb so piped with merry cheer piper pipe that song again
Predicted words: of happy cheer so sang the same again while he wept with joy to hear piper sit thee down and write in book that all
BLEU Score for predicted words: 1.0


#Model 2 

In [None]:
# create, compile and fit the model
model = Sequential()
#model.add(Embedding(vocab_size, 50, input_length=X.shape[1]))
model.add(LSTM(800, input_shape=(X.shape[1], X.shape[2]), return_sequences=False))
#model.add(Dropout(0.5))
model.add(Dense(y.shape[1], activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, batch_size=64, epochs=20, verbose=1)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 800)               2566400   
                                                                 
 dense_2 (Dense)             (None, 389)               311589    
                                                                 
Total params: 2,877,989
Trainable params: 2,877,989
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fbba6f54450>

In [None]:
# Make Predictions
random_seq_index = np.random.randint(0, len(input_sequence)-1)    # select a random number from within the range of the number of input sequences
random_seq = input_sequence[random_seq_index]                     # get the input sequence that occurs at the randomly selected index (this is a list of integers)

index_2_word = dict(map(reversed, word_2_index.items())) # convert the integer sequence to its words
seed_word_sequence = [index_2_word[value] for value in random_seq]  # get the list of words that correspond to the integers in the randomly picked sequence

# join the words in the list and print the sequence of words
print(' '.join(seed_word_sequence))  # this prints the words from the randomly picked sequence that will be the seed for our prediction

# Predict next 100 words
word_sequence = []
for i in range(25):
    int_sample = np.reshape(random_seq, (1, len(random_seq), 1))    # reshape to make 3-D input (1 sequence, length of the sequence, 1 because the first LSTM requires another dimension)
    # int_sample = int_sample / float(vocab_size)                     # normalise (as we normalised the training data)

    predicted_word_index = model.predict(int_sample, verbose=0)     # predict the next word.  An array of the probabilities for each word in the vocab is returned.
    predicted_word_id = np.argmax(predicted_word_index)             # get the index of the maximum value (they are categorical so the max value gives the word in the vocab with the highest probability)
    word_sequence.append(index_2_word[ predicted_word_id])          # get the predicted word by finding the word at the predicted index and add it to our predicted word sequence list

    random_seq.append(predicted_word_id)                            # append the predicted word index to the next seuqence to be input into the model predict method
    random_seq = random_seq[1:len(random_seq)]                      # remove the first element of the sequence so it now has the new word but is the same length.



# BLEU score
# Make sure you are comparing like with like
# input sequences contain the words in lists
# join each sequence into a string so it can be compared with the final output which is a string
seq = [' '.join(w) for w in input_sequence_words]
from nltk.translate.bleu_score import sentence_bleu
reference = seq
candidate = ' '.join(word_sequence) # make the list of predicted words into a string
score = sentence_bleu(reference, candidate)
print('Seed word sequence: %s \n'%(' '.join(seed_word_sequence)))
print('Predicted words: %s  \n'%(candidate))
print('BLEU Score for predicted words: %s  \n'%(score))

no no never can it be never never can it be and can he who smiles on all hear the wren with sorrows small hear
Seed word sequence: no no never can it be never never can it be and can he who smiles on all hear the wren with sorrows small hear 

Predicted words: and the and and and and in and in and in and in and in and in and in and in and in in in  

BLEU Score for predicted words: 0.5566387983012375  



# Model 3

In [None]:

# create, compile and fit the model
model = Sequential()
#model.add(Embedding(vocab_size, 50, input_length=X.shape[1]))
model.add(LSTM(800, input_shape=(X.shape[1], X.shape[2]), return_sequences=False))
model.add(Dense(y.shape[1], activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, batch_size=64, epochs=20, verbose=1)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 800)               2566400   
                                                                 
 dense_3 (Dense)             (None, 389)               311589    
                                                                 
Total params: 2,877,989
Trainable params: 2,877,989
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fbba6aa4510>

In [None]:
# Make Predictions
random_seq_index = np.random.randint(0, len(input_sequence)-1)    # select a random number from within the range of the number of input sequences
random_seq = input_sequence[random_seq_index]                     # get the input sequence that occurs at the randomly selected index (this is a list of integers)

index_2_word = dict(map(reversed, word_2_index.items())) # convert the integer sequence to its words
seed_word_sequence = [index_2_word[value] for value in random_seq]  # get the list of words that correspond to the integers in the randomly picked sequence

# join the words in the list and print the sequence of words
print(' '.join(seed_word_sequence))  # this prints the words from the randomly picked sequence that will be the seed for our prediction

# Predict next 100 words
word_sequence = []
for i in range(25):
    int_sample = np.reshape(random_seq, (1, len(random_seq), 1))    # reshape to make 3-D input (1 sequence, length of the sequence, 1 because the first LSTM requires another dimension)
    # int_sample = int_sample / float(vocab_size)                     # normalise (as we normalised the training data)

    predicted_word_index = model.predict(int_sample, verbose=0)     # predict the next word.  An array of the probabilities for each word in the vocab is returned.
    predicted_word_id = np.argmax(predicted_word_index)             # get the index of the maximum value (they are categorical so the max value gives the word in the vocab with the highest probability)
    word_sequence.append(index_2_word[ predicted_word_id])          # get the predicted word by finding the word at the predicted index and add it to our predicted word sequence list

    random_seq.append(predicted_word_id)                            # append the predicted word index to the next seuqence to be input into the model predict method
    random_seq = random_seq[1:len(random_seq)]                      # remove the first element of the sequence so it now has the new word but is the same length.

# BLEU score
seq = [' '.join(w) for w in input_sequence_words]
from nltk.translate.bleu_score import sentence_bleu
reference = seq
candidate = ' '.join(word_sequence) # make the list of words into a string
score = sentence_bleu(reference, candidate)
print('Seed word sequence: %s'%(' '.join(seed_word_sequence)))
print('Predicted words: %s'%(candidate))
print('BLEU Score for predicted words: %s'%(score))

dost thou know who made thee gave thee life and bid thee feed by the stream and er the mead gave thee clothing of delight
Seed word sequence: dost thou know who made thee gave thee life and bid thee feed by the stream and er the mead gave thee clothing of delight
Predicted words: softest clothing wolly bright gave thee such tender voice making all the vales rejoice little lamb who made thee dost thou know who made thee
BLEU Score for predicted words: 1.0


In [None]:
model.trainable = True


In [None]:
# Let's take a look to see how many layers are in the base model
print("Number of layers in the base model: ", len(model.layers))

Number of layers in the base model:  2


In [None]:
# Fine-tune from this layer onwards
fine_tune_at = 3

# Freeze all the layers before the `fine_tune_at` layer
for layer in model.layers[:fine_tune_at]:
  layer.trainable =  False

In [None]:
fine_tune_epochs = 50
total_epochs =  99 + fine_tune_epochs

history_fine = model.fit(X,y,
                         epochs=total_epochs,
                         initial_epoch=99,)

Epoch 100/149
Epoch 101/149
Epoch 102/149
Epoch 103/149
Epoch 104/149
Epoch 105/149
Epoch 106/149
Epoch 107/149
Epoch 108/149
Epoch 109/149
Epoch 110/149
Epoch 111/149
Epoch 112/149
Epoch 113/149
Epoch 114/149
Epoch 115/149
Epoch 116/149
Epoch 117/149
Epoch 118/149
Epoch 119/149
Epoch 120/149
Epoch 121/149
Epoch 122/149
Epoch 123/149
Epoch 124/149
Epoch 125/149
Epoch 126/149
Epoch 127/149
Epoch 128/149
Epoch 129/149
Epoch 130/149
Epoch 131/149
Epoch 132/149
Epoch 133/149
Epoch 134/149
Epoch 135/149
Epoch 136/149
Epoch 137/149
Epoch 138/149
Epoch 139/149
Epoch 140/149
Epoch 141/149
Epoch 142/149
Epoch 143/149
Epoch 144/149
Epoch 145/149
Epoch 146/149
Epoch 147/149
Epoch 148/149
Epoch 149/149


In [None]:
# Make Predictions
random_seq_index = np.random.randint(0, len(input_sequence)-1)    # select a random number from within the range of the number of input sequences
random_seq = input_sequence[random_seq_index]                     # get the input sequence that occurs at the randomly selected index (this is a list of integers)

index_2_word = dict(map(reversed, word_2_index.items())) # convert the integer sequence to its words
seed_word_sequence = [index_2_word[value] for value in random_seq]  # get the list of words that correspond to the integers in the randomly picked sequence

# join the words in the list and print the sequence of words
print(' '.join(seed_word_sequence))  # this prints the words from the randomly picked sequence that will be the seed for our prediction

# Predict next 100 words
word_sequence = []
for i in range(25):
    int_sample = np.reshape(random_seq, (1, len(random_seq), 1))    # reshape to make 3-D input (1 sequence, length of the sequence, 1 because the first LSTM requires another dimension)
    # int_sample = int_sample / float(vocab_size)                     # normalise (as we normalised the training data)

    predicted_word_index = model.predict(int_sample, verbose=0)     # predict the next word.  An array of the probabilities for each word in the vocab is returned.
    predicted_word_id = np.argmax(predicted_word_index)             # get the index of the maximum value (they are categorical so the max value gives the word in the vocab with the highest probability)
    word_sequence.append(index_2_word[ predicted_word_id])          # get the predicted word by finding the word at the predicted index and add it to our predicted word sequence list

    random_seq.append(predicted_word_id)                            # append the predicted word index to the next seuqence to be input into the model predict method
    random_seq = random_seq[1:len(random_seq)]                      # remove the first element of the sequence so it now has the new word but is the same length.

# BLEU score
seq = [' '.join(w) for w in input_sequence_words]
from nltk.translate.bleu_score import sentence_bleu
reference = seq
candidate = ' '.join(word_sequence) # make the list of words into a string
score = sentence_bleu(reference, candidate)
print('Seed word sequence: %s'%(' '.join(seed_word_sequence)))
print('Predicted words: %s'%(candidate))
print('BLEU Score for predicted words: %s'%(score))

little child a child and thou lamb we are called by his name little lamb god bless thee little lamb god bless thee the little
Seed word sequence: little child a child and thou lamb we are called by his name little lamb god bless thee little lamb god bless thee the little
Predicted words: black boy my mother bore me in the southern wild and am black but oh my soul is white white as an angel is the
BLEU Score for predicted words: 1.0
