<a href="https://colab.research.google.com/github/YazCodes/Deep-Learning-projects/blob/main/Prediction4_Poem_Two.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# import python libraries
import numpy as np
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras.utils import to_categorical
from random import randint
import re

In [2]:
import nltk   # natural language tool kit library
nltk.download('gutenberg')  # downloads a library that NLTK uses

from nltk.corpus import gutenberg as gut  # downloads the gutenberg dataset
print(gut.fileids())    # prints the name of the files in the dataset

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [3]:
# get the book text
book_text = nltk.corpus.gutenberg.raw('blake-poems.txt')

In [4]:
# print the first 500 characters of the text so we can look at it
print(book_text[:500])

[Poems by William Blake 1789]

 
SONGS OF INNOCENCE AND OF EXPERIENCE
and THE BOOK of THEL


 SONGS OF INNOCENCE
 
 
 INTRODUCTION
 
 Piping down the valleys wild,
   Piping songs of pleasant glee,
 On a cloud I saw a child,
   And he laughing said to me:
 
 "Pipe a song about a Lamb!"
   So I piped with merry cheer.
 "Piper, pipe that song again;"
   So I piped: he wept to hear.
 
 "Drop thy pipe, thy happy pipe;
   Sing thy songs of happy cheer:!"
 So I sang the same again,
   While he wept wi


Data preprocessing

In [5]:
def preprocess_text(sen):
    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sen)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence.lower()

In [6]:
book_text = preprocess_text(book_text)
book_text[:500]

' poems by william blake songs of innocence and of experience and the book of thel songs of innocence introduction piping down the valleys wild piping songs of pleasant glee on cloud saw child and he laughing said to me pipe song about lamb so piped with merry cheer piper pipe that song again so piped he wept to hear drop thy pipe thy happy pipe sing thy songs of happy cheer so sang the same again while he wept with joy to hear piper sit thee down and write in book that all may read so he vanish '

In [7]:
# increasing the amount of data from 20000 to 30000
print(len(book_text))
book_text = book_text[:30000]
print(len(book_text))

34028
30000


Conveting the words to  numbers

In [8]:
from nltk.tokenize import word_tokenize
# punkt is a sentence tokenizer that nltk requires. 
# It divides a text into a list of sentences, by using an unsupervised algorithm 
# to build a model for abbreviation words, collocations, and words that start sentences
nltk.download('punkt')

book_text_words = (word_tokenize(book_text))
n_words = len(book_text_words)
unique_words = len(set(book_text_words))

print('Total Words: %d' % n_words)
print('Unique Words: %d' % unique_words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Total Words: 5811
Unique Words: 1381


In [9]:
# convert words to numbers
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=900)
tokenizer.fit_on_texts(book_text_words)

In [10]:
vocab_size = len(tokenizer.word_index) + 1    # word_index is the dictionary. Store the number of unique words in vocab_size variable
word_2_index = tokenizer.word_index           # store the dictionary in the variable called word_2_index

In [11]:
#print the 500th word in the dictionary and it's index - the word kissed is now a number 247!
print(book_text_words[500])
print(word_2_index[book_text_words[500]])

kissed
329


creating input sequences

In [12]:
input_sequence_words = []  # input sequences in words (used for metric evaluation later on)
input_sequence = []   # empty list to hold the sequences that will be input into our model
output_words = []     # empty list to hold the output words
input_seq_length = 25  # length of the input sequence

# form the input sequence list and the output words list
for i in range(0, n_words - input_seq_length , 1):
    in_seq = book_text_words[i:i + input_seq_length]
    input_sequence_words.append(in_seq)
    out_seq = book_text_words[i + input_seq_length]
    input_sequence.append([word_2_index[word] for word in in_seq])
    output_words.append(word_2_index[out_seq])

In [13]:
# print the first sequence to see what it looks like - a list of 100 integers that represent the first observation of words
print(len(input_sequence))      # print the number of input sequences
print(input_sequence[0])        # print the first input sequence
print(len(input_sequence[0]))   # print the length of the first input sequence

5786
[644, 35, 417, 251, 145, 4, 309, 2, 4, 310, 2, 1, 169, 4, 76, 145, 4, 309, 418, 311, 55, 1, 312, 108, 311]
25


In [14]:
# reshape the input sequences to be 3-dimensional
# X = np.reshape(input_sequence, (3562, 100, 1))    # number of input sequences, length of each sequence
X = np.reshape(input_sequence, (len(input_sequence), input_seq_length, 1))

# Normalize the data by dividing by the max number of unique words (the vocab size)
X = X / float(vocab_size)

# one-hot encode the output words so that they can be used by the model (converts the output to 2-dimensions)
y = to_categorical(output_words)

In [22]:
print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (6565, 25, 1)
y shape: (6565, 1506)


Create, compile and fit the model 

In [30]:
model = Sequential()
# LSTM layer has 800 neurons (units).  The input shape is (100, 1) (Number of words in a sequence, 1 to make it 2D data) (Number of time-steps, features per time-step)
model.add(LSTM(900, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))

model.add(LSTM(900, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(900, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(900))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

model.summary()

# the output word can be one of any of the unique words in the vocabulary
# This means it is a multi-class calssification problem and we use the categorical crossentropy loss function
model.compile(loss='categorical_crossentropy', optimizer='adam')

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_9 (LSTM)                (None, 25, 900)           3247200   
_________________________________________________________________
lstm_10 (LSTM)               (None, 25, 900)           6483600   
_________________________________________________________________
dropout_6 (Dropout)          (None, 25, 900)           0         
_________________________________________________________________
lstm_11 (LSTM)               (None, 25, 900)           6483600   
_________________________________________________________________
dropout_7 (Dropout)          (None, 25, 900)           0         
_________________________________________________________________
lstm_12 (LSTM)               (None, 900)               6483600   
_________________________________________________________________
dropout_8 (Dropout)          (None, 900)              

In [31]:
model.fit(X, y, batch_size=32, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f0d386227d0>

Making predictions

In [32]:
# randomly select a sequence of integers from the input sequences
random_seq_index = np.random.randint(0, len(input_sequence)-1)
random_seq = input_sequence[random_seq_index]

# convert the integer sequence to its words
# word_2_index contains a dictionary of the format word : index (word being the key and index being the value)
# the next line of code reverses this to index: word (index now being the key and word is now the value)
# this reversed dictionary can now be used by supplying an index to it, and the word will be returned
index_2_word = dict(map(reversed, word_2_index.items())) # swaps keys with values
# loop round using a list iteration to get the list of words that correspond to the integers in the randomly picked sequence
word_sequence = [index_2_word[value] for value in random_seq]

# join the words in the list and print the sequence of words
print(' '.join(word_sequence))

nor poverty the mind appall the little girl lost in futurity prophetic see that the earth from sleep grave the sentence deep shall arise and


In [33]:
# this code predicts the next 25 words that follow the randomly picked sequence above
# we loop round, making 25 predictions
word_sequence = []
for i in range(25):
    int_sample = np.reshape(random_seq, (1, len(random_seq), 1))    # reshape to make 3-D input (1 sequence, length of the sequence, 1 because the first LSTM requires another dimension)
    int_sample = int_sample / float(vocab_size)                     # normalise

    predicted_word_index = model.predict(int_sample, verbose=0)     # predict

    predicted_word_id = np.argmax(predicted_word_index)             # get the index of the maximum value (they are categorical so )

    word_sequence.append(index_2_word[predicted_word_id])          # get the predicted word by finding the word at the predicted index


In [34]:
# loop round the list of predicted words and print them out for our final prediction of the next 100 words
final_output = ""
for word in word_sequence:
    final_output = final_output + " " + word

print(final_output)

 the the the the the the the the the the the the the the the the the the the the the the the the the


In [35]:
# Bleu score
print(input_sequence_words[:2])       # print out the first 2 elements of the input sequence words list
print(word_sequence)                  # print out the words of our randomly picked sequence

from nltk.translate.bleu_score import sentence_bleu
reference = input_sequence_words
candidate = word_sequence

score = sentence_bleu(reference, candidate)
print(score)
#Score = 0.727 
# A perfect match results in a score of 1.0. The closet to 1 the better. 

[['poems', 'by', 'william', 'blake', 'songs', 'of', 'innocence', 'and', 'of', 'experience', 'and', 'the', 'book', 'of', 'thel', 'songs', 'of', 'innocence', 'introduction', 'piping', 'down', 'the', 'valleys', 'wild', 'piping'], ['by', 'william', 'blake', 'songs', 'of', 'innocence', 'and', 'of', 'experience', 'and', 'the', 'book', 'of', 'thel', 'songs', 'of', 'innocence', 'introduction', 'piping', 'down', 'the', 'valleys', 'wild', 'piping', 'songs']]
['the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
0.727427152512826


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
