In [22]:
import numpy as np
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.utils import to_categorical
from random import randint
import re



In [23]:
import nltk

nltk.download('gutenberg')
from nltk.corpus import gutenberg as gut

print(gut.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


[nltk_data] Downloading package gutenberg to /home/lenovo/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [24]:
macbeth_text = nltk.corpus.gutenberg.raw('shakespeare-macbeth.txt')



In [25]:
print(macbeth_text[:500])

[The Tragedie of Macbeth by William Shakespeare 1603]


Actus Primus. Scoena Prima.

Thunder and Lightning. Enter three Witches.

  1. When shall we three meet againe?
In Thunder, Lightning, or in Raine?
  2. When the Hurley-burley's done,
When the Battaile's lost, and wonne

   3. That will be ere the set of Sunne

   1. Where the place?
  2. Vpon the Heath

   3. There to meet with Macbeth

   1. I come, Gray-Malkin

   All. Padock calls anon: faire is foule, and foule is faire,
Houer through 


In [26]:
def preprocess_text(sen):
    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sen)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence.lower()

In [27]:
macbeth_text = preprocess_text(macbeth_text)
macbeth_text[:500]

' the tragedie of macbeth by william shakespeare actus primus scoena prima thunder and lightning enter three witches when shall we three meet againe in thunder lightning or in raine when the hurley burley done when the battaile lost and wonne that will be ere the set of sunne where the place vpon the heath there to meet with macbeth come gray malkin all padock calls anon faire is foule and foule is faire houer through the fogge and filthie ayre exeunt scena secunda alarum within enter king malcom'

### Convert Words into Numbers



In [28]:
from nltk.tokenize import word_tokenize

macbeth_text_words = word_tokenize(macbeth_text)
n_words = len(macbeth_text_words)
unique_words = len(set(macbeth_text_words))

In [29]:
print('Total Words: %d' % n_words)
print('Unique Words: %d' % unique_words)

Total Words: 17250
Unique Words: 3436


In [30]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=unique_words + 1)
tokenizer.fit_on_texts(macbeth_text_words)

In [31]:
vocab_size = len(tokenizer.word_index) + 1
word_2_index = tokenizer.word_index

In [32]:
print(macbeth_text_words[200])
print(word_2_index[macbeth_text_words[200]])

like
69


### Modifying the Shape of Data
Text generation falls in the category of many-to-one sequence problems since the input is a sequence of words and output is a single word. We will be using the Long Short-Term Memory Network (LSTM), which is a type of recurrent neural network to create our text generation model. LSTM accepts data in a 3-dimensional format (number of samples, number of time-steps, features per time-step). Since the output will be a single word, the shape of the output will be 2-dimensional (number of samples, number of unique words in the corpus).



In [33]:
input_sequences = []
output_words = []
input_seq_length = 100

for i in range(0, n_words - input_seq_length, 1):
    in_seq = macbeth_text_words[i:i + input_seq_length]
    out_seq = macbeth_text_words[i + input_seq_length]
    input_sequences.append([word_2_index[word] for word in in_seq])
    output_words.append(word_2_index[out_seq])

In the script above, we declare two empty lists input_sequence and output_words. The input_seq_length is set to 100, which means that our input sequence will consist of 100 words. Next, we execute a loop where in the first iteration, integer values for the first 100 words from the text are appended to the input_sequence list. The 101st word is appended to the output_words list. During the second iteration, a sequence of words that starts from the 2nd word in the text and ends at the 101st word is stored in the input_sequence list, and the 102nd word is stored in the output_words array, and so on. A total of 17150 input sequences will be generated since there are 17250 total words in the dataset (100 less than the total words).

In [34]:
print(input_sequences[0])

[1, 869, 4, 40, 60, 1358, 1359, 408, 1360, 1361, 409, 265, 2, 870, 31, 190, 291, 76, 36, 30, 190, 327, 128, 8, 265, 870, 83, 8, 1362, 76, 1, 1363, 1364, 86, 76, 1, 1365, 354, 2, 871, 5, 34, 14, 168, 1, 292, 4, 649, 77, 1, 220, 41, 1, 872, 53, 3, 327, 12, 40, 52, 1366, 1367, 25, 1368, 873, 328, 355, 9, 410, 2, 410, 9, 355, 1369, 356, 1, 1370, 2, 874, 169, 103, 127, 411, 357, 149, 31, 51, 1371, 329, 107, 12, 358, 412, 875, 1372, 51, 20, 170, 92, 9]


normalize our input sequences by dividing the integers in the sequences by the largest integer value. The following script also converts the output into 2-dimensional format.


In [35]:
X = np.reshape(input_sequences, (len(input_sequences), input_seq_length, 1))
X = X / float(vocab_size)

y = to_categorical(output_words)

In [36]:
print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (17150, 100, 1)
y shape: (17150, 3437)


In [37]:
model = Sequential()
model.add(LSTM(800, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(LSTM(800, return_sequences=True))
model.add(LSTM(800))
model.add(Dense(y.shape[1], activation='softmax'))


In [38]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 100, 800)          2566400   
_________________________________________________________________
lstm_4 (LSTM)                (None, 100, 800)          5123200   
_________________________________________________________________
lstm_5 (LSTM)                (None, 800)               5123200   
_________________________________________________________________
dense_1 (Dense)              (None, 3437)              2753037   
Total params: 15,565,837
Trainable params: 15,565,837
Non-trainable params: 0
_________________________________________________________________


In [39]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [40]:
model.fit(X, y, batch_size=64, epochs=1, verbose=1)

2021-08-25 22:18:13.632464: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 235778200 exceeds 10% of free system memory.


 44/268 [===>..........................] - ETA: 27:28 - loss: 7.2086

KeyboardInterrupt: 