<a href="https://colab.research.google.com/github/anshupandey/Natural_language_Processing/blob/master/code_Text_generation_word_sequence_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from numpy import array
from tensorflow.keras import models,layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [2]:
# Training corpus: the four-line "Jack and Jill" nursery rhyme, one verse line
# per text line. The leading space before "Jack" is part of the original text.
data = (
    " Jack and Jill went up the hill\n"
    "To fetch a pail of water\n"
    "Jack fell down and broke his crown\n"
    "And Jill came tumbling after"
)
print(data)

 Jack and Jill went up the hill
To fetch a pail of water
Jack fell down and broke his crown
And Jill came tumbling after


In [3]:
# Integer-encode the rhyme: learn a word -> index vocabulary from the text, then
# rewrite the text as the list of its word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
# A single text goes in, so exactly one index sequence comes back; unpack it.
(encoded,) = tokenizer.texts_to_sequences([data])
print(encoded)


[2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2, 14, 15, 1, 16, 17, 18, 1, 3, 19, 20, 21]


In [4]:
# Show the learned vocabulary twice: as the word -> index dict, then as its
# (word, index) items view, one per output line.
print(tokenizer.word_index, tokenizer.word_index.items(), sep="\n")


{'and': 1, 'jack': 2, 'jill': 3, 'went': 4, 'up': 5, 'the': 6, 'hill': 7, 'to': 8, 'fetch': 9, 'a': 10, 'pail': 11, 'of': 12, 'water': 13, 'fell': 14, 'down': 15, 'broke': 16, 'his': 17, 'crown': 18, 'came': 19, 'tumbling': 20, 'after': 21}
dict_items([('and', 1), ('jack', 2), ('jill', 3), ('went', 4), ('up', 5), ('the', 6), ('hill', 7), ('to', 8), ('fetch', 9), ('a', 10), ('pail', 11), ('of', 12), ('water', 13), ('fell', 14), ('down', 15), ('broke', 16), ('his', 17), ('crown', 18), ('came', 19), ('tumbling', 20), ('after', 21)])


In [5]:
# Vocabulary size is the number of distinct words plus one: the tokenizer
# numbers words starting from 1, so one extra slot is needed for the largest
# word index to be a valid position in the embedding table and one-hot vectors.
vocab_size = 1 + len(tokenizer.word_index)
print('Vocabulary Size: %d' % vocab_size)


Vocabulary Size: 22


In [6]:
# Re-display the integer-encoded rhyme; the word pairs are built from this list.
print(encoded)

[2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2, 14, 15, 1, 16, 17, 18, 1, 3, 19, 20, 21]


In [7]:
# Build the training pairs: slide a two-word window over the encoded rhyme so
# that every entry is [current word index, next word index].
sequences = [encoded[pos:pos + 2] for pos in range(len(encoded) - 1)]
print('Total Sequences: %d' % len(sequences))
sequences

Total Sequences: 24


[[2, 1],
 [1, 3],
 [3, 4],
 [4, 5],
 [5, 6],
 [6, 7],
 [7, 8],
 [8, 9],
 [9, 10],
 [10, 11],
 [11, 12],
 [12, 13],
 [13, 2],
 [2, 14],
 [14, 15],
 [15, 1],
 [1, 16],
 [16, 17],
 [17, 18],
 [18, 1],
 [1, 3],
 [3, 19],
 [19, 20],
 [20, 21]]

In [8]:
# Separate each [current word, next word] pair into model input and target.
sequences = array(sequences)
X = sequences[:, 0]  # first column: index of the input word
Y = sequences[:, 1]  # second column: index of the word that follows it

We will fit our model to predict a probability distribution across all words in the vocabulary.
That means that we need to turn the output element from a single integer into a one-hot encoding, with a 0 for every word in the vocabulary and a 1 for the actual word that the integer represents. This gives the network a ground truth to aim for, from which we can calculate the error and update the model.

Keras provides the to_categorical() function that we can use to convert the integer to a one hot encoding while specifying the number of classes as the vocabulary size.


In [9]:
# One-hot encode the targets: each next-word index in Y becomes a row of length
# vocab_size with a single 1 at that index.
y = to_categorical(Y, num_classes=vocab_size)
# Shape is (number of word pairs, vocab_size).
print(y.shape)


(24, 22)


In [10]:
# Define the model: a learned 10-dimensional embedding of the single input word,
# an LSTM with 50 units, and a softmax over the vocabulary for the next word.
model = models.Sequential()
model.add(layers.Embedding(vocab_size, 10, input_length=1))
model.add(layers.LSTM(50))
model.add(layers.Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() appends a stray "None" line to the cell output.
model.summary()

# The targets are one-hot vectors, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit on the (word, next word) pairs. The corpus is tiny and there is no
# held-out split, so the model is expected to memorise the rhyme.
model.fit(X, y, epochs=500)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1, 10)             220       
                                                                 
 lstm (LSTM)                 (None, 50)                12200     
                                                                 
 dense (Dense)               (None, 22)                1122      
                                                                 
Total params: 13542 (52.90 KB)
Trainable params: 13542 (52.90 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Ep

<keras.src.callbacks.History at 0x7b7060f53af0>

In [11]:
# Seed word whose most likely successor will be predicted.
in_text = 'Jill'
print(in_text)

Jill


In [12]:
# Predict the word most likely to follow `in_text`.
# The seed's indices get their own name so that `encoded` keeps holding the
# encoded corpus; overwriting it would make a re-run of the pair-building cell
# silently work on the seed word instead of the rhyme.
seed_ids = tokenizer.texts_to_sequences([in_text])[0]
if not seed_ids:
    # With no oov_token configured, texts_to_sequences skips unknown words,
    # which would leave nothing to feed the model.
    raise ValueError("%r is not in the tokenizer vocabulary" % in_text)
probabilities = model.predict(array(seed_ids), verbose=0)
# The position of the largest softmax probability is the vocabulary index of
# the predicted next word.
yhat = np.argmax(probabilities)
yhat


4

In [13]:


# Reverse lookup: print the vocabulary word whose index equals the prediction.
# Nothing is printed when no word carries that index.
for word in (w for w, idx in tokenizer.word_index.items() if idx == yhat):
    print(word)


went


In [14]:
def generate_words(model, tokenizer, seed_word, n_words=20):
    """Greedily generate up to ``n_words`` words, one prediction at a time.

    Each step feeds the most recent word to ``model`` and takes the vocabulary
    entry with the highest predicted probability as the next word.

    Parameters
    ----------
    model : trained model whose ``predict`` maps one word index to a
        probability row over the vocabulary.
    tokenizer : fitted tokenizer providing ``texts_to_sequences`` and
        ``word_index``.
    seed_word : str
        Text to start from. If it contains several known words, only the last
        one is fed to the model, since the model conditions on a single word.
    n_words : int, default 20
        Maximum number of words to generate.

    Returns
    -------
    list of str
        The generated words, not including the seed.

    Raises
    ------
    ValueError
        If ``seed_word`` contains no word known to the tokenizer.
    """
    # Build the index -> word map once instead of scanning the whole
    # vocabulary after every prediction.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    generated = []
    current_word = seed_word
    for _ in range(n_words):
        word_ids = tokenizer.texts_to_sequences([current_word])[0]
        if not word_ids:
            raise ValueError("%r is not in the tokenizer vocabulary" % current_word)
        probabilities = model.predict(array(word_ids[-1:]), verbose=0)
        next_index = int(np.argmax(probabilities[0]))
        next_word = index_to_word.get(next_index)
        if next_word is None:
            # The predicted index belongs to no word (e.g. the unused index 0).
            # The input would never change from here on, so stop.
            break
        generated.append(next_word)
        current_word = next_word
    return generated


start = "jill"
print(start)
for word in generate_words(model, tokenizer, start):
    print(word)

jill
went
up
the
hill
to
fetch
a
pail
of
water
jack
and
jill
went
up
the
hill
to
fetch
a


In [15]:
# Recompute the full next-word probability distribution for a seed word.
in_text = 'Jill'
# Use a dedicated name for the seed's indices so that `encoded` keeps holding
# the encoded corpus that the pair-building cell reads.
seed_ids = tokenizer.texts_to_sequences([in_text])[0]
if not seed_ids:
    # With no oov_token configured, texts_to_sequences skips unknown words,
    # which would leave nothing to feed the model.
    raise ValueError("%r is not in the tokenizer vocabulary" % in_text)
# Kept as raw probabilities (no argmax) so the whole distribution can be inspected.
yhat = model.predict(array(seed_ids), verbose=0)


In [16]:
# Raw softmax output for the seed word: one probability per vocabulary index.
# In the rhyme "jill" is followed once by "went" and once by "came", so the
# probability mass is expected to be shared between those two words' indices.
yhat

array([[1.7167647e-04, 3.2016024e-04, 4.8634703e-03, 4.9706345e-05,
        4.8614675e-01, 9.1357497e-06, 5.0368558e-06, 3.6843945e-04,
        7.8221061e-04, 3.2474599e-03, 9.1133108e-03, 1.8893284e-03,
        1.6499276e-03, 3.3256380e-05, 3.5394751e-04, 2.2096987e-04,
        1.8163306e-05, 4.6695480e-03, 3.1487165e-05, 4.8546293e-01,
        2.7721672e-04, 3.1591221e-04]], dtype=float32)