<a href="https://colab.research.google.com/github/anshupandey/xebia_training_data/blob/main/code_Text_generation_word_sequence_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from numpy import array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding

In [None]:
# Toy training corpus: the four lines of the "Jack and Jill" nursery rhyme,
# kept as one multi-line string so the tokenizer can be fitted on it directly.
data = """ Jack and Jill went up the hill
To fetch a pail of water
Jack fell down and broke his crown
And Jill came tumbling after"""
print(data)

 Jack and Jill went up the hill
To fetch a pail of water
Jack fell down and broke his crown
And Jill came tumbling after


In [None]:
# Learn the word -> id vocabulary from the corpus, then encode the corpus
# itself as one flat list of word ids (ids start at 1).
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
# A single text goes in, so exactly one id sequence comes back; unpack it.
(encoded,) = tokenizer.texts_to_sequences([data])
print(encoded)


[2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2, 14, 15, 1, 16, 17, 18, 1, 3, 19, 20, 21]


In [None]:
# Show the learned vocabulary twice: as the raw dict, then as its (word, id)
# item view, one per line.
print(tokenizer.word_index, tokenizer.word_index.items(), sep='\n')


{'and': 1, 'jack': 2, 'jill': 3, 'went': 4, 'up': 5, 'the': 6, 'hill': 7, 'to': 8, 'fetch': 9, 'a': 10, 'pail': 11, 'of': 12, 'water': 13, 'fell': 14, 'down': 15, 'broke': 16, 'his': 17, 'crown': 18, 'came': 19, 'tumbling': 20, 'after': 21}
dict_items([('and', 1), ('jack', 2), ('jill', 3), ('went', 4), ('up', 5), ('the', 6), ('hill', 7), ('to', 8), ('fetch', 9), ('a', 10), ('pail', 11), ('of', 12), ('water', 13), ('fell', 14), ('down', 15), ('broke', 16), ('his', 17), ('crown', 18), ('came', 19), ('tumbling', 20), ('after', 21)])


In [None]:
# Number of classes the model must distinguish: one per known word, plus one
# because word ids start at 1 and index 0 is therefore left unused.
vocab_size = 1 + len(tokenizer.word_index)
print('Vocabulary Size: %d' % vocab_size)


Vocabulary Size: 22


In [None]:
# Re-display the encoded corpus (list of word ids) before pairing it up below.
print(encoded)

[2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2, 14, 15, 1, 16, 17, 18, 1, 3, 19, 20, 21]


In [None]:
# Slide a window of width two over the encoded corpus so that every adjacent
# (current word id, next word id) pair becomes one training example.
sequences = [encoded[i - 1:i + 1] for i in range(1, len(encoded))]
print('Total Sequences: %d' % len(sequences))
sequences

Total Sequences: 24


[[2, 1],
 [1, 3],
 [3, 4],
 [4, 5],
 [5, 6],
 [6, 7],
 [7, 8],
 [8, 9],
 [9, 10],
 [10, 11],
 [11, 12],
 [12, 13],
 [13, 2],
 [2, 14],
 [14, 15],
 [15, 1],
 [1, 16],
 [16, 17],
 [17, 18],
 [18, 1],
 [1, 3],
 [3, 19],
 [19, 20],
 [20, 21]]

In [None]:
# Turn the list of pairs into a 2-column array: column 0 holds the input word
# id, column 1 holds the id of the word that follows it (the target).
sequences = array(sequences)
X = sequences[:, 0]
Y = sequences[:, 1]

In [None]:
# Explanatory note printed into the notebook output: why the integer targets
# are one-hot encoded before training. The second sentence previously broke
# off mid-clause ("...the actual word that the value."); it is completed here.
info = '''
We will fit our model to predict a probability distribution across all words in the vocabulary. 
That means that we need to turn the output element from a single integer into a one hot encoding with a 0 for every other word in the vocabulary and a 1 for the actual word that the value represents. This gives the network a ground truth to aim for from which we can calculate error and update the model.

Keras provides the to_categorical() function that we can use to convert the integer to a one hot encoding while specifying the number of classes as the vocabulary size.
'''

print(info)


We will fit our model to predict a probability distribution across all words in the vocabulary. 
That means that we need to turn the output element from a single integer into a one hot encoding with a 0 for every word in the vocabulary and a 1 for the actual word that the value. This gives the network a ground truth to aim for from which we can calculate error and update the model.

Keras provides the to_categorical() function that we can use to convert the integer to a one hot encoding while specifying the number of classes as the vocabulary size.



In [None]:
# one hot encode outputs: every integer target in Y becomes a row of length
# vocab_size, the format expected by the softmax output layer trained with
# categorical_crossentropy below
y = to_categorical(Y, num_classes=vocab_size)
print(y.shape)


(24, 22)


In [None]:
# Define the next-word model: a 10-dimensional embedding of the single input
# word id, an LSTM with 50 units, and a softmax over the whole vocabulary.
# NOTE(review): `input_length` is reported as deprecated by Keras 3 - confirm
# against the TensorFlow/Keras version this notebook is meant to run on.
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

# Targets are one-hot rows, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network silently (500 per-epoch log lines would drown the notebook) and
# keep the History object so the outcome can be reported and inspected later
history = model.fit(X, y, epochs=500, verbose=0)
# report where training ended up
print('Final loss: %.4f - accuracy: %.4f' % (
    history.history['loss'][-1], history.history['accuracy'][-1]))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1, 10)             220       
_________________________________________________________________
lstm (LSTM)                  (None, 50)                12200     
_________________________________________________________________
dense (Dense)                (None, 22)                1122      
Total params: 13,542
Trainable params: 13,542
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/500
1/1 - 2s - loss: 3.0917 - accuracy: 0.0000e+00
Epoch 2/500
1/1 - 0s - loss: 3.0909 - accuracy: 0.1250
Epoch 3/500
1/1 - 0s - loss: 3.0901 - accuracy: 0.1250
Epoch 4/500
1/1 - 0s - loss: 3.0893 - accuracy: 0.1250
Epoch 5/500
1/1 - 0s - loss: 3.0885 - accuracy: 0.1250
Epoch 6/500
1/1 - 0s - loss: 3.0877 - accuracy: 0.1250
Epoch 7/500
1/1 - 0s - loss: 3.0868 - accur

<tensorflow.python.keras.callbacks.History at 0x7fe48d049b90>

In [None]:
# Seed word for a single next-word prediction. The capitalised form is looked
# up successfully even though the printed vocabulary is all lower-case.
in_text = 'Jill'
print(in_text)

Jill


In [None]:
# Encode the seed text and ask the model for the most likely next word id.
# NOTE: this rebinds `encoded`, replacing the corpus encoding created when the
# text was first tokenised.
encoded = tokenizer.texts_to_sequences([in_text])[0]
# Shape (1, n_words): a batch of one sample, matching the one-word input
# window the model was defined with.
encoded = array([encoded])
# Sequential.predict_classes() was removed in TensorFlow 2.6. Taking the
# argmax over the softmax output gives the same class ids.
yhat = model.predict(encoded, verbose=0).argmax(axis=-1)
yhat




array([19])

In [None]:
# Translate the predicted id back into its word. The tokenizer already keeps
# the reverse mapping in `index_word`, so no scan over `word_index` is needed,
# and using the scalar id avoids comparing an int against a whole array.
predicted_id = int(yhat[0])
predicted_word = tokenizer.index_word.get(predicted_id)
# Id 0 is not assigned to any word; print nothing in that case, as before.
if predicted_word is not None:
    print(predicted_word)


came


In [None]:
# Generate text greedily: begin with a seed word and repeatedly feed the
# model's most likely next word back in as the new input.
start = "jill"
print(start)
for step in range(20):
    # Batch of one sample holding the current word id, shape (1, 1).
    encoded = array([tokenizer.texts_to_sequences([start])[0]])
    # Sequential.predict_classes() was removed in TensorFlow 2.6. The argmax
    # over the softmax output yields the same class id.
    yhat = model.predict(encoded, verbose=0).argmax(axis=-1)
    # Direct id -> word lookup instead of scanning word_index on every step.
    word = tokenizer.index_word.get(int(yhat[0]))
    if word is None:
        # Id 0 maps to no word, so the input would never change again: stop.
        break
    print(word)
    start = word

jill
came
tumbling
after
up
the




hill
to
fetch
a
pail
of
water
jack
and
jill
came
tumbling
after
up
the


In [None]:
# Same seed word as before, but keep the model's full output instead of only
# the winning class, so the spread over the vocabulary can be inspected.
in_text = 'Jill'
encoded = array(tokenizer.texts_to_sequences([in_text])[0])
yhat = model.predict(encoded, verbose=0)


In [None]:
# Raw prediction for the seed word: one softmax probability per vocabulary id.
# In the recorded run the mass is split almost evenly between ids 4 ("went")
# and 19 ("came"), the two words that follow "Jill" in the corpus.
yhat

array([[1.1701693e-04, 9.6275710e-04, 3.3060987e-05, 6.1181240e-06,
        4.8698300e-01, 1.8113984e-03, 6.8283764e-05, 1.0436386e-04,
        1.4539186e-03, 3.5600108e-04, 2.4453704e-03, 4.9551730e-03,
        2.1457444e-03, 7.7117684e-06, 1.6127506e-05, 4.9910182e-03,
        5.5204587e-06, 5.3519109e-04, 1.9491582e-04, 4.8783365e-01,
        4.9582692e-03, 1.5248551e-05]], dtype=float32)