In [15]:
# define corpus
corpus = "Hello world. This is a meta text corpus to demonstrate text prediction. Test prediction models are meta."

In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer

# init tokenizer and fit on corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts([corpus])

In [17]:
tokenizer.word_index

{'meta': 1,
 'text': 2,
 'prediction': 3,
 'hello': 4,
 'world': 5,
 'this': 6,
 'is': 7,
 'a': 8,
 'corpus': 9,
 'to': 10,
 'demonstrate': 11,
 'test': 12,
 'models': 13,
 'are': 14}

In [18]:
# create a sequence of numbers
sequence = tokenizer.texts_to_sequences([corpus])[0]

In [19]:
input_sequences = []

for i in range(1, len(sequence)):
    n_gram_sequence = sequence[:i+1]
    input_sequences.append(n_gram_sequence)

In [20]:
input_sequences[:5]

[[4, 5], [4, 5, 6], [4, 5, 6, 7], [4, 5, 6, 7, 8], [4, 5, 6, 7, 8, 1]]

In [21]:
import numpy as np
from keras.preprocessing.sequence import pad_sequences

In [23]:
max_sequence_len = max([len(x) for x in input_sequences])

In [24]:
max_sequence_len

17

In [27]:
input_sequences[0]

[4, 5]

In [31]:
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

In [32]:
sample_sequence = input_sequences[0]


padded_sequence = pad_sequences([sample_sequence], maxlen=max_sequence_len -1, padding='pre')

padded_sequence

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5]], dtype=int32)

In [33]:
Xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]

for i in range(5):
    print(f'X[{i}]:', Xs[i], 'label:', labels[i])

X[0]: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4] label: 5
X[1]: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 5] label: 6
X[2]: [0 0 0 0 0 0 0 0 0 0 0 0 0 4 5 6] label: 7
X[3]: [0 0 0 0 0 0 0 0 0 0 0 0 4 5 6 7] label: 8
X[4]: [0 0 0 0 0 0 0 0 0 0 0 4 5 6 7 8] label: 1


In [34]:
Xs.shape

(16, 16)

In [35]:
labels.shape

(16,)

In [36]:
tokenizer.word_index

{'meta': 1,
 'text': 2,
 'prediction': 3,
 'hello': 4,
 'world': 5,
 'this': 6,
 'is': 7,
 'a': 8,
 'corpus': 9,
 'to': 10,
 'demonstrate': 11,
 'test': 12,
 'models': 13,
 'are': 14}

In [37]:
Xs

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  4],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  5],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  5,  6],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  5,  6,  7],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  5,  6,  7,  8],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  5,  6,  7,  8,  1],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  5,  6,  7,  8,  1,  2],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  4,  5,  6,  7,  8,  1,  2,  9],
       [ 0,  0,  0,  0,  0,  0,  0,  4,  5,  6,  7,  8,  1,  2,  9, 10],
       [ 0,  0,  0,  0,  0,  0,  4,  5,  6,  7,  8,  1,  2,  9, 10, 11],
       [ 0,  0,  0,  0,  0,  4,  5,  6,  7,  8,  1,  2,  9, 10, 11,  2],
       [ 0,  0,  0,  0,  4,  5,  6,  7,  8,  1,  2,  9, 10, 11,  2,  3],
       [ 0,  0,  0,  4,  5,  6,  7,  8,  1,  2,  9, 10, 11,  2,  3, 12],
       [ 0,  0,  4,  5,  6,  7,  8,  1,  2,  9, 10,

In [38]:
labels

array([ 5,  6,  7,  8,  1,  2,  9, 10, 11,  2,  3, 12,  3, 13, 14,  1],
      dtype=int32)

In [39]:
import tensorflow as tf

In [41]:
total_words = len(tokenizer.word_index) + 1

ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [42]:
total_words

15

In [43]:
ys.shape

(16, 15)

In [44]:
ys

array([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0.

In [46]:
Xs.shape

(16, 16)

In [48]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

model = Sequential([
    Embedding(total_words, 64),
    SimpleRNN(64),
    Dense(20, activation='relu'),
    Dense(total_words, activation='softmax')
])

In [49]:
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [50]:
history = model.fit(Xs, ys, epochs=150, verbose=1)
model.summary()

Epoch 1/150
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 562ms/step - accuracy: 0.0000e+00 - loss: 2.7695
Epoch 2/150
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.0625 - loss: 2.7096
Epoch 3/150
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.2500 - loss: 2.6673
Epoch 4/150
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.2500 - loss: 2.6319
Epoch 5/150
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.2500 - loss: 2.5985
Epoch 6/150
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.2500 - loss: 2.5672
Epoch 7/150
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.3125 - loss: 2.5335
Epoch 8/150
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.4375 - loss: 2.4981
Epoch 9/150
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0