In [1]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras import layers, Sequential
import string

In [37]:
def read_poem(path='data/dino.txt'):
    """Return the whole text corpus as a single string.

    Args:
        path: Location of the text file to read. Defaults to the
            ``data/dino.txt`` file used throughout this notebook.

    Returns:
        The complete contents of the file, decoded as UTF-8.

    Raises:
        OSError: If the file cannot be opened (for example, it is missing).
    """
    # Pin the encoding so the result does not depend on the platform locale.
    with open(path, 'rt', encoding='utf-8') as f:
        return f.read()

# Load the raw corpus into memory and preview its first 100 characters.
poem = read_poem()
poem[:100]

'aachenosaurus\naardonyx\nabelisaurus\nabrictosaurus\nabrosaurus\nabydosaurus\nacantholipan\nacanthopholis\na'

In [38]:
# Translation table that deletes every ASCII punctuation character.
table = str.maketrans('', '', string.punctuation)

# Normalise each whitespace-separated token (strip punctuation, lowercase),
# keep only tokens made purely of letters, and rejoin them with single spaces
# so that a space acts as the separator between names.
processed_text = ' '.join(
    token
    for token in (word.translate(table).lower() for word in poem.split())
    if token.isalpha()
)

In [41]:
# Number of context characters the model sees before predicting the next one.
length = 10

# Slide a window of `length + 1` characters over the text one step at a time:
# the first `length` characters are the input, the final one is the target.
sequences = [
    processed_text[start:start + length + 1]
    for start in range(len(processed_text) - length)
]

In [42]:
# Sanity check: each window should be the previous one shifted by one character.
sequences[:5]

['aachenosaur', 'achenosauru', 'chenosaurus', 'henosaurus ', 'enosaurus a']

In [43]:
# Total number of training windows.
len(sequences)

19644

In [51]:
# Vocabulary: every distinct character in the training windows, in sorted
# order, numbered from 0. The second dict is the inverse lookup.
index_to_char = {
    position: symbol
    for position, symbol in enumerate(sorted(set(''.join(sequences))))
}
char_to_index = dict(zip(index_to_char.values(), index_to_char.keys()))

In [52]:
# Replace every character of every window with its integer vocabulary index.
sequences2 = [
    [char_to_index[symbol] for symbol in window]
    for window in sequences
]

In [53]:
# Preview the first ten integer-encoded windows.
sequences2[:10]

[[1, 1, 3, 8, 5, 14, 15, 19, 1, 21, 18],
 [1, 3, 8, 5, 14, 15, 19, 1, 21, 18, 21],
 [3, 8, 5, 14, 15, 19, 1, 21, 18, 21, 19],
 [8, 5, 14, 15, 19, 1, 21, 18, 21, 19, 0],
 [5, 14, 15, 19, 1, 21, 18, 21, 19, 0, 1],
 [14, 15, 19, 1, 21, 18, 21, 19, 0, 1, 1],
 [15, 19, 1, 21, 18, 21, 19, 0, 1, 1, 18],
 [19, 1, 21, 18, 21, 19, 0, 1, 1, 18, 4],
 [1, 21, 18, 21, 19, 0, 1, 1, 18, 4, 15],
 [21, 18, 21, 19, 0, 1, 1, 18, 4, 15, 14]]

In [54]:
# Number of distinct characters in the vocabulary; used as the one-hot width
# and as the size of the model's softmax output layer.
vocab_size = len(index_to_char)
vocab_size

27

In [55]:
# Split each encoded window into its context (all but the last index) and
# the target (the last index).
encoded_windows = np.asarray(sequences2)
X = encoded_windows[:, :-1]
Y = encoded_windows[:, -1]

In [56]:
# Inspect the first three context windows alongside their target indices.
X[:3], Y[:3]

(array([[ 1,  1,  3,  8,  5, 14, 15, 19,  1, 21],
        [ 1,  3,  8,  5, 14, 15, 19,  1, 21, 18],
        [ 3,  8,  5, 14, 15, 19,  1, 21, 18, 21]]),
 array([18, 21, 19]))

In [57]:
# Expect (number of windows, context length).
X.shape

(19644, 10)

In [58]:
# One-hot encode the contexts and targets.
# Both arrays are rebuilt from `sequences2` instead of from the current X / Y,
# so re-running this cell yields the same shapes rather than one-hot encoding
# arrays that are already one-hot encoded.
index_windows = np.array(sequences2)
Y = to_categorical(index_windows[:, -1], num_classes=vocab_size)
X = to_categorical(index_windows[:, :-1], num_classes=vocab_size)
X.shape, Y.shape

((19644, 10, 27), (19644, 27))

In [59]:
# Character-level model: a single LSTM layer reads the one-hot context window
# and a softmax layer scores every vocabulary entry as the next character.
model = Sequential([
    layers.LSTM(75, input_shape=X.shape[1:3]),
    layers.Dense(units=vocab_size, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [69]:
# Train on the full set of windows; verbose=2 prints one summary line per epoch.
# NOTE(review): the recorded log starts at roughly 0.70 accuracy in epoch 1,
# which suggests this cell was re-run on an already-trained model - confirm by
# rebuilding the model and re-running from a fresh kernel.
model.fit(
    X,
    Y,
    batch_size=64,
    epochs=100,
    verbose=2,
)

Epoch 1/100
307/307 - 2s - loss: 0.9870 - accuracy: 0.7049
Epoch 2/100
307/307 - 2s - loss: 0.9733 - accuracy: 0.7083
Epoch 3/100
307/307 - 2s - loss: 0.9603 - accuracy: 0.7117
Epoch 4/100
307/307 - 2s - loss: 0.9471 - accuracy: 0.7159
Epoch 5/100
307/307 - 2s - loss: 0.9356 - accuracy: 0.7190
Epoch 6/100
307/307 - 2s - loss: 0.9221 - accuracy: 0.7287
Epoch 7/100
307/307 - 2s - loss: 0.9116 - accuracy: 0.7286
Epoch 8/100
307/307 - 2s - loss: 0.9001 - accuracy: 0.7321
Epoch 9/100
307/307 - 2s - loss: 0.8872 - accuracy: 0.7350
Epoch 10/100
307/307 - 2s - loss: 0.8755 - accuracy: 0.7402
Epoch 11/100
307/307 - 2s - loss: 0.8634 - accuracy: 0.7416
Epoch 12/100
307/307 - 2s - loss: 0.8515 - accuracy: 0.7465
Epoch 13/100
307/307 - 2s - loss: 0.8421 - accuracy: 0.7505
Epoch 14/100
307/307 - 2s - loss: 0.8298 - accuracy: 0.7552
Epoch 15/100
307/307 - 2s - loss: 0.8177 - accuracy: 0.7589
Epoch 16/100
307/307 - 2s - loss: 0.8071 - accuracy: 0.7585
Epoch 17/100
307/307 - 2s - loss: 0.7975 - accura

<tensorflow.python.keras.callbacks.History at 0x1465bdfd0>

In [110]:
def generate_text(hint, num_to_gen, seq_length=None):
    """Extend `hint` one character at a time using the trained model.

    At every step the most probable next character is chosen (greedy
    decoding). Generation stops early as soon as the model predicts a space,
    which separates names in the training text.

    Args:
        hint: Starting text. Every character must be a key of
            `char_to_index`.
        num_to_gen: Maximum number of characters to append.
        seq_length: Number of trailing characters fed to the model. Defaults
            to the window length the model was built with.

    Returns:
        `hint` followed by the generated characters (without the
        terminating space).

    Raises:
        KeyError: If `hint` contains a character that is not in the
            vocabulary.
    """
    if seq_length is None:
        # Take the window length from the model instead of hard-coding it.
        seq_length = model.input_shape[1]

    in_text = hint
    for _ in range(num_to_gen):
        encoded = [char_to_index[c] for c in in_text]
        # Keep only the last `seq_length` indices; shorter inputs are
        # left-padded with 0, which is the index of ' ' in this notebook's
        # vocabulary (re-check if the vocabulary changes).
        window = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        one_hot = to_categorical(window, num_classes=vocab_size)
        # `predict_classes` is no longer available in recent TensorFlow
        # releases; argmax over the predicted probabilities is equivalent.
        probabilities = model.predict(one_hot, verbose=0)
        char_to_add = index_to_char[int(np.argmax(probabilities, axis=-1)[0])]
        if char_to_add == ' ':
            break
        in_text += char_to_add
    return in_text
# Demo: let the model complete a name that starts with the given prefix.
generate_text(hint = 'aoife', num_to_gen=10)

'aoifengosaurus'