In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
file_path = './sonnets.txt'

tokenizer = Tokenizer() # no word limit for generation
lines = []

with open(file_path) as f:    
    lines =  f.read()
    lines = lines.lower().split('\n')


FileNotFoundError: [Errno 2] No such file or directory: './sonnets.txt'

In [None]:
len(lines)

### Tokenize

In [None]:
tokenizer.fit_on_texts(lines)
sequenced_data = tokenizer.texts_to_sequences(lines)

In [8]:
numwords = len(tokenizer.word_index)
print(f'number of words is: {numwords}')

sequenced_data  

number of words is: 3210


[[34, 417, 877, 166, 213, 517],
 [8, 878, 134, 351, 102, 156, 199],
 [16, 22, 2, 879, 61, 30, 48, 634],
 [25, 311, 635, 102, 200, 25, 278],
 [16, 10, 880, 3, 62, 85, 214, 53],
 [1372, 9, 1373, 636, 11, 122, 1374, 1375],
 [201, 17, 1376, 64, 518, 202],
 [118, 9, 1377, 3, 9, 47, 122, 135, 279],
 [10, 8, 54, 63, 2, 418, 312, 419],
 [1, 352, 1378, 3, 2, 1379, 420],
 [215, 62, 85, 881, 1380, 9, 882],
 [1, 311, 883, 884, 313, 7, 1381],
 [257, 2, 94, 36, 353, 29, 1382, 21],
 [3, 637, 2, 418, 354, 30, 2, 638, 1, 19],
 [27, 1383, 885, 46, 1384, 9, 280],
 [1, 1385, 281, 1386, 7, 9, 134, 1387],
 [9, 1388, 179, 1389, 20, 1390, 35, 63],
 [49, 21, 17, 886, 639, 4, 887, 126, 888],
 [38, 81, 1391, 64, 23, 9, 51, 202],
 [64, 23, 2, 258, 4, 9, 889, 145],
 [3, 95, 215, 62, 85, 281, 1392, 53],
 [86, 146, 23, 1393, 236, 1, 1394, 96],
 [71, 136, 43, 96, 1395, 9, 134, 186],
 [42, 10, 1396, 640, 890, 69, 282, 4, 44],
 [46, 519, 5, 520, 1, 65, 5, 112, 314, 147],
 [1397, 25, 51, 30, 1398, 62],
 [29, 86, 3, 21, 

### Generate dataset

In [None]:
preprocessed = []
for line in sequenced_data:
    for i  in range(1, len(line)):
        preprocessed.append(line[:i+1])

### extract the last label

In [10]:
preprocessed

[[34, 417],
 [34, 417, 877],
 [34, 417, 877, 166],
 [34, 417, 877, 166, 213],
 [34, 417, 877, 166, 213, 517],
 [8, 878],
 [8, 878, 134],
 [8, 878, 134, 351],
 [8, 878, 134, 351, 102],
 [8, 878, 134, 351, 102, 156],
 [8, 878, 134, 351, 102, 156, 199],
 [16, 22],
 [16, 22, 2],
 [16, 22, 2, 879],
 [16, 22, 2, 879, 61],
 [16, 22, 2, 879, 61, 30],
 [16, 22, 2, 879, 61, 30, 48],
 [16, 22, 2, 879, 61, 30, 48, 634],
 [25, 311],
 [25, 311, 635],
 [25, 311, 635, 102],
 [25, 311, 635, 102, 200],
 [25, 311, 635, 102, 200, 25],
 [25, 311, 635, 102, 200, 25, 278],
 [16, 10],
 [16, 10, 880],
 [16, 10, 880, 3],
 [16, 10, 880, 3, 62],
 [16, 10, 880, 3, 62, 85],
 [16, 10, 880, 3, 62, 85, 214],
 [16, 10, 880, 3, 62, 85, 214, 53],
 [1372, 9],
 [1372, 9, 1373],
 [1372, 9, 1373, 636],
 [1372, 9, 1373, 636, 11],
 [1372, 9, 1373, 636, 11, 122],
 [1372, 9, 1373, 636, 11, 122, 1374],
 [1372, 9, 1373, 636, 11, 122, 1374, 1375],
 [201, 17],
 [201, 17, 1376],
 [201, 17, 1376, 64],
 [201, 17, 1376, 64, 518],
 [201,

In [11]:
max_length = 11
padded_train = pad_sequences(preprocessed, maxlen=max_length, truncating='post')

In [12]:
print(f'features have shape: {padded_train.shape}')

features have shape: (15462, 11)


In [13]:
padded_features = padded_train[:,:-1]
label = np.array(padded_features[:,-1], dtype=np.float32)

In [14]:
embedding_dim = 100

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=numwords+1, output_dim=embedding_dim), 
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16, return_sequences=True)),
    tf.keras.layers.LSTM(8, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(numwords+1, activation='softmax')
])

loss = tf.keras.losses.SparseCategoricalCrossentropy()
optim = tf.keras.optimizers.Adam(1e-2)

model.compile(
    loss=loss,
    optimizer=optim,
    metrics=['acc']
)



In [15]:
model.fit(
    padded_features,
    label,
    epochs=30,
    batch_size=100
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7faf5012a130>