In [3]:
import tensorflow_datasets as tfds

print(tfds.list_builders())

# Load tiny_shakespeare
ds = tfds.load('tiny_shakespeare', split='train')
text_data = ""

for item in ds:
    text_data += item['text'].numpy().decode('utf-8') + "\n"

print(len(text_data))


['abstract_reasoning', 'accentdb', 'aeslc', 'aflw2k3d', 'ag_news_subset', 'ai2_arc', 'ai2_arc_with_ir', 'ai2dcaption', 'aloha_mobile', 'amazon_us_reviews', 'anli', 'answer_equivalence', 'arc', 'asimov_dilemmas_auto_val', 'asimov_dilemmas_scifi_train', 'asimov_dilemmas_scifi_val', 'asimov_injury_val', 'asimov_multimodal_auto_val', 'asimov_multimodal_manual_val', 'asqa', 'asset', 'assin2', 'asu_table_top_converted_externally_to_rlds', 'austin_buds_dataset_converted_externally_to_rlds', 'austin_sailor_dataset_converted_externally_to_rlds', 'austin_sirius_dataset_converted_externally_to_rlds', 'bair_robot_pushing_small', 'bc_z', 'bccd', 'beans', 'bee_dataset', 'beir', 'berkeley_autolab_ur5', 'berkeley_cable_routing', 'berkeley_fanuc_manipulation', 'berkeley_gnm_cory_hall', 'berkeley_gnm_recon', 'berkeley_gnm_sac_son', 'berkeley_mvp_converted_externally_to_rlds', 'berkeley_rpt_converted_externally_to_rlds', 'big_patent', 'bigearthnet', 'billsum', 'binarized_mnist', 'binary_alpha_digits', 'b

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts([text_data])
total_words=len(tokenizer.word_index)+1
print(f"Vocabulary size: {total_words}")

Vocabulary size: 11914


In [5]:
import pickle
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [7]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

input_sequences = []
for line in text_data.split("\n"):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

print(input_sequences[:5])
max_seq_len=max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre'))

X = input_sequences[:, :-1]
y = input_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)
print(max_seq_len)

[[87, 248], [143, 34], [143, 34, 962], [143, 34, 962, 145], [143, 34, 962, 145, 609]]
16


In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss',
                               patience=5,
                               restore_best_weights=True)

model = Sequential()
model.add(Embedding(total_words, 64, input_length=max_seq_len-1))
model.add(LSTM(128))
model.add(Dense(total_words, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [5]:
history = model.fit(X,y,epochs=100,verbose=1,callbacks=[early_stopping])

Epoch 1/100
[1m4830/4830[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 17ms/step - accuracy: 0.0377 - loss: 6.9933
Epoch 2/100
[1m   7/4830[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:27[0m 18ms/step - accuracy: 0.0842 - loss: 6.0663

  current = self.get_monitor_value(logs)


[1m4830/4830[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 17ms/step - accuracy: 0.0854 - loss: 6.1187
Epoch 3/100
[1m4830/4830[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 16ms/step - accuracy: 0.1041 - loss: 5.7253
Epoch 4/100
[1m4830/4830[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 16ms/step - accuracy: 0.1167 - loss: 5.4226
Epoch 5/100
[1m4830/4830[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 17ms/step - accuracy: 0.1272 - loss: 5.1519
Epoch 6/100
[1m4830/4830[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 17ms/step - accuracy: 0.1422 - loss: 4.8780
Epoch 7/100
[1m4830/4830[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 17ms/step - accuracy: 0.1596 - loss: 4.6505
Epoch 8/100
[1m4830/4830[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 17ms/step - accuracy: 0.1797 - loss: 4.4296
Epoch 9/100
[1m4830/4830[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 17ms/step - accuracy: 0.2042 - loss: 4.2168
Epoch 10/100

In [7]:
model.save('lstm_shakesp_next_word.h5')



In [9]:
from keras.models import load_model
model = load_model('lstm_shakesp_next_word.h5')



In [10]:
def generate_text(seed_text, next_words=4):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len-1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        predicted_word_index = np.argmax(predicted)
        
        for word, index in tokenizer.word_index.items():
            if index == predicted_word_index:
                seed_text += " " + word
                break
    return seed_text

print(generate_text("Against the Roman state,"))

Against the Roman state, whose course will on
