In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import numpy as np
import pickle

In [6]:
# Read the cleaned book corpus into memory as one string and show a short
# preview so the reader can confirm the right file was loaded.
with open("/content/books_clean.txt", "r", encoding="utf-8") as corpus_file:
    combined = corpus_file.read()

print(f"Loaded: {len(combined)} characters")
print(combined[:300])

Loaded: 3081162 characters
PREFACE.
Walt Whitman has somewhere a fine and just distinction between "loving
by allowance" and "loving with personal love." This distinction applies
to books as well as to men and women; and in the case of the not very
numerous authors who are the objects of the personal affection, it
brings a cu


In [7]:
# Case-fold the corpus before tokenizing; the prediction helpers further down
# lower-case their seed text the same way, so training and inference agree.
data = combined.lower()

In [8]:
# Size of the id space used by the model (embedding rows and softmax classes).
VOCAB_SIZE = 5000

# Fit a word-level tokenizer on the whole corpus. word_index records every
# distinct word that was seen, which can be far more than VOCAB_SIZE; words
# outside the kept vocabulary are represented by the "<OOV>" placeholder.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=VOCAB_SIZE)
tokenizer.fit_on_texts([data])

print(f"Unique words found: {len(tokenizer.word_index)}")
print(f"Vocab size used: {VOCAB_SIZE}")

Unique words found: 19728
Vocab size used: 5000


In [10]:
# Encode the corpus as a single flat list of integer token ids. The tokenizer
# works on a batch of texts, so the one-element batch is unwrapped afterwards.
encoded_texts = tokenizer.texts_to_sequences([data])
sequence = encoded_texts[0]
print(f"Total tokens: {len(sequence)}")

Total tokens: 564913


In [13]:
# Number of context tokens the model sees before predicting the next one.
SEQUENCE_LENGTH = 5
# Fixed seed so the shuffled row order is identical on every
# Restart & Run All.
SHUFFLE_SEED = 42

# Every run of SEQUENCE_LENGTH + 1 consecutive tokens becomes one training
# row: the first SEQUENCE_LENGTH ids are the context, the last id is the
# target. sliding_window_view returns a read-only view over the token array,
# so np.array() copies it into a writable array that can be shuffled in place.
input_sequences = np.array(
    np.lib.stride_tricks.sliding_window_view(
        np.asarray(sequence), SEQUENCE_LENGTH + 1
    )
)

# Shuffle whole rows (axis 0) with a seeded generator. The training cell uses
# validation_split, so the row order here decides which rows are held out;
# seeding keeps that split reproducible.
# NOTE(review): neighbouring windows overlap, so a random row-level split lets
# near-duplicate contexts appear in both training and validation data.
np.random.default_rng(SHUFFLE_SEED).shuffle(input_sequences)

# Split each row into model input (context) and label (next token id).
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

print("X shape:", X.shape)
print("y shape:", y.shape)
print("Max y:", y.max())

X shape: (564908, 5)
y shape: (564908,)
Max y: 4999


In [14]:
# Next-word classifier: token ids -> embeddings -> two stacked LSTMs ->
# softmax over the VOCAB_SIZE possible next-token ids.
model = Sequential()
model.add(tf.keras.layers.Input(shape=(SEQUENCE_LENGTH,)))
model.add(Embedding(input_dim=VOCAB_SIZE, output_dim=128))
# The first LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(150, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(VOCAB_SIZE, activation='softmax'))

model.summary()

In [15]:
# Integer class labels (token ids) are used directly as targets, hence the
# sparse variant of categorical cross-entropy. Gradient norms are clipped
# to 1.0 by the optimizer.
optimizer = Adam(learning_rate=5e-4, clipnorm=1.0)
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [16]:
# Train on the context/target pairs, holding out 10% of the rows for
# validation; the returned History object keeps the per-epoch metrics.
training_options = dict(epochs=10, batch_size=256, validation_split=0.1)
history = model.fit(X, y, **training_options)

Epoch 1/10
[1m1987/1987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 46ms/step - accuracy: 0.0621 - loss: 6.3809 - val_accuracy: 0.0954 - val_loss: 5.8423
Epoch 2/10
[1m1987/1987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 47ms/step - accuracy: 0.1009 - loss: 5.7686 - val_accuracy: 0.1179 - val_loss: 5.5822
Epoch 3/10
[1m1987/1987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 46ms/step - accuracy: 0.1232 - loss: 5.5280 - val_accuracy: 0.1305 - val_loss: 5.4337
Epoch 4/10
[1m1987/1987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 47ms/step - accuracy: 0.1325 - loss: 5.3819 - val_accuracy: 0.1369 - val_loss: 5.3150
Epoch 5/10
[1m1987/1987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 48ms/step - accuracy: 0.1391 - loss: 5.2575 - val_accuracy: 0.1409 - val_loss: 5.2335
Epoch 6/10
[1m1987/1987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 46ms/step - accuracy: 0.1434 - loss: 5.1613 - val_accuracy: 0.1445 - val_loss: 5.1715
Ep

In [18]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Context length used when preparing seed text for the model. This repeats the
# value set when the training windows were built and must stay equal to it,
# because the model's input shape was defined from that value.
SEQUENCE_LENGTH = 5

def predict_next(seed_text, top_n=3):
    """Suggest the most likely next words for ``seed_text``.

    The text is lower-cased, converted to token ids with the notebook-level
    ``tokenizer``, cut down to its last ``SEQUENCE_LENGTH`` ids and padded at
    the front (``padding='pre'``) to exactly that length before being passed
    to the notebook-level ``model``.

    Args:
        seed_text: Text to continue.
        top_n: Maximum number of suggestions to return.

    Returns:
        A list of at most ``top_n`` words, ordered from highest to lowest
        predicted probability. Ids with no entry in ``tokenizer.index_word``
        and the ``"<OOV>"`` placeholder are skipped. The list is empty when
        ``top_n`` is zero or negative.
    """
    # Nothing was asked for: return immediately instead of running the model.
    # Without this guard a non-positive top_n could never satisfy the stop
    # condition below and the whole vocabulary would be returned.
    if top_n <= 0:
        return []

    seed_text = seed_text.lower()
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    # Only the most recent SEQUENCE_LENGTH tokens fit the model's input.
    token_list = token_list[-SEQUENCE_LENGTH:]
    token_list = pad_sequences([token_list], maxlen=SEQUENCE_LENGTH, padding='pre')

    predictions = model.predict(token_list, verbose=0)[0]
    # argsort is ascending, so reverse it to walk ids from most to least likely.
    top_indices = predictions.argsort()[::-1]

    suggestions = []
    for idx in top_indices:
        word = tokenizer.index_word.get(idx, "")
        if word and word != "<OOV>":
            suggestions.append(word)
        # ">=" rather than "==" so a non-integer top_n still terminates early.
        if len(suggestions) >= top_n:
            break

    return suggestions

# Interactive demo: keep asking for text and show the suggested next words
# until the user types "quit" (in any letter case).
while True:
    user_input = input("Enter text (or 'quit' to stop): ")
    if user_input.lower() != 'quit':
        print("Top 3 predictions:", predict_next(user_input))
        print()
    else:
        break

Enter text (or 'quit' to stop): hello
Top 3 predictions: ['and', 'in', 'the']

Enter text (or 'quit' to stop): i want to know the only 
Top 3 predictions: ['time', 'thing', 'day']

Enter text (or 'quit' to stop): he exactly told me good
Top 3 predictions: ['i', 'that', 'gray']

Enter text (or 'quit' to stop): it was a dark
Top 3 predictions: ['man', 'and', 'thing']

Enter text (or 'quit' to stop): the old man
Top 3 predictions: ['was', 'had', 'who']

Enter text (or 'quit' to stop): quit


In [19]:
def generate_sentence(seed_text, total_words=10):
    """Extend ``seed_text`` greedily, one predicted word at a time.

    On every step the text built so far is lower-cased and tokenized, its last
    ``SEQUENCE_LENGTH`` token ids are padded at the front and passed to
    ``model``, and the most probable id that maps to a real word (not empty,
    not ``"<OOV>"``) is appended after a single space.

    Args:
        seed_text: Starting text; it is kept unchanged at the front of the result.
        total_words: Number of prediction steps to run.

    Returns:
        ``seed_text`` followed by the appended words. With ``total_words`` of
        zero or less the seed is returned as-is.
    """
    sentence = seed_text

    for _step in range(total_words):
        context_ids = tokenizer.texts_to_sequences([sentence.lower()])[0]
        model_input = pad_sequences(
            [context_ids[-SEQUENCE_LENGTH:]],
            maxlen=SEQUENCE_LENGTH,
            padding='pre',
        )

        probabilities = model.predict(model_input, verbose=0)[0]

        # Candidate words from most to least probable; ids without a word
        # entry come back as "" and are filtered out together with "<OOV>".
        ranked_words = (
            tokenizer.index_word.get(token_id, "")
            for token_id in probabilities.argsort()[::-1]
        )
        chosen = next(
            (candidate for candidate in ranked_words
             if candidate and candidate != "<OOV>"),
            "",
        )

        sentence = sentence + " " + chosen

    return sentence

# Interactive demo: read a seed, print the generated continuation, and repeat
# until the user types "quit" (in any letter case).
while True:
    user_input = input("Enter seed text (or 'quit' to stop): ")
    if user_input.lower() != 'quit':
        print("Generated:", generate_sentence(user_input))
        print()
    else:
        break

Enter seed text (or 'quit' to stop): hey there i just wanted to tell you
Generated: hey there i just wanted to tell you to be a very good thing to be a little

Enter seed text (or 'quit' to stop): he was a very
Generated: he was a very little man in the room and the whole man was

Enter seed text (or 'quit' to stop): man
Generated: man and i am sure to be a little thing to

Enter seed text (or 'quit' to stop): hello there i wish you
Generated: hello there i wish you are not a little thing to be a little thing

Enter seed text (or 'quit' to stop): quit


In [20]:
import pickle


# Persist the trained network and the fitted tokenizer side by side; both are
# needed to reproduce predictions later.
model.save("/content/next_word_model.keras")

# The tokenizer is stored with pickle, so the resulting file should only ever
# be loaded from a trusted location.
with open("/content/tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

print("Model and tokenizer saved!")

Model and tokenizer saved!
