In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# --- Configuration: everything a reader might want to tune -------------------
SEED = 42            # seeds Python, NumPy and TensorFlow RNGs for repeatable runs
EMBEDDING_DIM = 10   # size of each character embedding vector
LSTM_UNITS = 100     # hidden units in the recurrent layer
DROPOUT_RATE = 0.2   # dropout applied to the LSTM output during training
EPOCHS = 50          # passes over the training prefixes

# Seed *before* any layer is created so weight initialisation, dropout masks
# and batch shuffling are the same on every Restart & Run All.
tf.keras.utils.set_random_seed(SEED)

# --- Corpus -------------------------------------------------------------------
raw_text = "project gutenberg australia news and views from the australian chapter of the project gutenberg literary archive foundation"
corpus = raw_text.lower().replace('\n', ' ')

# --- Character-level tokenisation --------------------------------------------
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts([corpus])
# word_index starts at 1; index 0 is left free for the padding value.
vocab_size = len(tokenizer.word_index) + 1

tokens = tokenizer.texts_to_sequences([corpus])[0]
# At least two characters are needed to form one (context, next-char) pair;
# fail with a clear message instead of an opaque max() error further down.
assert len(tokens) >= 2, "corpus must contain at least two known characters"

# --- Training examples: every prefix of the corpus with length >= 2 ----------
# The last element of each prefix is the target; everything before it is context.
input_sequences = [tokens[:i+1] for i in range(1, len(tokens))]
max_len = max(len(x) for x in input_sequences)

# Left-pad with zeros so the most recent characters always sit at the end.
padded = pad_sequences(input_sequences, maxlen=max_len, padding='pre')
X, y = padded[:, :-1], tf.keras.utils.to_categorical(padded[:, -1], vocab_size)

# --- Model --------------------------------------------------------------------
# `input_length` is intentionally not passed to Embedding: it is deprecated in
# Keras 3 and the LSTM below does not require a fixed sequence length.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, EMBEDDING_DIM),
    tf.keras.layers.LSTM(LSTM_UNITS),
    tf.keras.layers.Dropout(DROPOUT_RATE),
    tf.keras.layers.Dense(vocab_size, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=EPOCHS, verbose=0)

def generate(seed, chars):
    """Greedily extend ``seed`` by up to ``chars`` characters.

    Each step encodes the text produced so far with the module-level
    ``tokenizer``, left-pads (or left-truncates) it to ``max_len - 1`` tokens,
    and appends the character whose index has the highest predicted
    probability from the module-level ``model``.

    Args:
        seed: Starting text for the generation.
        chars: Maximum number of characters to append.

    Returns:
        ``seed`` followed by the generated characters. Fewer than ``chars``
        characters are appended if the model predicts the padding index (0).
    """
    result = seed
    for _ in range(chars):
        encoded = tokenizer.texts_to_sequences([result])
        token_list = pad_sequences(encoded, maxlen=max_len-1, padding='pre')
        pred_idx = np.argmax(model.predict(token_list, verbose=0), axis=-1)[0]

        if pred_idx == 0:
            # Index 0 is the padding value and has no character. Nothing is
            # appended, so the next input would be identical and yield the
            # same prediction; stop instead of repeating the forward pass.
            break
        result += tokenizer.index_word[pred_idx]
    return result

# Show a 30-character greedy continuation of a short seed taken from the corpus.
print("Generated:", generate('project gut', 30))



Generated: project guteteerrrrraraaaaaaa          te
