**Load the Dataset and Tokenize**

In [None]:
# All imports for this cell are grouped at the top rather than scattered
# between the processing steps.
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# 'utf-8-sig' decodes exactly like 'utf-8' but additionally drops a leading
# byte-order mark when the file starts with one. With plain 'utf-8' a BOM
# would stay attached to the first word and end up in the vocabulary as a
# separate token.
with open('/content/1661-0.txt', 'r', encoding='utf-8-sig') as file:
    text = file.read()

# One corpus entry per *line* of the file (not per sentence). Blank lines are
# kept here, but they tokenize to an empty list and therefore contribute no
# training samples below.
corpus = text.lower().split("\n")

# Build the word -> integer-id vocabulary from the corpus. The +1 accounts
# for id 0, which is never assigned to a word and is used as the padding
# value further down.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

# Expand every line into all of its prefixes of length >= 2, e.g. the ids
# [a, b, c] yield [a, b] and [a, b, c]. The last id of each prefix becomes
# the prediction target. The whole corpus is tokenized in a single call
# instead of one call per line.
input_sequences = []
for token_list in tokenizer.texts_to_sequences(corpus):
    for i in range(1, len(token_list)):
        input_sequences.append(token_list[:i+1])

# Fail with a clear message instead of the bare "max() arg is an empty
# sequence" error when the file yields no line with at least two words.
if not input_sequences:
    raise ValueError(
        "No training sequences were produced: the corpus contains no line "
        "with at least two known words."
    )

# Left-pad every prefix with zeros to a common length so the target word is
# always in the last column.
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# Features are all columns but the last; the label is the last column.
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# One-hot encode the labels for use with categorical cross-entropy.
# NOTE(review): this materializes a dense (num_samples, total_words) array,
# which can be very large for a big vocabulary -- watch memory usage.
y = to_categorical(y, num_classes=total_words)


**Build the LSTM Model**


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Input, LSTM, Dense

# Layer sizes a reader may want to tune.
EMBEDDING_DIM = 100
LSTM_UNITS = 150

# Next-word classifier: padded sequence of word ids -> embedding -> LSTM ->
# softmax over the whole vocabulary.
#
# The input shape is declared with an explicit Input layer instead of the
# Embedding `input_length` argument. `input_length` is deprecated in newer
# Keras releases (it is ignored with a warning), which leaves the model
# unbuilt when `model.summary()` is called. An Input layer builds the model
# up front on both older and newer versions.
model = Sequential()
model.add(Input(shape=(max_sequence_len - 1,)))
model.add(Embedding(total_words, EMBEDDING_DIM))
model.add(LSTM(LSTM_UNITS))
model.add(Dense(total_words, activation='softmax'))

# Labels are one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 19, 100)           893200    
                                                                 
 lstm_4 (LSTM)               (None, 150)               150600    
                                                                 
 dense_4 (Dense)             (None, 8932)              1348732   
                                                                 
Total params: 2392532 (9.13 MB)
Trainable params: 2392532 (9.13 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


**Train the Model**

In [None]:
# Fit the network on the padded n-gram prefixes (X) and their one-hot
# next-word targets (y); the returned object is kept in `history`.
history = model.fit(
    x=X,
    y=y,
    epochs=5,
    verbose=1,
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


**Generate Predictions**

In [None]:
import numpy as np

def generate_text(seed_text, next_words, max_sequence_len):
    """Extend ``seed_text`` by repeatedly appending the model's top prediction.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        seed_text: Text to start from; it is tokenized with ``tokenizer``.
        next_words: Maximum number of words to append.
        max_sequence_len: Padded sequence length used to build the training
            data; the model input is padded/truncated to
            ``max_sequence_len - 1`` ids.

    Returns:
        ``seed_text`` followed by the generated words, separated by single
        spaces. Generation stops early if the model predicts an id that has
        no word (e.g. the padding id 0): the seed would not change, so every
        further step would repeat the same prediction.
    """
    for _ in range(next_words):
        token_ids = tokenizer.texts_to_sequences([seed_text])[0]
        model_input = pad_sequences([token_ids], maxlen=max_sequence_len-1, padding='pre')

        # predict() returns one row of class probabilities for the single
        # input row; reduce it to a plain Python int so the lookup below
        # does not depend on comparing against a 1-element array.
        probabilities = model.predict(model_input, verbose=0)
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])

        # Direct id -> word lookup instead of scanning word_index each step.
        output_word = tokenizer.index_word.get(predicted_index)
        if not output_word:
            break
        seed_text += " " + output_word
    return seed_text


# Demo: ask the trained model for the single most likely next word after an
# example phrase and print the extended text.
seed_text = "Produced by an anonymous"
next_words = 1
generated_text = generate_text(
    seed_text=seed_text,
    next_words=next_words,
    max_sequence_len=max_sequence_len,
)
print(f"Generated text: {generated_text}")


Generated text: Produced by an anonymous project
