In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from scipy.sparse import csr_matrix
import numpy as np
import pickle

# Read the raw corpus that the next-word model is trained on
with open('Avatar1.txt', 'r', encoding='utf-8') as myfile:
    mytext = myfile.read()

# The byte-order mark becomes a space; every other unwanted character is
# simply deleted so it never ends up inside a token.
mytext = mytext.replace('\ufeff', ' ')
for unwanted in ('\t', '"', ':', '_', ','):
    mytext = mytext.replace(unwanted, '')

# Fit a word-level vocabulary on the whole corpus
mytokenizer = Tokenizer()
mytokenizer.fit_on_texts([mytext])
# +1 sizes the vocabulary axis one past the largest word index
# (presumably because Keras word indices start at 1 and 0 is used for
# padding -- see the Tokenizer docs).
total_words = len(mytokenizer.word_index) + 1

# For every line, emit each prefix of length >= 2: the last token of a prefix
# is the word to predict and the tokens before it are the context.
encoded_lines = mytokenizer.texts_to_sequences(mytext.split('\n'))
input_sequences = [
    encoded[:end]
    for encoded in encoded_lines
    for end in range(2, len(encoded) + 1)
]

# Left-pad every prefix to the length of the longest one
max_seq_len = max(map(len, input_sequences))
padded = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')
input_seq = np.array(padded)

# Split each padded row into context (all but the last column) and target
x, y = input_seq[:, :-1], input_seq[:, -1]

# One-hot targets stored sparsely: a single 1.0 per row at column y[row]
num_rows = len(y)
y_sparse = csr_matrix(
    (np.ones(num_rows), (np.arange(num_rows), y)),
    shape=(num_rows, total_words),
)

# Embedding -> LSTM -> softmax over the vocabulary
model = Sequential([
    Embedding(total_words, 100, input_length=max_seq_len - 1),
    LSTM(150),
    Dense(total_words, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Use batch generator to fit the model
def batch_generator(x, y_sparse, batch_size=32):
    """Yield reshuffled ``(x_batch, y_batch)`` mini-batches forever.

    Each pass over the data draws a fresh random ordering of the row indices
    and walks it in chunks of ``batch_size``; the last chunk of a pass may be
    smaller. Only the rows of the current batch are converted from sparse to
    dense, so the full dense target matrix is never materialised.

    Args:
        x: Array indexable by an integer index array along axis 0.
        y_sparse: Sparse matrix whose rows line up with the rows of ``x``;
            selected rows are densified with ``.toarray()``.
        batch_size: Maximum number of rows per yielded batch.

    Yields:
        Tuple ``(x_batch, y_batch)`` where ``y_batch`` is a dense array.

    Raises:
        ValueError: If ``x`` has no rows or ``batch_size`` is smaller than 1.
            Because this is a generator, the error surfaces on the first
            ``next()`` call rather than at construction time.
    """
    num_samples = x.shape[0]
    # Without these guards the inner loop would be empty and the outer
    # ``while True`` would spin forever without ever yielding.
    if num_samples == 0:
        raise ValueError("batch_generator requires at least one sample")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    while True:
        indices = np.arange(num_samples)
        np.random.shuffle(indices)
        for start in range(0, num_samples, batch_size):
            batch_indices = indices[start:start + batch_size]
            yield x[batch_indices], y_sparse[batch_indices].toarray()

batch_size = 32
# Ceiling division: the generator emits ceil(n / batch_size) batches per pass
# (the last one may be partial), so this makes one epoch equal one full pass.
# Floor division would give 0 steps when there are fewer samples than
# batch_size, which is not a usable step count for an endless generator.
steps_per_epoch = (x.shape[0] + batch_size - 1) // batch_size

# The generator never terminates, so steps_per_epoch is what delimits an epoch
model.fit(
    batch_generator(x, y_sparse, batch_size),
    steps_per_epoch=steps_per_epoch,
    epochs=5,
    verbose=1,
)

# Save the model
model.save('next_word_predictor.h5')

# Save the tokenizer so inference can reuse the exact same vocabulary
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(mytokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Function to predict the next words and generate multiple suggestions
def predict_next_words(model, tokenizer, text, max_seq_len, num_suggestions=3):
    """Return ``text`` extended with each of the most probable next words.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            per-index scores for each input row.
        tokenizer: Object providing ``texts_to_sequences`` and a
            ``word_index`` mapping from word to integer index.
        text: Seed text to continue.
        max_seq_len: Sequence length used at training time; the encoded seed
            is left-padded (or truncated) to ``max_seq_len - 1`` tokens.
        num_suggestions: Maximum number of suggestions to return.

    Returns:
        List of strings ``text + " " + word``, highest-scoring word first.
        Indices that have no vocabulary entry are skipped, so the list can be
        shorter than ``num_suggestions``; it is empty when
        ``num_suggestions`` is smaller than 1.
    """
    if num_suggestions < 1:
        # Slicing with [-0:] would select the whole vocabulary, not nothing.
        return []

    token_list = tokenizer.texts_to_sequences([text])[0]
    token_list = pad_sequences([token_list], maxlen=max_seq_len - 1, padding='pre')
    predicted_probs = model.predict(token_list, verbose=0)[0]

    # Invert the vocabulary once instead of scanning it for every suggestion.
    # setdefault keeps the first word seen for an index, matching a
    # first-match linear scan.
    index_to_word = {}
    for word, idx in tokenizer.word_index.items():
        index_to_word.setdefault(idx, word)

    suggestions = []
    # Walk candidates from most to least probable, skipping indices without a
    # word (e.g. the padding index) so no empty suggestion is produced.
    for index in np.argsort(predicted_probs)[::-1]:
        output_word = index_to_word.get(int(index))
        if output_word is None:
            continue
        suggestions.append(text + " " + output_word)
        if len(suggestions) == num_suggestions:
            break

    return suggestions




Epoch 1/5




5547/5547 ━━━━━━━━━━━━━━━━━━━━ 1440s 259ms/step - accuracy: 0.0675 - loss: 7.4461
Epoch 2/5
5547/5547 ━━━━━━━━━━━━━━━━━━━━ 44498s 8s/step - accuracy: 0.1246 - loss: 6.2130
Epoch 3/5
5547/5547 ━━━━━━━━━━━━━━━━━━━━ 2169s 391ms/step - accuracy: 0.1543 - loss: 5.6561
Epoch 4/5
5547/5547 ━━━━━━━━━━━━━━━━━━━━ 906s 163ms/step - accuracy: 0.1773 - loss: 5.1647
Epoch 5/5
5547/5547 ━━━━━━━━━━━━━━━━━━━━ 1604s 289ms/step - accuracy: 0.2075 - loss: 4.6827


