In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import re

def file_to_sentence_list(file_path):
    """Read a UTF-8 text file and return its sentences as a list of strings.

    The text is split wherever whitespace follows a '.', '!' or '?', so the
    terminating punctuation stays attached to its sentence. Each piece is
    stripped of surrounding whitespace and empty pieces are dropped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of sentence strings. An empty list is returned (after printing
        an error message) when the file does not exist or cannot be decoded.
    """
    try:
        # Bytes that are not valid UTF-8 are dropped (errors='ignore').
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            raw_text = handle.read()
    except FileNotFoundError:
        print("Error: File not found. Please check the file path.")
        return []
    except UnicodeDecodeError:
        print("Error: Unable to decode the file. Please check the file encoding.")
        return []

    # The lookbehind keeps the sentence-ending punctuation with its sentence.
    pieces = re.split(r'(?<=[.!?])\s+', raw_text)
    trimmed = (piece.strip() for piece in pieces)
    return [piece for piece in trimmed if piece]

# Load the corpus as a list of sentences; each sentence is one training line
file_path = 'BDcricket.txt'
text_data = file_to_sentence_list(file_path)

if not text_data:
    raise ValueError("No text data found. Please ensure the file is valid.")

# Build the vocabulary. The +1 accounts for index 0, which the Tokenizer
# never assigns to a word and which pad_sequences uses as the padding value.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
total_words = len(tokenizer.word_index) + 1

# Expand every sentence into its n-gram prefixes of length >= 2:
# [w1, w2], [w1, w2, w3], ... The last token of each prefix is the label.
input_sequences = []
for token_list in tokenizer.texts_to_sequences(text_data):
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

# Without this guard, max() below fails with the unhelpful
# "max() arg is an empty sequence" when no sentence has two or more tokens.
if not input_sequences:
    raise ValueError(
        "No training sequences could be built. "
        "The text needs at least one sentence with two or more words."
    )

# Left-pad every prefix to a common length, then split off the final token
# of each row as the label and keep the rest as the predictor.
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# Convert target data to one-hot encoding (matches categorical_crossentropy)
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# Define the model: token ids -> 50-d embeddings -> LSTM(128) -> softmax over
# the vocabulary.
model = Sequential()
# The input shape is declared with an explicit Input layer instead of
# Embedding's `input_length` argument, which is deprecated in Keras 3 and
# triggers a warning there. Each sample is max_sequence_len - 1 token ids.
model.add(tf.keras.Input(shape=(max_sequence_len - 1,)))
model.add(Embedding(total_words, 50))
model.add(LSTM(128))
model.add(Dense(total_words, activation='softmax'))
# categorical_crossentropy expects one-hot targets with total_words classes
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
epochs = 100  # You can make this configurable
batch_size = 64  # You can make this configurable
model.fit(X, y, epochs=epochs, batch_size=batch_size, verbose=1)

# User input for seed text and number of words to predict
seed_text = input("Enter the seed text: ")  # Get seed text from the user
num_words_to_predict = int(input("Enter the number of words to predict: "))  # Get number of words to predict from the user

# Greedily extend the seed text one word at a time: re-tokenize the growing
# text, pad/truncate it to the model's input length, and append the most
# probable next word.
for _ in range(num_words_to_predict):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
    predicted_probs = model.predict(token_list, verbose=0)  # Set verbose to 0 for prediction
    # Class 0 is the padding index and has no entry in tokenizer.index_word,
    # so looking it up would raise KeyError. Take the argmax over classes
    # 1..total_words-1 only, then shift back to the real word index.
    predicted_index = int(np.argmax(predicted_probs[0][1:])) + 1
    predicted_word = tokenizer.index_word[predicted_index]
    seed_text += " " + predicted_word

print("Next predicted words:", seed_text)


Epoch 1/100
33/33 ━━━━━━━━━━━━━━━━━━━━ 2s 14ms/step - accuracy: 0.0378 - loss: 6.4904
Epoch 2/100
33/33 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - accuracy: 0.0566 - loss: 5.7283
Epoch 3/100
33/33 ━━━━━━━━━━━━━━━━━━━━ 1s 16ms/step - accuracy: 0.0568 - loss: 5.7695
Epoch 4/100
33/33 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step - accuracy: 0.0588 - loss: 5.6798
Epoch 5/100
33/33 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step - accuracy: 0.0648 - loss: 5.6094
Epoch 6/100
33/33 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step - accuracy: 0.0598 - loss: 5.5765
Epoch 7/100
33/33 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step - accuracy: 0.0606 - loss: 5.5004
Epoch 8/100
33/33 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step - accuracy: 0.0586 - loss: 5.4266
Epoch 9/100
33/33 ━━━━━━━━━ … [output truncated]

Enter the seed text:  Mustafizur Rahman, also known as
Enter the number of words to predict:  10


Next predicted words: Mustafizur Rahman, also known as a role in shaping bangladesh’s national identity a source of
