In [53]:
import numpy as np
import wikipedia
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences as keras_pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, SimpleRNN, Dense
from keras.callbacks import EarlyStopping
import nltk

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Fetch Wikipedia articles
def fetch_articles(titles):
    wikipedia.set_lang("en")
    articles = {}
    for title in titles:
        articles[title] = wikipedia.page(title).content
    return articles

titles = ['Python (programming language)', 'Python (snake)']
articles = fetch_articles(titles)

# Preprocess text
def preprocess_text(text):
    text = re.sub(r'\W|\d', ' ', text)
    text = text.lower()
    text = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    text = [word for word in text if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(word) for word in text]
    return text

preprocessed_articles = {title: preprocess_text(article) for title, article in articles.items()}

# Character-level preprocessing
def char_level_preprocessing(text):
    text = ' '.join(preprocess_text(text))
    text = text.replace(' ', '_')
    return list(text)

char_tokenized_articles = {title: char_level_preprocessing(article) for title, article in articles.items()}

# Character tokenizer
char_tokenizer = Tokenizer(char_level=True)
char_tokenizer.fit_on_texts(char_tokenized_articles.values())

# Convert text to sequences of characters
char_sequences = char_tokenizer.texts_to_sequences(char_tokenized_articles.values())

# Generate character sequences
def generate_char_sequences(text, seq_length):
    return [(text[i:i+seq_length], text[i+seq_length]) for i in range(len(text) - seq_length)]

char_sequences = [generate_char_sequences(seq, max_sequence_length) for seq in char_sequences]
char_sequences = [item for sublist in char_sequences for item in sublist]

# Separate input and target characters
char_input_sequences = [seq[0] for seq in char_sequences]
char_target_characters = [seq[1] for seq in char_sequences]

# Pad character sequences
char_input_sequences = keras_pad_sequences(char_input_sequences, maxlen=max_sequence_length-1, padding='pre')
char_target_characters = np.array(char_target_characters)

# Word tokenizer
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(preprocessed_articles.values())
word_sequences = word_tokenizer.texts_to_sequences(preprocessed_articles.values())

# Generate word sequences
def generate_word_sequences(text, seq_length):
    return [(text[i:i+seq_length], text[i+seq_length]) for i in range(len(text) - seq_length)]

word_sequences = [generate_word_sequences(seq, max_sequence_length) for seq in word_sequences]
word_sequences = [item for sublist in word_sequences for item in sublist]

# Separate input and target words
word_input_sequences = [seq[0] for seq in word_sequences]
word_target_words = [seq[1] for seq in word_sequences]

# Pad word sequences
word_input_sequences = keras_pad_sequences(word_input_sequences, maxlen=max_sequence_length-1, padding='pre')
word_target_words = np.array(word_target_words)

# Define models
def create_simple_rnn_model(vocab_size, max_sequence_length):
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=100, input_length=max_sequence_length-1),
        SimpleRNN(units=100),
        Dense(vocab_size, activation='softmax')
    ])
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def create_lstm_model(vocab_size, max_sequence_length):
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=100, input_length=max_sequence_length-1),
        LSTM(units=100),
        Dense(vocab_size, activation='softmax')
    ])
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Character-based models
char_vocab_size = len(char_tokenizer.word_index) + 1
char_simple_rnn_model = create_simple_rnn_model(char_vocab_size, max_sequence_length)
char_lstm_model = create_lstm_model(char_vocab_size, max_sequence_length)

# Train character-based models
char_simple_rnn_model.fit(char_input_sequences, char_target_characters, epochs=50, verbose=1)
char_lstm_model.fit(char_input_sequences, char_target_characters, epochs=50, verbose=1)

# Word-based models
word_vocab_size = len(word_tokenizer.word_index) + 1
word_simple_rnn_model = create_simple_rnn_model(word_vocab_size, max_sequence_length)
word_lstm_model = create_lstm_model(word_vocab_size, max_sequence_length)

# Train word-based models
word_simple_rnn_model.fit(word_input_sequences, word_target_words, epochs=50, verbose=1)
word_lstm_model.fit(word_input_sequences, word_target_words, epochs=50, verbose=1)

# Evaluation
def evaluate_model(model, input_sequences, target_words):
    loss, accuracy = model.evaluate(input_sequences, target_words, verbose=0)
    return accuracy

# Evaluate models
char_simple_rnn_accuracy = evaluate_model(char_simple_rnn_model, char_input_sequences, char_target_characters)
char_lstm_accuracy = evaluate_model(char_lstm_model, char_input_sequences, char_target_characters)
word_simple_rnn_accuracy = evaluate_model(word_simple_rnn_model, word_input_sequences, word_target_words)
word_lstm_accuracy = evaluate_model(word_lstm_model, word_input_sequences, word_target_words)

print("Character-based prediction using Simple RNN Accuracy:", char_simple_rnn_accuracy)
print("Character-based prediction using LSTM Accuracy:", char_lstm_accuracy)
print("Word-based prediction using Simple RNN Accuracy:", word_simple_rnn_accuracy)
print("Word-based prediction using LSTM Accuracy:", word_lstm_accuracy)

# Prediction functions
def preprocess_input_text(input_text):
    preprocessed_text = preprocess_text(input_text)
    return preprocessed_text

def generate_input_sequences(preprocessed_text, max_sequence_length, tokenizer):
    input_sequences = tokenizer.texts_to_sequences([preprocessed_text])[0]
    input_sequences = keras_pad_sequences([input_sequences], maxlen=max_sequence_length-1, padding='pre')
    return input_sequences

def predict_next_word_lstm_word_based(input_text, model, max_sequence_length, tokenizer):
    preprocessed_text = preprocess_input_text(input_text)
    input_sequences = generate_input_sequences(preprocessed_text, max_sequence_length, tokenizer)
    predicted_probabilities = model.predict(input_sequences, verbose=0)[0]
    predicted_index = np.argmax(predicted_probabilities)
    predicted_word = tokenizer.index_word[predicted_index]
    return predicted_word

def predict_next_character_lstm_char_based(input_text, model, max_sequence_length, tokenizer, max_chars=100):
    input_text = list(input_text)
    generated_text = input_text

    for _ in range(max_chars):
        input_sequences = tokenizer.texts_to_sequences([''.join(generated_text)])[0]
        input_sequences = keras_pad_sequences([input_sequences], maxlen=max_sequence_length-1, padding='pre')
        predicted_probabilities = model.predict(input_sequences, verbose=0)[0]
        predicted_index = np.argmax(predicted_probabilities)
        predicted_character = tokenizer.index_word.get(predicted_index, '')

        generated_text.append(predicted_character)

        if predicted_character == ' ' or len(generated_text) >= max_chars:
            break

    return ''.join(generated_text).strip()

# Example usage for LSTM word-based prediction
input_text = "Python is a popular"
predicted_word = predict_next_word_lstm_word_based(input_text, word_lstm_model, max_sequence_length, word_tokenizer)
print("Predicted next word:", predicted_word)

# Example usage for LSTM character-based prediction
input_text = "ge"
predicted_sequence = predict_next_character_lstm_char_based(input_text, char_lstm_model, max_sequence_length, char_tokenizer)
print("Predicted sequence:", predicted_sequence)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ali\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ali\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ali\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/5

In [61]:
def predict_next_character_lstm_char_based(input_text, model, max_sequence_length, tokenizer, max_chars=100):
    input_text = list(input_text)
    generated_text = input_text

    for _ in range(max_chars):
        # Convert the generated text to sequences
        input_sequences = tokenizer.texts_to_sequences([''.join(generated_text)])[0]
        input_sequences = keras_pad_sequences([input_sequences], maxlen=max_sequence_length-1, padding='pre')
        
        # Predict the next character
        predicted_probabilities = model.predict(input_sequences, verbose=0)[0]
        predicted_index = np.argmax(predicted_probabilities)
        
        # Get the predicted character
        predicted_character = tokenizer.index_word.get(predicted_index, '')
        
        # Append the predicted character to the generated text
        generated_text.append(predicted_character)

        # Check if the generated text forms a complete word
        generated_word = ''.join(generated_text).strip()
        if generated_word in tokenizer.word_index:
            break

        # Break the loop if a space is detected
        if predicted_character == ' ':
            break

    return generated_word

# Example usage for LSTM character-based prediction
input_text = "lan"
predicted_sequence = predict_next_character_lstm_char_based(input_text, char_lstm_model, max_sequence_length, char_tokenizer)
print("Predicted sequence:", predicted_sequence)



Predicted sequence: lant_switter_two_like_reticulated_python_southeast_support_python_support_optime_name_type_constraint_c
