In [15]:
pip install wikipedia-api



In [16]:
import wikipediaapi
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np

In [17]:
def get_formatted_lines(page_title):
    """Fetch an English Wikipedia page and return its non-blank text lines.

    The lead summary is collected first, followed by the text of every
    section in document order, descending into nested subsections.

    Args:
        page_title: Title of the Wikipedia page to fetch.

    Returns:
        A list of stripped, non-empty lines. An empty list is returned
        (after printing "Page not found.") when the page does not exist.
    """
    user_agent = "Mozilla/5.0 (https://github.com/brittojo7n/ProcessExplorer)"
    headers = {'User-Agent': user_agent}

    wiki_wiki = wikipediaapi.Wikipedia('en', headers=headers)
    page = wiki_wiki.page(page_title)

    if not page.exists():
        print("Page not found.")
        return []

    def _clean_lines(text):
        # Keep only lines with visible content, without surrounding whitespace.
        return [line.strip() for line in text.split('\n') if line.strip()]

    # The lead text is exposed through page.summary rather than as a section,
    # so iterating over the sections alone would drop it.
    formatted_lines = _clean_lines(page.summary or "")

    # Walk the section tree with an explicit stack; children are pushed in
    # reverse so that pops come out in document order (parent, then its
    # subsections, then the next sibling).
    pending = list(reversed(page.sections))
    while pending:
        section = pending.pop()
        formatted_lines.extend(_clean_lines(section.text))
        pending.extend(reversed(section.sections))

    return formatted_lines

In [18]:
def preprocess_data(lines):
    """Turn text lines into next-word-prediction training arrays.

    Every line is tokenized and expanded into all of its prefixes of
    length two or more; each prefix is left-padded to a common length and
    split into its leading tokens (input) and its final token (target).

    Args:
        lines: Iterable of text lines to learn from.

    Returns:
        A tuple ``(X, y, total_words, max_sequence_length, tokenizer)``:
        ``X`` holds the padded prefixes without their last token, ``y`` the
        last token of each prefix, ``total_words`` is the vocabulary size
        plus one, ``max_sequence_length`` is the longest prefix length, and
        ``tokenizer`` is the fitted tokenizer.

    Raises:
        ValueError: If no line produces at least two tokens, so that no
            training sequence can be built.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    # One extra slot so that index 0 (the fill value used when padding below)
    # stays distinct from every word index.
    total_words = len(tokenizer.word_index) + 1

    # Tokenize all lines in one call, then emit every prefix of length >= 2.
    input_sequences = [
        tokenized_line[:end]
        for tokenized_line in tokenizer.texts_to_sequences(lines)
        for end in range(2, len(tokenized_line) + 1)
    ]

    if not input_sequences:
        # Without this guard the max() below fails with an opaque
        # "empty sequence" error that hides the real cause.
        raise ValueError(
            "No training sequences could be built: "
            "provide at least one line containing two or more words."
        )

    max_sequence_length = max(len(seq) for seq in input_sequences)
    input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

    # The last column is the word to predict; everything before it is the input.
    X, y = input_sequences[:, :-1], input_sequences[:, -1]

    return X, y, total_words, max_sequence_length, tokenizer

In [19]:
def build_model(total_words, max_sequence_length, embedding_dim=50, lstm_units=100):
    """Build and compile the stacked-LSTM next-word classifier.

    The network is an embedding layer, two LSTM layers, and a softmax
    layer with one output per vocabulary index.

    Args:
        total_words: Number of output classes and size of the embedding table.
        max_sequence_length: Length of the padded training sequences
            including the target token; the model input is one shorter.
        embedding_dim: Size of each word embedding vector.
        lstm_units: Number of units in each of the two LSTM layers.

    Returns:
        The compiled model, using sparse categorical cross-entropy loss,
        the Adam optimizer, and accuracy as a metric.
    """
    model = Sequential()
    # Inputs exclude the final (target) token, hence the "- 1".
    model.add(Embedding(total_words, embedding_dim, input_length=max_sequence_length-1))
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    model.add(LSTM(lstm_units, return_sequences=True))
    model.add(LSTM(lstm_units))
    model.add(Dense(total_words, activation='softmax'))

    # Sparse loss: targets are integer word indices rather than one-hot vectors.
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [20]:
def train_model(model, X, y, epochs=10, batch_size=32):
    """Fit ``model`` on the training arrays and return the fit result.

    Args:
        model: Compiled model exposing a ``fit`` method.
        X: Input sequences.
        y: Target word indices, one per row of ``X``.
        epochs: Number of passes over the training data.
        batch_size: Number of samples per gradient update.

    Returns:
        Whatever ``model.fit`` returns, so callers can inspect the training
        run instead of losing it.
    """
    return model.fit(X, y, epochs=epochs, batch_size=batch_size)

In [23]:
def generate_content(model, tokenizer, seed_text, max_sequence_length, num_words=50):
    """Greedily extend ``seed_text`` one predicted word at a time.

    At each step the current text is tokenized, left-padded to the model's
    input length, and the highest-probability index is appended as a word.

    Args:
        model: Trained model exposing ``predict``.
        tokenizer: Fitted tokenizer exposing ``word_index`` and
            ``texts_to_sequences``.
        seed_text: Text to start generating from.
        max_sequence_length: Padded sequence length used during training
            (including the target token); inputs are padded to one less.
        num_words: Maximum number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, separated by single
        spaces. Generation stops early if the predicted index has no word
        in the vocabulary (for example the padding index 0).
    """
    # Reverse lookup built once, so each step is a dict access instead of a
    # scan over the whole vocabulary.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)[0]
        predicted_index = int(np.argmax(predicted_probs))

        output_word = index_to_word.get(predicted_index)
        if output_word is None:
            # Nothing to append means the input would not change, so the
            # same prediction would repeat; stop rather than pile up spaces.
            break
        seed_text += " " + output_word

    return seed_text

In [24]:
if __name__ == "__main__":
    # End-to-end demo: fetch a page, train a next-word model on it, then
    # generate text from a short seed.
    page_title = "Python (programming language)"
    lines = get_formatted_lines(page_title)

    if not lines:
        # get_formatted_lines returns [] when the page does not exist; with
        # no text there is nothing to train on, so stop here instead of
        # failing inside preprocessing.
        print(f"No text retrieved for {page_title!r}; skipping training.")
    else:
        X, y, total_words, max_sequence_length, tokenizer = preprocess_data(lines)

        model = build_model(total_words, max_sequence_length)
        train_model(model, X, y)

        seed_text = "Python is"
        generated_content = generate_content(model, tokenizer, seed_text, max_sequence_length)
        print(generated_content)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Python is python and python and python and python and python and python and python and python and python and python and python and python and python and python and python and python and python and python and python and python and python and python and python and python and python and
