In [6]:
import gensim.downloader as api
import gensim
from bs4 import BeautifulSoup
import os

In [None]:
# Pretrained word vectors used further down to embed the documents.
# The identifier is the model name passed to gensim's downloader API.
FASTTEXT_MODEL_NAME = 'fasttext-wiki-news-subwords-300'
ft = api.load(FASTTEXT_MODEL_NAME)



In [5]:
# Build one plain-text document per saved Wikipedia talk page (*.html) in a folder.
folder_path = r"wikipedia_talk_pages"


def extract_page_text(html):
    """Turn the HTML of one saved wiki page into a single plain-text string.

    The text is made of, in order:
      * the page title (``<span class="mw-page-title-main">``), when present;
      * every ``h2``, ``h3`` and ``p`` element that is a *direct* child of
        ``div.mw-parser-output`` inside ``div#mw-content-text``.

    Pieces are joined with a blank line ("\\n\\n") between them.

    Args:
        html: Markup of the page as a string.

    Returns:
        The joined text; an empty string when nothing was extracted.
    """
    soup = BeautifulSoup(html, "html.parser")
    parts = []

    # Page title; strip=True removes leading/trailing whitespace.
    title = soup.find("span", class_="mw-page-title-main")
    if title:
        parts.append(title.get_text(strip=True))

    # Go through the content root first so that only the article body is searched.
    root = soup.find("div", id="mw-content-text")
    content = root.find("div", class_="mw-parser-output") if root else None

    if content:
        # recursive=False keeps only direct children (nothing nested deeper).
        for el in content.find_all(["h2", "h3", "p"], recursive=False):
            text = el.get_text(" ", strip=True)
            if text:
                parts.append(text)

    return "\n\n".join(parts)


documents = []
# os.listdir returns entries in arbitrary order; sorting makes the position of each
# document in the list (used as its key later on) reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".html"):
        continue
    file_path = os.path.join(folder_path, filename)
    # Read the file and close it before parsing.
    with open(file_path, "r", encoding="utf-8") as f:
        html = f.read()
    full_text = extract_page_text(html)
    if full_text:  # only add non-empty documents
        documents.append(full_text)

# Guard the summary: indexing documents[0] would raise IndexError on an empty list.
if documents:
    print(f'The number of documents in the list is: {len(documents)}', "Type of documents:", type(documents[0]))
    print("First 500 characters of the first document:\n", documents[0][:500])
else:
    print(f"No non-empty .html documents were found in {folder_path!r}")

The number of documents in the list is: 1225 Type of documents: <class 'str'>
First 500 characters of the first document:
 Hello fellow Wikipedians,

I have just modified one external link on 10th edition of Systema Naturae . Please take a moment to review my edit . If you have any questions, or need the bot to ignore the links, or the page altogether, please visit this simple FaQ for additional information. I made the following changes:

When you have finished reviewing my changes, you may follow the instructions on the template below to fix any issues with the URLs.

This message was posted before February 2018. A


In [8]:
# Build one embedding per document: the mean of the word vectors of its tokens.
# Each vector is stored under the integer position of its text in `documents`.
doc_vectors = gensim.models.keyedvectors.KeyedVectors(ft.vector_size, count=len(documents))
empty_doc_ids = []  # positions of documents left with no tokens after preprocessing
for i, line in enumerate(documents):
    # gensim provides procedures for preprocessing and stopword removal
    text_without_stopwords = gensim.parsing.preprocessing.remove_stopwords(line)
    tokens = gensim.utils.simple_preprocess(text_without_stopwords)
    if tokens:
        # the function get_mean_vector computes the average vector for all tokens
        dv = ft.get_mean_vector(tokens)
    else:
        # get_mean_vector raises ValueError on an empty token list. Store an all-zero
        # vector instead so that key i still lines up with documents[i].
        # NOTE(review): a zero vector has no direction, so similarity scores involving
        # these documents are not meaningful - filter on empty_doc_ids if that matters.
        dv = [0.0] * ft.vector_size
        empty_doc_ids.append(i)
    doc_vectors.add_vector(i, dv)

if empty_doc_ids:
    print(f"{len(empty_doc_ids)} document(s) had no usable tokens and received a zero vector: {empty_doc_ids}")