In [None]:
from bs4 import BeautifulSoup
import os
from sklearn.feature_extraction.text import CountVectorizer
from pathlib import Path

After downloading the data from https://moodle.helsinki.fi/pluginfile.php/6645850/mod_page/content/19/enwiki-20181001-corpus.100-articles.txt, I split it into individual articles with BeautifulSoup by selecting the `<article>` tags. I then converted each bs4 element to a string and collected the strings in the `documents` list created in the cell below.

In [None]:
# Optional cell: run it only if you want to use the
# enwiki-20181001-corpus.100-articles.txt file.

# Read the whole corpus file into memory as one string.
with open("enwiki-20181001-corpus.100-articles.txt", "r", encoding="utf-8") as f:
    document = f.read()

# Each article in the corpus sits inside its own <article> tag.
soup = BeautifulSoup(document, 'html.parser')
articles = soup.find_all('article')
print(f'The number of articles in the document is: {len(articles)}')

# Turn every bs4 element into a plain string (the surrounding tag is kept).
documents = [str(article) for article in articles]

first_document = documents[0]
print(
    f'The number of documents in the list is: {len(documents)}',
    "Type of documents:",
    type(first_document),
)
print("First 500 characters of the first document:\n", first_document[:500])

The number of articles in the document is: 100
The number of documents in the list is: 100 Type of documents: <class 'str'>
First 500 characters of the first document:
 <article name="Anarchism">
Anarchism is a political philosophy that advocates self-governed societies based on voluntary institutions. These are often described as stateless societies, although several authors have defined them more specifically as institutions based on non-hierarchical or free associations. Anarchism holds the state to be undesirable, unnecessary and harmful. According to Peter Kropotkin, Godwin was "the first to formulate the political and economical conceptions of anarchism, 


Now it's time to adapt the code to the HTML pages we scraped from Wikipedia:

In [6]:
# Processing multiple HTML files in a folder
folder_path = r"wikipedia_talk_pages"


def extract_talk_page_text(html):
    """Return the text of one saved wiki page as a single string.

    The result is built from, in order:
      * the page title, taken from the first <span class="mw-page-title-main">
        (skipped when the page has no such element);
      * every non-empty <h2>, <h3> and <p> that is a direct child of
        <div class="mw-parser-output"> inside <div id="mw-content-text">.

    The pieces are joined with a blank line between them. An empty string is
    returned when nothing could be extracted.
    """
    soup = BeautifulSoup(html, "html.parser")
    result = []

    # page title
    title = soup.find("span", class_="mw-page-title-main")
    if title:
        result.append(title.get_text(strip=True))  # strip removes leading/trailing whitespace

    # main content: look up the content container first so that the search for
    # the parser output is limited to it
    root = soup.find("div", id="mw-content-text")
    content = root.find("div", class_="mw-parser-output") if root else None

    if content:
        # recursive=False: only direct children, not elements nested deeper
        for el in content.find_all(["h2", "h3", "p"], recursive=False):
            text = el.get_text(" ", strip=True)
            if text:
                result.append(text)

    # every wiki page is stored as a single string with double newlines between sections
    return "\n\n".join(result)


documents = []
# os.listdir() returns entries in arbitrary order; sorting keeps the document
# indices (and therefore the search results) reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".html"):
        continue
    file_path = os.path.join(folder_path, filename)
    # Read first, parse afterwards, so the file handle is released right away.
    with open(file_path, "r", encoding="utf-8") as f:
        html = f.read()
    full_text = extract_talk_page_text(html)
    if full_text:   # only add non-empty documents
        documents.append(full_text)

if documents:
    print(f'The number of documents in the list is: {len(documents)}', "Type of documents:", type(documents[0]))
    print("First 500 characters of the first document:\n", documents[0][:500])
else:
    # Without this guard documents[0] would raise IndexError on an empty result.
    print(f"No non-empty .html documents were found in {folder_path!r}")

The number of documents in the list is: 1225 Type of documents: <class 'str'>
First 500 characters of the first document:
 Hello fellow Wikipedians,

I have just modified one external link on 10th edition of Systema Naturae . Please take a moment to review my edit . If you have any questions, or need the bot to ignore the links, or the page altogether, please visit this simple FaQ for additional information. I made the following changes:

When you have finished reviewing my changes, you may follow the instructions on the template below to fix any issues with the URLs.

This message was posted before February 2018. A


In [7]:
# Replacement table used when a query is rewritten into a Python expression.
# A token found here is swapped for the mapped text; every other token is
# interpreted as a search term and fed through td_matrix[t2i["..."]].
d = {
    # and / AND become the & operator
    "and": "&",
    "AND": "&",
    # or / OR become the | operator
    "or": "|",
    "OR": "|",
    # not / NOT become the prefix "1 -"
    "not": "1 -",
    "NOT": "1 -",
    # parentheses are left untouched
    "(": "(",
    ")": ")",
}

In [8]:
# rewrite tokens
def rewrite_token(t, td_matrix, t2i):
    """Translate one query token into a fragment of a Python expression.

    Tokens listed in the module-level table ``d`` (operators and parentheses)
    are replaced by their mapped text. Any other token is treated as a search
    term and becomes a lookup expression, e.g. ``is`` is turned into
    ``td_matrix[t2i["is"]]``.

    Args:
        t: a single whitespace-free token of the query.
        td_matrix: not read here; it is only referred to by name in the
            returned text.
        t2i: not read here; it is only referred to by name in the
            returned text.

    Returns:
        The replacement text for ``t`` as a string.
    """
    if t in d:
        return d[t]
    # The result is later executed with eval(), so the term must stay inside
    # its string literal: escape backslashes first, then double quotes.
    # Without this a token containing a double quote would end the literal
    # early and the rest of the token would run as code.
    escaped = t.replace("\\", "\\\\").replace('"', '\\"')
    return 'td_matrix[t2i["{:s}"]]'.format(escaped)


def rewrite_query(query, td_matrix, t2i):
    """Rewrite a whole query, token by token, into one expression string.

    The query is split on whitespace, each token is passed through
    rewrite_token(), and the results are joined with single spaces.
    """
    rewritten_tokens = []
    for token in query.split():
        rewritten_tokens.append(rewrite_token(token, td_matrix, t2i))
    return " ".join(rewritten_tokens)


def test_query(query, td_matrix, t2i, documents):
    """Run one Boolean query and print the documents that match it.

    The query is rewritten with rewrite_query() into a Python expression over
    ``td_matrix`` and ``t2i`` and evaluated once. The function prints the
    query, its rewritten form, the resulting match matrix and the first 500
    characters of every matching document.

    Args:
        query: the query text, e.g. ``"cat and not dog"``.
        td_matrix: matrix indexed as ``td_matrix[t2i[term]]``; the second
            axis of the result is used as the index into ``documents``.
        t2i: mapping from term to its row index in ``td_matrix``.
        documents: list of document strings.

    Returns:
        None. A query that contains an unknown term or that cannot be parsed
        is reported with a message instead of raising.
    """
    print("Query: '" + query + "'")
    rewritten = rewrite_query(query, td_matrix, t2i)
    print("Rewritten:", rewritten)

    # NOTE(security): eval() executes the rewritten query as Python code.
    # Only the two names the rewritten query refers to are exposed, and the
    # builtins are removed, to limit what a crafted query could reach.
    try:
        hits_matrix = eval(rewritten, {"__builtins__": {}},
                           {"td_matrix": td_matrix, "t2i": t2i})
    except KeyError as unknown_term:
        # t2i has no entry for a term that does not occur in any document.
        print("Unknown term {}: it does not occur in any document, "
              "so the query cannot be evaluated.".format(unknown_term))
        return
    except SyntaxError:
        # e.g. an empty query, a dangling operator or unbalanced parentheses
        print("The query could not be parsed; check operators and parentheses.")
        return

    print("Matching:", hits_matrix)
    # nonzero()[1] holds the column positions of the non-zero entries,
    # which are the indices of the matching documents
    hits_list = list(hits_matrix.nonzero()[1])
    # prints the first 500 characters of the matching document
    for i, doc_idx in enumerate(hits_list):
        print("Matching doc #{:d}: {:s}".format(i, documents[doc_idx][:500]))

In [None]:
def main():
    """Index the global ``documents`` list and run an interactive search loop.

    A binary CountVectorizer turns the documents into a matrix with one row
    per document and one column per term. Its dense transpose (one row per
    term) and the vocabulary mapping are then handed to test_query() for
    every query typed by the user. Typing ``q`` ends the loop.
    """
    cv = CountVectorizer(lowercase=True, binary=True)
    sparse_matrix = cv.fit_transform(documents)

    # fit_transform() yields documents as rows and terms as columns,
    # i.e. a document-term matrix; it only becomes term-document after .T
    print("Document-term matrix (sparse):\n")
    print(sparse_matrix)

    dense_matrix = sparse_matrix.todense()

    print("Document-term matrix (dense):\n")
    print(dense_matrix)

    td_matrix = dense_matrix.T  # .T transposes the matrix

    print("Term-document matrix:\n")
    print(td_matrix)

    t2i = cv.vocabulary_
    # The full mapping has one entry per term; show its size and a small
    # sample instead of flooding the cell output with the whole dictionary.
    print("Vocabulary size:", len(t2i))
    print("Sample of the term -> index mapping:", dict(list(t2i.items())[:10]))

    while True:
        query = input("Search for something. If you want to stop your search "
                      "type 'q'. Search: ")
        # strip() so that stray spaces around 'q' or the query do not matter;
        # lower() because the vectorizer lowercases all terms
        query = query.strip().lower()

        if query == "q":
            break
        if not query:
            # An empty query would be rewritten to an empty expression,
            # which cannot be evaluated; just ask again.
            continue

        print(query)
        # because td_matrix and t2i are defined in main(), also pass these
        # to other functions
        test_query(query, td_matrix, t2i, documents)

main()

Term-document matrix: (?)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 285952 stored elements and shape (1225, 34795)>
  Coords	Values
  (0, 15404)	1
  (0, 13051)	1
  (0, 33801)	1
  (0, 15272)	1
  (0, 17804)	1
  (0, 20911)	1
  (0, 22696)	1
  (0, 12615)	1
  (0, 19005)	1
  (0, 22684)	1
  (0, 220)	1
  (0, 11450)	1
  (0, 22553)	1
  (0, 30516)	1
  (0, 21605)	1
  (0, 24406)	1
  (0, 30602)	1
  (0, 20966)	1
  (0, 31406)	1
  (0, 26793)	1
  (0, 21419)	1
  (0, 11445)	1
  (0, 16219)	1
  (0, 34313)	1
  (0, 4034)	1
  :	:
  (1224, 32556)	1
  (1224, 17785)	1
  (1224, 854)	1
  (1224, 7451)	1
  (1224, 26533)	1
  (1224, 6411)	1
  (1224, 317)	1
  (1224, 32681)	1
  (1224, 26531)	1
  (1224, 4614)	1
  (1224, 17772)	1
  (1224, 18801)	1
  (1224, 31886)	1
  (1224, 18209)	1
  (1224, 973)	1
  (1224, 22974)	1
  (1224, 837)	1
  (1224, 12781)	1
  (1224, 0)	1
  (1224, 28217)	1
  (1224, 7439)	1
  (1224, 23439)	1
  (1224, 10836)	1
  (1224, 29200)	1
  (1224, 34457)	1
Term-document matrix: (?)

[[0 0 0 ..