In [1]:
from bs4 import BeautifulSoup
import os
from sklearn.feature_extraction.text import CountVectorizer
from pathlib import Path
import numpy as np

After downloading the data from https://moodle.helsinki.fi/pluginfile.php/6645850/mod_page/content/19/enwiki-20181001-corpus.100-articles.txt, I split it into individual articles with BeautifulSoup by selecting the `<article>` tags. Then I converted the bs4 elements to strings and appended them to the `documents` list created just before the loop.

In [2]:
# Optional cell: run it only if you want to work with the
# enwiki-20181001-corpus.100-articles.txt sample corpus.

# Read the whole corpus file into memory as one string.
with open("enwiki-20181001-corpus.100-articles.txt", "r", encoding="utf-8") as f:
    document = f.read()

# Every <article> element becomes one document.
soup = BeautifulSoup(document, 'html.parser')
articles = soup.find_all('article')
print(f'The number of articles in the document is: {len(articles)}')

# str() serialises the bs4 element, so the surrounding <article ...> tag is
# kept as part of the document text.
documents = [str(article) for article in articles]

# Guard the preview: indexing documents[0] on an empty list would raise an
# IndexError that hides the real problem (no <article> tags in the file).
if documents:
    print(f'The number of documents in the list is: {len(documents)}', "Type of documents:", type(documents[0]))
    print("First 500 characters of the first document:\n", documents[0][:500])
else:
    print("No <article> elements were found in the corpus file.")

The number of articles in the document is: 100
The number of documents in the list is: 100 Type of documents: <class 'str'>
First 500 characters of the first document:
 <article name="Anarchism">
Anarchism is a political philosophy that advocates self-governed societies based on voluntary institutions. These are often described as stateless societies, although several authors have defined them more specifically as institutions based on non-hierarchical or free associations. Anarchism holds the state to be undesirable, unnecessary and harmful. According to Peter Kropotkin, Godwin was "the first to formulate the political and economical conceptions of anarchism, 


Now it's time to adapt the code to the HTML pages we scraped from Wikipedia:

In [3]:
# Processing multiple HTML files in a folder
folder_path = r"wikipedia_talk_pages"
documents = []

# os.listdir() returns names in arbitrary order; sorting keeps the document
# indices (and therefore the search results) stable between runs and machines.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".html"):
        continue

    file_path = os.path.join(folder_path, filename)
    # Read first, parse afterwards, so the file handle is released right away.
    with open(file_path, "r", encoding="utf-8") as f:
        html = f.read()
    soup = BeautifulSoup(html, "html.parser")
    result = []

    # page title
    title = soup.find("span", class_="mw-page-title-main")
    if title:
        result.append(title.get_text(strip=True))  # strip removes leading/trailing whitespace

    # main content: locate the content root first so we only search inside
    # the content section; `content` stays None when either div is missing
    root = soup.find("div", id="mw-content-text")
    content = root.find("div", class_="mw-parser-output") if root else None

    if content:
        # recursive=False restricts the search to direct children, so headings
        # and paragraphs nested deeper in the page are skipped
        for el in content.find_all(["h2", "h3", "p"], recursive=False):
            text = el.get_text(" ", strip=True)
            if text:
                result.append(text)

    # every wiki page is stored as a single string with double newlines between sections
    full_text = "\n\n".join(result)
    if full_text:  # only add non-empty documents
        documents.append(full_text)

# Guard the preview: documents[0] would raise IndexError if the folder held
# no usable .html files.
if documents:
    print(f'The number of documents in the list is: {len(documents)}', "Type of documents:", type(documents[0]))
    print("First 500 characters of the first document:\n", documents[0][:500])
else:
    print(f"No non-empty HTML documents were found in {folder_path!r}")

The number of documents in the list is: 1225 Type of documents: <class 'str'>
First 500 characters of the first document:
 Hello fellow Wikipedians,

I have just modified one external link on 10th edition of Systema Naturae . Please take a moment to review my edit . If you have any questions, or need the bot to ignore the links, or the page altogether, please visit this simple FaQ for additional information. I made the following changes:

When you have finished reviewing my changes, you may follow the instructions on the template below to fix any issues with the URLs.

This message was posted before February 2018. A


In [4]:
# Translation table from query tokens to the pieces of the Python expression
# that is later passed to eval():
#   and / AND  ->  &
#   or  / OR   ->  |
#   not / NOT  ->  1 -
#   ( and )    ->  kept exactly as they are
# Any token that is not a key of this table is treated as a search term and
# looked up through td_matrix[t2i["..."]].
d = {
    spelling: symbol
    for keyword, symbol in (("and", "&"), ("or", "|"), ("not", "1 -"))
    for spelling in (keyword, keyword.upper())  # lower- and upper-case forms
}
d.update({"(": "(", ")": ")"})  # operator replacements for parentheses

In [5]:
def debug_print(*args):
    """Debug-output hook that is silent by default.

    Accepts any positional arguments and discards them. To see the debug
    output while developing, replace the ``return`` below with
    ``print(*args)``.
    """
    return None

In [6]:
# rewrite tokens
def rewrite_token(t):
    """Translate one query token into a fragment of a Python expression.

    Args:
        t: A single whitespace-delimited token of the query.

    Returns:
        The replacement from the operator table `d` when `t` is an operator
        or a parenthesis. Otherwise an expression string that selects the
        term's row of `td_matrix`, or `empty_row` when the term is not in
        `t2i` (a term that occurs in no document matches nothing).
    """
    if t in d:
        return d[t]

    # The fragment is evaluated with eval() later on. repr() yields a properly
    # quoted and escaped string literal, so a term containing quotes or
    # backslashes cannot break out of the literal and change the expression.
    term = repr(t)
    return f'(td_matrix[t2i[{term}]] if {term} in t2i else empty_row)'

def rewrite_query(query):
    """Rewrite a whole query into a Python expression string.

    Args:
        query: The query text, e.g. "(cat or dog) and not witch".

    Returns:
        The rewritten tokens joined by single spaces.
    """
    # Pad parentheses with spaces so they become tokens of their own:
    # "(cat or dog)" must tokenise exactly like "( cat or dog )". Without
    # this, "(cat" and "dog)" would be looked up as (non-existent) terms.
    spaced = query.replace("(", " ( ").replace(")", " ) ")

    # Rewrite every token in the query
    return " ".join(rewrite_token(t) for t in spaced.split())

def test_query(query, td_matrix, t2i, documents):
    """Run a Boolean query and print the matching documents.

    Args:
        query: Query text; it is turned into a Python expression by
            rewrite_query().
        td_matrix: Term-document matrix, indexed as td_matrix[term_index].
            The hit extraction reads nonzero()[1], so a row must stay
            two-dimensional (as with the np.matrix built in main()).
        t2i: Mapping from term to its row index in `td_matrix`.
        documents: Document texts, indexed by column of `td_matrix`.

    Returns:
        None. Prints the number of hits and the first 500 characters of each
        matching document, or an error message for a malformed query.
    """
    # Generate a row of all zeroes for queries containing words not in our
    # dictionary
    empty_row = np.matrix(np.repeat(0, td_matrix.shape[1]))

    rewritten = rewrite_query(query)
    debug_print("Query: '" + query + "'")
    debug_print("Rewritten:", rewritten)

    # NOTE(security): eval() runs text derived from user input as Python code.
    # The namespace below exposes only the three names the rewritten
    # expression needs and removes the builtins; this limits, but does not
    # eliminate, what a crafted query can do.
    namespace = {
        "__builtins__": {},
        "td_matrix": td_matrix,
        "t2i": t2i,
        "empty_row": empty_row,
    }
    try:
        eval_result = eval(rewritten, namespace)
    except (SyntaxError, TypeError, NameError) as err:
        # e.g. "cat and" (dangling operator) or "cat dog" (missing operator)
        print(f"Could not evaluate query '{query}': {err}")
        return
    debug_print("Matching:", eval_result)

    # A query such as "( )" evaluates to something that is not a matrix row
    if not isinstance(eval_result, np.ndarray):
        print(f"Could not evaluate query '{query}': it contains no search term")
        return

    # Finding the matching documents: column indices of the non-zero entries
    hits_matrix = eval_result
    hits_list = list(hits_matrix.nonzero()[1])

    print(f"Found {len(hits_list)} results")

    # Prints the first 500 characters of the matching document
    for i, doc_idx in enumerate(hits_list):
        print(f"Matching doc #{i}: {documents[doc_idx][:500]}")

In [11]:
def main():
    """Build the term-document matrix and run an interactive search loop.

    Reads the module-level `documents` list, vectorizes it with a binary,
    lower-casing CountVectorizer and then keeps asking for queries until the
    user types 'q' (or the input stream ends / the prompt is interrupted).
    """
    if not documents:
        # Fitting the vectorizer on an empty corpus fails with an unhelpful error
        print("There are no documents to search. Load the documents first.")
        return

    cv = CountVectorizer(lowercase=True, binary=True)

    # fit_transform() returns one row per document and one column per term
    sparse_matrix = cv.fit_transform(documents)
    debug_print("Document-term matrix (sparse):\n")
    debug_print(sparse_matrix)

    # todense() materialises the full matrix in memory
    dense_matrix = sparse_matrix.todense()
    debug_print("Document-term matrix (dense):\n")
    debug_print(dense_matrix)

    td_matrix = dense_matrix.T  # .T transposes the matrix: one row per term
    debug_print("Term-document matrix:\n")
    debug_print(td_matrix)

    t2i = cv.vocabulary_  # term -> row index in td_matrix
    debug_print(t2i)

    while True:
        try:
            query = input("Search for something. If you want to stop your search "
                          "type 'q'. Search: ")
        except (EOFError, KeyboardInterrupt):
            # Input stream closed or the prompt was interrupted: stop
            # searching instead of ending with a traceback
            print()
            break

        # Remove any leading & trailing whitespace from the query and turn it to
        # lowercase for case insensitive searching
        query = query.lower().strip()

        # Check for empty queries and continue asking for input if so
        if query == "":
            continue

        if query == "q":
            break

        print(query)
        # because td_matrix and t2i are defined in main(), also pass these
        # to other functions
        test_query(query, td_matrix, t2i, documents)

main()

cat and witch
Found 2 results
Matching doc #0: Black cat

" Another possible theory as to how the plague spread so quickly is that by killing many of the cats (believed to be witches' familiars) during the witch hunts caused the rodent population to rise, and with them rose the probability of infection ."  Taken from an anon, 68.174.249.133, contribution from the Black Death , later reverted.

I've heard this theory, but don't have the materials on hand to substantiate it.  Actually it was probably a social result of the ongoing waves of pla
Matching doc #1: Cultural depictions of cats

A number of suggestions

-- Davelane 19:21, 26 December 2005 (UTC) [ reply ]

"The human killing of cats in the Middle Ages has also been cited as one of the reasons for the spread of bubonic plague - the Black Death, which was spread by the increased rodent population caused by the death of so many cats."

This passage is internally inconsistent with Wikipedia's own account of the Black Death.  It stat