In [2]:
from bs4 import BeautifulSoup
import os
from sklearn.feature_extraction.text import CountVectorizer
from pathlib import Path
import numpy as np

After downloading the data from https://moodle.helsinki.fi/pluginfile.php/6645850/mod_page/content/19/enwiki-20181001-corpus.100-articles.txt, I split the individual articles using BeautifulSoup by selecting the <article> tags. Then I converted the bs4 elements to strings and added them to the documents list I had created earlier.

In [3]:
# use this code only if you want to use the enwiki-20181001-corpus.100-articles.txt file

with open ("enwiki-20181001-corpus.100-articles.txt", "r", encoding="utf-8") as f:
    document = f.read()

soup = BeautifulSoup(document, 'html.parser')
articles = soup.find_all('article')
print(f'The number of articles in the document is: {len(articles)}')

documents = []
for article in articles:
    documents.append(str(article))

print(f'The number of documents in the list is: {len(documents)}', "Type of documents:", type(documents[0]))
print("First 500 characters of the first document:\n", documents[0][:500])

The number of articles in the document is: 100
The number of documents in the list is: 100 Type of documents: <class 'str'>
First 500 characters of the first document:
 <article name="Anarchism">
Anarchism is a political philosophy that advocates self-governed societies based on voluntary institutions. These are often described as stateless societies, although several authors have defined them more specifically as institutions based on non-hierarchical or free associations. Anarchism holds the state to be undesirable, unnecessary and harmful. According to Peter Kropotkin, Godwin was "the first to formulate the political and economical conceptions of anarchism, 


Now it's time to adapt the code for html we scraped from wikipedia:

In [4]:
# Processing multiple HTML files in a folder
folder_path = r"wikipedia_talk_pages"
documents = []
for filename in os.listdir(folder_path):
    if filename.endswith(".html"):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f.read(), "html.parser")
            result = []

            # page title 
            title = soup.find("span", class_="mw-page-title-main")
            if title:
                result.append(title.get_text(strip=True)) # strip removes leading/trailing whitespace

            # main content
            root = soup.find("div", id="mw-content-text") # we create root to be sure we are in the content section
            content = root.find("div", class_="mw-parser-output") if root else None # it verifies root is not None
            # in content, we look for h2, h3, and p tags that are direct children (not nested deeper)

            if content:
                for el in content.find_all(["h2", "h3", "p"], recursive=False): # recursive=False ensures we only get direct children
                    text = el.get_text(" ", strip=True)
                    if text:
                        result.append(text)

            full_text = "\n\n".join(result) # every wiki page is stored as a single string with double newlines between sections
            if full_text:   # only add non-empty documents    
                documents.append(full_text)

print(f'The number of documents in the list is: {len(documents)}', "Type of documents:", type(documents[0]))
print("First 500 characters of the first document:\n", documents[0][:500])      

The number of documents in the list is: 1225 Type of documents: <class 'str'>
First 500 characters of the first document:
 Glutamic acid

If high levels of glutamic acid are unhealthy, what would be considered a high level?

"Glutamic acid" and "glutamate" are often used interchangeably but, technically, glutamate is the anionic form of glutamic acid. This article doesn't recognize that. RJII 17:01, 9 September 2005 (UTC) [ reply ]

Response to above: whether it exists in the anionic or neutral form is going to be entirely dependent upon the pH - review the Henderson-Hasselbach equation. Cajolingwilhelm 01:30, 14 Fe


In [5]:
# Operators and/AND, or/OR, not/NOT become &, |, 1 -
# Parentheses are left untouched
# Everything else is interpreted as a term and fed
# through td_matrix[t2i["..."]]
d = {"and": "&", "AND": "&",
         "or": "|", "OR": "|",
         "not": "1 -", "NOT": "1 -",
         "(": "(", ")": ")"}  # operator replacements

In [9]:
def debug_print(*args):
    # If you want debug prints to show up, uncomment the line below: 
    print(*args)
    pass

In [7]:
# rewrite tokens
def rewrite_token(t):
    # If the search term exists in our dictionary of operators, get it, 
    # otherwise find occurrences of the term in `td_matrix``. If the term 
    # is not in our dictionary, then the query results in 0 (since the 
    # term does not occur in any of the documents)
    return d.get(t, f'(td_matrix[t2i["{t}"]] if "{t}" in t2i else empty_row)')

def rewrite_query(query):
    # Rewrite every token in the query
    return " ".join(rewrite_token(t) for t in query.split())

def test_query(query, td_matrix, t2i, documents):
    # Generate a row of all zeroes for queries containing words not in our 
    # dictionary
    empty_row = np.matrix(np.repeat(0, td_matrix.shape[1]))

    rewritten = rewrite_query(query)
    debug_print("Query: '" + query + "'")
    debug_print("Rewritten:", rewritten)

    # Eval runs the string as a Python command
    # `td_matrix`, `t2i`, and `empty_row` have to be in scope in 
    # order for eval() to work
    eval_result = eval(rewritten)
    debug_print("Matching:", eval_result)

    # Finding the matching document
    hits_matrix = eval_result
    hits_list = list(hits_matrix.nonzero()[1])

    print(f"Found {len(hits_list)} results")

    # Prints the first 500 characters of the matching document
    for i, doc_idx in enumerate(hits_list):
        print(f"Matching doc #{i}: {documents[doc_idx][:500]}")

In [25]:
def main():
    cv = CountVectorizer(lowercase=True, binary=True)

    sparse_matrix = cv.fit_transform(documents)
    debug_print("Term-document matrix: (?)\n")
    debug_print(sparse_matrix)

    dense_matrix = sparse_matrix.todense()
    debug_print("Term-document matrix: (?)\n")
    debug_print(dense_matrix)

    td_matrix = dense_matrix.T  # .T transposes the matrix
    debug_print("Term-document matrix:\n")
    debug_print(td_matrix)

    t2i = cv.vocabulary_
    debug_print(t2i)

    while True:
        query = input("Search for something. If you want to stop your search "
                      "type 'q'. Search: ")
        
        # Remove any leading & trailing whitespace from the query and turn it to 
        # lowercase for case insensitive searching
        query = query.lower().strip()

        # Check for empty queries and continue asking for input if so
        if query == "":
            continue

        if query == "q":
            break
        else:
            print(query)
            # because td_matrix and t2i are defined in main(), also pass these
            # to other functions
            test_query(query, td_matrix, t2i, documents)

def main_using_class():
    import sys
    # This is absolutely a hack and every style guide you will ever find 
    # will tell you to never do this!
    # Unfortunately the "correct" way to do this would be to create a new 
    # GitHub repo, add the package to requirements.txt, manage both repos 
    # simultaniously, etc....
    # ...or just do the hacky way lol it's not like we're getting paid
    sys.path.append("..")
    import Search_Algorithms.boolean
    import typing
    import importlib
    importlib.reload(Search_Algorithms.boolean)

    engine = Search_Algorithms.boolean.BooleanSearchEngine(typing.cast(list[str], documents))

    while True:
        query = input("Search for something. If you want to stop your search "
                      "type 'q'. Search: ")
        if query == "q":
            break

        r = engine.search(query)
        print(f"Results: {len(r)}")
        for i, e in enumerate(r):
            print(f"Result #{i}: {e[:100]}")

# main()
main_using_class()

Results: 47
Result #0: Dog

Should we return to using Canis lupus familiaris first and foremost?

Neglecting to use Canis l
Result #1: Diabetes in cats

I realize there are many brands of insulin that may be used with cats and dogs.  B
Result #2: Purebred

Um...okay...so we've added cats...do we need to give equal time to horses et. al.?

Should
Result #3: Bengal cat

I noticed that someone posted a head-on picture of the Bengal Domestic Cat, but I was ho
Result #4: Feral cat

This article was the subject of a Wiki Education Foundation-supported course assignment, 
Result #5: List of experimental cat breeds

I'm moving this (on the assertion that the move will not be controv
Result #6: Burmese cat

Is it just me, or does this sentence not make much sense? This breed was first recogniz
Result #7: Rodent

Under "Definition", I believe this is incorrect:
"This specialisation gives rodents their na
Result #8: Birman

Excellent photo, and superb specimen! But it'd be best to have a full-bod