In [None]:
import re

def clean_text(text):
    """Normalize raw text into lowercase, single-space-separated words.

    Every character that is not an ASCII letter, whitespace, a hyphen or an
    underscore is deleted (so digits and punctuation disappear), runs of
    whitespace are collapsed to one space, and the result is lowercased and
    trimmed at both ends.
    """
    kept_characters = re.sub(r"[^a-zA-Z\s\-_]", "", text)
    single_spaced = re.sub(r"\s+", " ", kept_characters)
    return single_spaced.strip().lower()

In [None]:
"""
This function builds an inverted index: a dictionary that maps each term
to the IDs of the documents in which that term appears.
"""
def build_inverted_index(documents):
    """Build an inverted index mapping each term to the documents containing it.

    Args:
        documents: Iterable of document strings. A document's ID is its
            position in the iteration order, starting at 0.

    Returns:
        A dict whose keys are the terms produced by ``clean_text`` and whose
        values are lists of document IDs in ascending order. Each document ID
        appears at most once per term, even when the term occurs several
        times in that document.
    """
    inverted_index = {}
    for doc_id, document in enumerate(documents):
        terms = clean_text(document).split()
        # dict.fromkeys drops repeated terms while keeping first-seen order,
        # so a word used twice in one document does not add its ID twice.
        for term in dict.fromkeys(terms):
            inverted_index.setdefault(term, []).append(doc_id)
    return inverted_index

In [None]:
"""
This function takes a Boolean query (terms combined with AND, OR and NOT) and uses the
inverted index to find the documents that match the query.
"""

def process_boolean_query(query, inverted_index, num_documents=None):
    """Evaluate a Boolean query against an inverted index.

    The query is a whitespace-separated sequence of terms and the uppercase
    operators ``AND``, ``OR`` and ``NOT``. It is evaluated strictly left to
    right: there is no operator precedence and no parenthesization. Adjacent
    terms with no operator between them are combined with AND, and ``NOT``
    negates the single term that follows it. Terms are lowercased before the
    lookup so they can match the lowercase keys of the index.

    Args:
        query: The query string, e.g. ``"cats AND NOT dogs"``.
        inverted_index: Mapping of term -> iterable of document IDs.
        num_documents: Total number of documents in the collection, used as
            the universe for ``NOT``. When omitted it is derived from the
            highest document ID present in the index.

    Returns:
        A sorted list of matching document IDs. A query containing no terms
        (empty, or operators only) matches every document.
    """
    if num_documents is None:
        highest_id = max(
            (doc_id for postings in inverted_index.values() for doc_id in postings),
            default=-1,
        )
        num_documents = highest_id + 1
    all_documents = set(range(num_documents))

    results = None  # stays None until the first term has been evaluated
    operator = "AND"  # how the next term is merged into the running result
    negate = False  # whether the next term is preceded by NOT

    for token in query.split():
        if token in ("AND", "OR"):
            operator = token
            negate = False
        elif token == "NOT":
            negate = not negate
        else:
            matches = set(inverted_index.get(token.lower(), ()))
            if negate:
                matches = all_documents - matches
            if results is None:
                # The first term seeds the result; a leading operator has
                # nothing on its left to combine with.
                results = matches
            elif operator == "OR":
                results |= matches
            else:
                results &= matches
            # Each term consumes the pending operator; the default between
            # bare adjacent terms is AND.
            operator = "AND"
            negate = False

    if results is None:
        results = all_documents
    return sorted(results)

In [None]:
"""
This function iterates through each query, finds the matching documents, and then prints them
with the search terms highlighted.
"""

def print_results(queries, documents, inverted_index):
    """Run each query and print its matching documents with the terms bolded.

    For every query the matching document IDs are obtained from
    ``process_boolean_query``. If there are none, a "No documents found" line
    is printed; otherwise a header line is printed followed by each matching
    document, with every whole-word occurrence of a query term wrapped in
    ``**`` markers.

    Args:
        queries: Iterable of Boolean query strings.
        documents: Sequence of the original document strings, indexed by
            document ID.
        inverted_index: Mapping of term -> document IDs for ``documents``.

    Returns:
        None. All output goes to standard output.
    """
    operators = {"AND", "OR", "NOT"}

    for query in queries:
        matching_documents = process_boolean_query(query, inverted_index)

        if not matching_documents:
            print(f"Query: {query}, Result: No documents found")
            continue

        # AND/OR/NOT are query syntax, not search terms, so they are never
        # highlighted. The pattern is built once per query, not per document.
        search_terms = {term for term in query.split() if term not in operators}
        pattern = None
        if search_terms:
            # Longest alternatives first so a longer term wins over a shorter
            # term that is its prefix.
            alternatives = "|".join(
                re.escape(term)
                for term in sorted(search_terms, key=len, reverse=True)
            )
            # The lookarounds restrict matches to whole words, so "in" is not
            # bolded inside "maintaining". A single substitution pass also
            # keeps markers from being nested inside earlier markers.
            pattern = re.compile(
                rf"(?<!\w)(?:{alternatives})(?!\w)", re.IGNORECASE
            )

        print(f"Query: {query}, Result:")
        for doc_id in matching_documents:
            document = documents[doc_id]
            if pattern is not None:
                # Re-emit the matched text itself so the document's original
                # capitalization is preserved.
                document = pattern.sub(
                    lambda match: f"**{match.group(0)}**", document
                )
            print(document)

In [None]:
# Sample corpus: each string is one document, and its position in the list
# is the document ID assigned by build_inverted_index.
documents = [
    "The importance of education in society and its impact on economic growth",
    "The benefits of regular exercise for maintaining physical and mental health",
    "The role of technology in modern workplaces: Increasing productivity and efficiency",
    "The influence of social media on interpersonal relationships and communication",
    "The significance of environmental conservation efforts for sustainable development",
    "The challenges and opportunities of globalization in the 21st century"
]

# Boolean queries to run against the corpus. The uppercase words AND, OR and
# NOT are operators; every other word is a search term.
queries = [
    "technology AND workplaces",
    "environment AND NOT conservation",  # "environment" itself is not a word in any document
    "social media OR interpersonal relationships",
    "health AND public AND NOT governments"  # "public" is not a word in any document
]


# Build the index once, then evaluate every query and print its matches.
inverted_index = build_inverted_index(documents)
print_results(queries, documents, inverted_index)

Query: technology AND workplaces, Result:
The role of **technology** in modern **workplaces**: Increasing productivity and efficiency
Query: environment AND NOT conservation, Result: No documents found
Query: social media OR interpersonal relationships, Result:
The influence of **social** **media** on **interpersonal** **relationships** and communication
Query: health AND public AND NOT governments, Result: No documents found
