In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [81]:
def preprocess_document(document):
    # Tokenize the document
    tokens = word_tokenize(document)
    # Remove punctuation and convert to lowercase
    tokens = [token.lower() for token in tokens if token.isalpha()]
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    return tokens

In [82]:
document = "The quick brown fox jumps over the lazy dog"
print(preprocess_document(document))

['quick', 'brown', 'fox', 'jump', 'lazi', 'dog']


In [83]:
def build_inverted_index (documents):
    inverted_index = {}
    for doc_id, document in enumerate(documents):
        terms = preprocess_document(document)
        for term in terms:
            if term not in inverted_index:
                inverted_index[term] = set()
            inverted_index[term].add(doc_id)
    return inverted_index

In [99]:
documents = [
    "The sky is blue",
    "The sun is bright",
    "The sky is cloudy",
    "The sun is shining"]

inverted_index = build_inverted_index(documents)
inverted_index 

{'sky': {0, 2},
 'blue': {0},
 'sun': {1, 3},
 'bright': {1},
 'cloudi': {2},
 'shine': {3}}

In [110]:
def boolean_query(query,inverted_index,documents):
    query_terms = preprocess_document(query)
    result = None
    results = []
    
    for term in query_terms:
        if term in inverted_index:
            term_docs = inverted_index[term]
            if result is None:
                results = term_docs
            else:
                results = results + result.intersection(term_docs)

    if results is None:
        print("No documents match the query.")
    else:
        print("Matching Documents:")
        for doc_id in results:
            print(f"- Document{doc_id}:{documents[doc_id]}")

In [113]:
query ="sky AND (blue OR cloudy)"

In [114]:
boolean_query(query ,inverted_index , documents )

Matching Documents:
- Document2:The sky is cloudy
