In [4]:

import re
from collections import defaultdict

# Sample documents
documents = {
    1: "This is the first document. It contains some words.",
    2: "This is the second document. It also contains words.",
    3: "The third document is different from the first two.",
    4: "Inverted index is essential for document retrieval.",
}

# Preprocess and tokenize text
def preprocess(text):
    return re.findall(r'\w+', text.lower())

# Create inverted index
inverted_index = defaultdict(list)
for doc_id, document in documents.items():
    for token in preprocess(document):
        inverted_index[token].append(doc_id)

# Retrieve documents
def retrieve_documents(query):
    result = [set(inverted_index[token]) for token in preprocess(query) if token in inverted_index]
    return set.intersection(*result) if result else set()

# Query and retrieve documents
query = input("Enter query: ")
print("Query:", query)
print("Matching Documents:", retrieve_documents(query))
print(f"Inverted index :{inverted_index}")

Enter query: words
Query: words
Matching Documents: {1, 2}
Inverted index :defaultdict(<class 'list'>, {'this': [1, 2], 'is': [1, 2, 3, 4], 'the': [1, 2, 3, 3], 'first': [1, 3], 'document': [1, 2, 3, 4], 'it': [1, 2], 'contains': [1, 2], 'some': [1], 'words': [1, 2], 'second': [2], 'also': [2], 'third': [3], 'different': [3], 'from': [3], 'two': [3], 'inverted': [4], 'index': [4], 'essential': [4], 'for': [4], 'retrieval': [4]})
