2. Implement a program for retrieval of documents using inverted files

In [2]:
import re
import collections

# Sample documents
documents = {
    1: "This is the first document. It contains some words.",
    2: "This is the second document. It also contains words.",
    3: "The third document is different from the first two.",
    4: "Inverted index is essential for document retrieval.",
}

# Function to preprocess and tokenize text
def preprocess(text):
    text = text.lower()
    tokens = re.findall(r'\w+', text)
    return tokens

# Create an inverted index
def build_inverted_index(documents):
    inverted_index = collections.defaultdict(list)
    for doc_id, document in documents.items():
        tokens = preprocess(document)
        for token in tokens:
            inverted_index[token].append(doc_id)
    return inverted_index

# Function to perform document retrieval
def retrieve_documents(query, inverted_index):
    query_tokens = preprocess(query)
    result = set()

    # Retrieve documents containing each query token
    for token in query_tokens:
        if token in inverted_index:
            if not result:
                result = set(inverted_index[token])
            else:
                result = result.intersection(inverted_index[token])

    return result

# Build the inverted index
inverted_index = build_inverted_index(documents)

# Example queries
query1 = input("Enter query: ")


# Retrieve documents for the queries
result1 = retrieve_documents(query1, inverted_index)


# Display the results
print("Query:", query1)
print("Matching Documents:", result1)

Enter query:  ["document", "retrieval"]


Query: ["document", "retrieval"]
Matching Documents: {4}


In [3]:
import re
import collections

# Sample documents
documents = {
    1: "This is the first document. It contains some words.",
    2: "This is the second document. It also contains words.",
    3: "The third document is different from the first two.",
    4: "Inverted index is essential for document retrieval.",
}

# Function to preprocess and tokenize text
def preprocess(text):
    text = text.lower()
    tokens = re.findall(r'\w+', text)
    return tokens

# Create an inverted index
def build_inverted_index(documents):
    inverted_index = collections.defaultdict(list)
    for doc_id, document in documents.items():
        tokens = preprocess(document)
        for token in tokens:
            inverted_index[token].append(doc_id)
    return inverted_index

# Function to perform document retrieval with detailed output
def retrieve_documents(query, inverted_index):
    query_tokens = preprocess(query)
    result = set()
    
    print("\nTokenize Query: query_tokens =", query_tokens)  # Show query tokens

    # Retrieve documents containing each query token
    for token in query_tokens:
        if token in inverted_index:
            matching_docs = inverted_index[token]
            print(f"For '{token}': The inverted index shows {matching_docs}")  # Show matching documents for each token
            if not result:
                result = set(matching_docs)
            else:
                result = result.intersection(matching_docs)  # Intersection of results
        else:
            print(f"For '{token}': No matching documents found.")
            return set()  # If any token is not found, return an empty result

    return result

# Build the inverted index
inverted_index = build_inverted_index(documents)

# Example query
query1 = input("Enter query: ")

# Retrieve documents for the query
result1 = retrieve_documents(query1, inverted_index)

# Display the final result
print("\nIntersection of matching documents gives:", result1)
print("Matching Documents:", result1)


Enter query:  document retrieval



Tokenize Query: query_tokens = ['document', 'retrieval']
For 'document': The inverted index shows [1, 2, 3, 4]
For 'retrieval': The inverted index shows [4]

Intersection of matching documents gives: {4}
Matching Documents: {4}


In [2]:
import re
import collections

# Sample documents
documents = {
    1: "This is the first document. It contains some words.",
    2: "This is the second document. It also contains words.",
    3: "The third document is different from the first two.",
    4: "Inverted index is essential for document retrieval.",
}

# Function to preprocess and tokenize text
def preprocess(text):
    text = text.lower()
    tokens = re.findall(r'\w+', text)
    return tokens

# Create an inverted index
def build_inverted_index(documents):
    inverted_index = collections.defaultdict(list)
    for doc_id, document in documents.items():
        tokens = preprocess(document)
        for token in tokens:
            inverted_index[token].append(doc_id)
    return inverted_index

# Function to perform document retrieval with detailed output
def retrieve_documents(query, inverted_index):
    query_tokens = preprocess(query)
    result = set()
    
    print("\nTokenize Query: query_tokens =", query_tokens)  # Show query tokens

    # Retrieve documents containing each query token
    for token in query_tokens:
        if token in inverted_index:
            matching_docs = inverted_index[token]
            print(f"For '{token}': The inverted index shows {matching_docs}")  # Show matching documents for each token
            if not result:
                result = set(matching_docs)
            else:
                result = result.intersection(matching_docs)  # Intersection of results
        else:
            print(f"For '{token}': No matching documents found.")
            return set()  # If any token is not found, return an empty result

    return result

# Build the inverted index
inverted_index = build_inverted_index(documents)

# Print the inverted index
print("Inverted Index Form:")
for token, doc_ids in inverted_index.items():
    print(f"{token}: {doc_ids}")

# Example query
query1 = input("\nEnter query: ")

# Retrieve documents for the query
result1 = retrieve_documents(query1, inverted_index)

# Display the final result
print("\nIntersection of matching documents gives:", result1)
print("Matching Documents:", result1)


Inverted Index Form:
this: [1, 2]
is: [1, 2, 3, 4]
the: [1, 2, 3, 3]
first: [1, 3]
document: [1, 2, 3, 4]
it: [1, 2]
contains: [1, 2]
some: [1]
words: [1, 2]
second: [2]
also: [2]
third: [3]
different: [3]
from: [3]
two: [3]
inverted: [4]
index: [4]
essential: [4]
for: [4]
retrieval: [4]



Enter query:  third



Tokenize Query: query_tokens = ['third']
For 'third': The inverted index shows [3]

Intersection of matching documents gives: {3}
Matching Documents: {3}


In [5]:
from collections import defaultdict
# Step 1: Build the inverted index
def build_inverted_index(documents):
    inverted_index = defaultdict(list)
    
    for doc_id, doc in enumerate(documents):
        for term in set(doc.lower().split()):  # Tokenize and avoid duplicates
            inverted_index[term].append(doc_id)
    
    return inverted_index
# Step 2: Search for documents containing terms
def search(query, inverted_index):
    return [inverted_index.get(term, []) for term in query.lower().split()]
# Sample documents
documents = [
    "The quick brown fox jumps over the lazy dog",
    "Never jump over the lazy dog quickly",
    "A brown fox is quick and jumps high",
]
# Build and query the inverted index
inverted_index = build_inverted_index(documents)
query = "brown dog"
result = search(query, inverted_index)
# Display results
for idx, term in enumerate(query.split()):
    print(f"Documents containing {term}:", result[idx])

Documents containing brown: [0, 2]
Documents containing dog: [0, 1]
