# In [None]:
# Information Retrieval System with Boolean Retrieval and Evaluation

import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import defaultdict

# Download necessary NLTK resources.
# word_tokenize relies on the Punkt tokenizer data. Older NLTK releases load
# the 'punkt' package, while recent releases (3.9+) load 'punkt_tab' instead,
# so both are requested to keep the script working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# Load dataset (make sure the CSV file exists in the project directory)
file_path = 'Online_Retail_UTF8.csv'
df = pd.read_csv(file_path, encoding='utf-8')

# Column whose text is tokenized, stemmed and indexed further down.
text_column = 'Description'
# NOTE(review): id_column is not referenced anywhere in the visible code;
# the index below numbers documents by row position instead — confirm intent.
id_column = 'InvoiceNo'

# Initialize stemmer (shared by preprocessing and query parsing)
stemmer = PorterStemmer()

# Tokenization and stemming function
def tokenize_and_stem(text):
    """Return the unique stemmed tokens of *text*, in first-occurrence order.

    Args:
        text: The string to tokenize with ``word_tokenize``.

    Returns:
        A list of stems produced by the module-level ``stemmer``, with
        duplicates removed.
    """
    stemmed_tokens = (stemmer.stem(token) for token in word_tokenize(text))
    # dict.fromkeys de-duplicates while keeping insertion order, so the output
    # is reproducible from run to run (a plain set() has arbitrary order).
    return list(dict.fromkeys(stemmed_tokens))

# Apply preprocessing: every description is cast to str, then reduced to its
# list of unique stemmed tokens.
df['Tokenized_And_Stemmed_Text'] = df[text_column].apply(
    lambda description: tokenize_and_stem(str(description))
)

# Preview first 5 documents
preview_rows = df['Tokenized_And_Stemmed_Text'].head(5)
for i, tokens in enumerate(preview_rows):
    print(f"Document {i+1} tokens (deduplicated and normalized): {tokens}\n")

# Build inverted index from first 20 documents.
# Documents are numbered 1..20 by row position; each term maps to the list of
# document numbers whose token list contains it.
inverted_index = defaultdict(list)
indexed_rows = df['Tokenized_And_Stemmed_Text'].head(20)

for doc_number, doc_tokens in enumerate(indexed_rows, start=1):
    for stem in doc_tokens:
        inverted_index[stem].append(doc_number)

# Display inverted index
print("Inverted Index for the first 20 documents:")
for term, doc_ids in inverted_index.items():
    print(f"Term: '{term}' appears in documents: {doc_ids}")

# Boolean retrieval model
def boolean_retrieval(query, inverted_index):
    """Evaluate a flat Boolean query against an inverted index.

    The query is read strictly left to right as
    ``term (OPERATOR term)*``; there is no operator precedence and no
    parenthesis support. OPERATOR is matched case-insensitively against
    AND, OR and NOT, where NOT removes the following term's documents from
    the running result. Any other token in an operator position is ignored
    together with its operand, and a trailing operator without an operand
    is ignored.

    Args:
        query: The raw query string.
        inverted_index: Mapping from stemmed term to an iterable of
            document IDs.

    Returns:
        A sorted list of matching document IDs; empty when the query
        contains no tokens or nothing matches.
    """
    raw_tokens = word_tokenize(query)
    if not raw_tokens:
        # An empty/whitespace-only query has no first term to look up.
        return []

    def postings(raw_term):
        # Stem exactly once so query terms are normalized the same way as
        # indexed tokens; stemming an already-stemmed term may alter it.
        # .get() avoids inserting missing keys into a defaultdict index.
        return set(inverted_index.get(stemmer.stem(raw_term), ()))

    result_set = postings(raw_tokens[0])

    # Tokens after the first are consumed as (operator, operand) pairs; zip
    # stops before a dangling operator that has no operand.
    for operator_token, operand in zip(raw_tokens[1::2], raw_tokens[2::2]):
        operator = operator_token.upper()
        operand_set = postings(operand)

        if operator == "AND":
            result_set &= operand_set
        elif operator == "OR":
            result_set |= operand_set
        elif operator == "NOT":
            result_set -= operand_set

    return sorted(result_set)

# Precision and recall calculation
def calculate_precision_recall(query, relevant_docs, inverted_index):
    """Score the Boolean retrieval result for *query* against a relevance list.

    Args:
        query: Query string passed on to ``boolean_retrieval``.
        relevant_docs: Collection of document IDs considered relevant.
        inverted_index: Index passed on to ``boolean_retrieval``.

    Returns:
        A ``(precision, recall)`` tuple. Each value is 0 when its
        denominator would be zero.
    """
    retrieved = set(boolean_retrieval(query, inverted_index))
    relevant = set(relevant_docs)

    hits = len(retrieved & relevant)      # retrieved and relevant
    spurious = len(retrieved - relevant)  # retrieved but not relevant
    missed = len(relevant - retrieved)    # relevant but not retrieved

    precision = 0
    if hits + spurious > 0:
        precision = hits / (hits + spurious)

    recall = 0
    if hits + missed > 0:
        recall = hits / (hits + missed)

    return precision, recall

# Define relevant documents for evaluation (update this based on actual data).
# As written, document numbers 1 through 20 are all treated as relevant.
relevant_docs = [*range(1, 21)]

# Prompt query input and run evaluation
query = input("Please enter your search query (use AND, OR, NOT): ")
result_documents = boolean_retrieval(query, inverted_index)
precision, recall = calculate_precision_recall(query, relevant_docs, inverted_index)

# Display result documents and metrics
if not result_documents:
    print(f"No documents match the query '{query}'.")
else:
    print(f"Documents matching the query '{query}':")
    for doc_id in result_documents:
        # Document numbers are 1-based row positions, hence the -1 for iloc.
        doc_title = df[text_column].iloc[doc_id - 1]
        print(f"Document ID: {doc_id} , Title/Description: {doc_title}")

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
