In [None]:
import os
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk
import re
import string

# Download NLTK's 'punkt' tokenizer data (presumably what word_tokenize relies on — confirm).
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


**Defining directory**

In [None]:
# Folder that holds the documents to be indexed.
directory = "ResearchPapers"

# Module-level placeholders; both start out as empty dictionaries.
tokens = dict()
No_stop_word_tokens = dict()


**Functions to process tokens**

In [None]:
def remove_urls(tokens):
    """Return the tokens that do not begin with a URL.

    A token is dropped when it starts with ``http://`` or ``https://``
    followed by at least one non-whitespace character, or with ``www.``
    followed by at least one non-whitespace character. The check is anchored
    at the start of the token, so a URL embedded later in a token is kept.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list with the URL-like tokens filtered out, order preserved.
    """
    starts_with_url = re.compile(r'https?://\S+|www\.\S+').match
    kept = []
    for candidate in tokens:
        if starts_with_url(candidate) is None:
            kept.append(candidate)
    return kept


In [None]:
def remove_special_characters(tokens):
    """Strip special characters from every token and drop tokens left empty.

    A "special character" is anything that is neither a word character
    (letters, digits, underscore) nor whitespace.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list of the cleaned, non-empty tokens in their original order.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    cleaned_tokens = []
    for raw_token in tokens:
        stripped = not_word_or_space.sub('', raw_token)
        # A token made up only of special characters collapses to '' and is skipped.
        if stripped:
            cleaned_tokens.append(stripped)
    return cleaned_tokens

In [None]:
def remove_numbers(tokens):
    """Return the tokens that do not start with a standalone run of digits.

    A token is dropped when it begins with one or more digits that end on a
    word boundary: ``"2024"`` and ``"3.5"`` are removed, while ``"12abc"`` and
    ``"a1"`` are kept because their digits are attached to other word
    characters or do not lead the token.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list without the numeric tokens, order preserved.
    """
    leading_number = re.compile(r'\b\d+\b')
    return list(filter(lambda tok: leading_number.match(tok) is None, tokens))


**Reading the files, tokenizing them, and then processing the tokens**

In [None]:
# Read every file in `directory`, tokenize it, and store the cleaned, stemmed
# tokens of each document in `document_data` (keyed by file name).
document_data = {}

# Loop-invariant helpers: created once instead of once per file.
ps = PorterStemmer()
custom_stop_words = set([
    'a', 'is', 'the', 'of', 'all', 'and', 'to', 'can', 'be', 'as',
    'once', 'for', 'at', 'am', 'are', 'has', 'have', 'had', 'up',
    'his', 'her', 'in', 'on', 'no', 'we', 'do'
])
punctuation = set(string.punctuation)

# os.listdir returns entries in arbitrary order; sorting keeps the document
# order (and therefore every index built from it) stable between runs.
for filename in sorted(os.listdir(directory)):
    file_path = os.path.join(directory, filename)

    # Sub-directories (e.g. checkpoint folders) cannot be opened as text files.
    if not os.path.isfile(file_path):
        continue

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        file_content = file.read()

    tokens = word_tokenize(file_content)

    # Removing punctuation and stop words.
    # The stop-word test is applied to the raw word as well as to its stem:
    # stemming rewrites some stop words (e.g. 'once' becomes 'onc'), so
    # checking only the stem let those words through.
    No_stop_word_tokens = []
    for word in tokens:
        if word in punctuation or word.lower() in custom_stop_words:
            continue
        stem = ps.stem(word)
        if stem in punctuation or stem.lower() in custom_stop_words:
            continue
        No_stop_word_tokens.append(stem)

    No_url_tokens = remove_urls(No_stop_word_tokens)

    No_special_char_tokens = remove_special_characters(No_url_tokens)

    No_number_tokens = remove_numbers(No_special_char_tokens)

    # Store document data
    document_data[filename] = {
        'tokens': No_number_tokens,  # Processed tokens, in document order
        'term_count': len(set(No_number_tokens))  # Number of distinct terms
    }

#printing tokens:
# for filename, data in document_data.items():
#     print(f"Document: {filename}")
#     print("Processed Tokens:", data['tokens'])
#     print()

# Sum of the per-document distinct-term counts (a term that occurs in several
# documents is counted once for each of them).
grand_total = sum(data['term_count'] for data in document_data.values())
print("Grand Total of Terms:", grand_total)



Grand Total of Terms: 39466



**Inverted index**


In [None]:
class Node:
    """One cell of a singly linked list: a payload plus a link to the next cell."""

    def __init__(self, data=None):
        self.data = data
        self.next = None


class LinkedList:
    """Singly linked list used to hold a posting list.

    The list can be walked manually through ``head`` and each node's ``next``
    link, or simply iterated over to obtain the stored values in order.
    """

    def __init__(self):
        self.head = None
        # Last node of the list, tracked so append() never has to walk it.
        self.tail = None

    def append(self, data):
        """Add ``data`` at the end of the list in constant time."""
        new_node = Node(data)
        if self.head is None:
            self.head = new_node
        else:
            self.tail.next = new_node
        self.tail = new_node

    def __iter__(self):
        """Yield the stored values from head to tail."""
        current = self.head
        while current is not None:
            yield current.data
            current = current.next

    def __str__(self):
        """Return the values joined by ' -> ' (empty string for an empty list)."""
        return ' -> '.join(str(item) for item in self)


# Build the inverted index: each term maps to a posting list of the files that
# contain it, and document_frequency records how many files that is.
inverted_index = {}
document_frequency = {}

for filename, data in document_data.items():
    tokens = data['tokens']
    # Each file must appear at most once per posting list, so work on the
    # distinct terms of the document.
    unique_tokens_in_doc = set(tokens)
    for token in unique_tokens_in_doc:
        postings = inverted_index.get(token)
        if postings is None:
            # First time this term is seen in any document.
            postings = inverted_index[token] = LinkedList()
        document_frequency[token] = document_frequency.get(token, 0) + 1
        postings.append(filename)

# Printing the inverted index:
# for term, posting_list in inverted_index.items():
#     print(f"Term: {term}, Posting List: {posting_list}, Document Frequency: {document_frequency[term]}")


# Query terms looked up by the boolean operations that follow.
search_terms = ['learn' , 'model' , 'cancer']



**Intersection**

In [None]:
# Intersection (AND): keep only the documents that contain every search term.
documents_containing_terms = set(document_data.keys())  # Initialize with all documents

for term in search_terms:
    if term not in inverted_index:
        # A term absent from the index occurs in no document, so no document
        # can contain all the terms; skipping it would wrongly ignore the term.
        documents_containing_terms.clear()
        break

    posting_list = inverted_index[term]
    current = posting_list.head
    documents_containing_this_term = set()

    # Collect the documents on this term's posting list
    while current:
        documents_containing_this_term.add(current.data)
        current = current.next

    # Intersect with the current set of documents
    documents_containing_terms.intersection_update(documents_containing_this_term)


print(f"Documents containing {search_terms} are {documents_containing_terms} ")


**Union**

In [None]:
# Union (OR): collect every document that contains at least one search term.
documents_containing_terms = set()

for term in search_terms:
    # Terms that are not indexed contribute nothing to the union.
    if term not in inverted_index:
        continue

    posting_list = inverted_index[term]
    current = posting_list.head

    # Walk the posting list node by node and record each document.
    while current is not None:
        documents_containing_terms.add(current.data)
        current = current.next

for doc_id in documents_containing_terms:
    print(f"Document ID: {doc_id}")


**NOT OPERATION**

In [None]:
# NOT: list the documents that contain none of the search terms.

# Every known document ID.
all_documents = set(document_data.keys())

# Documents that contain at least one of the search terms.
documents_containing_terms = set()

for term in search_terms:
    # Terms that are not indexed match no document.
    if term not in inverted_index:
        continue

    posting_list = inverted_index[term]
    current = posting_list.head

    # Walk the posting list node by node and record each document.
    while current is not None:
        documents_containing_terms.add(current.data)
        current = current.next

# Whatever is left after removing the matching documents is the answer.
documents_not_containing_terms = all_documents.difference(documents_containing_terms)

for doc_id in documents_not_containing_terms:
    print(f"Document ID: {doc_id}")


**Positional Index**

In [None]:
from collections import defaultdict
import os
import csv

def build_positional_index(docs_dir):
    """Build a positional index for the ``.txt`` files found in ``docs_dir``.

    The tokens are not re-read from disk: they are taken from the module-level
    ``document_data`` mapping (``document_data[filename]['tokens']``), so every
    ``.txt`` file in ``docs_dir`` must already have an entry there.

    Args:
        docs_dir: Directory whose ``.txt`` file names select the documents.

    Returns:
        A nested ``defaultdict`` mapping ``term -> doc_id -> [positions]``,
        where ``doc_id`` is the file name without its ``.txt`` suffix and the
        positions are 0-based offsets into the document's processed tokens.

    Raises:
        KeyError: If a ``.txt`` file has no entry in ``document_data``.
    """
    positional_index = defaultdict(lambda: defaultdict(list))
    # os.listdir returns entries in arbitrary order; sorting makes the order of
    # documents inside each posting deterministic.
    for filename in sorted(os.listdir(docs_dir)):
        if not filename.endswith('.txt'):
            continue
        doc_id = filename[:-4]  # strip the '.txt' suffix
        tokens = document_data[filename]['tokens']
        for position, term in enumerate(tokens):
            positional_index[term][doc_id].append(position)
    return positional_index

def save_positional_index(positional_index, output_file):
    """Write the positional index to ``output_file`` as CSV.

    The file starts with the header row ``Term, DocID, Positions`` followed by
    one row per (term, document) pair; the positions are rendered as a
    bracketed, comma-separated list such as ``[0, 2]``.

    Args:
        positional_index: Mapping ``term -> doc_id -> list of positions``.
        output_file: Path of the CSV file to create (overwritten if present).
    """
    # Explicit UTF-8: the terms come from UTF-8 text and may not be encodable
    # in the platform's default encoding.
    with open(output_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Term', 'DocID', 'Positions'])
        for term, postings in positional_index.items():
            for doc_id, positions in postings.items():
                positions_str = '[' + ', '.join(map(str, positions)) + ']'
                writer.writerow([term, doc_id, positions_str])

# Build the positional index for the corpus and persist it as CSV.
positional_index = build_positional_index(directory)
output_file = "positional_index.csv"
save_positional_index(positional_index, output_file)

# Dump the whole index: one heading per term, then one line per document
# listing the positions at which the term occurs.
for term, postings in positional_index.items():
    print(f"Term: {term}")
    for doc_id, positions in postings.items():
        positions_str = ', '.join(str(position) for position in positions)
        print(f"  Document ID: {doc_id}, Positions: {positions_str}")
    print()


**Processing proximity queries**

In [None]:

def execute_proximity_query(positional_index, term1, term2, distance):
    """Find the documents in which two terms occur close to each other.

    Args:
        positional_index: Mapping ``term -> doc_id -> list of positions``.
        term1: First term to look up.
        term2: Second term to look up.
        distance: Maximum allowed absolute difference between a position of
            ``term1`` and a position of ``term2``.

    Returns:
        A list of the document IDs (each listed once, in the order they appear
        in ``term1``'s postings) that contain at least one occurrence of
        ``term1`` within ``distance`` positions of an occurrence of ``term2``.
        The list is empty, and a message is printed, when either term is not
        in the index.
    """
    matching_documents = []

    if term1 not in positional_index or term2 not in positional_index:
        print("One or more terms not found in the positional index")
        return matching_documents

    postings1 = positional_index[term1]
    postings2 = positional_index[term2]

    # Only documents that contain both terms can match.
    for document, positions1 in postings1.items():
        if document not in postings2:
            continue
        positions2 = postings2[document]

        # any() stops at the first close pair, so a document is reported once
        # no matter how many position pairs satisfy the distance.
        if any(
            abs(pos1 - pos2) <= distance
            for pos1 in positions1
            for pos2 in positions2
        ):
            matching_documents.append(document)

    return matching_documents

# Example: documents in which the two terms occur within `distance` token
# positions of each other.
term1, term2, distance = "past", "research", 3
result = execute_proximity_query(
    positional_index,
    term1,
    term2,
    distance,
)
print("Documents containing both", term1, "and", term2, "with distance", distance, "apart:", result)


Documents containing both past and research with distance 3 apart: ['12', '12']
