In [1]:
import nltk
import re
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from stemming.porter2 import stem 
from collections import defaultdict
import pickle

In [2]:
# Preprocessing
pattern = r'TEXT: (.+)'
with open('collections/sample.txt', 'r') as file:
    x = file.read()
    documents = re.findall(pattern, x)


# print(documents)

tokenization = RegexpTokenizer(r"\w+")
new = []

with open('stopWords.txt', 'r') as file:
    stopWords = file.read().split()

def preprocess(stp, stm, documents):
    for doc in documents:
        # Tokenization
        tokenized = tokenization.tokenize(doc)
        
        # Casefolding
        lower = [word.lower() for word in tokenized]

        if stp:
            # Stopwords removal
            lower = [word for word in lower if not word in stopWords]
        
        if stm:
            # Stemming using the stemming library
            lower = [stem(word) for word in lower]
        
        new.append(lower)

preprocess(False, False, documents)
# print(new)
for list in new:
    print(list)


['he', 'likes', 'to', 'wink', 'he', 'likes', 'to', 'drink']
['he', 'likes', 'to', 'drink', 'and', 'drink', 'and', 'drink']
['the', 'thing', 'he', 'likes', 'to', 'drink', 'is', 'ink']
['the', 'ink', 'he', 'likes', 'to', 'drink', 'is', 'pink']
['he', 'likes', 'to', 'wink', 'and', 'drink', 'pink', 'ink']


In [3]:
# Concatenate tokens
# final=[]
# for i in new:
#     final.append(' '.join(i))
# print(final)
final = [' '.join(i) for i in new] 
final

['he likes to wink he likes to drink',
 'he likes to drink and drink and drink',
 'the thing he likes to drink is ink',
 'the ink he likes to drink is pink',
 'he likes to wink and drink pink ink']

In [4]:
def positional_inverted_index(documents):
    index = {}
    for doc_id, doc in enumerate(documents):
        for pos, word in enumerate(doc.split()):
            if word not in index:
                index[word] = {
                    'document_frequency': 0,
                    'documents': {},
                }
            if doc_id not in index[word]['documents']:
                index[word]['documents'][doc_id] = []
                index[word]['document_frequency'] += 1
            index[word]['documents'][doc_id].append(pos)
    return index

In [5]:
pos_index = positional_inverted_index(final)
with open('sampleindex.txt', 'w') as file:
    for term, info in pos_index.items():
        file.write(term + ":\n")
        for doc_id, positions in info['documents'].items():
            file.write(f"\t{doc_id}:\t{', '.join(map(str, positions))}\n")

print("Term\tDocument ID\tPositions")
for term, info in pos_index.items():
    print(term + ":")
    for doc_id, positions in info['documents'].items():
        print(f"\t{doc_id+1}: {', '.join(map(str, positions))}")

Term	Document ID	Positions
he:
	1: 0, 4
	2: 0
	3: 2
	4: 2
	5: 0
likes:
	1: 1, 5
	2: 1
	3: 3
	4: 3
	5: 1
to:
	1: 2, 6
	2: 2
	3: 4
	4: 4
	5: 2
wink:
	1: 3
	5: 3
drink:
	1: 7
	2: 3, 5, 7
	3: 5
	4: 5
	5: 5
and:
	2: 4, 6
	5: 4
the:
	3: 0
	4: 0
thing:
	3: 1
is:
	3: 6
	4: 6
ink:
	3: 7
	4: 1
	5: 7
pink:
	4: 7
	5: 6


In [6]:
def inverted_index(documents):
    index = {}
    for doc_id, doc in enumerate(documents):
        for word in doc.split():
            if word not in index:
                index[word] = []
            if doc_id not in index[word]:
                index[word].append(doc_id)
    return index

inv_index = inverted_index(final)
inv_index

{'he': [0, 1, 2, 3, 4],
 'likes': [0, 1, 2, 3, 4],
 'to': [0, 1, 2, 3, 4],
 'wink': [0, 4],
 'drink': [0, 1, 2, 3, 4],
 'and': [1, 4],
 'the': [2, 3],
 'thing': [2],
 'is': [2, 3],
 'ink': [2, 3, 4],
 'pink': [3, 4]}

In [7]:
def and_postings(posting1, posting2):
    return sorted(set(posting1).intersection(posting2))


In [8]:
def or_postings(posting1, posting2):
    return sorted(set(posting1).union(posting2))

In [9]:
def NOT(posting, total_docs):
    return sorted(set(range(total_docs)) - set(posting))

In [10]:
# Boolean Search
def boolean_search(query, index):
    query = query.split()
    result = set(index[query[0]].keys())
    for term in query[1:]:
        result &= set(index[term].keys())
    return result



In [11]:
# def process_query(query):
#     query_terms = query.split()
#     result = None
#     total_docs = len(final)
#     for i, term in enumerate(query_terms):
#         if term == "AND":
#             if query_terms[i+1] == "NOT":
#                 result = and_postings(result, NOT(inv_index[query_terms[i+2]], total_docs))
#             else:
#                 result = and_postings(result, inv_index[query_terms[i+1]])
#         elif term == "OR":
#             if query_terms[i+1] == "NOT":
#                 result = or_postings(result, NOT(inv_index[query_terms[i+2]], total_docs))
#             else:
#                 result = or_postings(result, inv_index[query_terms[i+1]])
#         elif term == "NOT" and i == 0:
#             result = NOT(inv_index[query_terms[i+1]], total_docs)
#         elif i == 0:
#             try:
#                 result = inv_index[term]
#             except KeyError:
#                 result = set()
#         i += 2  # Move to the next query term
#     return result

# query = input('Enter your query - ')
# print(process_query(query))

In [12]:
def process_query(query, inv_index, total_docs):
    query_terms = query.split()
    result = None

    for i, term in enumerate(query_terms):
        if term == "AND" or term == "OR":
            if query_terms[i+1] == "NOT":
                operand = NOT(inv_index[query_terms[i+2]], total_docs)
            else:
                operand = inv_index[query_terms[i+1]]

            if term == "AND":
                result = and_postings(result, operand)
            elif term == "OR":
                result = or_postings(result, operand)

        elif term == "NOT" and i == 0:
            result = NOT(inv_index[query_terms[i+1]], total_docs)
        elif i == 0:
            try:
                result = inv_index[term]
            except KeyError:
                result = set()

    return result

# Example usage:
total_docs = len(final)
query = input('Enter your query - ')
result = process_query(query, inv_index, total_docs)
print(result)


set()


In [13]:
# Save Index
with open('index.pkl', 'wb') as f:
    pickle.dump(pos_index, f)



In [14]:
# Load Index
with open('index.pkl', 'rb') as f:
    index_loaded = pickle.load(f)
# print(index_loaded)

In [15]:
def process_query(query):
    query_terms = query.split()
    i = 0
    total_docs = len(documents)
    
    while i < len(query_terms):
        term = query_terms[i]

        if term == 'AND':
            result = and_postings(inv_index[query_terms[i-1]], inv_index[query_terms[i+1]], total_docs)
        elif term == 'OR':
            result = or_postings(inv_index[query_terms[i-1]], inv_index[query_terms[i+1]], total_docs)
        elif term == 'NOT':
            result = NOT(inv_index[query_terms[i+1]], total_docs)
        elif i == 0:
            try:
                result = inv_index[term]
            except KeyError:
                result = set()
        i += 2  # Move to the next query term

    return result

query = input('Enter your query - ')
print(process_query(query))


set()


In [16]:
# import pickle

# def load_index(file_path):
#     with open(file_path, 'rb') as file:
#         index = pickle.load(file)
#     return index


# def phrase_search(index, query):
#     query_terms = query.split()
#     result = set()

#     for term in query_terms:
#         if term in index:
#             if not result:
#                 result.update(set(index[term]))
#             else:
#                 result.intersection_update(set(index[term]))

#     return result

# def proximity_search(index, query, proximity):
#     query_terms = query.split()
#     result = set()

#     for term in query_terms:
#         if term in index:
#             if not result:
#                 result.update(set(index[term]))
#             else:
#                 result.intersection_update(set(index[term]))

#     # Check proximity
#     result = [doc_id for doc_id in result if check_proximity(index, query_terms, doc_id, proximity)]
#     return result

# def check_proximity(index, query_terms, doc_id, proximity):
#     positions = [index[term][doc_id] for term in query_terms if term in index]
#     for positions_set in zip(*positions):
#         for i in range(1, len(positions_set)):
#             if positions_set[i] - positions_set[i-1] <= proximity:
#                 return True
#     return False

# # Load the index into memory
# index_path = 'sampleindex.txt'
# loaded_index = load_index(index_loaded)

# # Example queries
# boolean_query = "term1 AND term2 OR term3 NOT term4"
# phrase_query = "phrase1 phrase2"
# proximity_query = "term1 NEAR/5 term2"

# # Run queries
# result_boolean = boolean_search(loaded_index, boolean_query)
# result_phrase = phrase_search(loaded_index, phrase_query)
# result_proximity = proximity_search(loaded_index, proximity_query, proximity=5)

# # Print the results
# print("Boolean Search Result:", result_boolean)
# print("Phrase Search Result:", result_phrase)
# print("Proximity Search Result:", result_proximity)
