Importing necessary packages 

In [2]:
import os
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pickle
import json
import re

# One-time NLTK resource downloads, left disabled. Presumably required on a
# fresh environment by word_tokenize and stopwords.words -- uncomment if NLTK
# raises a LookupError (TODO confirm against the installed NLTK version).
# nltk.download('punkt')
# nltk.download('stopwords')
# English stopword list held as a set so membership tests are cheap.
stop_words = set(stopwords.words('english'))
# Translation table that deletes every character listed in string.punctuation.
punctuation_table = str.maketrans('', '', string.punctuation)

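A small helper function that sorts the file names in `text_files` numerically by the first number in each name (plain string sorting would place `file10.txt` before `file2.txt`).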

In [3]:
def sortpls(doc):
    """Return the names in ``doc`` sorted by the first integer they contain.

    Args:
        doc: Any iterable of strings (for example a list of file names or a
            set of document names).

    Returns:
        A new list. Names containing digits come first, ordered by the value
        of their first run of digits (so ``file2.txt`` precedes
        ``file10.txt``). Names without any digit are placed after them in
        alphabetical order instead of raising ``AttributeError``.
    """
    def numeric_key(name):
        match = re.search(r'\d+', name)
        if match is None:
            # No number to sort by: group these names at the end.
            return (1, 0, name)
        # Empty third field keeps the sort stable for equal numbers.
        return (0, int(match.group()), '')

    return sorted(doc, key=numeric_key)

# Q1
For 5 files: file1.txt, file2.txt, file3.txt, file4.txt and file5.txt

In [4]:
source_directory = 'text_files'
SAMPLE_SIZE = 5  # number of files whose preprocessing steps are printed

# Keep only .txt entries *before* sorting, so directory entries that carry no
# number (e.g. `.ipynb_checkpoints`) never reach the numeric sort.
txt_files = [name for name in os.listdir(source_directory) if name.endswith('.txt')]

count = 0
for filename in sortpls(txt_files)[:SAMPLE_SIZE]:
    print("-----------------------------------------------------------------------")
    print(filename)
    file_path = os.path.join(source_directory, filename)
    # Read the whole file, then release the handle before doing any processing.
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    print("Original Text:", text)
    text = text.lower()  # lowercase the text
    print("1. Lower Case: ", text)
    tokens = word_tokenize(text)  # tokenize
    print("2. tokenized: ", ' '.join(tokens))
    tokens = [word for word in tokens if word not in stop_words]  # remove stopwords
    print("3. Stopwords Removed:", ' '.join(tokens))
    tokens = [w.translate(punctuation_table) for w in tokens]  # remove punctuations
    print("4. Punctuation Removed:", ' '.join(tokens))
    tokens = [word for word in tokens if word.strip()]  # remove blank space tokens
    print("5. Space Removed:", ' '.join(tokens))
    count += 1

-----------------------------------------------------------------------
file1.txt
Original Text: Loving these vintage springs on my vintage strat. They have a good tension and great stability. If you are floating your bridge and want the most out of your springs than these are the way to go.
1. Lower Case:  loving these vintage springs on my vintage strat. they have a good tension and great stability. if you are floating your bridge and want the most out of your springs than these are the way to go.
2. tokenized:  loving these vintage springs on my vintage strat . they have a good tension and great stability . if you are floating your bridge and want the most out of your springs than these are the way to go .
3. Stopwords Removed: loving vintage springs vintage strat . good tension great stability . floating bridge want springs way go .
4. Punctuation Removed: loving vintage springs vintage strat  good tension great stability  floating bridge want springs way go 
5. Space Removed: lovi

Created a general function, `preprocessing`, that applies the same steps to any text in a more structured way.

In [5]:
def preprocessing(text):
    """Normalise raw text into a space-separated string of cleaned tokens.

    Steps, in order: lowercase, tokenize with ``word_tokenize``, drop tokens
    found in ``stop_words``, delete punctuation characters from each remaining
    token, and discard tokens that end up empty or whitespace-only.

    Args:
        text: The raw input string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    cleaned = []
    for token in word_tokenize(text.lower()):
        if token in stop_words:
            continue
        stripped = token.translate(punctuation_table)
        # A token made only of punctuation becomes '' here and is dropped.
        if stripped.strip():
            cleaned.append(stripped)
    return ' '.join(cleaned)


In [6]:
destination_directory = 'preprocessed'
# exist_ok=True replaces the separate exists() check and makes the cell safe to re-run.
os.makedirs(destination_directory, exist_ok=True)

# stop_words and punctuation_table are built once in the setup cell at the top
# of the notebook; preprocessing() reads them from there.
for filename in os.listdir(source_directory):
    if not filename.endswith('.txt'):
        continue

    file_path = os.path.join(source_directory, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        text = preprocessing(file.read())

    # The input handle is already closed; write the cleaned text under the
    # same file name in the destination directory.
    processed_file_path = os.path.join(destination_directory, filename)
    with open(processed_file_path, 'w', encoding='utf-8') as processed_file:
        processed_file.write(text)

# Q2: Unigram Inverted Index and Boolean Queries

In [7]:
directory = 'preprocessed'
# term -> list of file names containing the term (each file listed once).
inverted_index = {}

for filename in os.listdir(directory):
    if not filename.endswith('.txt'):
        continue
    file_path = os.path.join(directory, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        words = file.read().split()

    # dict.fromkeys de-duplicates the file's words while keeping their
    # first-occurrence order, so each file is appended to a posting list at
    # most once without scanning that list for every token.
    for word in dict.fromkeys(words):
        inverted_index.setdefault(word, []).append(filename)


In [8]:
# Persist the in-memory inverted index to disk.
with open('inverted_index1.pkl', 'wb') as index_file:
    pickle.dump(inverted_index, index_file)

In [9]:
# Read the index back from the pickle written by this notebook and make it
# the working `inverted_index`.
with open('inverted_index1.pkl', 'rb') as index_file:
    loaded_inverted_index = pickle.load(index_file)
inverted_index = loaded_inverted_index

# Every preprocessed document name, as a set.
all_documents = {name for name in os.listdir('preprocessed') if name.endswith('.txt')}

In [None]:
# Render the loaded index as indented JSON and print it for inspection.
# NOTE(review): this prints every term with its full posting list, so the
# output grows with the size of the index.
pretty_dict = json.dumps(loaded_inverted_index, indent=4)
print(pretty_dict)

Set Algebra

In [11]:
def and_operation(doc_set1, doc_set2):
    """Return the documents that appear in both ``doc_set1`` and ``doc_set2``."""
    return doc_set1.intersection(doc_set2)

def or_operation(doc_set1, doc_set2):
    """Return the documents that appear in ``doc_set1``, ``doc_set2`` or both."""
    return doc_set1.union(doc_set2)

def and_not_operation(doc_set1, doc_set2):
    """Return the documents in ``doc_set1`` that are not in ``doc_set2``."""
    return doc_set1.difference(doc_set2)

def or_not_operation(doc_set1, doc_set2, all_docs=None):
    """Return ``doc_set1`` OR (NOT ``doc_set2``).

    Args:
        doc_set1: Set of documents matched so far.
        doc_set2: Set of documents to complement.
        all_docs: The universe used to compute the complement of
            ``doc_set2``. When omitted, the module-level ``all_documents``
            is used.

    Returns:
        The union of ``doc_set1`` with every document of ``all_docs`` that is
        not in ``doc_set2``.
    """
    if all_docs is None:
        # Looked up at call time: a default of `all_documents` in the signature
        # would be frozen when the function is defined and would go stale if
        # the document set is rebuilt afterwards.
        all_docs = all_documents
    not_doc_set2 = all_docs.difference(doc_set2)
    return doc_set1.union(not_doc_set2)

def get_docs_for_term(term, inverted_index):
    """Look up the documents that contain ``term``.

    Args:
        term: The term to look up.
        inverted_index: Mapping from term to the collection of documents
            containing it.

    Returns:
        A new set of those documents, or an empty set if the term is not
        in the index.
    """
    if term not in inverted_index:
        return set()
    postings = inverted_index[term]
    return set(postings)

# Dispatch table: operator text as typed in a query -> function that combines
# two document sets for that operator.
operation_functions = {
    'AND': and_operation,
    'OR': or_operation,
    'AND NOT': and_not_operation,
    'OR NOT': or_not_operation
}


In [12]:
n = int(input("Enter the number of queries: "))
for k in range(n):
    query = input(f"Query {k+1}: ")
    # The query goes through the same preprocessing as the indexed documents.
    processed_query_list = preprocessing(query).split()

    if not processed_query_list:
        print("Empty string")
        continue
    if len(processed_query_list) == 1:
        # Single term: no operators needed, look the postings up once.
        term = processed_query_list[0]
        term_docs = get_docs_for_term(term, inverted_index)
        print(f"Only One words is searched- {term}: found in {len(term_docs)} docs", sortpls(term_docs))
        continue

    ops = input("Enter operators (AND, OR, AND NOT, OR NOT), separated by commas: ")
    # Normalise case and inner whitespace so that e.g. "and  not" is accepted as "AND NOT".
    ops_list = [' '.join(op.upper().split()) for op in ops.split(',')]

    if any(op not in operation_functions for op in ops_list):
        print('invalid operator')
        continue

    if len(ops_list) != len(processed_query_list) - 1:
        print("Mismatch between number of terms and operators")
        continue

    # Echo of the query as it will be evaluated. Named `query_summary` rather
    # than `str` so the builtin `str` is not shadowed for the rest of the notebook.
    query_summary = f"Query {k+1}: {processed_query_list[0]} "
    for op, term in zip(ops_list, processed_query_list[1:]):
        query_summary += f'{op} {term} '
    print()

    # Operators are applied strictly left to right (no precedence).
    current_docs = get_docs_for_term(processed_query_list[0], inverted_index)
    try:
        for op, term in zip(ops_list, processed_query_list[1:]):
            next_docs = get_docs_for_term(term, inverted_index)
            current_docs = operation_functions[op](current_docs, next_docs)
    except Exception as error:
        # Do not report a partially evaluated result as the answer.
        print("something went wrong", error)
        continue

    current_docs = sortpls(current_docs)
    print(query_summary)
    print(f"Number of documents retrieved for query {k+1} : {len(current_docs)}")
    print(f"Names of documents retrieved for query {k+1} : {', '.join(current_docs)}\n")


Query 1: car OR bag AND NOT canister 
Number of documents retrieved for query 1 : 31
Names of documents retrieved for query 1 : file3.txt, file73.txt, file118.txt, file166.txt, file174.txt, file264.txt, file313.txt, file363.txt, file404.txt, file459.txt, file466.txt, file542.txt, file573.txt, file665.txt, file682.txt, file686.txt, file698.txt, file699.txt, file738.txt, file746.txt, file780.txt, file797.txt, file860.txt, file863.txt, file864.txt, file886.txt, file892.txt, file930.txt, file942.txt, file956.txt, file981.txt


Query 2: coffee AND brewing OR NOT techniques OR cookbook 
Number of documents retrieved for query 2 : 999
Names of documents retrieved for query 2 : file1.txt, file2.txt, file3.txt, file4.txt, file5.txt, file6.txt, file7.txt, file8.txt, file9.txt, file10.txt, file11.txt, file12.txt, file13.txt, file14.txt, file15.txt, file16.txt, file17.txt, file18.txt, file19.txt, file20.txt, file21.txt, file22.txt, file23.txt, file24.txt, file25.txt, file26.txt, file27.txt, file2

# Q3: Positional Index and Phrase Queries

The positional index is built by looping over the files in the `preprocessed` directory and storing, in a dictionary, each word (key) together with the positions at which it occurs in every file. Term frequencies are not stored because the question does not require them.
Note: a query is matched as an exact phrase. If other word(s) appear between the query terms in the indexed text, so that the terms keep their relative order but are no longer at adjacent positions, the query is treated as unsuccessful.

In [None]:
directory = 'preprocessed'
# Keep only .txt entries before sorting, so entries without a number in their
# name never reach the numeric sort.
filenames = sortpls([name for name in os.listdir(directory) if name.endswith('.txt')])

# term -> {file name -> [positions of the term in that file, 0-based]}
positional_index = {}
for filename in filenames:
    file_path = os.path.join(directory, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        doc_tokens = file.read().split()
    for position, word in enumerate(doc_tokens):
        positional_index.setdefault(word, {}).setdefault(filename, []).append(position)

with open('positional_index.pkl', 'wb') as f:
    pickle.dump(positional_index, f)

# Display a small sample instead of the whole index, which would flood the
# cell output; inspect `positional_index[term]` for a specific term.
dict(list(positional_index.items())[:5])

Loading the data structure from the saved pickle file

In [31]:
# Read the positional index back from the pickle written by this notebook.
with open('positional_index.pkl', 'rb') as index_file:
    loaded_positional_index = pickle.load(index_file)

In [48]:
def find_docs(query, ds):
    """Find the documents that contain ``query`` as an exact phrase.

    Args:
        query: Whitespace-separated query terms, already preprocessed.
        ds: Positional index mapping term -> {document: [positions]}.

    Returns:
        The set of documents in which all query terms occur at consecutive
        positions, in order. Empty if the query is empty, if any term is
        missing from the index, or if no document contains the phrase.
    """
    words = query.split()
    if not words or words[0] not in ds:
        return set()

    # For each candidate document, the positions at which the phrase matched
    # so far ends. Carrying these forward ties every word to the *same*
    # occurrence of the phrase; checking each adjacent pair on its own would
    # wrongly accept "a b x b c" for the query "a b c".
    phrase_ends = {doc: set(positions) for doc, positions in ds[words[0]].items()}

    for word in words[1:]:
        if word not in ds:
            return set()
        postings = ds[word]
        advanced = {}
        for doc, ends in phrase_ends.items():
            if doc not in postings:
                continue
            word_positions = set(postings[doc])
            next_ends = {p + 1 for p in ends if p + 1 in word_positions}
            if next_ends:
                advanced[doc] = next_ends
        phrase_ends = advanced
        if not phrase_ends:
            return set()

    return set(phrase_ends)

n = int(input("Enter the number of queries: "))
for i in range(n):
    query = input(f"Query {i+1}: ")
    if not query.strip():
        # Covers empty input and any kind of whitespace-only input. Only this
        # query is skipped; the remaining queries are still asked for.
        print("Query was empty")
        continue
    # Apply the same preprocessing that produced the indexed text.
    query = preprocessing(query)
    print(query)
    documents = find_docs(query, loaded_positional_index)
    if not documents:
        print("No Match Found")
        continue
    print(f"Number of documents retrieved for query {i+1} using positional index: {len(documents)}")
    print(f"Names of documents retrieved for query {i+1} using positional index: {', '.join(sortpls(documents))}\n")

great buy
Number of documents retrieved for query 1 using positional index: 7
Names of documents retrieved for query 1 using positional index: file5.txt, file105.txt, file167.txt, file214.txt, file283.txt, file712.txt, file906.txt

