In [31]:
# Imports and one-time NLTK setup shared by every cell below.
import nltk
import glob
import os
import re
# The third-party pickle5 package is aliased so later cells refer to it as `pickle`.
import pickle5 as pickle
import os  # NOTE(review): duplicate of the `import os` a few lines up
from nltk.tokenize import sent_tokenize, word_tokenize
# Download NLTK's 'punkt' package (presumably the tokenizer data word_tokenize relies on — confirm).
nltk.download('punkt')
from nltk.corpus import stopwords
# Download the 'stopwords' package ahead of the stopwords.words('english') call below.
nltk.download('stopwords')

# English stop words held as a set; rem_sw reads this module-level name.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rupin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rupin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
def rem_sw(arr, stopword_set=None):
    """Blank out every token in ``arr`` that is a stop word.

    Tokens are compared for exact equality with the stop words. Matching
    tokens are replaced by the empty string (not removed), so positions are
    preserved and the caller can filter the blanks afterwards.

    Args:
        arr: Mutable list of string tokens. It is modified in place.
        stopword_set: Optional collection of stop words to use. When omitted,
            the module-level ``stop_words`` collection is used.

    Returns:
        The same list object ``arr``, after blanking.
    """
    if stopword_set is None:
        # Default to the notebook-wide stop-word collection.
        stopword_set = stop_words
    # A membership test replaces the previous per-(token, stop word) regex
    # "^word$". It is an exact comparison, so stop words containing regex
    # metacharacters can no longer over-match.
    for idx, token in enumerate(arr):
        if token in stopword_set:
            arr[idx] = ''
    return arr

def rem_punc(str):
    """Return ``str`` with every listed punctuation character replaced by one space.

    The characters handled are . ? , ! : ; & - ( ) { } ' " and /.
    Each occurrence becomes its own space, so runs of punctuation yield
    runs of spaces.
    """
    punctuation = re.compile(r'(\.|\?|\,|!|\:|\;|\&|\-|\(|\)|\{|\}|\'|\"|\/)')
    return punctuation.sub(' ', str)

In [33]:
def preprocessing(s):
    """Turn a raw string into a list of cleaned tokens.

    Steps, in order: lowercase the text, replace the listed punctuation
    characters with spaces, tokenize with ``word_tokenize``, blank out stop
    words via ``rem_sw``, run ``rem_punc`` over each token, and finally drop
    tokens that are exactly ``''`` or ``' '``.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    lowered = s.lower()
    depunctuated = re.sub(r'(\.|\?|\,|!|\:|\;|\&|\-|\(|\)|\{|\}|\'|\"|\/)', ' ', lowered)
    tokens = rem_sw(word_tokenize(depunctuated))
    cleaned = [rem_punc(token) for token in tokens]
    # Stop words were blanked to '' by rem_sw; discard those placeholders here.
    return [token for token in cleaned if token not in ('', ' ')]

In [34]:
def generate_query(string, operations):
    """Interleave query terms and operators into one space-separated string.

    Terms and operators alternate (term, operator, term, operator, ...) for
    as long as both sequences have items. Whatever is left over in the longer
    sequence is appended afterwards. Every piece is followed by a single
    space, so a non-empty result always ends with a trailing space.

    Args:
        string: Sequence of term strings (a plain ``str`` is treated as a
            sequence of characters).
        operations: Sequence of operator strings.

    Returns:
        The assembled query string.
    """
    pieces = []
    for term, op in zip(string, operations):
        pieces.append(term)
        pieces.append(op)

    # At most one of these two tails is non-empty.
    shared = min(len(string), len(operations))
    pieces.extend(string[shared:])
    pieces.extend(operations[shared:])

    return "".join(piece + " " for piece in pieces)

In [35]:
def load_data(path):
    """Read every entry of a directory as a comma-separated token list.

    Args:
        path: Directory whose entries are read. A trailing path separator is
            accepted.

    Returns:
        A dict mapping each entry name (as returned by ``os.listdir``) to the
        list obtained by splitting that file's full text on ``","``. An empty
        file maps to ``['']``.

    Raises:
        OSError: If the directory cannot be listed or an entry cannot be
            opened for reading.
    """
    tokenised_docs = {}
    for entry in os.listdir(path):
        # os.path.join avoids a doubled separator when `path` already ends
        # with one; the `with` block closes each handle promptly instead of
        # leaving it open until garbage collection.
        with open(os.path.join(path, entry), "r") as f:
            tokenised_docs[entry] = f.read().split(",")
    return tokenised_docs



def create_index(tokenised_docs):
    """Build an inverted index from per-document token lists.

    Args:
        tokenised_docs: Mapping of document id -> iterable of hashable tokens.

    Returns:
        A dict mapping each token to the list of document ids that contain
        it. Each id appears at most once per token, in the order the
        documents are iterated. Tokens appear as keys in first-encounter
        order.
    """
    inverted_index = {}
    for doc_id, tokens in tokenised_docs.items():
        # dict.fromkeys de-duplicates the document's tokens while keeping
        # first-occurrence order, so each posting list receives this doc_id
        # exactly once without a linear "is it already in the list" scan.
        # This relies on the mapping yielding each doc_id only once.
        for token in dict.fromkeys(tokens):
            inverted_index.setdefault(token, []).append(doc_id)
    return inverted_index



def save_index(inverted_index):
    """Pickle ``inverted_index`` to ``inverted_index.pickle``.

    The file is written relative to the current working directory and any
    existing file with that name is overwritten.
    """
    target = 'inverted_index.pickle'
    with open(target, 'wb') as index_file:
        pickle.dump(inverted_index, index_file)



def load_index():
    """Load the inverted index stored in ``inverted_index.pickle``.

    The file is read relative to the current working directory.

    Returns:
        The unpickled object (the index previously written by ``save_index``).

    Raises:
        OSError: If the file cannot be opened.
    """
    # NOTE: unpickling can execute arbitrary code. Only load an index file
    # that this notebook produced itself, never one from an untrusted source.
    with open('inverted_index.pickle', 'rb') as f:
        # The loaded object used to be bound to a local and dropped, so the
        # function always returned None; hand it back to the caller.
        return pickle.load(f)

In [36]:
def intersection(set_1, set_2):
    """Intersect two collections of document ids.

    The shorter collection is walked and each of its items is looked up in
    the longer one (on a tie, ``set_2`` is the one walked).

    Returns:
        A tuple ``(common, comparisons)`` where ``common`` is a new set of
        the ids present in both inputs and ``comparisons`` is the number of
        items walked, i.e. the length of the shorter input.
    """
    if len(set_1) < len(set_2):
        probe, lookup = set_1, set_2
    else:
        probe, lookup = set_2, set_1
    common = {doc_id for doc_id in probe if doc_id in lookup}
    return common, len(probe)

def union(set_1, set_2):
    """Union two collections of document ids.

    Returns:
        A tuple ``(merged, comparisons)`` where ``merged`` is a new set with
        every id from either input and ``comparisons`` is the number of items
        in ``set_2`` (one per item added). Neither input is modified.
    """
    merged = set(set_1)
    merged.update(set_2)
    return merged, len(set_2)

def difference(set_1, set_2):
    """Remove the ids of ``set_2`` from a copy of ``set_1``.

    Returns:
        A tuple ``(remaining, comparisons)`` where ``remaining`` is a new set
        holding the ids of ``set_1`` that are not in ``set_2`` and
        ``comparisons`` is the number of items in ``set_2`` (one per removal
        attempt). Neither input is modified.
    """
    remaining = set(set_1)
    remaining.difference_update(set_2)
    return remaining, len(set_2)

def evaluate_query(index, doc_ids, query):
    """Evaluate a Boolean query strictly left to right against an inverted index.

    The query is split on whitespace. ``AND`` and ``OR`` select the operator
    used to combine the next term with the running result; ``NOT`` negates the
    next term. Every other word is a term looked up in ``index``.

    Semantics:
        * A term missing from ``index`` has an empty posting list (it is no
          longer skipped, which used to leave the result untouched).
        * ``x AND NOT t`` is the set difference ``x - t``.
        * ``x OR NOT t`` is ``x | (all documents - t)``.
        * A negated term with no pending operator (e.g. a leading ``NOT t``)
          yields ``all documents - t``.
        * A term with no pending operator replaces the running result; this
          is how the first term starts the evaluation.
        * The pending operator and the NOT flag are cleared after every term.

    Args:
        index: Mapping of term -> collection of document ids.
        doc_ids: Iterable of all document ids (the universe used for NOT).
        query: Space-separated string of terms and the operators
            ``AND`` / ``OR`` / ``NOT``.

    Returns:
        A tuple ``(result, comparisons)``. ``result`` is a set of matching
        document ids (all documents for an empty query). ``comparisons`` is
        the sum of the counts reported by ``intersection``, ``union`` and
        ``difference`` for every operation performed, including the
        differences used to evaluate NOT.
    """
    universe = set(doc_ids)
    res = universe
    comp = 0
    operation = None       # binary operator waiting for its right-hand term
    not_operation = False  # True when the next term must be negated
    for word in query.split():
        if word == 'AND':
            operation = intersection
        elif word == 'OR':
            operation = union
        elif word == 'NOT':
            not_operation = True
        else:
            posting = index[word] if word in index else []
            if not_operation and operation is intersection:
                # x AND NOT t == x - t: no need to materialise the complement.
                res, op_comparisons = difference(res, posting)
            else:
                op_comparisons = 0
                if not_operation:
                    # Complement of the term with respect to all documents.
                    posting, op_comparisons = difference(universe, posting)
                if operation is None:
                    # Copy so the caller never receives (and cannot mutate)
                    # the index's own posting list.
                    res = set(posting)
                else:
                    res, merge_comparisons = operation(res, posting)
                    op_comparisons += merge_comparisons
            comp += op_comparisons
            # Each operator / NOT applies to exactly one term.
            operation = None
            not_operation = False
    return res, comp

In [None]:
def main():
    """Interactive driver: build the index, read queries, print the results.

    Loads and indexes the dataset directory, asks for the number of queries,
    then for each one reads a free-text string (run through ``preprocessing``)
    and a comma-separated list of operators. After all queries are collected,
    each is evaluated and its matching document names (sorted) and comparison
    count are printed.
    """
    path = "CSE508_Winter2023_Dataset_upd/CSE508_Winter2023_Dataset/"
    tokenised_docs = load_data(path)
    index = create_index(tokenised_docs)
    doc_ids = set(tokenised_docs.keys())

    n = int(input("Enter number of queries: "))
    queries = []
    for _ in range(n):
        # The text is preprocessed before the operators are requested.
        terms = preprocessing(input("Enter string: "))
        operations = [op.strip() for op in input("Enter operations: ").split(",")]
        queries.append(generate_query(terms, operations))
        print()
    print("-----------------------------------------------------------------------------------------------------")
    print()

    for i, query in enumerate(queries, start=1):
        result, comparisons = evaluate_query(index, doc_ids, query)
        print(f'Query {i}: {query}')
        print(f'Number of documents retrieved for query {i}: {len(result)}')

        if len(result) > 0:
            lst = sorted(result)
            print(f'Names of the documents retrieved for query {i}: {lst}')
        else:
            print(f'Names of the documents retrieved for query {i}: NONE')

        print(f'Number of comparisons: {comparisons}')
        print()
        print("-----------------------------------------------------------------------------------------------------")
        print()

if __name__ == '__main__':
    main()
