In [None]:
import nltk
import glob
import os
import re
import pickle
import os
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

In [69]:
def rem_sw(arr):
  global stop_words
  reg_str = r'^'
  for i in range(len(arr)):
    for sw in stop_words:
      reg_str+=sw
      reg_str+='$'
      arr[i] = re.sub(reg_str, '', arr[i])
      reg_str = r'^'
  return arr

def rem_punc(str):
  str = re.sub(r'(\.|\?|\,|!|\:|\;|\&|\-|\(|\)|\{|\}|\'|\"|\/)', ' ', str)
  return str

In [70]:
def preprocessing(s):
    s = s.lower()
    s = re.sub(r'(\.|\?|\,|!|\:|\;|\&|\-|\(|\)|\{|\}|\'|\"|\/)', ' ', s)
    s = word_tokenize(s)
    s = rem_sw(s)
    for j in range(len(s)):
      s[j] = rem_punc(s[j])
    s[:] = [j for j in s if (j!='' and j!=' ')]
    return s

In [71]:
def generate_query(string, operations):
    s = ""
    for i in range(min(len(string), len(operations))):
        s += string[i]
        s += " "
        s += operations[i]
        s += " "
        
    if len(string) > len(operations):
        j = len(operations)
        while(j<len(string)):
            s += string[j]
            s += " "
            j += 1
    else:
        j = len(string)
        while(j<len(operations)):
            s += operations[j]
            s += " "
            j += 1
    return s

In [72]:
def load_data(path):
    entries = os.listdir(path)
    tokenised_docs = {}
    for i in entries:
        path1 = path+"/"+i
        f = open(path1, "r")
        a = list(f.read().split(","))
        tokenised_docs[i] = a
        
    # print(len(tokenised_docs))
    # print(doc_ids)

    # print("////////////////////////////////////////////////")

    return tokenised_docs



def create_index(tokenised_docs):
    inverted_index = {}
    for doc_id, tokens in tokenised_docs.items():
        for i in tokens:
            if i not in inverted_index:
                inverted_index[i] = [doc_id]
            else:
                if doc_id not in inverted_index[i]:
                    inverted_index[i].append(doc_id)
    return inverted_index



def save_index(inverted_index):
    with open('inverted_index.pickle', 'wb') as f:
        pickle.dump(inverted_index, f)



def load_index():
    with open('inverted_index.pickle', 'rb') as f:
        loaded_inverted_index = pickle.load(f)

In [74]:
def intersection(set_1, set_2):
    comp = 0
    res = set()
    if len(set_1) < len(set_2):
        set_1, set_2 = set_2, set_1
    for doc_id in set_2:
        comp += 1
        if doc_id in set_1:
            res.add(doc_id)
    return res, comp

def union(set_1, set_2):
    comp = 0
    res = set_1.copy()
    for doc_id in set_2:
        comp += 1
        res.add(doc_id)
    return res, comp

def difference(set_1, set_2):
    comp = 0
    res = set_1.copy()
    for doc_id in set_2:
        comp += 1
        res.discard(doc_id)
    return res, comp

def evaluate_query(index, doc_ids, query):
    words = query.split()
    doc_id_sets = []
    operation = None
    not_operation = False
    comp = 0
    for i in words:
        if i == 'AND':
            operation = intersection
        elif i == 'OR':
            operation = union
        elif i == 'NOT':
            not_operation = True
        else:
            if i in index:
                posting = index[i]
                if not_operation:
                    posting = difference(doc_ids, posting)[0]
                    not_operation = False
                doc_id_sets.append(posting)
            else:
                doc_id_sets.append(set())
    if not doc_id_sets:
        return set(), 0
    if len(doc_id_sets) == 1:
        return doc_id_sets[0], 0
    res = doc_id_sets[0]
    for i in range(1, len(doc_id_sets)):
        if not res:
            break
        if operation is None:
            operation = union
        if not_operation:
            not_operation = False
            doc_id_sets[i] = difference(doc_ids, doc_id_sets[i])[0]
            operation = difference
        res, op_comp = operation(res, doc_id_sets[i])
        comp += op_comp
        operation = None
    return res, comp


In [79]:
def main():
    path = "drive/MyDrive/data/data/"
    tokenised_docs = load_data(path)
    index = create_index(tokenised_docs)
    doc_ids = set(tokenised_docs.keys())

    n = int(input("Enter number of queries: "))
    queries = []
    for z in range(n):
        string = input("Enter string: ")
        string = preprocessing(string)
        operations = list(input("Enter operations: ").split(","))
        for i in range(len(operations)):
            operations[i] = operations[i].strip()
        queries.append(generate_query(string, operations))
        print()
    print("-----------------------------------------------------------------------------------------------------")    
    print()
    
    i = 0
    for query in queries:
        i += 1
        result, comparisons = evaluate_query(index, doc_ids, query)
        print(f'Query {i}: {query}')
        print(f'Number of documents retrieved for query {i}: {len(result)}')

        if len(result)==0:
            print(f'Names of the documents retrieved for query {i}: NONE')
        else:
            print(f'Names of the documents retrieved for query {i}: {result}')
            
        print(f'Number of comparisons: {comparisons}')
        print()
        print("-----------------------------------------------------------------------------------------------------")
        print()

if __name__ == '__main__':
    main()


Enter number of queries: 2
Enter string: Car bag in a canister
Enter operations: OR, AND NOT

Enter string: Coffee brewing techniques in cookbook
Enter operations: AND, OR NOT, OR

-----------------------------------------------------------------------------------------------------

Query 1: car OR bag AND NOT canister 
Number of documents retrieved for query 1: 0
Names of the documents retrieved for query 1: NONE
Number of comparisons: 0

-----------------------------------------------------------------------------------------------------

Query 2: coffee AND brewing OR NOT techniques OR cookbook 
Number of documents retrieved for query 2: 0
Names of the documents retrieved for query 2: NONE
Number of comparisons: 0

-----------------------------------------------------------------------------------------------------

