In [107]:
import numpy as np
import pandas as pd
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.tokenize import word_tokenize
import os
import pickle
import string
from nltk.corpus import stopwords

In [108]:
class UnigramInvertedIndex:
    """Inverted index from single tokens to the names of the files containing them.

    Attributes:
        index: dict mapping token -> sorted list of file names, or None until
            the index has been built or loaded.
        list_of_file_paths: the paths given to build_inverted_index, or None.
        file_names: last path component of every indexed path, or None.
    """

    def __init__(self):
        # Explicit None markers let save() report an unbuilt index with its own
        # error instead of failing with AttributeError on a missing attribute.
        self.index = None
        self.list_of_file_paths = None
        self.file_names = None

    @staticmethod
    def _file_name(file_path):
        """Return the text after the last path separator (backslash or forward slash)."""
        return file_path.replace('/', '\\').split('\\')[-1]

    def build_inverted_index(self, list_of_file_paths):
        """Index the first line of every file in `list_of_file_paths`.

        Each file is expected to hold its whole (preprocessed) text on the
        first line; only that line is tokenized. An empty file contributes
        no tokens.

        Args:
            list_of_file_paths: iterable of paths to the files to index.
        """
        postings = {}
        file_names = []
        for file_path in list_of_file_paths:
            file_name = self._file_name(file_path)
            file_names.append(file_name)
            with open(file_path, 'r') as f:
                # readline() returns '' for an empty file, where
                # readlines()[0] would raise IndexError.
                line = f.readline()
            for token in word_tokenize(line):
                # A set per token makes duplicate suppression O(1).
                postings.setdefault(token, set()).add(file_name)
        # Postings are stored sorted: the boolean merge routines that consume
        # the index walk two lists in lockstep and rely on ascending order.
        self.index = {token: sorted(names) for token, names in postings.items()}
        self.list_of_file_paths = list_of_file_paths
        self.file_names = file_names

    def save(self, file_path):
        """Pickle the index, the indexed paths and the file names to `file_path`.

        Raises:
            Exception: if the index has not been built or loaded yet.
        """
        if self.index is None or self.list_of_file_paths is None or self.file_names is None:
            raise Exception('Index is not built yet')
        with open(file_path, 'wb') as f:
            pickle.dump([self.index, self.list_of_file_paths, self.file_names], f)

    def load(self, file_path):
        """Restore the state written by save() from `file_path`.

        SECURITY: pickle.load can execute arbitrary code; only load files
        that this notebook (or another trusted source) produced.
        """
        with open(file_path, 'rb') as f:
            self.index, self.list_of_file_paths, self.file_names = pickle.load(f)


In [109]:
# Raw string: '\d' and '\p' are invalid escape sequences in a normal string
# literal and only worked by accident (newer Pythons warn about them).
DATA_DIR = r'..\data\preprocessed_data'

# os.listdir() returns names in arbitrary order; sorting makes the run
# reproducible and keeps documents in ascending name order.
files = sorted(os.listdir(DATA_DIR))

# Paths keep the backslash separator used throughout this notebook.
file_paths = [DATA_DIR + '\\' + file for file in files]

In [110]:
# Create the index object; it holds no postings until it is built or loaded.
unigram_inverted_index = UnigramInvertedIndex()

In [111]:
# Tokenize the first line of every listed file and record which files contain each token.
unigram_inverted_index.build_inverted_index(file_paths)

In [112]:
# Pickle the index, the input paths and the file names into a single file.
# open(..., 'wb') does not create directories, so "Dumps" must already exist.
unigram_inverted_index.save(r"Dumps\unigram_inverted_index.pkl")

In [113]:
# Start from a new object and restore its state from the pickle file
# rather than re-indexing the corpus.
unigram_inverted_index = UnigramInvertedIndex()
unigram_inverted_index.load(r"Dumps\unigram_inverted_index.pkl")

In [114]:
# NLTK's English stop-word list; preproc() drops every query token found in it.
stopword_list = stopwords.words('english')

def preproc(query):
    """Normalize a raw query string into a list of search terms.

    Steps: lowercase, tokenize with NLTK, drop stop words, delete every
    punctuation character except the hyphen, and discard tokens that end
    up empty.

    Args:
        query: raw query text.

    Returns:
        List of cleaned tokens in their original order.
    """
    # Deletion table for all punctuation except '-'.
    strip_table = str.maketrans('', '', string.punctuation.replace('-', ''))

    lowered_tokens = word_tokenize(query.lower())
    content_tokens = (tok for tok in lowered_tokens if tok not in stopword_list)
    cleaned_tokens = [tok.translate(strip_table) for tok in content_tokens]

    # Joining and re-splitting removes tokens that became empty strings.
    return ' '.join(cleaned_tokens).split()

In [115]:
class queryHandler:
    """Evaluate boolean queries (AND, OR, AND NOT, OR NOT) on a unigram inverted index.

    Postings lists are combined with linear two-pointer merges, which require
    both inputs to be sorted ascending. Every merge also reports how many
    element comparisons it performed.
    """

    def __init__(self, unigram_inverted_index):
        """Keep references to the index object and its three data attributes."""
        self.unigram_inverted_index = unigram_inverted_index
        self.index = unigram_inverted_index.index
        self.list_of_file_paths = unigram_inverted_index.list_of_file_paths
        self.file_names = unigram_inverted_index.file_names

    def query(self, sequence, op_sequence):
        """Run a boolean query, applying the operators strictly left to right.

        Args:
            sequence: raw query text; it is normalized with preproc(), so stop
                words and punctuation do not count as terms.
            op_sequence: comma-separated operators ('and', 'or', 'and not',
                'or not'), one fewer than the number of terms that survive
                preprocessing. Case and extra whitespace are ignored. A blank
                string means "no operators" (single-term query).

        Returns:
            dict with keys:
                'docs': sorted list of matching document names,
                'comparisons': total comparisons made by the merges,
                'sequence': the preprocessed terms,
                'op_sequence': the normalized operators,
                'final_query': the terms and operators rendered as one string.

        Raises:
            Exception: on an unknown operator, an operator count that does not
                match the number of terms, or a query with no terms left.
        """
        sequence = preproc(sequence)

        if op_sequence.strip():
            # ' '.join(op.split()) trims and collapses inner whitespace, so
            # 'AND   NOT' is accepted as 'and not'.
            op_sequence = [' '.join(op.split()).lower() for op in op_sequence.split(',')]
        else:
            # ''.split(',') would yield [''], which can never validate.
            op_sequence = []

        operations = {
            'and': self.and_query,
            'or': self.or_query,
            'and not': self.and_not_query,
            'or not': self.or_not_query,
        }
        for op in op_sequence:
            if op not in operations:
                raise Exception(f'Invalid boolean operator {op}')
        if not sequence:
            raise Exception('Invalid query sequence. No search terms left after preprocessing')
        if len(op_sequence) != len(sequence) - 1:
            raise Exception(f'Invalid query sequence. Expected {len(sequence) - 1} boolean operators, got {len(op_sequence)}')

        # sorted() both copies each postings list (the index is never aliased
        # into the result) and guarantees the order the merges depend on.
        doc_lists = [sorted(self.index.get(token, [])) for token in sequence]

        # Left-to-right evaluation: fold each operand into the running result.
        result = doc_lists[0]
        comparison_count = 0
        for op, operand in zip(op_sequence, doc_lists[1:]):
            result, comps = operations[op](result, operand)
            comparison_count += comps

        parts = [f'{token} {op.upper()}' for token, op in zip(sequence, op_sequence)]
        parts.append(sequence[-1])
        final_query = ' '.join(parts)

        return {
            'docs': sorted(result),
            'comparisons': comparison_count,
            'sequence': sequence,
            'op_sequence': op_sequence,
            'final_query': final_query
        }

    def not_query(self, doc_list):
        """Return (documents of the collection absent from `doc_list`, comparisons).

        `doc_list` must be sorted ascending; the collection is self.file_names.
        """
        all_files = sorted(self.file_names)
        not_docs = []
        i = 0
        j = 0
        comparisons = 0
        while i < len(all_files) and j < len(doc_list):
            comparisons += 1
            if all_files[i] == doc_list[j]:
                i += 1
                j += 1
            elif all_files[i] < doc_list[j]:
                not_docs.append(all_files[i])
                i += 1
            else:
                # doc_list entry is not part of the collection; skip it.
                j += 1
        # Everything left in the collection cannot be in doc_list.
        not_docs.extend(all_files[i:])
        return not_docs, comparisons

    def and_query(self, doc_list1, doc_list2):
        """Return (sorted intersection of two sorted lists, comparisons)."""
        common_docs = []
        comparisons = 0
        i = 0
        j = 0
        while i < len(doc_list1) and j < len(doc_list2):
            comparisons += 1
            if doc_list1[i] == doc_list2[j]:
                common_docs.append(doc_list1[i])
                i += 1
                j += 1
            elif doc_list1[i] < doc_list2[j]:
                i += 1
            else:
                j += 1
        return common_docs, comparisons

    def or_query(self, doc_list1, doc_list2):
        """Return (sorted union of two sorted lists, comparisons)."""
        merged_docs = []
        comparisons = 0
        i = 0
        j = 0
        while i < len(doc_list1) and j < len(doc_list2):
            comparisons += 1
            if doc_list1[i] == doc_list2[j]:
                merged_docs.append(doc_list1[i])
                i += 1
                j += 1
            elif doc_list1[i] < doc_list2[j]:
                merged_docs.append(doc_list1[i])
                i += 1
            else:
                merged_docs.append(doc_list2[j])
                j += 1
        # At most one of the two tails is non-empty.
        merged_docs.extend(doc_list1[i:])
        merged_docs.extend(doc_list2[j:])
        return merged_docs, comparisons

    def and_not_query(self, doc_list1, doc_list2):
        """Return (documents in `doc_list1` but not in `doc_list2`, comparisons).

        Both lists must be sorted ascending.
        """
        remaining_docs = []
        comparisons = 0
        i = 0
        j = 0
        while i < len(doc_list1) and j < len(doc_list2):
            comparisons += 1
            if doc_list1[i] == doc_list2[j]:
                i += 1
                j += 1
            elif doc_list1[i] < doc_list2[j]:
                remaining_docs.append(doc_list1[i])
                i += 1
            else:
                j += 1
        # Once doc_list2 is exhausted nothing can exclude the rest of
        # doc_list1, so the whole tail belongs to the result.
        remaining_docs.extend(doc_list1[i:])
        return remaining_docs, comparisons

    def or_not_query(self, doc_list1, doc_list2):
        """Return (`doc_list1` united with the complement of `doc_list2`, comparisons).

        Computed as or_query(doc_list1, not_query(doc_list2)), so documents
        present in both lists are kept and no document appears twice. The
        comparison count is the sum of both steps.
        """
        complement, not_comparisons = self.not_query(doc_list2)
        merged_docs, or_comparisons = self.or_query(doc_list1, complement)
        return merged_docs, not_comparisons + or_comparisons



In [116]:
# Bind a query handler to the index that was restored from the pickle file.
query_handler = queryHandler(unigram_inverted_index)

In [117]:
# Example query. The query text goes through preproc(), so stop words in it are
# not search terms; the operators in the second argument are applied left to right.
query_handler.query('reynolds number and potential shear', 'and, or not, and')

{'docs': ['cranfield0002',
  'cranfield0003',
  'cranfield0004',
  'cranfield0009',
  'cranfield0016',
  'cranfield0045',
  'cranfield0050',
  'cranfield0065',
  'cranfield0088',
  'cranfield0089',
  'cranfield0099',
  'cranfield0109',
  'cranfield0116',
  'cranfield0121',
  'cranfield0126',
  'cranfield0165',
  'cranfield0171',
  'cranfield0180',
  'cranfield0187',
  'cranfield0191',
  'cranfield0192',
  'cranfield0255',
  'cranfield0268',
  'cranfield0306',
  'cranfield0324',
  'cranfield0329',
  'cranfield0365',
  'cranfield0366',
  'cranfield0388',
  'cranfield0389',
  'cranfield0393',
  'cranfield0397',
  'cranfield0398',
  'cranfield0400',
  'cranfield0412',
  'cranfield0418',
  'cranfield0419',
  'cranfield0452',
  'cranfield0453',
  'cranfield0484',
  'cranfield0491',
  'cranfield0517',
  'cranfield0538',
  'cranfield0550',
  'cranfield0629',
  'cranfield0659',
  'cranfield0660',
  'cranfield0664',
  'cranfield0720',
  'cranfield0820',
  'cranfield0826',
  'cranfield0854',
  'c

In [118]:
class mainIO:
    """Console front end: read a batch of boolean queries, then report each result."""

    def __init__(self, unigram_inverted_index):
        """Store the index and create the query handler that will serve the queries."""
        self.unigram_inverted_index = unigram_inverted_index
        self.query_handler = queryHandler(unigram_inverted_index)

    def run(self):
        """Prompt for all queries first, then evaluate and print them in order.

        For every query the final query string, the number of retrieved
        documents, their names and the comparison count are printed.
        """
        total = int(input('Enter Number of Queries: '))

        # Collect every (query, operators) pair before evaluating anything.
        pending = []
        for _ in range(total):
            text = input('Enter Query: ')
            operators = input('Enter Operator Sequence: ')
            pending.append((text, operators))

        for number, (text, operators) in enumerate(pending, start=1):
            result = self.query_handler.query(text, operators)
            retrieved = result["docs"]
            print(f'Query {number}: {result["final_query"]}')
            print(f'Number of documents retrieved for query {number}: {len(retrieved)}')
            print(f'Names of the documents retrieved for query {number}: {retrieved}')
            print(f'Number of comparisons required for query {number}: {result["comparisons"]}')

In [119]:
# Interactive session: asks for the number of queries, then each query and its operator list.
mainIO(unigram_inverted_index).run()

Query 1: reynolds AND number OR NOT potential AND shear
Number of documents retrieved for query 1: 88
Names of the documents retrieved for query 1: ['cranfield0002', 'cranfield0003', 'cranfield0004', 'cranfield0009', 'cranfield0016', 'cranfield0045', 'cranfield0050', 'cranfield0065', 'cranfield0088', 'cranfield0089', 'cranfield0099', 'cranfield0109', 'cranfield0116', 'cranfield0121', 'cranfield0126', 'cranfield0165', 'cranfield0171', 'cranfield0180', 'cranfield0187', 'cranfield0191', 'cranfield0192', 'cranfield0255', 'cranfield0268', 'cranfield0306', 'cranfield0324', 'cranfield0329', 'cranfield0365', 'cranfield0366', 'cranfield0388', 'cranfield0389', 'cranfield0393', 'cranfield0397', 'cranfield0398', 'cranfield0400', 'cranfield0412', 'cranfield0418', 'cranfield0419', 'cranfield0452', 'cranfield0453', 'cranfield0484', 'cranfield0491', 'cranfield0517', 'cranfield0538', 'cranfield0550', 'cranfield0629', 'cranfield0659', 'cranfield0660', 'cranfield0664', 'cranfield0720', 'cranfield0820', '