In [2]:
import numpy as np
import pandas as pd
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.tokenize import word_tokenize
import os
import pickle
import string
from nltk.corpus import stopwords

### Bigram

In [3]:
class BigramInvertedIndex:
    """Inverted index from each adjacent-token pair (bigram) to the files containing it.

    Attributes (all ``None`` until ``build_inverted_index`` or ``load`` runs):
        index: dict mapping ``"tok1 tok2"`` to a sorted, duplicate-free list of file names.
        list_of_file_paths: the paths that were indexed, in the order given.
        file_names: last backslash-separated component of each path, same order.
    """

    def __init__(self):
        # Start as None so save() can report an unbuilt index with its own
        # error message instead of failing with AttributeError.
        self.index = None
        self.list_of_file_paths = None
        self.file_names = None

    def build_inverted_index(self, list_of_file_paths):
        """Index the first line of every file in ``list_of_file_paths``.

        Only the first line of each file is tokenized (with ``word_tokenize``);
        an empty file contributes no bigrams.
        """
        postings = {}
        file_names = []
        for file_path in list_of_file_paths:
            file_name = file_path.split('\\')[-1]
            file_names.append(file_name)
            with open(file_path, 'r') as f:
                # readline() returns '' for an empty file rather than raising.
                line = f.readline()
            tokens = word_tokenize(line)
            for first, second in zip(tokens, tokens[1:]):
                # De-duplicate on the stored file name (not the full path), so a
                # bigram repeated inside one file is recorded once.
                postings.setdefault(first + ' ' + second, set()).add(file_name)

        # Sorted lists: the merge-based AND intersection relies on ordered postings.
        self.index = {bigram: sorted(names) for bigram, names in postings.items()}
        self.list_of_file_paths = list_of_file_paths
        self.file_names = file_names

    def save(self, file_path):
        """Pickle ``[index, list_of_file_paths, file_names]`` to ``file_path``.

        Raises:
            Exception: if the index has not been built or loaded yet.
        """
        if self.index is None or self.list_of_file_paths is None or self.file_names is None:
            raise Exception('Index is not built yet')
        with open(file_path, 'wb') as f:
            pickle.dump([self.index, self.list_of_file_paths, self.file_names], f)

    def load(self, file_path):
        """Restore the three attributes from a file written by ``save``."""
        # NOTE: pickle.load can execute arbitrary code; only load dumps you created.
        with open(file_path, 'rb') as f:
            self.index, self.list_of_file_paths, self.file_names = pickle.load(f)


In [4]:
# Directory containing the preprocessed files to index. A raw string is used
# because "\d" and "\p" are invalid escape sequences in a normal literal.
DATA_DIR = r'..\data\preprocessed_data'

# os.listdir returns entries in arbitrary order; sorting keeps runs reproducible.
files = sorted(os.listdir(DATA_DIR))
file_paths = [DATA_DIR + '\\' + file for file in files]

In [5]:
# Create an empty bigram index; it is populated in the next cell.
bigram_inverted_index = BigramInvertedIndex()

In [6]:
# Tokenize every file in file_paths and fill the bigram -> file-name postings.
bigram_inverted_index.build_inverted_index(file_paths)

In [7]:
# Persist the built index with pickle; the Dumps directory must already exist.
bigram_inverted_index.save(r"Dumps\bigram_inverted_index.pkl")

In [8]:
# English stop words that preproc strips from queries (assigned once).
stopword_list = stopwords.words('english')

def preproc(query):
    """Turn a raw query string into its list of bigram strings.

    The query is lower-cased and tokenized with ``word_tokenize``; tokens found
    in ``stopword_list``, tokens equal to a single punctuation character, and
    the token ``' '`` are dropped. Each remaining adjacent pair is joined with
    one space. Fewer than two surviving tokens yields an empty list.
    """
    # A set of single characters gives exact-match semantics (no substring matching).
    dropped = set(string.punctuation)
    dropped.add(' ')
    kept = [
        tok
        for tok in word_tokenize(query.lower())
        if tok not in stopword_list and tok not in dropped
    ]
    return [left + ' ' + right for left, right in zip(kept, kept[1:])]

In [9]:
class queryHandlerBigram:
    """Answers queries by AND-ing the posting lists of the query's bigrams."""

    def __init__(self, bigram_inverted_index):
        """Keep references to a built index object and its three attributes."""
        self.bigram_inverted_index = bigram_inverted_index
        self.index = bigram_inverted_index.index
        self.list_of_file_paths = bigram_inverted_index.list_of_file_paths
        self.file_names = bigram_inverted_index.file_names

    def query(self, sequence):
        """Return the files that contain every bigram of ``sequence``.

        Args:
            sequence: raw query string; it is converted to bigrams by ``preproc``.

        Returns:
            dict with keys ``docs`` (matching file names), ``comparisons``
            (total comparisons made by ``and_query``), ``sequence`` (the
            bigrams), ``op_sequence`` (one ``'and'`` per merge) and
            ``final_query`` (bigrams joined by ``' AND '``). A query that
            produces no bigrams returns empty ``docs`` and an empty
            ``final_query`` instead of raising.
        """
        sequence = preproc(sequence)

        # and_query is a two-pointer merge, so every posting list must be sorted
        # and duplicate-free; normalising here also avoids handing the caller a
        # reference to the index's own list.
        doc_lists = [sorted(set(self.index.get(token, []))) for token in sequence]

        # A non-positive multiplier yields [], covering 0- and 1-bigram queries.
        op_sequence = ['and'] * (len(sequence) - 1)

        comparison_count = 0
        docs = doc_lists[0] if doc_lists else []
        # Left-to-right fold: intersect the running result with each next list.
        for next_docs in doc_lists[1:]:
            docs, comparisons = self.and_query(docs, next_docs)
            comparison_count += comparisons

        return {
            'docs': docs,
            'comparisons': comparison_count,
            'sequence': sequence,
            'op_sequence': op_sequence,
            # join() copes with an empty sequence, unlike indexing sequence[-1].
            'final_query': ' AND '.join(sequence)
        }

    def and_query(self, doc_list1, doc_list2):
        """Intersect two sorted lists with a two-pointer merge.

        Returns:
            ``(common_docs, comparisons)`` where ``comparisons`` is the number
            of loop iterations (one element comparison step each).
        """
        common_docs = []
        comparisons = 0
        i = 0
        j = 0
        while i < len(doc_list1) and j < len(doc_list2):
            comparisons += 1
            if doc_list1[i] == doc_list2[j]:
                common_docs.append(doc_list1[i])
                i += 1
                j += 1
            elif doc_list1[i] < doc_list2[j]:
                i += 1
            else:
                j += 1
        return common_docs, comparisons



In [10]:
# Wrap the built bigram index in a query handler.
query_handler = queryHandlerBigram(bigram_inverted_index)

In [11]:
# Example query; the returned dict is displayed as the cell output.
query_handler.query('boundary layer simple')

{'docs': ['cranfield0003'],
 'comparisons': 3,
 'sequence': ['boundary layer', 'layer simple'],
 'op_sequence': ['and'],
 'final_query': 'boundary layer AND layer simple'}

### Positional

In [27]:
class PositionalIndex:
    """Positional index: token -> {file name -> list of 1-based token positions}.

    Attributes (all ``None`` until ``build_positional_index`` or ``load`` runs):
        index: the nested mapping described above.
        list_of_file_paths: the paths that were indexed, in the order given.
        file_names: last backslash-separated component of each path, same order.
    """

    def __init__(self):
        # Start as None so save() can report an unbuilt index with its own
        # error message instead of failing with AttributeError.
        self.index = None
        self.list_of_file_paths = None
        self.file_names = None

    def build_positional_index(self, list_of_file_paths):
        """Index the first line of every file in ``list_of_file_paths``.

        Only the first line of each file is tokenized (with ``word_tokenize``);
        an empty file contributes no tokens. Positions start at 1.
        """
        index = {}
        file_names = []
        for file_path in list_of_file_paths:
            file_name = file_path.split('\\')[-1]
            file_names.append(file_name)
            # The with-block closes the file; no explicit close() is needed.
            with open(file_path, 'r') as f:
                # readline() returns '' for an empty file rather than raising.
                line = f.readline()
            for pos, token in enumerate(word_tokenize(line), start=1):
                index.setdefault(token, {}).setdefault(file_name, []).append(pos)

        self.index = index
        self.list_of_file_paths = list_of_file_paths
        self.file_names = file_names

    def save(self, file_path):
        """Pickle ``[index, list_of_file_paths, file_names]`` to ``file_path``.

        Raises:
            Exception: if the index has not been built or loaded yet.
        """
        if self.index is None or self.list_of_file_paths is None or self.file_names is None:
            raise Exception('Index is not built yet')
        with open(file_path, 'wb') as f:
            pickle.dump([self.index, self.list_of_file_paths, self.file_names], f)

    def load(self, file_path):
        """Restore the three attributes from a file written by ``save``."""
        # NOTE: pickle.load can execute arbitrary code; only load dumps you created.
        with open(file_path, 'rb') as f:
            self.index, self.list_of_file_paths, self.file_names = pickle.load(f)


In [28]:
# Build the positional index over the same files used for the bigram index.
positional_index = PositionalIndex()
positional_index.build_positional_index(file_paths)

In [29]:
# Persist the positional index with pickle; the Dumps directory must already exist.
positional_index.save(r"Dumps\positional_index.pkl")

In [31]:
class queryHandlerPositional:
    """Answers phrase queries against a positional index.

    The index is expected to have the layout built by
    ``PositionalIndex.build_positional_index``:
    ``{token: {file_name: [1-based positions]}}``.
    """

    def __init__(self, positional_index):
        """Keep references to a built index object and its three attributes."""
        self.positional_index = positional_index
        self.index = positional_index.index
        self.list_of_file_paths = positional_index.list_of_file_paths
        self.file_names = positional_index.file_names

    def _terms(self, sequence):
        """Lower-case and tokenize the query, dropping stop words and punctuation tokens."""
        # The positional index is keyed by single tokens, so the query must be
        # reduced to single terms rather than to bigram strings.
        dropped = set(string.punctuation)
        return [
            tok
            for tok in word_tokenize(sequence.lower())
            if tok not in stopword_list and tok not in dropped
        ]

    def query(self, sequence):
        """Return the files in which the query terms occur at consecutive positions.

        Args:
            sequence: raw query string.

        Returns:
            dict with keys ``docs`` (sorted matching file names), ``comparisons``
            (file-name comparisons made by ``and_query``), ``sequence`` (the
            query terms), ``op_sequence`` (one ``'and'`` per merge) and
            ``final_query`` (terms joined by ``' AND '``). An empty query
            returns empty ``docs`` and an empty ``final_query``.
        """
        terms = self._terms(sequence)
        postings = [self.index.get(term, {}) for term in terms]

        comparison_count = 0
        # matches maps file name -> positions where the phrase matched so far ends.
        matches = postings[0] if postings else {}
        for following in postings[1:]:
            matches, comparisons = self._extend_phrase(matches, following)
            comparison_count += comparisons

        return {
            'docs': sorted(matches),
            'comparisons': comparison_count,
            'sequence': terms,
            # A non-positive multiplier yields [], covering 0- and 1-term queries.
            'op_sequence': ['and'] * (len(terms) - 1),
            'final_query': ' AND '.join(terms)
        }

    def _extend_phrase(self, matches, following):
        """Extend a partial phrase match by one term.

        Keeps only files present in both mappings where some position in
        ``following`` is exactly one past a position in ``matches``.
        Returns ``(new_matches, comparisons)``.
        """
        common_docs, comparisons = self.and_query(sorted(matches), sorted(following))
        extended = {}
        for doc in common_docs:
            next_positions = set(following[doc])
            ends = [pos + 1 for pos in matches[doc] if pos + 1 in next_positions]
            if ends:
                extended[doc] = ends
        return extended, comparisons

    def and_query(self, doc_list1, doc_list2):
        """Intersect two sorted lists with a two-pointer merge.

        Returns:
            ``(common_docs, comparisons)`` where ``comparisons`` is the number
            of loop iterations (one element comparison step each).
        """
        common_docs = []
        comparisons = 0
        i = 0
        j = 0
        while i < len(doc_list1) and j < len(doc_list2):
            comparisons += 1
            if doc_list1[i] == doc_list2[j]:
                common_docs.append(doc_list1[i])
                i += 1
                j += 1
            elif doc_list1[i] < doc_list2[j]:
                i += 1
            else:
                j += 1
        return common_docs, comparisons



### Final IO

In [11]:
class mainIO:
    """Interactive console front end for bigram-index queries."""

    def __init__(self, bigram_inverted_index):
        """Wrap a built bigram index in a ``queryHandlerBigram``."""
        self.bigram_inverted_index = bigram_inverted_index
        self.query_handler = queryHandlerBigram(bigram_inverted_index)

    def run(self):
        """Prompt for a query count and the queries, then print results per query.

        All queries are read first; afterwards two lines are printed for each:
        the number of retrieved documents and their names.

        Raises:
            ValueError: if the entered number of queries is not an integer.
        """
        num_queries = int(input('Enter Number of Queries: '))
        ls_queries = [input('Enter Query: ') for _ in range(num_queries)]
        for query_number, query in enumerate(ls_queries, start=1):
            docs = self.query_handler.query(query)["docs"]
            print(f'Number of documents retrieved for query {query_number} using bigram inverted index: {len(docs)}')
            # Report the actual query number here too (it was hard-coded to 1).
            print(f'Names of documents retrieved for query {query_number} using bigram inverted index: {docs}')

In [12]:
# Start the interactive prompt: reads the query count and queries from stdin.
mainIO(bigram_inverted_index).run()

Number of documents retrieved for query 1 using bigram inverted index: 1
Names of documents retrieved for query 1 using bigram inverted index: ['cranfield0003']
