In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.tokenize import word_tokenize
import os
import pickle
import string
from nltk.corpus import stopwords

### Bigram

In [2]:
class BigramInvertedIndex:
    def __init__(self):
        pass

    def build_inverted_index(self, list_of_file_paths):
        index = {}
        for file_path in list_of_file_paths:
            with open(file_path, 'r') as f:
                line = f.readlines()[0]
                tokens = word_tokenize(line)
                for i in range(len(tokens) - 1):
                    bigram = tokens[i] + ' ' + tokens[i + 1]
                    if bigram not in index:
                        index[bigram] = []
                    if file_path not in index[bigram]:
                        index[bigram].append(file_path.split('\\')[-1])

        self.index = index
        self.list_of_file_paths = list_of_file_paths
        self.file_names = [file_path.split('\\')[-1] for file_path in list_of_file_paths]

    def save(self, file_path):
        if self.index is None or self.list_of_file_paths is None or self.file_names is None:
            raise Exception('Index is not built yet')
        with open(file_path, 'wb') as f:
            pickle.dump([self.index, self.list_of_file_paths, self.file_names], f)

    def load(self, file_path):
        with open(file_path, 'rb') as f:
            self.index, self.list_of_file_paths, self.file_names = pickle.load(f)


In [3]:
files = os.listdir('..\data\preprocessed_data')
file_paths = []
for file in files:
    file_paths.append('..\data\preprocessed_data\\' + file)

In [4]:
bigram_inverted_index = BigramInvertedIndex()

In [5]:
bigram_inverted_index.build_inverted_index(file_paths)

In [6]:
bigram_inverted_index.save(r"Dumps\bigram_inverted_index.pkl")

In [7]:
stopword_list = stopwords.words('english')

def preproc(query):
    query = query.lower()
    tokenized_text = word_tokenize(query)
    filtered_text = [word for word in tokenized_text if word not in stopword_list]
    temp = []
    for word in filtered_text:
        temp_word = word
        for punc in string.punctuation:
            temp_word = temp_word.replace(punc, '')
        temp.append(temp_word)
    filtered_text = temp
    filtered_text = ' '.join(filtered_text).split()
    bigram_toks = []
    for i in range(len(filtered_text) - 1):
        bigram_toks.append(filtered_text[i] + ' ' + filtered_text[i + 1])
    return bigram_toks

In [8]:
class queryHandlerBigram:
    def __init__(self, bigram_inverted_index):
        self.bigram_inverted_index = bigram_inverted_index
        self.index = bigram_inverted_index.index
        self.list_of_file_paths = bigram_inverted_index.list_of_file_paths
        self.file_names = bigram_inverted_index.file_names

    def query(self, sequence):
        sequence = preproc(sequence)
        doc_lists = []
        for token in sequence:
            if token in self.index:
                doc_lists.append(self.index[token])
            else:
                doc_lists.append([])

        # Perform ops in order of precedence
        comparison_count = 0

        op_sequence = ['and'] * (len(sequence) - 1)
        op_sequence_copy = op_sequence.copy()
        while 'and' in op_sequence:
            i = op_sequence.index('and')
            doc_lists[i], comparisons = self.and_query(doc_lists[i], doc_lists[i + 1])
            comparison_count += comparisons
            del doc_lists[i + 1]
            del op_sequence[i]
        
        if len(doc_lists) == 0:
            doc_lists = []
        else:
            doc_lists = doc_lists[0]

        final_query = ""

        for token, op in zip(sequence, op_sequence_copy):
            final_query += f'{token} {op.upper()} '
        final_query += sequence[-1]
        
        return {
            'docs': doc_lists,
            'comparisons': comparison_count,
            'sequence': sequence,
            'op_sequence': op_sequence_copy,
            'final_query': final_query
        }


    def and_query(self, doc_list1, doc_list2):
        common_docs = []
        comparisons = 0
        i = 0
        j = 0
        while i < len(doc_list1) and j < len(doc_list2):
            comparisons += 1
            if doc_list1[i] == doc_list2[j]:
                common_docs.append(doc_list1[i])
                i += 1
                j += 1
            elif doc_list1[i] < doc_list2[j]:
                i += 1
            else:
                j += 1
        return common_docs, comparisons



In [9]:
query_handler = queryHandlerBigram(bigram_inverted_index)

In [10]:
query_handler.query('boundary layer simple')

{'docs': ['cranfield0003'],
 'comparisons': 3,
 'sequence': ['boundary layer', 'layer simple'],
 'op_sequence': ['and'],
 'final_query': 'boundary layer AND layer simple'}

### Positional

In [11]:
class PositionalIndex:
    def __init__(self):
        pass

    def build_positional_index(self, list_of_file_paths):
        index = {}
        for file_path in list_of_file_paths:
            file_name = file_path.split('\\')[-1]
            with open(file_path, 'r') as f:
                line = f.readlines()[0]
                tokens = word_tokenize(line)
                for pos, token in enumerate(tokens):
                    if token not in index:
                        index[token] = {}
                    if file_name not in index[token]:
                        index[token][file_name] = []
                    index[token][file_name].append(pos+1)
            f.close()

        self.index = index
        self.list_of_file_paths = list_of_file_paths
        self.file_names = [file_path.split('\\')[-1] for file_path in list_of_file_paths]

    def save(self, file_path):
        if self.index is None or self.list_of_file_paths is None or self.file_names is None:
            raise Exception('Index is not built yet')
        with open(file_path, 'wb') as f:
            pickle.dump([self.index, self.list_of_file_paths, self.file_names], f)

    def load(self, file_path):
        with open(file_path, 'rb') as f:
            self.index, self.list_of_file_paths, self.file_names = pickle.load(f)


In [12]:
positional_index = PositionalIndex()
positional_index.build_positional_index(file_paths)

In [13]:
positional_index.save(r"Dumps\positional_index.pkl")

In [14]:
stopword_list = stopwords.words('english')

def preproc_positional(query):
    query = query.lower()
    tokenized_text = word_tokenize(query)
    filtered_text = [word for word in tokenized_text if word not in stopword_list]
    temp = []
    for word in filtered_text:
        temp_word = word
        for punc in string.punctuation:
            temp_word = temp_word.replace(punc, '')
        temp.append(temp_word)
    filtered_text = temp
    filtered_text = ' '.join(filtered_text).split()
    return filtered_text

In [15]:
class queryHandlerPositional:
    def __init__(self, positional_index):
        self.positional_index = positional_index
        self.index = positional_index.index
        self.list_of_file_paths = positional_index.list_of_file_paths
        self.file_names = positional_index.file_names

    def query(self, sequence):
        sequence = preproc_positional(sequence)
        doc_lists = []
        doc_indices = []
        for token in sequence:
            if token in self.index:
                doc_lists.append(list(self.index[token].keys()))
                doc_indices.append(self.index[token])
            else:
                doc_lists.append([])
                doc_indices.append({})

        # Perform ops in order of precedence
        comparison_count = 0

        op_sequence = ['and'] * (len(sequence) - 1)
        op_sequence_copy = op_sequence.copy()
        while 'and' in op_sequence:
            i = op_sequence.index('and')
            doc_indices[i], comparisons = self.and_query(doc_indices[i], doc_indices[i+1])
            comparison_count += comparisons
            del doc_indices[i + 1]
            del op_sequence[i]
        
        if len(doc_indices) == 0:
            doc_indices = []
        else:
            doc_indices = list(doc_indices[0].keys())

        final_query = ""

        for token, op in zip(sequence, op_sequence_copy):
            final_query += f'{token} {op.upper()} '
        final_query += sequence[-1]
        
        return {
            'docs': doc_indices,
            'comparisons': comparison_count,
            'sequence': sequence,
            'op_sequence': op_sequence_copy,
            'final_query': final_query
        }

    def and_query(self, doc_pos_mapping1, doc_pos_mapping2):
        comparisons = 0
        final_doc_index = {}
        final_doc_list = []
        doc_list1, doc_list2 = list(doc_pos_mapping1.keys()), list(doc_pos_mapping2.keys())
        i, j = 0, 0
        while i < len(doc_list1) and j < len(doc_list2):
            if doc_list1[i] == doc_list2[j]:
                final_doc_list.append(doc_list1[i])
                i+=1
                j+=1
            elif doc_list1[i] < doc_list2[j]:
                i+=1
            else:
                j+=1
            comparisons+=1
        
        for each_doc in final_doc_list:
            i, j = 0, 0
            while i < len(doc_pos_mapping1[each_doc]) and j < len(doc_pos_mapping2[each_doc]):
                if doc_pos_mapping1[each_doc][i] - doc_pos_mapping2[each_doc][j] == -1:
                    if each_doc not in final_doc_index:
                        final_doc_index[each_doc] = []
                    final_doc_index[each_doc].append(doc_pos_mapping2[each_doc][j])
                    i+=1
                    j+=1
                elif doc_pos_mapping1[each_doc][i] - doc_pos_mapping2[each_doc][j] > -1:
                    j+=1
                else:
                    i+=1
                comparisons+=1
        
        return final_doc_index, comparisons


In [16]:
query_handler = queryHandlerPositional(positional_index)

In [17]:
query_handler.query('buckling')

{'docs': ['cranfield0031',
  'cranfield0400',
  'cranfield0412',
  'cranfield0419',
  'cranfield0642',
  'cranfield0658',
  'cranfield0739',
  'cranfield0740',
  'cranfield0741',
  'cranfield0743',
  'cranfield0744',
  'cranfield0760',
  'cranfield0761',
  'cranfield0763',
  'cranfield0765',
  'cranfield0766',
  'cranfield0769',
  'cranfield0820',
  'cranfield0822',
  'cranfield0823',
  'cranfield0824',
  'cranfield0825',
  'cranfield0826',
  'cranfield0827',
  'cranfield0830',
  'cranfield0831',
  'cranfield0833',
  'cranfield0838',
  'cranfield0839',
  'cranfield0843',
  'cranfield0856',
  'cranfield0858',
  'cranfield0859',
  'cranfield0885',
  'cranfield0886',
  'cranfield0887',
  'cranfield0888',
  'cranfield0889',
  'cranfield0890',
  'cranfield0891',
  'cranfield0897',
  'cranfield0898',
  'cranfield0915',
  'cranfield0926',
  'cranfield0928',
  'cranfield0929',
  'cranfield0932',
  'cranfield0935',
  'cranfield0936',
  'cranfield0937',
  'cranfield0948',
  'cranfield0950',
  'c

### Final IO

In [18]:
class mainIO:
    def __init__(self, bigram_inverted_index, positional_index):
        self.bigram_inverted_index = bigram_inverted_index
        self.bigram_query_handler = queryHandlerBigram(bigram_inverted_index)
        
        self.positional_index = positional_index
        self.positional_query_handler = queryHandlerPositional(positional_index)

    def run(self):
        num_queries = int(input('Enter Number of Queries: '))
        ls_queries = []
        for i in range(num_queries):
            query = input('Enter Query: ')
            ls_queries.append(query)
        for i, query in enumerate(ls_queries):
            bigram_query_response = self.bigram_query_handler.query(query)
            positional_query_response = self.positional_query_handler.query(query)
            print(f'Number of documents retrieved for query {i+1} using bigram inverted index: {len(bigram_query_response["docs"])}')
            print(f'Names of documents retrieved for query 1 using bigram inverted index: {bigram_query_response["docs"]}')
            print(f'Number of documents retrieved for query {i+1} using positional index: {len(positional_query_response["docs"])}')
            print(f'Names of documents retrieved for query 1 using positional index: {positional_query_response["docs"]}')

In [19]:
mainIO(bigram_inverted_index, positional_index).run()

Number of documents retrieved for query 1 using bigram inverted index: 2
Names of documents retrieved for query 1 using bigram inverted index: ['cranfield1400', 'cranfield1400']
Number of documents retrieved for query 1 using positional index: 1
Names of documents retrieved for query 1 using positional index: ['cranfield1400']
