In [10]:
import numpy as np
import pandas as pd
import os
import pickle
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import ast


In [11]:
# Load the corpus table from disk: one row per document.
df = pd.read_csv('final_df.csv')

# Name the two columns: 'fname' is the document name and 'content' holds that
# document's token list serialized as a string (it is parsed back into a list
# with ast.literal_eval when the indexes are built).
df.columns = ['fname', 'content']
df.head()

Unnamed: 0,fname,content
0,cranfield1223,"['inviscid-incompressible-flow', 'theory', 'st..."
1,cranfield1011,"['free-flight', 'measurements', 'static', 'dyn..."
2,cranfield0795,"['operation', 'npl', '18in', 'x', '14in', 'win..."
3,cranfield0553,"['ablation', 'glassy', 'materials', 'around', ..."
4,cranfield0761,"['buckling', 'sandwich', 'normal', 'pressure',..."


In [12]:
#create bigram inverted index
# Maps each pair of adjacent tokens (w1, w2) to the set of document names in
# which w2 appears immediately after w1.
bigrams = dict()
# Walk both columns in lockstep instead of doing df[col][row] scalar lookups
# inside the inner loop: those lookups are slow and are label-based, so they
# only work when the frame carries the default 0..n-1 index.
for fname, content in zip(df['fname'], df['content']):
    doc = ast.literal_eval(content)  # 'content' is a stringified list of tokens
    # zip(doc, doc[1:]) yields every pair of consecutive tokens; it yields
    # nothing for documents with fewer than two tokens.
    for bigram in zip(doc, doc[1:]):
        bigrams.setdefault(bigram, set()).add(fname)


In [13]:
# Spot-check the index: names of the documents that contain the bigram ('heat', 'transfer').
bigrams[('heat', 'transfer')]

{'cranfield0012',
 'cranfield0021',
 'cranfield0022',
 'cranfield0023',
 'cranfield0024',
 'cranfield0029',
 'cranfield0036',
 'cranfield0037',
 'cranfield0045',
 'cranfield0049',
 'cranfield0054',
 'cranfield0055',
 'cranfield0061',
 'cranfield0062',
 'cranfield0072',
 'cranfield0074',
 'cranfield0077',
 'cranfield0081',
 'cranfield0082',
 'cranfield0084',
 'cranfield0088',
 'cranfield0089',
 'cranfield0098',
 'cranfield0101',
 'cranfield0102',
 'cranfield0120',
 'cranfield0123',
 'cranfield0142',
 'cranfield0144',
 'cranfield0145',
 'cranfield0240',
 'cranfield0260',
 'cranfield0267',
 'cranfield0268',
 'cranfield0269',
 'cranfield0270',
 'cranfield0274',
 'cranfield0283',
 'cranfield0294',
 'cranfield0295',
 'cranfield0303',
 'cranfield0305',
 'cranfield0306',
 'cranfield0310',
 'cranfield0314',
 'cranfield0325',
 'cranfield0329',
 'cranfield0333',
 'cranfield0339',
 'cranfield0343',
 'cranfield0344',
 'cranfield0347',
 'cranfield0348',
 'cranfield0352',
 'cranfield0353',
 'cranfiel

In [14]:
# Number of distinct bigrams (keys) in the bigram inverted index.
len(bigrams)

85921

In [15]:
# with open('bigrams.pickle', 'wb') as handle:
#     pickle.dump(bigrams, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [16]:
# with open('bigrams.pickle', 'rb') as handle:
#     bigrams = pickle.load(handle)

In [17]:
# pos_index = dict()
# for row in range(len(df)):
#     doc = ast.literal_eval(df['content'][row])
#     for doc_index in range(len(doc)):
#         if doc[doc_index] not in pos_index:   #word 'be' itself seen first time
#             pos_index[doc[doc_index]] = {df['fname'][row]:set([doc_index+1])}
#         else:                         #word 'be' present but seen in a particular 'docID' for the first time
#             try:
#                 pos_index[doc[doc_index]][df['fname'][row]].add(doc_index+1)
#             except:
#                 pos_index[doc[doc_index]][df['fname'][row]] = set([doc_index+1])

In [18]:
#create positional inverted index
# Maps each term to {document name: [positions]}, where positions are 1-based
# token offsets within that document, stored in increasing order.
pos_index = dict()
for fname, content in zip(df['fname'], df['content']):
    doc = ast.literal_eval(content)  # 'content' is a stringified list of tokens
    for position, word in enumerate(doc, start=1):
        # First setdefault: the term is seen for the first time in the corpus.
        # Second setdefault: the term is known but seen in this document for the first time.
        # Using setdefault instead of a bare `except:` keeps genuine errors visible.
        pos_index.setdefault(word, {}).setdefault(fname, []).append(position)

In [19]:
# Spot-check: for the term 'jet', each document name maps to the term's 1-based positions in that document.
pos_index['jet']

{'cranfield1223': [26],
 'cranfield0137': [16],
 'cranfield1212': [18],
 'cranfield0969': [25, 48],
 'cranfield0993': [2, 6, 24, 56, 65, 97, 117, 119],
 'cranfield0994': [17, 29],
 'cranfield0335': [47],
 'cranfield0904': [80],
 'cranfield0131': [2, 9, 23, 56, 79, 92, 97, 115, 167],
 'cranfield0961': [4, 15, 80, 90],
 'cranfield0350': [2, 10, 28, 31, 37, 43],
 'cranfield0992': [3, 19, 59, 74, 82, 114, 118, 132, 160, 167],
 'cranfield1350': [2, 13, 30, 56, 64],
 'cranfield0282': [1, 16, 34, 66, 68, 76, 108],
 'cranfield0086': [36, 50],
 'cranfield0624': [40, 55],
 'cranfield0640': [4, 10],
 'cranfield1101': [20],
 'cranfield1351': [4, 24, 90, 98, 122],
 'cranfield1374': [14],
 'cranfield0636': [27, 43],
 'cranfield0290': [19, 54],
 'cranfield0696': [88, 121, 155, 163],
 'cranfield1375': [4, 28],
 'cranfield0697': [15, 32, 50, 55, 91],
 'cranfield0176': [7, 25, 30, 36, 42, 58, 64],
 'cranfield0182': [12],
 'cranfield1265': [3, 12, 49, 56, 72, 84],
 'cranfield0171': [35],
 'cranfield0727'

In [20]:
# Number of distinct terms (keys) in the positional inverted index.
len(pos_index)

9709

In [21]:
# with open('pos_index.pickle', 'wb') as handle:
#     pickle.dump(pos_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [22]:
# with open('pos_index.pickle', 'rb') as handle:
#     pos_index = pickle.load(handle)

In [23]:
# Even with a limited vocabulary (the 9,709 unique words that key the positional index),
# a very large number of word pairs is possible, and many of those pairs actually occur in the corpus as bigrams.
# The bigram index therefore has far more keys (85,921) and is a correspondingly larger dictionary.

In [24]:
# All ASCII punctuation characters, as one string and as a list of
# single-character strings (the list is used to filter query tokens).
punctuations = string.punctuation
punc = list(punctuations)

In [25]:
# function to pre-process query
def preprocess(query):
    """Turn a raw query string into a list of index-ready tokens.

    Steps: lower-case the query, tokenize it with NLTK's ``word_tokenize``,
    drop English stop words, drop tokens that are exactly one of the
    punctuation characters in ``punc``, then strip surrounding whitespace and
    discard tokens that end up empty.

    Args:
        query: The query text typed by the user.

    Returns:
        A list of token strings in their original order. The list may be
        empty, e.g. when the query consists only of stop words.
    """
    # Build the stop-word collection once per call. The previous version
    # re-evaluated stopwords.words("english") for every token and then did a
    # linear scan of the resulting list.
    stop_words = set(stopwords.words("english"))
    tokens = word_tokenize(query.lower())
    kept = [t for t in tokens if t not in stop_words and t not in punc]
    return [t.strip() for t in kept if t.strip()]

In [26]:
def bigram_docs(query):
    """Answer a phrase query with the bigram inverted index.

    The query is split into consecutive token pairs and the document sets of
    all pairs are intersected. Because only adjacent pairs are checked, a
    result for a query longer than two tokens means "contains every bigram of
    the phrase", which can include false positives.

    Args:
        query: List of pre-processed query tokens.

    Returns:
        A tuple ``(count, names)``: the number of matching documents and
        their names sorted alphabetically. Queries with fewer than two tokens
        contain no bigram and yield ``(0, [])``.
    """
    doc_set = None
    for pair in zip(query, query[1:]):
        postings = bigrams.get(pair)
        # A bigram that never occurs in the corpus means the phrase cannot
        # occur either. Previously such a bigram was skipped, so the query
        # wrongly returned the matches of the remaining bigrams.
        if not postings:
            return 0, []
        # Copy on first use so the set stored in the index is never mutated.
        doc_set = set(postings) if doc_set is None else doc_set & postings
        if not doc_set:
            return 0, []
    if doc_set is None:  # fewer than two tokens: nothing to look up
        return 0, []
    return len(doc_set), sorted(doc_set)

In [27]:
def increment(temp_dict):  #increment position of each doc by 1
    """Shift every position in every postings list forward by one.

    After the shift, each stored position is the position at which the next
    word of a phrase would have to appear.

    Args:
        temp_dict: Mapping of document name to a list of integer positions.

    Returns:
        The same dictionary object. Its values are replaced by new lists, so
        the lists that were passed in are left unmodified.
    """
    # Re-assigning existing keys while iterating is safe: the dict's size does not change.
    for doc_name, positions in temp_dict.items():
        temp_dict[doc_name] = [position + 1 for position in positions]
    return temp_dict

In [28]:
def intersect(dict1, dict2):  #find the set of docs where the next word(dict 2) comes just after the current seen phrase(dict1)
    """Intersect two positional postings dictionaries document by document.

    Args:
        dict1: Mapping of document name to the positions expected for the
            next word of the phrase.
        dict2: Mapping of document name to the positions at which the next
            word actually occurs.

    Returns:
        A new dict with one entry per document present in both inputs; each
        value is the sorted list of positions common to both postings lists.
        The list can be empty when the document holds both inputs but at no
        matching position.
    """
    new_dict = {}
    for doc_a, positions_a in dict1.items():
        # Explicit membership test instead of a bare `except: pass`, so that
        # only "document absent from dict2" is skipped and real errors surface.
        if doc_a not in dict2:
            continue
        new_dict[doc_a] = sorted(set(positions_a).intersection(dict2[doc_a]))

    return new_dict

In [29]:
def refresh(temp1):  #remove those docs(keys in the dict) which have 0 sized postings lists - remove the docs which don't have the phrase within them in intermediate 
    """Return a new dict holding only the documents whose postings list is non-empty.

    The input dictionary is not modified; the kept values are the same list
    objects as in the input.
    """
    return {doc: postings for doc, postings in temp1.items() if len(postings) != 0}

In [42]:
def positional_docs(query):  #solve queries using positional inverted indexes
    """Answer a phrase query with the positional inverted index.

    The postings of the first word are shifted by one so that they hold the
    positions where the second word must appear. For each following word the
    running result is intersected with that word's postings, documents left
    without a match are dropped, and the surviving positions are shifted
    again for the next word.

    Args:
        query: List of pre-processed query tokens.

    Returns:
        A tuple ``(count, names)``: the number of documents containing the
        tokens as a contiguous phrase and the list of their names. An empty
        query, or a query containing a word that is not in the index, yields
        ``(0, [])``.
    """
    # An empty query (e.g. one made only of stop words) used to reach the
    # final return with `pos_dict` unbound and raise UnboundLocalError.
    if not query:
        return 0, list()

    pos_dict = {}
    for (i, word) in enumerate(query):
        if word not in pos_index:
            return 0, list()
        # Shallow copy: increment() replaces the values of the dict it is
        # given, and the index itself must stay untouched.
        temp_dict = pos_index[word].copy()
        if i == 0:
            pos_dict = increment(temp_dict)
        else:
            pos_dict = refresh(intersect(pos_dict, temp_dict))
            if not pos_dict:  # no document can match any more
                return 0, list()
            pos_dict = increment(pos_dict)

    return len(pos_dict), list(pos_dict.keys())

In [62]:
# Ask how many queries to run, then read and pre-process each one
# (one query per input line) before any retrieval happens.
n = int(input("Enter number of queries to execute: "))
query_list = [preprocess(input()) for _ in range(n)]
print(f'Input{query_list}\n')

# Answer every query with both indexes so the two result sets can be compared.
for i in range(n):
    tokens = query_list[i]

    count_files, files = bigram_docs(tokens)
    print(f'Number of documents retrieved for query {i+1} using bigram inverted index:{count_files} \nNames of documents retrieved for query {i+1} using bigram inverted index: {files}')

    count_files, files = positional_docs(tokens)
    print(f'Number of documents retrieved for query {i+1} using positional inverted index:{count_files} \nNames of documents retrieved for query {i+1} using positional inverted index: {files}')
    print()

Input[['downstream', 'roughness', 'causes'], ['including', 'gust', 'response']]

Number of documents retrieved for query 1 using bigram inverted index:1 
Names of documents retrieved for query 1 using bigram inverted index: ['cranfield0933']
Number of documents retrieved for query 1 using positional inverted index:1 
Names of documents retrieved for query 1 using positional inverted index: ['cranfield0933']

Number of documents retrieved for query 2 using bigram inverted index:1 
Names of documents retrieved for query 2 using bigram inverted index: ['cranfield0014']
Number of documents retrieved for query 2 using positional inverted index:1 
Names of documents retrieved for query 2 using positional inverted index: ['cranfield0014']

