## Import Header Files

In [1]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pickle
import os
import re
import numpy as np
import pandas as pd

## Retrieve all the folders

In [2]:
locations = [pos[0] for pos in os.walk(str(os.getcwd())+'/'+'stories'+'/')]
len_locations = len(locations[0])
locations[0] = locations[0][:len_locations-1]

print("Folder Locations = ",locations)

Folder Locations =  ['/home/akhil20107/stories', '/home/akhil20107/stories/SRE', '/home/akhil20107/stories/FARNON']


## Load all files from index files

In [4]:
dataset = []

flag = False

for i in locations:
    file = open(i + "/index.html", 'r')
    text = file.read().strip()
    file.close()

    file_name = re.findall('><A HREF="(.*)">', text)
    file_title = re.findall('<BR><TD> (.*)\n', text)

    if flag == False:
        file_name = file_name[2:]
        flag = True

    for j in range(len(file_name)):
        dataset.append((str(i) + "/" + str(file_name[j]), file_title[j]))

N = len (dataset)
print("Total number of files =",N)

Total number of files = 467


## Preprocessing Functions

In [5]:
def l_case(data):
    return np.char.lower(data)

def punctuation(data):
    symbols = "!\"#$%&()*+-./:;,<=>?@[\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    return data

def apostrophe(data):
    return np.char.replace(data, "'", "")

def stop_words(data):
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new_word = ""
    for w in words:
        if w not in stop_words and len(w) > 1:
            new_word = new_word + " " + w
    return np.char.strip(new_word)

def lemming(data):
    wordnet_lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(str(data))
    new_word = ""
    for w in tokens:
        new_word = new_word + " " + wordnet_lemmatizer.lemmatize(w)
    return np.char.strip(new_word)


In [6]:
def preprocess(text):
    text = l_case(text)
    text = punctuation(text)
    text = apostrophe(text)
    text = stop_words(text)
    text = lemming(text)
    return text


## Make Postings

In [7]:
doc = 0
postings = pd.DataFrame()
for i in dataset:
    file = open(i[0], 'r', encoding='utf-8', errors='ignore')
    text = file.read().strip()
    file.close()
    preprocessed_text = preprocess(text)
    tokens = word_tokenize(str(preprocessed_text))
    for token in tokens:
        if token in postings:
            posting = postings[token][0]
            posting.add(doc)
            postings[token][0] = posting
        else:
            postings.insert(value=[{doc}], loc=0, column=token)
    doc += 1

In [8]:
# Check Postings
postings

Unnamed: 0,retook,superseded,powernodes,ardonite,vortexer,vortexers,optimal,defacing,periis,perii,...,53,west,100,continue,support,need,freeware,project,trial,shareware
0,{466},{466},{466},{466},{466},{466},{466},{466},{466},{466},...,"{0, 196, 356, 169, 170, 44, 76, 206, 435, 310,...","{0, 128, 129, 134, 390, 8, 13, 14, 142, 270, 4...","{0, 386, 387, 136, 264, 394, 41, 42, 170, 47, ...","{0, 6, 11, 16, 26, 39, 40, 47, 48, 50, 58, 63,...","{0, 7, 21, 27, 40, 42, 47, 50, 58, 59, 60, 65,...","{0, 3, 5, 7, 8, 11, 13, 14, 16, 18, 20, 21, 22...",{0},"{0, 2, 5, 142, 144, 402, 404, 407, 283, 412, 4...","{0, 131, 388, 142, 273, 18, 150, 407, 26, 287,...","{0, 42, 76, 78, 126}"


In [39]:
# Save Postings
postings.to_pickle("save_postings")

In [9]:
#Load Postings
postings = pd.read_pickle("save_postings")

## View Document Text

In [10]:
def view_doc(id):
    print(dataset[id])
    file = open(dataset[id][0], 'r', encoding='utf-8')
    text = file.read().strip()
    file.close()
    print(text)

## Print Postings of word

In [11]:
def print_postings(word):
    preprocessed_word = str(preprocess(word))
    print(preprocessed_word)
    print("Frequency:",len(postings[preprocessed_word][0]))
    print("Postings List:",postings[preprocessed_word][0])

## Generate query and command words

In [12]:
def generate_query_command_words(query):
    query = query.lower()
    tokens = word_tokenize(query)
    commands = []
    query_words = []

    for t in tokens:
        if t not in ['and', 'or', 'not']:
            processed_word = preprocess(t)
            query_words.append(str(processed_word))
        else:
            commands.append(t)
    return commands, query_words

## Handle NOT

In [13]:
def get_difference(word):
    a = postings[word][0]
    b = set(range(len(dataset)))
    return b.difference(a)

In [14]:
def gen_not_set(query_words, commands):
    not_tup = []
    while 'not' in commands:
        i = commands.index('not')
        word = query_words[i]
        word_postings = get_difference(word)
        not_tup.append(word_postings)
        commands.pop(i)
        query_words[i] = i
        print("\nAfter Processing NOT:",commands, query_words)
    return not_tup

## Count number of comparisons

In [15]:
def count_comparisons(a_list, b_list):
    k, l = 0, 0
    intersect = []
    global count        
    while k < len(a_list) and l < len(b_list):
        count = count+1
        if a_list[k] == b_list[l]:
            k += 1
            l += 1
        elif a_list[k] < b_list[l]:
            k += 1
        else:
            l += 1 
    return count

## Handle Binary operations

In [16]:
def binary_operations(query_words, commands, tup):
    a = postings[query_words[0]][0]
    global comparisons
    query_words.pop(0)
    for i in range(len(commands)):
        if type(query_words[i])==int:
            b = tup.pop(0)
            bList = sorted(b)
        else:
            b = postings[query_words[i]][0]
            bList = sorted(b)
        if commands[i] == 'and':
            a1 = a
            a = a.intersection(b)
            if(len(a) == 0):
                aList = sorted(a1)
            else:
                aList = sorted(a)
            comparisons = count_comparisons(aList, bList)        
        elif commands[i] == 'or':
            a1 = a
            a = a.union(b)
            if(len(a) == 0):
                aList = sorted(a1)
            else:
                aList = sorted(a)
            comparisons = count_comparisons(aList, bList)  
        else:
            print("Invalid Command")
       
    return a

In [17]:
# Run query

def run_query(query):
    flag2=0
    commands, query_words = generate_query_command_words(query)
    for i in query_words:
        if i not in postings.columns:
            flag2=1
            return query,flag2
    tup = gen_not_set(query_words, commands)
    
    print("\nQuery Words:",query_words)
    print("\nCommands:",commands)
    
    final_set = binary_operations(query_words, commands, tup)
    
    return final_set,flag2

## Take queries in input format

In [18]:
final_Queries = []
flag1=0
N = int(input('Enter the Number of Queries:'))
for i in range(N):
    if flag1==1:
            break
    query = preprocess(input('Enter Input Query')).tolist().split()
    print(query)
    query = query
    sequence = input('Enter input Sequence with comma in between operators')
    sequence = sequence.split(',')
    if len(sequence)!=(len(query)-1):
        print("Wrong Input")
        flag1=1
        break
    else:
        for i in range(len(sequence)):
            sequence[i] = sequence[i].upper()
            if sequence[i]==" " or sequence[i]=="":
                print("Wrong Input")
                flag1=1
                break
            if sequence[i]=="[ OR NOT" or sequence[i]=="[OR NOT" or sequence[i]==" OR NOT ]" or sequence[i]=="OR NOT]" or sequence[i]=="[OR NOT]" or sequence[i]=="[ OR NOT ]":
                sequence[i]="OR NOT"
            elif sequence[i]=="[ AND NOT" or sequence[i]=="[AND NOT" or sequence[i]==" AND NOT ]" or sequence[i]=="AND NOT]" or sequence[i]=="[AND NOT]" or sequence[i]=="[ AND NOT ]":
                sequence[i]="AND NOT"
            elif sequence[i]=="[ OR" or sequence[i]==" OR ]" or sequence[i]=="[OR" or sequence[i]=="OR]" or sequence[i]=="[OR]" or sequence[i]=="[ OR ]":
                sequence[i]="OR"
            elif sequence[i]=="[ AND" or sequence[i]==" AND ]" or sequence[i]=="[AND" or sequence[i]=="AND]"or sequence[i]=="[AND]" or sequence[i]=="[ AND ]":
                sequence[i]="AND"
            elif sequence[i]==" NOT ]" or sequence[i]=="NOT]":
                sequence[i]="NOT"
            
        final_query =''
        i = 0
        j= 0
        while(i != len(query)):
            final_query += query[i]+' '
            if (i == len(query)-1):
                break
            final_query += sequence[j]+' '
            i +=1
            j +=1
        final_Queries.append(final_query)
        print('Final Query is {}'.format(final_query))

Enter the Number of Queries: 2
Enter Input Query lion stood thoughtfully for a moment


['lion', 'stood', 'thoughtfully', 'moment']


Enter input Sequence with comma in between operators [ OR, OR , OR ]


Final Query is lion OR stood  OR  thoughtfully OR moment 


Enter Input Query telephone,paved, roads


['telephone', 'paved', 'road']


Enter input Sequence with comma in between operators [ OR NOT, AND NOT ]


Final Query is telephone OR NOT paved AND NOT road 


## Retrieve Documents

In [19]:
for i in range(len(final_Queries)):
    queryString = final_Queries[i]
    count = comparisons = 0
    lists,flag = run_query(queryString)
    if(flag==1):
        print("Word is not present in any document")
        break
    print("\nNumber of documents matched for query",i+1, "are:", len(lists))
    print("\nNumber of comparisons done for query",i+1, "are:", comparisons)
    if len(lists)==0:
        print("\nNo documents in Final Set for query",i)
    else:
        print("\nRetrieved Document Ids for query",i+1, "are:",sorted(lists))
   
    docList = sorted(lists)
    docName = []
    for j in range(len(docList)):
        index = docList[j]
        head, tail = os.path.split(dataset[index][0])
        docName.append(tail)
    print("\nList of Retrieved Documents for query",i+1, "is: ")
    print(*docName, sep='\n')
    print("\n-----------------------------------------------\n-----------------------------------------------\n")


Query Words: ['lion', 'stood', 'thoughtfully', 'moment']

Commands: ['or', 'or', 'or']

Number of documents matched for query 1 are: 270

Number of comparisons done for query 1 are: 670

Retrieved Document Ids for query 1 are: [1, 2, 3, 5, 6, 7, 8, 9, 11, 13, 14, 16, 18, 19, 20, 23, 24, 26, 27, 29, 30, 31, 34, 36, 37, 39, 40, 41, 42, 48, 49, 50, 53, 54, 56, 58, 59, 60, 61, 63, 64, 65, 66, 68, 69, 70, 71, 72, 73, 75, 76, 77, 78, 79, 80, 82, 83, 85, 87, 90, 91, 92, 93, 94, 95, 97, 100, 107, 110, 112, 113, 116, 118, 119, 120, 123, 124, 126, 128, 129, 130, 131, 133, 134, 135, 140, 141, 142, 143, 144, 145, 147, 148, 150, 151, 153, 154, 155, 158, 159, 162, 163, 169, 170, 171, 172, 173, 174, 175, 176, 179, 180, 182, 183, 184, 186, 187, 189, 190, 191, 192, 193, 196, 200, 202, 205, 206, 207, 208, 209, 210, 211, 213, 214, 217, 218, 219, 221, 223, 224, 225, 226, 228, 229, 230, 232, 233, 234, 236, 238, 239, 240, 241, 242, 244, 245, 246, 247, 249, 252, 256, 257, 259, 260, 261, 263, 265, 266, 267, 

In [37]:
print_postings('roads')

road
Frequency: 121
Postings List: {0, 1, 3, 14, 15, 16, 20, 21, 26, 27, 33, 34, 37, 39, 41, 44, 49, 50, 56, 61, 66, 74, 76, 77, 78, 79, 87, 92, 97, 101, 103, 106, 109, 116, 118, 119, 123, 126, 129, 130, 133, 135, 141, 143, 149, 150, 153, 154, 155, 162, 163, 169, 173, 174, 183, 189, 193, 207, 209, 214, 221, 223, 224, 225, 226, 230, 236, 241, 244, 260, 265, 269, 270, 277, 279, 280, 284, 290, 291, 295, 298, 300, 301, 303, 304, 310, 315, 320, 332, 337, 346, 359, 361, 362, 366, 367, 369, 373, 375, 378, 379, 380, 383, 386, 387, 388, 392, 393, 395, 407, 425, 428, 431, 433, 434, 435, 441, 447, 450, 451, 457}


In [54]:
view_doc(0)

('/home/akhil20107/stories/100west.txt', 'Going 100 West by 53 North by Jim Prentice (1990)')
THIS IS A SHAREWARE TRIAL PROJECT
                                 
         IT IS NOT "FREEWARE" WE NEED YOUR SUPPORT TO CONTINUE




                              100 WEST BY 53 NORTH

                                    by

                               Jim Prentice

          Copyright 1990, Jim Prentice, Brandon, Manitoba, CANADA





          North of 53. A magic phrase. Spoken, mumbled or thought
     inwardly by thousands of souls venturing northward. An
     imaginary line, shown only on maps and labelled 53 degrees.
     It's presence indicated to highway travellers by road side
     signs.
          A division of territory as distinct in the mind as any
     international border.
          If you have not been "North of 53", you have not been
     north!
          Travellers and writers, poets and pilots, have
     contributed to the lore of the north. The rigors of life in
     t

In [131]:
postings1 = pd.DataFrame()
file = open(dataset[0][0], 'r', encoding='cp1250', errors='ignore')
text = file.read().strip()
file.close()
preprocessed_text = preprocess(text)
print(preprocessed_text)
tokens = word_tokenize(str(preprocessed_text))
print("\n",tokens)
for token in tokens:
    if token in postings1:
        p = postings1[token][0]
        p+=1
        postings1[token][0] = p
    else:
        postings1.insert(value=[1], loc=0, column=token)



