# Boolean Retrieval Model
## All imports required

In [1]:
import re
from collections import defaultdict
from nltk.stem.porter import *
from PyQt5 import QtWidgets
from PyQt5.QtWidgets import QApplication,QMainWindow
import sys
import json

# Preprocessing
Run the two cells below to do all the preprocessing: creating the stopword list, lowercasing all words, creating the inverted index,
and creating the positional index.

In [2]:
# Shared Porter stemmer instance (from the nltk library); the tokenising and
# query functions in this file call stemmer.stem() on every token.
stemmer = PorterStemmer()

def create_stoplist():
    """Load the stopword list from ``Stopword-list.txt``.

    The file is read from the current working directory and is split on
    newlines, one stopword per line.  Every space is removed from the text
    before splitting, and empty entries (blank lines, trailing newline) are
    dropped.

    Returns:
        list[str]: The stopwords, in file order.

    Raises:
        OSError: If ``Stopword-list.txt`` cannot be opened.
    """
    with open("Stopword-list.txt", "r") as stop_list_f:
        stop_list = stop_list_f.read()
    # str.replace returns a new string, so the result has to be kept;
    # otherwise an entry such as "the " keeps its trailing space and can
    # never equal a token.
    stop_list = stop_list.replace(" ", "")
    return [entry for entry in stop_list.split("\n") if entry]


def create_wordlist(docID):
    """Return the stemmed, stopword-free tokens of document ``<docID>.txt``.

    The document is lower-cased, newlines become spaces, the punctuation
    characters listed in the pattern below are deleted, and the text is split
    on single spaces.  Tokens found in the module-level ``stopword_list`` are
    dropped and the remaining tokens are stemmed with the module-level
    ``stemmer``.

    Args:
        docID: Identifier of the document; the file read is
            ``str(docID) + ".txt"``, decoded as UTF-8.

    Returns:
        list[str]: Stemmed tokens in document order.

    Raises:
        OSError: If the document file cannot be opened.
    """
    # The context manager closes the file even if reading fails.
    with open(str(docID) + ".txt", "r", encoding="utf-8") as f:
        f_read = f.read()
    f_read = f_read.lower()
    f_read = f_read.replace("\n", " ")
    f_read = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,“”’———‘]", "", f_read)
    # Splitting on " " yields empty strings for repeated spaces; drop them.
    f_list = [x for x in f_read.split(" ") if x]
    # Stopwords are matched against the raw (unstemmed) token.
    stopwords = set(stopword_list)
    return [stemmer.stem(x) for x in f_list if x not in stopwords]

def create_original_word_list(docID):
    """Return every stemmed token of document ``<docID>.txt``, stopwords included.

    The text is lower-cased, newlines become spaces, the punctuation
    characters listed in the pattern below are deleted, and the text is split
    on single spaces.  No token is filtered out, so the index of a token in
    the returned list is its position in the document.

    Args:
        docID: Identifier of the document; the file read is
            ``str(docID) + ".txt"``, decoded as UTF-8.

    Returns:
        list[str]: Stemmed tokens in document order.

    Raises:
        OSError: If the document file cannot be opened.
    """
    # The context manager closes the file even if reading fails.
    with open(str(docID) + ".txt", "r", encoding="utf-8") as f:
        f_read = f.read()
    f_read = f_read.lower()
    f_read = f_read.replace("\n", " ")
    f_read = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,“”’———‘]", "", f_read)
    # Splitting on " " yields empty strings for repeated spaces; drop them.
    return [stemmer.stem(x) for x in f_read.split(" ") if x]
    

def create_inverted_index(words):
    """Add the terms of the current document to the inverted index.

    Works on the module-level ``inverted_index`` mapping and reads the
    module-level ``docID`` as the document being indexed.  Each entry has the
    form ``[document_count, [docID, ...]]``; a document is recorded at most
    once per term.

    Args:
        words: Terms of the current document.

    Returns:
        dict: A new plain dict holding the entries of ``inverted_index``
        ordered by term.
    """
    for term in words:
        if term not in inverted_index:
            # First time this term is seen in any document.
            inverted_index[term] = [1, [docID]]
            continue
        entry = inverted_index[term]
        if docID not in entry[1]:
            entry[1].append(docID)
            entry[0] = entry[0] + 1
    return {term: inverted_index[term] for term in sorted(inverted_index)}


def create_positional_index(words, stopword_list):
    """Add the terms of the current document to the positional index.

    Works on the module-level ``positional_index`` mapping and reads the
    module-level ``docID`` as the document being indexed.  Each entry has the
    form ``[document_count, {docID: [position, ...]}]``.

    Positions are indexes into ``words``, so tokens that are skipped as
    stopwords still advance the position counter.

    Args:
        words: All tokens of the current document, in document order.
        stopword_list: Tokens that must not be indexed.  Tokens are compared
            exactly as given.

    Returns:
        dict: A new dict holding the entries of ``positional_index`` ordered
        by term.
    """
    stopwords = set(stopword_list)
    for pos, term in enumerate(words):
        # Test the individual term: testing the whole ``words`` list against
        # the stopword list is always true and filters nothing.
        if term in stopwords:
            continue
        if term not in positional_index:
            positional_index[term] = [1, {docID: [pos]}]
            continue
        entry = positional_index[term]
        if docID in entry[1]:
            entry[1][docID].append(pos)
        else:
            # First occurrence of the term in this document.
            entry[0] = entry[0] + 1
            entry[1][docID] = [pos]
    return dict(sorted(positional_index.items(), key=lambda kv: kv[0]))

In [3]:
# Run every preprocessing step over the whole corpus.
# The resulting indexes are written to inverted_index.json and
# positional_index.json, where they can be inspected.

inverted_index = defaultdict(list)
positional_index = {}
words = []
stopword_list = create_stoplist()
# docID has to be a module-level name: create_inverted_index and
# create_positional_index read it as a global.
for docID in range(1, 51):  # documents are the files 1.txt .. 50.txt
    words = create_wordlist(docID)
    inverted_index = create_inverted_index(words)
    original_words = create_original_word_list(docID)
    positional_index = create_positional_index(original_words, stopword_list)

# Dump both indexes as JSON; the context managers close the files even if
# serialisation fails.
with open("inverted_index.json", "w") as inverted_file:
    json.dump(inverted_index, inverted_file, indent=6)
with open("positional_index.json", "w") as positional_file:
    json.dump(positional_index, positional_file, indent=6)

# query processing

In [4]:
def solve_query(word, postings, query):
    """Evaluate a flat AND/OR query strictly from left to right.

    ``query`` is expected to alternate operands and operators
    (``term op term op term ...``), so the operators are the tokens at the
    odd positions.  The n-th operator combines the running result with
    ``postings[n + 1]``.  There is no operator precedence.

    Args:
        word: The operand terms of the query.  Kept for interface
            compatibility; the evaluation only needs ``postings``.
        postings: One postings list per operand, in query order.
        query: The tokenised query with any ``not`` already resolved.

    Returns:
        list: The combined postings.  With a single operand this is
        ``postings[0]`` itself; with no operands it is an empty list.
    """
    if not postings:
        return []
    result = postings[0]
    # zip stops at the shorter sequence, so a dangling trailing operator
    # (no operand after it) is ignored instead of raising IndexError.
    for operator, operand in zip(query[1::2], postings[1:]):
        if operator == 'and':
            result = intersection(result, operand)
        else:
            # Any token other than 'and' in operator position acts as OR.
            result = union(result, operand)
    return result

In [5]:
def simple_query(query):
    """Look up the postings list of a single term.

    The inverted index is loaded from ``inverted_index.json`` in the current
    working directory on every call.

    Args:
        query: The term to look up, in the same form as the index keys.

    Returns:
        list: The document IDs stored for the term, or ``[0]`` when the term
        is not in the index.

    Raises:
        OSError: If ``inverted_index.json`` cannot be opened.
    """
    # The context manager closes the file once the index is loaded.
    with open('inverted_index.json') as inverted_file:
        inverted_index = json.load(inverted_file)
    if query in inverted_index:
        # Entries are [document_count, [docID, ...]].
        return inverted_index[query][1]
    return [0]

def get_postings(word):
    """Return the postings list of every term in ``word``.

    The inverted index is loaded from ``inverted_index.json`` in the current
    working directory on every call.

    Args:
        word: Sequence of terms, in the same form as the index keys.

    Returns:
        list[list]: One list of document IDs per term, in input order.  A
        term that is not in the index contributes ``[0]``.

    Raises:
        OSError: If ``inverted_index.json`` cannot be opened.
    """
    # The context manager closes the file once the index is loaded.
    with open('inverted_index.json') as inverted_file:
        inverted_index = json.load(inverted_file)
    # Entries are [document_count, [docID, ...]].
    return [
        inverted_index[term][1] if term in inverted_index else [0]
        for term in word
    ]

def get_positional_postings(word):
    """Return the positional-index entry of every term in ``word``.

    The positional index is loaded from ``positional_index.json`` in the
    current working directory on every call.

    Args:
        word: Sequence of terms, in the same form as the index keys.

    Returns:
        list: One entry per term, in input order.  A term found in the index
        contributes its full stored entry; a term that is not in the index
        contributes ``[0]``.

    Raises:
        OSError: If ``positional_index.json`` cannot be opened.
    """
    # The context manager closes the file once the index is loaded.
    with open('positional_index.json') as positional_file:
        positional_index = json.load(positional_file)
    return [
        positional_index[term] if term in positional_index else [0]
        for term in word
    ]

def intersection(word1, word2):
    """Return the AND of two postings lists.

    Args:
        word1: First postings list; it determines the order of the result.
        word2: Second postings list.

    Returns:
        list: A new list with every element of ``word1`` that also occurs in
        ``word2``.
    """
    common = []
    for doc in word1:
        if doc in word2:
            common.append(doc)
    return common

def union(word1, word2):
    """Return the OR of two postings lists.

    Neither argument is modified.

    Args:
        word1: First postings list.
        word2: Second postings list.

    Returns:
        list: A new sorted list with every element of ``word1`` plus the
        elements of ``word2`` that are not already present.
    """
    # Work on a copy: appending to ``word1`` itself would silently change the
    # caller's postings list.
    merged = list(word1)
    seen = set(merged)
    for doc in word2:
        if doc not in seen:
            seen.add(doc)
            merged.append(doc)
    merged.sort()
    return merged
        
def complex_query(query):
    """Evaluate a tokenised boolean query made of terms, and, or and not.

    Each ``not`` is resolved first by replacing the postings of the term that
    follows it with its complement over documents 1..50.  The remaining
    AND/OR expression is then handed to ``solve_query``.

    Args:
        query: List of query tokens.  The list is not modified.

    Returns:
        list: Matching document IDs; an empty list when the query contains
        no terms at all.
    """
    all_documents = list(range(1, 51))
    boolean_word = ['and', 'or', 'not']
    word = [x for x in query if x not in boolean_word]
    if not word:
        return []
    postings = get_postings(word)
    if "not" not in query:
        return solve_query(word, postings, query)

    # Walk the query once, tracking which operand each token corresponds to.
    # Negating by operand position (instead of searching ``postings`` for an
    # equal list) keeps the right term negated even when two terms share the
    # same postings.
    stripped_query = []
    term_position = 0
    negate_next = False
    for token in query:
        if token == 'not':
            # Two consecutive 'not' tokens cancel each other out.
            negate_next = not negate_next
            continue
        stripped_query.append(token)
        if token in boolean_word:
            continue
        if negate_next:
            # An unknown term has postings [0], so its complement is every
            # document.
            excluded = set(postings[term_position])
            postings[term_position] = [
                doc for doc in all_documents if doc not in excluded
            ]
            negate_next = False
        term_position += 1

    if len(stripped_query) == 1:
        return postings[0]
    return solve_query(word, postings, stripped_query)

def proximity_query(query):
    """Find the documents in which two terms occur close to each other.

    The first two purely alphabetic tokens of ``query`` are the terms and
    ``query[2]`` is the distance ``k``.  A document matches when it contains
    a position ``p`` of the first term and a position ``q`` of the second
    term with ``abs(p - q) <= k + 1``, in either order.

    Args:
        query: List of query tokens, e.g. ``["term1", "term2", "3"]``.

    Returns:
        list: Keys of the matching documents exactly as stored in the
        positional index loaded by ``get_positional_postings``, in index
        order.  ``[0]`` when fewer than two terms are given or either term
        is missing from the index.

    Raises:
        IndexError: If ``query`` has fewer than three tokens.
        ValueError: If ``query[2]`` is not an integer.
    """
    word_apart = int(query[2]) + 1
    word = [item for item in query if item.isalpha()]
    if len(word) < 2:
        return [0]
    postings = get_positional_postings(word)
    # A term missing from the index is reported as [0], which carries no
    # per-document position map.
    if len(postings[0]) < 2 or len(postings[1]) < 2:
        return [0]
    first_docs = postings[0][1]
    second_docs = postings[1][1]

    result = []
    # Match documents by key lookup.  The keys come from JSON and are
    # strings, so ordering them with < or > would compare "10" below "9"
    # and make a sorted merge skip documents.
    for doc_key, first_positions in first_docs.items():
        second_positions = second_docs.get(doc_key)
        if second_positions is None:
            continue
        if any(
            abs(p - q) <= word_apart
            for p in first_positions
            for q in second_positions
        ):
            result.append(doc_key)
    return result
        
    

In [6]:
def query_processing(query):
    """Dispatch a raw query string to the matching query evaluator.

    The query is lower-cased, split on whitespace and every token is stemmed
    with the module-level ``stemmer``.

    * A query containing ``/`` is a proximity query: the slash is removed
      and the tokens go to ``proximity_query``.
    * A query with at most one token goes to ``simple_query``.
    * Anything else goes to ``complex_query``.

    Proximity and complex results are also printed.

    Args:
        query: The query string as typed by the user.

    Returns:
        list: The result of the evaluator that handled the query.
    """
    query = query.lower()
    if "/" in query:
        query = re.sub(r"[/]", "", query)
        # split() without an argument ignores leading, trailing and repeated
        # whitespace, so no empty tokens reach the evaluators.
        tokens = [stemmer.stem(x) for x in query.split()]
        result = proximity_query(tokens)
        print(result)
        return result

    tokens = [stemmer.stem(x) for x in query.split()]
    if len(tokens) <= 1:
        # Stem single-word queries too, like every other query token.
        return simple_query(tokens[0] if tokens else "")
    result = complex_query(tokens)
    print(result)
    return result

In [7]:
## If the GUI doesn't run, uncomment and run this cell instead

# query=input("Enter Your Query: ");
# query_processing(query)

# Run the cell below to use the GUI

In [8]:
# -*- coding: utf-8 -*-

# Form implementation generated from reading ui file 'GUI.ui'
#
# Created by: PyQt5 UI code generator 5.15.2
#
# WARNING: Any manual changes made to this file will be lost when pyuic5 is
# run again.  Do not edit this file unless you know what you are doing.


from PyQt5 import QtCore, QtGui, QtWidgets


class Ui_Dialog(object):
    """Search dialog: a query line edit, a Search button and a result list."""

    def setupUi(self, Dialog):
        """Create the child widgets of ``Dialog`` and wire the Search button."""
        Dialog.setObjectName("Dialog")
        Dialog.resize(467, 379)

        # List that shows the matching documents.
        result_list = QtWidgets.QListWidget(Dialog)
        result_list.setGeometry(QtCore.QRect(100, 120, 256, 192))
        result_list.setObjectName("listWidget")
        self.listWidget = result_list

        # Line edit that receives the query text.
        query_edit = QtWidgets.QLineEdit(Dialog)
        query_edit.setGeometry(QtCore.QRect(20, 60, 311, 20))
        query_edit.setObjectName("lineEdit")
        self.lineEdit = query_edit

        # Caption above the line edit.
        query_caption = QtWidgets.QLabel(Dialog)
        query_caption.setGeometry(QtCore.QRect(20, 40, 131, 16))
        caption_font = QtGui.QFont()
        caption_font.setPointSize(10)
        query_caption.setFont(caption_font)
        query_caption.setObjectName("label")
        self.label = query_caption

        # Caption above the result list.
        result_caption = QtWidgets.QLabel(Dialog)
        result_caption.setGeometry(QtCore.QRect(50, 100, 91, 16))
        caption_font = QtGui.QFont()
        caption_font.setPointSize(10)
        result_caption.setFont(caption_font)
        result_caption.setObjectName("label_2")
        self.label_2 = result_caption

        # Search button; a click runs the query.
        search_button = QtWidgets.QPushButton(Dialog)
        search_button.setGeometry(QtCore.QRect(360, 60, 75, 23))
        search_button.setObjectName("pushButton")
        self.pushButton = search_button
        self.pushButton.clicked.connect(self.print1)

        self.retranslateUi(Dialog)
        QtCore.QMetaObject.connectSlotsByName(Dialog)

    def retranslateUi(self, Dialog):
        """Set the window title and the widget texts."""
        translate = QtCore.QCoreApplication.translate
        Dialog.setWindowTitle(translate("Dialog", "Dialog"))
        self.label.setText(translate("Dialog", "Input The Query"))
        self.label_2.setText(translate("Dialog", "Documents"))
        self.pushButton.setText(translate("Dialog", "Search"))

    def print1(self):
        """Run the typed query and list one row per returned document."""
        self.listWidget.clear()
        self.result = query_processing(self.lineEdit.text())
        for document in self.result:
            self.listWidget.addItem("Document " + str(document))



    
# Script entry point: build the search dialog, show it and run the Qt event
# loop until the window is closed.
if __name__ == "__main__":
    import sys
    app = QtWidgets.QApplication(sys.argv)
    Dialog = QtWidgets.QDialog()
    ui = Ui_Dialog()
    ui.setupUi(Dialog)
    Dialog.show()
    

    # The return value of app.exec_() is passed to sys.exit as the exit status.
    sys.exit(app.exec_())


[1, 2, 23, 25, 34]
['6', '11', '22', '25']


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
