In [59]:
import os
from nltk.stem import PorterStemmer
import nltk
from nltk.tokenize import word_tokenize
import tkinter as tk
from tkinter import scrolledtext

In [60]:
def get_docIDs():
    curr_dir = os.getcwd() # get the current directory
    docID = [] # create a list to store the document IDs
    for i in os.listdir(curr_dir + '\ResearchPapers'): # loop through each file in the 'ResearchPapers' directory
        i = i.rstrip('.txt')
        docID.append(int(i))
    docID.sort()
    return docID

def get_stopwords():
    stopwords = []
    f = open('Stopword-List.txt', 'r') # open the 'Stopword-List.txt' file
    while True:
        line = f.readline() # each line from the file is read one by one
        if not line: # if the line read is empty (which means end of file), the loop is broken
            break
        stopwords.append(line) # else append the read line to the stopwords list

    f.close() # close the file

    for i in range(len(stopwords)):
        if i != '\n' and i != '':
            stopwords[i] = stopwords[i].rstrip(' \n') # remove newline characters from the strings
        else:
            stopwords.pop(i) # remove any empty strings and newline characters from the stopwords list
    return stopwords

def create_pos_index(total_tokens):
    pos_ind = {} # declare an empty dictionary for the positional index
    porter_stemmer = PorterStemmer() # initialize the stemmer

    # get the stopwords. Although stopwords is not going to be inserted in the positional index, we still need them to find the correct positions of the rest of the words
    stopwords = get_stopwords()
    doc = get_docIDs() # get the docIDs

    for i, tokens in enumerate(total_tokens): # loop through each token in total_tokens, and then loop through each word in the token
        for j, word in enumerate(tokens):
            if word not in stopwords: # filter the stopwords
                word = porter_stemmer.stem(word) # Stem the word
                if word[-1] == "'": #if the word ends with an apostrophe, remove it
                    word = word.rstrip("'")
                if word in pos_ind: # if the word is already in the positional index, add the docID to the index
                    if doc[i] in pos_ind[word]: # if the docID is already in the index for that word, add the position
                        pos_ind[word][doc[i]].append(j)
                    else: # else add the docID as well as the position
                        pos_ind[word][doc[i]] = [j]
                else: # add the word in the index along with the docID and the position
                    pos_ind[word] = {doc[i]: [j]}
    return pos_ind

def create_inv_index(total_tokens):
    inv_ind = {} # an empty dictionary for the inverted index
    porter_stemmer = PorterStemmer() # initialize the stemmer

    # get the stopwords. Although stopwords is not going to be inserted in the positional index, we still need them to find the correct positions of the rest of the words
    stopwords = get_stopwords()
    doc = get_docIDs() # get the docIDs

    for i, tokens in enumerate(total_tokens): # loop through each token in total_tokens again
        for word in tokens: # loop through each word in tokens
            word = porter_stemmer.stem(word) # stem the word
            if word[-1] == "'": # remove the apostrophe
                word = word.rstrip("'")
            if word in inv_ind: # if the word is already in the inverted index
                if doc[i] not in inv_ind[word]: # append the docID if it isn't in the index
                    inv_ind[word].append(doc[i])
            else: # add the word along with the docID
                inv_ind[word] = [doc[i]]
    return inv_ind 

def tokenization():
    total_tokens = [] # an empty list to store the tokens from all the files
    doc = get_docIDs() # get the docIDs

    for i in doc: # iterate through each doc
        tokens = []
        f = open('ResearchPapers/' + str(i) +'.txt', 'r') # open the file according to the current document ID
        while True:
            line = f.readline() # read a line from the file
            if not line: # if the line is empty (which means end of file), break the loop
                break
            tokens += word_tokenize(line) # tokenize the line and add the tokens to the list
        f.close() # close the file

        j = 0
        while j < len(tokens): # loop through each token
            # remove symbols and numbers from the start and end of the token and convert it to lowercase (case folding)
            tokens[j] = tokens[j].lstrip('0123456789!@#$%^&*()-_=+[{]}\|;:\'",<.>/?`~')
            tokens[j] = tokens[j].rstrip('0123456789!@#$%^&*()-_=+[{]}\|;:\'",<.>/?`~')
            tokens[j] = tokens[j].lower()
            if '.' in tokens[j]: # if '.' exists in a word, split the word at that point and add the splitted words at the end of the tokens list while removing the original word
                word = tokens[j].split('.')
                del tokens[j]
                tokens.extend(word)
            elif '-' in tokens[j]: # do the same for words with '-'
                word = tokens[j].split('-')
                del tokens[j]
                tokens.extend(word)
            j += 1 # move the index forward

        tokens = [c for c in tokens if c.isalpha()] # filter out any strings that contain symbols, numbers, etc.
        total_tokens.append(tokens)
    return total_tokens

def build_index():
    tokens = tokenization() # preprocessing function is called
    inv_ind = create_inv_index(tokens) # create_inv_index function is called
    pos_ind = create_pos_index(tokens) # create_pos_index function is called

    f = open('inv_index.txt', 'w') # the inverted index is written to 'inv_index.txt'
    for key, value in inv_ind.items(): # loop through each key-value pair in the inverted index and output it to the file
        f.write('{}:'.format(key))
        for i in value:
            f.write('{} '.format(i))
        f.write('\n')
    f.close() # close the file

    f = open('pos_index.txt', 'w') # do the same for positional index
    for key, value in pos_ind.items():
        for k, v in value.items():
            f.write('{}:{}.'.format(key, k))
            for i in v:
                f.write('{} '.format(i))
            f.write('\n')
    f.close() # close the file

In [61]:
build_index() # execute the main function

In [62]:
def extract_inv_index():
    inv_index = {}
    
    f = open('inv_index.txt', 'r') # open the inverted index file in read mode
    while True:
        line = f.readline() # read a line from the file
        if line == '': # if the line is empty (which means end of file), break the loop
            break
        line = line.split(':') # split the line at ':'
        term = line[0]
        pos = line[1].split()
        pos = [int(c) for c in pos if c != '' and c != '\n' and not c.isalpha()]
        inv_index[term] = pos # add the term and its posting list to the inverted index
    f.close() # close the file

    return inv_index

def extract_pos_index():
    pos_index = {}

    f = open('pos_index.txt', 'r') # do the same for the positional index
    while True:
        line = f.readline()
        if line == '':
            break
        line = line.split(':')
        term = line[0]
        docID, pos = line[1].split('.') # this time the second part of the split line is the document ID and the positions
        pos = pos.split() # split the positions at whitespace
        pos = [int(c) for c in pos if c != '' and c != '\n' and not c.isalpha()]
        if term in pos_index: # if the term is already in the positional index, append the positions to the term's dictionary. Otherwise append both docID and the positions
            pos_index[term][int(docID)] = pos
        else:
            pos_index[term] = {int(docID): pos}
    f.close() # close the file

    return pos_index

def find_AND(p1, p2):
    result = []
    for item in p1:
        if item in p2 and item not in result: # check if the element is also present in p2 and not already in the intersection list
            result.append(item)
    return result

def find_OR(p1, p2):
    result = []
    # add the elements of p1 to the new list while avoiding duplicates
    for i in p1:
        if i not in result:
            result.append(i)

    # add elements from p2 to the new list while avoiding duplicates
    for i in p2:
        if i not in result:
            result.append(i)
    return result

def find_NOT(p1):
    result = []
    doc = get_docIDs() # get the list of all document IDs
    # find elements in p1 that are not in doc
    for i in p1:
        if i not in doc and i not in result:
            result.append(i)

    # find elements in doc that are not in p1
    for i in doc:
        if i not in p1 and i not in result:
            result.append(i)

def boolean_query(query):
    query = query.split() # split the query into words
    porter_stemmer = PorterStemmer() # initialize the stemmer
    stopwords = get_stopwords() # get the stopwords
    inv_index = extract_inv_index() # Extract the inverted index and positional index

    for i, word in enumerate(query): # Loop through each word in the query
        if word in ['AND', 'OR', 'NOT']: # If the word is a boolean operator, skip it
            continue
        temp = porter_stemmer.stem(word) # Stem the word
        if temp[-1] == "'": # Remove the apostrophe
            temp = word.rstrip("'")
        if word in stopwords: # Remove the stopwords
            query.remove(word)
        else:
            query[i] = temp

    if 'AND' in query: # if the query contains 'AND', 'OR' or 'NOT, split the query at that point and process the two parts separately
        index = query.index('AND')
        t1 = query[:index] # splitting the query into two parts
        t2 = query[index+1:]

        # combine the query into a string and recursively call the function to process the first part
        temp = ''
        for i in t1:
            temp += i + ' '
        t1 = temp.rstrip(' ')

        p1 = boolean_query(t1)
        p1 = p1.split() # split the result into a list
        temp = []
        for i in range(len(p1)): # convert the elements of the list to integers
            temp.append(int(p1[i]))
        p1 = temp

        # combine the query into a string and recursively call the function to process the second part
        temp = ''
        for i in t2:
            temp += i + ' '
        t2 = temp.rstrip(' ')
    
        p2 = boolean_query(t2)
        p2 = p2.split()
        temp = []
        for i in range(len(p2)): # convert the elements of the list to integers
            temp.append(int(p2[i]))
        p2 = temp

        result = find_AND(p1, p2) # find the intersection of the results
    elif 'OR' in query:
        index = query.index('OR')
        t1 = query[:index]
        t2 = query[index+1:]
        # combine the query into a string and recursively call the function to process the first part
        temp = ''
        for i in t1:
            temp += i + ' ' # convert the list into a string, with each term separated by a space
        t1 = temp.rstrip(' ')

        p1 = boolean_query(t1)
        p1 = p1.split() # split the result into a list
        temp = []
        for i in range(len(p1)): # convert the elements of the list to integers
            temp.append(int(p1[i]))
        p1 = temp

        # combine the query into a string and recursively call the function to process the second part
        temp = ''
        for i in t2: # convert the list into a string, with each term separated by a space
            temp += i + ' '
        t2 = temp.rstrip(' ')
    
        p2 = boolean_query(t2)
        p2 = p2.split()
        temp = []
        for i in range(len(p2)): # convert the elements of the list to integers
            temp.append(int(p2[i]))
        p2 = temp

        result = find_OR(p1, p2) # find the union of the results
    elif 'NOT' in query:
        index = query.index('NOT')
        t1 = query[index+1:]
        # combine the query into a string and recursively call the function to process the first part
        temp = ''
        for i in t1: # convert the list into a string, with each term separated by a space
            temp += i + ' '
        t1 = temp.rstrip(' ')

        p1 = boolean_query(t1)
        p1 = p1.split() # split the result into a list
        temp = []
        for i in range(len(p1)): # convert the elements of the list to integers
            temp.append(int(p1[i]))
        p1 = temp

        result = find_NOT(p1)
    else: # if the query contains only a single term
        term = query[0] # extract the term
        result = inv_index.get(term, []) # get the postings list for the term from the inverted index

    result = ''.join([str(c) + ' ' for c in result]) # convert the result to a string
    return result

def proximity_query(query):
    query = query.split() # split the query into words
    porter_stemmer = PorterStemmer() # initialize the stemmer
    stopwords = get_stopwords() # get the stopwords
    inv_index = extract_inv_index() # extract the inverted and positional indexes
    pos_index = extract_pos_index()
    pos = query.pop(-1)
    pos = int(pos[-1])

    query = [c for c in query if c not in stopwords] # remove the stopwords from the query and also apply case folding

    word = porter_stemmer.stem(query[0].lower()) # stem the first word in the query
    if word[-1] == "'": # remove the apostrophe
            word = word.rstrip("'")
    query[0] = word
    word = porter_stemmer.stem(query[1].lower()) # stem the second word in the query
    if word[-1] == "'": # remove the apostrophe
        word = word.rstrip("'")
    query[1] = word

    docs = [] # create a list to store the postings list for each term in the query
    docs.append(inv_index[query[0]]) # get the postings list for the first term
    docs.append(inv_index[query[1]]) # get the postings list for the second term

    common_docs = find_AND(docs[0], docs[1]) # find the common documents in the postings list of the two terms

    # find the postional intersection of the two terms in the common documents
    result = [] # create a list to store the result
    for i in common_docs: # loop through the common documents
        p1 = pos_index.get([query[0]])[i] # get the positions of the first term in the document
        p2 = pos_index.get([query[1]])[i] # get the positions of the second term in the document

        # now we need to find the positions of the second term that are within the specified proximity of the positions of the first term
        j = k = 0
        while j != len(p1):
            while k != len(p2):
                if abs(p1[j] - p2[k]) <= pos: # if the positions of the two terms are within the specified proximity, add the document to the result
                    if i not in result: # if the document is not already in the result, add it
                        result.append(i)
                elif p2[k] > p1[j]: # if the position of the second term is greater than the position of the first term, break the loop
                    break 
                k+=1
            j+=1

    temp = result
    result = ''
    for i in range(len(temp)): # convert the result to a string
        result.append(str(temp[i]) + ' ')

    return result

In [63]:
def search():
    query = query_entry.get()  # get the query from the entry widget

    if '/' in query: # if the query contains a '/', it is a proximity query, so call the ProxQueryProcessing function
        result = proximity_query(query)
    else:  # otherwise, it is a boolean query, so call the BoolQueryProcessing function
        result = boolean_query(query)

    if result == '': # if the result is empty, display a message
        result = 'No documents found'
    result_text.delete(1.0, tk.END)  # clear previous results
    result_text.insert(tk.END, result) # insert the result into the scrolled text widget
 
root = tk.Tk() # create the main window
root.title("Information Retrieval System")

query_label = tk.Label(root, text="Enter your query:") # create and place the label widget
query_label.pack(pady=5)

query_entry = tk.Entry(root, width=50) # create and place the entry widget with width set to 50
query_entry.pack(pady=5)

search_button = tk.Button(root, text="Search", command=search) # create and place the button widget. It will trigger the search function when clicked
search_button.pack(pady=5)

result_text = scrolledtext.ScrolledText(root, width=60, height=10) # create and place the scrolled text widget. This will allow the user to scroll through the results
result_text.pack(pady=5)

root.mainloop() # run the Tkinter event loop


Exception in Tkinter callback
Traceback (most recent call last):
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2288.0_x64__qbz5n2kfra8p0\Lib\tkinter\__init__.py", line 1967, in __call__
    return self.func(*args)
           ^^^^^^^^^^^^^^^^
  File "C:\Users\ahuna\AppData\Local\Temp\ipykernel_9476\2682682125.py", line 7, in search
    result = boolean_query(query)
             ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ahuna\AppData\Local\Temp\ipykernel_9476\4245261985.py", line 161, in boolean_query
    p1 = boolean_query(t1)
         ^^^^^^^^^^^^^^^^^
  File "C:\Users\ahuna\AppData\Local\Temp\ipykernel_9476\4245261985.py", line 173, in boolean_query
    result = ''.join([str(c) + ' ' for c in result]) # convert the result to a string
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: 'NoneType' object is not iterable
Exception in Tkinter callback
Traceback (most recent call last):
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Py