# importing:
> Import the os library to collect the text files.

> Import string and nltk to tokenize the files.

In [20]:
import os
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# LinkedList and Node class:

> This cell defines two classes: Node, and LinkedList, which is a chain of Nodes.

1. class Node contains a data value, a list that holds the indexes at which a specific word occurs in the document of this Node, and a next pointer that points to the next node.

2. class LinkedList contains two pointers, to the first and the last node, plus a counter of how many nodes it holds.

In [21]:
class Node:
    """One element of a LinkedList.

    Attributes set by the constructor:
        data: the value passed as ``data`` (the index builder stores a
            document id here).
        list: a new list whose only element is ``pointer`` (the index builder
            appends further token positions to it).
        next: the following node; ``None`` until the node is linked.
    """

    def __init__(self, data, pointer):
        self.next = None
        self.list = [pointer]
        self.data = data

class LinkedList:
    """Singly linked list of ``Node`` objects that tracks both ends and its length."""

    def __init__(self):
        self.len = 0
        self.head = self.last = None

    def append(self, data, index):
        """Create ``Node(data, index)`` and attach it after the current last node."""
        node = Node(data, index)
        if self.head:
            self.last.next = node
        else:
            # First node: it becomes the head as well as the tail.
            self.head = node
        self.last = node
        self.len += 1

    def display(self):
        """Print each node's ``data`` followed by `` -> ``, then ``None``."""
        node = self.head
        while node:
            print(node.data, end=" -> ")
            node = node.next
        print("None")

    def is_empty(self):
        """Return 1 when the list holds no nodes, otherwise 0."""
        return 0 if self.len else 1

# getting files:

In this cell we use the os library to read the text files in the current directory.

In [22]:
def get_text_files():
    """Read every ``.txt`` file in the current working directory.

    Returns:
        A ``(text_files, documents)`` tuple of two parallel lists:
        ``text_files[i]`` is a file name and ``documents[i]`` is that file's
        full contents.  The position ``i`` is what the rest of the program
        uses as the document id.
    """
    current_directory = os.getcwd()

    # os.listdir() returns names in arbitrary order; sorting keeps document
    # ids stable from one run (and one machine) to the next.  Directories
    # that happen to end in ".txt" are skipped because they cannot be read.
    text_files = sorted(
        name
        for name in os.listdir(current_directory)
        if name.endswith('.txt')
        and os.path.isfile(os.path.join(current_directory, name))
    )

    # This is where we keep our documents, in the same order as text_files
    documents = []
    for file_name in text_files:
        with open(os.path.join(current_directory, file_name), 'r') as file:
            documents.append(file.read())

    return text_files, documents

# tokenize:

In this cell, every text file is converted to a list of tokens without any punctuation or stop words.

Punctuation is not important at all, so removing it helps the algorithm.

Removing stop words is important because they don't add much value to a text, and dropping them makes our search more efficient and faster.

We also convert every character to its lower case, and we do the same with the query, which makes searching and comparing easier.

In [23]:
def tokenize(documents):
    """Turn each raw document into a list of lower-cased, stop-word-free tokens.

    Args:
        documents: iterable of document strings.

    Returns:
        A list with one token list per document, in the same order.

    Note: relies on NLTK's English stop-word list (``stopwords.words``), so
    that corpus presumably has to be downloaded beforehand.
    """
    # Set the stop words for English
    stop_words = set(stopwords.words('english'))

    # This is the final list of tokenized text in lists
    tokenized_list = []

    for doc in documents:
        # A number with any count of comma groups ("1,000,000") is kept as a
        # single token; the previous pattern only joined the first group and
        # left the rest behind as a separate token.
        raw_tokens = nltk.regexp_tokenize(doc, r'\d+(?:,\d+)+|\w+')

        # Drop the commas inside numbers (only number tokens can contain one)
        without_commas = (token.replace(',', '') for token in raw_tokens)

        # Keep purely alphanumeric tokens (this drops e.g. tokens containing
        # "_", which \w matches) and lower-case them
        lowered = (word.lower() for word in without_commas if word.isalnum())

        # And remove the stop words because they don't add much to a text
        tokenized_list.append([word for word in lowered if word not in stop_words])
    return tokenized_list

# construct inverted index:

In this cell we construct the inverted index: a dictionary with every unique term as its keys and a LinkedList as its values, which contains the document_ids of the documents that the term appeared in.

Also, each node (a document_id for a term) holds a list of the indexes at which the term appeared in that document.

In [24]:
def inverted_index(documents, tokenized_list):
    """Build a positional inverted index.

    Args:
        documents: the document list; only its length is used, to enumerate
            document ids ``0 .. len(documents) - 1``.
        tokenized_list: ``tokenized_list[doc_id]`` is the token list of that
            document.

    Returns:
        A dict mapping each term to a LinkedList with one node per document
        containing the term; ``node.data`` is the document id and
        ``node.list`` holds the token positions of the term in that document.
    """
    index = {}

    for doc_id in range(len(documents)):
        for position, term in enumerate(tokenized_list[doc_id]):
            postings = index.get(term)

            if postings is None:
                # First time this term is seen anywhere
                postings = LinkedList()
                index[term] = postings
                postings.append(doc_id, position)
            elif postings.last.data == doc_id:
                # Same document as the most recent node: record one more position
                postings.last.list.append(position)
            else:
                # Known term, first occurrence in this document
                postings.append(doc_id, position)

    return index

# Term case:
The query contains a single term, and the documents that contain it are printed.

In [25]:
def handling_term(term, dictionary):
    """Print the name of every document that contains ``term``.

    Args:
        term: the term to look up (the caller passes it lower-cased).
        dictionary: inverted index mapping a term to its postings LinkedList.

    Each name is read from the module-level ``text_files`` list and printed
    without its last four characters (the ``.txt`` suffix).  A term that is
    not in the index prints "there is no result found" instead of raising
    ``KeyError``, matching the AND / OR / proximity handlers.
    """
    postings = dictionary.get(term)
    if postings is None:
        print("there is no result found")
        return

    point = postings.head
    while point:  # printing all the documents for the term
        print(text_files[point.data][:-4])
        point = point.next
    return

# NOT case:

This is the function which handles NOT.

We just have to return every document that is not in the specified term's LinkedList.

So, print the file names (without .txt).

In [26]:
def handling_NOT(term, dictionary):
    """Print the name of every document that does NOT contain ``term``.

    Args:
        term: the term to exclude (the caller passes it lower-cased).
        dictionary: inverted index mapping a term to its postings LinkedList.

    Document ids run over ``range(len(documents))`` and names come from
    ``text_files``; both are module-level lists.  A term missing from the
    index excludes nothing, so every document is printed (previously this
    raised ``KeyError``).
    """
    postings = dictionary.get(term)
    point = postings.head if postings is not None else None

    # The postings are in ascending doc_id order, so one forward pass over
    # the ids is enough: skip an id when it equals the current posting.
    for doc_id in range(len(documents)):
        if point and doc_id == point.data:
            point = point.next
            continue

        print(text_files[doc_id][:-4])
    return

# AND case:

This is the function which handles AND. In this function:
1. We get the doc_id list of each term first.
2. We find the intersection of the two LinkedLists and print the file names.

(We do this by always advancing the pointer with the smaller id towards bigger ids, so that it can catch up with the same id in the other LinkedList.)

In [27]:
def handling_AND(term1, term2, dictionary):
    """Print the name of every document that contains both terms.

    Args:
        term1, term2: the two terms to intersect.
        dictionary: inverted index mapping a term to its postings LinkedList.

    Prints "there is no result found" when either term is absent from the
    index.  Names are read from the module-level ``text_files`` list and
    printed without their last four characters.
    """
    if term1 not in dictionary or term2 not in dictionary:
        # At least one of the searched terms has no key in the dictionary
        print("there is no result found")
        return

    left = dictionary[term1].head
    right = dictionary[term2].head

    # Walk both postings lists together, always advancing the smaller id
    while left and right:
        if left.data == right.data:
            # The document_id is in both LinkedLists
            print(text_files[left.data][:-4])
            left, right = left.next, right.next
        elif left.data > right.data:
            right = right.next
        else:
            left = left.next
    return

# OR case:

This is the function which handles OR. In this function:

1. We get the doc_id list of each term first.
2. We find the union of the two lists.

(To do this, we print every document that appeared in at least one of these lists.)

In [28]:
def handling_OR(term1, term2, dictionary):
    """Print the name of every document that contains at least one of the terms.

    Args:
        term1, term2: the two terms to unite.
        dictionary: inverted index mapping a term to its postings LinkedList.

    A term that is missing from the index contributes no documents, so the
    other term's documents are still printed; previously one unknown term
    discarded the whole result.  "there is no result found" is printed only
    when neither term is in the index.  Names are read from the module-level
    ``text_files`` list and printed without their last four characters.
    """
    postings1 = dictionary.get(term1)
    postings2 = dictionary.get(term2)

    if postings1 is None and postings2 is None:
        print("there is no result found")
        return

    # A missing term behaves like an empty postings list
    point1 = postings1.head if postings1 is not None else None
    point2 = postings2.head if postings2 is not None else None

    while point1 and point2:
        if point1.data == point2.data:
            # The id is in both lists: print it once and advance both
            print(text_files[point1.data][:-4])
            point1 = point1.next
            point2 = point2.next
        elif point1.data > point2.data:
            # Always print the lower id first to keep the output ordered
            print(text_files[point2.data][:-4])
            point2 = point2.next
        else:
            print(text_files[point1.data][:-4])
            point1 = point1.next

    # At most one list still has nodes left; nothing to compare, only print
    remaining = point1 or point2
    while remaining:
        print(text_files[remaining.data][:-4])
        remaining = remaining.next
    return

# Proximity case:

This function checks whether the two specified terms appear at most "distance" positions apart from each other anywhere in a specific document.

It is a helper function.

In [29]:
def distance_checking(list1, list2, distance):
    """Return 1 if some position in ``list1`` is within ``distance`` of one in ``list2``.

    Args:
        list1, list2: token positions of the two terms inside one document.
            The early exit below is only valid for ascending ``list2``, which
            is how the index builder fills these lists.
        distance: maximum allowed absolute difference between two positions.

    Returns:
        1 when such a pair exists, otherwise 0.
    """
    for first_pos in list1:
        for second_pos in list2:
            gap = second_pos - first_pos
            if -distance <= gap <= distance:
                return 1
            if gap > 0:
                # Already past first_pos and too far; later positions are farther
                break
    return 0

This is the function that handles the proximity case. In this function:

1. We get the doc_id list of each term first.
2. Then we get the intersection of the two lists, exactly like we did in the AND case.
3. Then, for each document in the intersection, we check (using the distance_checking function from above) whether there are any indexes at which the two terms are at most the specified distance apart.

In [30]:
def handling_proximity(term1, term2, distance, dictionary):
    """Print the documents in which the two terms occur within ``distance`` positions.

    Args:
        term1, term2: the two terms to compare.
        distance: maximum allowed gap between a position of ``term1`` and a
            position of ``term2``, as checked by ``distance_checking``.
        dictionary: inverted index mapping a term to its postings LinkedList.

    Prints "there is no result found" when either term is absent from the
    index.  Names are read from the module-level ``text_files`` list and
    printed without their last four characters.
    """
    if not (term1 in dictionary and term2 in dictionary):
        # At least one of the searched terms has no key in the dictionary
        print("there is no result found")
        return

    first, second = dictionary[term1].head, dictionary[term2].head

    while first and second:
        if first.data != second.data:
            # Not the same document yet: advance the list with the smaller id
            if first.data > second.data:
                second = second.next
            else:
                first = first.next
            continue

        # Both terms occur in this document; compare their position lists
        if distance_checking(first.list, second.list, distance):
            print(text_files[first.data][:-4])
        first = first.next
        second = second.next
    return

# main:

This is where everything is handled.

There are four cases:

1. if the query has only 1 term: In this case it is only a term, and the term's inverted index entry must be returned.
2. if the query has only 2 terms: In this case it should be the NOT case.
3. if the query has only 3 terms: In this case it is either the AND case, the OR case or the Proximity case. We check the middle term of the query to decide.
4. if the query is empty: End of search.

Also, every term is passed to its handling function in lower case.

In [31]:
def main_func(query, dictionary):
    """Evaluate one tokenized query against the inverted index.

    Args:
        query: list of whitespace-separated query tokens.  Supported shapes:
            ``[term]``, ``['NOT', term]``, ``[term, 'AND', term]``,
            ``[term, 'OR', term]`` and ``[term, 'NEAR/<k>', term]`` where
            ``<k>`` is an integer.  Operators are matched case-sensitively;
            terms are lower-cased before being handed to a handler.
        dictionary: inverted index passed through to the handlers.

    Returns:
        0 for an empty query, otherwise None.

    A supported query prints its results followed by "done!".  Any other
    query prints "invalid query" and returns; the previous ``while True``
    loop never terminated for such input because nothing inside it changed
    ``query``.
    """
    size = len(query)
    if size == 0:
        return 0

    if size == 1:
        handling_term(query[0].lower(), dictionary)

    elif size == 2 and query[0] == 'NOT':
        handling_NOT(query[1].lower(), dictionary)

    elif size == 3 and query[1] == 'AND':
        handling_AND(query[0].lower(), query[2].lower(), dictionary)

    elif size == 3 and query[1] == 'OR':
        handling_OR(query[0].lower(), query[2].lower(), dictionary)

    elif size == 3 and query[1].startswith('NEAR/'):
        # Everything after the "NEAR/" prefix must parse as the distance
        try:
            distance = int(query[1][len('NEAR/'):])
        except ValueError:
            print("invalid query")
            return
        handling_proximity(query[0].lower(), query[2].lower(), distance, dictionary)

    else:
        print("invalid query")
        return

    print("done!")
    return

1. First of all, we get the files and build the text_files and documents lists.
2. Then we tokenize every document and store the results in tokenized_list.
3. Finally, we build the inverted index dictionary.

In [32]:
# Build the corpus: file names and raw contents, then tokens, then the index.
# text_files and documents stay at module level because the query handlers
# read them directly.
text_files, documents = get_text_files()
tokenized_list = tokenize(documents)
dictionary = inverted_index(documents, tokenized_list)

# Read one query, split it on whitespace, and evaluate it.
main_func(
    input("Enter what you are looking for ('Enter' for stoping the search)").split(),
    dictionary,
)

A Festival of Books
A Murder-Suicide
Gasoline Prices Hit Record High
Happy and Unhappy Renters
Rentals at the Oceanside Community
Trees Are a Threat
done!


In [33]:
# Walk the postings list of the term "people" and print each document id,
# shifted by one so the ids are shown 1-based.
# Raises KeyError if "people" is not a key of the index.
p = dictionary["people"].head
while p:
    print(p.data+1)
    p = p.next

1
2
8
9
13
15
