In [13]:
import re
from pathlib import Path


def tokenizer(text):
    '''this method is used to convert a given text to tokens by changing all letters to lowercase letters
        and remove numbers. It also ignores words containing numbers'''
    with open('stopwordlist.txt', 'r') as swFile:#loading all the stopwrds from stopwords.txt file
        stopwords = re.split(r'\s+', swFile.read())

    tokens = []
    for line in text:
        tokens += [token for token in re.split('[^a-zA-Z0-9]', line.lower()) if
                    not re.search('[0-9]', token) and token not in stopwords] # to perform tokenization on non-alphanumeric words, to remove empty words from the list, to ignore words containing numbers 
    return tokens

In [14]:
import nltk
from nltk.stem import PorterStemmer

# nltk.download('punkt')

all_words = []

class WordDictionary:
    #first we define a constructor to initialize our variables
    def __init__(self):
        self.wordIDs = {} #dict for all word ids
        self.currWordId = 1 #start with id 1 for words
        self.stemmer = PorterStemmer() #initalize stemmer obj

    def appendWord(self, word):
        stemWord = self.stemmer.stem(word) #setup the stem word
        if stemWord == "":
            pass
        elif stemWord not in self.wordIDs: #check if the dict contains the word
            self.wordIDs[stemWord] = self.currWordId #if the word is not present we add the word to the dict along with the id
            self.currWordId += 1 #increment the id for the next word

    def getWordId(self, word):
        stemWord = self.stemmer.stem(word)#getting the stemmed form of the given word
        return self.wordIDs.get(stemWord, None)#returning the ID of the stemmef word
    
    def fetch_d(self):#this method will be used in main.py to fetch all ids that will be written to the output file
        sorted_order = dict(sorted(self.wordIDs.items(), key=lambda item:item[1]))#sorting the word dictionary based on WordIDs
        return sorted_order

In [15]:
folder = './ft911/'
all_file_names = []

class FileDictionary:
    #similar to WordDictionary we do the initialize step
    def __init__(self, path):
        self.fileIDs = {} #dict for all file ids
        self.currFileId = 1#start with id 1 for files
        self.folder = path#set a foldder ref with the given path

    def getFileId(self, file):
        return self.fileIDs.get(file, None)#return the ID of the file(DOCNO) for a given file
    
    def getAllFiles(self):
        return self.fileIDs#returns all the files(DOCNOS)
    
    
    def appendFiles(self, file):
         self.fileIDs[file] = self.currFileId#appending the file(docno) along with a new id
         self.currFileId += 1#incrementing the id



In [16]:
import os
import xml.dom.minidom as xdm

date = 'DATE'
pro = 'PROFILE'
DOC = 'DOCNO'
TEXT = 'TEXT'
path = './ft911/'
doc_tag = ['<DOC>', '</DOC>']
read_mode = 'r'

class TextParser(object):
    def convert(val):#method will be used by the fetchdocs() method below to fetch all docnos and complete data
        doc = xdm.parseString(val)#using xdm library to manipulate/parse the input files
        root = doc.documentElement#gets the root element
        docs = {}#declaring a dict to store all data

        for doc in root.childNodes:#iterating through all docs
            for ele in doc.childNodes:#iiterating through all elements in docs
                if(ele.nodeType == ele.ELEMENT_NODE):#check if the nodetype of the element is node element itself
                    if ele.tagName == date or ele.tagName == pro:#ignoring all elements that are not docno
                        continue
                    elif ele.tagName == DOC:#if the element is docno then we add it to out dict
                        DOCNO = ele.firstChild.data.strip()
                        docs[DOCNO] = []
                    elif ele.tagName == TEXT:#else we append all the data to that specific docno
                        docs[DOCNO].append(ele.firstChild.data.strip())
        return docs
    
    
    def fetchDocs(file):#this method will be used in main.py to get the parsed docnos and doc content
        with open(os.path.join(path, file), read_mode) as File:
            doc_data = File.read()
        full_doc_data = doc_tag[0] + doc_data + doc_tag[1]
        docs = TextParser.convert(full_doc_data)
        return docs



In [17]:
import os
from pathlib import Path
from collections import Counter

class Indexer:


    def create_forward_index(index):
        new_forwardIndex = {}
        for docno, text in index.items():
            word_counts = Counter(text)
            new_forwardIndex[docno] = word_counts
        return new_forwardIndex
        

    def write_forward_index(new_forwardIndex, fileName):    
        try:
            filepath = os.path.join(Path(__file__).parent.resolve(), fileName)
            with open(filepath, "w") as fIndex:
                for docno, counts in new_forwardIndex.items():
                    fIndex.write(f"{docno}:\t" +"; ".join([f"{word} {count}" for word, count in counts.items()]))
                    fIndex.write("\n\n")
            print("index success in "+fileName)
        except:
            print(f"index {fileName} failure ")


        
    def create_inverted_index(docs):
        inverted_index = {}
        for docno, wordCounts in docs.items():
            for word, count in wordCounts.items():
                if word not in inverted_index:
                    inverted_index[word] = []
                inverted_index[word].append((docno,count))
        # print(inverted_index)
        return inverted_index
    
    def write_inverted_index(inverted_index, filename):
        try:
            filepath = os.path.join(Path(__file__).parent.resolve(), filename)
            with open(filepath, "w") as iIndex:
                for word, postings in inverted_index.items():
                    iIndex.write(f"{word}:\t"+"; ".join(f"{docno} {count}" for docno, count in postings))
                    iIndex.write("\n\n")
            print("index success in "+filename)
        except:
            print(f"index {filename} failure ")
            

In [18]:
import os
import time
from pathlib import Path
from WordDictionary import WordDictionary
from FileDictionary import FileDictionary
from tokenizer import tokenizer
from DocParser import TextParser
from datetime import datetime
from indexer import Indexer
import nltk
from nltk.stem import PorterStemmer

# nltk.download('punkt')


print("Running code at:", datetime.now())

path = "./ft911/"
folder = os.listdir(path=path)

parser_file = "parser_output.txt"
forward_index_file = "forward_index.txt"
inverted_index_file = "inverted_index.txt"
w = 'w'
a = 'a'

def parser_output(filename, data, mode):    
    try:#exception handling
        # filePath = os.path.join(Path(__file__).parent.resolve(), parser_file)#created a file if it doesn't already exist to write output
        with open(filename, mode) as parser_output:
            for key in data.keys():#iterates through all docnos and stemmed words
                parser_output.write(f"{key}\t{data[key]}\n")#appeninf the DOCNO: DOCID and Word : WOrdID to the parser_output file
        print("Parser output- Success: Writing data COmpleted!")
    except :#shows error message in case of failure
        print("Parser output- Failure: error while uploading data")


#then we initialize our custom dictionaries
WordDict = WordDictionary()
FileDict = FileDictionary(path)

def get_stem_words(forward_words):
    stemmer = PorterStemmer() 
    stemmed_words = []
    for word in forward_words:
        stemword = stemmer.stem(word)
        if stemword == "":
            pass
        elif stemword not in stemmed_words:
            stemmed_words.append(stemword)
    return stemmed_words

forwardIndex = {}
forward_tokens = []

#looping over all the files in the folder mentioned above and and adding them to complete data
for file in folder:
    docs = TextParser.fetchDocs(file)
    for docno, data in docs.items():
        FileDict.appendFiles(docno)
        tokens = tokenizer(data)
        for token in tokens:
            WordDict.appendWord(token)
        stemmed_words = get_stem_words(tokens)
        forwardIndex[docno] = stemmed_words


start_parser = time.time()
parser_output(parser_file, WordDict.fetch_d(), w)#once all the word data is feteched, we write the data to the output file using the 'w' mode
parser_output(parser_file, FileDict.getAllFiles(), a)#once all the filenames are feteched, we append the data to the output file using the 'a' mode
end_parser = time.time()

total_parser_time = end_parser - start_parser
print(f"Time taken for Parsing the documents = {total_parser_time} seconds")

start_time = time.time()
new_forwardIndex = Indexer.create_forward_index(forwardIndex)
Indexer.write_forward_index(new_forwardIndex, forward_index_file)

inverted_index = Indexer.create_inverted_index(new_forwardIndex)
Indexer.write_inverted_index(inverted_index, inverted_index_file)
end_time = time.time()

execution_time = end_time - start_time
print(f"Time taken to generate the indexes is {execution_time} seconds.")
print(f"Total number of words = {len(inverted_index)}")
print(f"Total number of documents = {len(new_forwardIndex)}")


Running code at: 2024-03-28 13:11:50.422704
Parser output- Success: Writing data COmpleted!
Parser output- Success: Writing data COmpleted!
Time taken for Parsing the documents = 0.038260459899902344 seconds
index success in forward_index.txt
index success in inverted_index.txt
Time taken to generate the indexes is 0.7784864902496338 seconds.
Total number of words = 32606
Total number of documents = 5368
