In [1]:
import re

def tokenizer(text):
    """Split text into lowercase, purely alphabetic tokens.

    Every line is lower-cased and split on each character that is not an
    ASCII letter or digit. Two kinds of fragments are then discarded:

    * empty strings, which ``re.split`` produces whenever two separators are
      adjacent or a line starts/ends with a separator;
    * fragments that contain at least one digit.

    Args:
        text: An iterable of strings (for example a list of lines). A single
            string is also accepted and is treated as one line.

    Returns:
        list: The surviving tokens, in order of appearance.
    """
    if isinstance(text, str):
        # Iterating a bare string would yield one character at a time.
        text = [text]

    tokens = []
    for line in text:
        tokens.extend(
            token
            for token in re.split('[^a-zA-Z0-9]', line.lower())
            # `token` is checked for truthiness first so that the empty
            # fragments left behind by the split never become words.
            if token and not re.search('[0-9]', token)
        )
    return tokens

In [2]:
import nltk
from nltk.stem import PorterStemmer

# nltk.download('punkt')

# Module-level list; no code in the visible cells reads from or appends to it.
all_words = []

class WordDictionary:
    """Maps the stemmed form of each word to a sequential integer ID."""

    def __init__(self):
        """Create an empty dictionary whose first assigned ID will be 1."""
        self.stemmer = PorterStemmer()  # used to reduce every word to its stem
        self.wordIDs = {}               # stem -> ID
        self.currWordId = 1             # ID handed to the next unseen stem

    def appendWord(self, word):
        """Register ``word`` under its stem unless that stem already has an ID."""
        stem = self.stemmer.stem(word)
        if stem in self.wordIDs:
            # Already known: keep the existing ID and leave the counter alone.
            return
        self.wordIDs[stem] = self.currWordId
        self.currWordId += 1

    def getWordId(self, word):
        """Return the ID stored for the stem of ``word``, or None if it is unknown."""
        stem = self.stemmer.stem(word)
        return self.wordIDs.get(stem)

    def fetch_d(self):
        """Return the stem -> ID dict itself (the same object, not a copy)."""
        return self.wordIDs

In [4]:
# Relative path of the document folder; the class defined in this cell does not read it.
folder = './ft911/'
# Module-level list; no code in the visible cells reads from or appends to it.
all_file_names = []

class FileDictionary:
    """Maps document identifiers (DOCNOs) to sequential integer IDs."""

    def __init__(self, path):
        """Create an empty dictionary whose first assigned ID will be 1.

        Args:
            path: Folder the documents come from. It is only stored on the
                instance; none of the methods below read it.
        """
        self.fileIDs = {}    # DOCNO -> ID
        self.currFileId = 1  # ID handed to the next unseen DOCNO
        self.folder = path

    def getFileId(self, file):
        """Return the ID stored for ``file`` (a DOCNO), or None if it is unknown."""
        return self.fileIDs.get(file)

    def getAllFiles(self):
        """Return the DOCNO -> ID dict itself (the same object, not a copy)."""
        return self.fileIDs

    def appendFiles(self, file):
        """Register ``file`` (a DOCNO) under the next free ID.

        A DOCNO that is already registered keeps its existing ID. Without
        this check a repeated DOCNO would silently change its ID and leave a
        gap in the sequence.
        """
        if file not in self.fileIDs:
            self.fileIDs[file] = self.currFileId
            self.currFileId += 1



In [3]:
import os
import xml.dom.minidom as xdm

# Tag names that TextParser.convert treats specially: DATE and PROFILE
# elements are skipped, DOCNO elements start a new entry.
date = 'DATE'
pro = 'PROFILE'
DOC = 'DOCNO'
# Folder that TextParser.fetchDocs reads its input files from.
path = './ft911/'
# Opening/closing tag wrapped around a file's content before it is parsed.
doc_tag = ['<DOC>', '</DOC>']
read_mode = 'r'


class TextParser(object):
    """Turns document files into a ``{DOCNO: [text, ...]}`` mapping.

    All methods are static, so they can be called either on the class
    (``TextParser.fetchDocs(name)``) or on an instance.
    """

    @staticmethod
    def _first_text(ele):
        """Return the stripped data of ``ele``'s first child, or None if there is none."""
        # An empty element has firstChild None, and an element child has no
        # `.data`; both cases yield None instead of raising AttributeError.
        data = getattr(ele.firstChild, 'data', None)
        return None if data is None else data.strip()

    @staticmethod
    def convert(val):
        """Parse an XML string and collect the text of each document.

        The children of the root element are treated as documents. Within a
        document, DATE and PROFILE elements are ignored, a DOCNO element
        starts a new (empty) entry keyed by its text, and the leading text of
        every other element is appended to the current entry.

        Elements without leading text are skipped, and so is content that
        appears in a document before (or without) a DOCNO, because there is
        no key to store it under.

        Args:
            val: XML source as a string.

        Returns:
            dict: DOCNO -> list of stripped text fragments, in document order.

        Raises:
            xml.parsers.expat.ExpatError: If ``val`` is not well-formed XML.
        """
        root = xdm.parseString(val).documentElement
        docs = {}

        for doc_node in root.childNodes:
            # Reset per document so content can never be attributed to the
            # DOCNO of a previous document.
            docno = None
            for ele in doc_node.childNodes:
                if ele.nodeType != ele.ELEMENT_NODE:
                    continue  # whitespace/text between elements
                if ele.tagName in (date, pro):
                    continue
                text = TextParser._first_text(ele)
                if ele.tagName == DOC:
                    docno = text
                    if docno is not None:
                        docs[docno] = []
                elif docno is not None and text is not None:
                    docs[docno].append(text)
        return docs

    @staticmethod
    def fetchDocs(file):
        """Read ``file`` from the module-level ``path`` folder and parse it.

        The file content is wrapped in one extra <DOC>...</DOC> pair before
        parsing — presumably because an input file holds several sibling
        <DOC> elements and XML needs a single root; confirm against the data.

        Args:
            file: File name relative to ``path``.

        Returns:
            dict: The mapping produced by :meth:`convert`.
        """
        with open(os.path.join(path, file), read_mode) as File:
            doc_data = File.read()
        return TextParser.convert(doc_tag[0] + doc_data + doc_tag[1])



In [9]:
import os
from pathlib import Path
from WordDictionary import WordDictionary
from FileDictionary import FileDictionary
from tokenizer import tokenizer
from DocParser import TextParser
from datetime import datetime

# Log when this cell started running.
print("Running code at:", datetime.now())

# Folder holding the input files.
path = "./ft911/"
# Names of the entries inside `path` (a list of names, not a path).
folder = os.listdir(path=path)

# Name of the file that parser_output() writes to.
parser_file = "parser_output.txt"
# File-mode strings handed to parser_output() by the calls at the end of this cell.
w = 'w'
a = 'a'

def parser_output(data, mode, output_path=None):
    """Write a mapping to the output file, one ``key<TAB>value`` line per entry.

    Entries are written in sorted key order. The file is opened with the
    caller's ``mode``, so ``'w'`` replaces any existing content and ``'a'``
    adds to it; this is what lets a second call append to the first call's
    output instead of overwriting it.

    Errors are reported on stdout instead of being raised, so a failed write
    does not stop the caller.

    Args:
        data: Mapping to dump; its keys must be sortable with one another.
        mode: File mode passed to ``open`` (for example ``'w'`` or ``'a'``).
        output_path: Destination file. Defaults to the module-level
            ``parser_file`` when omitted.
    """
    target = parser_file if output_path is None else output_path
    try:
        with open(target, mode) as out_file:
            for key in sorted(data):
                out_file.write(f"{key}\t{data[key]}\n")
        print("Parser output- Success: Writing data COmpleted!")
    except Exception as e:
        # Best effort by design: report the problem and carry on.
        print("Parser output- Failure: error while uploading data")
        print(e)


# Fresh dictionaries for this run.
WordDict = WordDictionary()
FileDict = FileDictionary(path)

# os.listdir returns names in arbitrary order. IDs are handed out in
# first-seen order, so iterate in sorted order to get the same IDs on every
# run and on every platform.
for file in sorted(folder):
    if not os.path.isfile(os.path.join(path, file)):
        # Sub-directories cannot be opened and parsed as documents.
        continue
    docs = TextParser.fetchDocs(file)
    for docno, data in docs.items():
        FileDict.appendFiles(docno)
        for token in tokenizer(data):
            WordDict.appendWord(token)

# Dump the word IDs requesting write mode, then the document IDs requesting
# append mode.
parser_output(WordDict.fetch_d(), w)
parser_output(FileDict.getAllFiles(), a)


Running code at: 2024-03-07 22:51:17.710970
Parser output- Success: Writing data COmpleted!
Parser output- Success: Writing data COmpleted!
