-
Notifications
You must be signed in to change notification settings - Fork 0
/
main_parser.py
29 lines (26 loc) · 1.16 KB
/
main_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import os

from parsers import PdfParser, PptxParser, DocxParser
class MainParser:
    """Extract text from a file by delegating to a format-specific parser.

    The parser is selected from the file extension:
    ``pdf`` -> ``PdfParser``, ``pptx`` -> ``PptxParser``,
    ``docx`` -> ``DocxParser``.
    """

    def __init__(self, filepath: str):
        """Store the path and derive its normalized extension.

        Args:
            filepath: Path of the file whose text should be extracted.
        """
        self.filepath = filepath
        # os.path.splitext inspects only the final path component, so a dot in
        # a directory name ("./data.v2/README") is not mistaken for an
        # extension; a path without an extension yields an empty string.
        suffix = os.path.splitext(filepath)[1]
        # Drop the leading dot and lower-case it so "Report.PDF" matches "pdf".
        self.extension = suffix[1:].lower()

    def extract_text(self) -> str:
        """Return the result of the matching parser's ``extract_text()``.

        Returns:
            Whatever the selected parser's ``extract_text()`` returns
            (annotated as ``str``).

        Raises:
            ValueError: If the extension is not ``pdf``, ``pptx`` or ``docx``.
        """
        # Parser classes are referenced only inside the branch that needs
        # them, so an unsupported extension fails fast with ValueError.
        if self.extension == 'pdf':
            extractor = PdfParser(self.filepath)
        elif self.extension == 'pptx':
            extractor = PptxParser(self.filepath)
        elif self.extension == 'docx':
            extractor = DocxParser(self.filepath)
        else:
            raise ValueError(f'Invalid file extension: {self.extension}')
        return extractor.extract_text()
# Manual smoke test: run this module directly to print the text of a sample file.
if __name__ == "__main__":
    # Other sample inputs that can be swapped in:
    #   "./data/files/Desires.pdf"
    #   "./data/files/Presentation.pptx"
    sample_path = "./data/files/Lab4.docx"
    print(MainParser(sample_path).extract_text())