From 64dd1955d2c81787653173a07ac4f2afc1e1f376 Mon Sep 17 00:00:00 2001
From: Dmitriy-Xawstov
Date: Wed, 10 Sep 2025 03:31:13 +0300
Subject: [PATCH 1/2] Refactored Parser use cases

---
Adds five stand-alone Parser use cases (form fields to XML / FDF, page
images, tables, text boxes), the shared ParesrHelper / Config module and a
launcher script that runs all of them against Config.PDF_DOCUMENT_NAME.

 Uses-Cases/Parser/get_fdf.py        | 24 +++++++++++
 Uses-Cases/Parser/get_images.py     | 27 ++++++++++++
 Uses-Cases/Parser/get_tables.py     | 31 ++++++++++++++
 Uses-Cases/Parser/get_textboxes.py  | 31 ++++++++++++++
 Uses-Cases/Parser/get_xml.py        | 24 +++++++++++
 Uses-Cases/Parser/paresr_helpers.py | 65 +++++++++++++++++++++++++++++
 Uses-Cases/Parser/parser_launch.py  | 24 +++++++++++
 7 files changed, 226 insertions(+)
 create mode 100644 Uses-Cases/Parser/get_fdf.py
 create mode 100644 Uses-Cases/Parser/get_images.py
 create mode 100644 Uses-Cases/Parser/get_tables.py
 create mode 100644 Uses-Cases/Parser/get_textboxes.py
 create mode 100644 Uses-Cases/Parser/get_xml.py
 create mode 100644 Uses-Cases/Parser/paresr_helpers.py
 create mode 100644 Uses-Cases/Parser/parser_launch.py

diff --git a/Uses-Cases/Parser/get_fdf.py b/Uses-Cases/Parser/get_fdf.py
new file mode 100644
index 0000000..c071c88
--- /dev/null
+++ b/Uses-Cases/Parser/get_fdf.py
@@ -0,0 +1,24 @@
+from paresr_helpers import ParesrHelper
+from pathlib import Path
+import logging
+
+class ExportFormToFDF:
+    """Class for extracting PDF form fields into FDF using Aspose PDF Cloud API."""
+    def __init__(self, helper: ParesrHelper):
+        self.helper = helper
+
+    def Extract(self, documentName: str, outputFDFName: str, localFolder: Path, remoteFolder: str ):
+        """Upload the PDF, export its form fields to FDF in storage and, on success, download the FDF file."""
+        self.helper.upload_document(documentName, remoteFolder)
+
+        # Join with '/' like ParesrHelper.downloadFile does; Path would emit '\' on Windows.
+        fdfPath = remoteFolder + '/' + outputFDFName
+        opts = {
+            "folder": remoteFolder
+        }
+        response = self.helper.pdf_api.put_export_fields_from_pdf_to_fdf_in_storage(documentName, fdfPath, **opts)
+        if response.code != 200:
+            logging.error("ExportFormToFDF(): Unexpected error!")
+        else:
+            logging.info(f"ExportFormToFDF(): Pdf document '{documentName}' form fields successfully exported to '{outputFDFName}' file.")
+            self.helper.downloadFile(outputFDFName, outputFDFName, localFolder, remoteFolder, "")
\ No newline at end of file
diff --git a/Uses-Cases/Parser/get_images.py b/Uses-Cases/Parser/get_images.py
new file mode 100644
index 0000000..42b4f06
--- /dev/null
+++ b/Uses-Cases/Parser/get_images.py
@@ -0,0 +1,27 @@
+from paresr_helpers import ParesrHelper
+from pathlib import Path
+import shutil
+import logging
+
+class GetImages:
+    """Class for extracting images from PDF document page using Aspose PDF Cloud API."""
+    def __init__(self, helper: ParesrHelper):
+        self.helper = helper
+
+    def Extract(self, documentName: str, pageNumber: int, localFolder: Path, remoteFolder: str):
+        """Upload the PDF and save each image of page `pageNumber` as '<image id>.png' in localFolder."""
+        self.helper.upload_document(documentName, remoteFolder)
+
+        opts = {
+            "folder": remoteFolder
+        }
+        respImages = self.helper.pdf_api.get_images(documentName, pageNumber, **opts)
+        if respImages.code != 200:
+            logging.error("GetImages(): Unexpected error!")
+        else:
+            for img in respImages.images.list:
+                response = self.helper.pdf_api.get_image_extract_as_png(documentName, img.id, **opts)
+
+                logging.info(f"GetImages(): Images '{img.id}' successfully extracted from the document '{documentName}'.")
+                local_path = localFolder / ( img.id + '.png' )
+                shutil.move(response, str(local_path))
\ No newline at end of file
diff --git a/Uses-Cases/Parser/get_tables.py b/Uses-Cases/Parser/get_tables.py
new file mode 100644
index 0000000..7623099
--- /dev/null
+++ b/Uses-Cases/Parser/get_tables.py
@@ -0,0 +1,31 @@
+from paresr_helpers import ParesrHelper
+from pathlib import Path
+import json
+import logging
+
+class GetTables:
+    """Class for extracting tables from PDF document using Aspose PDF Cloud API."""
+    def __init__(self, helper: ParesrHelper):
+        self.helper = helper
+
+    def Extract(self, documentName: str, localFolder: Path, remoteFolder: str):
+        """Upload the PDF and write one JSON record per table to 'tables_objects.json' in localFolder."""
+        self.helper.upload_document(documentName, remoteFolder)
+
+        opts = {
+            "folder": remoteFolder
+        }
+        respTables = self.helper.pdf_api.get_document_tables(documentName, **opts)
+        if respTables.code != 200:
+            logging.error("GetTables(): Unexpected error!")
+        else:
+            localJson = Path.joinpath(localFolder, "tables_objects.json")
+            with open(str(localJson), "w", encoding="utf-8") as localFile:
+                for tab in respTables.tables.list:
+                    response = self.helper.pdf_api.get_table(documentName, tab.id, **opts)
+                    if response.code != 200:
+                        logging.error("GetTables(): Unexpected error!")
+                    else:
+                        logging.info(f"GetTables(): Table '{tab.id}' successfully extracted from the document '{documentName}'.")
+                        json.dump(tab, localFile, ensure_ascii=False,default=str)
+                        localFile.write("\n*********************\n")
\ No newline at end of file
diff --git a/Uses-Cases/Parser/get_textboxes.py b/Uses-Cases/Parser/get_textboxes.py
new file mode 100644
index 0000000..ffab545
--- /dev/null
+++ b/Uses-Cases/Parser/get_textboxes.py
@@ -0,0 +1,31 @@
+from paresr_helpers import ParesrHelper
+from pathlib import Path
+import json
+import logging
+
+class GetTextBoxes:
+    """Class for extracting text boxes from PDF document using Aspose PDF Cloud API."""
+    def __init__(self, helper: ParesrHelper):
+        self.helper = helper
+
+    def Extract(self, documentName: str, localFolder: Path, remoteFolder: str):
+        """Upload the PDF and write one JSON record per text box field to 'text_box_objects.json' in localFolder."""
+        self.helper.upload_document(documentName, remoteFolder)
+
+        opts = {
+            "folder": remoteFolder
+        }
+        respTextBoxes = self.helper.pdf_api.get_document_text_box_fields(documentName, **opts)
+        if respTextBoxes.code != 200:
+            logging.error("GetTextBoxes(): Unexpected error!")
+        else:
+            localJson = Path.joinpath(localFolder, "text_box_objects.json")
+            with open(str(localJson), "w", encoding="utf-8") as localFile:
+                for textBox in respTextBoxes.fields.list:
+                    response = self.helper.pdf_api.get_text_box_field(documentName, textBox.full_name, **opts)
+                    if response.code != 200:
+                        logging.error("GetTextBoxes(): Unexpected error!")
+                    else:
+                        logging.info(f"GetTextBoxes(): TextBox field '{textBox.full_name}' successfully extracted from the document '{documentName}'.")
+                        json.dump(textBox, localFile, ensure_ascii=False,default=str)
+                        localFile.write("\n*********************\n")
\ No newline at end of file
diff --git a/Uses-Cases/Parser/get_xml.py b/Uses-Cases/Parser/get_xml.py
new file mode 100644
index 0000000..99fe690
--- /dev/null
+++ b/Uses-Cases/Parser/get_xml.py
@@ -0,0 +1,24 @@
+from paresr_helpers import ParesrHelper, Config
+from pathlib import Path
+import logging
+
+class ExportFormToXXML:
+    """Class for extracting PDF form fields into XML using Aspose PDF Cloud API."""
+    def __init__(self, helper: ParesrHelper):
+        self.helper = helper
+
+    def Extract(self, documentName: str, outputXMLName: str, localFolder: Path, remoteFolder: str ):
+        """Upload the PDF, export its form fields to XML in storage and, on success, download the XML file."""
+        self.helper.upload_document(documentName, remoteFolder)
+
+        # Join with '/' like ParesrHelper.downloadFile does; Path would emit '\' on Windows.
+        xmlPath = remoteFolder + '/' + outputXMLName
+        opts = {
+            "folder": remoteFolder
+        }
+        response = self.helper.pdf_api.put_export_fields_from_pdf_to_xml_in_storage(documentName, xmlPath, **opts)
+        if response.code != 200:
+            logging.error("ExportFormToXML(): Unexpected error!")
+        else:
+            logging.info(f"ExportFormToXML(): Pdf document '{documentName}' form fields successfully exported to '{outputXMLName}' file.")
+            self.helper.downloadFile(outputXMLName, outputXMLName, localFolder, remoteFolder, "")
\ No newline at end of file
diff --git a/Uses-Cases/Parser/paresr_helpers.py b/Uses-Cases/Parser/paresr_helpers.py
new file mode 100644
index 0000000..4fe870d
--- /dev/null
+++ b/Uses-Cases/Parser/paresr_helpers.py
@@ -0,0 +1,65 @@
+import shutil
+import json
+import logging
+from pathlib import Path
+from asposepdfcloud import ApiClient, PdfApi
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+
+
+class Config:
+    """Configuration parameters."""
+    CREDENTIALS_FILE = Path(r"C:\\Projects\\ASPOSE\\Pdf.Cloud\\Credentials\\credentials.json")
+    LOCAL_FOLDER = Path(r"C:\Samples")
+    REMOTE_TEMP_FOLDER = "TempPdfCloud"
+    PDF_DOCUMENT_NAME = "sample.pdf"
+    XML_OUTPUT_FILE = "output_sample.xml"
+    FDF_OUTPUT_FILE = "output_sample.fdf"
+    LOCAL_RESULT_DOCUMENT_NAME = "output_sample.pdf"
+    PAGE_NUMBER = 1
+
+
+class ParesrHelper:
+    """Class with helper methods and properties for Parser"""
+
+    def __init__(self, credentials_file: Path = Config.CREDENTIALS_FILE):
+        self.pdf_api = None
+        self._init_api(credentials_file)
+
+    def _init_api(self, credentials_file: Path):
+        """Initialize the API client."""
+        try:
+            with credentials_file.open("r", encoding="utf-8") as file:
+                credentials = json.load(file)
+                api_key, app_id = credentials.get("key"), credentials.get("id")
+                if not api_key or not app_id:
+                    raise ValueError("Error: Missing API keys in the credentials file.")
+                self.pdf_api = PdfApi(ApiClient(api_key, app_id))
+        except (FileNotFoundError, json.JSONDecodeError, ValueError) as e:
+            logging.error(f"Failed to load credentials: {e}")
+
+    def upload_document(self, documentName: str, remoteFolder: str):
+        """Upload a PDF document to the Aspose Cloud server."""
+        if self.pdf_api:
+            file_path = Config.LOCAL_FOLDER / documentName
+            try:
+                if remoteFolder is None:
+                    self.pdf_api.upload_file(documentName, str(file_path))
+                else:
+                    remote_path = remoteFolder + '/' + documentName
+                    self.pdf_api.upload_file(remote_path, str(file_path))
+                logging.info(f"File {documentName} uploaded successfully.")
+            except Exception as e:
+                logging.error(f"Failed to upload file: {e}")
+
+    def downloadFile(self, document: str, outputDocument: str, localFolder: Path, remoteFolder: str, output_prefix: str):
+        """Download the processed PDF document from the Aspose Cloud server."""
+        if self.pdf_api:
+            try:
+                temp_file = self.pdf_api.download_file(remoteFolder + '/' + document)
+                local_path = localFolder / ( output_prefix + outputDocument )
+                shutil.move(temp_file, str(local_path))
+                logging.info(f"download_result(): File successfully downloaded: {local_path}")
+            except Exception as e:
+                logging.error(f"download_result(): Failed to download file: {e}")
diff --git a/Uses-Cases/Parser/parser_launch.py b/Uses-Cases/Parser/parser_launch.py
new file mode 100644
index 0000000..5b0ec4f
--- /dev/null
+++ b/Uses-Cases/Parser/parser_launch.py
@@ -0,0 +1,24 @@
+from paresr_helpers import ParesrHelper, Config
+from get_xml import ExportFormToXXML
+from get_fdf import ExportFormToFDF
+from get_images import GetImages
+from get_tables import GetTables
+from get_textboxes import GetTextBoxes
+
+if __name__ == "__main__":
+    helper = ParesrHelper(Config.CREDENTIALS_FILE)
+
+    xmlExtractor = ExportFormToXXML(helper)
+    xmlExtractor.Extract(Config.PDF_DOCUMENT_NAME, Config.XML_OUTPUT_FILE, Config.LOCAL_FOLDER, Config.REMOTE_TEMP_FOLDER)
+
+    fdfExtractor = ExportFormToFDF(helper)
+    fdfExtractor.Extract(Config.PDF_DOCUMENT_NAME, Config.FDF_OUTPUT_FILE, Config.LOCAL_FOLDER, Config.REMOTE_TEMP_FOLDER)
+
+    getImages = GetImages(helper)
+    getImages.Extract(Config.PDF_DOCUMENT_NAME, Config.PAGE_NUMBER, Config.LOCAL_FOLDER, Config.REMOTE_TEMP_FOLDER)
+
+    getTables = GetTables(helper)
+    getTables.Extract(Config.PDF_DOCUMENT_NAME, Config.LOCAL_FOLDER, Config.REMOTE_TEMP_FOLDER)
+
+    getTextBoxes = GetTextBoxes(helper)
+    getTextBoxes.Extract(Config.PDF_DOCUMENT_NAME, Config.LOCAL_FOLDER, Config.REMOTE_TEMP_FOLDER)
\ No newline at end of file

From bca5a049748a9c0723b41fbe95c7327b8dd033d1 Mon Sep 17 00:00:00 2001
From: Dmitriy-Xawstov
Date: Wed, 10 Sep 2025 03:33:08 +0300
Subject: [PATCH 2/2] Update paresr_helpers.py

---
 Uses-Cases/Parser/paresr_helpers.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Uses-Cases/Parser/paresr_helpers.py b/Uses-Cases/Parser/paresr_helpers.py
index 4fe870d..9f14820 100644
--- a/Uses-Cases/Parser/paresr_helpers.py
+++ b/Uses-Cases/Parser/paresr_helpers.py
@@ -10,7 +10,7 @@
 
 class Config:
     """Configuration parameters."""
-    CREDENTIALS_FILE = Path(r"C:\\Projects\\ASPOSE\\Pdf.Cloud\\Credentials\\credentials.json")
+    CREDENTIALS_FILE = Path(r"..s\\credentials.json")
     LOCAL_FOLDER = Path(r"C:\Samples")
     REMOTE_TEMP_FOLDER = "TempPdfCloud"
     PDF_DOCUMENT_NAME = "sample.pdf"
@@ -63,3 +63,4 @@ def downloadFile(self, document: str, outputDocument: str, localFolder: Path, re
                 logging.info(f"download_result(): File successfully downloaded: {local_path}")
             except Exception as e:
                 logging.error(f"download_result(): Failed to download file: {e}")
+