Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions Uses-Cases/Parser/get_fdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from paresr_helpers import ParesrHelper
from pathlib import Path
import logging

class ExportFormToFDF:
    """Class for extracting PDF form fields into FDF using Aspose PDF Cloud API.

    All storage transfers are delegated to the helper's ``upload_document``
    and ``downloadFile`` methods; the export itself goes through
    ``helper.pdf_api``.
    """

    def __init__(self, helper: ParesrHelper):
        """Keep a reference to the helper that holds the API client."""
        self.helper = helper

    def Extract(self, documentName: str, outputFDFName: str, localFolder: Path, remoteFolder: str):
        """Upload the document, export its form fields to FDF and download the result.

        Args:
            documentName: Name of the PDF document; handed to the helper's
                ``upload_document`` and to the export call.
            outputFDFName: File name of the FDF file created inside
                ``remoteFolder`` and afterwards downloaded to ``localFolder``.
            localFolder: Local directory that receives the downloaded file.
            remoteFolder: Storage folder used for both the document and the
                FDF output.

        Returns:
            None. The outcome is reported through ``logging`` only.
        """
        if self.helper.pdf_api is None:
            # The helper leaves ``pdf_api`` as None when the credentials could
            # not be loaded; bail out instead of failing on ``None.<method>``.
            logging.error("ExportFormToFDF(): API client is not initialized.")
            return

        self.helper.upload_document(documentName, remoteFolder)

        # Join with '/' explicitly: pathlib would emit '\\' on Windows, and the
        # helper builds its storage paths with '/' as well.
        fdfPath = f"{remoteFolder}/{outputFDFName}"
        opts = {"folder": remoteFolder}
        response = self.helper.pdf_api.put_export_fields_from_pdf_to_fdf_in_storage(
            documentName, fdfPath, **opts
        )
        if response.code != 200:
            logging.error("ExportFormToFDF(): Unexpected error!")
            return

        logging.info(f"ExportFormToFDF(): Pdf document '{documentName}' form fields successfully exported to '{outputFDFName}' file.")
        self.helper.downloadFile(outputFDFName, outputFDFName, localFolder, remoteFolder, "")
26 changes: 26 additions & 0 deletions Uses-Cases/Parser/get_images.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from paresr_helpers import ParesrHelper
from pathlib import Path
import shutil
import logging

class GetImages:
    """Class for extracting images from PDF document page using Aspose PDF Cloud API."""

    def __init__(self, helper: ParesrHelper):
        """Keep a reference to the helper that holds the API client."""
        self.helper = helper

    def Extract(self, documentName: str, pageNumber: int, localFolder: Path, remoteFolder: str):
        """Upload the document and save every image of one page as ``<id>.png``.

        Args:
            documentName: Name of the PDF document; handed to the helper's
                ``upload_document`` and to the API calls.
            pageNumber: Page whose images are listed via ``get_images``.
            localFolder: Local directory that receives the ``.png`` files.
            remoteFolder: Storage folder passed to the API as ``folder``.

        Returns:
            None. The outcome is reported through ``logging`` only.
        """
        if self.helper.pdf_api is None:
            # The helper leaves ``pdf_api`` as None when the credentials could
            # not be loaded; bail out instead of failing on ``None.<method>``.
            logging.error("GetImages(): API client is not initialized.")
            return

        self.helper.upload_document(documentName, remoteFolder)

        opts = {"folder": remoteFolder}
        respImages = self.helper.pdf_api.get_images(documentName, pageNumber, **opts)
        if respImages.code != 200:
            logging.error("GetImages(): Unexpected error!")
            return

        # ``or []`` keeps a page without images (list is None) from raising.
        for img in respImages.images.list or []:
            # The returned value is treated as the path of a local file and
            # moved into ``localFolder`` below.
            response = self.helper.pdf_api.get_image_extract_as_png(documentName, img.id, **opts)

            logging.info(f"GetImages(): Images '{img.id}' successfully extracted from the document '{documentName}'.")
            local_path = localFolder / (img.id + '.png')
            shutil.move(response, str(local_path))
30 changes: 30 additions & 0 deletions Uses-Cases/Parser/get_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from paresr_helpers import ParesrHelper
from pathlib import Path
import json
import logging

class GetTables:
    """Class for extracting tables from PDF document using Aspose PDF Cloud API."""

    def __init__(self, helper: ParesrHelper):
        """Keep a reference to the helper that holds the API client."""
        self.helper = helper

    def Extract(self, documentName: str, localFolder: Path, remoteFolder: str):
        """Upload the document and write its tables to ``tables_objects.json``.

        Each table whose ``get_table`` call returns code 200 is serialized on
        its own, followed by a line of asterisks, so the output file is a
        sequence of JSON values rather than one JSON document.

        Args:
            documentName: Name of the PDF document; handed to the helper's
                ``upload_document`` and to the API calls.
            localFolder: Local directory in which ``tables_objects.json`` is
                created (overwritten if present).
            remoteFolder: Storage folder passed to the API as ``folder``.

        Returns:
            None. The outcome is reported through ``logging`` only.
        """
        if self.helper.pdf_api is None:
            # The helper leaves ``pdf_api`` as None when the credentials could
            # not be loaded; bail out instead of failing on ``None.<method>``.
            logging.error("GetTables(): API client is not initialized.")
            return

        self.helper.upload_document(documentName, remoteFolder)

        opts = {"folder": remoteFolder}
        respTables = self.helper.pdf_api.get_document_tables(documentName, **opts)
        if respTables.code != 200:
            logging.error("GetTables(): Unexpected error!")
            return

        localJson = localFolder / "tables_objects.json"
        with localJson.open("w", encoding="utf-8") as localFile:
            # ``or []`` keeps a document without tables (list is None) from raising.
            for tab in respTables.tables.list or []:
                response = self.helper.pdf_api.get_table(documentName, tab.id, **opts)
                if response.code != 200:
                    logging.error("GetTables(): Unexpected error!")
                    continue

                logging.info(f"GetTables(): Table '{tab.id}' successfully extracted from the document '{documentName}'.")
                # NOTE(review): only the status of ``response`` is used; the
                # entry from the listing is what gets written. Confirm whether
                # the per-table response payload was meant to be dumped instead.
                # ``default=str`` stringifies objects json cannot encode.
                json.dump(tab, localFile, ensure_ascii=False, default=str)
                localFile.write("\n*********************\n")
30 changes: 30 additions & 0 deletions Uses-Cases/Parser/get_textboxes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from paresr_helpers import ParesrHelper
from pathlib import Path
import json
import logging

class GetTextBoxes:
    """Class for extracting text boxes from PDF document using Aspose PDF Cloud API."""

    def __init__(self, helper: ParesrHelper):
        """Keep a reference to the helper that holds the API client."""
        self.helper = helper

    def Extract(self, documentName: str, localFolder: Path, remoteFolder: str):
        """Upload the document and write its text box fields to ``text_box_objects.json``.

        Each field whose ``get_text_box_field`` call returns code 200 is
        serialized on its own, followed by a line of asterisks, so the output
        file is a sequence of JSON values rather than one JSON document.

        Args:
            documentName: Name of the PDF document; handed to the helper's
                ``upload_document`` and to the API calls.
            localFolder: Local directory in which ``text_box_objects.json`` is
                created (overwritten if present).
            remoteFolder: Storage folder passed to the API as ``folder``.

        Returns:
            None. The outcome is reported through ``logging`` only.
        """
        if self.helper.pdf_api is None:
            # The helper leaves ``pdf_api`` as None when the credentials could
            # not be loaded; bail out instead of failing on ``None.<method>``.
            logging.error("GetTextBoxes(): API client is not initialized.")
            return

        self.helper.upload_document(documentName, remoteFolder)

        opts = {"folder": remoteFolder}
        respTextBoxes = self.helper.pdf_api.get_document_text_box_fields(documentName, **opts)
        if respTextBoxes.code != 200:
            logging.error("GetTextBoxes(): Unexpected error!")
            return

        localJson = localFolder / "text_box_objects.json"
        with localJson.open("w", encoding="utf-8") as localFile:
            # ``or []`` keeps a document without text boxes (list is None) from raising.
            for textBox in respTextBoxes.fields.list or []:
                response = self.helper.pdf_api.get_text_box_field(documentName, textBox.full_name, **opts)
                if response.code != 200:
                    logging.error("GetTextBoxes(): Unexpected error!")
                    continue

                logging.info(f"GetTextBoxes(): TextBox field '{textBox.full_name}' successfully extracted from the document '{documentName}'.")
                # NOTE(review): only the status of ``response`` is used; the
                # entry from the listing is what gets written. Confirm whether
                # the per-field response payload was meant to be dumped instead.
                # ``default=str`` stringifies objects json cannot encode.
                json.dump(textBox, localFile, ensure_ascii=False, default=str)
                localFile.write("\n*********************\n")
22 changes: 22 additions & 0 deletions Uses-Cases/Parser/get_xml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from paresr_helpers import ParesrHelper, Config
from pathlib import Path
import logging

class ExportFormToXXML:
    """Class for extracting PDF form fields into XML using Aspose PDF Cloud API.

    All storage transfers are delegated to the helper's ``upload_document``
    and ``downloadFile`` methods; the export itself goes through
    ``helper.pdf_api``.
    """

    def __init__(self, helper: ParesrHelper):
        """Keep a reference to the helper that holds the API client."""
        self.helper = helper

    def Extract(self, documentName: str, outputXMLName: str, localFolder: Path, remoteFolder: str):
        """Upload the document, export its form fields to XML and download the result.

        Args:
            documentName: Name of the PDF document; handed to the helper's
                ``upload_document`` and to the export call.
            outputXMLName: File name of the XML file created inside
                ``remoteFolder`` and afterwards downloaded to ``localFolder``.
            localFolder: Local directory that receives the downloaded file.
            remoteFolder: Storage folder used for both the document and the
                XML output.

        Returns:
            None. The outcome is reported through ``logging`` only.
        """
        if self.helper.pdf_api is None:
            # The helper leaves ``pdf_api`` as None when the credentials could
            # not be loaded; bail out instead of failing on ``None.<method>``.
            logging.error("ExportFormToXML(): API client is not initialized.")
            return

        self.helper.upload_document(documentName, remoteFolder)

        # Join with '/' explicitly: pathlib would emit '\\' on Windows, and the
        # helper builds its storage paths with '/' as well.
        xmlPath = f"{remoteFolder}/{outputXMLName}"
        opts = {"folder": remoteFolder}
        response = self.helper.pdf_api.put_export_fields_from_pdf_to_xml_in_storage(
            documentName, xmlPath, **opts
        )
        if response.code != 200:
            logging.error("ExportFormToXML(): Unexpected error!")
            return

        logging.info(f"ExportFormToXML(): Pdf document '{documentName}' form fields successfully exported to '{outputXMLName}' file.")
        self.helper.downloadFile(outputXMLName, outputXMLName, localFolder, remoteFolder, "")
66 changes: 66 additions & 0 deletions Uses-Cases/Parser/paresr_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import shutil
import json
import logging
from pathlib import Path
from asposepdfcloud import ApiClient, PdfApi

# Root logger setup: INFO threshold, each record prefixed with time and level.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)


class Config:
    """Constants shared by the helper and the launcher script."""

    # --- Local side -------------------------------------------------------
    # NOTE(review): the "..s" segment and the doubled backslash look
    # unintended for a raw string; confirm the real credentials location.
    CREDENTIALS_FILE = Path(r"..s\\credentials.json")
    # Directory the helper reads the document from when uploading.
    LOCAL_FOLDER = Path(r"C:\Samples")

    # --- Remote side ------------------------------------------------------
    REMOTE_TEMP_FOLDER = "TempPdfCloud"

    # --- Document and output names ---------------------------------------
    PDF_DOCUMENT_NAME = "sample.pdf"
    XML_OUTPUT_FILE = "output_sample.xml"
    FDF_OUTPUT_FILE = "output_sample.fdf"
    LOCAL_RESULT_DOCUMENT_NAME = "output_sample.pdf"

    # Page handed to the image extractor by the launcher.
    PAGE_NUMBER = 1


class ParesrHelper:
    """Class with helper methods and properties for Parser.

    Attributes:
        pdf_api: The ``PdfApi`` client, or ``None`` when the credentials
            could not be loaded. Callers must check for ``None`` before use.
    """

    def __init__(self, credentials_file: Path = Config.CREDENTIALS_FILE):
        """Create the helper and try to initialize the API client.

        Args:
            credentials_file: JSON file providing the ``"key"`` and ``"id"``
                entries used to build the client.
        """
        self.pdf_api = None
        self._init_api(credentials_file)

    def _init_api(self, credentials_file: Path):
        """Initialize the API client.

        Failures to read or validate the credentials are logged, not raised;
        ``pdf_api`` then stays ``None``.
        """
        try:
            with credentials_file.open("r", encoding="utf-8") as file:
                credentials = json.load(file)
            if not isinstance(credentials, dict):
                # Valid JSON that is not an object has no ``.get``; report it
                # like any other credentials problem.
                raise ValueError("Error: Credentials file must contain a JSON object.")
            api_key, app_id = credentials.get("key"), credentials.get("id")
            if not api_key or not app_id:
                raise ValueError("Error: Missing API keys in the credentials file.")
            self.pdf_api = PdfApi(ApiClient(api_key, app_id))
        except (FileNotFoundError, json.JSONDecodeError, ValueError) as e:
            logging.error(f"Failed to load credentials: {e}")

    def upload_document(self, documentName: str, remoteFolder: str):
        """Upload a PDF document to the Aspose Cloud server.

        Args:
            documentName: File name, resolved against ``Config.LOCAL_FOLDER``.
            remoteFolder: Destination storage folder, or ``None`` to upload
                under ``documentName`` alone.

        Does nothing when the API client is not initialized. Upload errors
        are logged, not raised.
        """
        if not self.pdf_api:
            return

        file_path = Config.LOCAL_FOLDER / documentName
        remote_path = documentName if remoteFolder is None else f"{remoteFolder}/{documentName}"
        try:
            # The local file is always passed as ``str`` so both cases hand
            # the client the same argument type.
            self.pdf_api.upload_file(remote_path, str(file_path))
            logging.info(f"File {documentName} uploaded successfully.")
        except Exception as e:
            logging.error(f"Failed to upload file: {e}")

    def downloadFile(self, document: str, outputDocument: str, localFolder: Path, remoteFolder: str, output_prefix: str):
        """Download the processed PDF document from the Aspose Cloud server.

        Args:
            document: Name of the file in storage.
            outputDocument: Local file name (before ``output_prefix`` is added).
            localFolder: Local directory that receives the file.
            remoteFolder: Storage folder holding ``document``, or ``None`` to
                download ``document`` by name alone.
            output_prefix: Text prepended to ``outputDocument``.

        Does nothing when the API client is not initialized. Download errors
        are logged, not raised.
        """
        if not self.pdf_api:
            return

        # Mirror ``upload_document``: a missing folder means "no folder prefix".
        remote_path = document if remoteFolder is None else f"{remoteFolder}/{document}"
        try:
            temp_file = self.pdf_api.download_file(remote_path)
            local_path = localFolder / (output_prefix + outputDocument)
            shutil.move(temp_file, str(local_path))
            logging.info(f"download_result(): File successfully downloaded: {local_path}")
        except Exception as e:
            logging.error(f"download_result(): Failed to download file: {e}")

24 changes: 24 additions & 0 deletions Uses-Cases/Parser/parser_launch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from paresr_helpers import ParesrHelper, Config
from get_xml import ExportFormToXXML
from get_fdf import ExportFormToFDF
from get_images import GetImages
from get_tables import GetTables
from get_textboxes import GetTextBoxes

if __name__ == "__main__":
    # A single helper is shared by all extractors below.
    helper = ParesrHelper(Config.CREDENTIALS_FILE)

    document = Config.PDF_DOCUMENT_NAME
    local_dir = Config.LOCAL_FOLDER
    remote_dir = Config.REMOTE_TEMP_FOLDER

    # Form fields -> XML, then form fields -> FDF.
    ExportFormToXXML(helper).Extract(document, Config.XML_OUTPUT_FILE, local_dir, remote_dir)
    ExportFormToFDF(helper).Extract(document, Config.FDF_OUTPUT_FILE, local_dir, remote_dir)

    # Images of the configured page.
    GetImages(helper).Extract(document, Config.PAGE_NUMBER, local_dir, remote_dir)

    # Tables and text boxes of the whole document.
    GetTables(helper).Extract(document, local_dir, remote_dir)
    GetTextBoxes(helper).Extract(document, local_dir, remote_dir)