diff --git a/pyproject.toml b/pyproject.toml index 3b8fee7..5ef3b1f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "pdm.backend" [project] name = "unstract-adapters" -version = "0.2.1" +version = "0.2.2" description = "Unstract Adapters" authors = [ {name = "Zipstack Inc.", email = "devsupport@zipstack.com"}, diff --git a/src/unstract/adapters/adapterkit.py b/src/unstract/adapters/adapterkit.py index bb0f303..ff64729 100644 --- a/src/unstract/adapters/adapterkit.py +++ b/src/unstract/adapters/adapterkit.py @@ -6,6 +6,7 @@ from unstract.adapters.constants import Common from unstract.adapters.embedding import adapters as embedding_adapters from unstract.adapters.llm import adapters as llm_adapters +from unstract.adapters.ocr import adapters as ocr_adapters from unstract.adapters.vectordb import adapters as vectordb_adapters from unstract.adapters.x2text import adapters as x2text_adapters @@ -19,6 +20,7 @@ def __init__(self) -> None: | llm_adapters | vectordb_adapters | x2text_adapters + | ocr_adapters ) @property diff --git a/src/unstract/adapters/embedding/register.py b/src/unstract/adapters/embedding/register.py index 74a9328..54c8adb 100644 --- a/src/unstract/adapters/embedding/register.py +++ b/src/unstract/adapters/embedding/register.py @@ -3,29 +3,36 @@ from importlib import import_module from typing import Any -from unstract.adapters.registry import AdapterRegistry from unstract.adapters.constants import Common from unstract.adapters.embedding.embedding_adapter import EmbeddingAdapter +from unstract.adapters.registry import AdapterRegistry logger = logging.getLogger(__name__) -class EmbeddingRegistry(AdapterRegistry): +class EmbeddingRegistry(AdapterRegistry): @staticmethod def register_adapters(adapters: dict[str, Any]) -> None: current_directory = os.path.dirname(os.path.abspath(__file__)) package = "unstract.adapters.embedding" for adapter in os.listdir(current_directory): - adapter_path = os.path.join(current_directory, 
adapter, Common.SRC_FOLDER) - # Check if the item is a directory and not a special directory like __pycache__ + adapter_path = os.path.join( + current_directory, adapter, Common.SRC_FOLDER + ) + # Check if the item is a directory and not + # a special directory like __pycache__ if os.path.isdir(adapter_path) and not adapter.startswith("__"): - EmbeddingRegistry.__build_adapter_list(adapter, package, adapters) + EmbeddingRegistry._build_adapter_list( + adapter, package, adapters + ) if len(adapters) == 0: logger.warning("No embedding adapter found.") @staticmethod - def __build_adapter_list(adapter: str, package: str, adapters: dict[str, Any]) -> None: + def _build_adapter_list( + adapter: str, package: str, adapters: dict[str, Any] + ) -> None: try: full_module_path = f"{package}.{adapter}.{Common.SRC_FOLDER}" module = import_module(full_module_path) @@ -41,5 +48,6 @@ def __build_adapter_list(adapter: str, package: str, adapters: dict[str, Any]) - Common.METADATA: metadata, } except ModuleNotFoundError as exception: - logger.error(f"Error while importing embedding adapters : {exception}") - + logger.error( + f"Error while importing embedding adapters : {exception}" + ) diff --git a/src/unstract/adapters/enums.py b/src/unstract/adapters/enums.py index 1158e0c..71a8896 100644 --- a/src/unstract/adapters/enums.py +++ b/src/unstract/adapters/enums.py @@ -6,4 +6,5 @@ class AdapterTypes(Enum): LLM = "LLM" EMBEDDING = "EMBEDDING" VECTOR_DB = "VECTOR_DB" + OCR = "OCR" X2TEXT = "X2TEXT" diff --git a/src/unstract/adapters/llm/register.py b/src/unstract/adapters/llm/register.py index 4e75593..9137ae6 100644 --- a/src/unstract/adapters/llm/register.py +++ b/src/unstract/adapters/llm/register.py @@ -23,12 +23,12 @@ def register_adapters(adapters: dict[str, Any]) -> None: # Check if the item is a directory and not a # special directory like _pycache__ if os.path.isdir(adapter_path) and not adapter.startswith("__"): - LLMRegistry.__build_adapter_list(adapter, package, adapters) + 
class FileType:
    """MIME type string constants used by the OCR adapters.

    ``ALLOWED_TYPES`` lists the image and PDF MIME types; ``TEXT_PLAIN``
    is defined here but is not a member of ``ALLOWED_TYPES``.
    """

    # Plain text: intentionally absent from ALLOWED_TYPES below.
    TEXT_PLAIN = "text/plain"

    # Raster image formats.
    IMAGE_JPEG = "image/jpeg"
    IMAGE_PNG = "image/png"
    IMAGE_TIFF = "image/tiff"
    IMAGE_BMP = "image/bmp"
    IMAGE_GIF = "image/gif"
    IMAGE_WEBP = "image/webp"

    # Document formats.
    APPLICATION_PDF = "application/pdf"

    # Order is kept as originally declared: images first, then PDF.
    ALLOWED_TYPES = [
        IMAGE_JPEG, IMAGE_PNG, IMAGE_TIFF,
        IMAGE_BMP, IMAGE_GIF, IMAGE_WEBP,
        APPLICATION_PDF,
    ]
class GoogleDocumentAIKey:
    """Field names used in the Google Document AI request body."""

    RAW_DOCUMENT = "rawDocument"
    MIME_TYPE = "mimeType"
    CONTENT = "content"
    SKIP_HUMAN_REVIEW = "skipHumanReview"
    FIELD_MASK = "fieldMask"


class Constants:
    """Keys read from the adapter settings and the OAuth scopes requested."""

    URL = "url"
    CREDENTIALS = "credentials"
    CREDENTIAL_SCOPES = ["https://www.googleapis.com/auth/cloud-platform"]


class GoogleDocumentAI(OCRAdapter):
    """OCR adapter that sends documents to a Google Document AI processor."""

    def __init__(self, settings: dict[str, Any]):
        """Store the settings and parse the service account JSON, if present.

        Args:
            settings: Adapter configuration. ``url`` and ``credentials``
                (a JSON string) are the keys read by this class.
        """
        super().__init__("GoogleDocumentAI")
        self.config = settings
        # Always define the attribute so that a missing credential surfaces
        # later as a clear AdapterError instead of an AttributeError.
        self.google_service_account: Optional[dict[str, Any]] = None
        google_service_account = self.config.get(Constants.CREDENTIALS)
        if not google_service_account:
            logger.error("Google service account not found")
        else:
            self.google_service_account = json.loads(google_service_account)

    @staticmethod
    def get_id() -> str:
        """Return the unique identifier of this adapter."""
        return "googledocumentai|1013f64b-ecc9-4e35-b986-aebd60fb55d7"

    @staticmethod
    def get_name() -> str:
        """Return the adapter name."""
        return "GoogleDocumentAI"

    @staticmethod
    def get_description() -> str:
        """Return a short description of the adapter."""
        return "Google Document AI OCR"

    @staticmethod
    def get_icon() -> str:
        """Return the URL of the adapter icon."""
        return (
            "https://storage.googleapis.com/pandora-static/"
            "adapter-icons/GoogleDocumentAI.png"
        )

    @staticmethod
    def get_json_schema() -> str:
        """Return the contents of ``static/json_schema.json`` as a string."""
        schema_path = os.path.join(
            os.path.dirname(__file__), "static", "json_schema.json"
        )
        # The context manager closes the handle even if read() raises.
        with open(schema_path, encoding="utf-8") as f:
            return f.read()

    def _get_request_body(
        self, file_type_mime: str, file_content_in_bytes: bytes
    ) -> dict[str, Any]:
        """Construct the request body sent to the Google Document AI server.

        Args:
            file_type_mime: MIME type of the document.
            file_content_in_bytes: Raw document bytes; sent base64 encoded.

        Returns:
            The JSON-serialisable request payload.
        """
        return {
            GoogleDocumentAIKey.RAW_DOCUMENT: {
                GoogleDocumentAIKey.MIME_TYPE: file_type_mime,
                GoogleDocumentAIKey.CONTENT: base64.b64encode(
                    file_content_in_bytes
                ).decode("utf-8"),
            },
            GoogleDocumentAIKey.SKIP_HUMAN_REVIEW: True,
            GoogleDocumentAIKey.FIELD_MASK: "text",
        }

    def _get_request_headers(self) -> dict[str, Any]:
        """Construct the request headers sent to the Document AI server.

        Returns:
            Headers carrying the content type and a bearer token obtained
            by refreshing the service account credentials.

        Raises:
            AdapterError: If no service account was configured.
        """
        if not self.google_service_account:
            raise AdapterError("Google service account not found")
        credentials = Credentials.from_service_account_info(
            self.google_service_account, scopes=Constants.CREDENTIAL_SCOPES
        )
        credentials.refresh(google_requests.Request())

        return {
            "Content-Type": "application/json; charset=utf-8",
            "Authorization": f"Bearer {credentials.token}",
        }

    def _get_input_file_type_mime(self, input_file_path: str) -> str:
        """Detect the MIME type from the first bytes of the file.

        Falls back to ``FileType.TEXT_PLAIN`` when the type cannot be
        guessed. An unsupported type is logged but still returned.

        Args:
            input_file_path: Path of the file to inspect.

        Returns:
            The detected (or fallback) MIME type.
        """
        with open(input_file_path, mode="rb") as file_obj:
            sample_contents = file_obj.read(100)
            file_type = filetype.guess(sample_contents)

        file_type_mime: str = (
            file_type.MIME if file_type else FileType.TEXT_PLAIN
        )

        if file_type_mime not in FileType.ALLOWED_TYPES:
            logger.error(f"Input file type not supported: {file_type_mime}")

        logger.info(f"file: `{input_file_path} [{file_type_mime}]`\n\n")

        return file_type_mime

    def process(
        self, input_file_path: str, output_file_path: Optional[str] = None
    ) -> str:
        """Run OCR on a file and return the extracted text.

        Args:
            input_file_path: Path of the document to process.
            output_file_path: When given, the extracted text is also
                written to this path as UTF-8.

        Returns:
            The text extracted by Google Document AI.

        Raises:
            AdapterError: If the input file is missing, the service replies
                with a non-200 status, or any other error occurs.
        """
        try:
            # Validate the path first so a missing file yields the
            # intended "File not found" error rather than an OS error.
            if not os.path.isfile(input_file_path):
                raise AdapterError(f"File not found {input_file_path}")
            file_type_mime = self._get_input_file_type_mime(input_file_path)
            with open(input_file_path, "rb") as fop:
                file_content_in_bytes: bytes = fop.read()

            processor_url = self.config.get(Constants.URL, "") + ":process"
            headers = self._get_request_headers()
            data = self._get_request_body(
                file_type_mime=file_type_mime,
                file_content_in_bytes=file_content_in_bytes,
            )
            response = requests.post(processor_url, headers=headers, json=data)
            if response.status_code != 200:
                logger.error(
                    f"Error while calling Google Document AI: {response.text}"
                )
                # Fail here instead of parsing an error payload, which
                # would only surface as an opaque KeyError message.
                raise AdapterError(
                    f"{response.status_code} - {response.reason}"
                )
            response_json: dict[str, Any] = response.json()
            result_text: str = response_json["document"]["text"]
            if output_file_path is not None:
                with open(output_file_path, "w", encoding="utf-8") as f:
                    f.write(result_text)
            return result_text
        except AdapterError as e:
            logger.error(f"Error while processing document {e}")
            raise
        except Exception as e:
            logger.error(f"Error while processing document {e}")
            raise AdapterError(str(e)) from e

    def test_connection(self) -> bool:
        """Check that the configured processor URL is reachable.

        Returns:
            True when a GET on the configured URL returns status 200.

        Raises:
            AdapterError: On a non-200 response or any other failure.
        """
        try:
            url = self.config.get(Constants.URL, "")
            headers = self._get_request_headers()
            response = requests.get(url, headers=headers)
            if response.status_code != 200:
                logger.error(
                    f"Error while testing Google Document AI: {response.text}"
                )
                raise AdapterError(
                    f"{response.status_code} - {response.reason}"
                )
            return True
        except AdapterError as e:
            logger.error(f"Error occured while testing adapter {e}")
            raise
        except Exception as e:
            logger.error(f"Error occured while testing adapter {e}")
            raise AdapterError(str(e)) from e
class OCRAdapter(Adapter, ABC):
    """Base class for OCR adapters.

    Every method here returns a placeholder value (empty string or
    ``False``); concrete adapters override them with real behaviour.
    """

    def __init__(self, name: str):
        """Initialise the base adapter and record the adapter name.

        Args:
            name: Name of the adapter.
        """
        super().__init__(name)
        self.name = name

    @staticmethod
    def get_id() -> str:
        """Return the adapter id; empty in the base class."""
        return ""

    @staticmethod
    def get_name() -> str:
        """Return the adapter name; empty in the base class."""
        return ""

    @staticmethod
    def get_description() -> str:
        """Return the adapter description; empty in the base class."""
        return ""

    @staticmethod
    def get_icon() -> str:
        """Return the adapter icon; empty in the base class."""
        return ""

    @staticmethod
    def get_json_schema() -> str:
        """Return the adapter JSON schema; empty in the base class."""
        return ""

    @staticmethod
    def get_adapter_type() -> AdapterTypes:
        """Return ``AdapterTypes.OCR``."""
        return AdapterTypes.OCR

    def process(
        self, input_file_path: str, output_file_path: Optional[str] = None
    ) -> str:
        """Process a file and return its text.

        Args:
            input_file_path: Path of the file to process.
            output_file_path: Optional path for the output.

        Returns:
            An empty string here; overriding methods contain the actual
            implementation.
        """
        return ""

    def test_connection(
        self, llm_metadata: Optional[dict[str, Any]] = None
    ) -> bool:
        """Test the adapter connection.

        ``llm_metadata`` is optional so that this signature is compatible
        with subclasses that override the method without parameters.

        Args:
            llm_metadata: Unused by the base implementation.

        Returns:
            Always ``False`` in the base class.
        """
        return False
class OCRRegistry(AdapterRegistry):
    """Registry that collects the OCR adapters found next to this module."""

    @staticmethod
    def register_adapters(adapters: dict[str, Any]) -> None:
        """Scan this package's directory and register each adapter found.

        A directory entry is considered when its name does not start with
        ``__`` and it contains a ``Common.SRC_FOLDER`` sub-directory.

        Args:
            adapters: Mapping that is filled in place, keyed by adapter id.
        """
        package = "unstract.adapters.ocr"
        base_dir = os.path.dirname(os.path.abspath(__file__))

        for entry in os.listdir(base_dir):
            # Skip special directories like __pycache__
            if entry.startswith("__"):
                continue
            src_dir = os.path.join(base_dir, entry, Common.SRC_FOLDER)
            if os.path.isdir(src_dir):
                OCRRegistry._build_adapter_list(entry, package, adapters)

        if not adapters:
            logger.warning("No ocr adapter found.")

    @staticmethod
    def _build_adapter_list(
        adapter: str, package: str, adapters: dict[str, Any]
    ) -> None:
        """Import one adapter module and add it to ``adapters`` if active.

        Args:
            adapter: Name of the adapter directory.
            package: Dotted package path that contains the adapter.
            adapters: Mapping that is updated in place.
        """
        try:
            module = import_module(f"{package}.{adapter}.{Common.SRC_FOLDER}")
            metadata = getattr(module, Common.METADATA, {})
            # Inactive adapters (or modules without metadata) are ignored.
            if not metadata.get("is_active", False):
                return
            adapter_class: OCRAdapter = metadata[Common.ADAPTER]
            adapter_id = adapter_class.get_id()
            if adapter_id and adapter_id not in adapters:
                adapters[adapter_id] = {
                    Common.MODULE: module,
                    Common.METADATA: metadata,
                }
            else:
                # Reached for an empty id as well as for an id already seen.
                logger.warning(f"Duplicate Id : {adapter_id}")
        except ModuleNotFoundError as exception:
            logger.error(f"Error while importing ocr adapters : {exception}")
logging.getLogger(__name__) + class VectorDBRegistry(AdapterRegistry): @staticmethod def register_adapters(adapters: dict[str, Any]) -> None: @@ -16,15 +17,20 @@ def register_adapters(adapters: dict[str, Any]) -> None: package = "unstract.adapters.vectordb" for adapter in os.listdir(current_directory): - adapter_path = os.path.join(current_directory, adapter, Common.SRC_FOLDER) - # Check if the item is a directory and not a special directory like __pycache__ + adapter_path = os.path.join( + current_directory, adapter, Common.SRC_FOLDER + ) + # Check if the item is a directory and not a + # special directory like __pycache__ if os.path.isdir(adapter_path) and not adapter.startswith("__"): - VectorDBRegistry.__build_adapter_list(adapter, package, adapters) + VectorDBRegistry._build_adapter_list(adapter, package, adapters) if len(adapters) == 0: logger.warning("No vectorDB adapter found.") @staticmethod - def __build_adapter_list(adapter: str, package: str, adapters: dict[str, Any]) -> None: + def _build_adapter_list( + adapter: str, package: str, adapters: dict[str, Any] + ) -> None: try: full_module_path = f"{package}.{adapter}.{Common.SRC_FOLDER}" module = import_module(full_module_path) @@ -40,4 +46,6 @@ def __build_adapter_list(adapter: str, package: str, adapters: dict[str, Any]) - Common.METADATA: metadata, } except ModuleNotFoundError as exception: - logger.error(f"Error while importing vectorDB adapters : {exception}") + logger.error( + f"Error while importing vectorDB adapters : {exception}" + ) diff --git a/src/unstract/adapters/x2text/llm_whisperer/src/llm_whisperer.py b/src/unstract/adapters/x2text/llm_whisperer/src/llm_whisperer.py index 8f5548b..595b04c 100644 --- a/src/unstract/adapters/x2text/llm_whisperer/src/llm_whisperer.py +++ b/src/unstract/adapters/x2text/llm_whisperer/src/llm_whisperer.py @@ -115,7 +115,8 @@ def process( ) else: if response.content is not None: - output = str(response.content) + if isinstance(response.content, bytes): + output 
= response.content.decode("utf-8") if output_file_path is not None: with open(output_file_path, "w", encoding="utf-8") as f: f.write(output) diff --git a/src/unstract/adapters/x2text/register.py b/src/unstract/adapters/x2text/register.py index c90fb5a..27b7881 100644 --- a/src/unstract/adapters/x2text/register.py +++ b/src/unstract/adapters/x2text/register.py @@ -23,12 +23,12 @@ def register_adapters(adapters: dict[str, Any]) -> None: # Check if the item is a directory and not a # special directory like __pycache__ if os.path.isdir(adapter_path) and not adapter.startswith("__"): - X2TextRegistry.__build_adapter_list(adapter, package, adapters) + X2TextRegistry._build_adapter_list(adapter, package, adapters) if len(adapters) == 0: logger.warning("No X2Text adapter found.") @staticmethod - def __build_adapter_list( + def _build_adapter_list( adapter: str, package: str, adapters: dict[str, Any] ) -> None: try: