From 9720a2e7426fefa4477113fe106ce8d7925d589a Mon Sep 17 00:00:00 2001 From: gayathrivijayakumar Date: Wed, 21 Feb 2024 22:58:53 +0530 Subject: [PATCH 1/7] OCR adapter changes --- src/unstract/adapters/adapterkit.py | 3 +- src/unstract/adapters/enums.py | 1 + src/unstract/adapters/ocr/__init__.py | 5 + src/unstract/adapters/ocr/constants.py | 18 ++ .../adapters/ocr/google_document_ai/README.md | 1 + .../ocr/google_document_ai/pyproject.toml | 26 +++ .../ocr/google_document_ai/src/README.md | 1 + .../ocr/google_document_ai/src/__init__.py | 9 + .../src/google_document_ai.py | 167 ++++++++++++++++++ .../src/static/json_schema.json | 29 +++ src/unstract/adapters/ocr/ocr_adapter.py | 41 +++++ src/unstract/adapters/ocr/register.py | 49 +++++ 12 files changed, 349 insertions(+), 1 deletion(-) create mode 100644 src/unstract/adapters/ocr/__init__.py create mode 100644 src/unstract/adapters/ocr/constants.py create mode 100644 src/unstract/adapters/ocr/google_document_ai/README.md create mode 100644 src/unstract/adapters/ocr/google_document_ai/pyproject.toml create mode 100644 src/unstract/adapters/ocr/google_document_ai/src/README.md create mode 100644 src/unstract/adapters/ocr/google_document_ai/src/__init__.py create mode 100644 src/unstract/adapters/ocr/google_document_ai/src/google_document_ai.py create mode 100644 src/unstract/adapters/ocr/google_document_ai/src/static/json_schema.json create mode 100644 src/unstract/adapters/ocr/ocr_adapter.py create mode 100644 src/unstract/adapters/ocr/register.py diff --git a/src/unstract/adapters/adapterkit.py b/src/unstract/adapters/adapterkit.py index fcbacec..31719b1 100644 --- a/src/unstract/adapters/adapterkit.py +++ b/src/unstract/adapters/adapterkit.py @@ -6,6 +6,7 @@ from unstract.adapters.constants import Common from unstract.adapters.embedding import adapters as embedding_adapters from unstract.adapters.llm import adapters as llm_adapters +from unstract.adapters.ocr import adapters as ocr_adapters from unstract.adapters.vectordb import adapters as vectordb_adapters logger = logging.getLogger(__name__) @@ -14,7 +15,7 @@ class Adapterkit: def __init__(self) -> None: self._adapters: AdapterDict = ( - embedding_adapters | llm_adapters | vectordb_adapters + embedding_adapters | llm_adapters | vectordb_adapters | ocr_adapters ) @property diff --git a/src/unstract/adapters/enums.py b/src/unstract/adapters/enums.py index e806239..b07ffd7 100644 --- a/src/unstract/adapters/enums.py +++ b/src/unstract/adapters/enums.py @@ -6,3 +6,4 @@ class AdapterTypes(Enum): LLM = "LLM" EMBEDDING = "EMBEDDING" VECTOR_DB = "VECTOR_DB" + OCR = "OCR" diff --git a/src/unstract/adapters/ocr/__init__.py b/src/unstract/adapters/ocr/__init__.py new file mode 100644 index 0000000..21818f2 --- /dev/null +++ b/src/unstract/adapters/ocr/__init__.py @@ -0,0 +1,5 @@ +from unstract.adapters import AdapterDict +from unstract.adapters.ocr.register import OCRRegistry + +adapters: AdapterDict = {} +OCRRegistry.register_adapters(adapters) diff --git a/src/unstract/adapters/ocr/constants.py b/src/unstract/adapters/ocr/constants.py new file mode 100644 index 0000000..87e75f3 --- /dev/null +++ b/src/unstract/adapters/ocr/constants.py @@ -0,0 +1,18 @@ +class FileType: + TEXT_PLAIN = "text/plain" + IMAGE_JPEG = "image/jpeg" + IMAGE_PNG = "image/png" + IMAGE_TIFF = "image/tiff" + IMAGE_BMP = "image/bmp" + IMAGE_GIF = "image/gif" + IMAGE_WEBP = "image/webp" + APPLICATION_PDF = "application/pdf" + ALLOWED_TYPES = [ + IMAGE_JPEG, + IMAGE_PNG, + IMAGE_TIFF, + IMAGE_BMP, + IMAGE_GIF, + IMAGE_WEBP, + APPLICATION_PDF, + ] diff --git a/src/unstract/adapters/ocr/google_document_ai/README.md b/src/unstract/adapters/ocr/google_document_ai/README.md new file mode 100644 index 0000000..f8b83e5 --- /dev/null +++ b/src/unstract/adapters/ocr/google_document_ai/README.md @@ -0,0 +1 @@ +# Unstract Google Document AI OCR Adapter diff --git a/src/unstract/adapters/ocr/google_document_ai/pyproject.toml b/src/unstract/adapters/ocr/google_document_ai/pyproject.toml new file mode 100644 index 0000000..b4070fa --- /dev/null +++ b/src/unstract/adapters/ocr/google_document_ai/pyproject.toml @@ -0,0 +1,26 @@ +[build-system] +requires = ["pdm-backend"] +build-backend = "pdm.backend" + + +[project] +name = "unstract-googledocumentai-ocr" +version = "0.0.1" +description = "Google Document AI OCR" +authors = [ + {name = "Zipstack Inc.", email = "devsupport@zipstack.com"}, +] +dependencies = [ + +] +requires-python = ">=3.9" +readme = "README.md" +classifiers = [ + "Programming Language :: Python" +] +license = {text = "MIT"} + +[tool.pdm.build] +includes = ["src"] +package-dir = "src" +# source-includes = ["tests"] diff --git a/src/unstract/adapters/ocr/google_document_ai/src/README.md b/src/unstract/adapters/ocr/google_document_ai/src/README.md new file mode 100644 index 0000000..f8b83e5 --- /dev/null +++ b/src/unstract/adapters/ocr/google_document_ai/src/README.md @@ -0,0 +1 @@ +# Unstract Google Document AI OCR Adapter diff --git a/src/unstract/adapters/ocr/google_document_ai/src/__init__.py b/src/unstract/adapters/ocr/google_document_ai/src/__init__.py new file mode 100644 index 0000000..c600bba --- /dev/null +++ b/src/unstract/adapters/ocr/google_document_ai/src/__init__.py @@ -0,0 +1,9 @@ +from .google_document_ai import GoogleDocumentAI + +metadata = { + "name": GoogleDocumentAI.__name__, + "version": "1.0.0", + "adapter": GoogleDocumentAI, + "description": "Google Document AI OCR adapter", + "is_active": True, +} diff --git a/src/unstract/adapters/ocr/google_document_ai/src/google_document_ai.py b/src/unstract/adapters/ocr/google_document_ai/src/google_document_ai.py new file mode 100644 index 0000000..f50a1ab --- /dev/null +++ b/src/unstract/adapters/ocr/google_document_ai/src/google_document_ai.py @@ -0,0 +1,167 @@ +import base64 +import json +import logging +import os +from typing import Any, Optional + +import requests +from filetype import filetype +from google.auth.transport import requests as google_requests +from google.oauth2.service_account import Credentials + +from unstract.adapters.exceptions import AdapterError +from unstract.adapters.ocr.constants import FileType +from unstract.adapters.ocr.ocr_adapter import OCRAdapter + +logger = logging.getLogger(__name__) + + +class GoogleDocumentAIKey: + RAW_DOCUMENT = "rawDocument" + MIME_TYPE = "mimeType" + CONTENT = "content" + SKIP_HUMAN_REVIEW = "skipHumanReview" + FIELD_MASK = "fieldMask" + + +class Constants: + URL = "url" + CREDENTIALS = "credentials" + CREDENTIAL_SCOPES = ["https://www.googleapis.com/auth/cloud-platform"] + + +class GoogleDocumentAI(OCRAdapter): + def __init__(self, settings: dict[str, Any]): + super().__init__("GoogleDocumentAI") + self.config = settings + google_service_account = self.config.get(Constants.CREDENTIALS) + if not google_service_account: + logger.error("Google service account not found") + else: + self.google_service_account = json.loads(google_service_account) + + @staticmethod + def get_id() -> str: + return "googledocumentai|1013f64b-ecc9-4e35-b986-aebd60fb55d7" + + @staticmethod + def get_name() -> str: + return "GoogleDocumentAI" + + @staticmethod + def get_description() -> str: + return "Google Document AI OCR" + + @staticmethod + def get_icon() -> str: + return ( + "https://storage.googleapis.com/pandora-static/" + "adapter-icons/GoogleDocumentAI.png" + ) + + @staticmethod + def get_json_schema() -> str: + f = open(f"{os.path.dirname(__file__)}/static/json_schema.json") + schema = f.read() + f.close() + return schema + + def __get_request_body( + self, file_type_mime: str, file_content_in_bytes: bytes + ) -> dict[str, Any]: + return { + GoogleDocumentAIKey.RAW_DOCUMENT: { + GoogleDocumentAIKey.MIME_TYPE: file_type_mime, + GoogleDocumentAIKey.CONTENT: base64.b64encode( + file_content_in_bytes + ).decode("utf-8"), + }, + GoogleDocumentAIKey.SKIP_HUMAN_REVIEW: True, + GoogleDocumentAIKey.FIELD_MASK: "text", + } + + def __get_request_headers(self) -> dict[str, Any]: + credentials = Credentials.from_service_account_info( + self.google_service_account, scopes=Constants.CREDENTIAL_SCOPES + ) + credentials.refresh(google_requests.Request()) + + return { + "Content-Type": "application/json; charset=utf-8", + "Authorization": f"Bearer {credentials.token}", + } + + def __get_input_file_type_mime(self, input_file_path: str) -> str: + with open(input_file_path, mode="rb") as file_obj: + sample_contents = file_obj.read(100) + file_type = filetype.guess(sample_contents) + + file_type_mime: str = ( + file_type.MIME if file_type else FileType.TEXT_PLAIN + ) + + if file_type_mime not in FileType.ALLOWED_TYPES: + logger.error("Input file type not supported: " f"{file_type_mime}") + + logger.info(f"file: `{input_file_path} [{file_type_mime}]`\n\n") + + return file_type_mime + + def process( + self, input_file_path: str, output_file_path: Optional[str] = None + ) -> str: + try: + file_type_mime = self.__get_input_file_type_mime(input_file_path) + if os.path.isfile(input_file_path): + with open(input_file_path, "rb") as fop: + file_content_in_bytes: bytes = fop.read() + else: + raise AdapterError(f"File not found {input_file_path}") + processor_url = self.config.get(Constants.URL, "") + ":process" + headers = self.__get_request_headers() + data = self.__get_request_body( + file_type_mime=file_type_mime, + file_content_in_bytes=file_content_in_bytes, + ) + response = requests.post(processor_url, headers=headers, json=data) + if response.status_code != 200: + logger.error( + f"Error while calling Google Document AI: {response.text}" + ) + response_json: dict[str, Any] = response.json() + result_text: str = response_json["document"]["text"] + if output_file_path is not None: + with open(output_file_path, "w", encoding="utf-8") as f: + f.write(result_text) + f.close() + return result_text + except Exception as e: + logger.error(f"Error while processing document {e}") + if not isinstance(e, AdapterError): + raise AdapterError(str(e)) + else: + raise e + finally: + if fop is not None: + fop.close() + + def test_connection(self) -> bool: + try: + url = self.config.get(Constants.URL, "") + headers = self.__get_request_headers() + response = requests.get(url, headers=headers) + if response.status_code != 200: + logger.error( + f"Error while testing Google Document AI: {response.text}" + ) + raise AdapterError( + f"{response.status_code} - {response.reason}" + ) + else: + return True + except Exception as e: + logger.error(f"Error occured while testing adapter {e}") + if not isinstance(e, AdapterError): + raise AdapterError(str(e)) + else: + raise e diff --git a/src/unstract/adapters/ocr/google_document_ai/src/static/json_schema.json b/src/unstract/adapters/ocr/google_document_ai/src/static/json_schema.json new file mode 100644 index 0000000..2785998 --- /dev/null +++ b/src/unstract/adapters/ocr/google_document_ai/src/static/json_schema.json @@ -0,0 +1,29 @@ +{ + "title": "Google Document AI OCR", + "type": "object", + "required": [ + "adapter_name", + "url", + "credentials" + ], + "properties": { + "adapter_name": { + "type": "string", + "title": "OCR Adapter ID", + "default": "", + "description": "Provide a unique name for this adapter instance. Example: google-document-ai-1" + }, + "url": { + "type": "string", + "title": "URL", + "default": "", + "format": "uri", + "description": "The URL of the Google Document AI endpoint for the processor Example: https://{endpoint}/v1/projects/{project}/locations/{location}/processors/{processor}" + }, + "credentials": { + "type": "string", + "title": "Google Service Account", + "deafult": "" + } + } +} diff --git a/src/unstract/adapters/ocr/ocr_adapter.py b/src/unstract/adapters/ocr/ocr_adapter.py new file mode 100644 index 0000000..56192ce --- /dev/null +++ b/src/unstract/adapters/ocr/ocr_adapter.py @@ -0,0 +1,41 @@ +from abc import ABC +from typing import Any + +from unstract.adapters.base import Adapter +from unstract.adapters.enums import AdapterTypes + + +class OCRAdapter(Adapter, ABC): + def __init__(self, name: str): + super().__init__(name) + self.name = name + + @staticmethod + def get_id() -> str: + return "" + + @staticmethod + def get_name() -> str: + return "" + + @staticmethod + def get_description() -> str: + return "" + + @staticmethod + def get_icon() -> str: + return "" + + @staticmethod + def get_json_schema() -> str: + return "" + + @staticmethod + def get_adapter_type() -> AdapterTypes: + return AdapterTypes.OCR + + def process(self, input_file_path: str, output_file_path: str) -> str: + return "" + + def test_connection(self, llm_metadata: dict[str, Any]) -> bool: + return False diff --git a/src/unstract/adapters/ocr/register.py b/src/unstract/adapters/ocr/register.py new file mode 100644 index 0000000..3d379c4 --- /dev/null +++ b/src/unstract/adapters/ocr/register.py @@ -0,0 +1,49 @@ +import logging +import os +from importlib import import_module +from typing import Any + +from unstract.adapters.constants import Common +from unstract.adapters.ocr.ocr_adapter import OCRAdapter +from unstract.adapters.registry import AdapterRegistry + +logger = logging.getLogger(__name__) + + +class OCRRegistry(AdapterRegistry): + @staticmethod + def register_adapters(adapters: dict[str, Any]) -> None: + current_directory = os.path.dirname(os.path.abspath(__file__)) + package = "unstract.adapters.ocr" + + for adapter in os.listdir(current_directory): + adapter_path = os.path.join( + current_directory, adapter, Common.SRC_FOLDER + ) + # Check if the item is a directory and not a + # special directory like __pycache__ + if os.path.isdir(adapter_path) and not adapter.startswith("__"): + OCRRegistry.__build_adapter_list(adapter, package, adapters) + if len(adapters) == 0: + logger.warning("No ocr adapter found.") + + @staticmethod + def __build_adapter_list( + adapter: str, package: str, adapters: dict[str, Any] + ) -> None: + try: + full_module_path = f"{package}.{adapter}.{Common.SRC_FOLDER}" + module = import_module(full_module_path) + metadata = getattr(module, Common.METADATA, {}) + if metadata.get("is_active", False): + adapter_class: OCRAdapter = metadata[Common.ADAPTER] + adapter_id = adapter_class.get_id() + if not adapter_id or (adapter_id in adapters): + logger.warning(f"Duplicate Id : {adapter_id}") + else: + adapters[adapter_id] = { + Common.MODULE: module, + Common.METADATA: metadata, + } + except ModuleNotFoundError as exception: + logger.error(f"Error while importing ocr adapters : {exception}") From a09f4b8e3ca5b65c66900c651bbb0282541b3fd8 Mon Sep 17 00:00:00 2001 From: gayathrivijayakumar Date: Wed, 21 Feb 2024 23:20:31 +0530 Subject: [PATCH 2/7] Fix function signature --- src/unstract/adapters/ocr/ocr_adapter.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/unstract/adapters/ocr/ocr_adapter.py b/src/unstract/adapters/ocr/ocr_adapter.py index 56192ce..d603b67 100644 --- a/src/unstract/adapters/ocr/ocr_adapter.py +++ b/src/unstract/adapters/ocr/ocr_adapter.py @@ -1,5 +1,5 @@ from abc import ABC -from typing import Any +from typing import Any, Optional from unstract.adapters.base import Adapter from unstract.adapters.enums import AdapterTypes @@ -34,7 +34,10 @@ def get_json_schema() -> str: def get_adapter_type() -> AdapterTypes: return AdapterTypes.OCR - def process(self, input_file_path: str, output_file_path: str) -> str: + def process( + self, input_file_path: str, output_file_path: Optional[str] = None + ) -> str: + # Overriding methods will contain actual implementation return "" def test_connection(self, llm_metadata: dict[str, Any]) -> bool: From 2d1079efde9c279ff816171325c03ac56c1610d3 Mon Sep 17 00:00:00 2001 From: gayathrivijayakumar Date: Wed, 21 Feb 2024 23:25:58 +0530 Subject: [PATCH 3/7] Add .idea to be ignored by Git commits --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 6769e21..2dc53ca 100644 --- a/.gitignore +++ b/.gitignore @@ -157,4 +157,4 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ \ No newline at end of file +.idea/ From 09f60c5257158f0debd861974e2886a29a3fb534 Mon Sep 17 00:00:00 2001 From: gayathrivijayakumar Date: Mon, 26 Feb 2024 12:03:11 +0530 Subject: [PATCH 4/7] Roll up version for adapter changes for OCR --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3b8fee7..5ef3b1f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "pdm.backend" [project] name = "unstract-adapters" -version = "0.2.1" +version = "0.2.2" description = "Unstract Adapters" authors = [ {name = "Zipstack Inc.", email = "devsupport@zipstack.com"}, From c70cd1eea52bca93dc70f8fcba04a194b3bee50e Mon Sep 17 00:00:00 2001 From: gayathrivijayakumar Date: Mon, 26 Feb 2024 18:47:04 +0530 Subject: [PATCH 5/7] Remove unwanted space --- src/unstract/adapters/adapterkit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unstract/adapters/adapterkit.py b/src/unstract/adapters/adapterkit.py index df8c51a..ff64729 100644 --- a/src/unstract/adapters/adapterkit.py +++ b/src/unstract/adapters/adapterkit.py @@ -20,7 +20,7 @@ def __init__(self) -> None: | llm_adapters | vectordb_adapters | x2text_adapters - | ocr_adapters + | ocr_adapters ) @property From 39187f03ca76cea5e7bd3400f5484b7287a2b555 Mon Sep 17 00:00:00 2001 From: gayathrivijayakumar Date: Mon, 26 Feb 2024 20:35:24 +0530 Subject: [PATCH 6/7] Private methid func name refactoring --- src/unstract/adapters/embedding/register.py | 24 ++++++++++++------- src/unstract/adapters/llm/register.py | 4 ++-- .../src/google_document_ai.py | 21 ++++++++++------ .../src/static/json_schema.json | 3 ++- src/unstract/adapters/ocr/register.py | 4 ++-- src/unstract/adapters/vectordb/register.py | 20 +++++++++++----- src/unstract/adapters/x2text/register.py | 4 ++-- 7 files changed, 52 insertions(+), 28 deletions(-) diff --git a/src/unstract/adapters/embedding/register.py b/src/unstract/adapters/embedding/register.py index 74a9328..54c8adb 100644 --- a/src/unstract/adapters/embedding/register.py +++ b/src/unstract/adapters/embedding/register.py @@ -3,29 +3,36 @@ from importlib import import_module from typing import Any -from unstract.adapters.registry import AdapterRegistry from unstract.adapters.constants import Common from unstract.adapters.embedding.embedding_adapter import EmbeddingAdapter +from unstract.adapters.registry import AdapterRegistry logger = logging.getLogger(__name__) -class EmbeddingRegistry(AdapterRegistry): +class EmbeddingRegistry(AdapterRegistry): @staticmethod def register_adapters(adapters: dict[str, Any]) -> None: current_directory = os.path.dirname(os.path.abspath(__file__)) package = "unstract.adapters.embedding" for adapter in os.listdir(current_directory): - adapter_path = os.path.join(current_directory, adapter, Common.SRC_FOLDER) - # Check if the item is a directory and not a special directory like __pycache__ + adapter_path = os.path.join( + current_directory, adapter, Common.SRC_FOLDER + ) + # Check if the item is a directory and not + # a special directory like __pycache__ if os.path.isdir(adapter_path) and not adapter.startswith("__"): - EmbeddingRegistry.__build_adapter_list(adapter, package, adapters) + EmbeddingRegistry._build_adapter_list( + adapter, package, adapters + ) if len(adapters) == 0: logger.warning("No embedding adapter found.") @staticmethod - def __build_adapter_list(adapter: str, package: str, adapters: dict[str, Any]) -> None: + def _build_adapter_list( + adapter: str, package: str, adapters: dict[str, Any] + ) -> None: try: full_module_path = f"{package}.{adapter}.{Common.SRC_FOLDER}" module = import_module(full_module_path) @@ -41,5 +48,6 @@ def __build_adapter_list(adapter: str, package: str, adapters: dict[str, Any]) - Common.METADATA: metadata, } except ModuleNotFoundError as exception: - logger.error(f"Error while importing embedding adapters : {exception}") - + logger.error( + f"Error while importing embedding adapters : {exception}" + ) diff --git a/src/unstract/adapters/llm/register.py b/src/unstract/adapters/llm/register.py index 4e75593..9137ae6 100644 --- a/src/unstract/adapters/llm/register.py +++ b/src/unstract/adapters/llm/register.py @@ -23,12 +23,12 @@ def register_adapters(adapters: dict[str, Any]) -> None: # Check if the item is a directory and not a # special directory like _pycache__ if os.path.isdir(adapter_path) and not adapter.startswith("__"): - LLMRegistry.__build_adapter_list(adapter, package, adapters) + LLMRegistry._build_adapter_list(adapter, package, adapters) if len(adapters) == 0: logger.warning("No llm adapter found.") @staticmethod - def __build_adapter_list( + def _build_adapter_list( adapter: str, package: str, adapters: dict[str, Any] ) -> None: try: diff --git a/src/unstract/adapters/ocr/google_document_ai/src/google_document_ai.py b/src/unstract/adapters/ocr/google_document_ai/src/google_document_ai.py index f50a1ab..039ba7f 100644 --- a/src/unstract/adapters/ocr/google_document_ai/src/google_document_ai.py +++ b/src/unstract/adapters/ocr/google_document_ai/src/google_document_ai.py @@ -66,7 +66,9 @@ def get_json_schema() -> str: f.close() return schema - def __get_request_body( + """ Construct the request body to be sent to Google AI Document server """ + + def _get_request_body( self, file_type_mime: str, file_content_in_bytes: bytes ) -> dict[str, Any]: return { @@ -80,7 +82,10 @@ def __get_request_body( GoogleDocumentAIKey.FIELD_MASK: "text", } - def __get_request_headers(self) -> dict[str, Any]: + """ Construct the request headers to be sent + to Google AI Document server """ + + def _get_request_headers(self) -> dict[str, Any]: credentials = Credentials.from_service_account_info( self.google_service_account, scopes=Constants.CREDENTIAL_SCOPES ) @@ -91,7 +96,9 @@ def __get_request_headers(self) -> dict[str, Any]: "Authorization": f"Bearer {credentials.token}", } - def __get_input_file_type_mime(self, input_file_path: str) -> str: + """ Detect the mime type from the file content """ + + def _get_input_file_type_mime(self, input_file_path: str) -> str: with open(input_file_path, mode="rb") as file_obj: sample_contents = file_obj.read(100) file_type = filetype.guess(sample_contents) @@ -111,15 +118,15 @@ def process( self, input_file_path: str, output_file_path: Optional[str] = None ) -> str: try: - file_type_mime = self.__get_input_file_type_mime(input_file_path) + file_type_mime = self._get_input_file_type_mime(input_file_path) if os.path.isfile(input_file_path): with open(input_file_path, "rb") as fop: file_content_in_bytes: bytes = fop.read() else: raise AdapterError(f"File not found {input_file_path}") processor_url = self.config.get(Constants.URL, "") + ":process" - headers = self.__get_request_headers() - data = self.__get_request_body( + headers = self._get_request_headers() + data = self._get_request_body( file_type_mime=file_type_mime, file_content_in_bytes=file_content_in_bytes, ) @@ -148,7 +155,7 @@ def process( def test_connection(self) -> bool: try: url = self.config.get(Constants.URL, "") - headers = self.__get_request_headers() + headers = self._get_request_headers() response = requests.get(url, headers=headers) if response.status_code != 200: logger.error( diff --git a/src/unstract/adapters/ocr/google_document_ai/src/static/json_schema.json b/src/unstract/adapters/ocr/google_document_ai/src/static/json_schema.json index 2785998..bec194e 100644 --- a/src/unstract/adapters/ocr/google_document_ai/src/static/json_schema.json +++ b/src/unstract/adapters/ocr/google_document_ai/src/static/json_schema.json @@ -23,7 +23,8 @@ "credentials": { "type": "string", "title": "Google Service Account", - "deafult": "" + "deafult": "", + "description": "Service Account in JSON format" } } } diff --git a/src/unstract/adapters/ocr/register.py b/src/unstract/adapters/ocr/register.py index 3d379c4..edecf4b 100644 --- a/src/unstract/adapters/ocr/register.py +++ b/src/unstract/adapters/ocr/register.py @@ -23,12 +23,12 @@ def register_adapters(adapters: dict[str, Any]) -> None: # Check if the item is a directory and not a # special directory like __pycache__ if os.path.isdir(adapter_path) and not adapter.startswith("__"): - OCRRegistry.__build_adapter_list(adapter, package, adapters) + OCRRegistry._build_adapter_list(adapter, package, adapters) if len(adapters) == 0: logger.warning("No ocr adapter found.") @staticmethod - def __build_adapter_list( + def _build_adapter_list( adapter: str, package: str, adapters: dict[str, Any] ) -> None: try: diff --git a/src/unstract/adapters/vectordb/register.py b/src/unstract/adapters/vectordb/register.py index e0a620b..e7ef48a 100644 --- a/src/unstract/adapters/vectordb/register.py +++ b/src/unstract/adapters/vectordb/register.py @@ -3,12 +3,13 @@ from importlib import import_module from typing import Any -from unstract.adapters.registry import AdapterRegistry from unstract.adapters.constants import Common +from unstract.adapters.registry import AdapterRegistry from unstract.adapters.vectordb.vectordb_adapter import VectorDBAdapter logger = logging.getLogger(__name__) + class VectorDBRegistry(AdapterRegistry): @staticmethod def register_adapters(adapters: dict[str, Any]) -> None: @@ -16,15 +17,20 @@ def register_adapters(adapters: dict[str, Any]) -> None: package = "unstract.adapters.vectordb" for adapter in os.listdir(current_directory): - adapter_path = os.path.join(current_directory, adapter, Common.SRC_FOLDER) - # Check if the item is a directory and not a special directory like __pycache__ + adapter_path = os.path.join( + current_directory, adapter, Common.SRC_FOLDER + ) + # Check if the item is a directory and not a + # special directory like __pycache__ if os.path.isdir(adapter_path) and not adapter.startswith("__"): - VectorDBRegistry.__build_adapter_list(adapter, package, adapters) + VectorDBRegistry._build_adapter_list(adapter, package, adapters) if len(adapters) == 0: logger.warning("No vectorDB adapter found.") @staticmethod - def __build_adapter_list(adapter: str, package: str, adapters: dict[str, Any]) -> None: + def _build_adapter_list( + adapter: str, package: str, adapters: dict[str, Any] + ) -> None: try: full_module_path = f"{package}.{adapter}.{Common.SRC_FOLDER}" module = import_module(full_module_path) @@ -40,4 +46,6 @@ def __build_adapter_list(adapter: str, package: str, adapters: dict[str, Any]) - Common.METADATA: metadata, } except ModuleNotFoundError as exception: - logger.error(f"Error while importing vectorDB adapters : {exception}") + logger.error( + f"Error while importing vectorDB adapters : {exception}" + ) diff --git a/src/unstract/adapters/x2text/register.py b/src/unstract/adapters/x2text/register.py index c90fb5a..27b7881 100644 --- a/src/unstract/adapters/x2text/register.py +++ b/src/unstract/adapters/x2text/register.py @@ -23,12 +23,12 @@ def register_adapters(adapters: dict[str, Any]) -> None: # Check if the item is a directory and not a # special directory like __pycache__ if os.path.isdir(adapter_path) and not adapter.startswith("__"): - X2TextRegistry.__build_adapter_list(adapter, package, adapters) + X2TextRegistry._build_adapter_list(adapter, package, adapters) if len(adapters) == 0: logger.warning("No X2Text adapter found.") @staticmethod - def __build_adapter_list( + def _build_adapter_list( adapter: str, package: str, adapters: dict[str, Any] ) -> None: try: From acb9ef7016899483acc965acf56eea3b2208de0e Mon Sep 17 00:00:00 2001 From: gayathrivijayakumar Date: Tue, 27 Feb 2024 13:42:40 +0530 Subject: [PATCH 7/7] Changes to support byte and string content types for x2text adapters --- .../adapters/x2text/llm_whisperer/src/llm_whisperer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/unstract/adapters/x2text/llm_whisperer/src/llm_whisperer.py b/src/unstract/adapters/x2text/llm_whisperer/src/llm_whisperer.py index 8f5548b..595b04c 100644 --- a/src/unstract/adapters/x2text/llm_whisperer/src/llm_whisperer.py +++ b/src/unstract/adapters/x2text/llm_whisperer/src/llm_whisperer.py @@ -115,7 +115,8 @@ def process( ) else: if response.content is not None: - output = str(response.content) + if isinstance(response.content, bytes): + output = response.content.decode("utf-8") if output_file_path is not None: with open(output_file_path, "w", encoding="utf-8") as f: f.write(output)