Zipstack · jaseemjaskp · Feb 27, 2024 · Feb 21, 2024 · Feb 21, 2024 · Feb 21, 2024
diff --git a/pyproject.toml b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "pdm.backend"
 
 [project]
 name = "unstract-adapters"
-version = "0.2.1"
+version = "0.2.2"
 description = "Unstract Adapters"
 authors = [
     {name = "Zipstack Inc.", email = "devsupport@zipstack.com"},

diff --git a/src/unstract/adapters/adapterkit.py b/src/unstract/adapters/adapterkit.py
@@ -6,6 +6,7 @@
 from unstract.adapters.constants import Common
 from unstract.adapters.embedding import adapters as embedding_adapters
 from unstract.adapters.llm import adapters as llm_adapters
+from unstract.adapters.ocr import adapters as ocr_adapters
 from unstract.adapters.vectordb import adapters as vectordb_adapters
 from unstract.adapters.x2text import adapters as x2text_adapters
 
@@ -19,6 +20,7 @@ def __init__(self) -> None:
             | llm_adapters
             | vectordb_adapters
             | x2text_adapters
+            | ocr_adapters
         )
 
     @property

diff --git a/src/unstract/adapters/embedding/register.py b/src/unstract/adapters/embedding/register.py
@@ -3,29 +3,36 @@
 from importlib import import_module
 from typing import Any
 
-from unstract.adapters.registry import AdapterRegistry
 from unstract.adapters.constants import Common
 from unstract.adapters.embedding.embedding_adapter import EmbeddingAdapter
+from unstract.adapters.registry import AdapterRegistry
 
 logger = logging.getLogger(__name__)
 
-class EmbeddingRegistry(AdapterRegistry):
 
+class EmbeddingRegistry(AdapterRegistry):
     @staticmethod
     def register_adapters(adapters: dict[str, Any]) -> None:
         current_directory = os.path.dirname(os.path.abspath(__file__))
         package = "unstract.adapters.embedding"
 
         for adapter in os.listdir(current_directory):
-            adapter_path = os.path.join(current_directory, adapter, Common.SRC_FOLDER)
-            # Check if the item is a directory and not a special directory like __pycache__
+            adapter_path = os.path.join(
+                current_directory, adapter, Common.SRC_FOLDER
+            )
+            # Check if the item is a directory and not
+            # a special directory like __pycache__
             if os.path.isdir(adapter_path) and not adapter.startswith("__"):
-                EmbeddingRegistry.__build_adapter_list(adapter, package, adapters)
+                EmbeddingRegistry._build_adapter_list(
+                    adapter, package, adapters
+                )
         if len(adapters) == 0:
             logger.warning("No embedding adapter found.")
 
     @staticmethod
-    def __build_adapter_list(adapter: str, package: str, adapters: dict[str, Any]) -> None:
+    def _build_adapter_list(
+        adapter: str, package: str, adapters: dict[str, Any]
+    ) -> None:
         try:
             full_module_path = f"{package}.{adapter}.{Common.SRC_FOLDER}"
             module = import_module(full_module_path)
@@ -41,5 +48,6 @@ def __build_adapter_list(adapter: str, package: str, adapters: dict[str, Any]) -
                         Common.METADATA: metadata,
                     }
         except ModuleNotFoundError as exception:
-            logger.error(f"Error while importing embedding adapters : {exception}")
-
+            logger.error(
+                f"Error while importing embedding adapters : {exception}"
+            )
diff --git a/src/unstract/adapters/enums.py b/src/unstract/adapters/enums.py
@@ -6,4 +6,5 @@ class AdapterTypes(Enum):
     LLM = "LLM"
     EMBEDDING = "EMBEDDING"
     VECTOR_DB = "VECTOR_DB"
+    OCR = "OCR"
     X2TEXT = "X2TEXT"
diff --git a/src/unstract/adapters/llm/register.py b/src/unstract/adapters/llm/register.py
@@ -23,12 +23,12 @@ def register_adapters(adapters: dict[str, Any]) -> None:
             # Check if the item is a directory and not a
             # special directory like _pycache__
             if os.path.isdir(adapter_path) and not adapter.startswith("__"):
-                LLMRegistry.__build_adapter_list(adapter, package, adapters)
+                LLMRegistry._build_adapter_list(adapter, package, adapters)
         if len(adapters) == 0:
             logger.warning("No llm adapter found.")
 
     @staticmethod
-    def __build_adapter_list(
+    def _build_adapter_list(
         adapter: str, package: str, adapters: dict[str, Any]
     ) -> None:
         try:

diff --git a/src/unstract/adapters/ocr/__init__.py b/src/unstract/adapters/ocr/__init__.py
@@ -0,0 +1,5 @@
+from unstract.adapters import AdapterDict
+from unstract.adapters.ocr.register import OCRRegistry
+
+adapters: AdapterDict = {}  # handed to register_adapters() below on import
+OCRRegistry.register_adapters(adapters)
diff --git a/src/unstract/adapters/ocr/constants.py b/src/unstract/adapters/ocr/constants.py
@@ -0,0 +1,18 @@
+class FileType:  # MIME type strings used by the OCR adapters
+    TEXT_PLAIN = "text/plain"
+    IMAGE_JPEG = "image/jpeg"
+    IMAGE_PNG = "image/png"
+    IMAGE_TIFF = "image/tiff"
+    IMAGE_BMP = "image/bmp"
+    IMAGE_GIF = "image/gif"
+    IMAGE_WEBP = "image/webp"
+    APPLICATION_PDF = "application/pdf"
+    ALLOWED_TYPES = [  # every constant above except TEXT_PLAIN
+        IMAGE_JPEG,
+        IMAGE_PNG,
+        IMAGE_TIFF,
+        IMAGE_BMP,
+        IMAGE_GIF,
+        IMAGE_WEBP,
+        APPLICATION_PDF,
+    ]
diff --git a/src/unstract/adapters/ocr/google_document_ai/README.md b/src/unstract/adapters/ocr/google_document_ai/README.md
@@ -0,0 +1 @@
+# Unstract Google Document AI OCR Adapter
diff --git a/src/unstract/adapters/ocr/google_document_ai/pyproject.toml b/src/unstract/adapters/ocr/google_document_ai/pyproject.toml
@@ -0,0 +1,26 @@
+[build-system]
+requires = ["pdm-backend"]
+build-backend = "pdm.backend"
+
+
+[project]
+name = "unstract-googledocumentai-ocr"
+version = "0.0.1"
+description = "Google Document AI OCR"
+authors = [
+    {name = "Zipstack Inc.", email = "devsupport@zipstack.com"},
+]
+dependencies = [
+
+]
+requires-python = ">=3.9"
+readme = "README.md"
+classifiers = [
+  "Programming Language :: Python"
+]
+license = {text = "MIT"}
+
+[tool.pdm.build]
+includes = ["src"]
+package-dir = "src"
+# source-includes = ["tests"]
diff --git a/src/unstract/adapters/ocr/google_document_ai/src/README.md b/src/unstract/adapters/ocr/google_document_ai/src/README.md
@@ -0,0 +1 @@
+# Unstract Google Document AI OCR Adapter
diff --git a/src/unstract/adapters/ocr/google_document_ai/src/__init__.py b/src/unstract/adapters/ocr/google_document_ai/src/__init__.py
@@ -0,0 +1,9 @@
+from .google_document_ai import GoogleDocumentAI
+
+metadata = {  # module-level descriptor for the GoogleDocumentAI adapter
+    "name": GoogleDocumentAI.__name__,
+    "version": "1.0.0",
+    "adapter": GoogleDocumentAI,
+    "description": "Google Document AI OCR adapter",
+    "is_active": True,
+}
diff --git a/src/unstract/adapters/ocr/google_document_ai/src/google_document_ai.py b/src/unstract/adapters/ocr/google_document_ai/src/google_document_ai.py
@@ -0,0 +1,214 @@
+import base64
+import json
+import logging
+import os
+from typing import Any, Optional
+
+import requests
+from filetype import filetype
+from google.auth.transport import requests as google_requests
+from google.oauth2.service_account import Credentials
+
+from unstract.adapters.exceptions import AdapterError
+from unstract.adapters.ocr.constants import FileType
+from unstract.adapters.ocr.ocr_adapter import OCRAdapter
+
+logger = logging.getLogger(__name__)
+
+
+class GoogleDocumentAIKey:
+    """Field names used in the Document AI process request body."""
+
+    RAW_DOCUMENT = "rawDocument"
+    MIME_TYPE = "mimeType"
+    CONTENT = "content"
+    SKIP_HUMAN_REVIEW = "skipHumanReview"
+    FIELD_MASK = "fieldMask"
+
+
+class Constants:
+    """Adapter settings keys and the OAuth scopes requested."""
+
+    URL = "url"
+    CREDENTIALS = "credentials"
+    CREDENTIAL_SCOPES = ["https://www.googleapis.com/auth/cloud-platform"]
+
+
+class GoogleDocumentAI(OCRAdapter):
+    """OCR adapter that sends documents to a Google Document AI processor."""
+
+    def __init__(self, settings: dict[str, Any]):
+        """Store the settings and parse the service account JSON, if any."""
+        super().__init__("GoogleDocumentAI")
+        self.config = settings
+        # Always define the attribute so that a missing credential surfaces
+        # later as an AdapterError instead of an AttributeError.
+        self.google_service_account: Optional[dict[str, Any]] = None
+        google_service_account = self.config.get(Constants.CREDENTIALS)
+        if not google_service_account:
+            logger.error("Google service account not found")
+        else:
+            self.google_service_account = json.loads(google_service_account)
+
+    @staticmethod
+    def get_id() -> str:
+        return "googledocumentai|1013f64b-ecc9-4e35-b986-aebd60fb55d7"
+
+    @staticmethod
+    def get_name() -> str:
+        return "GoogleDocumentAI"
+
+    @staticmethod
+    def get_description() -> str:
+        return "Google Document AI OCR"
+
+    @staticmethod
+    def get_icon() -> str:
+        return (
+            "https://storage.googleapis.com/pandora-static/"
+            "adapter-icons/GoogleDocumentAI.png"
+        )
+
+    @staticmethod
+    def get_json_schema() -> str:
+        """Return the contents of the bundled static/json_schema.json."""
+        schema_path = f"{os.path.dirname(__file__)}/static/json_schema.json"
+        # The context manager closes the file even if read() raises.
+        with open(schema_path) as f:
+            return f.read()
+
+    def _get_request_body(
+        self, file_type_mime: str, file_content_in_bytes: bytes
+    ) -> dict[str, Any]:
+        """Construct the request body to be sent to Google AI Document server.
+
+        The file content is base64-encoded, human review is skipped and
+        the field mask is set to "text".
+        """
+        encoded_content = base64.b64encode(file_content_in_bytes)
+        return {
+            GoogleDocumentAIKey.RAW_DOCUMENT: {
+                GoogleDocumentAIKey.MIME_TYPE: file_type_mime,
+                GoogleDocumentAIKey.CONTENT: encoded_content.decode("utf-8"),
+            },
+            GoogleDocumentAIKey.SKIP_HUMAN_REVIEW: True,
+            GoogleDocumentAIKey.FIELD_MASK: "text",
+        }
+
+    def _get_request_headers(self) -> dict[str, Any]:
+        """Construct the headers to be sent to Google AI Document server.
+
+        Raises:
+            AdapterError: If no service account was configured.
+        """
+        if not self.google_service_account:
+            raise AdapterError("Google service account not found")
+        credentials = Credentials.from_service_account_info(
+            self.google_service_account, scopes=Constants.CREDENTIAL_SCOPES
+        )
+        # Fetch an access token; it is read from credentials.token below.
+        credentials.refresh(google_requests.Request())
+
+        return {
+            "Content-Type": "application/json; charset=utf-8",
+            "Authorization": f"Bearer {credentials.token}",
+        }
+
+    def _get_input_file_type_mime(self, input_file_path: str) -> str:
+        """Detect the mime type from the file content.
+
+        Falls back to text/plain when the type cannot be guessed. An
+        unsupported type is logged but still returned to the caller.
+        """
+        with open(input_file_path, mode="rb") as file_obj:
+            sample_contents = file_obj.read(100)
+        file_type = filetype.guess(sample_contents)
+
+        file_type_mime: str = (
+            file_type.MIME if file_type else FileType.TEXT_PLAIN
+        )
+
+        if file_type_mime not in FileType.ALLOWED_TYPES:
+            logger.error("Input file type not supported: " f"{file_type_mime}")
+
+        logger.info(f"file: `{input_file_path} [{file_type_mime}]`\n\n")
+
+        return file_type_mime
+
+    def process(
+        self, input_file_path: str, output_file_path: Optional[str] = None
+    ) -> str:
+        """Run OCR on a file and return the extracted text.
+
+        Args:
+            input_file_path: Path of the document to process.
+            output_file_path: Optional path the extracted text is written to.
+
+        Returns:
+            The "text" of the "document" object in the response.
+
+        Raises:
+            AdapterError: If the file is missing, the request fails or
+                the response cannot be parsed.
+        """
+        try:
+            # Check existence first so a missing file is reported as such
+            # rather than as an error from mime type detection.
+            if not os.path.isfile(input_file_path):
+                raise AdapterError(f"File not found {input_file_path}")
+            file_type_mime = self._get_input_file_type_mime(input_file_path)
+            with open(input_file_path, "rb") as fop:
+                file_content_in_bytes: bytes = fop.read()
+            processor_url = self.config.get(Constants.URL, "") + ":process"
+            headers = self._get_request_headers()
+            data = self._get_request_body(
+                file_type_mime=file_type_mime,
+                file_content_in_bytes=file_content_in_bytes,
+            )
+            response = requests.post(processor_url, headers=headers, json=data)
+            if response.status_code != 200:
+                logger.error(
+                    f"Error while calling Google Document AI: {response.text}"
+                )
+                # Stop here instead of parsing an error response as a document.
+                raise AdapterError(
+                    f"{response.status_code} - {response.reason}"
+                )
+            response_json: dict[str, Any] = response.json()
+            result_text: str = response_json["document"]["text"]
+            if output_file_path is not None:
+                with open(output_file_path, "w", encoding="utf-8") as f:
+                    f.write(result_text)
+            return result_text
+        except Exception as e:
+            logger.error(f"Error while processing document {e}")
+            if isinstance(e, AdapterError):
+                raise
+            raise AdapterError(str(e)) from e
+
+    def test_connection(self) -> bool:
+        """Check that the configured URL can be reached with the credentials.
+
+        Returns:
+            True when the GET request to the URL answers with status 200.
+
+        Raises:
+            AdapterError: If the request fails or answers with another status.
+        """
+        try:
+            url = self.config.get(Constants.URL, "")
+            headers = self._get_request_headers()
+            response = requests.get(url, headers=headers)
+            if response.status_code != 200:
+                logger.error(
+                    f"Error while testing Google Document AI: {response.text}"
+                )
+                raise AdapterError(
+                    f"{response.status_code} - {response.reason}"
+                )
+            return True
+        except Exception as e:
+            logger.error(f"Error occured while testing adapter {e}")
+            if isinstance(e, AdapterError):
+                raise
+            raise AdapterError(str(e)) from e
diff --git a/src/unstract/adapters/ocr/google_document_ai/src/static/json_schema.json b/src/unstract/adapters/ocr/google_document_ai/src/static/json_schema.json
@@ -0,0 +1,30 @@
+{
+  "title": "Google Document AI OCR",
+  "type": "object",
+  "required": [
+    "adapter_name",
+    "url",
+    "credentials"
+  ],
+  "properties": {
+    "adapter_name": {
+      "type": "string",
+      "title": "OCR Adapter ID",
+      "default": "",
+      "description": "Provide a unique name for this adapter instance. Example: google-document-ai-1"
+    },
+    "url": {
+      "type": "string",
+      "title": "URL",
+      "default": "",
+      "format": "uri",
+      "description": "The URL of the Google Document AI endpoint for the processor. Example: https://{endpoint}/v1/projects/{project}/locations/{location}/processors/{processor}"
+    },
+    "credentials": {
+      "type": "string",
+      "title": "Google Service Account",
+      "default": "",
+      "description": "Service Account in JSON format"
+    }
+  }
+}