From 9720a2e7426fefa4477113fe106ce8d7925d589a Mon Sep 17 00:00:00 2001
From: gayathrivijayakumar <gayathri@zipstack.com>
Date: Wed, 21 Feb 2024 22:58:53 +0530
Subject: [PATCH 1/7] OCR adapter changes

---
 src/unstract/adapters/adapterkit.py           |   3 +-
 src/unstract/adapters/enums.py                |   1 +
 src/unstract/adapters/ocr/__init__.py         |   5 +
 src/unstract/adapters/ocr/constants.py        |  18 ++
 .../adapters/ocr/google_document_ai/README.md |   1 +
 .../ocr/google_document_ai/pyproject.toml     |  26 +++
 .../ocr/google_document_ai/src/README.md      |   1 +
 .../ocr/google_document_ai/src/__init__.py    |   9 +
 .../src/google_document_ai.py                 | 167 ++++++++++++++++++
 .../src/static/json_schema.json               |  29 +++
 src/unstract/adapters/ocr/ocr_adapter.py      |  41 +++++
 src/unstract/adapters/ocr/register.py         |  49 +++++
 12 files changed, 349 insertions(+), 1 deletion(-)
 create mode 100644 src/unstract/adapters/ocr/__init__.py
 create mode 100644 src/unstract/adapters/ocr/constants.py
 create mode 100644 src/unstract/adapters/ocr/google_document_ai/README.md
 create mode 100644 src/unstract/adapters/ocr/google_document_ai/pyproject.toml
 create mode 100644 src/unstract/adapters/ocr/google_document_ai/src/README.md
 create mode 100644 src/unstract/adapters/ocr/google_document_ai/src/__init__.py
 create mode 100644 src/unstract/adapters/ocr/google_document_ai/src/google_document_ai.py
 create mode 100644 src/unstract/adapters/ocr/google_document_ai/src/static/json_schema.json
 create mode 100644 src/unstract/adapters/ocr/ocr_adapter.py
 create mode 100644 src/unstract/adapters/ocr/register.py

diff --git a/src/unstract/adapters/adapterkit.py b/src/unstract/adapters/adapterkit.py
index fcbacec..31719b1 100644
--- a/src/unstract/adapters/adapterkit.py
+++ b/src/unstract/adapters/adapterkit.py
@@ -6,6 +6,7 @@
 from unstract.adapters.constants import Common
 from unstract.adapters.embedding import adapters as embedding_adapters
 from unstract.adapters.llm import adapters as llm_adapters
+from unstract.adapters.ocr import adapters as ocr_adapters
 from unstract.adapters.vectordb import adapters as vectordb_adapters
 
 logger = logging.getLogger(__name__)
@@ -14,7 +15,7 @@
 class Adapterkit:
     def __init__(self) -> None:
         self._adapters: AdapterDict = (
-            embedding_adapters | llm_adapters | vectordb_adapters
+            embedding_adapters | llm_adapters | vectordb_adapters | ocr_adapters
         )
 
     @property
diff --git a/src/unstract/adapters/enums.py b/src/unstract/adapters/enums.py
index e806239..b07ffd7 100644
--- a/src/unstract/adapters/enums.py
+++ b/src/unstract/adapters/enums.py
@@ -6,3 +6,4 @@ class AdapterTypes(Enum):
     LLM = "LLM"
     EMBEDDING = "EMBEDDING"
     VECTOR_DB = "VECTOR_DB"
+    OCR = "OCR"
diff --git a/src/unstract/adapters/ocr/__init__.py b/src/unstract/adapters/ocr/__init__.py
new file mode 100644
index 0000000..21818f2
--- /dev/null
+++ b/src/unstract/adapters/ocr/__init__.py
@@ -0,0 +1,5 @@
+from unstract.adapters import AdapterDict
+from unstract.adapters.ocr.register import OCRRegistry
+
+adapters: AdapterDict = {}
+OCRRegistry.register_adapters(adapters)
diff --git a/src/unstract/adapters/ocr/constants.py b/src/unstract/adapters/ocr/constants.py
new file mode 100644
index 0000000..87e75f3
--- /dev/null
+++ b/src/unstract/adapters/ocr/constants.py
@@ -0,0 +1,18 @@
+class FileType:
+    TEXT_PLAIN = "text/plain"
+    IMAGE_JPEG = "image/jpeg"
+    IMAGE_PNG = "image/png"
+    IMAGE_TIFF = "image/tiff"
+    IMAGE_BMP = "image/bmp"
+    IMAGE_GIF = "image/gif"
+    IMAGE_WEBP = "image/webp"
+    APPLICATION_PDF = "application/pdf"
+    ALLOWED_TYPES = [
+        IMAGE_JPEG,
+        IMAGE_PNG,
+        IMAGE_TIFF,
+        IMAGE_BMP,
+        IMAGE_GIF,
+        IMAGE_WEBP,
+        APPLICATION_PDF,
+    ]
diff --git a/src/unstract/adapters/ocr/google_document_ai/README.md b/src/unstract/adapters/ocr/google_document_ai/README.md
new file mode 100644
index 0000000..f8b83e5
--- /dev/null
+++ b/src/unstract/adapters/ocr/google_document_ai/README.md
@@ -0,0 +1 @@
+# Unstract Google Document AI OCR Adapter
diff --git a/src/unstract/adapters/ocr/google_document_ai/pyproject.toml b/src/unstract/adapters/ocr/google_document_ai/pyproject.toml
new file mode 100644
index 0000000..b4070fa
--- /dev/null
+++ b/src/unstract/adapters/ocr/google_document_ai/pyproject.toml
@@ -0,0 +1,26 @@
+[build-system]
+requires = ["pdm-backend"]
+build-backend = "pdm.backend"
+
+
+[project]
+name = "unstract-googledocumentai-ocr"
+version = "0.0.1"
+description = "Google Document AI OCR"
+authors = [
+    {name = "Zipstack Inc.", email = "devsupport@zipstack.com"},
+]
+dependencies = [
+
+]
+requires-python = ">=3.9"
+readme = "README.md"
+classifiers = [
+  "Programming Language :: Python"
+]
+license = {text = "MIT"}
+
+[tool.pdm.build]
+includes = ["src"]
+package-dir = "src"
+# source-includes = ["tests"]
diff --git a/src/unstract/adapters/ocr/google_document_ai/src/README.md b/src/unstract/adapters/ocr/google_document_ai/src/README.md
new file mode 100644
index 0000000..f8b83e5
--- /dev/null
+++ b/src/unstract/adapters/ocr/google_document_ai/src/README.md
@@ -0,0 +1 @@
+# Unstract Google Document AI OCR Adapter
diff --git a/src/unstract/adapters/ocr/google_document_ai/src/__init__.py b/src/unstract/adapters/ocr/google_document_ai/src/__init__.py
new file mode 100644
index 0000000..c600bba
--- /dev/null
+++ b/src/unstract/adapters/ocr/google_document_ai/src/__init__.py
@@ -0,0 +1,9 @@
+from .google_document_ai import GoogleDocumentAI
+
+metadata = {
+    "name": GoogleDocumentAI.__name__,
+    "version": "1.0.0",
+    "adapter": GoogleDocumentAI,
+    "description": "Google Document AI OCR adapter",
+    "is_active": True,
+}
diff --git a/src/unstract/adapters/ocr/google_document_ai/src/google_document_ai.py b/src/unstract/adapters/ocr/google_document_ai/src/google_document_ai.py
new file mode 100644
index 0000000..f50a1ab
--- /dev/null
+++ b/src/unstract/adapters/ocr/google_document_ai/src/google_document_ai.py
@@ -0,0 +1,167 @@
+import base64
+import json
+import logging
+import os
+from typing import Any, Optional
+
+import requests
+from filetype import filetype
+from google.auth.transport import requests as google_requests
+from google.oauth2.service_account import Credentials
+
+from unstract.adapters.exceptions import AdapterError
+from unstract.adapters.ocr.constants import FileType
+from unstract.adapters.ocr.ocr_adapter import OCRAdapter
+
+logger = logging.getLogger(__name__)
+
+
+class GoogleDocumentAIKey:
+    RAW_DOCUMENT = "rawDocument"
+    MIME_TYPE = "mimeType"
+    CONTENT = "content"
+    SKIP_HUMAN_REVIEW = "skipHumanReview"
+    FIELD_MASK = "fieldMask"
+
+
+class Constants:
+    URL = "url"
+    CREDENTIALS = "credentials"
+    CREDENTIAL_SCOPES = ["https://www.googleapis.com/auth/cloud-platform"]
+
+
+class GoogleDocumentAI(OCRAdapter):
+    def __init__(self, settings: dict[str, Any]):
+        super().__init__("GoogleDocumentAI")
+        self.config = settings
+        google_service_account = self.config.get(Constants.CREDENTIALS)
+        if not google_service_account:
+            logger.error("Google service account not found")
+        else:
+            self.google_service_account = json.loads(google_service_account)
+
+    @staticmethod
+    def get_id() -> str:
+        return "googledocumentai|1013f64b-ecc9-4e35-b986-aebd60fb55d7"
+
+    @staticmethod
+    def get_name() -> str:
+        return "GoogleDocumentAI"
+
+    @staticmethod
+    def get_description() -> str:
+        return "Google Document AI OCR"
+
+    @staticmethod
+    def get_icon() -> str:
+        return (
+            "https://storage.googleapis.com/pandora-static/"
+            "adapter-icons/GoogleDocumentAI.png"
+        )
+
+    @staticmethod
+    def get_json_schema() -> str:
+        f = open(f"{os.path.dirname(__file__)}/static/json_schema.json")
+        schema = f.read()
+        f.close()
+        return schema
+
+    def __get_request_body(
+        self, file_type_mime: str, file_content_in_bytes: bytes
+    ) -> dict[str, Any]:
+        return {
+            GoogleDocumentAIKey.RAW_DOCUMENT: {
+                GoogleDocumentAIKey.MIME_TYPE: file_type_mime,
+                GoogleDocumentAIKey.CONTENT: base64.b64encode(
+                    file_content_in_bytes
+                ).decode("utf-8"),
+            },
+            GoogleDocumentAIKey.SKIP_HUMAN_REVIEW: True,
+            GoogleDocumentAIKey.FIELD_MASK: "text",
+        }
+
+    def __get_request_headers(self) -> dict[str, Any]:
+        credentials = Credentials.from_service_account_info(
+            self.google_service_account, scopes=Constants.CREDENTIAL_SCOPES
+        )
+        credentials.refresh(google_requests.Request())
+
+        return {
+            "Content-Type": "application/json; charset=utf-8",
+            "Authorization": f"Bearer {credentials.token}",
+        }
+
+    def __get_input_file_type_mime(self, input_file_path: str) -> str:
+        with open(input_file_path, mode="rb") as file_obj:
+            sample_contents = file_obj.read(100)
+            file_type = filetype.guess(sample_contents)
+
+        file_type_mime: str = (
+            file_type.MIME if file_type else FileType.TEXT_PLAIN
+        )
+
+        if file_type_mime not in FileType.ALLOWED_TYPES:
+            logger.error("Input file type not supported: " f"{file_type_mime}")
+
+        logger.info(f"file: `{input_file_path} [{file_type_mime}]`\n\n")
+
+        return file_type_mime
+
+    def process(
+        self, input_file_path: str, output_file_path: Optional[str] = None
+    ) -> str:
+        try:
+            file_type_mime = self.__get_input_file_type_mime(input_file_path)
+            if os.path.isfile(input_file_path):
+                with open(input_file_path, "rb") as fop:
+                    file_content_in_bytes: bytes = fop.read()
+            else:
+                raise AdapterError(f"File not found {input_file_path}")
+            processor_url = self.config.get(Constants.URL, "") + ":process"
+            headers = self.__get_request_headers()
+            data = self.__get_request_body(
+                file_type_mime=file_type_mime,
+                file_content_in_bytes=file_content_in_bytes,
+            )
+            response = requests.post(processor_url, headers=headers, json=data)
+            if response.status_code != 200:
+                logger.error(
+                    f"Error while calling Google Document AI: {response.text}"
+                )
+            response_json: dict[str, Any] = response.json()
+            result_text: str = response_json["document"]["text"]
+            if output_file_path is not None:
+                with open(output_file_path, "w", encoding="utf-8") as f:
+                    f.write(result_text)
+                    f.close()
+            return result_text
+        except Exception as e:
+            logger.error(f"Error while processing document {e}")
+            if not isinstance(e, AdapterError):
+                raise AdapterError(str(e))
+            else:
+                raise e
+        finally:
+            if fop is not None:
+                fop.close()
+
+    def test_connection(self) -> bool:
+        try:
+            url = self.config.get(Constants.URL, "")
+            headers = self.__get_request_headers()
+            response = requests.get(url, headers=headers)
+            if response.status_code != 200:
+                logger.error(
+                    f"Error while testing Google Document AI: {response.text}"
+                )
+                raise AdapterError(
+                    f"{response.status_code} - {response.reason}"
+                )
+            else:
+                return True
+        except Exception as e:
+            logger.error(f"Error occured while testing adapter {e}")
+            if not isinstance(e, AdapterError):
+                raise AdapterError(str(e))
+            else:
+                raise e
diff --git a/src/unstract/adapters/ocr/google_document_ai/src/static/json_schema.json b/src/unstract/adapters/ocr/google_document_ai/src/static/json_schema.json
new file mode 100644
index 0000000..2785998
--- /dev/null
+++ b/src/unstract/adapters/ocr/google_document_ai/src/static/json_schema.json
@@ -0,0 +1,29 @@
+{
+  "title": "Google Document AI OCR",
+  "type": "object",
+  "required": [
+    "adapter_name",
+    "url",
+    "credentials"
+  ],
+  "properties": {
+    "adapter_name": {
+      "type": "string",
+      "title": "OCR Adapter ID",
+      "default": "",
+      "description": "Provide a unique name for this adapter instance. Example: google-document-ai-1"
+    },
+    "url": {
+      "type": "string",
+      "title": "URL",
+      "default": "",
+      "format": "uri",
+      "description": "The URL of the Google Document AI endpoint for the processor. Example: https://{endpoint}/v1/projects/{project}/locations/{location}/processors/{processor}"
+    },
+    "credentials": {
+      "type": "string",
+      "title": "Google Service Account",
+      "deafult": ""
+    }
+  }
+}
diff --git a/src/unstract/adapters/ocr/ocr_adapter.py b/src/unstract/adapters/ocr/ocr_adapter.py
new file mode 100644
index 0000000..56192ce
--- /dev/null
+++ b/src/unstract/adapters/ocr/ocr_adapter.py
@@ -0,0 +1,41 @@
+from abc import ABC
+from typing import Any
+
+from unstract.adapters.base import Adapter
+from unstract.adapters.enums import AdapterTypes
+
+
+class OCRAdapter(Adapter, ABC):
+    def __init__(self, name: str):
+        super().__init__(name)
+        self.name = name
+
+    @staticmethod
+    def get_id() -> str:
+        return ""
+
+    @staticmethod
+    def get_name() -> str:
+        return ""
+
+    @staticmethod
+    def get_description() -> str:
+        return ""
+
+    @staticmethod
+    def get_icon() -> str:
+        return ""
+
+    @staticmethod
+    def get_json_schema() -> str:
+        return ""
+
+    @staticmethod
+    def get_adapter_type() -> AdapterTypes:
+        return AdapterTypes.OCR
+
+    def process(self, input_file_path: str, output_file_path: str) -> str:
+        return ""
+
+    def test_connection(self, llm_metadata: dict[str, Any]) -> bool:
+        return False
diff --git a/src/unstract/adapters/ocr/register.py b/src/unstract/adapters/ocr/register.py
new file mode 100644
index 0000000..3d379c4
--- /dev/null
+++ b/src/unstract/adapters/ocr/register.py
@@ -0,0 +1,49 @@
+import logging
+import os
+from importlib import import_module
+from typing import Any
+
+from unstract.adapters.constants import Common
+from unstract.adapters.ocr.ocr_adapter import OCRAdapter
+from unstract.adapters.registry import AdapterRegistry
+
+logger = logging.getLogger(__name__)
+
+
+class OCRRegistry(AdapterRegistry):
+    @staticmethod
+    def register_adapters(adapters: dict[str, Any]) -> None:
+        current_directory = os.path.dirname(os.path.abspath(__file__))
+        package = "unstract.adapters.ocr"
+
+        for adapter in os.listdir(current_directory):
+            adapter_path = os.path.join(
+                current_directory, adapter, Common.SRC_FOLDER
+            )
+            # Check if the item is a directory and not a
+            # special directory like __pycache__
+            if os.path.isdir(adapter_path) and not adapter.startswith("__"):
+                OCRRegistry.__build_adapter_list(adapter, package, adapters)
+        if len(adapters) == 0:
+            logger.warning("No ocr adapter found.")
+
+    @staticmethod
+    def __build_adapter_list(
+        adapter: str, package: str, adapters: dict[str, Any]
+    ) -> None:
+        try:
+            full_module_path = f"{package}.{adapter}.{Common.SRC_FOLDER}"
+            module = import_module(full_module_path)
+            metadata = getattr(module, Common.METADATA, {})
+            if metadata.get("is_active", False):
+                adapter_class: OCRAdapter = metadata[Common.ADAPTER]
+                adapter_id = adapter_class.get_id()
+                if not adapter_id or (adapter_id in adapters):
+                    logger.warning(f"Duplicate Id : {adapter_id}")
+                else:
+                    adapters[adapter_id] = {
+                        Common.MODULE: module,
+                        Common.METADATA: metadata,
+                    }
+        except ModuleNotFoundError as exception:
+            logger.error(f"Error while importing ocr adapters : {exception}")

From a09f4b8e3ca5b65c66900c651bbb0282541b3fd8 Mon Sep 17 00:00:00 2001
From: gayathrivijayakumar <gayathri@zipstack.com>
Date: Wed, 21 Feb 2024 23:20:31 +0530
Subject: [PATCH 2/7] Fix function signature

---
 src/unstract/adapters/ocr/ocr_adapter.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/unstract/adapters/ocr/ocr_adapter.py b/src/unstract/adapters/ocr/ocr_adapter.py
index 56192ce..d603b67 100644
--- a/src/unstract/adapters/ocr/ocr_adapter.py
+++ b/src/unstract/adapters/ocr/ocr_adapter.py
@@ -1,5 +1,5 @@
 from abc import ABC
-from typing import Any
+from typing import Any, Optional
 
 from unstract.adapters.base import Adapter
 from unstract.adapters.enums import AdapterTypes
@@ -34,7 +34,10 @@ def get_json_schema() -> str:
     def get_adapter_type() -> AdapterTypes:
         return AdapterTypes.OCR
 
-    def process(self, input_file_path: str, output_file_path: str) -> str:
+    def process(
+        self, input_file_path: str, output_file_path: Optional[str] = None
+    ) -> str:
+        # Overriding methods will contain actual implementation
         return ""
 
     def test_connection(self, llm_metadata: dict[str, Any]) -> bool:

From 2d1079efde9c279ff816171325c03ac56c1610d3 Mon Sep 17 00:00:00 2001
From: gayathrivijayakumar <gayathri@zipstack.com>
Date: Wed, 21 Feb 2024 23:25:58 +0530
Subject: [PATCH 3/7] Add .idea to be ignored by Git commits

---
 .gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 6769e21..2dc53ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -157,4 +157,4 @@ cython_debug/
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
\ No newline at end of file
+.idea/

From 09f60c5257158f0debd861974e2886a29a3fb534 Mon Sep 17 00:00:00 2001
From: gayathrivijayakumar <gayathri@zipstack.com>
Date: Mon, 26 Feb 2024 12:03:11 +0530
Subject: [PATCH 4/7] Roll up version for adapter changes for OCR

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3b8fee7..5ef3b1f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "pdm.backend"
 
 [project]
 name = "unstract-adapters"
-version = "0.2.1"
+version = "0.2.2"
 description = "Unstract Adapters"
 authors = [
     {name = "Zipstack Inc.", email = "devsupport@zipstack.com"},

From c70cd1eea52bca93dc70f8fcba04a194b3bee50e Mon Sep 17 00:00:00 2001
From: gayathrivijayakumar <gayathri@zipstack.com>
Date: Mon, 26 Feb 2024 18:47:04 +0530
Subject: [PATCH 5/7] Remove unwanted space

---
 src/unstract/adapters/adapterkit.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/unstract/adapters/adapterkit.py b/src/unstract/adapters/adapterkit.py
index df8c51a..ff64729 100644
--- a/src/unstract/adapters/adapterkit.py
+++ b/src/unstract/adapters/adapterkit.py
@@ -20,7 +20,7 @@ def __init__(self) -> None:
             | llm_adapters
             | vectordb_adapters
             | x2text_adapters
-            |  ocr_adapters
+            | ocr_adapters
         )
 
     @property

From 39187f03ca76cea5e7bd3400f5484b7287a2b555 Mon Sep 17 00:00:00 2001
From: gayathrivijayakumar <gayathri@zipstack.com>
Date: Mon, 26 Feb 2024 20:35:24 +0530
Subject: [PATCH 6/7] Private method func name refactoring

---
 src/unstract/adapters/embedding/register.py   | 24 ++++++++++++-------
 src/unstract/adapters/llm/register.py         |  4 ++--
 .../src/google_document_ai.py                 | 21 ++++++++++------
 .../src/static/json_schema.json               |  3 ++-
 src/unstract/adapters/ocr/register.py         |  4 ++--
 src/unstract/adapters/vectordb/register.py    | 20 +++++++++++-----
 src/unstract/adapters/x2text/register.py      |  4 ++--
 7 files changed, 52 insertions(+), 28 deletions(-)

diff --git a/src/unstract/adapters/embedding/register.py b/src/unstract/adapters/embedding/register.py
index 74a9328..54c8adb 100644
--- a/src/unstract/adapters/embedding/register.py
+++ b/src/unstract/adapters/embedding/register.py
@@ -3,29 +3,36 @@
 from importlib import import_module
 from typing import Any
 
-from unstract.adapters.registry import AdapterRegistry
 from unstract.adapters.constants import Common
 from unstract.adapters.embedding.embedding_adapter import EmbeddingAdapter
+from unstract.adapters.registry import AdapterRegistry
 
 logger = logging.getLogger(__name__)
 
-class EmbeddingRegistry(AdapterRegistry):
 
+class EmbeddingRegistry(AdapterRegistry):
     @staticmethod
     def register_adapters(adapters: dict[str, Any]) -> None:
         current_directory = os.path.dirname(os.path.abspath(__file__))
         package = "unstract.adapters.embedding"
 
         for adapter in os.listdir(current_directory):
-            adapter_path = os.path.join(current_directory, adapter, Common.SRC_FOLDER)
-            # Check if the item is a directory and not a special directory like __pycache__
+            adapter_path = os.path.join(
+                current_directory, adapter, Common.SRC_FOLDER
+            )
+            # Check if the item is a directory and not
+            # a special directory like __pycache__
             if os.path.isdir(adapter_path) and not adapter.startswith("__"):
-                EmbeddingRegistry.__build_adapter_list(adapter, package, adapters)
+                EmbeddingRegistry._build_adapter_list(
+                    adapter, package, adapters
+                )
         if len(adapters) == 0:
             logger.warning("No embedding adapter found.")
 
     @staticmethod
-    def __build_adapter_list(adapter: str, package: str, adapters: dict[str, Any]) -> None:
+    def _build_adapter_list(
+        adapter: str, package: str, adapters: dict[str, Any]
+    ) -> None:
         try:
             full_module_path = f"{package}.{adapter}.{Common.SRC_FOLDER}"
             module = import_module(full_module_path)
@@ -41,5 +48,6 @@ def __build_adapter_list(adapter: str, package: str, adapters: dict[str, Any]) -
                         Common.METADATA: metadata,
                     }
         except ModuleNotFoundError as exception:
-            logger.error(f"Error while importing embedding adapters : {exception}")
-
+            logger.error(
+                f"Error while importing embedding adapters : {exception}"
+            )
diff --git a/src/unstract/adapters/llm/register.py b/src/unstract/adapters/llm/register.py
index 4e75593..9137ae6 100644
--- a/src/unstract/adapters/llm/register.py
+++ b/src/unstract/adapters/llm/register.py
@@ -23,12 +23,12 @@ def register_adapters(adapters: dict[str, Any]) -> None:
             # Check if the item is a directory and not a
             # special directory like _pycache__
             if os.path.isdir(adapter_path) and not adapter.startswith("__"):
-                LLMRegistry.__build_adapter_list(adapter, package, adapters)
+                LLMRegistry._build_adapter_list(adapter, package, adapters)
         if len(adapters) == 0:
             logger.warning("No llm adapter found.")
 
     @staticmethod
-    def __build_adapter_list(
+    def _build_adapter_list(
         adapter: str, package: str, adapters: dict[str, Any]
     ) -> None:
         try:
diff --git a/src/unstract/adapters/ocr/google_document_ai/src/google_document_ai.py b/src/unstract/adapters/ocr/google_document_ai/src/google_document_ai.py
index f50a1ab..039ba7f 100644
--- a/src/unstract/adapters/ocr/google_document_ai/src/google_document_ai.py
+++ b/src/unstract/adapters/ocr/google_document_ai/src/google_document_ai.py
@@ -66,7 +66,9 @@ def get_json_schema() -> str:
         f.close()
         return schema
 
-    def __get_request_body(
+    """ Construct the request body to be sent to Google Document AI server """
+
+    def _get_request_body(
         self, file_type_mime: str, file_content_in_bytes: bytes
     ) -> dict[str, Any]:
         return {
@@ -80,7 +82,10 @@ def __get_request_body(
             GoogleDocumentAIKey.FIELD_MASK: "text",
         }
 
-    def __get_request_headers(self) -> dict[str, Any]:
+    """ Construct the request headers to be sent
+    to Google Document AI server """
+
+    def _get_request_headers(self) -> dict[str, Any]:
         credentials = Credentials.from_service_account_info(
             self.google_service_account, scopes=Constants.CREDENTIAL_SCOPES
         )
@@ -91,7 +96,9 @@ def __get_request_headers(self) -> dict[str, Any]:
             "Authorization": f"Bearer {credentials.token}",
         }
 
-    def __get_input_file_type_mime(self, input_file_path: str) -> str:
+    """ Detect the mime type from the file content """
+
+    def _get_input_file_type_mime(self, input_file_path: str) -> str:
         with open(input_file_path, mode="rb") as file_obj:
             sample_contents = file_obj.read(100)
             file_type = filetype.guess(sample_contents)
@@ -111,15 +118,15 @@ def process(
         self, input_file_path: str, output_file_path: Optional[str] = None
     ) -> str:
         try:
-            file_type_mime = self.__get_input_file_type_mime(input_file_path)
+            file_type_mime = self._get_input_file_type_mime(input_file_path)
             if os.path.isfile(input_file_path):
                 with open(input_file_path, "rb") as fop:
                     file_content_in_bytes: bytes = fop.read()
             else:
                 raise AdapterError(f"File not found {input_file_path}")
             processor_url = self.config.get(Constants.URL, "") + ":process"
-            headers = self.__get_request_headers()
-            data = self.__get_request_body(
+            headers = self._get_request_headers()
+            data = self._get_request_body(
                 file_type_mime=file_type_mime,
                 file_content_in_bytes=file_content_in_bytes,
             )
@@ -148,7 +155,7 @@ def process(
     def test_connection(self) -> bool:
         try:
             url = self.config.get(Constants.URL, "")
-            headers = self.__get_request_headers()
+            headers = self._get_request_headers()
             response = requests.get(url, headers=headers)
             if response.status_code != 200:
                 logger.error(
diff --git a/src/unstract/adapters/ocr/google_document_ai/src/static/json_schema.json b/src/unstract/adapters/ocr/google_document_ai/src/static/json_schema.json
index 2785998..bec194e 100644
--- a/src/unstract/adapters/ocr/google_document_ai/src/static/json_schema.json
+++ b/src/unstract/adapters/ocr/google_document_ai/src/static/json_schema.json
@@ -23,7 +23,8 @@
     "credentials": {
       "type": "string",
       "title": "Google Service Account",
-      "deafult": ""
+      "default": "",
+      "description": "Service Account in JSON format"
     }
   }
 }
diff --git a/src/unstract/adapters/ocr/register.py b/src/unstract/adapters/ocr/register.py
index 3d379c4..edecf4b 100644
--- a/src/unstract/adapters/ocr/register.py
+++ b/src/unstract/adapters/ocr/register.py
@@ -23,12 +23,12 @@ def register_adapters(adapters: dict[str, Any]) -> None:
             # Check if the item is a directory and not a
             # special directory like __pycache__
             if os.path.isdir(adapter_path) and not adapter.startswith("__"):
-                OCRRegistry.__build_adapter_list(adapter, package, adapters)
+                OCRRegistry._build_adapter_list(adapter, package, adapters)
         if len(adapters) == 0:
             logger.warning("No ocr adapter found.")
 
     @staticmethod
-    def __build_adapter_list(
+    def _build_adapter_list(
         adapter: str, package: str, adapters: dict[str, Any]
     ) -> None:
         try:
diff --git a/src/unstract/adapters/vectordb/register.py b/src/unstract/adapters/vectordb/register.py
index e0a620b..e7ef48a 100644
--- a/src/unstract/adapters/vectordb/register.py
+++ b/src/unstract/adapters/vectordb/register.py
@@ -3,12 +3,13 @@
 from importlib import import_module
 from typing import Any
 
-from unstract.adapters.registry import AdapterRegistry
 from unstract.adapters.constants import Common
+from unstract.adapters.registry import AdapterRegistry
 from unstract.adapters.vectordb.vectordb_adapter import VectorDBAdapter
 
 logger = logging.getLogger(__name__)
 
+
 class VectorDBRegistry(AdapterRegistry):
     @staticmethod
     def register_adapters(adapters: dict[str, Any]) -> None:
@@ -16,15 +17,20 @@ def register_adapters(adapters: dict[str, Any]) -> None:
         package = "unstract.adapters.vectordb"
 
         for adapter in os.listdir(current_directory):
-            adapter_path = os.path.join(current_directory, adapter, Common.SRC_FOLDER)
-            # Check if the item is a directory and not a special directory like __pycache__
+            adapter_path = os.path.join(
+                current_directory, adapter, Common.SRC_FOLDER
+            )
+            # Check if the item is a directory and not a
+            # special directory like __pycache__
             if os.path.isdir(adapter_path) and not adapter.startswith("__"):
-                VectorDBRegistry.__build_adapter_list(adapter, package, adapters)
+                VectorDBRegistry._build_adapter_list(adapter, package, adapters)
         if len(adapters) == 0:
             logger.warning("No vectorDB adapter found.")
 
     @staticmethod
-    def __build_adapter_list(adapter: str, package: str, adapters: dict[str, Any]) -> None:
+    def _build_adapter_list(
+        adapter: str, package: str, adapters: dict[str, Any]
+    ) -> None:
         try:
             full_module_path = f"{package}.{adapter}.{Common.SRC_FOLDER}"
             module = import_module(full_module_path)
@@ -40,4 +46,6 @@ def __build_adapter_list(adapter: str, package: str, adapters: dict[str, Any]) -
                         Common.METADATA: metadata,
                     }
         except ModuleNotFoundError as exception:
-            logger.error(f"Error while importing vectorDB adapters : {exception}")
+            logger.error(
+                f"Error while importing vectorDB adapters : {exception}"
+            )
diff --git a/src/unstract/adapters/x2text/register.py b/src/unstract/adapters/x2text/register.py
index c90fb5a..27b7881 100644
--- a/src/unstract/adapters/x2text/register.py
+++ b/src/unstract/adapters/x2text/register.py
@@ -23,12 +23,12 @@ def register_adapters(adapters: dict[str, Any]) -> None:
             # Check if the item is a directory and not a
             # special directory like __pycache__
             if os.path.isdir(adapter_path) and not adapter.startswith("__"):
-                X2TextRegistry.__build_adapter_list(adapter, package, adapters)
+                X2TextRegistry._build_adapter_list(adapter, package, adapters)
         if len(adapters) == 0:
             logger.warning("No X2Text adapter found.")
 
     @staticmethod
-    def __build_adapter_list(
+    def _build_adapter_list(
         adapter: str, package: str, adapters: dict[str, Any]
     ) -> None:
         try:

From acb9ef7016899483acc965acf56eea3b2208de0e Mon Sep 17 00:00:00 2001
From: gayathrivijayakumar <gayathri@zipstack.com>
Date: Tue, 27 Feb 2024 13:42:40 +0530
Subject: [PATCH 7/7] Changes to support byte and string content types for
 x2text adapters

---
 .../adapters/x2text/llm_whisperer/src/llm_whisperer.py         | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/unstract/adapters/x2text/llm_whisperer/src/llm_whisperer.py b/src/unstract/adapters/x2text/llm_whisperer/src/llm_whisperer.py
index 8f5548b..595b04c 100644
--- a/src/unstract/adapters/x2text/llm_whisperer/src/llm_whisperer.py
+++ b/src/unstract/adapters/x2text/llm_whisperer/src/llm_whisperer.py
@@ -115,7 +115,8 @@ def process(
                 )
             else:
                 if response.content is not None:
-                    output = str(response.content)
+                    if isinstance(response.content, bytes):
+                        output = response.content.decode("utf-8")
                     if output_file_path is not None:
                         with open(output_file_path, "w", encoding="utf-8") as f:
                             f.write(output)