Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ build-backend = "pdm.backend"

[project]
name = "unstract-adapters"
version = "0.2.1"
version = "0.2.2"
description = "Unstract Adapters"
authors = [
{name = "Zipstack Inc.", email = "devsupport@zipstack.com"},
Expand Down
2 changes: 2 additions & 0 deletions src/unstract/adapters/adapterkit.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from unstract.adapters.constants import Common
from unstract.adapters.embedding import adapters as embedding_adapters
from unstract.adapters.llm import adapters as llm_adapters
from unstract.adapters.ocr import adapters as ocr_adapters
from unstract.adapters.vectordb import adapters as vectordb_adapters
from unstract.adapters.x2text import adapters as x2text_adapters

Expand All @@ -19,6 +20,7 @@ def __init__(self) -> None:
| llm_adapters
| vectordb_adapters
| x2text_adapters
| ocr_adapters
)

@property
Expand Down
24 changes: 16 additions & 8 deletions src/unstract/adapters/embedding/register.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,36 @@
from importlib import import_module
from typing import Any

from unstract.adapters.registry import AdapterRegistry
from unstract.adapters.constants import Common
from unstract.adapters.embedding.embedding_adapter import EmbeddingAdapter
from unstract.adapters.registry import AdapterRegistry

logger = logging.getLogger(__name__)

class EmbeddingRegistry(AdapterRegistry):

class EmbeddingRegistry(AdapterRegistry):
@staticmethod
def register_adapters(adapters: dict[str, Any]) -> None:
current_directory = os.path.dirname(os.path.abspath(__file__))
package = "unstract.adapters.embedding"

for adapter in os.listdir(current_directory):
adapter_path = os.path.join(current_directory, adapter, Common.SRC_FOLDER)
# Check if the item is a directory and not a special directory like __pycache__
adapter_path = os.path.join(
current_directory, adapter, Common.SRC_FOLDER
)
# Check if the item is a directory and not
# a special directory like __pycache__
if os.path.isdir(adapter_path) and not adapter.startswith("__"):
EmbeddingRegistry.__build_adapter_list(adapter, package, adapters)
EmbeddingRegistry._build_adapter_list(
adapter, package, adapters
)
if len(adapters) == 0:
logger.warning("No embedding adapter found.")

@staticmethod
def __build_adapter_list(adapter: str, package: str, adapters: dict[str, Any]) -> None:
def _build_adapter_list(
adapter: str, package: str, adapters: dict[str, Any]
) -> None:
try:
full_module_path = f"{package}.{adapter}.{Common.SRC_FOLDER}"
module = import_module(full_module_path)
Expand All @@ -41,5 +48,6 @@ def __build_adapter_list(adapter: str, package: str, adapters: dict[str, Any]) -
Common.METADATA: metadata,
}
except ModuleNotFoundError as exception:
logger.error(f"Error while importing embedding adapters : {exception}")

logger.error(
f"Error while importing embedding adapters : {exception}"
)
1 change: 1 addition & 0 deletions src/unstract/adapters/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ class AdapterTypes(Enum):
LLM = "LLM"
EMBEDDING = "EMBEDDING"
VECTOR_DB = "VECTOR_DB"
OCR = "OCR"
X2TEXT = "X2TEXT"
4 changes: 2 additions & 2 deletions src/unstract/adapters/llm/register.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@ def register_adapters(adapters: dict[str, Any]) -> None:
# Check if the item is a directory and not a
# special directory like _pycache__
if os.path.isdir(adapter_path) and not adapter.startswith("__"):
LLMRegistry.__build_adapter_list(adapter, package, adapters)
LLMRegistry._build_adapter_list(adapter, package, adapters)
if len(adapters) == 0:
logger.warning("No llm adapter found.")

@staticmethod
def __build_adapter_list(
def _build_adapter_list(
adapter: str, package: str, adapters: dict[str, Any]
) -> None:
try:
Expand Down
5 changes: 5 additions & 0 deletions src/unstract/adapters/ocr/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from unstract.adapters import AdapterDict
from unstract.adapters.ocr.register import OCRRegistry

# Module-level mapping of OCR adapters exposed by this package.
adapters: AdapterDict = {}
# Hand the mapping to the OCR registry at import time.
# NOTE(review): presumably the registry fills `adapters` in place -- confirm
# against OCRRegistry.register_adapters.
OCRRegistry.register_adapters(adapters)
18 changes: 18 additions & 0 deletions src/unstract/adapters/ocr/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
class FileType:
    """MIME type string constants used by the OCR adapters."""

    # Plain text.
    TEXT_PLAIN = "text/plain"

    # Image formats.
    IMAGE_JPEG = "image/jpeg"
    IMAGE_PNG = "image/png"
    IMAGE_TIFF = "image/tiff"
    IMAGE_BMP = "image/bmp"
    IMAGE_GIF = "image/gif"
    IMAGE_WEBP = "image/webp"

    # Document formats.
    APPLICATION_PDF = "application/pdf"

    # MIME types accepted as OCR input. TEXT_PLAIN is not part of this list.
    ALLOWED_TYPES = [
        IMAGE_JPEG,
        IMAGE_PNG,
        IMAGE_TIFF,
        IMAGE_BMP,
        IMAGE_GIF,
        IMAGE_WEBP,
        APPLICATION_PDF,
    ]
1 change: 1 addition & 0 deletions src/unstract/adapters/ocr/google_document_ai/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Unstract Google Document AI OCR Adapter
26 changes: 26 additions & 0 deletions src/unstract/adapters/ocr/google_document_ai/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
[build-system]
requires = ["pdm-backend"]
build-backend = "pdm.backend"


[project]
name = "unstract-googledocumentai-ocr"
version = "0.0.1"
description = "Google Document AI OCR"
authors = [
{name = "Zipstack Inc.", email = "devsupport@zipstack.com"},
]
dependencies = [

]
requires-python = ">=3.9"
readme = "README.md"
classifiers = [
"Programming Language :: Python"
]
license = {text = "MIT"}

[tool.pdm.build]
includes = ["src"]
package-dir = "src"
# source-includes = ["tests"]
1 change: 1 addition & 0 deletions src/unstract/adapters/ocr/google_document_ai/src/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Unstract Google Document AI OCR Adapter
9 changes: 9 additions & 0 deletions src/unstract/adapters/ocr/google_document_ai/src/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from .google_document_ai import GoogleDocumentAI

# Descriptor for the adapter shipped in this package.
# NOTE(review): presumably read by the OCR registry when it imports this
# module -- confirm against OCRRegistry.
metadata = {
    "name": GoogleDocumentAI.__name__,
    "version": "1.0.0",
    "adapter": GoogleDocumentAI,
    "description": "Google Document AI OCR adapter",
    "is_active": True,
}
174 changes: 174 additions & 0 deletions src/unstract/adapters/ocr/google_document_ai/src/google_document_ai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
import base64
import json
import logging
import os
from typing import Any, Optional

import requests
from filetype import filetype
from google.auth.transport import requests as google_requests
from google.oauth2.service_account import Credentials

from unstract.adapters.exceptions import AdapterError
from unstract.adapters.ocr.constants import FileType
from unstract.adapters.ocr.ocr_adapter import OCRAdapter

logger = logging.getLogger(__name__)


class GoogleDocumentAIKey:
    """JSON keys used when building the Google Document AI request body."""

    # Container object holding the inline document.
    RAW_DOCUMENT = "rawDocument"
    # Keys nested inside the raw document object.
    MIME_TYPE = "mimeType"
    CONTENT = "content"
    # Top-level request options.
    SKIP_HUMAN_REVIEW = "skipHumanReview"
    FIELD_MASK = "fieldMask"


class Constants:
    """Adapter settings keys and the OAuth scopes requested for Google APIs."""

    # Keys looked up in the adapter settings dictionary.
    URL = "url"
    CREDENTIALS = "credentials"

    # Scopes passed when building the service account credentials.
    CREDENTIAL_SCOPES = ["https://www.googleapis.com/auth/cloud-platform"]


class GoogleDocumentAI(OCRAdapter):
    """OCR adapter that extracts text through a Google Document AI processor.

    The adapter reads two keys from its settings:

    * ``url`` -- the processor endpoint (``:process`` is appended when a
      document is submitted).
    * ``credentials`` -- a Google service account key as a JSON string.
    """

    def __init__(self, settings: dict[str, Any]):
        """Store the settings and parse the service account JSON.

        Args:
            settings: Adapter configuration holding ``url`` and
                ``credentials``.

        Raises:
            json.JSONDecodeError: If ``credentials`` is present but is not
                valid JSON.
        """
        super().__init__("GoogleDocumentAI")
        self.config = settings
        # Always define the attribute so that missing credentials surface
        # later as an AdapterError instead of an AttributeError.
        self.google_service_account: dict[str, Any] = {}
        google_service_account = self.config.get(Constants.CREDENTIALS)
        if not google_service_account:
            logger.error("Google service account not found")
        else:
            self.google_service_account = json.loads(google_service_account)

    @staticmethod
    def get_id() -> str:
        """Return the unique identifier of this adapter."""
        return "googledocumentai|1013f64b-ecc9-4e35-b986-aebd60fb55d7"

    @staticmethod
    def get_name() -> str:
        """Return the adapter name."""
        return "GoogleDocumentAI"

    @staticmethod
    def get_description() -> str:
        """Return a short description of the adapter."""
        return "Google Document AI OCR"

    @staticmethod
    def get_icon() -> str:
        """Return the URL of the adapter icon."""
        return (
            "https://storage.googleapis.com/pandora-static/"
            "adapter-icons/GoogleDocumentAI.png"
        )

    @staticmethod
    def get_json_schema() -> str:
        """Return the settings JSON schema stored next to this module."""
        schema_path = os.path.join(
            os.path.dirname(__file__), "static", "json_schema.json"
        )
        # The context manager closes the file even if the read fails.
        with open(schema_path, encoding="utf-8") as f:
            return f.read()

    def _get_request_body(
        self, file_type_mime: str, file_content_in_bytes: bytes
    ) -> dict[str, Any]:
        """Construct the request body sent to the Google Document AI server.

        Args:
            file_type_mime: MIME type of the document.
            file_content_in_bytes: Raw document content; it is sent
                base64-encoded.

        Returns:
            The JSON-serialisable request payload.
        """
        return {
            GoogleDocumentAIKey.RAW_DOCUMENT: {
                GoogleDocumentAIKey.MIME_TYPE: file_type_mime,
                GoogleDocumentAIKey.CONTENT: base64.b64encode(
                    file_content_in_bytes
                ).decode("utf-8"),
            },
            GoogleDocumentAIKey.SKIP_HUMAN_REVIEW: True,
            GoogleDocumentAIKey.FIELD_MASK: "text",
        }

    def _get_request_headers(self) -> dict[str, Any]:
        """Construct the request headers sent to the Google Document AI server.

        Builds credentials from the configured service account, refreshes
        them and returns headers carrying the resulting bearer token.

        Raises:
            AdapterError: If no service account was configured.
        """
        if not self.google_service_account:
            raise AdapterError("Google service account not found")
        credentials = Credentials.from_service_account_info(
            self.google_service_account, scopes=Constants.CREDENTIAL_SCOPES
        )
        credentials.refresh(google_requests.Request())

        return {
            "Content-Type": "application/json; charset=utf-8",
            "Authorization": f"Bearer {credentials.token}",
        }

    def _get_input_file_type_mime(self, input_file_path: str) -> str:
        """Detect the MIME type from the leading bytes of the file.

        Falls back to ``text/plain`` when the type cannot be guessed. An
        unsupported type is only logged; the detected value is still
        returned.
        """
        with open(input_file_path, mode="rb") as file_obj:
            sample_contents = file_obj.read(100)
            file_type = filetype.guess(sample_contents)

        file_type_mime: str = (
            file_type.MIME if file_type else FileType.TEXT_PLAIN
        )

        if file_type_mime not in FileType.ALLOWED_TYPES:
            logger.error("Input file type not supported: " f"{file_type_mime}")

        logger.info(f"file: `{input_file_path} [{file_type_mime}]`\n\n")

        return file_type_mime

    def process(
        self, input_file_path: str, output_file_path: Optional[str] = None
    ) -> str:
        """Run OCR on a file and return the extracted text.

        Args:
            input_file_path: Path of the document to process.
            output_file_path: Optional path; when given, the extracted text
                is also written there as UTF-8.

        Returns:
            The text extracted by Google Document AI.

        Raises:
            AdapterError: If the file does not exist, the service responds
                with a non-200 status, or any other error occurs.
        """
        try:
            # Check existence before anything opens the file, so that a
            # missing file is reported with a clear message.
            if not os.path.isfile(input_file_path):
                raise AdapterError(f"File not found {input_file_path}")
            file_type_mime = self._get_input_file_type_mime(input_file_path)
            with open(input_file_path, "rb") as fop:
                file_content_in_bytes: bytes = fop.read()

            processor_url = self.config.get(Constants.URL, "") + ":process"
            headers = self._get_request_headers()
            data = self._get_request_body(
                file_type_mime=file_type_mime,
                file_content_in_bytes=file_content_in_bytes,
            )
            response = requests.post(processor_url, headers=headers, json=data)
            if response.status_code != 200:
                logger.error(
                    f"Error while calling Google Document AI: {response.text}"
                )
                # Fail here with the HTTP status rather than later with an
                # opaque KeyError while parsing an error payload.
                raise AdapterError(
                    f"{response.status_code} - {response.reason}"
                )
            response_json: dict[str, Any] = response.json()
            result_text: str = response_json["document"]["text"]
            if output_file_path is not None:
                with open(output_file_path, "w", encoding="utf-8") as f:
                    f.write(result_text)
            return result_text
        except AdapterError as e:
            logger.error(f"Error while processing document {e}")
            raise
        except Exception as e:
            logger.error(f"Error while processing document {e}")
            raise AdapterError(str(e)) from e

    def test_connection(self) -> bool:
        """Check that the configured processor URL is reachable.

        Returns:
            True when the endpoint answers with HTTP 200.

        Raises:
            AdapterError: If the endpoint answers with another status or any
                other error occurs.
        """
        try:
            url = self.config.get(Constants.URL, "")
            headers = self._get_request_headers()
            response = requests.get(url, headers=headers)
            if response.status_code != 200:
                logger.error(
                    f"Error while testing Google Document AI: {response.text}"
                )
                raise AdapterError(
                    f"{response.status_code} - {response.reason}"
                )
            return True
        except AdapterError as e:
            logger.error(f"Error occured while testing adapter {e}")
            raise
        except Exception as e:
            logger.error(f"Error occured while testing adapter {e}")
            raise AdapterError(str(e)) from e
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"title": "Google Document AI OCR",
"type": "object",
"required": [
"adapter_name",
"url",
"credentials"
],
"properties": {
"adapter_name": {
"type": "string",
"title": "OCR Adapter ID",
"default": "",
"description": "Provide a unique name for this adapter instance. Example: google-document-ai-1"
},
"url": {
"type": "string",
"title": "URL",
"default": "",
"format": "uri",
"description": "The URL of the Google Document AI endpoint for the processor. Example: https://{endpoint}/v1/projects/{project}/locations/{location}/processors/{processor}"
},
"credentials": {
"type": "string",
"title": "Google Service Account",
"default": "",
"description": "Service Account in JSON format"
}
}
}
Loading