From 15d11f40fd3b78b06782bb0d794a0a74a9000595 Mon Sep 17 00:00:00 2001 From: harini-venkataraman Date: Thu, 23 May 2024 18:16:20 +0530 Subject: [PATCH 1/9] Exception handling for Prompt Service --- src/unstract/sdk/__init__.py | 2 +- src/unstract/sdk/prompt.py | 46 ++++++++++++++++++++++++++---------- 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/src/unstract/sdk/__init__.py b/src/unstract/sdk/__init__.py index a0ae66de..610a2805 100644 --- a/src/unstract/sdk/__init__.py +++ b/src/unstract/sdk/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.27.0" +__version__ = "0.28.0" def get_sdk_version(): diff --git a/src/unstract/sdk/prompt.py b/src/unstract/sdk/prompt.py index a03d8915..1cf3e04b 100644 --- a/src/unstract/sdk/prompt.py +++ b/src/unstract/sdk/prompt.py @@ -1,7 +1,14 @@ from typing import Any, Optional import requests -from requests import RequestException, Response +from requests import ( + ConnectionError, + HTTPError, + RequestException, + Response, + Timeout, + TooManyRedirects, +) from unstract.sdk.constants import LogLevel, PromptStudioKeys, ToolEnv from unstract.sdk.helper import SdkHelper @@ -25,9 +32,7 @@ def __init__( """ self.tool = tool - self.base_url = SdkHelper.get_platform_base_url( - prompt_host, prompt_port - ) + self.base_url = SdkHelper.get_platform_base_url(prompt_host, prompt_port) self.bearer_token = tool.get_env_or_die(ToolEnv.PLATFORM_API_KEY) def answer_prompt(self, payload: dict[str, Any]) -> dict[str, Any]: @@ -36,9 +41,7 @@ def answer_prompt(self, payload: dict[str, Any]) -> dict[str, Any]: def single_pass_extraction(self, payload: dict[str, Any]) -> dict[str, Any]: return self._post_call("single-pass-extraction", payload) - def _post_call( - self, url_path: str, payload: dict[str, Any] - ) -> dict[str, Any]: + def _post_call(self, url_path: str, payload: dict[str, Any]) -> dict[str, Any]: """Invokes and communicates to prompt service to fetch response for the prompt. 
@@ -63,17 +66,26 @@ def _post_call( "structure_output": "", } url: str = f"{self.base_url}/{url_path}" - headers: dict[str, str] = { - "Authorization": f"Bearer {self.bearer_token}" - } + headers: dict[str, str] = {"Authorization": f"Bearer {self.bearer_token}"} + response: Response = Response() try: # TODO: Review timeout value - response: Response = requests.post( - url, json=payload, headers=headers, timeout=600 - ) + response = requests.post(url, json=payload, headers=headers, timeout=600) response.raise_for_status() result["status"] = "OK" result["structure_output"] = response.text + except ConnectionError as connect_err: + msg = "Unable to connect to prompt service. Please contact admin." + result["error"] = self._stringify_and_stream_err(connect_err, msg) + except Timeout as time_out: + msg = "Request to run prompt has timed out" + result["error"] = self._stringify_and_stream_err(time_out, msg) + except TooManyRedirects as too_many_redirects: + msg = "Too many redirects while connecting to prompt service." + result["error"] = self._stringify_and_stream_err(too_many_redirects, msg) + except HTTPError as http_err: + msg = "Error while fetching prompt response." 
+ result["error"] = self._stringify_and_stream_err(http_err, msg) except RequestException as e: # Extract error information from the response if available error_message = str(e) @@ -91,6 +103,14 @@ def _post_call( ) return result + def _stringify_and_stream_err(self, err: RequestException, msg: str) -> str: + error_message = str(err) + self.tool.stream_log( + f"{msg}: {error_message}", + level=LogLevel.ERROR, + ) + return error_message + @staticmethod def get_exported_tool( tool: BaseTool, prompt_registry_id: str From 28c47189f74b7f1e7f59e3e42051f836a9fdf209 Mon Sep 17 00:00:00 2001 From: harini-venkataraman Date: Mon, 14 Oct 2024 22:37:37 +0530 Subject: [PATCH 2/9] LLM Whisperer adapter v2 --- .../x2text/llm_whisperer v2/README.md | 10 + .../x2text/llm_whisperer v2/pyproject.toml | 25 ++ .../x2text/llm_whisperer v2/src/__init__.py | 9 + .../x2text/llm_whisperer v2/src/constants.py | 102 +++++ .../llm_whisperer v2/src/llm_whisperer_v2.py | 410 ++++++++++++++++++ .../src/static/json_schema.json | 124 ++++++ 6 files changed, 680 insertions(+) create mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer v2/README.md create mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer v2/pyproject.toml create mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/__init__.py create mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/constants.py create mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/llm_whisperer_v2.py create mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/static/json_schema.json diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer v2/README.md b/src/unstract/sdk/adapters/x2text/llm_whisperer v2/README.md new file mode 100644 index 00000000..2b64f31d --- /dev/null +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer v2/README.md @@ -0,0 +1,10 @@ +# Unstract LLM Whisperer X2Text Adapter + +## Env variables + +The below env variables are resolved by LLM Whisperer adapter + +| Variable 
| Description | +| ---------------------------- | -------------------------------------------------------------------------------------------- | +| `ADAPTER_LLMW_POLL_INTERVAL` | Time in seconds to wait before polling LLMWhisperer's status API. Defaults to 30s | +| `ADAPTER_LLMW_MAX_POLLS` | Total number of times to poll the status API. Defaults to 30 | diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer v2/pyproject.toml b/src/unstract/sdk/adapters/x2text/llm_whisperer v2/pyproject.toml new file mode 100644 index 00000000..bf7ad3a4 --- /dev/null +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer v2/pyproject.toml @@ -0,0 +1,25 @@ +[build-system] +requires = ["pdm-backend"] +build-backend = "pdm.backend" + + +[project] +name = "unstract-llm_whisperer-x2text-v2" +version = "0.0.1" +description = "V2 of LLMWhisperer X2Text Adapter" +authors = [ + {name = "Zipstack Inc.", email = "devsupport@zipstack.com"}, +] +dependencies = [ +] +requires-python = ">=3.9" +readme = "README.md" +classifiers = [ + "Programming Language :: Python" +] +license = {text = "MIT"} + +[tool.pdm.build] +includes = ["src"] +package-dir = "src" +# source-includes = ["tests"] diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/__init__.py b/src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/__init__.py new file mode 100644 index 00000000..ba216498 --- /dev/null +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/__init__.py @@ -0,0 +1,9 @@ +from .llm_whisperer import LLMWhisperer + +metadata = { + "name": LLMWhisperer.__name__, + "version": "1.0.0", + "adapter": LLMWhisperer, + "description": "LLMWhisperer X2Text adapter", + "is_active": True, +} diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/constants.py b/src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/constants.py new file mode 100644 index 00000000..016db230 --- /dev/null +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/constants.py @@ -0,0 +1,102 @@ +import os +from 
enum import Enum + + +class ProcessingModes(Enum): + OCR = "ocr" + TEXT = "text" + + +class Modes(Enum): + NATIVE_TEXT = "native_text" + LOW_COST = "low_cost" + HIGH_QUALITY = "high_quality" + FORM = "form" + + +class OutputModes(Enum): + LINE_PRINTER = "line-printer" + DUMP_TEXT = "dump-text" + TEXT = "text" + + +class HTTPMethod(Enum): + GET = "GET" + POST = "POST" + + +class WhispererHeader: + UNSTRACT_KEY = "unstract-key" + + +class WhispererEndpoint: + """Endpoints available at LLMWhisperer service.""" + + TEST_CONNECTION = "test-connection" + WHISPER = "whisper" + STATUS = "whisper-status" + RETRIEVE = "whisper-retrieve" + + +class WhispererEnv: + """Env variables for LLM whisperer. + + Can be used to alter behaviour at runtime. + + Attributes: + POLL_INTERVAL: Time in seconds to wait before polling + LLMWhisperer's status API. Defaults to 30s + MAX_POLLS: Total number of times to poll the status API. + Set to -1 to poll indefinitely. Defaults to -1 + """ + + POLL_INTERVAL = "ADAPTER_LLMW_POLL_INTERVAL" + MAX_POLLS = "ADAPTER_LLMW_MAX_POLLS" + + +class WhispererConfig: + """Dictionary keys used to configure LLMWhisperer service.""" + + URL = "url" + PROCESSING_MODE = "processing_mode" + MODE = "mode" + OUTPUT_MODE = "output_mode" + UNSTRACT_KEY = "unstract_key" + MEDIAN_FILTER_SIZE = "median_filter_size" + GAUSSIAN_BLUR_RADIUS = "gaussian_blur_radius" + FORCE_TEXT_PROCESSING = "force_text_processing" + LINE_SPLITTER_TOLERANCE = "line_splitter_tolerance" + HORIZONTAL_STRETCH_FACTOR = "horizontal_stretch_factor" + PAGES_TO_EXTRACT = "pages_to_extract" + STORE_METADATA_FOR_HIGHLIGHTING = "store_metadata_for_highlighting" + ADD_LINE_NOS = "add_line_nos" + OUTPUT_JSON = "output_json" + PAGE_SEPARATOR = "page_seperator" + + +class WhisperStatus: + """Values returned / used by /whisper-status endpoint.""" + + PROCESSING = "processing" + PROCESSED = "processed" + DELIVERED = "delivered" + UNKNOWN = "unknown" + # Used for async processing + WHISPER_HASH = 
"whisper-hash" + STATUS = "status" + + +class WhispererDefaults: + """Defaults meant for LLM whisperer.""" + + MEDIAN_FILTER_SIZE = 0 + GAUSSIAN_BLUR_RADIUS = 0.0 + FORCE_TEXT_PROCESSING = False + LINE_SPLITTER_TOLERANCE = 0.75 + HORIZONTAL_STRETCH_FACTOR = 1.0 + POLL_INTERVAL = int(os.getenv(WhispererEnv.POLL_INTERVAL, 30)) + MAX_POLLS = int(os.getenv(WhispererEnv.MAX_POLLS, 30)) + PAGES_TO_EXTRACT = "" + ADD_LINE_NOS = True + OUTPUT_JSON = True + PAGE_SEPARATOR = "<<< >>>" diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/llm_whisperer_v2.py b/src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/llm_whisperer_v2.py new file mode 100644 index 00000000..efa87441 --- /dev/null +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/llm_whisperer_v2.py @@ -0,0 +1,410 @@ +import json +import logging +import os +import time +from pathlib import Path +from typing import Any, Optional + +import requests +from requests import Response +from requests.exceptions import ConnectionError, HTTPError, Timeout + +from unstract.sdk.adapters.exceptions import ExtractorError +from unstract.sdk.adapters.utils import AdapterUtils +from unstract.sdk.adapters.x2text.constants import X2TextConstants +from unstract.sdk.adapters.x2text.dto import ( + TextExtractionMetadata, + TextExtractionResult, +) +from unstract.sdk.adapters.x2text.llm_whisperer.src.constants import ( + HTTPMethod, + OutputModes, + ProcessingModes, + WhispererConfig, + WhispererDefaults, + WhispererEndpoint, + WhispererHeader, + WhisperStatus, +) +from unstract.sdk.adapters.x2text.x2text_adapter import X2TextAdapter + +logger = logging.getLogger(__name__) + + +class LLMWhispererV2(X2TextAdapter): + def __init__(self, settings: dict[str, Any]): + super().__init__("LLMWhispererV2") + self.config = settings + + @staticmethod + def get_id() -> str: + return "llmwhisperer|0a1647f0-f65f-410d-843b-3d979c78350e" + + @staticmethod + def get_name() -> str: + return "LLMWhisperer" + + @staticmethod + def 
get_description() -> str: + return "LLMWhisperer V2 X2Text" + + @staticmethod + def get_icon() -> str: + return "/icons/adapter-icons/LLMWhispererV2.png" + + @staticmethod + def get_json_schema() -> str: + f = open(f"{os.path.dirname(__file__)}/static/json_schema.json") + schema = f.read() + f.close() + return schema + + def _get_request_headers(self) -> dict[str, Any]: + """Obtains the request headers to authenticate with LLM Whisperer. + + Returns: + str: Request headers + """ + return { + "accept": "application/json", + WhispererHeader.UNSTRACT_KEY: self.config.get(WhispererConfig.UNSTRACT_KEY), + } + + def _make_request( + self, + request_method: HTTPMethod, + request_endpoint: str, + headers: Optional[dict[str, Any]] = None, + params: Optional[dict[str, Any]] = None, + data: Optional[Any] = None, + ) -> Response: + """Makes a request to LLM whisperer service. + + Args: + request_method (HTTPMethod): HTTPMethod to call. Can be GET or POST + request_endpoint (str): LLM whisperer endpoint to hit + headers (Optional[dict[str, Any]], optional): Headers to pass. + Defaults to None. + params (Optional[dict[str, Any]], optional): Query params to pass. + Defaults to None. + data (Optional[Any], optional): Data to pass in case of POST. + Defaults to None. 
+ + Returns: + Response: Response from the request + """ + llm_whisperer_svc_url = ( + f"{self.config.get(WhispererConfig.URL)}" f"/v1/{request_endpoint}" + ) + if not headers: + headers = self._get_request_headers() + + try: + response: Response + if request_method == HTTPMethod.GET: + response = requests.get( + url=llm_whisperer_svc_url, headers=headers, params=params + ) + elif request_method == HTTPMethod.POST: + response = requests.post( + url=llm_whisperer_svc_url, + headers=headers, + params=params, + data=data, + ) + else: + raise ExtractorError(f"Unsupported request method: {request_method}") + response.raise_for_status() + except ConnectionError as e: + logger.error(f"Adapter error: {e}") + raise ExtractorError( + "Unable to connect to LLM Whisperer service, please check the URL" + ) + except Timeout as e: + msg = "Request to LLM whisperer has timed out" + logger.error(f"{msg}: {e}") + raise ExtractorError(msg) + except HTTPError as e: + logger.error(f"Adapter error: {e}") + default_err = "Error while calling the LLM Whisperer service" + msg = AdapterUtils.get_msg_from_request_exc( + err=e, message_key="message", default_err=default_err + ) + raise ExtractorError(msg) + return response + + def _get_whisper_params(self, enable_highlight: bool = False) -> dict[str, Any]: + """Gets query params meant for /whisper endpoint. + + The params is filled based on the configuration passed. 
+ + Returns: + dict[str, Any]: Query params + """ + params = { + WhispererConfig.PROCESSING_MODE: self.config.get( + WhispererConfig.PROCESSING_MODE, ProcessingModes.TEXT.value + ), + # Not providing default value to maintain legacy compatablity + # Providing default value will overide the params + # processing_mode, force_text_processing + WhispererConfig.MODE: self.config.get(WhispererConfig.MODE), + WhispererConfig.OUTPUT_MODE: self.config.get( + WhispererConfig.OUTPUT_MODE, OutputModes.LINE_PRINTER.value + ), + WhispererConfig.FORCE_TEXT_PROCESSING: self.config.get( + WhispererConfig.FORCE_TEXT_PROCESSING, + WhispererDefaults.FORCE_TEXT_PROCESSING, + ), + WhispererConfig.LINE_SPLITTER_TOLERANCE: self.config.get( + WhispererConfig.LINE_SPLITTER_TOLERANCE, + WhispererDefaults.LINE_SPLITTER_TOLERANCE, + ), + WhispererConfig.HORIZONTAL_STRETCH_FACTOR: self.config.get( + WhispererConfig.HORIZONTAL_STRETCH_FACTOR, + WhispererDefaults.HORIZONTAL_STRETCH_FACTOR, + ), + WhispererConfig.PAGES_TO_EXTRACT: self.config.get( + WhispererConfig.PAGES_TO_EXTRACT, + WhispererDefaults.PAGES_TO_EXTRACT, + ), + WhispererConfig.ADD_LINE_NOS: WhispererDefaults.ADD_LINE_NOS, + WhispererConfig.OUTPUT_JSON: WhispererDefaults.OUTPUT_JSON, + WhispererConfig.PAGE_SEPARATOR: self.config.get( + WhispererConfig.PAGE_SEPARATOR, + WhispererDefaults.PAGE_SEPARATOR, + ), + } + if not params[WhispererConfig.FORCE_TEXT_PROCESSING]: + params.update( + { + WhispererConfig.MEDIAN_FILTER_SIZE: self.config.get( + WhispererConfig.MEDIAN_FILTER_SIZE, + WhispererDefaults.MEDIAN_FILTER_SIZE, + ), + WhispererConfig.GAUSSIAN_BLUR_RADIUS: self.config.get( + WhispererConfig.GAUSSIAN_BLUR_RADIUS, + WhispererDefaults.GAUSSIAN_BLUR_RADIUS, + ), + } + ) + + if enable_highlight: + params.update( + {WhispererConfig.STORE_METADATA_FOR_HIGHLIGHTING: enable_highlight} + ) + return params + + def test_connection(self) -> bool: + self._make_request( + request_method=HTTPMethod.GET, + 
request_endpoint=WhispererEndpoint.TEST_CONNECTION, + ) + return True + + def _check_status_until_ready( + self, whisper_hash: str, headers: dict[str, Any], params: dict[str, Any] + ) -> WhisperStatus: + """Checks the extraction status by polling. + + Polls the /whisper-status endpoint in fixed intervals of + env: ADAPTER_LLMW_POLL_INTERVAL for a certain number of times + controlled by env: ADAPTER_LLMW_MAX_POLLS. + + Args: + whisper_hash (str): Identifier for the extraction, + returned by LLMWhisperer + headers (dict[str, Any]): Headers to pass for the status check + params (dict[str, Any]): Params to pass for the status check + + Returns: + WhisperStatus: Status of the extraction + """ + POLL_INTERVAL = WhispererDefaults.POLL_INTERVAL + MAX_POLLS = WhispererDefaults.MAX_POLLS + request_count = 0 + + # Check status in fixed intervals upto max poll count. + while True: + request_count += 1 + logger.info( + f"Checking status with interval: {POLL_INTERVAL}s" + f", request count: {request_count} [max: {MAX_POLLS}]" + ) + status_response = self._make_request( + request_method=HTTPMethod.GET, + request_endpoint=WhispererEndpoint.STATUS, + headers=headers, + params=params, + ) + if status_response.status_code == 200: + status_data = status_response.json() + status = status_data.get(WhisperStatus.STATUS, WhisperStatus.UNKNOWN) + logger.info(f"Whisper status for {whisper_hash}: {status}") + if status in [WhisperStatus.PROCESSED, WhisperStatus.DELIVERED]: + break + else: + raise ExtractorError( + "Error checking LLMWhisperer status: " + f"{status_response.status_code} - {status_response.text}" + ) + + # Exit with error if max poll count is reached + if request_count >= MAX_POLLS: + raise ExtractorError( + "Unable to extract text after attempting" f" {request_count} times" + ) + time.sleep(POLL_INTERVAL) + + return status + + def _extract_async(self, whisper_hash: str) -> str: + """Makes an async extraction with LLMWhisperer. 
+ + Polls and checks the status first before proceeding to retrieve once. + + Args: + whisper_hash (str): Identifier of the extraction + + Returns: + str: Extracted contents from the file + """ + logger.info(f"Extracting async for whisper hash: {whisper_hash}") + + headers: dict[str, Any] = self._get_request_headers() + params = { + WhisperStatus.WHISPER_HASH: whisper_hash, + WhispererConfig.OUTPUT_JSON: WhispererDefaults.OUTPUT_JSON, + } + + # Polls in fixed intervals and checks status + self._check_status_until_ready( + whisper_hash=whisper_hash, headers=headers, params=params + ) + + retrieve_response = self._make_request( + request_method=HTTPMethod.GET, + request_endpoint=WhispererEndpoint.RETRIEVE, + headers=headers, + params=params, + ) + if retrieve_response.status_code == 200: + return retrieve_response.json() + else: + raise ExtractorError( + "Error retrieving from LLMWhisperer: " + f"{retrieve_response.status_code} - {retrieve_response.text}" + ) + + def _send_whisper_request( + self, input_file_path: str, enable_highlight: bool = False + ) -> requests.Response: + headers = self._get_request_headers() + headers["Content-Type"] = "application/octet-stream" + params = self._get_whisper_params(enable_highlight) + + response: requests.Response + try: + with open(input_file_path, "rb") as input_f: + response = self._make_request( + request_method=HTTPMethod.POST, + request_endpoint=WhispererEndpoint.WHISPER, + headers=headers, + params=params, + data=input_f.read(), + ) + except OSError as e: + logger.error(f"OS error while reading {input_file_path}: {e}") + raise ExtractorError(str(e)) + return response + + def _extract_text_from_response( + self, output_file_path: Optional[str], response: requests.Response + ) -> str: + output_json = {} + if response.status_code == 200: + output_json = response.json() + elif response.status_code == 202: + whisper_hash = response.json().get(WhisperStatus.WHISPER_HASH) + output_json = 
self._extract_async(whisper_hash=whisper_hash) + else: + raise ExtractorError("Couldn't extract text from file") + if output_file_path: + self._write_output_to_file( + output_json=output_json, + output_file_path=Path(output_file_path), + ) + return output_json.get("text", "") + + def _write_output_to_file(self, output_json: dict, output_file_path: Path) -> None: + """Writes the extracted text and metadata to the specified output file + and metadata file. + + Args: + output_json (dict): The dictionary containing the extracted data, + with "text" as the key for the main content. + output_file_path (Path): The file path where the extracted text + should be written. + + Raises: + ExtractorError: If there is an error while writing the output file. + """ + try: + text_output = output_json.get("text", "") + logger.info(f"Writing output to {output_file_path}") + output_file_path.write_text(text_output, encoding="utf-8") + try: + # Define the directory of the output file and metadata paths + output_dir = output_file_path.parent + metadata_dir = output_dir / "metadata" + metadata_file_name = output_file_path.with_suffix(".json").name + metadata_file_path = metadata_dir / metadata_file_name + # Ensure the metadata directory exists + metadata_dir.mkdir(parents=True, exist_ok=True) + # Remove the "text" key from the metadata + metadata = { + key: value for key, value in output_json.items() if key != "text" + } + metadata_json = json.dumps(metadata, ensure_ascii=False, indent=4) + logger.info(f"Writing metadata to {metadata_file_path}") + metadata_file_path.write_text(metadata_json, encoding="utf-8") + except Exception as e: + logger.error( + f"Error while writing metadata to {metadata_file_path}: {e}" + ) + + except Exception as e: + logger.error(f"Error while writing {output_file_path}: {e}") + raise ExtractorError(str(e)) + + def process( + self, + input_file_path: str, + output_file_path: Optional[str] = None, + **kwargs: dict[Any, Any], + ) -> TextExtractionResult: + 
"""Used to extract text from documents. + + Args: + input_file_path (str): Path to file that needs to be extracted + output_file_path (Optional[str], optional): File path to write + extracted text into, if None doesn't write to a file. + Defaults to None. + + Returns: + str: Extracted text + """ + + response: requests.Response = self._send_whisper_request( + input_file_path, + bool(kwargs.get(X2TextConstants.ENABLE_HIGHLIGHT, False)), + ) + + metadata = TextExtractionMetadata( + whisper_hash=response.headers.get(X2TextConstants.WHISPER_HASH, "") + ) + + return TextExtractionResult( + extracted_text=self._extract_text_from_response(output_file_path, response), + extraction_metadata=metadata, + ) diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/static/json_schema.json b/src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/static/json_schema.json new file mode 100644 index 00000000..8e8360f9 --- /dev/null +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/static/json_schema.json @@ -0,0 +1,124 @@ +{ + "title": "LLM Whisperer X2Text v2", + "type": "object", + "required": [ + "adapter_name", + "unstract_key", + "url" + ], + "properties": { + "adapter_name": { + "type": "string", + "title": "Name", + "default": "", + "description": "Provide a unique name for this adapter instance. Example: LLM Whisperer 1" + }, + "url": { + "type": "string", + "title": "URL", + "format": "uri", + "default": "https://llmwhisperer-api.unstract.com", + "description": "Provide the URL of the LLM Whisperer service." + }, + "unstract_key": { + "type": "string", + "title": "Unstract Key", + "format": "password", + "description": "API key obtained from the Unstract developer portal (https://unstract-api-resource.developer.azure-api.net)" + }, + "mode": { + "type": "string", + "title": "Mode", + "enum": [ + "native_text", + "low_cost", + "high_quality", + "form" + ], + "default": "form", + "description": "Native text : Extracts text from PDF without OCR. 
This is very fast and cost effective. Use this mode if you are sure all your PDFs are native text pdfs (not scanned documents). Note that some scanned PDFs are \"searchable\" PDFs. Use the OCR modes for these PDFs as the quality of text in these documents are often poor. \n Low cost : Extracts text from scanned and native PDFs, images and office documents. This OCR mode cannot handle handwriting and low quality scanned pdfs and images. \n High quality : Extracts text from scanned and native PDFs, images and office documents. This OCR mode can handle handwriting and low quality scanned pdfs and images. \n Form: Extracts text from scanned and native PDFs, images and office documents. This OCR mode can handle handwriting and low quality scanned pdfs and images. Can also extract information about checkboxes and radio button" + }, + "output_mode": { + "type": "string", + "title": "Output Mode", + "enum": [ + "line-printer", + "dump-text", + "text" + ], + "default": "line-printer", + "description": "The output format. Valid options are line-printer, dump-text and text. The line-printer mode tries to maintain the layout of the original text and works very well as inputs to LLMs. dump-text just dumps each page as paragraphs. text extracts text into groups as it sees in the original page. text and dump-text are treated as same in ocr processing mode." + }, + + "line_splitter_tolerance": { + "type": "number", + "title": "Line Splitter Tolerance", + "default": 0.4, + "description": "Reduce this value to split lines less often, increase to split lines more often. Useful when PDFs have multi column layout with text in each column that is not aligned." + }, + "horizontal_stretch_factor": { + "type": "number", + "title": "Horizontal Stretch Factor", + "default": 1.0, + "description": "Increase this value to stretch text horizontally, decrease to compress text horizontally. Useful when multi column text merge with each other." 
+ }, + "pages_to_extract": { + "type": "string", + "title": "Page number(s) or range to extract", + "default": "", + "pattern": "^(\\s*\\d+-\\d+|\\s*\\d+-|\\s*\\d+|^$)(,\\d+-\\d+|,\\d+-|,\\d+)*$", + "description": "Specify the range of pages to extract (e.g., 1-5, 7, 10-12, 50-). Leave it empty to extract all pages." + }, + "page_seperator": { + "type": "string", + "title": "Page separator", + "default": "<<< >>>", + "description": "Specify a pattern to separate the pages in the document (e.g., <<< {{page_no}} >>>, <<< >>>). This pattern will be inserted at the end of every page. Omit {{page_no}} if you don't want to include the page number in the separator." + } + }, + "if": { + "anyOf": [ + { + "properties": { + "mode": { + "const": "low_cost" + } + } + }, + { + "properties": { + "mode": { + "const": "high_quality" + } + } + }, + { + "properties": { + "mode": { + "const": "form" + } + } + } + ] + }, + "then": { + "properties": { + "median_filter_size": { + "type": "integer", + "title": "Median Filter Size", + "default": 0, + "description": "The size of the median filter to use for pre-processing the image during OCR based extraction. Useful to eliminate scanning artifacts and low quality JPEG artifacts. Default is 0 if the value is not explicitly set. Available only in the Enterprise version." + }, + "gaussian_blur_radius": { + "type": "number", + "title": "Gaussian Blur Radius", + "default": 0.0, + "description": "The radius of the gaussian blur to use for pre-processing the image during OCR based extraction. Useful to eliminate noise from the image. Default is 0.0 if the value is not explicitly set. Available only in the Enterprise version." 
+ } + }, + "required": [ + "median_filter_size", + "gaussian_blur_radius" + ] + } +} From 3c8c47a81243613e5b2e04420b57d4a0e9edac94 Mon Sep 17 00:00:00 2001 From: harini-venkataraman Date: Wed, 16 Oct 2024 20:31:50 +0530 Subject: [PATCH 3/9] Support for LLMWHisperer v2 adapter --- .../x2text/llm_whisperer v2/README.md | 10 - .../x2text/llm_whisperer v2/src/__init__.py | 9 - .../src/static/json_schema.json | 124 ----------- .../x2text/llm_whisperer_v2/README.md | 58 +++++ .../pyproject.toml | 0 .../x2text/llm_whisperer_v2/src/__init__.py | 9 + .../src/constants.py | 37 ++-- .../src/helper.py} | 209 +++++++----------- .../llm_whisperer_v2/src/llm_whisperer_v2.py | 88 ++++++++ .../src/static/json_schema.json | 156 +++++++++++++ 10 files changed, 412 insertions(+), 288 deletions(-) delete mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer v2/README.md delete mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/__init__.py delete mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/static/json_schema.json create mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer_v2/README.md rename src/unstract/sdk/adapters/x2text/{llm_whisperer v2 => llm_whisperer_v2}/pyproject.toml (100%) create mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/__init__.py rename src/unstract/sdk/adapters/x2text/{llm_whisperer v2 => llm_whisperer_v2}/src/constants.py (76%) rename src/unstract/sdk/adapters/x2text/{llm_whisperer v2/src/llm_whisperer_v2.py => llm_whisperer_v2/src/helper.py} (66%) create mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py create mode 100644 src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer v2/README.md b/src/unstract/sdk/adapters/x2text/llm_whisperer v2/README.md deleted file mode 100644 index 2b64f31d..00000000 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer v2/README.md +++ /dev/null @@ 
-1,10 +0,0 @@ -# Unstract LLM Whisperer X2Text Adapter - -## Env variables - -The below env variables are resolved by LLM Whisperer adapter - -| Variable | Description | -| ---------------------------- | -------------------------------------------------------------------------------------------- | -| `ADAPTER_LLMW_POLL_INTERVAL` | Time in seconds to wait before polling LLMWhisperer's status API. Defaults to 30s | -| `ADAPTER_LLMW_MAX_POLLS` | Total number of times to poll the status API. Defaults to 30 | diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/__init__.py b/src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/__init__.py deleted file mode 100644 index ba216498..00000000 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from .llm_whisperer import LLMWhisperer - -metadata = { - "name": LLMWhisperer.__name__, - "version": "1.0.0", - "adapter": LLMWhisperer, - "description": "LLMWhisperer X2Text adapter", - "is_active": True, -} diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/static/json_schema.json b/src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/static/json_schema.json deleted file mode 100644 index 8e8360f9..00000000 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/static/json_schema.json +++ /dev/null @@ -1,124 +0,0 @@ -{ - "title": "LLM Whisperer X2Text v2", - "type": "object", - "required": [ - "adapter_name", - "unstract_key", - "url" - ], - "properties": { - "adapter_name": { - "type": "string", - "title": "Name", - "default": "", - "description": "Provide a unique name for this adapter instance. Example: LLM Whisperer 1" - }, - "url": { - "type": "string", - "title": "URL", - "format": "uri", - "default": "https://llmwhisperer-api.unstract.com", - "description": "Provide the URL of the LLM Whisperer service." 
- }, - "unstract_key": { - "type": "string", - "title": "Unstract Key", - "format": "password", - "description": "API key obtained from the Unstract developer portal (https://unstract-api-resource.developer.azure-api.net)" - }, - "mode": { - "type": "string", - "title": "Mode", - "enum": [ - "native_text", - "low_cost", - "high_quality", - "form" - ], - "default": "form", - "description": "Native text : Extracts text from PDF without OCR. This is very fast and cost effective. Use this mode if you are sure all your PDFs are native text pdfs (not scanned documents). Note that some scanned PDFs are \"searchable\" PDFs. Use the OCR modes for these PDFs as the quality of text in these documents are often poor. \n Low cost : Extracts text from scanned and native PDFs, images and office documents. This OCR mode cannot handle handwriting and low quality scanned pdfs and images. \n High quality : Extracts text from scanned and native PDFs, images and office documents. This OCR mode can handle handwriting and low quality scanned pdfs and images. \n Form: Extracts text from scanned and native PDFs, images and office documents. This OCR mode can handle handwriting and low quality scanned pdfs and images. Can also extract information about checkboxes and radio button" - }, - "output_mode": { - "type": "string", - "title": "Output Mode", - "enum": [ - "line-printer", - "dump-text", - "text" - ], - "default": "line-printer", - "description": "The output format. Valid options are line-printer, dump-text and text. The line-printer mode tries to maintain the layout of the original text and works very well as inputs to LLMs. dump-text just dumps each page as paragraphs. text extracts text into groups as it sees in the original page. text and dump-text are treated as same in ocr processing mode." 
- }, - - "line_splitter_tolerance": { - "type": "number", - "title": "Line Splitter Tolerance", - "default": 0.4, - "description": "Reduce this value to split lines less often, increase to split lines more often. Useful when PDFs have multi column layout with text in each column that is not aligned." - }, - "horizontal_stretch_factor": { - "type": "number", - "title": "Horizontal Stretch Factor", - "default": 1.0, - "description": "Increase this value to stretch text horizontally, decrease to compress text horizontally. Useful when multi column text merge with each other." - }, - "pages_to_extract": { - "type": "string", - "title": "Page number(s) or range to extract", - "default": "", - "pattern": "^(\\s*\\d+-\\d+|\\s*\\d+-|\\s*\\d+|^$)(,\\d+-\\d+|,\\d+-|,\\d+)*$", - "description": "Specify the range of pages to extract (e.g., 1-5, 7, 10-12, 50-). Leave it empty to extract all pages." - }, - "page_seperator": { - "type": "string", - "title": "Page separator", - "default": "<<< >>>", - "description": "Specify a pattern to separate the pages in the document (e.g., <<< {{page_no}} >>>, <<< >>>). This pattern will be inserted at the end of every page. Omit {{page_no}} if you don't want to include the page number in the separator." - } - }, - "if": { - "anyOf": [ - { - "properties": { - "mode": { - "const": "low_cost" - } - } - }, - { - "properties": { - "mode": { - "const": "high_quality" - } - } - }, - { - "properties": { - "mode": { - "const": "form" - } - } - } - ] - }, - "then": { - "properties": { - "median_filter_size": { - "type": "integer", - "title": "Median Filter Size", - "default": 0, - "description": "The size of the median filter to use for pre-processing the image during OCR based extraction. Useful to eliminate scanning artifacts and low quality JPEG artifacts. Default is 0 if the value is not explicitly set. Available only in the Enterprise version." 
- }, - "gaussian_blur_radius": { - "type": "number", - "title": "Gaussian Blur Radius", - "default": 0.0, - "description": "The radius of the gaussian blur to use for pre-processing the image during OCR based extraction. Useful to eliminate noise from the image. Default is 0.0 if the value is not explicitly set. Available only in the Enterprise version." - } - }, - "required": [ - "median_filter_size", - "gaussian_blur_radius" - ] - } -} diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/README.md b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/README.md new file mode 100644 index 00000000..57ea77b5 --- /dev/null +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/README.md @@ -0,0 +1,58 @@ +# Unstract LLM Whisperer v2 X2Text Adapter + +## Env variables + +The below env variables are resolved by LLM Whisperer adapter + +| Variable | Description | +| ---------------------------- | -------------------------------------------------------------------------------------------- | +| `ADAPTER_LLMW_POLL_INTERVAL` | Time in seconds to wait before polling LLMWhisperer's status API. Defaults to 30s | +| `ADAPTER_LLMW_MAX_POLLS` | Total number of times to poll the status API. Defaults to 30 | + + +--- +id: llm_whisperer_apis_changelog +--- + +# Changelog + +## Version 2.0.0 + +:::warning +This version of the API is not backward compatible with the previous version. +::: + +### API endpoint + +- The base URL for the **V2** APIs is `https://llmwhisperer-api.unstract.com/api/v2` + +### Global change in parameter naming + +- All use of `whisper-hash` as a parameter has been replaced with `whisper_hash` for consistency. + +### Whisper parameters + +#### Added +- `mode` (str, optional): The processing mode. +- `mark_vertical_lines` (bool, optional): Whether to reproduce vertical lines in the document. +- `mark_horizontal_lines` (bool, optional): Whether to reproduce horizontal lines in the document. 
+- `line_splitter_strategy` (str, optional): The line splitter strategy to use. An advanced option for customizing the line splitting process.
+- `lang` (str, optional): The language of the document.
+- `tag` (str, optional): A tag to associate with the document. Used for auditing and tracking purposes.
+- `file_name` (str, optional): The name of the file being processed. Used for auditing and tracking purposes.
+- `use_webhook` (str, optional): The name of the webhook to call after the document is processed.
+- `webhook_metadata` (str, optional): Metadata to send to the webhook after the document is processed.
+
+#### Removed
+- `timeout` (int, optional): The timeout for API requests. *There is no sync mode now. All requests are async.*
+- `force_text_processing` (bool, optional): Whether to force text processing. *This feature is removed*
+- `ocr_provider` (str, optional): The OCR provider to use. *This is superseded by `mode`*
+- `processing_mode` (str, optional): The processing mode. *This is superseded by `mode`*
+- `store_metadata_for_highlighting` (bool, optional): Whether to store metadata for highlighting. *Feature is removed. Data is still available and set back when retrieve is called*
+
+
+### New features
+
+#### Webhooks
+
+- Added support for webhooks. You can now register a webhook and use it to receive the processed document. 
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer v2/pyproject.toml b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/pyproject.toml similarity index 100% rename from src/unstract/sdk/adapters/x2text/llm_whisperer v2/pyproject.toml rename to src/unstract/sdk/adapters/x2text/llm_whisperer_v2/pyproject.toml diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/__init__.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/__init__.py new file mode 100644 index 00000000..14240c6a --- /dev/null +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/__init__.py @@ -0,0 +1,9 @@ +from .llm_whisperer_v2 import LLMWhispererV2 + +metadata = { + "name": LLMWhispererV2.__name__, + "version": "1.0.0", + "adapter": LLMWhispererV2, + "description": "LLMWhispererV2 X2Text adapter", + "is_active": True, +} diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/constants.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py similarity index 76% rename from src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/constants.py rename to src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py index 016db230..ed7d4bce 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/constants.py +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py @@ -2,11 +2,6 @@ from enum import Enum -class ProcessingModes(Enum): - OCR = "ocr" - TEXT = "text" - - class Modes(Enum): NATIVE_TEXT = "native_text" LOW_COST = "low_cost" @@ -15,8 +10,7 @@ class Modes(Enum): class OutputModes(Enum): - LINE_PRINTER = "line-printer" - DUMP_TEXT = "dump-text" + LAYOUT_PRESERVING = "layout_preserving" TEXT = "text" @@ -58,20 +52,26 @@ class WhispererConfig: """Dictionary keys used to configure LLMWhisperer service.""" URL = "url" - PROCESSING_MODE = "processing_mode" - MODE = "mode" + PROCESSING_MODE = "mode" OUTPUT_MODE = "output_mode" UNSTRACT_KEY = "unstract_key" MEDIAN_FILTER_SIZE = "median_filter_size" 
GAUSSIAN_BLUR_RADIUS = "gaussian_blur_radius" - FORCE_TEXT_PROCESSING = "force_text_processing" LINE_SPLITTER_TOLERANCE = "line_splitter_tolerance" + LINE_SPLITTER_STRATEGY = "line_splitter_strategy" HORIZONTAL_STRETCH_FACTOR = "horizontal_stretch_factor" PAGES_TO_EXTRACT = "pages_to_extract" STORE_METADATA_FOR_HIGHLIGHTING = "store_metadata_for_highlighting" - ADD_LINE_NOS = "add_line_nos" - OUTPUT_JSON = "output_json" + MARK_VERTICAL_LINES = "mark_vertical_lines" + MARK_HORIZONTAL_LINES = "mark_horizontal_lines" PAGE_SEPARATOR = "page_seperator" + URL_IN_POST = "url_in_post" + LANG = "lang" + TAG = "tag" + FILE_NAME = "file_name" + USE_WEBHOOK = "use_webhook" + WEBHOOK_METADATA = "webhook_metadata" + TEXT_ONLY = "text_only" class WhisperStatus: @@ -82,7 +82,7 @@ class WhisperStatus: DELIVERED = "delivered" UNKNOWN = "unknown" # Used for async processing - WHISPER_HASH = "whisper-hash" + WHISPER_HASH = "whisper_hash" STATUS = "status" @@ -93,10 +93,15 @@ class WhispererDefaults: GAUSSIAN_BLUR_RADIUS = 0.0 FORCE_TEXT_PROCESSING = False LINE_SPLITTER_TOLERANCE = 0.75 + LINE_SPLITTER_STRATEGY = "left-priority" HORIZONTAL_STRETCH_FACTOR = 1.0 POLL_INTERVAL = int(os.getenv(WhispererEnv.POLL_INTERVAL, 30)) MAX_POLLS = int(os.getenv(WhispererEnv.MAX_POLLS, 30)) PAGES_TO_EXTRACT = "" - ADD_LINE_NOS = True - OUTPUT_JSON = True - PAGE_SEPARATOR = "<<< >>>" + PAGE_SEPARATOR = "<<<" + MARK_VERTICAL_LINES = False + MARK_HORIZONTAL_LINES = False + URL_IN_POST = False + LANG = "eng" + TAG = "default" + TEXT_ONLY = False diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/llm_whisperer_v2.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py similarity index 66% rename from src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/llm_whisperer_v2.py rename to src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py index efa87441..0e62b5d7 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer v2/src/llm_whisperer_v2.py +++ 
b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py @@ -1,6 +1,5 @@ import json import logging -import os import time from pathlib import Path from typing import Any, Optional @@ -11,55 +10,24 @@ from unstract.sdk.adapters.exceptions import ExtractorError from unstract.sdk.adapters.utils import AdapterUtils -from unstract.sdk.adapters.x2text.constants import X2TextConstants -from unstract.sdk.adapters.x2text.dto import ( - TextExtractionMetadata, - TextExtractionResult, -) -from unstract.sdk.adapters.x2text.llm_whisperer.src.constants import ( +from unstract.sdk.adapters.x2text.llm_whisperer_v2.src.constants import ( HTTPMethod, + Modes, OutputModes, - ProcessingModes, WhispererConfig, WhispererDefaults, WhispererEndpoint, WhispererHeader, WhisperStatus, ) -from unstract.sdk.adapters.x2text.x2text_adapter import X2TextAdapter logger = logging.getLogger(__name__) -class LLMWhispererV2(X2TextAdapter): - def __init__(self, settings: dict[str, Any]): - super().__init__("LLMWhispererV2") - self.config = settings - - @staticmethod - def get_id() -> str: - return "llmwhisperer|0a1647f0-f65f-410d-843b-3d979c78350e" - - @staticmethod - def get_name() -> str: - return "LLMWhisperer" - - @staticmethod - def get_description() -> str: - return "LLMWhisperer V2 X2Text" - - @staticmethod - def get_icon() -> str: - return "/icons/adapter-icons/LLMWhispererV2.png" +class LLMWhispererHelper: @staticmethod - def get_json_schema() -> str: - f = open(f"{os.path.dirname(__file__)}/static/json_schema.json") - schema = f.read() - f.close() - return schema - - def _get_request_headers(self) -> dict[str, Any]: + def get_request_headers(config: dict[str, Any]) -> dict[str, Any]: """Obtains the request headers to authenticate with LLM Whisperer. 
Returns: @@ -67,11 +35,12 @@ def _get_request_headers(self) -> dict[str, Any]: """ return { "accept": "application/json", - WhispererHeader.UNSTRACT_KEY: self.config.get(WhispererConfig.UNSTRACT_KEY), + WhispererHeader.UNSTRACT_KEY: config.get(WhispererConfig.UNSTRACT_KEY), } - def _make_request( - self, + @staticmethod + def make_request( + config: dict[str, Any], request_method: HTTPMethod, request_endpoint: str, headers: Optional[dict[str, Any]] = None, @@ -94,10 +63,10 @@ def _make_request( Response: Response from the request """ llm_whisperer_svc_url = ( - f"{self.config.get(WhispererConfig.URL)}" f"/v1/{request_endpoint}" + f"{config.get(WhispererConfig.URL)}" f"/api/v2/{request_endpoint}" ) if not headers: - headers = self._get_request_headers() + headers = LLMWhispererHelper.get_request_headers() try: response: Response @@ -133,7 +102,8 @@ def _make_request( raise ExtractorError(msg) return response - def _get_whisper_params(self, enable_highlight: bool = False) -> dict[str, Any]: + @staticmethod + def get_whisper_params(config: dict[str, Any]) -> dict[str, Any]: """Gets query params meant for /whisper endpoint. The params is filled based on the configuration passed. 
@@ -142,68 +112,75 @@ def _get_whisper_params(self, enable_highlight: bool = False) -> dict[str, Any]: dict[str, Any]: Query params """ params = { - WhispererConfig.PROCESSING_MODE: self.config.get( - WhispererConfig.PROCESSING_MODE, ProcessingModes.TEXT.value + WhispererConfig.PROCESSING_MODE: config.get( + WhispererConfig.PROCESSING_MODE, Modes.FORM.value ), - # Not providing default value to maintain legacy compatablity - # Providing default value will overide the params - # processing_mode, force_text_processing - WhispererConfig.MODE: self.config.get(WhispererConfig.MODE), - WhispererConfig.OUTPUT_MODE: self.config.get( - WhispererConfig.OUTPUT_MODE, OutputModes.LINE_PRINTER.value - ), - WhispererConfig.FORCE_TEXT_PROCESSING: self.config.get( - WhispererConfig.FORCE_TEXT_PROCESSING, - WhispererDefaults.FORCE_TEXT_PROCESSING, + WhispererConfig.OUTPUT_MODE: config.get( + WhispererConfig.OUTPUT_MODE, OutputModes.LAYOUT_PRESERVING.value ), - WhispererConfig.LINE_SPLITTER_TOLERANCE: self.config.get( + WhispererConfig.LINE_SPLITTER_TOLERANCE: config.get( WhispererConfig.LINE_SPLITTER_TOLERANCE, WhispererDefaults.LINE_SPLITTER_TOLERANCE, ), - WhispererConfig.HORIZONTAL_STRETCH_FACTOR: self.config.get( + WhispererConfig.LINE_SPLITTER_STRATEGY: config.get( + WhispererConfig.LINE_SPLITTER_STRATEGY, + WhispererDefaults.LINE_SPLITTER_STRATEGY, + ), + WhispererConfig.HORIZONTAL_STRETCH_FACTOR: config.get( WhispererConfig.HORIZONTAL_STRETCH_FACTOR, WhispererDefaults.HORIZONTAL_STRETCH_FACTOR, ), - WhispererConfig.PAGES_TO_EXTRACT: self.config.get( + WhispererConfig.PAGES_TO_EXTRACT: config.get( WhispererConfig.PAGES_TO_EXTRACT, WhispererDefaults.PAGES_TO_EXTRACT, ), - WhispererConfig.ADD_LINE_NOS: WhispererDefaults.ADD_LINE_NOS, - WhispererConfig.OUTPUT_JSON: WhispererDefaults.OUTPUT_JSON, - WhispererConfig.PAGE_SEPARATOR: self.config.get( + WhispererConfig.MARK_VERTICAL_LINES: config.get( + WhispererConfig.MARK_VERTICAL_LINES, + WhispererDefaults.MARK_VERTICAL_LINES, + ), 
+ WhispererConfig.MARK_HORIZONTAL_LINES: config.get( + WhispererConfig.MARK_HORIZONTAL_LINES, + WhispererDefaults.MARK_HORIZONTAL_LINES, + ), + WhispererConfig.URL_IN_POST: WhispererDefaults.URL_IN_POST, + WhispererConfig.PAGE_SEPARATOR: config.get( WhispererConfig.PAGE_SEPARATOR, WhispererDefaults.PAGE_SEPARATOR, ), + WhispererConfig.LANG: config.get( + WhispererConfig.LANG, + WhispererDefaults.LANG, + ), + WhispererConfig.TAG: config.get( + WhispererConfig.TAG, + WhispererDefaults.TAG, + ), + # Not providing default value to maintain legacy compatablity + # these are optional params and identifiers for audit + WhispererConfig.FILE_NAME: config.get(WhispererConfig.FILE_NAME), + WhispererConfig.USE_WEBHOOK: config.get(WhispererConfig.USE_WEBHOOK), + WhispererConfig.WEBHOOK_METADATA: config.get( + WhispererConfig.WEBHOOK_METADATA + ), } - if not params[WhispererConfig.FORCE_TEXT_PROCESSING]: + if params[WhispererConfig.PROCESSING_MODE] == Modes.LOW_COST.value: params.update( { - WhispererConfig.MEDIAN_FILTER_SIZE: self.config.get( + WhispererConfig.MEDIAN_FILTER_SIZE: config.get( WhispererConfig.MEDIAN_FILTER_SIZE, WhispererDefaults.MEDIAN_FILTER_SIZE, ), - WhispererConfig.GAUSSIAN_BLUR_RADIUS: self.config.get( + WhispererConfig.GAUSSIAN_BLUR_RADIUS: config.get( WhispererConfig.GAUSSIAN_BLUR_RADIUS, WhispererDefaults.GAUSSIAN_BLUR_RADIUS, ), } ) - - if enable_highlight: - params.update( - {WhispererConfig.STORE_METADATA_FOR_HIGHLIGHTING: enable_highlight} - ) return params - def test_connection(self) -> bool: - self._make_request( - request_method=HTTPMethod.GET, - request_endpoint=WhispererEndpoint.TEST_CONNECTION, - ) - return True - - def _check_status_until_ready( - self, whisper_hash: str, headers: dict[str, Any], params: dict[str, Any] + @staticmethod + def check_status_until_ready( + whisper_hash: str, headers: dict[str, Any], params: dict[str, Any] ) -> WhisperStatus: """Checks the extraction status by polling. 
@@ -231,7 +208,7 @@ def _check_status_until_ready( f"Checking status with interval: {POLL_INTERVAL}s" f", request count: {request_count} [max: {MAX_POLLS}]" ) - status_response = self._make_request( + status_response = LLMWhispererHelper.make_request( request_method=HTTPMethod.GET, request_endpoint=WhispererEndpoint.STATUS, headers=headers, @@ -258,7 +235,8 @@ def _check_status_until_ready( return status - def _extract_async(self, whisper_hash: str) -> str: + @staticmethod + def extract_async(whisper_hash: str) -> dict[Any, Any]: """Makes an async extraction with LLMWhisperer. Polls and checks the status first before proceeding to retrieve once. @@ -271,18 +249,18 @@ def _extract_async(self, whisper_hash: str) -> str: """ logger.info(f"Extracting async for whisper hash: {whisper_hash}") - headers: dict[str, Any] = self._get_request_headers() + headers: dict[str, Any] = LLMWhispererHelper.get_request_headers() params = { WhisperStatus.WHISPER_HASH: whisper_hash, - WhispererConfig.OUTPUT_JSON: WhispererDefaults.OUTPUT_JSON, + WhispererConfig.TEXT_ONLY: WhispererDefaults.TEXT_ONLY, } # Polls in fixed intervals and checks status - self._check_status_until_ready( + LLMWhispererHelper.check_status_until_ready( whisper_hash=whisper_hash, headers=headers, params=params ) - retrieve_response = self._make_request( + retrieve_response = LLMWhispererHelper.make_request( request_method=HTTPMethod.GET, request_endpoint=WhispererEndpoint.RETRIEVE, headers=headers, @@ -296,17 +274,18 @@ def _extract_async(self, whisper_hash: str) -> str: f"{retrieve_response.status_code} - {retrieve_response.text}" ) - def _send_whisper_request( - self, input_file_path: str, enable_highlight: bool = False + @staticmethod + def send_whisper_request( + input_file_path: str, config: dict[str, Any] ) -> requests.Response: - headers = self._get_request_headers() + headers = LLMWhispererHelper.get_request_headers(config) headers["Content-Type"] = "application/octet-stream" - params = 
self._get_whisper_params(enable_highlight) + params = LLMWhispererHelper.get_whisper_params(config) response: requests.Response try: with open(input_file_path, "rb") as input_f: - response = self._make_request( + response = LLMWhispererHelper.make_request( request_method=HTTPMethod.POST, request_endpoint=WhispererEndpoint.WHISPER, headers=headers, @@ -318,25 +297,27 @@ def _send_whisper_request( raise ExtractorError(str(e)) return response - def _extract_text_from_response( - self, output_file_path: Optional[str], response: requests.Response + @staticmethod + def extract_text_from_response( + output_file_path: Optional[str], response: requests.Response ) -> str: output_json = {} if response.status_code == 200: output_json = response.json() elif response.status_code == 202: whisper_hash = response.json().get(WhisperStatus.WHISPER_HASH) - output_json = self._extract_async(whisper_hash=whisper_hash) + output_json = LLMWhispererHelper.extract_async(whisper_hash=whisper_hash) else: raise ExtractorError("Couldn't extract text from file") if output_file_path: - self._write_output_to_file( + LLMWhispererHelper.write_output_to_file( output_json=output_json, output_file_path=Path(output_file_path), ) - return output_json.get("text", "") + return output_json.get("result_text", "") - def _write_output_to_file(self, output_json: dict, output_file_path: Path) -> None: + @staticmethod + def write_output_to_file(output_json: dict, output_file_path: Path) -> None: """Writes the extracted text and metadata to the specified output file and metadata file. @@ -350,7 +331,7 @@ def _write_output_to_file(self, output_json: dict, output_file_path: Path) -> No ExtractorError: If there is an error while writing the output file. 
""" try: - text_output = output_json.get("text", "") + text_output = output_json.get("result_text", "") logger.info(f"Writing output to {output_file_path}") output_file_path.write_text(text_output, encoding="utf-8") try: @@ -361,9 +342,11 @@ def _write_output_to_file(self, output_json: dict, output_file_path: Path) -> No metadata_file_path = metadata_dir / metadata_file_name # Ensure the metadata directory exists metadata_dir.mkdir(parents=True, exist_ok=True) - # Remove the "text" key from the metadata + # Remove the "result_text" key from the metadata metadata = { - key: value for key, value in output_json.items() if key != "text" + key: value + for key, value in output_json.items() + if key != "result_text" } metadata_json = json.dumps(metadata, ensure_ascii=False, indent=4) logger.info(f"Writing metadata to {metadata_file_path}") @@ -376,35 +359,3 @@ def _write_output_to_file(self, output_json: dict, output_file_path: Path) -> No except Exception as e: logger.error(f"Error while writing {output_file_path}: {e}") raise ExtractorError(str(e)) - - def process( - self, - input_file_path: str, - output_file_path: Optional[str] = None, - **kwargs: dict[Any, Any], - ) -> TextExtractionResult: - """Used to extract text from documents. - - Args: - input_file_path (str): Path to file that needs to be extracted - output_file_path (Optional[str], optional): File path to write - extracted text into, if None doesn't write to a file. - Defaults to None. 
- - Returns: - str: Extracted text - """ - - response: requests.Response = self._send_whisper_request( - input_file_path, - bool(kwargs.get(X2TextConstants.ENABLE_HIGHLIGHT, False)), - ) - - metadata = TextExtractionMetadata( - whisper_hash=response.headers.get(X2TextConstants.WHISPER_HASH, "") - ) - - return TextExtractionResult( - extracted_text=self._extract_text_from_response(output_file_path, response), - extraction_metadata=metadata, - ) diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py new file mode 100644 index 00000000..2b9347f0 --- /dev/null +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py @@ -0,0 +1,88 @@ +import logging +import os +from typing import Any, Optional + +import requests + +from unstract.sdk.adapters.x2text.constants import X2TextConstants +from unstract.sdk.adapters.x2text.dto import ( + TextExtractionMetadata, + TextExtractionResult, +) +from unstract.sdk.adapters.x2text.llm_whisperer_v2.src.constants import ( + HTTPMethod, + WhispererEndpoint, +) +from unstract.sdk.adapters.x2text.llm_whisperer_v2.src.helper import LLMWhispererHelper +from unstract.sdk.adapters.x2text.x2text_adapter import X2TextAdapter + +logger = logging.getLogger(__name__) + + +class LLMWhispererV2(X2TextAdapter): + def __init__(self, settings: dict[str, Any]): + super().__init__("LLMWhispererV2") + self.config = settings + + @staticmethod + def get_id() -> str: + return "llmwhisperer|a5e6b8af-3e1f-4a80-b006-d017e8e67f93" + + @staticmethod + def get_name() -> str: + return "LLMWhisperer V2" + + @staticmethod + def get_description() -> str: + return "LLMWhisperer V2 X2Text" + + @staticmethod + def get_icon() -> str: + return "/icons/adapter-icons/LLMWhispererV2.png" + + @staticmethod + def get_json_schema() -> str: + f = open(f"{os.path.dirname(__file__)}/static/json_schema.json") + schema = f.read() + f.close() + return schema 
+ + def test_connection(self) -> bool: + LLMWhispererHelper.make_request( + request_method=HTTPMethod.GET, + request_endpoint=WhispererEndpoint.TEST_CONNECTION, + ) + return True + + def process( + self, + input_file_path: str, + output_file_path: Optional[str] = None, + **kwargs: dict[Any, Any], + ) -> TextExtractionResult: + """Used to extract text from documents. + + Args: + input_file_path (str): Path to file that needs to be extracted + output_file_path (Optional[str], optional): File path to write + extracted text into, if None doesn't write to a file. + Defaults to None. + + Returns: + str: Extracted text + """ + + response: requests.Response = LLMWhispererHelper.send_whisper_request( + input_file_path, self.config + ) + + metadata = TextExtractionMetadata( + whisper_hash=response.headers.get(X2TextConstants.WHISPER_HASH, "") + ) + + return TextExtractionResult( + extracted_text=LLMWhispererHelper.extract_text_from_response( + output_file_path, response + ), + extraction_metadata=metadata, + ) diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json new file mode 100644 index 00000000..69280faf --- /dev/null +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json @@ -0,0 +1,156 @@ +{ + "title": "LLM Whisperer X2Text v2", + "type": "object", + "required": [ + "adapter_name", + "unstract_key", + "url" + ], + "properties": { + "adapter_name": { + "type": "string", + "title": "Name", + "default": "", + "description": "Provide a unique name for this adapter instance. Example: LLM Whisperer 1" + }, + "url": { + "type": "string", + "title": "URL", + "format": "uri", + "default": "https://llmwhisperer-api.unstract.com", + "description": "Provide the URL of the LLM Whisperer service." 
+ }, + "unstract_key": { + "type": "string", + "title": "Unstract Key", + "format": "password", + "description": "API key obtained from the Unstract developer portal (https://unstract-api-resource.developer.azure-api.net)" + }, + "mode": { + "type": "string", + "title": "Mode", + "enum": [ + "native_text", + "low_cost", + "high_quality", + "form" + ], + "default": "form", + "description": "Native text : Extract text from native text PDFs. (not scanned). Use this mode when: You have low latency requirement, All documents are PDFs, PDFs are native text PDFs, Cost sensitive application\n Low cost : Cost effective extraction. Use this mode when: High quality scanned PDFs, High quality scanned images, No handwritten documents \n High quality : High quality extraction. Use this mode when: Medium/low quality scanned PDFs, Medium/low quality scanned images, Handwritten documents \n Form: High quality extraction + Checkbox and Radio button detection. Use this mode when: Checkbox and radio button detection, Medium/low quality scanned PDFs, Medium/low quality scanned images, Handwritten documents." + }, + "output_mode": { + "type": "string", + "title": "Output Mode", + "enum": [ + "layout_preserving", + "text" + ], + "default": "layout_preserving", + "description": "The output format. Valid options are layout_preserving and text. Layout preserving mode tries to extract the text from the document as is, maintaining the structural layout of the document. This works very well for LLM consumption. Text (text) mode extracts the text from the document without applying any processing or intelligence. This mode is useful when the layout_preserving mode is not able to extract the text properly. This can happen if the document contains too many different fonts and font sizes." 
+ }, + "line_splitter_tolerance": { + "type": "number", + "title": "Line Splitter Tolerance", + "default": 0.4, + "description": "Factor to decide when to move text to the next line when it is above or below the baseline. The default value of 0.4 signifies 40% of the average character height" + }, + "line_splitter_strategy": { + "type": "string", + "title": "Line Splitter Strategy", + "default":"left-priority", + "description": "An advanced option for customizing the line splitting process." + }, + "horizontal_stretch_factor": { + "type": "number", + "title": "Horizontal Stretch Factor", + "default": 1.0, + "description": "Increase this value to stretch text horizontally, decrease to compress text horizontally. Useful when multi column text merge with each other." + }, + "pages_to_extract": { + "type": "string", + "title": "Page number(s) or range to extract", + "default": "", + "pattern": "^(\\s*\\d+-\\d+|\\s*\\d+-|\\s*\\d+|^$)(,\\d+-\\d+|,\\d+-|,\\d+)*$", + "description": "Specify the range of pages to extract (e.g., 1-5, 7, 10-12, 50-). Leave it empty to extract all pages." + }, + "page_seperator": { + "type": "string", + "title": "Page separator", + "default": "<<<", + "description": "Specify a pattern to separate the pages in the document (e.g., <<< {{page_no}} >>>, <<< >>>). This pattern will be inserted at the end of every page. Omit {{page_no}} if you don't want to include the page number in the separator." + }, + "mark_vertical_lines": { + "type": "boolean", + "title": "Mark vertical lines", + "default": false, + "description": "States whether to reproduce vertical lines in the document." + }, + "mark_horizontal_lines": { + "type": "boolean", + "title": "Mark horizontal lines", + "default": false, + "description": "States whether to reproduce verthorizontalical lines in the document." + }, + "lang": { + "type": "string", + "title": "Language", + "default": "eng", + "description": "The language hint to OCR. Currently auto detected. 
Available only in the Enterprise version." + }, + "tag": { + "type": "string", + "title": "Tag", + "default": "default", + "description": "Auditing feature. Set a value which will be associated with the invocation of the adapter. This can be used for cross referencing in usage reports." + }, + "file_name": { + "type": "string", + "title": "File Name", + "default": "default", + "description": "Auditing feature. Set a value which will be associated with the invocation of the adapter. This can be used for cross referencing in usage reports." + }, + "use_webhook": { + "type": "string", + "title": "Webhook", + "default": "", + "description": "The webhook's name which will should be called after the conversion is complete. The name should have been registered earlier using the webhooks management endpoint" + }, + "webhook_metadata": { + "type": "string", + "title": "Webhook Metadata", + "default": "", + "description": "Any metadata which should be sent to the webhook. This data is sent verbatim to the callback endpoint." + } + }, + "if": { + "anyOf": [ + { + "properties": { + "mode": { + "const": "low_cost" + } + } + } + ] + }, + "then": { + "properties": { + "median_filter_size": { + "type": "integer", + "title": "Median Filter Size", + "default": 0, + "description": "The size of the median filter to use for pre-processing the image during OCR based extraction. Useful to eliminate scanning artifacts and low quality JPEG artifacts. Default is 0 if the value is not explicitly set. Available only in the Enterprise version." + }, + "gaussian_blur_radius": { + "type": "number", + "title": "Gaussian Blur Radius", + "default": 0.0, + "description": "The radius of the gaussian blur to use for pre-processing the image during OCR based extraction. Useful to eliminate noise from the image. Default is 0.0 if the value is not explicitly set. Available only in the Enterprise version." 
+ } + }, + "required": [ + "median_filter_size", + "gaussian_blur_radius" + ] + } +} From 4758f4690c846ef26bdf94ad3631ce1e1fa089b2 Mon Sep 17 00:00:00 2001 From: harini-venkataraman Date: Wed, 16 Oct 2024 20:33:02 +0530 Subject: [PATCH 4/9] Marked v1 as deprecated --- .../adapters/x2text/llm_whisperer/src/static/json_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json index edb29b0c..c1f1e659 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json @@ -18,7 +18,7 @@ "title": "URL", "format": "uri", "default": "https://llmwhisperer-api.unstract.com", - "description": "Provide the URL of the LLM Whisperer service." + "description": "Provide the URL of the LLM Whisperer service. Please note that this version of Whisperer is deprecated." 
}, "unstract_key": { "type": "string", From 8bff3b707facd790e96484e93bd329cf3978f14e Mon Sep 17 00:00:00 2001 From: harini-venkataraman Date: Wed, 16 Oct 2024 20:38:18 +0530 Subject: [PATCH 5/9] Marked v1 as deprecated --- .../sdk/adapters/x2text/llm_whisperer_v2/src/constants.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py index ed7d4bce..05d0d7e7 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py @@ -61,7 +61,6 @@ class WhispererConfig: LINE_SPLITTER_STRATEGY = "line_splitter_strategy" HORIZONTAL_STRETCH_FACTOR = "horizontal_stretch_factor" PAGES_TO_EXTRACT = "pages_to_extract" - STORE_METADATA_FOR_HIGHLIGHTING = "store_metadata_for_highlighting" MARK_VERTICAL_LINES = "mark_vertical_lines" MARK_HORIZONTAL_LINES = "mark_horizontal_lines" PAGE_SEPARATOR = "page_seperator" From 4397ecd7aef09001342cf14040c6aa193dac1829 Mon Sep 17 00:00:00 2001 From: Jaseem Jas <89440144+jaseemjaskp@users.noreply.github.com> Date: Thu, 17 Oct 2024 14:41:54 +0530 Subject: [PATCH 6/9] Update json_schema.json Signed-off-by: Jaseem Jas <89440144+jaseemjaskp@users.noreply.github.com> --- .../x2text/llm_whisperer_v2/src/static/json_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json index 69280faf..e28c74e0 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json @@ -24,7 +24,7 @@ "type": "string", "title": "Unstract Key", "format": "password", - "description": "API key obtained from the Unstract developer portal 
(https://unstract-api-resource.developer.azure-api.net)" + "description": "API key obtained from the Unstract developer portal (https://us-central.unstract.com/llm-whisperer)" }, "mode": { "type": "string", From a62b57d299f47398c4f660b2c4073b3ce770f898 Mon Sep 17 00:00:00 2001 From: harini-venkataraman Date: Thu, 17 Oct 2024 16:43:22 +0530 Subject: [PATCH 7/9] Minor code standization changes --- src/unstract/sdk/adapters/x2text/constants.py | 1 + .../llm_whisperer/src/static/json_schema.json | 2 +- .../x2text/llm_whisperer_v2/src/constants.py | 5 +-- .../x2text/llm_whisperer_v2/src/helper.py | 42 ++++++++++--------- .../llm_whisperer_v2/src/llm_whisperer_v2.py | 9 ++-- .../src/static/json_schema.json | 16 +------ 6 files changed, 34 insertions(+), 41 deletions(-) diff --git a/src/unstract/sdk/adapters/x2text/constants.py b/src/unstract/sdk/adapters/x2text/constants.py index 77cca1b3..44418c55 100644 --- a/src/unstract/sdk/adapters/x2text/constants.py +++ b/src/unstract/sdk/adapters/x2text/constants.py @@ -5,3 +5,4 @@ class X2TextConstants: ENABLE_HIGHLIGHT = "enable_highlight" EXTRACTED_TEXT = "extracted_text" WHISPER_HASH = "whisper-hash" + WHISPER_HASH_V2 = "whisper_hash" diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json index c1f1e659..1d36a124 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer/src/static/json_schema.json @@ -18,7 +18,7 @@ "title": "URL", "format": "uri", "default": "https://llmwhisperer-api.unstract.com", - "description": "Provide the URL of the LLM Whisperer service. Please note that this version of Whisperer is deprecated." + "description": "Provide the URL of the LLM Whisperer service. Please note that this version of LLM Whisperer is deprecated." 
}, "unstract_key": { "type": "string", diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py index 05d0d7e7..146b5ceb 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py @@ -52,7 +52,7 @@ class WhispererConfig: """Dictionary keys used to configure LLMWhisperer service.""" URL = "url" - PROCESSING_MODE = "mode" + MODE = "mode" OUTPUT_MODE = "output_mode" UNSTRACT_KEY = "unstract_key" MEDIAN_FILTER_SIZE = "median_filter_size" @@ -65,9 +65,7 @@ class WhispererConfig: MARK_HORIZONTAL_LINES = "mark_horizontal_lines" PAGE_SEPARATOR = "page_seperator" URL_IN_POST = "url_in_post" - LANG = "lang" TAG = "tag" - FILE_NAME = "file_name" USE_WEBHOOK = "use_webhook" WEBHOOK_METADATA = "webhook_metadata" TEXT_ONLY = "text_only" @@ -101,6 +99,5 @@ class WhispererDefaults: MARK_VERTICAL_LINES = False MARK_HORIZONTAL_LINES = False URL_IN_POST = False - LANG = "eng" TAG = "default" TEXT_ONLY = False diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py index 0e62b5d7..9fe805cd 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py @@ -66,7 +66,7 @@ def make_request( f"{config.get(WhispererConfig.URL)}" f"/api/v2/{request_endpoint}" ) if not headers: - headers = LLMWhispererHelper.get_request_headers() + headers = LLMWhispererHelper.get_request_headers(config=config) try: response: Response @@ -103,7 +103,7 @@ def make_request( return response @staticmethod - def get_whisper_params(config: dict[str, Any]) -> dict[str, Any]: + def get_whisperer_params(config: dict[str, Any]) -> dict[str, Any]: """Gets query params meant for /whisper endpoint. The params is filled based on the configuration passed. 
@@ -112,9 +112,7 @@ def get_whisper_params(config: dict[str, Any]) -> dict[str, Any]: dict[str, Any]: Query params """ params = { - WhispererConfig.PROCESSING_MODE: config.get( - WhispererConfig.PROCESSING_MODE, Modes.FORM.value - ), + WhispererConfig.MODE: config.get(WhispererConfig.MODE, Modes.FORM.value), WhispererConfig.OUTPUT_MODE: config.get( WhispererConfig.OUTPUT_MODE, OutputModes.LAYOUT_PRESERVING.value ), @@ -147,23 +145,18 @@ def get_whisper_params(config: dict[str, Any]) -> dict[str, Any]: WhispererConfig.PAGE_SEPARATOR, WhispererDefaults.PAGE_SEPARATOR, ), - WhispererConfig.LANG: config.get( - WhispererConfig.LANG, - WhispererDefaults.LANG, - ), WhispererConfig.TAG: config.get( WhispererConfig.TAG, WhispererDefaults.TAG, ), # Not providing default value to maintain legacy compatablity # these are optional params and identifiers for audit - WhispererConfig.FILE_NAME: config.get(WhispererConfig.FILE_NAME), WhispererConfig.USE_WEBHOOK: config.get(WhispererConfig.USE_WEBHOOK), WhispererConfig.WEBHOOK_METADATA: config.get( WhispererConfig.WEBHOOK_METADATA ), } - if params[WhispererConfig.PROCESSING_MODE] == Modes.LOW_COST.value: + if params[WhispererConfig.MODE] == Modes.LOW_COST.value: params.update( { WhispererConfig.MEDIAN_FILTER_SIZE: config.get( @@ -180,7 +173,10 @@ def get_whisper_params(config: dict[str, Any]) -> dict[str, Any]: @staticmethod def check_status_until_ready( - whisper_hash: str, headers: dict[str, Any], params: dict[str, Any] + config: dict[str, Any], + whisper_hash: str, + headers: dict[str, Any], + params: dict[str, Any], ) -> WhisperStatus: """Checks the extraction status by polling. 
@@ -209,6 +205,7 @@ def check_status_until_ready( f", request count: {request_count} [max: {MAX_POLLS}]" ) status_response = LLMWhispererHelper.make_request( + config=config, request_method=HTTPMethod.GET, request_endpoint=WhispererEndpoint.STATUS, headers=headers, @@ -236,7 +233,7 @@ def check_status_until_ready( return status @staticmethod - def extract_async(whisper_hash: str) -> dict[Any, Any]: + def extract_async(config: dict[str, Any], whisper_hash: str) -> dict[Any, Any]: """Makes an async extraction with LLMWhisperer. Polls and checks the status first before proceeding to retrieve once. @@ -249,7 +246,7 @@ def extract_async(whisper_hash: str) -> dict[Any, Any]: """ logger.info(f"Extracting async for whisper hash: {whisper_hash}") - headers: dict[str, Any] = LLMWhispererHelper.get_request_headers() + headers: dict[str, Any] = LLMWhispererHelper.get_request_headers(config) params = { WhisperStatus.WHISPER_HASH: whisper_hash, WhispererConfig.TEXT_ONLY: WhispererDefaults.TEXT_ONLY, @@ -257,10 +254,11 @@ def extract_async(whisper_hash: str) -> dict[Any, Any]: # Polls in fixed intervals and checks status LLMWhispererHelper.check_status_until_ready( - whisper_hash=whisper_hash, headers=headers, params=params + config=config, whisper_hash=whisper_hash, headers=headers, params=params ) retrieve_response = LLMWhispererHelper.make_request( + config=config, request_method=HTTPMethod.GET, request_endpoint=WhispererEndpoint.RETRIEVE, headers=headers, @@ -280,12 +278,13 @@ def send_whisper_request( ) -> requests.Response: headers = LLMWhispererHelper.get_request_headers(config) headers["Content-Type"] = "application/octet-stream" - params = LLMWhispererHelper.get_whisper_params(config) + params = LLMWhispererHelper.get_whisperer_params(config) response: requests.Response try: with open(input_file_path, "rb") as input_f: response = LLMWhispererHelper.make_request( + config=config, request_method=HTTPMethod.POST, request_endpoint=WhispererEndpoint.WHISPER, headers=headers, 
@@ -299,14 +298,19 @@ def send_whisper_request( @staticmethod def extract_text_from_response( - output_file_path: Optional[str], response: requests.Response + config: dict[str, Any], + output_file_path: Optional[str], + response_dict: dict[str, Any], + response: Response, ) -> str: output_json = {} if response.status_code == 200: output_json = response.json() elif response.status_code == 202: - whisper_hash = response.json().get(WhisperStatus.WHISPER_HASH) - output_json = LLMWhispererHelper.extract_async(whisper_hash=whisper_hash) + whisper_hash = response_dict.get(WhisperStatus.WHISPER_HASH) + output_json = LLMWhispererHelper.extract_async( + config=config, whisper_hash=whisper_hash + ) else: raise ExtractorError("Couldn't extract text from file") if output_file_path: diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py index 2b9347f0..cbc0a854 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py @@ -1,3 +1,4 @@ +import json import logging import os from typing import Any, Optional @@ -49,6 +50,7 @@ def get_json_schema() -> str: def test_connection(self) -> bool: LLMWhispererHelper.make_request( + config=self.config, request_method=HTTPMethod.GET, request_endpoint=WhispererEndpoint.TEST_CONNECTION, ) @@ -75,14 +77,15 @@ def process( response: requests.Response = LLMWhispererHelper.send_whisper_request( input_file_path, self.config ) - + response_text = response.text + reponse_dict = json.loads(response_text) metadata = TextExtractionMetadata( - whisper_hash=response.headers.get(X2TextConstants.WHISPER_HASH, "") + whisper_hash=reponse_dict.get(X2TextConstants.WHISPER_HASH_V2, "") ) return TextExtractionResult( extracted_text=LLMWhispererHelper.extract_text_from_response( - output_file_path, response + self.config, output_file_path, reponse_dict, 
response ), extraction_metadata=metadata, ) diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json index 69280faf..c230dfe7 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json @@ -10,7 +10,7 @@ "adapter_name": { "type": "string", "title": "Name", - "default": "", + "default": "llm-whisperer-v2", "description": "Provide a unique name for this adapter instance. Example: LLM Whisperer 1" }, "url": { @@ -89,13 +89,7 @@ "type": "boolean", "title": "Mark horizontal lines", "default": false, - "description": "States whether to reproduce verthorizontalical lines in the document." - }, - "lang": { - "type": "string", - "title": "Language", - "default": "eng", - "description": "The language hint to OCR. Currently auto detected. Available only in the Enterprise version." + "description": "States whether to reproduce horizontal lines in the document." }, "tag": { "type": "string", @@ -103,12 +97,6 @@ "default": "default", "description": "Auditing feature. Set a value which will be associated with the invocation of the adapter. This can be used for cross referencing in usage reports." }, - "file_name": { - "type": "string", - "title": "File Name", - "default": "default", - "description": "Auditing feature. Set a value which will be associated with the invocation of the adapter. This can be used for cross referencing in usage reports." 
- }, "use_webhook": { "type": "string", "title": "Webhook", From 3ff9f205751c6994198b259de6bd54fc4350ee4a Mon Sep 17 00:00:00 2001 From: harini-venkataraman Date: Thu, 17 Oct 2024 16:50:39 +0530 Subject: [PATCH 8/9] Refactor exception handling --- .../x2text/llm_whisperer_v2/src/helper.py | 39 ++++++++----------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py index 9fe805cd..08381518 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py @@ -338,28 +338,23 @@ def write_output_to_file(output_json: dict, output_file_path: Path) -> None: text_output = output_json.get("result_text", "") logger.info(f"Writing output to {output_file_path}") output_file_path.write_text(text_output, encoding="utf-8") - try: - # Define the directory of the output file and metadata paths - output_dir = output_file_path.parent - metadata_dir = output_dir / "metadata" - metadata_file_name = output_file_path.with_suffix(".json").name - metadata_file_path = metadata_dir / metadata_file_name - # Ensure the metadata directory exists - metadata_dir.mkdir(parents=True, exist_ok=True) - # Remove the "result_text" key from the metadata - metadata = { - key: value - for key, value in output_json.items() - if key != "result_text" - } - metadata_json = json.dumps(metadata, ensure_ascii=False, indent=4) - logger.info(f"Writing metadata to {metadata_file_path}") - metadata_file_path.write_text(metadata_json, encoding="utf-8") - except Exception as e: - logger.error( - f"Error while writing metadata to {metadata_file_path}: {e}" - ) - except Exception as e: logger.error(f"Error while writing {output_file_path}: {e}") raise ExtractorError(str(e)) + try: + # Define the directory of the output file and metadata paths + output_dir = output_file_path.parent + metadata_dir = 
output_dir / "metadata" + metadata_file_name = output_file_path.with_suffix(".json").name + metadata_file_path = metadata_dir / metadata_file_name + # Ensure the metadata directory exists + metadata_dir.mkdir(parents=True, exist_ok=True) + # Remove the "result_text" key from the metadata + metadata = { + key: value for key, value in output_json.items() if key != "result_text" + } + metadata_json = json.dumps(metadata, ensure_ascii=False, indent=4) + logger.info(f"Writing metadata to {metadata_file_path}") + metadata_file_path.write_text(metadata_json, encoding="utf-8") + except Exception as e: + logger.warn(f"Error while writing metadata to {metadata_file_path}: {e}") From a3b8ca05850003c002e278d51bc023b6a63a7303 Mon Sep 17 00:00:00 2001 From: harini-venkataraman Date: Thu, 17 Oct 2024 16:53:07 +0530 Subject: [PATCH 9/9] Adding dev comments --- .../sdk/adapters/x2text/llm_whisperer_v2/src/helper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py index 08381518..202ce646 100644 --- a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py +++ b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py @@ -145,12 +145,12 @@ def get_whisperer_params(config: dict[str, Any]) -> dict[str, Any]: WhispererConfig.PAGE_SEPARATOR, WhispererDefaults.PAGE_SEPARATOR, ), + # Not providing default value to maintain legacy compatablity + # these are optional params and identifiers for audit WhispererConfig.TAG: config.get( WhispererConfig.TAG, WhispererDefaults.TAG, ), - # Not providing default value to maintain legacy compatablity - # these are optional params and identifiers for audit WhispererConfig.USE_WEBHOOK: config.get(WhispererConfig.USE_WEBHOOK), WhispererConfig.WEBHOOK_METADATA: config.get( WhispererConfig.WEBHOOK_METADATA