diff --git a/src/unstract/sdk/__init__.py b/src/unstract/sdk/__init__.py
index 16e1590..b779092 100644
--- a/src/unstract/sdk/__init__.py
+++ b/src/unstract/sdk/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.60.1"
+__version__ = "0.61.0"
 
 
 def get_sdk_version():
diff --git a/src/unstract/sdk/prompt.py b/src/unstract/sdk/prompt.py
index 0d908e6..27eed7f 100644
--- a/src/unstract/sdk/prompt.py
+++ b/src/unstract/sdk/prompt.py
@@ -46,6 +46,64 @@ def answer_prompt(
             payload=payload,
             params=params,
         )
+
+    @log_elapsed(operation="INDEX")
+    def index(
+        self,
+        payload: dict[str, Any],
+        params: Optional[dict[str, str]] = None,
+        headers: Optional[dict[str, str]] = None,
+    ) -> dict[str, Any]:
+        """Send an indexing request through `_post_call`.
+
+        Posts to `index`, or to `index-public` when `is_public_call` is set.
+
+        Args:
+            payload: Payload forwarded to `_post_call`.
+            params: Optional parameters forwarded to `_post_call`.
+            headers: Optional headers forwarded to `_post_call`.
+
+        Returns:
+            The value returned by `_post_call`.
+        """
+        url_path = "index"
+        if self.is_public_call:
+            url_path = "index-public"
+        return self._post_call(
+            url_path=url_path,
+            payload=payload,
+            params=params,
+            headers=headers,
+        )
+
+    @log_elapsed(operation="EXTRACT")
+    def extract(
+        self,
+        payload: dict[str, Any],
+        params: Optional[dict[str, str]] = None,
+        headers: Optional[dict[str, str]] = None,
+    ) -> dict[str, Any]:
+        """Send an extraction request through `_post_call`.
+
+        Posts to `extract`, or to `extract-public` when `is_public_call` is set.
+
+        Args:
+            payload: Payload forwarded to `_post_call`.
+            params: Optional parameters forwarded to `_post_call`.
+            headers: Optional headers forwarded to `_post_call`.
+
+        Returns:
+            The value returned by `_post_call`.
+        """
+        url_path = "extract"
+        if self.is_public_call:
+            url_path = "extract-public"
+        return self._post_call(
+            url_path=url_path,
+            payload=payload,
+            params=params,
+            headers=headers,
+        )
 
     def single_pass_extraction(
         self, payload: dict[str, Any], params: Optional[dict[str, str]] = None
diff --git a/src/unstract/sdk/utils/indexing_utils.py b/src/unstract/sdk/utils/indexing_utils.py
new file mode 100644
index 0000000..8f2f032
--- /dev/null
+++ b/src/unstract/sdk/utils/indexing_utils.py
@@ -0,0 +1,80 @@
+import json
+from typing import Optional
+
+from unstract.sdk.adapter import ToolAdapter
+from unstract.sdk.file_storage import FileStorage, FileStorageProvider
+from unstract.sdk.tool.base import BaseTool
+from unstract.sdk.utils import ToolUtils
+
+
+class IndexingUtils:
+    """Utilities for deriving index keys."""
+
+    @staticmethod
+    def generate_index_key(
+        vector_db: str,
+        embedding: str,
+        x2text: str,
+        chunk_size: str,
+        chunk_overlap: str,
+        tool: BaseTool,
+        file_path: Optional[str] = None,
+        file_hash: Optional[str] = None,
+        fs: Optional[FileStorage] = None,
+    ) -> str:
+        """Generate a hashed index key for a document and its indexing setup.
+
+        The key combines the file hash, the adapter configs looked up for
+        `vector_db`, `embedding` and `x2text`, and the chunking options.
+
+        Args:
+            vector_db (str): Vector DB adapter reference passed to
+                `ToolAdapter.get_adapter_config`.
+            embedding (str): Embedding adapter reference passed to
+                `ToolAdapter.get_adapter_config`.
+            x2text (str): X2Text adapter reference passed to
+                `ToolAdapter.get_adapter_config`.
+            chunk_size (str): Chunk size; stringified before hashing.
+            chunk_overlap (str): Chunk overlap; stringified before hashing.
+            tool (BaseTool): Tool instance used to fetch the adapter configs.
+            file_path (Optional[str]): File to hash when `file_hash` is not
+                given.
+            file_hash (Optional[str]): Precomputed file hash; when given,
+                `file_path` is not read.
+            fs (Optional[FileStorage]): File storage used to hash `file_path`.
+                Defaults to a local `FileStorage` created on demand.
+
+        Returns:
+            str: Hash of the JSON-serialised key fields.
+
+        Raises:
+            ValueError: If neither `file_path` nor `file_hash` is provided.
+        """
+        if not file_path and not file_hash:
+            raise ValueError("One of `file_path` or `file_hash` need to be provided")
+
+        if not file_hash:
+            # Built here rather than in the signature: a call in a default
+            # argument runs once at import time and the instance is then
+            # shared by every caller.
+            if fs is None:
+                fs = FileStorage(provider=FileStorageProvider.LOCAL)
+            file_hash = fs.get_hash_from_file(path=file_path)
+
+        # Whole adapter config is used currently even though it contains some keys
+        # which might not be relevant to indexing. This is easier for now than
+        # marking certain keys of the adapter config as necessary.
+        index_key = {
+            "file_hash": file_hash,
+            "vector_db_config": ToolAdapter.get_adapter_config(tool, vector_db),
+            "embedding_config": ToolAdapter.get_adapter_config(tool, embedding),
+            "x2text_config": ToolAdapter.get_adapter_config(tool, x2text),
+            # Typed and hashed as strings since the final hash is persisted
+            # and this is required to be backward compatible
+            "chunk_size": str(chunk_size),
+            "chunk_overlap": str(chunk_overlap),
+        }
+        # JSON keys are sorted to ensure that the same key gets hashed even in
+        # case where the fields are reordered.
+        hashed_index_key = ToolUtils.hash_str(json.dumps(index_key, sort_keys=True))
+        return hashed_index_key