From 0db00cdec50789c48fcfa5076bfbd3d20efde5ac Mon Sep 17 00:00:00 2001 From: Chandrasekharan M Date: Thu, 25 Apr 2024 00:47:54 +0530 Subject: [PATCH 1/6] Index key generation fix, file hash optimized, dead code removed, version bumped to 0.23.0 --- pdm.lock | 38 +++++++------- src/unstract/sdk/__init__.py | 2 +- src/unstract/sdk/constants.py | 4 -- src/unstract/sdk/embedding.py | 25 ++------- src/unstract/sdk/index.py | 61 ++++++++++------------ src/unstract/sdk/llm.py | 68 +++++++++--------------- src/unstract/sdk/utils/tool_utils.py | 32 +++++++----- src/unstract/sdk/vector_db.py | 78 +++++++++++----------------- src/unstract/sdk/x2txt.py | 6 +-- 9 files changed, 126 insertions(+), 188 deletions(-) diff --git a/pdm.lock b/pdm.lock index f07db70c..e3cf4d02 100644 --- a/pdm.lock +++ b/pdm.lock @@ -1428,7 +1428,7 @@ files = [ [[package]] name = "llama-index-core" -version = "0.10.30" +version = "0.10.31" requires_python = "<4.0,>=3.8.1" summary = "Interface between LLMs and your data" dependencies = [ @@ -1457,8 +1457,8 @@ dependencies = [ "wrapt", ] files = [ - {file = "llama_index_core-0.10.30-py3-none-any.whl", hash = "sha256:2f291ce2975f9dbf0ea87d684d3d8122ce216265f468f32baa2cf4ecfb34ed2a"}, - {file = "llama_index_core-0.10.30.tar.gz", hash = "sha256:bed3f683606a0b0eb0839677c935a4b57b7bae509a95d380e51c6225630660e0"}, + {file = "llama_index_core-0.10.31-py3-none-any.whl", hash = "sha256:b894680fa320a94de56d9a933ac7edb646cabf15fe67ae1cf8fa53ac52ab4542"}, + {file = "llama_index_core-0.10.31.tar.gz", hash = "sha256:66d39d6f253e20311a21e0b98ea386089f099be12f2d23dbe11379a6d908ddf1"}, ] [[package]] @@ -1675,7 +1675,7 @@ files = [ [[package]] name = "llama-index-program-openai" -version = "0.1.5" +version = "0.1.6" requires_python = "<4.0,>=3.8.1" summary = "llama-index program openai integration" dependencies = [ @@ -1684,8 +1684,8 @@ dependencies = [ "llama-index-llms-openai<0.2.0,>=0.1.1", ] files = [ - {file = "llama_index_program_openai-0.1.5-py3-none-any.whl", hash = "sha256:20b6efa706ac73e4dc5086900fea1ffcb1eb0787c8a6f081669d37da7235aee0"}, - {file = "llama_index_program_openai-0.1.5.tar.gz", hash = "sha256:c33aa2d2876ad0ff1f9a2a755d4e7d4917240847d0174e7b2d0b8474499bb700"}, + {file = "llama_index_program_openai-0.1.6-py3-none-any.whl", hash = "sha256:4660b338503537c5edca1e0dab606af6ce372b4f1b597e2833c6b602447c5d8d"}, + {file = "llama_index_program_openai-0.1.6.tar.gz", hash = "sha256:c6a4980c5ea826088b28b4dee3367edb20221e6d05eb0e05019049190131d772"}, ] [[package]] @@ -2094,7 +2094,7 @@ files = [ [[package]] name = "openai" -version = "1.23.2" +version = "1.23.4" requires_python = ">=3.7.1" summary = "The official Python library for the openai API" dependencies = [ @@ -2107,8 +2107,8 @@ dependencies = [ "typing-extensions<5,>=4.7", ] files = [ - {file = "openai-1.23.2-py3-none-any.whl", hash = "sha256:293a36effde29946eb221040c89c46a4850f2f2e30b37ef09ff6d75226d71b42"}, - {file = "openai-1.23.2.tar.gz", hash = "sha256:b84aa3005357ceb38f22a269e0e22ee58ce103897f447032d021906f18178a8e"}, + {file = "openai-1.23.4-py3-none-any.whl", hash = "sha256:ecb72dcb415c8a1f1b6ef2fe32f8fc9a0942727b6365e8caedf916db5c19b180"}, + {file = "openai-1.23.4.tar.gz", hash = "sha256:72c5a2ab2cda5727b6897f9d079aec16ceccf7dd2e0e0c84a21f7304d5484563"}, ] [[package]] @@ -2303,12 +2303,12 @@ files = [ [[package]] name = "platformdirs" -version = "4.2.0" +version = "4.2.1" requires_python = ">=3.8" -summary = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +summary = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." files = [ - {file = "platformdirs-4.2.0-py3-none-any.whl", hash = "sha256:0614df2a2f37e1a662acbd8e2b25b92ccf8632929bc6d43467e17fe89c75e068"}, - {file = "platformdirs-4.2.0.tar.gz", hash = "sha256:ef0cc731df711022c174543cb70a9b5bd22e5a9337c8624ef2c2ceb8ddad8768"}, + {file = "platformdirs-4.2.1-py3-none-any.whl", hash = "sha256:17d5a1161b3fd67b390023cb2d3b026bbd40abde6fdb052dfbd3a29c3ba22ee1"}, + {file = "platformdirs-4.2.1.tar.gz", hash = "sha256:031cd18d4ec63ec53e82dceaac0417d218a6863f7745dfcc9efe7793b7039bdf"}, ] [[package]] @@ -2792,7 +2792,7 @@ files = [ [[package]] name = "referencing" -version = "0.34.0" +version = "0.35.0" requires_python = ">=3.8" summary = "JSON Referencing + Python" dependencies = [ @@ -2800,8 +2800,8 @@ dependencies = [ "rpds-py>=0.7.0", ] files = [ - {file = "referencing-0.34.0-py3-none-any.whl", hash = "sha256:d53ae300ceddd3169f1ffa9caf2cb7b769e92657e4fafb23d34b93679116dfd4"}, - {file = "referencing-0.34.0.tar.gz", hash = "sha256:5773bd84ef41799a5a8ca72dc34590c041eb01bf9aa02632b4a973fb0181a844"}, + {file = "referencing-0.35.0-py3-none-any.whl", hash = "sha256:8080727b30e364e5783152903672df9b6b091c926a146a759080b62ca3126cd6"}, + {file = "referencing-0.35.0.tar.gz", hash = "sha256:191e936b0c696d0af17ad7430a3dc68e88bc11be6514f4757dc890f04ab05889"}, ] [[package]] @@ -3583,7 +3583,7 @@ files = [ [[package]] name = "virtualenv" -version = "20.25.3" +version = "20.26.0" requires_python = ">=3.7" summary = "Virtual Python Environment builder" dependencies = [ @@ -3592,8 +3592,8 @@ dependencies = [ "platformdirs<5,>=3.9.1", ] files = [ - {file = "virtualenv-20.25.3-py3-none-any.whl", hash = "sha256:8aac4332f2ea6ef519c648d0bc48a5b1d324994753519919bddbb1aff25a104e"}, - {file = "virtualenv-20.25.3.tar.gz", hash = "sha256:7bb554bbdfeaacc3349fa614ea5bff6ac300fc7c335e9facf3a3bcfc703f45be"}, + {file = "virtualenv-20.26.0-py3-none-any.whl", hash = "sha256:0846377ea76e818daaa3e00a4365c018bc3ac9760cbb3544de542885aad61fb3"}, + {file = "virtualenv-20.26.0.tar.gz", hash = "sha256:ec25a9671a5102c8d2657f62792a27b48f016664c6873f6beed3800008577210"}, ] [[package]] diff --git a/src/unstract/sdk/__init__.py b/src/unstract/sdk/__init__.py index 1735d618..dff86478 100644 --- a/src/unstract/sdk/__init__.py +++ b/src/unstract/sdk/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.22.0" +__version__ = "0.23.0" def get_sdk_version(): diff --git a/src/unstract/sdk/constants.py b/src/unstract/sdk/constants.py index 9a1ee282..451fab02 100644 --- a/src/unstract/sdk/constants.py +++ b/src/unstract/sdk/constants.py @@ -146,7 +146,3 @@ class ToolSettingsKey: EMBEDDING_ADAPTER_ID = "embeddingAdapterId" VECTOR_DB_ADAPTER_ID = "vectorDbAdapterId" X2TEXT_ADAPTER_ID = "x2TextAdapterId" - - -class FileReaderSettings: - FILE_READER_CHUNK_SIZE = 8192 diff --git a/src/unstract/sdk/embedding.py b/src/unstract/sdk/embedding.py index 05496838..952490f4 100644 --- a/src/unstract/sdk/embedding.py +++ b/src/unstract/sdk/embedding.py @@ -1,11 +1,9 @@ -from typing import Optional - from llama_index.core.embeddings import BaseEmbedding from unstract.adapters.constants import Common from unstract.adapters.embedding import adapters from unstract.sdk.adapters import ToolAdapter -from unstract.sdk.constants import LogLevel, ToolSettingsKey +from unstract.sdk.constants import LogLevel from unstract.sdk.exceptions import SdkError from unstract.sdk.tool.base import BaseTool @@ -13,34 +11,17 @@ class ToolEmbedding: __TEST_SNIPPET = "Hello, I am Unstract" - def __init__(self, tool: BaseTool, tool_settings: dict[str, str] = {}): + def __init__(self, tool: BaseTool): self.tool = tool self.max_tokens = 1024 * 16 self.embedding_adapters = adapters - self.embedding_adapter_instance_id = tool_settings.get( - ToolSettingsKey.EMBEDDING_ADAPTER_ID - ) - self.embedding_adapter_id: Optional[str] = None - def get_embedding( - self, adapter_instance_id: Optional[str] = None - ) -> BaseEmbedding: - adapter_instance_id = ( - adapter_instance_id - if adapter_instance_id - else self.embedding_adapter_instance_id - ) - if not adapter_instance_id: - raise SdkError( - f"Adapter_instance_id does not have " - f"a valid value: {adapter_instance_id}" - ) + def get_embedding(self, adapter_instance_id: str) -> BaseEmbedding: try: embedding_config_data = ToolAdapter.get_adapter_config( self.tool, adapter_instance_id ) embedding_adapter_id = embedding_config_data.get(Common.ADAPTER_ID) - self.embedding_adapter_id = embedding_adapter_id if embedding_adapter_id in self.embedding_adapters: embedding_adapter = self.embedding_adapters[ embedding_adapter_id diff --git a/src/unstract/sdk/index.py b/src/unstract/sdk/index.py index c615b56b..db526c9c 100644 --- a/src/unstract/sdk/index.py +++ b/src/unstract/sdk/index.py @@ -14,6 +14,7 @@ from unstract.adapters.exceptions import AdapterError from unstract.adapters.x2text.x2text_adapter import X2TextAdapter +from unstract.sdk.adapters import ToolAdapter from unstract.sdk.constants import LogLevel, ToolEnv from unstract.sdk.embedding import ToolEmbedding from unstract.sdk.exceptions import IndexingError, SdkError @@ -38,11 +39,6 @@ def get_text_from_index( embedding_li = embedd_helper.get_embedding( adapter_instance_id=embedding_type ) - if embedding_li is None: - self.tool.stream_log( - f"Error loading {embedding_type}", level=LogLevel.ERROR - ) - raise SdkError(f"Error loading {embedding_type}") embedding_dimension = embedd_helper.get_embedding_length(embedding_li) vdb_helper = ToolVectorDB( @@ -53,12 +49,6 @@ def get_text_from_index( embedding_dimension=embedding_dimension, ) - if vector_db_li is None: - self.tool.stream_log( - f"Error loading {vector_db}", level=LogLevel.ERROR - ) - raise SdkError(f"Error loading {vector_db}") - try: self.tool.stream_log(f">>> Querying {vector_db}...") self.tool.stream_log(f">>> {doc_id}") @@ -153,7 +143,7 @@ def index_file( if not file_hash: file_hash = ToolUtils.get_hash_from_file(file_path=file_path) - doc_id = ToolIndex.generate_file_id( + doc_id = self.generate_file_id( tool_id=tool_id, file_hash=file_hash, vector_db=vector_db, @@ -162,35 +152,25 @@ def index_file( chunk_size=chunk_size, chunk_overlap=chunk_overlap, ) - self.tool.stream_log(f"Checking if doc_id {doc_id} exists") - vdb_helper = ToolVectorDB( - tool=self.tool, - ) - + # Get embedding instance embedd_helper = ToolEmbedding(tool=self.tool) - embedding_li = embedd_helper.get_embedding( adapter_instance_id=embedding_type ) - if embedding_li is None: - self.tool.stream_log( - f"Error loading {embedding_type}", level=LogLevel.ERROR - ) - raise SdkError(f"Error loading {embedding_type}") - embedding_dimension = embedd_helper.get_embedding_length(embedding_li) + + # Get vectorDB instance + vdb_helper = ToolVectorDB( + tool=self.tool, + ) vector_db_li = vdb_helper.get_vector_db( adapter_instance_id=vector_db, embedding_dimension=embedding_dimension, ) - if vector_db_li is None: - self.tool.stream_log( - f"Error loading {vector_db}", level=LogLevel.ERROR - ) - raise SdkError(f"Error loading {vector_db}") + # Checking if document is already indexed against doc_id doc_id_eq_filter = MetadataFilter.from_dict( {"key": "doc_id", "operator": FilterOperator.EQ, "value": doc_id} ) @@ -319,8 +299,8 @@ def index_file( self.tool.stream_log("File has been indexed successfully") return doc_id - @staticmethod def generate_file_id( + self, tool_id: str, file_hash: str, vector_db: str, @@ -332,7 +312,7 @@ def generate_file_id( """Generates a unique ID useful for identifying files during indexing. Args: - tool_id (str): Unique ID of the tool developed / exported + tool_id (str): Unique ID of the tool or workflow file_hash (str): Hash of the file contents vector_db (str): UUID of the vector DB adapter embedding (str): UUID of the embedding adapter @@ -343,7 +323,18 @@ def generate_file_id( Returns: str: Key representing unique ID for a file """ - return ( - f"{tool_id}|{vector_db}|{embedding}|{x2text}|" - f"{chunk_size}|{chunk_overlap}|{file_hash}" - ) + index_key = { + "tool_id": tool_id, + "file_hash": file_hash, + "vector_db_config": ToolAdapter.get_adapter_config( + self.tool, vector_db + ), + "embedding_config": ToolAdapter.get_adapter_config( + self.tool, embedding + ), + "x2text_config": ToolAdapter.get_adapter_config(self.tool, x2text), + "chunk_size": chunk_size, + "chunk_overlap": chunk_overlap, + } + hashed_index_key = ToolUtils.hash_str(ToolUtils.json_to_str(index_key)) + return hashed_index_key diff --git a/src/unstract/sdk/llm.py b/src/unstract/sdk/llm.py index 95498764..39a3f8ce 100644 --- a/src/unstract/sdk/llm.py +++ b/src/unstract/sdk/llm.py @@ -9,7 +9,7 @@ from unstract.adapters.llm.llm_adapter import LLMAdapter from unstract.sdk.adapters import ToolAdapter -from unstract.sdk.constants import LogLevel, ToolSettingsKey +from unstract.sdk.constants import LogLevel from unstract.sdk.exceptions import SdkError from unstract.sdk.tool.base import BaseTool from unstract.sdk.utils.callback_manager import ( @@ -24,12 +24,8 @@ class ToolLLM: json_regex = re.compile(r"\{(?:.|\n)*\}") - def __init__( - self, - tool: BaseTool, - tool_settings: dict[str, str] = {}, - ): - """ + def __init__(self, tool: BaseTool): + """ToolLLM constructor. Notes: - "Azure OpenAI" : Environment variables required @@ -42,9 +38,7 @@ def __init__( self.tool = tool self.max_tokens = 1024 * 4 self.llm_adapters = adapters - self.llm_adapter_instance_id = tool_settings.get( - ToolSettingsKey.LLM_ADAPTER_ID - ) + self.llm_config_data: Optional[dict[str, Any]] = None @classmethod def run_completion( @@ -90,47 +84,35 @@ def run_completion( time.sleep(5) return None - def get_llm(self, adapter_instance_id: Optional[str] = None) -> LLM: + def get_llm(self, adapter_instance_id: str) -> LLM: """Returns the LLM object for the tool. Returns: LLM: The LLM object for the tool. (llama_index.llms.base.LLM) """ - adapter_instance_id = ( - adapter_instance_id - if adapter_instance_id - else self.llm_adapter_instance_id - ) - # Support for get_llm using adapter_instance_id - if adapter_instance_id is not None: - try: - llm_config_data = ToolAdapter.get_adapter_config( - self.tool, adapter_instance_id - ) - llm_adapter_id = llm_config_data.get(Common.ADAPTER_ID) - if llm_adapter_id in self.llm_adapters: - llm_adapter = self.llm_adapters[llm_adapter_id][ - Common.METADATA - ][Common.ADAPTER] - llm_metadata = llm_config_data.get(Common.ADAPTER_METADATA) - llm_adapter_class: LLMAdapter = llm_adapter(llm_metadata) - llm_instance: LLM = llm_adapter_class.get_llm_instance() - return llm_instance - else: - raise SdkError( - f"LLM adapter not supported : " f"{llm_adapter_id}" - ) - except Exception as e: - self.tool.stream_log( - log=f"Unable to get llm instance: {e}", level=LogLevel.ERROR + try: + llm_config_data = ToolAdapter.get_adapter_config( + self.tool, adapter_instance_id + ) + llm_adapter_id = llm_config_data.get(Common.ADAPTER_ID) + if llm_adapter_id in self.llm_adapters: + llm_adapter = self.llm_adapters[llm_adapter_id][ + Common.METADATA + ][Common.ADAPTER] + llm_metadata = llm_config_data.get(Common.ADAPTER_METADATA) + llm_adapter_class: LLMAdapter = llm_adapter(llm_metadata) + llm_instance: LLM = llm_adapter_class.get_llm_instance() + return llm_instance + else: + raise SdkError( + f"LLM adapter not supported : " f"{llm_adapter_id}" ) - raise SdkError(f"Error getting llm instance: {e}") - else: - raise SdkError( - f"Adapter_instance_id does not have " - f"a valid value: {adapter_instance_id}" + except Exception as e: + self.tool.stream_log( + log=f"Unable to get llm instance: {e}", level=LogLevel.ERROR ) + raise SdkError(f"Error getting llm instance: {e}") def get_max_tokens(self, reserved_for_output: int = 0) -> int: """Returns the maximum number of tokens that can be used for the LLM. diff --git a/src/unstract/sdk/utils/tool_utils.py b/src/unstract/sdk/utils/tool_utils.py index a66c8b33..ee70ff5e 100644 --- a/src/unstract/sdk/utils/tool_utils.py +++ b/src/unstract/sdk/utils/tool_utils.py @@ -5,8 +5,6 @@ import magic -from unstract.sdk.constants import FileReaderSettings - class ToolUtils: """Class containing utility methods.""" @@ -38,18 +36,24 @@ def hash_str(string_to_hash: Any, hash_method: str = "sha256") -> str: raise ValueError(f"Unsupported hash_method: {hash_method}") @staticmethod - def get_hash_from_file(file_path: str): - hashes = [] - chunk_size = FileReaderSettings.FILE_READER_CHUNK_SIZE - - with open(file_path, "rb") as f: - while True: - chunk = f.read(chunk_size) - if not chunk: - break # End of file - hashes.append(ToolUtils.hash_str(chunk)) - hash_value = ToolUtils.hash_str("".join(hashes)) - return hash_value + def get_hash_from_file(file_path: str) -> str: + """Computes the hash for a file. + + Uses sha256 to compute the file hash through a buffered read. + + Args: + file_path (str): Path to file that needs to be hashed + + Returns: + str: SHA256 hash of the file + """ + h = sha256() + b = bytearray(128 * 1024) + mv = memoryview(b) + with open(file_path, "rb", buffering=0) as f: + while n := f.readinto(mv): + h.update(mv[:n]) + return str(h.hexdigest()) @staticmethod def load_json(file_to_load: str) -> dict[str, Any]: diff --git a/src/unstract/sdk/vector_db.py b/src/unstract/sdk/vector_db.py index 08c2ce4b..fb8b853e 100644 --- a/src/unstract/sdk/vector_db.py +++ b/src/unstract/sdk/vector_db.py @@ -10,7 +10,7 @@ from unstract.adapters.vectordb.constants import VectorDbConstants from unstract.sdk.adapters import ToolAdapter -from unstract.sdk.constants import LogLevel, ToolEnv, ToolSettingsKey +from unstract.sdk.constants import LogLevel, ToolEnv from unstract.sdk.exceptions import SdkError from unstract.sdk.platform import PlatformHelper from unstract.sdk.tool.base import BaseTool @@ -21,12 +21,9 @@ class ToolVectorDB: """Class to handle VectorDB for Unstract Tools.""" - def __init__(self, tool: BaseTool, tool_settings: dict[str, str] = {}): + def __init__(self, tool: BaseTool): self.tool = tool self.vector_db_adapters = adapters - self.vector_db_adapter_instance_id = tool_settings.get( - ToolSettingsKey.VECTOR_DB_ADAPTER_ID - ) def __get_org_id(self) -> str: platform_helper = PlatformHelper( @@ -45,49 +42,36 @@ def __get_org_id(self) -> str: def get_vector_db( self, adapter_instance_id: str, embedding_dimension: int ) -> Union[BasePydanticVectorStore, VectorStore]: - adapter_instance_id = ( - adapter_instance_id - if adapter_instance_id - else self.vector_db_adapter_instance_id - ) - if adapter_instance_id is not None: - try: - vector_db_config = ToolAdapter.get_adapter_config( - self.tool, adapter_instance_id + try: + vector_db_config = ToolAdapter.get_adapter_config( + self.tool, adapter_instance_id + ) + vector_db_adapter_id = vector_db_config.get(Common.ADAPTER_ID) + if vector_db_adapter_id in self.vector_db_adapters: + vector_db_adapter = self.vector_db_adapters[ + vector_db_adapter_id + ][Common.METADATA][Common.ADAPTER] + vector_db_metadata = vector_db_config.get( + Common.ADAPTER_METADATA ) - vector_db_adapter_id = vector_db_config.get(Common.ADAPTER_ID) - if vector_db_adapter_id in self.vector_db_adapters: - vector_db_adapter = self.vector_db_adapters[ - vector_db_adapter_id - ][Common.METADATA][Common.ADAPTER] - vector_db_metadata = vector_db_config.get( - Common.ADAPTER_METADATA - ) - org = self.__get_org_id() - # Adding the collection prefix and embedding type - # to the metadata - vector_db_metadata[VectorDbConstants.VECTOR_DB_NAME] = org - vector_db_metadata[ - VectorDbConstants.EMBEDDING_DIMENSION - ] = embedding_dimension + org = self.__get_org_id() + # Adding the collection prefix and embedding type + # to the metadata + vector_db_metadata[VectorDbConstants.VECTOR_DB_NAME] = org + vector_db_metadata[ + VectorDbConstants.EMBEDDING_DIMENSION + ] = embedding_dimension - vector_db_adapter_class = vector_db_adapter( - vector_db_metadata - ) - return vector_db_adapter_class.get_vector_db_instance() - else: - raise SdkError( - f"VectorDB adapter not supported : " - f"{vector_db_adapter_id}" - ) - except Exception as e: - self.tool.stream_log( - log=f"Unable to get vector_db {adapter_instance_id}: {e}", - level=LogLevel.ERROR, + vector_db_adapter_class = vector_db_adapter(vector_db_metadata) + return vector_db_adapter_class.get_vector_db_instance() + else: + raise SdkError( + f"VectorDB adapter not supported : " + f"{vector_db_adapter_id}" ) - raise SdkError(f"Error getting vectorDB instance: {e}") - else: - raise SdkError( - f"Adapter_instance_id does not have " - f"a valid value: {adapter_instance_id}" + except Exception as e: + self.tool.stream_log( + log=f"Unable to get vector_db {adapter_instance_id}: {e}", + level=LogLevel.ERROR, ) + raise SdkError(f"Error getting vectorDB instance: {e}") diff --git a/src/unstract/sdk/x2txt.py b/src/unstract/sdk/x2txt.py index d6e37327..3809226f 100644 --- a/src/unstract/sdk/x2txt.py +++ b/src/unstract/sdk/x2txt.py @@ -1,5 +1,4 @@ from abc import ABCMeta -from typing import Optional from unstract.adapters.constants import Common from unstract.adapters.x2text import adapters @@ -8,6 +7,7 @@ from unstract.sdk.adapters import ToolAdapter from unstract.sdk.constants import LogLevel +from unstract.sdk.exceptions import SdkError from unstract.sdk.tool.base import BaseTool @@ -16,7 +16,7 @@ def __init__(self, tool: BaseTool): self.tool = tool self.x2text_adapters = adapters - def get_x2text(self, adapter_instance_id: str) -> Optional[X2TextAdapter]: + def get_x2text(self, adapter_instance_id: str) -> X2TextAdapter: try: x2text_config = ToolAdapter.get_adapter_config( self.tool, adapter_instance_id @@ -49,4 +49,4 @@ def get_x2text(self, adapter_instance_id: str) -> Optional[X2TextAdapter]: log=f"Unable to get x2text adapter {adapter_instance_id}: {e}", level=LogLevel.ERROR, ) - return None + raise SdkError(f"Error getting vectorDB instance: {e}") From 181215edd60368ddec344f1c9c2a19f2ac04035d Mon Sep 17 00:00:00 2001 From: Chandrasekharan M <117059509+chandrasekharan-zipstack@users.noreply.github.com> Date: Thu, 25 Apr 2024 18:27:43 +0530 Subject: [PATCH 2/6] Update src/unstract/sdk/llm.py Co-authored-by: Ritwik G <100672805+ritwik-g@users.noreply.github.com> Signed-off-by: Chandrasekharan M <117059509+chandrasekharan-zipstack@users.noreply.github.com> --- src/unstract/sdk/llm.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/unstract/sdk/llm.py b/src/unstract/sdk/llm.py index 39a3f8ce..99e15b1d 100644 --- a/src/unstract/sdk/llm.py +++ b/src/unstract/sdk/llm.py @@ -96,18 +96,18 @@ def get_llm(self, adapter_instance_id: str) -> LLM: self.tool, adapter_instance_id ) llm_adapter_id = llm_config_data.get(Common.ADAPTER_ID) - if llm_adapter_id in self.llm_adapters: - llm_adapter = self.llm_adapters[llm_adapter_id][ - Common.METADATA - ][Common.ADAPTER] - llm_metadata = llm_config_data.get(Common.ADAPTER_METADATA) - llm_adapter_class: LLMAdapter = llm_adapter(llm_metadata) - llm_instance: LLM = llm_adapter_class.get_llm_instance() - return llm_instance - else: + if llm_adapter_id not in self.llm_adapters: raise SdkError( f"LLM adapter not supported : " f"{llm_adapter_id}" ) + + llm_adapter = self.llm_adapters[llm_adapter_id][ + Common.METADATA + ][Common.ADAPTER] + llm_metadata = llm_config_data.get(Common.ADAPTER_METADATA) + llm_adapter_class: LLMAdapter = llm_adapter(llm_metadata) + llm_instance: LLM = llm_adapter_class.get_llm_instance() + return llm_instance except Exception as e: self.tool.stream_log( log=f"Unable to get llm instance: {e}", level=LogLevel.ERROR From 92d5e771151e8fa20c53a717b717016a555106f3 Mon Sep 17 00:00:00 2001 From: Chandrasekharan M Date: Thu, 25 Apr 2024 18:35:19 +0530 Subject: [PATCH 3/6] Added docstrings, minor PR comments addressed --- src/unstract/sdk/embedding.py | 28 +++++++++++++++-------- src/unstract/sdk/vector_db.py | 43 ++++++++++++++++++++--------------- 2 files changed, 43 insertions(+), 28 deletions(-) diff --git a/src/unstract/sdk/embedding.py b/src/unstract/sdk/embedding.py index 952490f4..79238241 100644 --- a/src/unstract/sdk/embedding.py +++ b/src/unstract/sdk/embedding.py @@ -17,25 +17,33 @@ def __init__(self, tool: BaseTool): self.embedding_adapters = adapters def get_embedding(self, adapter_instance_id: str) -> BaseEmbedding: + """Gets an instance of LlamaIndex's embedding object. + + Args: + adapter_instance_id (str): UUID of the embedding adapter + + Returns: + BaseEmbedding: Embedding instance + """ try: embedding_config_data = ToolAdapter.get_adapter_config( self.tool, adapter_instance_id ) embedding_adapter_id = embedding_config_data.get(Common.ADAPTER_ID) - if embedding_adapter_id in self.embedding_adapters: - embedding_adapter = self.embedding_adapters[ - embedding_adapter_id - ][Common.METADATA][Common.ADAPTER] - embedding_metadata = embedding_config_data.get( - Common.ADAPTER_METADATA - ) - embedding_adapter_class = embedding_adapter(embedding_metadata) - return embedding_adapter_class.get_embedding_instance() - else: + if embedding_adapter_id not in self.embedding_adapters: raise SdkError( f"Embedding adapter not supported : " f"{embedding_adapter_id}" ) + + embedding_adapter = self.embedding_adapters[embedding_adapter_id][ + Common.METADATA + ][Common.ADAPTER] + embedding_metadata = embedding_config_data.get( + Common.ADAPTER_METADATA + ) + embedding_adapter_class = embedding_adapter(embedding_metadata) + return embedding_adapter_class.get_embedding_instance() except Exception as e: self.tool.stream_log( log=f"Error getting embedding: {e}", level=LogLevel.ERROR diff --git a/src/unstract/sdk/vector_db.py b/src/unstract/sdk/vector_db.py index fb8b853e..9c015be7 100644 --- a/src/unstract/sdk/vector_db.py +++ b/src/unstract/sdk/vector_db.py @@ -42,33 +42,40 @@ def __get_org_id(self) -> str: def get_vector_db( self, adapter_instance_id: str, embedding_dimension: int ) -> Union[BasePydanticVectorStore, VectorStore]: + """Gets an instance of LlamaIndex's VectorStore. + + Args: + adapter_instance_id (str): UUID of the vector DB adapter + embedding_dimension (int): Embedding dimension for the vector store + + Returns: + Union[BasePydanticVectorStore, VectorStore]: Vector store instance + """ try: vector_db_config = ToolAdapter.get_adapter_config( self.tool, adapter_instance_id ) vector_db_adapter_id = vector_db_config.get(Common.ADAPTER_ID) - if vector_db_adapter_id in self.vector_db_adapters: - vector_db_adapter = self.vector_db_adapters[ - vector_db_adapter_id - ][Common.METADATA][Common.ADAPTER] - vector_db_metadata = vector_db_config.get( - Common.ADAPTER_METADATA - ) - org = self.__get_org_id() - # Adding the collection prefix and embedding type - # to the metadata - vector_db_metadata[VectorDbConstants.VECTOR_DB_NAME] = org - vector_db_metadata[ - VectorDbConstants.EMBEDDING_DIMENSION - ] = embedding_dimension - - vector_db_adapter_class = vector_db_adapter(vector_db_metadata) - return vector_db_adapter_class.get_vector_db_instance() - else: + if vector_db_adapter_id not in self.vector_db_adapters: raise SdkError( f"VectorDB adapter not supported : " f"{vector_db_adapter_id}" ) + + vector_db_adapter = self.vector_db_adapters[vector_db_adapter_id][ + Common.METADATA + ][Common.ADAPTER] + vector_db_metadata = vector_db_config.get(Common.ADAPTER_METADATA) + org = self.__get_org_id() + # Adding the collection prefix and embedding type + # to the metadata + vector_db_metadata[VectorDbConstants.VECTOR_DB_NAME] = org + vector_db_metadata[ + VectorDbConstants.EMBEDDING_DIMENSION + ] = embedding_dimension + + vector_db_adapter_class = vector_db_adapter(vector_db_metadata) + return vector_db_adapter_class.get_vector_db_instance() except Exception as e: self.tool.stream_log( log=f"Unable to get vector_db {adapter_instance_id}: {e}", From 25100d9dbef941c97616104464868feba69f28ac Mon Sep 17 00:00:00 2001 From: Chandrasekharan M <117059509+chandrasekharan-zipstack@users.noreply.github.com> Date: Tue, 30 Apr 2024 16:00:30 +0530 Subject: [PATCH 4/6] Version bumped to 0.24.0 Signed-off-by: Chandrasekharan M <117059509+chandrasekharan-zipstack@users.noreply.github.com> --- src/unstract/sdk/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unstract/sdk/__init__.py b/src/unstract/sdk/__init__.py index dff86478..5c5e7053 100644 --- a/src/unstract/sdk/__init__.py +++ b/src/unstract/sdk/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.23.0" +__version__ = "0.24.0" def get_sdk_version(): From 992e153a0c5eeecbade2f6238628c8fd295fb168 Mon Sep 17 00:00:00 2001 From: Chandrasekharan M Date: Tue, 30 Apr 2024 18:03:25 +0530 Subject: [PATCH 5/6] Addressed PR comments, sort JSON keys and avoid requiring file_hash for key generation --- .pre-commit-config.yaml | 4 +-- pyproject.toml | 2 +- src/unstract/sdk/index.py | 61 ++++++++++++++++----------------------- 3 files changed, 28 insertions(+), 39 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7c8f98e9..b5ea5f5c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -50,7 +50,7 @@ repos: rev: 24.2.0 hooks: - id: black - args: [--config=pyproject.toml, -l 80] + args: [--config=pyproject.toml, -l 88] language: system exclude: | (?x)^( @@ -60,7 +60,7 @@ repos: rev: 7.0.0 hooks: - id: flake8 - args: [--max-line-length=80] + args: [--max-line-length=88] exclude: | (?x)^( .*migrations/.*\.py| diff --git a/pyproject.toml b/pyproject.toml index db114345..3f027330 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,7 +53,7 @@ lint = [ ] [tool.isort] -line_length = 80 +line_length = 88 multi_line_output = 3 include_trailing_comma = true force_grid_wrap = 0 diff --git a/src/unstract/sdk/index.py b/src/unstract/sdk/index.py index db526c9c..7a6b024c 100644 --- a/src/unstract/sdk/index.py +++ b/src/unstract/sdk/index.py @@ -1,3 +1,4 @@ +import json from typing import Optional from llama_index.core import Document @@ -20,9 +21,7 @@ from unstract.sdk.exceptions import IndexingError, SdkError from unstract.sdk.tool.base import BaseTool from unstract.sdk.utils import ToolUtils -from unstract.sdk.utils.callback_manager import ( - CallbackManager as UNCallbackManager, -) +from unstract.sdk.utils.callback_manager import CallbackManager as UNCallbackManager from unstract.sdk.vector_db import ToolVectorDB from unstract.sdk.x2txt import X2Text @@ -32,13 +31,9 @@ def __init__(self, tool: BaseTool): # TODO: Inherit from StreamMixin and avoid using BaseTool self.tool = tool - def get_text_from_index( - self, embedding_type: str, vector_db: str, doc_id: str - ): + def get_text_from_index(self, embedding_type: str, vector_db: str, doc_id: str): embedd_helper = ToolEmbedding(tool=self.tool) - embedding_li = embedd_helper.get_embedding( - adapter_instance_id=embedding_type - ) + embedding_li = embedd_helper.get_embedding(adapter_instance_id=embedding_type) embedding_dimension = embedd_helper.get_embedding_length(embedding_li) vdb_helper = ToolVectorDB( @@ -139,26 +134,20 @@ def index_file( Returns: str: A unique ID for the file and indexing arguments combination """ - # Make file content hash if not available - if not file_hash: - file_hash = ToolUtils.get_hash_from_file(file_path=file_path) - doc_id = self.generate_file_id( tool_id=tool_id, file_hash=file_hash, vector_db=vector_db, embedding=embedding_type, x2text=x2text_adapter, - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, + chunk_size=str(chunk_size), + chunk_overlap=str(chunk_overlap), ) self.tool.stream_log(f"Checking if doc_id {doc_id} exists") # Get embedding instance embedd_helper = ToolEmbedding(tool=self.tool) - embedding_li = embedd_helper.get_embedding( - adapter_instance_id=embedding_type - ) + embedding_li = embedd_helper.get_embedding(adapter_instance_id=embedding_type) embedding_dimension = embedd_helper.get_embedding_length(embedding_li) # Get vectorDB instance @@ -255,26 +244,20 @@ def index_file( parser = SimpleNodeParser.from_defaults( chunk_size=len(documents[0].text) + 10, chunk_overlap=0 ) - nodes = parser.get_nodes_from_documents( - documents, show_progress=True - ) + nodes = parser.get_nodes_from_documents(documents, show_progress=True) node = nodes[0] node.embedding = embedding_li.get_query_embedding(" ") vector_db_li.add(nodes=[node]) self.tool.stream_log("Added node to vector db") else: - storage_context = StorageContext.from_defaults( - vector_store=vector_db_li - ) + storage_context = StorageContext.from_defaults(vector_store=vector_db_li) parser = SimpleNodeParser.from_defaults( chunk_size=chunk_size, chunk_overlap=chunk_overlap ) # Set callback_manager to collect Usage stats callback_manager = UNCallbackManager.set_callback_manager( - platform_api_key=self.tool.get_env_or_die( - ToolEnv.PLATFORM_API_KEY - ), + platform_api_key=self.tool.get_env_or_die(ToolEnv.PLATFORM_API_KEY), embedding=embedding_li, ) @@ -302,39 +285,45 @@ def index_file( def generate_file_id( self, tool_id: str, - file_hash: str, vector_db: str, embedding: str, x2text: str, chunk_size: str, chunk_overlap: str, + file_path: Optional[str] = None, + file_hash: Optional[str] = None, ) -> str: """Generates a unique ID useful for identifying files during indexing. Args: tool_id (str): Unique ID of the tool or workflow - file_hash (str): Hash of the file contents vector_db (str): UUID of the vector DB adapter embedding (str): UUID of the embedding adapter x2text (str): UUID of the X2Text adapter chunk_size (str): Chunk size for indexing chunk_overlap (str): Chunk overlap for indexing + file_path (Optional[str]): Path to the file that needs to be indexed. + Defaults to None. One of file_path or file_hash needs to be specified. + file_hash (Optional[str], optional): SHA256 hash of the file. + Defaults to None. If None, the hash is generated with file_path. Returns: str: Key representing unique ID for a file """ + if not file_path and not file_hash: + raise ValueError("One of `file_path` or `file_hash` need to be provided") + + if not file_hash: + file_hash = ToolUtils.get_hash_from_file(file_path=file_path) + index_key = { "tool_id": tool_id, "file_hash": file_hash, - "vector_db_config": ToolAdapter.get_adapter_config( - self.tool, vector_db - ), - "embedding_config": ToolAdapter.get_adapter_config( - self.tool, embedding - ), + "vector_db_config": ToolAdapter.get_adapter_config(self.tool, vector_db), + "embedding_config": ToolAdapter.get_adapter_config(self.tool, embedding), "x2text_config": ToolAdapter.get_adapter_config(self.tool, x2text), "chunk_size": chunk_size, "chunk_overlap": chunk_overlap, } - hashed_index_key = ToolUtils.hash_str(ToolUtils.json_to_str(index_key)) + hashed_index_key = ToolUtils.hash_str(json.dumps(index_key, sort_keys=True)) return hashed_index_key From 94519f598e9c7b1dd2c1dac9a28eb1697266dd90 Mon Sep 17 00:00:00 2001 From: Chandrasekharan M Date: Tue, 30 Apr 2024 19:31:05 +0530 Subject: [PATCH 6/6] Minor fix, added comments --- src/unstract/sdk/index.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/unstract/sdk/index.py b/src/unstract/sdk/index.py index 7a6b024c..59aa38bc 100644 --- a/src/unstract/sdk/index.py +++ b/src/unstract/sdk/index.py @@ -136,12 +136,13 @@ def index_file( """ doc_id = self.generate_file_id( tool_id=tool_id, - file_hash=file_hash, vector_db=vector_db, embedding=embedding_type, x2text=x2text_adapter, chunk_size=str(chunk_size), chunk_overlap=str(chunk_overlap), + file_path=file_path, + file_hash=file_hash, ) self.tool.stream_log(f"Checking if doc_id {doc_id} exists") @@ -316,6 +317,9 @@ def generate_file_id( if not file_hash: file_hash = ToolUtils.get_hash_from_file(file_path=file_path) + # Whole adapter config is used currently even though it contains some keys + # which might not be relevant to indexing. This is easier for now than + # marking certain keys of the adapter config as necessary. index_key = { "tool_id": tool_id, "file_hash": file_hash, @@ -325,5 +329,7 @@ def generate_file_id( "chunk_size": chunk_size, "chunk_overlap": chunk_overlap, } + # JSON keys are sorted to ensure that the same key gets hashed even in + # case where the fields are reordered. hashed_index_key = ToolUtils.hash_str(json.dumps(index_key, sort_keys=True)) return hashed_index_key