From b2ea8deb3cc189977f7a5d60f1449220022f9f6c Mon Sep 17 00:00:00 2001 From: Chandrasekharan M Date: Wed, 8 May 2024 08:47:31 +0530 Subject: [PATCH 1/2] Raise from exc related fixes, bumped to 0.25.1 --- src/unstract/sdk/__init__.py | 2 +- src/unstract/sdk/embedding.py | 2 +- src/unstract/sdk/exceptions.py | 4 ++++ src/unstract/sdk/index.py | 2 +- src/unstract/sdk/llm.py | 2 +- src/unstract/sdk/vector_db.py | 2 +- src/unstract/sdk/x2txt.py | 16 +++++++--------- 7 files changed, 16 insertions(+), 14 deletions(-) diff --git a/src/unstract/sdk/__init__.py b/src/unstract/sdk/__init__.py index e9466653..31199b5b 100644 --- a/src/unstract/sdk/__init__.py +++ b/src/unstract/sdk/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.25.0" +__version__ = "0.25.1" def get_sdk_version(): diff --git a/src/unstract/sdk/embedding.py b/src/unstract/sdk/embedding.py index 60199238..25b8f058 100644 --- a/src/unstract/sdk/embedding.py +++ b/src/unstract/sdk/embedding.py @@ -45,7 +45,7 @@ def get_embedding(self, adapter_instance_id: str) -> BaseEmbedding: self.tool.stream_log( log=f"Error getting embedding: {e}", level=LogLevel.ERROR ) - raise ToolEmbeddingError(f"Error getting embedding instance: {e}") + raise ToolEmbeddingError(f"Error getting embedding instance: {e}") from e def get_embedding_length(self, embedding: BaseEmbedding) -> int: embedding_list = embedding._get_text_embedding(self.__TEST_SNIPPET) diff --git a/src/unstract/sdk/exceptions.py b/src/unstract/sdk/exceptions.py index 1aa3463c..83e75470 100644 --- a/src/unstract/sdk/exceptions.py +++ b/src/unstract/sdk/exceptions.py @@ -29,5 +29,9 @@ class ToolVectorDBError(SdkError): DEFAULT_MESSAGE = "Error ocurred related to vector DB" +class X2TextError(SdkError): + DEFAULT_MESSAGE = "Error ocurred related to text extractor" + + class RateLimitError(SdkError): DEFAULT_MESSAGE = "Running into rate limit errors, please try again later" diff --git a/src/unstract/sdk/index.py b/src/unstract/sdk/index.py index 5c5dd4ef..605592bf 100644 --- a/src/unstract/sdk/index.py +++ b/src/unstract/sdk/index.py @@ -196,7 +196,7 @@ def index_file( f"Error deleting nodes for {doc_id}: {e}", level=LogLevel.ERROR, ) - raise SdkError(f"Error deleting nodes for {doc_id}: {e}") + raise SdkError(f"Error deleting nodes for {doc_id}: {e}") from e doc_id_found = False if doc_id_found: diff --git a/src/unstract/sdk/llm.py b/src/unstract/sdk/llm.py index 7b8b2f4c..b92b6b7f 100644 --- a/src/unstract/sdk/llm.py +++ b/src/unstract/sdk/llm.py @@ -103,7 +103,7 @@ def get_llm(self, adapter_instance_id: str) -> LLM: self.tool.stream_log( log=f"Unable to get llm instance: {e}", level=LogLevel.ERROR ) - raise ToolLLMError(f"Error getting llm instance: {e}") + raise ToolLLMError(f"Error getting llm instance: {e}") from e def get_max_tokens(self, reserved_for_output: int = 0) -> int: """Returns the maximum number of tokens that can be used for the LLM. diff --git a/src/unstract/sdk/vector_db.py b/src/unstract/sdk/vector_db.py index d40b9743..acc46563 100644 --- a/src/unstract/sdk/vector_db.py +++ b/src/unstract/sdk/vector_db.py @@ -77,4 +77,4 @@ def get_vector_db( log=f"Unable to get vector_db {adapter_instance_id}: {e}", level=LogLevel.ERROR, ) - raise ToolVectorDBError(f"Error getting vectorDB instance: {e}") + raise ToolVectorDBError(f"Error getting vectorDB instance: {e}") from e diff --git a/src/unstract/sdk/x2txt.py b/src/unstract/sdk/x2txt.py index 3809226f..003caa01 100644 --- a/src/unstract/sdk/x2txt.py +++ b/src/unstract/sdk/x2txt.py @@ -7,7 +7,7 @@ from unstract.sdk.adapters import ToolAdapter from unstract.sdk.constants import LogLevel -from unstract.sdk.exceptions import SdkError +from unstract.sdk.exceptions import X2TextError from unstract.sdk.tool.base import BaseTool @@ -28,17 +28,15 @@ def get_x2text(self, adapter_instance_id: str) -> X2TextAdapter: ][Common.ADAPTER] x2text_metadata = x2text_config.get(Common.ADAPTER_METADATA) # Add x2text service host, port and platform_service_key - x2text_metadata[ + x2text_metadata[X2TextConstants.X2TEXT_HOST] = self.tool.get_env_or_die( X2TextConstants.X2TEXT_HOST - ] = self.tool.get_env_or_die(X2TextConstants.X2TEXT_HOST) - x2text_metadata[ + ) + x2text_metadata[X2TextConstants.X2TEXT_PORT] = self.tool.get_env_or_die( X2TextConstants.X2TEXT_PORT - ] = self.tool.get_env_or_die(X2TextConstants.X2TEXT_PORT) + ) x2text_metadata[ X2TextConstants.PLATFORM_SERVICE_API_KEY - ] = self.tool.get_env_or_die( - X2TextConstants.PLATFORM_SERVICE_API_KEY - ) + ] = self.tool.get_env_or_die(X2TextConstants.PLATFORM_SERVICE_API_KEY) x2text_adapter_class = x2text_adapter(x2text_metadata) @@ -49,4 +47,4 @@ def get_x2text(self, adapter_instance_id: str) -> X2TextAdapter: log=f"Unable to get x2text adapter {adapter_instance_id}: {e}", level=LogLevel.ERROR, ) - raise SdkError(f"Error getting vectorDB instance: {e}") + raise X2TextError(f"Error getting text extractor: {e}") from e From 412ea7d6cc2ce068eca415bc00c9dca3a78bcac8 Mon Sep 17 00:00:00 2001 From: Chandrasekharan M Date: Thu, 9 May 2024 11:28:19 +0530 Subject: [PATCH 2/2] Index related error handling improv --- src/unstract/sdk/index.py | 60 +++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/src/unstract/sdk/index.py b/src/unstract/sdk/index.py index 605592bf..7ad986d3 100644 --- a/src/unstract/sdk/index.py +++ b/src/unstract/sdk/index.py @@ -246,29 +246,33 @@ def index_file( document.id_ = doc_id documents.append(document) self.tool.stream_log(f"Number of documents: {len(documents)}") - if chunk_size == 0: - parser = SimpleNodeParser.from_defaults( - chunk_size=len(documents[0].text) + 10, chunk_overlap=0 - ) - nodes = parser.get_nodes_from_documents(documents, show_progress=True) - node = nodes[0] - node.embedding = embedding_li.get_query_embedding(" ") - vector_db_li.add(nodes=[node]) - self.tool.stream_log("Added node to vector db") - else: - storage_context = StorageContext.from_defaults(vector_store=vector_db_li) - parser = SimpleNodeParser.from_defaults( - chunk_size=chunk_size, chunk_overlap=chunk_overlap - ) - # Set callback_manager to collect Usage stats - callback_manager = UNCallbackManager.set_callback_manager( - platform_api_key=self.tool.get_env_or_die(ToolEnv.PLATFORM_API_KEY), - embedding=embedding_li, - ) + try: + if chunk_size == 0: + parser = SimpleNodeParser.from_defaults( + chunk_size=len(documents[0].text) + 10, chunk_overlap=0 + ) + nodes = parser.get_nodes_from_documents(documents, show_progress=True) + node = nodes[0] + node.embedding = embedding_li.get_query_embedding(" ") + vector_db_li.add(nodes=[node]) + self.tool.stream_log("Added node to vector db") + else: + storage_context = StorageContext.from_defaults( + vector_store=vector_db_li + ) + parser = SimpleNodeParser.from_defaults( + chunk_size=chunk_size, chunk_overlap=chunk_overlap + ) + + # Set callback_manager to collect Usage stats + callback_manager = UNCallbackManager.set_callback_manager( + platform_api_key=self.tool.get_env_or_die(ToolEnv.PLATFORM_API_KEY), + embedding=embedding_li, + ) + + self.tool.stream_log("Adding nodes to vector db...") - self.tool.stream_log("Adding nodes to vector db...") - try: VectorStoreIndex.from_documents( documents, storage_context=storage_context, @@ -277,13 +281,13 @@ def index_file( node_parser=parser, callback_manager=callback_manager, ) - except Exception as e: - self.tool.stream_log( - f"Error adding nodes to vector db: {e}", - level=LogLevel.ERROR, - ) - raise IndexingError(str(e)) from e - self.tool.stream_log("Added nodes to vector db") + except Exception as e: + self.tool.stream_log( + f"Error adding nodes to vector db: {e}", + level=LogLevel.ERROR, + ) + raise IndexingError(str(e)) from e + self.tool.stream_log("Added nodes to vector db") self.tool.stream_log("File has been indexed successfully") return doc_id