diff --git a/pdm.lock b/pdm.lock index 64b7411e..aa4b4bda 100644 --- a/pdm.lock +++ b/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "docs", "lint", "test"] strategy = ["cross_platform"] lock_version = "4.4.1" -content_hash = "sha256:9f6e7db5ef796871022d62a1def86f37b18961d4397f0b64e7ec473093d73b63" +content_hash = "sha256:7426a23af99cd74eb85ff69795f7a67ac7f900de9745191d4d82947393892a84" [[package]] name = "aiohttp" @@ -352,22 +352,22 @@ files = [ [[package]] name = "boto3" -version = "1.34.53" +version = "1.34.57" requires_python = ">= 3.8" summary = "The AWS SDK for Python" dependencies = [ - "botocore<1.35.0,>=1.34.53", + "botocore<1.35.0,>=1.34.57", "jmespath<2.0.0,>=0.7.1", "s3transfer<0.11.0,>=0.10.0", ] files = [ - {file = "boto3-1.34.53-py3-none-any.whl", hash = "sha256:340c73f57fcca6f503403e2e13a0a4ad44bec218feee2e0896be612324394afd"}, - {file = "boto3-1.34.53.tar.gz", hash = "sha256:cd30261a782824ce543a628ae524480abb4ca6ab4e4a2631477e48baed43b5f2"}, + {file = "boto3-1.34.57-py3-none-any.whl", hash = "sha256:f8046e3e2d1186a49b49f7464c4811c265c86001f404dd1a96c4365c773a4245"}, + {file = "boto3-1.34.57.tar.gz", hash = "sha256:c26c31ceeeb2bc5d2bb96ba0fdc9a04d7b10e6e0b081c55b9cea9069a0be04dd"}, ] [[package]] name = "botocore" -version = "1.34.53" +version = "1.34.57" requires_python = ">= 3.8" summary = "Low-level, data-driven core of boto 3." dependencies = [ @@ -377,8 +377,8 @@ dependencies = [ "urllib3<2.1,>=1.25.4; python_version >= \"3.10\"", ] files = [ - {file = "botocore-1.34.53-py3-none-any.whl", hash = "sha256:cbbcaddc35738d32df55d26ed5561cf3fa32751a6b22e7e342be87b5e3f55eec"}, - {file = "botocore-1.34.53.tar.gz", hash = "sha256:3d243781e994dfc5b20036d9fb92672bfaef4dbe388eaa79dae6440ea56c53eb"}, + {file = "botocore-1.34.57-py3-none-any.whl", hash = "sha256:c8dafe0ad378a88bcf4153e6972870b03fb5aab406b694202307500709940baf"}, + {file = "botocore-1.34.57.tar.gz", hash = "sha256:9a5aa2034de9f0c367b4b61a92af0fa827f5c21affa19e0a284838a142e71083"}, ] [[package]] @@ -759,11 +759,11 @@ files = [ [[package]] name = "flatbuffers" -version = "23.5.26" +version = "24.3.6" summary = "The FlatBuffers serialization format for Python" files = [ - {file = "flatbuffers-23.5.26-py2.py3-none-any.whl", hash = "sha256:c0ff356da363087b915fde4b8b45bdda73432fc17cddb3c8157472eab1422ad1"}, - {file = "flatbuffers-23.5.26.tar.gz", hash = "sha256:9ea1144cac05ce5d86e2859f431c6cd5e66cd9c78c558317c7955fb8d4c78d89"}, + {file = "flatbuffers-24.3.6-py2.py3-none-any.whl", hash = "sha256:0bc1a9d968c0ba996f97b8c255214ec005cf21b962ef60157f7aa2fc647481f1"}, + {file = "flatbuffers-24.3.6.tar.gz", hash = "sha256:8d90a756ad5754be1fcdaa77065065125c9832ed045b4078875b4d3bc1953352"}, ] [[package]] @@ -954,7 +954,7 @@ files = [ [[package]] name = "google-cloud-bigquery" -version = "3.17.2" +version = "3.18.0" requires_python = ">=3.7" summary = "Google BigQuery API client library" dependencies = [ @@ -966,8 +966,8 @@ dependencies = [ "requests<3.0.0dev,>=2.21.0", ] files = [ - {file = "google-cloud-bigquery-3.17.2.tar.gz", hash = "sha256:6e1cf669a40e567ab3289c7b5f2056363da9fcb85d9a4736ee90240d4a7d84ea"}, - {file = "google_cloud_bigquery-3.17.2-py2.py3-none-any.whl", hash = "sha256:cdadf5283dca55a1a350bacf8c8a7466169d3cf46c5a0a3abc5e9aa0b0a51dee"}, + {file = "google-cloud-bigquery-3.18.0.tar.gz", hash = "sha256:74f0fc6f0ba9477f808d25924dc8a052c55f7ca91064e83e16d3ee5fb7ca77ab"}, + {file = "google_cloud_bigquery-3.18.0-py2.py3-none-any.whl", hash = "sha256:3520552075502c69710d37b1e9600c84e6974ad271914677d16bfafa502857fb"}, ] [[package]] @@ -986,37 +986,37 @@ files = [ [[package]] name = "google-cloud-resource-manager" -version = "1.12.2" +version = "1.12.3" requires_python = ">=3.7" summary = "Google Cloud Resource Manager API client library" dependencies = [ "google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.1", - "google-auth<3.0.0dev,>=2.14.1", + "google-auth!=2.24.0,!=2.25.0,<3.0.0dev,>=2.14.1", "grpc-google-iam-v1<1.0.0dev,>=0.12.4", "proto-plus<2.0.0dev,>=1.22.3", "protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5", ] files = [ - {file = "google-cloud-resource-manager-1.12.2.tar.gz", hash = "sha256:2ede446a5087b236f0e1fb39cca3791bae97eb0d9125057401454b190d5572ee"}, - {file = "google_cloud_resource_manager-1.12.2-py2.py3-none-any.whl", hash = "sha256:45abbb8911195cc831cc77c8e3be84decc271686579b332d4142af507f423ebf"}, + {file = "google-cloud-resource-manager-1.12.3.tar.gz", hash = "sha256:809851824119834e4f2310b2c4f38621c1d16b2bb14d5b9f132e69c79d355e7f"}, + {file = "google_cloud_resource_manager-1.12.3-py2.py3-none-any.whl", hash = "sha256:92be7d6959927b76d90eafc4028985c37975a46ded5466a018f02e8649e113d4"}, ] [[package]] name = "google-cloud-storage" -version = "2.14.0" +version = "2.15.0" requires_python = ">=3.7" summary = "Google Cloud Storage API client library" dependencies = [ - "google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5", - "google-auth<3.0dev,>=2.23.3", + "google-api-core<3.0.0dev,>=2.15.0", + "google-auth<3.0dev,>=2.26.1", "google-cloud-core<3.0dev,>=2.3.0", "google-crc32c<2.0dev,>=1.0", "google-resumable-media>=2.6.0", "requests<3.0.0dev,>=2.18.0", ] files = [ - {file = "google-cloud-storage-2.14.0.tar.gz", hash = "sha256:2d23fcf59b55e7b45336729c148bb1c464468c69d5efbaee30f7201dd90eb97e"}, - {file = "google_cloud_storage-2.14.0-py2.py3-none-any.whl", hash = "sha256:8641243bbf2a2042c16a6399551fbb13f062cbc9a2de38d6c0bb5426962e9dbd"}, + {file = "google-cloud-storage-2.15.0.tar.gz", hash = "sha256:7560a3c48a03d66c553dc55215d35883c680fe0ab44c23aa4832800ccc855c74"}, + {file = "google_cloud_storage-2.15.0-py2.py3-none-any.whl", hash = "sha256:5d9237f88b648e1d724a0f20b5cde65996a37fe51d75d17660b1404097327dd2"}, ] [[package]] @@ -1647,15 +1647,15 @@ files = [ [[package]] name = "marshmallow" -version = "3.21.0" +version = "3.21.1" requires_python = ">=3.8" summary = "A lightweight library for converting complex datatypes to and from native Python datatypes." dependencies = [ "packaging>=17.0", ] files = [ - {file = "marshmallow-3.21.0-py3-none-any.whl", hash = "sha256:e7997f83571c7fd476042c2c188e4ee8a78900ca5e74bd9c8097afa56624e9bd"}, - {file = "marshmallow-3.21.0.tar.gz", hash = "sha256:20f53be28c6e374a711a16165fb22a8dc6003e3f7cda1285e3ca777b9193885b"}, + {file = "marshmallow-3.21.1-py3-none-any.whl", hash = "sha256:f085493f79efb0644f270a9bf2892843142d80d7174bbbd2f3713f2a589dc633"}, + {file = "marshmallow-3.21.1.tar.gz", hash = "sha256:4e65e9e0d80fc9e609574b9983cf32579f305c718afb30d7233ab818571768c3"}, ] [[package]] @@ -1680,7 +1680,7 @@ files = [ [[package]] name = "minio" -version = "7.2.4" +version = "7.2.5" summary = "MinIO Python SDK for Amazon S3 Compatible Cloud Storage" dependencies = [ "argon2-cffi", @@ -1690,8 +1690,8 @@ dependencies = [ "urllib3", ] files = [ - {file = "minio-7.2.4-py3-none-any.whl", hash = "sha256:91b51c21d25e3ee6d51f52eab126d6c974371add0d77951e42c322a59c5533e7"}, - {file = "minio-7.2.4.tar.gz", hash = "sha256:d504d8464e5198fb74dd9b572cc88b185ae7997c17705e8c09f3fef2f439d984"}, + {file = "minio-7.2.5-py3-none-any.whl", hash = "sha256:ed9176c96d4271cb1022b9ecb8a538b1e55b32ae06add6de16425cab99ef2304"}, + {file = "minio-7.2.5.tar.gz", hash = "sha256:59d8906e2da248a9caac34d4958a859cc3a44abbe6447910c82b5abfa9d6a2e1"}, ] [[package]] @@ -2547,12 +2547,12 @@ files = [ [[package]] name = "pyparsing" -version = "3.1.1" +version = "3.1.2" requires_python = ">=3.6.8" summary = "pyparsing module - Classes and methods to define and execute parsing grammars" files = [ - {file = "pyparsing-3.1.1-py3-none-any.whl", hash = "sha256:32c7c0b711493c72ff18a981d24f28aaf9c1fb7ed5e9667c9e84e3db623bdbfb"}, - {file = "pyparsing-3.1.1.tar.gz", hash = "sha256:ede28a1a32462f5a9705e07aea48001a08f7cf81a021585011deba701581a0db"}, + {file = "pyparsing-3.1.2-py3-none-any.whl", hash = "sha256:f9db75911801ed778fe61bb643079ff86601aca99fcae6345aa67292038fb742"}, + {file = "pyparsing-3.1.2.tar.gz", hash = "sha256:a1bac0ce561155ecc3ed78ca94d3c9378656ad4c94c1270de543f621420f94ad"}, ] [[package]] @@ -2601,15 +2601,15 @@ files = [ [[package]] name = "python-dateutil" -version = "2.9.0" +version = "2.9.0.post0" requires_python = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" summary = "Extensions to the standard Python datetime module" dependencies = [ "six>=1.5", ] files = [ - {file = "python-dateutil-2.9.0.tar.gz", hash = "sha256:78e73e19c63f5b20ffa567001531680d939dc042bf7850431877645523c66709"}, - {file = "python_dateutil-2.9.0-py2.py3-none-any.whl", hash = "sha256:cbf2f1da5e6083ac2fbfd4da39a25f34312230110440f424a14c7558bb85d82e"}, + {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, + {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, ] [[package]] @@ -3575,7 +3575,7 @@ files = [ [[package]] name = "unstract-adapters" -version = "0.4.0" +version = "0.4.1" requires_python = "<3.12,>=3.9" summary = "Unstract Adapters" dependencies = [ @@ -3600,8 +3600,8 @@ dependencies = [ "weaviate-client==3.25.3", ] files = [ - {file = "unstract_adapters-0.4.0-py3-none-any.whl", hash = "sha256:6a7fd6fa376f883242c290a862c4d1b024c357b003cb9d86b494c350edfc58a4"}, - {file = "unstract_adapters-0.4.0.tar.gz", hash = "sha256:78c56813c626af61d1d7b64f3cc603ce7b276a11c96d634d154fce5bc55cd7e6"}, + {file = "unstract_adapters-0.4.1-py3-none-any.whl", hash = "sha256:d8be73fe96f37f347274b62360e2a99b69e83b93e4be4c93797e383e0a0cafbb"}, + {file = "unstract_adapters-0.4.1.tar.gz", hash = "sha256:1f0ceaa8a26af51dd46713ec73b029a115c2a9f43a00e10ae73e8dedee7e1069"}, ] [[package]] diff --git a/pyproject.toml b/pyproject.toml index d902e0d5..3a4cd715 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ dependencies = [ "python-magic~=0.4.27", "python-dotenv==1.0.0", # LLM Triad - "unstract-adapters~=0.4.0", + "unstract-adapters~=0.4.1", "llama-index==0.9.28", "tiktoken~=0.4.0", "transformers==4.37.0", diff --git a/src/unstract/sdk/__init__.py b/src/unstract/sdk/__init__.py index 3387b771..2842a6d1 100644 --- a/src/unstract/sdk/__init__.py +++ b/src/unstract/sdk/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.14.0" +__version__ = "0.15.0" def get_sdk_version(): diff --git a/src/unstract/sdk/exceptions.py b/src/unstract/sdk/exceptions.py index a116ab2f..9b077a5b 100644 --- a/src/unstract/sdk/exceptions.py +++ b/src/unstract/sdk/exceptions.py @@ -1,13 +1,10 @@ -from typing import Any, Optional +class SdkError(Exception): + DEFAULT_MESSAGE = "Something went wrong" + def __init__(self, message: str = DEFAULT_MESSAGE): + super().__init__(message) + # Make it user friendly wherever possible + self.message = message -class SdkException(Exception): - def __init__( - self, *args: Any, user_message: Optional[str] = None, **kwargs: Any - ) -> None: - super().__init__(*args, **kwargs) - self._user_message = user_message - - @property - def user_message(self) -> Optional[str]: - return self._user_message + def __str__(self) -> str: + return self.message diff --git a/src/unstract/sdk/index.py b/src/unstract/sdk/index.py index 8790c6f3..b320877d 100644 --- a/src/unstract/sdk/index.py +++ b/src/unstract/sdk/index.py @@ -3,11 +3,12 @@ from llama_index import Document, StorageContext, VectorStoreIndex from llama_index.node_parser import SimpleNodeParser from llama_index.vector_stores import VectorStoreQuery, VectorStoreQueryResult +from unstract.adapters.exceptions import AdapterError from unstract.adapters.x2text.x2text_adapter import X2TextAdapter from unstract.sdk.constants import LogLevel, ToolEnv from unstract.sdk.embedding import ToolEmbedding -from unstract.sdk.exceptions import SdkException +from unstract.sdk.exceptions import SdkError from unstract.sdk.tool.base import BaseTool from unstract.sdk.utils import ToolUtils from unstract.sdk.utils.service_context import ServiceContext @@ -30,7 +31,7 @@ def get_text_from_index( self.tool.stream_log( f"Error loading {embedding_type}", level=LogLevel.ERROR ) - raise SdkException(f"Error loading {embedding_type}") + raise SdkError(f"Error loading {embedding_type}") embedding_dimension = embedd_helper.get_embedding_length(embedding_li) vdb_helper = ToolVectorDB( @@ -45,7 +46,7 @@ def get_text_from_index( self.tool.stream_log( f"Error loading {vector_db}", level=LogLevel.ERROR ) - raise SdkException(f"Error loading {vector_db}") + raise SdkError(f"Error loading {vector_db}") try: self.tool.stream_log(f">>> Querying {vector_db}...") @@ -59,7 +60,7 @@ def get_text_from_index( self.tool.stream_log( f"Error querying {vector_db}: {e}", level=LogLevel.ERROR ) - raise SdkException(f"Error querying {vector_db}: {e}") + raise SdkError(f"Error querying {vector_db}: {e}") n: VectorStoreQueryResult = vector_db_li.query(query=q) if len(n.nodes) > 0: @@ -134,13 +135,18 @@ def index_file( self.tool.stream_log("Extracting text from input file") full_text = [] - x2text = X2Text(tool=self.tool) - x2text_adapter_inst: X2TextAdapter = x2text.get_x2text( - adapter_instance_id=x2text_adapter - ) - extracted_text = x2text_adapter_inst.process( - input_file_path=file_path, output_file_path=output_file_path - ) + extracted_text = "" + try: + x2text = X2Text(tool=self.tool) + x2text_adapter_inst: X2TextAdapter = x2text.get_x2text( + adapter_instance_id=x2text_adapter + ) + extracted_text = x2text_adapter_inst.process( + input_file_path=file_path, output_file_path=output_file_path + ) + except AdapterError as e: + # Wrapping AdapterErrors with SdkError + raise SdkError(str(e)) from e full_text.append( { "section": "full", @@ -173,7 +179,7 @@ def index_file( self.tool.stream_log( f"Error loading {embedding_type}", level=LogLevel.ERROR ) - raise SdkException(f"Error loading {embedding_type}") + raise SdkError(f"Error loading {embedding_type}") embedding_dimension = embedd_helper.get_embedding_length(embedding_li) vector_db_li = vdb_helper.get_vector_db( @@ -184,7 +190,7 @@ def index_file( self.tool.stream_log( f"Error loading {vector_db}", level=LogLevel.ERROR ) - raise SdkException(f"Error loading {vector_db}") + raise SdkError(f"Error loading {vector_db}") q = VectorStoreQuery( query_embedding=embedding_li.get_query_embedding(" "), @@ -214,7 +220,7 @@ def index_file( f"Error deleting nodes for {doc_id}: {e}", level=LogLevel.ERROR, ) - raise SdkException(f"Error deleting nodes for {doc_id}: {e}") + raise SdkError(f"Error deleting nodes for {doc_id}: {e}") doc_id_not_found = True if doc_id_not_found: @@ -271,7 +277,7 @@ def index_file( f"Error adding nodes to vector db: {e}", level=LogLevel.ERROR, ) - raise SdkException(f"Error adding nodes to vector db: {e}") + raise SdkError(f"Error adding nodes to vector db: {e}") self.tool.stream_log("Added nodes to vector db") self.tool.stream_log("Done indexing file") diff --git a/src/unstract/sdk/vector_db.py b/src/unstract/sdk/vector_db.py index 8abf1409..fdf0b1c6 100644 --- a/src/unstract/sdk/vector_db.py +++ b/src/unstract/sdk/vector_db.py @@ -5,9 +5,10 @@ from unstract.adapters.constants import Common from unstract.adapters.vectordb import adapters from unstract.adapters.vectordb.constants import VectorDbConstants + from unstract.sdk.adapters import ToolAdapter from unstract.sdk.constants import LogLevel, ToolEnv, ToolSettingsKey -from unstract.sdk.exceptions import SdkException +from unstract.sdk.exceptions import SdkError from unstract.sdk.platform import PlatformHelper from unstract.sdk.tool.base import BaseTool @@ -34,7 +35,7 @@ def __get_org_id(self) -> str: platform_details = platform_helper.get_platform_details() if not platform_details: # Errors are logged by the SDK itself - raise SdkException("Error getting platform details") + raise SdkError("Error getting platform details") account_id = platform_details.get("organization_id") return account_id