From 2db0fbfcc73c753e12be03776f1825b1c427934a Mon Sep 17 00:00:00 2001 From: Chandrasekharan M Date: Fri, 1 Mar 2024 15:28:27 +0530 Subject: [PATCH] Bumped SDK to 0.13.0 which uses adapters 0.3.0. Minor index file type and docstring fix --- pdm.lock | 52 ++++++++++++++++++------------------ pyproject.toml | 2 +- src/unstract/sdk/__init__.py | 2 +- src/unstract/sdk/index.py | 29 +++++++++++++++++--- tests/test_x2text.py | 5 +--- 5 files changed, 55 insertions(+), 35 deletions(-) diff --git a/pdm.lock b/pdm.lock index 9ed596b9..e192fea6 100644 --- a/pdm.lock +++ b/pdm.lock @@ -2,10 +2,10 @@ # It is not intended for manual editing. [metadata] -groups = ["default", "lint", "docs", "test"] +groups = ["default", "docs", "lint", "test"] strategy = ["cross_platform"] -lock_version = "4.4" -content_hash = "sha256:56a09975791556fa4873ecb5729b178bbb48979779aa2969866c48c56d021339" +lock_version = "4.4.1" +content_hash = "sha256:80a23596dda5cc100f214dca47d136cc92e40e19fa34567c467caeff39d5e9ac" [[package]] name = "aiohttp" @@ -352,22 +352,22 @@ files = [ [[package]] name = "boto3" -version = "1.34.50" +version = "1.34.53" requires_python = ">= 3.8" summary = "The AWS SDK for Python" dependencies = [ - "botocore<1.35.0,>=1.34.50", + "botocore<1.35.0,>=1.34.53", "jmespath<2.0.0,>=0.7.1", "s3transfer<0.11.0,>=0.10.0", ] files = [ - {file = "boto3-1.34.50-py3-none-any.whl", hash = "sha256:8d709365231234bc4f0ca98fdf33a25eeebf78072853c6aa3d259f0f5cf09877"}, - {file = "boto3-1.34.50.tar.gz", hash = "sha256:290952be7899560039cb0042e8a2354f61a7dead0d0ca8bea6ba901930df0468"}, + {file = "boto3-1.34.53-py3-none-any.whl", hash = "sha256:340c73f57fcca6f503403e2e13a0a4ad44bec218feee2e0896be612324394afd"}, + {file = "boto3-1.34.53.tar.gz", hash = "sha256:cd30261a782824ce543a628ae524480abb4ca6ab4e4a2631477e48baed43b5f2"}, ] [[package]] name = "botocore" -version = "1.34.50" +version = "1.34.53" requires_python = ">= 3.8" summary = "Low-level, data-driven core of boto 3." 
dependencies = [ @@ -377,8 +377,8 @@ dependencies = [ "urllib3<2.1,>=1.25.4; python_version >= \"3.10\"", ] files = [ - {file = "botocore-1.34.50-py3-none-any.whl", hash = "sha256:fda510559dbe796eefdb59561cc81be1b99afba3dee53fd23db9a3d587adc0ab"}, - {file = "botocore-1.34.50.tar.gz", hash = "sha256:33ab82cb96c4bb684f0dbafb071808e4817d83debc88b223e7d988256370c6d7"}, + {file = "botocore-1.34.53-py3-none-any.whl", hash = "sha256:cbbcaddc35738d32df55d26ed5561cf3fa32751a6b22e7e342be87b5e3f55eec"}, + {file = "botocore-1.34.53.tar.gz", hash = "sha256:3d243781e994dfc5b20036d9fb92672bfaef4dbe388eaa79dae6440ea56c53eb"}, ] [[package]] @@ -1133,7 +1133,7 @@ files = [ [[package]] name = "gotrue" -version = "2.1.0" +version = "2.4.1" requires_python = ">=3.8,<4.0" summary = "Python Client Library for GoTrue" dependencies = [ @@ -1141,8 +1141,8 @@ dependencies = [ "pydantic<3,>=1.10", ] files = [ - {file = "gotrue-2.1.0-py3-none-any.whl", hash = "sha256:6483d9a3ac9be1d1ad510be24171e133aa1cec702cc10a8f323b9e7519642447"}, - {file = "gotrue-2.1.0.tar.gz", hash = "sha256:b21d48ee64f0f6a1ed111efe4871a83e542529f1a75a264833b50e6433cd3c98"}, + {file = "gotrue-2.4.1-py3-none-any.whl", hash = "sha256:9647bb7a585c969d26667df21168fa20b18f91c5d6afe286af08d7a0610fd2cc"}, + {file = "gotrue-2.4.1.tar.gz", hash = "sha256:8b260ef285f45a3a2f9b5a006f12afb9fad7a36a28fa277f19e733f22eb88584"}, ] [[package]] @@ -2427,7 +2427,7 @@ files = [ [[package]] name = "pydantic" -version = "2.6.2" +version = "2.6.3" requires_python = ">=3.8" summary = "Data validation using Python type hints" dependencies = [ @@ -2436,8 +2436,8 @@ dependencies = [ "typing-extensions>=4.6.1", ] files = [ - {file = "pydantic-2.6.2-py3-none-any.whl", hash = "sha256:37a5432e54b12fecaa1049c5195f3d860a10e01bdfd24f1840ef14bd0d3aeab3"}, - {file = "pydantic-2.6.2.tar.gz", hash = "sha256:a09be1c3d28f3abe37f8a78af58284b236a92ce520105ddc91a6d29ea1176ba7"}, + {file = "pydantic-2.6.3-py3-none-any.whl", hash = 
"sha256:72c6034df47f46ccdf81869fddb81aade68056003900a8724a4f160700016a2a"}, + {file = "pydantic-2.6.3.tar.gz", hash = "sha256:e07805c4c7f5c6826e33a1d4c9d47950d7eaf34868e2690f8594d2e30241f11f"}, ] [[package]] @@ -2601,15 +2601,15 @@ files = [ [[package]] name = "python-dateutil" -version = "2.8.2" +version = "2.9.0" requires_python = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" summary = "Extensions to the standard Python datetime module" dependencies = [ "six>=1.5", ] files = [ - {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, - {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, + {file = "python-dateutil-2.9.0.tar.gz", hash = "sha256:78e73e19c63f5b20ffa567001531680d939dc042bf7850431877645523c66709"}, + {file = "python_dateutil-2.9.0-py2.py3-none-any.whl", hash = "sha256:cbf2f1da5e6083ac2fbfd4da39a25f34312230110440f424a14c7558bb85d82e"}, ] [[package]] @@ -2826,7 +2826,7 @@ files = [ [[package]] name = "rich" -version = "13.7.0" +version = "13.7.1" requires_python = ">=3.7.0" summary = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" dependencies = [ @@ -2834,8 +2834,8 @@ dependencies = [ "pygments<3.0.0,>=2.13.0", ] files = [ - {file = "rich-13.7.0-py3-none-any.whl", hash = "sha256:6da14c108c4866ee9520bbffa71f6fe3962e193b7da68720583850cd4548e235"}, - {file = "rich-13.7.0.tar.gz", hash = "sha256:5cb5123b5cf9ee70584244246816e9114227e0b98ad9176eede6ad54bf5403fa"}, + {file = "rich-13.7.1-py3-none-any.whl", hash = "sha256:4edbae314f59eb482f54e9e30bf00d33350aaa94f4bfcd4e9e3110e64d0d7222"}, + {file = "rich-13.7.1.tar.gz", hash = "sha256:9be308cb1fe2f1f57d67ce99e95af38a1e2bc71ad9813b0e247cf7ffbcc3a432"}, ] [[package]] @@ -3128,7 +3128,7 @@ version = "2.0.26" requires_python = ">=3.7" summary = "Database Abstraction Library" dependencies = [ - "greenlet!=0.4.17; platform_machine 
== \"aarch64\" or (platform_machine == \"ppc64le\" or (platform_machine == \"x86_64\" or (platform_machine == \"amd64\" or (platform_machine == \"AMD64\" or (platform_machine == \"win32\" or platform_machine == \"WIN32\")))))", + "greenlet!=0.4.17; platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\"", "typing-extensions>=4.6.0", ] files = [ @@ -3575,7 +3575,7 @@ files = [ [[package]] name = "unstract-adapters" -version = "0.2.2" +version = "0.3.0" requires_python = "<3.12,>=3.9" summary = "Unstract Adapters" dependencies = [ @@ -3602,8 +3602,8 @@ dependencies = [ "weaviate-client==3.25.3", ] files = [ - {file = "unstract_adapters-0.2.2-py3-none-any.whl", hash = "sha256:aaf48036f844f4e0295c36596b1772111ab18229246b8677227b983b3c4226e4"}, - {file = "unstract_adapters-0.2.2.tar.gz", hash = "sha256:465d7859a0eb7a905d2c626939f2fed3a501600f832c93160b3e27a559e5ab22"}, + {file = "unstract_adapters-0.3.0-py3-none-any.whl", hash = "sha256:ac2e6d6902bb54c0ef4d223b68365d4f044fb92b83c05bcd92208155e548253c"}, + {file = "unstract_adapters-0.3.0.tar.gz", hash = "sha256:3c4662a73dfaa0b9c0b947da8e3e351221b876c6dd505d054c70a3067fe8d9dc"}, ] [[package]] diff --git a/pyproject.toml b/pyproject.toml index 9b179244..350f46cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ dependencies = [ "python-magic~=0.4.27", "python-dotenv==1.0.0", # LLM Triad - "unstract-adapters~=0.2.2", + "unstract-adapters~=0.3.0", "llama-index==0.9.28", "tiktoken~=0.4.0", "transformers==4.37.0", diff --git a/src/unstract/sdk/__init__.py b/src/unstract/sdk/__init__.py index 9b6709e7..d729e648 100644 --- a/src/unstract/sdk/__init__.py +++ b/src/unstract/sdk/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.12.1" +__version__ = "0.13.0" def get_sdk_version(): diff --git a/src/unstract/sdk/index.py 
b/src/unstract/sdk/index.py index b0e31b0f..8790c6f3 100644 --- a/src/unstract/sdk/index.py +++ b/src/unstract/sdk/index.py @@ -1,5 +1,4 @@ from typing import Optional -import os from llama_index import Document, StorageContext, VectorStoreIndex from llama_index.node_parser import SimpleNodeParser @@ -106,7 +105,29 @@ def index_file( reindex: bool = False, file_hash: Optional[str] = None, output_file_path: Optional[str] = None, - ): + ) -> str: + """Indexes an individual file using the passed arguments. + + Args: + tool_id (str): UUID of the tool (workflow_id in case it's called + from workflow) + embedding_type (str): UUID of the embedding service configured + vector_db (str): UUID of the vector DB configured + x2text_adapter (str): UUID of the x2text adapter configured. + This is to extract text from documents. + file_path (str): Path to the file that needs to be indexed. + chunk_size (int): Chunk size to be used for indexing + chunk_overlap (int): Overlap in chunks to be used for indexing + reindex (bool, optional): Flag to denote if document should be + re-indexed if it's already indexed. Defaults to False. + file_hash (Optional[str], optional): SHA256 hash of the file. + Defaults to None. If None, the hash is generated. + output_file_path (Optional[str], optional): File path to write + the extracted contents into. Defaults to None. 
+ + Returns: + str: A unique ID for the file and indexing arguments combination + """ # Make file content hash if not available if not file_hash: file_hash = ToolUtils.get_hash_from_file(file_path=file_path) @@ -117,7 +138,9 @@ def index_file( x2text_adapter_inst: X2TextAdapter = x2text.get_x2text( adapter_instance_id=x2text_adapter ) - extracted_text = x2text_adapter_inst.process(input_file_path=file_path, output_file_path=output_file_path) + extracted_text = x2text_adapter_inst.process( + input_file_path=file_path, output_file_path=output_file_path + ) full_text.append( { "section": "full", diff --git a/tests/test_x2text.py b/tests/test_x2text.py index 39c65356..4f7e219d 100644 --- a/tests/test_x2text.py +++ b/tests/test_x2text.py @@ -6,7 +6,6 @@ from dotenv import load_dotenv from parameterized import parameterized -from unstract.adapters.x2text.constants import LLMWhispererSupportedModes from unstract.sdk.tool.base import BaseTool from unstract.sdk.x2txt import X2Text @@ -53,9 +52,7 @@ def test_get_x2text(self, adapter_instance_id): if os.path.isfile(output_file): os.remove(output_file) - file_content = x2text.process( - input_file, output_file, mode=LLMWhispererSupportedModes.OCR.value - ) + file_content = x2text.process(input_file, output_file) file_size = os.path.getsize(output_file) self.assertGreater(file_size, 0)