From 2db0fbfcc73c753e12be03776f1825b1c427934a Mon Sep 17 00:00:00 2001
From: Chandrasekharan M <chandrasekharan@zipstack.com>
Date: Fri, 1 Mar 2024 15:28:27 +0530
Subject: [PATCH] Bumped SDK to 0.13.0 which uses adapters 0.3.0. Minor index
 file type and docstring fix

---
 pdm.lock                     | 52 ++++++++++++++++++------------------
 pyproject.toml               |  2 +-
 src/unstract/sdk/__init__.py |  2 +-
 src/unstract/sdk/index.py    | 29 +++++++++++++++++---
 tests/test_x2text.py         |  5 +---
 5 files changed, 55 insertions(+), 35 deletions(-)

diff --git a/pdm.lock b/pdm.lock
index 9ed596b9..e192fea6 100644
--- a/pdm.lock
+++ b/pdm.lock
@@ -2,10 +2,10 @@
 # It is not intended for manual editing.
 
 [metadata]
-groups = ["default", "lint", "docs", "test"]
+groups = ["default", "docs", "lint", "test"]
 strategy = ["cross_platform"]
-lock_version = "4.4"
-content_hash = "sha256:56a09975791556fa4873ecb5729b178bbb48979779aa2969866c48c56d021339"
+lock_version = "4.4.1"
+content_hash = "sha256:80a23596dda5cc100f214dca47d136cc92e40e19fa34567c467caeff39d5e9ac"
 
 [[package]]
 name = "aiohttp"
@@ -352,22 +352,22 @@ files = [
 
 [[package]]
 name = "boto3"
-version = "1.34.50"
+version = "1.34.53"
 requires_python = ">= 3.8"
 summary = "The AWS SDK for Python"
 dependencies = [
-    "botocore<1.35.0,>=1.34.50",
+    "botocore<1.35.0,>=1.34.53",
     "jmespath<2.0.0,>=0.7.1",
     "s3transfer<0.11.0,>=0.10.0",
 ]
 files = [
-    {file = "boto3-1.34.50-py3-none-any.whl", hash = "sha256:8d709365231234bc4f0ca98fdf33a25eeebf78072853c6aa3d259f0f5cf09877"},
-    {file = "boto3-1.34.50.tar.gz", hash = "sha256:290952be7899560039cb0042e8a2354f61a7dead0d0ca8bea6ba901930df0468"},
+    {file = "boto3-1.34.53-py3-none-any.whl", hash = "sha256:340c73f57fcca6f503403e2e13a0a4ad44bec218feee2e0896be612324394afd"},
+    {file = "boto3-1.34.53.tar.gz", hash = "sha256:cd30261a782824ce543a628ae524480abb4ca6ab4e4a2631477e48baed43b5f2"},
 ]
 
 [[package]]
 name = "botocore"
-version = "1.34.50"
+version = "1.34.53"
 requires_python = ">= 3.8"
 summary = "Low-level, data-driven core of boto 3."
 dependencies = [
@@ -377,8 +377,8 @@ dependencies = [
     "urllib3<2.1,>=1.25.4; python_version >= \"3.10\"",
 ]
 files = [
-    {file = "botocore-1.34.50-py3-none-any.whl", hash = "sha256:fda510559dbe796eefdb59561cc81be1b99afba3dee53fd23db9a3d587adc0ab"},
-    {file = "botocore-1.34.50.tar.gz", hash = "sha256:33ab82cb96c4bb684f0dbafb071808e4817d83debc88b223e7d988256370c6d7"},
+    {file = "botocore-1.34.53-py3-none-any.whl", hash = "sha256:cbbcaddc35738d32df55d26ed5561cf3fa32751a6b22e7e342be87b5e3f55eec"},
+    {file = "botocore-1.34.53.tar.gz", hash = "sha256:3d243781e994dfc5b20036d9fb92672bfaef4dbe388eaa79dae6440ea56c53eb"},
 ]
 
 [[package]]
@@ -1133,7 +1133,7 @@ files = [
 
 [[package]]
 name = "gotrue"
-version = "2.1.0"
+version = "2.4.1"
 requires_python = ">=3.8,<4.0"
 summary = "Python Client Library for GoTrue"
 dependencies = [
@@ -1141,8 +1141,8 @@ dependencies = [
     "pydantic<3,>=1.10",
 ]
 files = [
-    {file = "gotrue-2.1.0-py3-none-any.whl", hash = "sha256:6483d9a3ac9be1d1ad510be24171e133aa1cec702cc10a8f323b9e7519642447"},
-    {file = "gotrue-2.1.0.tar.gz", hash = "sha256:b21d48ee64f0f6a1ed111efe4871a83e542529f1a75a264833b50e6433cd3c98"},
+    {file = "gotrue-2.4.1-py3-none-any.whl", hash = "sha256:9647bb7a585c969d26667df21168fa20b18f91c5d6afe286af08d7a0610fd2cc"},
+    {file = "gotrue-2.4.1.tar.gz", hash = "sha256:8b260ef285f45a3a2f9b5a006f12afb9fad7a36a28fa277f19e733f22eb88584"},
 ]
 
 [[package]]
@@ -2427,7 +2427,7 @@ files = [
 
 [[package]]
 name = "pydantic"
-version = "2.6.2"
+version = "2.6.3"
 requires_python = ">=3.8"
 summary = "Data validation using Python type hints"
 dependencies = [
@@ -2436,8 +2436,8 @@ dependencies = [
     "typing-extensions>=4.6.1",
 ]
 files = [
-    {file = "pydantic-2.6.2-py3-none-any.whl", hash = "sha256:37a5432e54b12fecaa1049c5195f3d860a10e01bdfd24f1840ef14bd0d3aeab3"},
-    {file = "pydantic-2.6.2.tar.gz", hash = "sha256:a09be1c3d28f3abe37f8a78af58284b236a92ce520105ddc91a6d29ea1176ba7"},
+    {file = "pydantic-2.6.3-py3-none-any.whl", hash = "sha256:72c6034df47f46ccdf81869fddb81aade68056003900a8724a4f160700016a2a"},
+    {file = "pydantic-2.6.3.tar.gz", hash = "sha256:e07805c4c7f5c6826e33a1d4c9d47950d7eaf34868e2690f8594d2e30241f11f"},
 ]
 
 [[package]]
@@ -2601,15 +2601,15 @@ files = [
 
 [[package]]
 name = "python-dateutil"
-version = "2.8.2"
+version = "2.9.0"
 requires_python = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
 summary = "Extensions to the standard Python datetime module"
 dependencies = [
     "six>=1.5",
 ]
 files = [
-    {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"},
-    {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"},
+    {file = "python-dateutil-2.9.0.tar.gz", hash = "sha256:78e73e19c63f5b20ffa567001531680d939dc042bf7850431877645523c66709"},
+    {file = "python_dateutil-2.9.0-py2.py3-none-any.whl", hash = "sha256:cbf2f1da5e6083ac2fbfd4da39a25f34312230110440f424a14c7558bb85d82e"},
 ]
 
 [[package]]
@@ -2826,7 +2826,7 @@ files = [
 
 [[package]]
 name = "rich"
-version = "13.7.0"
+version = "13.7.1"
 requires_python = ">=3.7.0"
 summary = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal"
 dependencies = [
@@ -2834,8 +2834,8 @@ dependencies = [
     "pygments<3.0.0,>=2.13.0",
 ]
 files = [
-    {file = "rich-13.7.0-py3-none-any.whl", hash = "sha256:6da14c108c4866ee9520bbffa71f6fe3962e193b7da68720583850cd4548e235"},
-    {file = "rich-13.7.0.tar.gz", hash = "sha256:5cb5123b5cf9ee70584244246816e9114227e0b98ad9176eede6ad54bf5403fa"},
+    {file = "rich-13.7.1-py3-none-any.whl", hash = "sha256:4edbae314f59eb482f54e9e30bf00d33350aaa94f4bfcd4e9e3110e64d0d7222"},
+    {file = "rich-13.7.1.tar.gz", hash = "sha256:9be308cb1fe2f1f57d67ce99e95af38a1e2bc71ad9813b0e247cf7ffbcc3a432"},
 ]
 
 [[package]]
@@ -3128,7 +3128,7 @@ version = "2.0.26"
 requires_python = ">=3.7"
 summary = "Database Abstraction Library"
 dependencies = [
-    "greenlet!=0.4.17; platform_machine == \"aarch64\" or (platform_machine == \"ppc64le\" or (platform_machine == \"x86_64\" or (platform_machine == \"amd64\" or (platform_machine == \"AMD64\" or (platform_machine == \"win32\" or platform_machine == \"WIN32\")))))",
+    "greenlet!=0.4.17; platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\"",
     "typing-extensions>=4.6.0",
 ]
 files = [
@@ -3575,7 +3575,7 @@ files = [
 
 [[package]]
 name = "unstract-adapters"
-version = "0.2.2"
+version = "0.3.0"
 requires_python = "<3.12,>=3.9"
 summary = "Unstract Adapters"
 dependencies = [
@@ -3602,8 +3602,8 @@ dependencies = [
     "weaviate-client==3.25.3",
 ]
 files = [
-    {file = "unstract_adapters-0.2.2-py3-none-any.whl", hash = "sha256:aaf48036f844f4e0295c36596b1772111ab18229246b8677227b983b3c4226e4"},
-    {file = "unstract_adapters-0.2.2.tar.gz", hash = "sha256:465d7859a0eb7a905d2c626939f2fed3a501600f832c93160b3e27a559e5ab22"},
+    {file = "unstract_adapters-0.3.0-py3-none-any.whl", hash = "sha256:ac2e6d6902bb54c0ef4d223b68365d4f044fb92b83c05bcd92208155e548253c"},
+    {file = "unstract_adapters-0.3.0.tar.gz", hash = "sha256:3c4662a73dfaa0b9c0b947da8e3e351221b876c6dd505d054c70a3067fe8d9dc"},
 ]
 
 [[package]]
diff --git a/pyproject.toml b/pyproject.toml
index 9b179244..350f46cb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,7 +15,7 @@ dependencies = [
     "python-magic~=0.4.27",
     "python-dotenv==1.0.0",
     # LLM Triad
-    "unstract-adapters~=0.2.2",
+    "unstract-adapters~=0.3.0",
     "llama-index==0.9.28",
     "tiktoken~=0.4.0",
     "transformers==4.37.0",
diff --git a/src/unstract/sdk/__init__.py b/src/unstract/sdk/__init__.py
index 9b6709e7..d729e648 100644
--- a/src/unstract/sdk/__init__.py
+++ b/src/unstract/sdk/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.12.1"
+__version__ = "0.13.0"
 
 
 def get_sdk_version():
diff --git a/src/unstract/sdk/index.py b/src/unstract/sdk/index.py
index b0e31b0f..8790c6f3 100644
--- a/src/unstract/sdk/index.py
+++ b/src/unstract/sdk/index.py
@@ -1,5 +1,4 @@
 from typing import Optional
-import os
 
 from llama_index import Document, StorageContext, VectorStoreIndex
 from llama_index.node_parser import SimpleNodeParser
@@ -106,7 +105,29 @@ def index_file(
         reindex: bool = False,
         file_hash: Optional[str] = None,
         output_file_path: Optional[str] = None,
-    ):
+    ) -> str:
+        """Indexes an individual file using the passed arguments.
+
+        Args:
+            tool_id (str): UUID of the tool (workflow_id in case it's called
+                from workflow)
+            embedding_type (str): UUID of the embedding service configured
+            vector_db (str): UUID of the vector DB configured
+            x2text_adapter (str): UUID of the x2text adapter configured.
+                This is to extract text from documents.
+            file_path (str): Path to the file that needs to be indexed.
+            chunk_size (int): Chunk size to be used for indexing
+            chunk_overlap (int): Overlap in chunks to be used for indexing
+            reindex (bool, optional): Flag to denote if document should be
+                re-indexed if it's already indexed. Defaults to False.
+            file_hash (Optional[str], optional): SHA256 hash of the file.
+                Defaults to None. If None, the hash is generated.
+            output_file_path (Optional[str], optional): File path to write
+                the extracted contents into. Defaults to None.
+
+        Returns:
+            str: A unique ID for the file and indexing arguments combination
+        """
         # Make file content hash if not available
         if not file_hash:
             file_hash = ToolUtils.get_hash_from_file(file_path=file_path)
@@ -117,7 +138,9 @@ def index_file(
         x2text_adapter_inst: X2TextAdapter = x2text.get_x2text(
             adapter_instance_id=x2text_adapter
         )
-        extracted_text = x2text_adapter_inst.process(input_file_path=file_path, output_file_path=output_file_path)
+        extracted_text = x2text_adapter_inst.process(
+            input_file_path=file_path, output_file_path=output_file_path
+        )
         full_text.append(
             {
                 "section": "full",
diff --git a/tests/test_x2text.py b/tests/test_x2text.py
index 39c65356..4f7e219d 100644
--- a/tests/test_x2text.py
+++ b/tests/test_x2text.py
@@ -6,7 +6,6 @@
 
 from dotenv import load_dotenv
 from parameterized import parameterized
-from unstract.adapters.x2text.constants import LLMWhispererSupportedModes
 
 from unstract.sdk.tool.base import BaseTool
 from unstract.sdk.x2txt import X2Text
@@ -53,9 +52,7 @@ def test_get_x2text(self, adapter_instance_id):
 
         if os.path.isfile(output_file):
             os.remove(output_file)
-        file_content = x2text.process(
-            input_file, output_file, mode=LLMWhispererSupportedModes.OCR.value
-        )
+        file_content = x2text.process(input_file, output_file)
         file_size = os.path.getsize(output_file)
         self.assertGreater(file_size, 0)