Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 26 additions & 26 deletions pdm.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ dependencies = [
"python-magic~=0.4.27",
"python-dotenv==1.0.0",
# LLM Triad
"unstract-adapters~=0.2.2",
"unstract-adapters~=0.3.0",
"llama-index==0.9.28",
"tiktoken~=0.4.0",
"transformers==4.37.0",
Expand Down
2 changes: 1 addition & 1 deletion src/unstract/sdk/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.12.1"
__version__ = "0.13.0"


def get_sdk_version():
Expand Down
29 changes: 26 additions & 3 deletions src/unstract/sdk/index.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from typing import Optional
import os

from llama_index import Document, StorageContext, VectorStoreIndex
from llama_index.node_parser import SimpleNodeParser
Expand Down Expand Up @@ -106,7 +105,29 @@ def index_file(
reindex: bool = False,
file_hash: Optional[str] = None,
output_file_path: Optional[str] = None,
):
) -> str:
"""Indexes an individual file using the passed arguments.

Args:
tool_id (str): UUID of the tool (workflow_id in case it's called
from workflow)
embedding_type (str): UUID of the embedding service configured
vector_db (str): UUID of the vector DB configured
x2text_adapter (str): UUID of the x2text adapter configured.
This is to extract text from documents.
file_path (str): Path to the file that needs to be indexed.
chunk_size (int): Chunk size to be used for indexing
chunk_overlap (int): Overlap in chunks to be used for indexing
reindex (bool, optional): Flag to denote if document should be
re-indexed if it's already indexed. Defaults to False.
file_hash (Optional[str], optional): SHA256 hash of the file.
Defaults to None. If None, the hash is generated.
output_file_path (Optional[str], optional): File path to write
the extracted contents into. Defaults to None.

Returns:
str: A unique ID for the file and indexing arguments combination
"""
# Make file content hash if not available
if not file_hash:
file_hash = ToolUtils.get_hash_from_file(file_path=file_path)
Expand All @@ -117,7 +138,9 @@ def index_file(
x2text_adapter_inst: X2TextAdapter = x2text.get_x2text(
adapter_instance_id=x2text_adapter
)
extracted_text = x2text_adapter_inst.process(input_file_path=file_path, output_file_path=output_file_path)
extracted_text = x2text_adapter_inst.process(
input_file_path=file_path, output_file_path=output_file_path
)
full_text.append(
{
"section": "full",
Expand Down
5 changes: 1 addition & 4 deletions tests/test_x2text.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

from dotenv import load_dotenv
from parameterized import parameterized
from unstract.adapters.x2text.constants import LLMWhispererSupportedModes

from unstract.sdk.tool.base import BaseTool
from unstract.sdk.x2txt import X2Text
Expand Down Expand Up @@ -53,9 +52,7 @@ def test_get_x2text(self, adapter_instance_id):

if os.path.isfile(output_file):
os.remove(output_file)
file_content = x2text.process(
input_file, output_file, mode=LLMWhispererSupportedModes.OCR.value
)
file_content = x2text.process(input_file, output_file)
file_size = os.path.getsize(output_file)
self.assertGreater(file_size, 0)

Expand Down