Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 26 additions & 26 deletions pdm.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ dependencies = [
"python-magic~=0.4.27",
"python-dotenv==1.0.0",
# LLM Triad
"unstract-adapters~=0.2.2",
"unstract-adapters~=0.3.0",
"llama-index==0.9.28",
"tiktoken~=0.4.0",
"transformers==4.37.0",
Expand Down
2 changes: 1 addition & 1 deletion src/unstract/sdk/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.12.1"
__version__ = "0.13.0"


def get_sdk_version():
Expand Down
29 changes: 26 additions & 3 deletions src/unstract/sdk/index.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from typing import Optional
import os

from llama_index import Document, StorageContext, VectorStoreIndex
from llama_index.node_parser import SimpleNodeParser
Expand Down Expand Up @@ -106,7 +105,29 @@ def index_file(
reindex: bool = False,
file_hash: Optional[str] = None,
output_file_path: Optional[str] = None,
):
) -> str:
"""Indexes an individual file using the passed arguments.

Args:
tool_id (str): UUID of the tool (workflow_id in case it's called
from workflow)
embedding_type (str): UUID of the embedding service configured
vector_db (str): UUID of the vector DB configured
x2text_adapter (str): UUID of the x2text adapter configured.
This is to extract text from documents.
file_path (str): Path to the file that needs to be indexed.
chunk_size (int): Chunk size to be used for indexing
chunk_overlap (int): Overlap in chunks to be used for indexing
reindex (bool, optional): Flag to denote if document should be
re-indexed if it's already indexed. Defaults to False.
file_hash (Optional[str], optional): SHA256 hash of the file.
Defaults to None. If None, the hash is generated.
output_file_path (Optional[str], optional): File path to write
the extracted contents into. Defaults to None.

Returns:
str: A unique ID for the file and indexing arguments combination
"""
# Make file content hash if not available
if not file_hash:
file_hash = ToolUtils.get_hash_from_file(file_path=file_path)
Expand All @@ -117,7 +138,9 @@ def index_file(
x2text_adapter_inst: X2TextAdapter = x2text.get_x2text(
adapter_instance_id=x2text_adapter
)
extracted_text = x2text_adapter_inst.process(input_file_path=file_path, output_file_path=output_file_path)
extracted_text = x2text_adapter_inst.process(
input_file_path=file_path, output_file_path=output_file_path
)
full_text.append(
{
"section": "full",
Expand Down
5 changes: 1 addition & 4 deletions tests/test_x2text.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

from dotenv import load_dotenv
from parameterized import parameterized
from unstract.adapters.x2text.constants import LLMWhispererSupportedModes

from unstract.sdk.tool.base import BaseTool
from unstract.sdk.x2txt import X2Text
Expand Down Expand Up @@ -53,9 +52,7 @@ def test_get_x2text(self, adapter_instance_id):

if os.path.isfile(output_file):
os.remove(output_file)
file_content = x2text.process(
input_file, output_file, mode=LLMWhispererSupportedModes.OCR.value
)
file_content = x2text.process(input_file, output_file)
file_size = os.path.getsize(output_file)
self.assertGreater(file_size, 0)

Expand Down