diff --git a/src/unstract/sdk/__init__.py b/src/unstract/sdk/__init__.py index 68e47dd0..9403b103 100644 --- a/src/unstract/sdk/__init__.py +++ b/src/unstract/sdk/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.11.0" +__version__ = "0.11.1" def get_sdk_version(): diff --git a/src/unstract/sdk/constants.py b/src/unstract/sdk/constants.py index 4c06cc28..9a1ee282 100644 --- a/src/unstract/sdk/constants.py +++ b/src/unstract/sdk/constants.py @@ -139,11 +139,13 @@ class ToolSettingsKey: LLM_ADAPTER_ID (str): The key for the LLM adapter ID. EMBEDDING_ADAPTER_ID (str): The key for the embedding adapter ID. VECTOR_DB_ADAPTER_ID (str): The key for the vector DB adapter ID. + X2TEXT_ADAPTER_ID (str): The key for the X2Text adapter ID. """ LLM_ADAPTER_ID = "llmAdapterId" EMBEDDING_ADAPTER_ID = "embeddingAdapterId" VECTOR_DB_ADAPTER_ID = "vectorDbAdapterId" + X2TEXT_ADAPTER_ID = "x2TextAdapterId" class FileReaderSettings: diff --git a/src/unstract/sdk/index.py b/src/unstract/sdk/index.py index d5a34860..b8d9cce6 100644 --- a/src/unstract/sdk/index.py +++ b/src/unstract/sdk/index.py @@ -112,10 +112,10 @@ def index_file( self.tool.stream_log("Extracting text from input file") full_text = [] x2text = X2Text(tool=self.tool) - x2text_adapter: X2TextAdapter = x2text.get_x2text( + x2text_adapter_inst: X2TextAdapter = x2text.get_x2text( adapter_instance_id=x2text_adapter ) - extracted_text = x2text_adapter.process(input_file_path=file_path) + extracted_text = x2text_adapter_inst.process(input_file_path=file_path) full_text.append( { "section": "full", @@ -128,6 +128,7 @@ def index_file( file_hash=file_hash, vector_db=vector_db, embedding=embedding_type, + x2text=x2text_adapter, chunk_size=chunk_size, chunk_overlap=chunk_overlap, ) @@ -257,6 +258,7 @@ def generate_file_id( file_hash: str, vector_db: str, embedding: str, + x2text: str, chunk_size: str, chunk_overlap: str, ) -> str: @@ -267,6 +269,7 @@ def generate_file_id( file_hash (str): Hash of the file contents vector_db (str): UUID of the vector DB adapter embedding (str): UUID of the embedding adapter + x2text (str): UUID of the X2Text adapter chunk_size (str): Chunk size for indexing chunk_overlap (str): Chunk overlap for indexing @@ -274,6 +277,6 @@ def generate_file_id( str: Key representing unique ID for a file """ return ( - f"{tool_id}|{vector_db}|{embedding}|" + f"{tool_id}|{vector_db}|{embedding}|{x2text}|" f"{chunk_size}|{chunk_overlap}|{file_hash}" ) diff --git a/src/unstract/sdk/utils/tool_utils.py b/src/unstract/sdk/utils/tool_utils.py index 45641da6..a66c8b33 100644 --- a/src/unstract/sdk/utils/tool_utils.py +++ b/src/unstract/sdk/utils/tool_utils.py @@ -80,7 +80,7 @@ def json_to_str(json_to_dump: dict[str, Any]) -> str: return compact_json @staticmethod - def get_file_mime_type(self, input_file: Path) -> str: + def get_file_mime_type(input_file: Path) -> str: """Gets the file MIME type for an input file. Uses libmagic to perform the same.