From c46fdc075e5e752b659599b6d92b5e0367265c1b Mon Sep 17 00:00:00 2001 From: Chandrasekharan M Date: Sat, 24 Feb 2024 02:40:43 +0530 Subject: [PATCH 1/3] Updated index_file() to use x2text adapter Added util for getting the file MIME type Minor fix on exceptions.py --- src/unstract/sdk/__init__.py | 2 +- src/unstract/sdk/exceptions.py | 3 - src/unstract/sdk/index.py | 101 ++++----------------------- src/unstract/sdk/tool/validator.py | 23 ++---- src/unstract/sdk/utils/tool_utils.py | 20 ++++++ 5 files changed, 40 insertions(+), 109 deletions(-) diff --git a/src/unstract/sdk/__init__.py b/src/unstract/sdk/__init__.py index 56617d94..68e47dd0 100644 --- a/src/unstract/sdk/__init__.py +++ b/src/unstract/sdk/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.10.1" +__version__ = "0.11.0" def get_sdk_version(): diff --git a/src/unstract/sdk/exceptions.py b/src/unstract/sdk/exceptions.py index 59a402a0..a116ab2f 100644 --- a/src/unstract/sdk/exceptions.py +++ b/src/unstract/sdk/exceptions.py @@ -11,6 +11,3 @@ def __init__( @property def user_message(self) -> Optional[str]: return self._user_message - - def __str__(self) -> str: - return f"{self.message}" diff --git a/src/unstract/sdk/index.py b/src/unstract/sdk/index.py index 5e008705..d5a34860 100644 --- a/src/unstract/sdk/index.py +++ b/src/unstract/sdk/index.py @@ -1,12 +1,9 @@ -import os -import shutil -import zipfile from typing import Optional -import filetype from llama_index import Document, StorageContext, VectorStoreIndex from llama_index.node_parser import SimpleNodeParser from llama_index.vector_stores import VectorStoreQuery, VectorStoreQueryResult +from unstract.adapters.x2text.x2text_adapter import X2TextAdapter from unstract.sdk.constants import LogLevel, ToolEnv from unstract.sdk.embedding import ToolEmbedding @@ -15,12 +12,7 @@ from unstract.sdk.utils import ToolUtils from unstract.sdk.utils.service_context import ServiceContext from unstract.sdk.vector_db import ToolVectorDB - -allowed_pdf_to_text_converters = [ - "default", - "unstract_llm_whisperer", - "unstract_camelot", -] +from unstract.sdk.x2txt import X2Text class ToolIndex: @@ -106,93 +98,30 @@ def index_file( tool_id: str, embedding_type: str, vector_db: str, + x2text_adapter: str, file_path: str, chunk_size: int, chunk_overlap: int, reindex: bool = False, - converter: str = "default", file_hash: Optional[str] = None, ): - if converter not in allowed_pdf_to_text_converters: - self.tool.stream_log( - "pdf-to-text-converters must be one of " - f"{allowed_pdf_to_text_converters}", - level=LogLevel.ERROR, - ) - raise SdkException( - "pdf-to-text-converters must be one of " - f"{allowed_pdf_to_text_converters}" - ) - - input_file_type = None - input_file_type_mime = None - # Make file content hash if not available if not file_hash: file_hash = ToolUtils.get_hash_from_file(file_path=file_path) - with open(file_path, mode="rb") as input_file_obj: - sample_contents = input_file_obj.read(100) - input_file_type = filetype.guess(sample_contents) - - if input_file_type is None: - input_file_type_mime = "text/plain" - else: - input_file_type_mime = input_file_type.MIME - - self.tool.stream_log(f"Input file type: {input_file_type_mime}") + self.tool.stream_log("Extracting text from input file") full_text = [] - - if input_file_type_mime == "text/plain": - with open(file_path) as input_file_obj: - full_text.append( - { - "section": "full", - "text_contents": self._cleanup_text( - input_file_obj.read() - ), - } - ) - - elif input_file_type_mime == "application/pdf": - raise SdkException( - "Indexing of PDF files is not supported currently" - ) - # TODO: Make use of adapters to convert X2Text - # self.tool.stream_log(f"PDF to text converter: {converter}") - # if converter == "unstract_llm_whisperer" or converter == "default": # noqa - # full_text.append( - # { - # "section": "full", - # "text_contents": self._cleanup_text( - # x2txt.generate_whisper( - # input_file=file_path, - # mode="text", - # dump_text=True, - # ) - # ), - # } - # ) - # else: - # # TODO : Support for Camelot - # x2txt = X2Text(tool=self.tool) - - elif input_file_type_mime == "application/zip": - self.tool.stream_log("Zip file extraction required") - with zipfile.ZipFile(file_path, "r") as zip_ref: - file_name_from_path = os.path.basename(file_path) - temp_directory = f"/tmp/unstract_zip/{file_name_from_path}" - # If temp_directory exists, delete it and create it again - if os.path.exists(temp_directory): - shutil.rmtree(temp_directory) - os.makedirs(temp_directory) - zip_ref.extractall(temp_directory) - else: - self.tool.stream_log( - f"Unsupported file type: {input_file_type_mime}", - level=LogLevel.ERROR, - ) - raise SdkException(f"Unsupported file type: {input_file_type_mime}") + x2text = X2Text(tool=self.tool) + x2text_adapter: X2TextAdapter = x2text.get_x2text( + adapter_instance_id=x2text_adapter + ) + extracted_text = x2text_adapter.process(input_file_path=file_path) + full_text.append( + { + "section": "full", + "text_contents": self._cleanup_text(extracted_text), + } + ) doc_id = ToolIndex.generate_file_id( tool_id=tool_id, diff --git a/src/unstract/sdk/tool/validator.py b/src/unstract/sdk/tool/validator.py index f045301a..d7971fb7 100644 --- a/src/unstract/sdk/tool/validator.py +++ b/src/unstract/sdk/tool/validator.py @@ -3,11 +3,12 @@ from pathlib import Path from typing import Any -import magic from jsonschema import Draft202012Validator, ValidationError, validators + from unstract.sdk.constants import MetadataKey, PropKey from unstract.sdk.tool.base import BaseTool from unstract.sdk.tool.mime_types import EXT_MIME_MAP +from unstract.sdk.utils import ToolUtils def extend_with_default(validator_class: Any) -> Any: @@ -211,26 +212,10 @@ def _validate_file_type(self, input_file: Path) -> None: ) allowed_mimes.append(EXT_MIME_MAP[ext]) - input_file_mime = self._get_file_mime(input_file=input_file) + input_file_mime = ToolUtils.get_file_mime_type(input_file=input_file) + self.tool.stream_log(f"Input file MIME: {input_file_mime}") if input_file_mime not in allowed_mimes: self.tool.stream_error_and_exit( f"File type of {input_file_mime} is not supported by" " the tool, check its PROPERTIES for a list of supported types" ) - - def _get_file_mime(self, input_file: Path) -> str: - """Gets the file MIME type for an input file. Uses libmagic to perform - the same. - - Args: - input_file (Path): Path object of the input file - - Returns: - str: MIME type of the file - """ - input_file_mime = "" - with open(input_file, mode="rb") as input_file_obj: - sample_contents = input_file_obj.read(100) - input_file_mime = magic.from_buffer(sample_contents, mime=True) - self.tool.stream_log(f"Input file MIME: {input_file_mime}") - return input_file_mime diff --git a/src/unstract/sdk/utils/tool_utils.py b/src/unstract/sdk/utils/tool_utils.py index a8bdadd0..6dc277f8 100644 --- a/src/unstract/sdk/utils/tool_utils.py +++ b/src/unstract/sdk/utils/tool_utils.py @@ -1,7 +1,10 @@ import json from hashlib import md5, sha256 +from pathlib import Path from typing import Any +import magic + from unstract.sdk.constants import FileReaderSettings @@ -75,3 +78,20 @@ def json_to_str(json_to_dump: dict[str, Any]) -> str: """ compact_json = json.dumps(json_to_dump, separators=(",", ":")) return compact_json + + @staticmethod + def get_file_mime_type(self, input_file: Path) -> str: + """Gets the file MIME type for an input file. Uses libmagic to perform + the same. + + Args: + input_file (Path): Path object of the input file + + Returns: + str: MIME type of the file + """ + input_file_mime = "" + with open(input_file, mode="rb") as input_file_obj: + sample_contents = input_file_obj.read(100) + input_file_mime = magic.from_buffer(sample_contents, mime=True) + return input_file_mime From 01d612d8d8425b7353db5d749b66fac1d624c82f Mon Sep 17 00:00:00 2001 From: Chandrasekharan M Date: Sat, 24 Feb 2024 02:48:12 +0530 Subject: [PATCH 2/3] Lock file update --- pdm.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pdm.lock b/pdm.lock index f82b49c3..acb65633 100644 --- a/pdm.lock +++ b/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "docs", "lint", "test"] strategy = ["cross_platform", "inherit_metadata"] lock_version = "4.4.1" -content_hash = "sha256:a8125f72370ebdf9f5dc8b24aa1dbe72dad9dc31f4346be953476ebc9de23df6" +content_hash = "sha256:26b7a0ea88fc5cac1d0b896c20041b3d0fc9c257f1c12a0e318fc2f0a494b5ec" [[package]] name = "aiohttp" @@ -3761,7 +3761,7 @@ files = [ [[package]] name = "unstract-adapters" -version = "0.2.0" +version = "0.2.1" requires_python = "<3.12,>=3.9" summary = "Unstract Adapters" groups = ["default"] @@ -3789,8 +3789,8 @@ dependencies = [ "weaviate-client==3.25.3", ] files = [ - {file = "unstract_adapters-0.2.0-py3-none-any.whl", hash = "sha256:7a794ff1410de655f9ae42e473ef4eba48a2536458bda54e738d61a1ef8a0f7c"}, - {file = "unstract_adapters-0.2.0.tar.gz", hash = "sha256:6617878780b2bbd3036b315d39d6f3e72b455728ce8a6b1cced666d4d34859b3"}, + {file = "unstract_adapters-0.2.1-py3-none-any.whl", hash = "sha256:7b48707bc5c634f07d4ea2926f1b2925bcd5c6869e057839ba69c2d62d19941b"}, + {file = "unstract_adapters-0.2.1.tar.gz", hash = "sha256:08646a232185185390a193ad12b16715d1fccc69195d2d28e4f291b5c55f8117"}, ] [[package]] From 4a35b93dd35fb3d9aea15b67ff336705a5fd4ac3 Mon Sep 17 00:00:00 2001 From: Chandrasekharan M Date: Sat, 24 Feb 2024 10:52:52 +0530 Subject: [PATCH 3/3] get_file_mime_type() fix for seeking to 0 --- src/unstract/sdk/utils/tool_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/unstract/sdk/utils/tool_utils.py b/src/unstract/sdk/utils/tool_utils.py index 6dc277f8..45641da6 100644 --- a/src/unstract/sdk/utils/tool_utils.py +++ b/src/unstract/sdk/utils/tool_utils.py @@ -94,4 +94,5 @@ def get_file_mime_type(self, input_file: Path) -> str: with open(input_file, mode="rb") as input_file_obj: sample_contents = input_file_obj.read(100) input_file_mime = magic.from_buffer(sample_contents, mime=True) + input_file_obj.seek(0) return input_file_mime