From c46fdc075e5e752b659599b6d92b5e0367265c1b Mon Sep 17 00:00:00 2001
From: Chandrasekharan M <chandrasekharan@zipstack.com>
Date: Sat, 24 Feb 2024 02:40:43 +0530
Subject: [PATCH 1/3] Updated index_file() to use x2text adapter; added util
 for getting the file MIME type; minor fix on exceptions.py

---
 src/unstract/sdk/__init__.py         |   2 +-
 src/unstract/sdk/exceptions.py       |   3 -
 src/unstract/sdk/index.py            | 101 ++++-----------------------
 src/unstract/sdk/tool/validator.py   |  23 ++----
 src/unstract/sdk/utils/tool_utils.py |  20 ++++++
 5 files changed, 40 insertions(+), 109 deletions(-)

diff --git a/src/unstract/sdk/__init__.py b/src/unstract/sdk/__init__.py
index 56617d94..68e47dd0 100644
--- a/src/unstract/sdk/__init__.py
+++ b/src/unstract/sdk/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.10.1"
+__version__ = "0.11.0"
 
 
 def get_sdk_version():
diff --git a/src/unstract/sdk/exceptions.py b/src/unstract/sdk/exceptions.py
index 59a402a0..a116ab2f 100644
--- a/src/unstract/sdk/exceptions.py
+++ b/src/unstract/sdk/exceptions.py
@@ -11,6 +11,3 @@ def __init__(
     @property
     def user_message(self) -> Optional[str]:
         return self._user_message
-
-    def __str__(self) -> str:
-        return f"{self.message}"
diff --git a/src/unstract/sdk/index.py b/src/unstract/sdk/index.py
index 5e008705..d5a34860 100644
--- a/src/unstract/sdk/index.py
+++ b/src/unstract/sdk/index.py
@@ -1,12 +1,9 @@
-import os
-import shutil
-import zipfile
 from typing import Optional
 
-import filetype
 from llama_index import Document, StorageContext, VectorStoreIndex
 from llama_index.node_parser import SimpleNodeParser
 from llama_index.vector_stores import VectorStoreQuery, VectorStoreQueryResult
+from unstract.adapters.x2text.x2text_adapter import X2TextAdapter
 
 from unstract.sdk.constants import LogLevel, ToolEnv
 from unstract.sdk.embedding import ToolEmbedding
@@ -15,12 +12,7 @@
 from unstract.sdk.utils import ToolUtils
 from unstract.sdk.utils.service_context import ServiceContext
 from unstract.sdk.vector_db import ToolVectorDB
-
-allowed_pdf_to_text_converters = [
-    "default",
-    "unstract_llm_whisperer",
-    "unstract_camelot",
-]
+from unstract.sdk.x2txt import X2Text
 
 
 class ToolIndex:
@@ -106,93 +98,30 @@ def index_file(
         tool_id: str,
         embedding_type: str,
         vector_db: str,
+        x2text_adapter: str,
         file_path: str,
         chunk_size: int,
         chunk_overlap: int,
         reindex: bool = False,
-        converter: str = "default",
         file_hash: Optional[str] = None,
     ):
-        if converter not in allowed_pdf_to_text_converters:
-            self.tool.stream_log(
-                "pdf-to-text-converters must be one of "
-                f"{allowed_pdf_to_text_converters}",
-                level=LogLevel.ERROR,
-            )
-            raise SdkException(
-                "pdf-to-text-converters must be one of "
-                f"{allowed_pdf_to_text_converters}"
-            )
-
-        input_file_type = None
-        input_file_type_mime = None
-
         # Make file content hash if not available
         if not file_hash:
             file_hash = ToolUtils.get_hash_from_file(file_path=file_path)
-        with open(file_path, mode="rb") as input_file_obj:
-            sample_contents = input_file_obj.read(100)
-            input_file_type = filetype.guess(sample_contents)
-
-        if input_file_type is None:
-            input_file_type_mime = "text/plain"
-        else:
-            input_file_type_mime = input_file_type.MIME
-
-        self.tool.stream_log(f"Input file type: {input_file_type_mime}")
 
+        self.tool.stream_log("Extracting text from input file")
         full_text = []
-
-        if input_file_type_mime == "text/plain":
-            with open(file_path) as input_file_obj:
-                full_text.append(
-                    {
-                        "section": "full",
-                        "text_contents": self._cleanup_text(
-                            input_file_obj.read()
-                        ),
-                    }
-                )
-
-        elif input_file_type_mime == "application/pdf":
-            raise SdkException(
-                "Indexing of PDF files is not supported currently"
-            )
-            # TODO: Make use of adapters to convert X2Text
-            # self.tool.stream_log(f"PDF to text converter: {converter}")
-            # if converter == "unstract_llm_whisperer" or converter == "default":  # noqa
-            #     full_text.append(
-            #         {
-            #             "section": "full",
-            #             "text_contents": self._cleanup_text(
-            #                 x2txt.generate_whisper(
-            #                     input_file=file_path,
-            #                     mode="text",
-            #                     dump_text=True,
-            #                 )
-            #             ),
-            #         }
-            #     )
-            # else:
-            #     # TODO : Support for Camelot
-            #     x2txt = X2Text(tool=self.tool)
-
-        elif input_file_type_mime == "application/zip":
-            self.tool.stream_log("Zip file extraction required")
-            with zipfile.ZipFile(file_path, "r") as zip_ref:
-                file_name_from_path = os.path.basename(file_path)
-                temp_directory = f"/tmp/unstract_zip/{file_name_from_path}"
-                # If temp_directory exists, delete it and create it again
-                if os.path.exists(temp_directory):
-                    shutil.rmtree(temp_directory)
-                os.makedirs(temp_directory)
-                zip_ref.extractall(temp_directory)
-        else:
-            self.tool.stream_log(
-                f"Unsupported file type: {input_file_type_mime}",
-                level=LogLevel.ERROR,
-            )
-            raise SdkException(f"Unsupported file type: {input_file_type_mime}")
+        x2text = X2Text(tool=self.tool)
+        x2text_adapter: X2TextAdapter = x2text.get_x2text(
+            adapter_instance_id=x2text_adapter
+        )
+        extracted_text = x2text_adapter.process(input_file_path=file_path)
+        full_text.append(
+            {
+                "section": "full",
+                "text_contents": self._cleanup_text(extracted_text),
+            }
+        )
 
         doc_id = ToolIndex.generate_file_id(
             tool_id=tool_id,
diff --git a/src/unstract/sdk/tool/validator.py b/src/unstract/sdk/tool/validator.py
index f045301a..d7971fb7 100644
--- a/src/unstract/sdk/tool/validator.py
+++ b/src/unstract/sdk/tool/validator.py
@@ -3,11 +3,12 @@
 from pathlib import Path
 from typing import Any
 
-import magic
 from jsonschema import Draft202012Validator, ValidationError, validators
+
 from unstract.sdk.constants import MetadataKey, PropKey
 from unstract.sdk.tool.base import BaseTool
 from unstract.sdk.tool.mime_types import EXT_MIME_MAP
+from unstract.sdk.utils import ToolUtils
 
 
 def extend_with_default(validator_class: Any) -> Any:
@@ -211,26 +212,10 @@ def _validate_file_type(self, input_file: Path) -> None:
                 )
             allowed_mimes.append(EXT_MIME_MAP[ext])
 
-        input_file_mime = self._get_file_mime(input_file=input_file)
+        input_file_mime = ToolUtils.get_file_mime_type(input_file=input_file)
+        self.tool.stream_log(f"Input file MIME: {input_file_mime}")
         if input_file_mime not in allowed_mimes:
             self.tool.stream_error_and_exit(
                 f"File type of {input_file_mime} is not supported by"
                 " the tool, check its PROPERTIES for a list of supported types"
             )
-
-    def _get_file_mime(self, input_file: Path) -> str:
-        """Gets the file MIME type for an input file. Uses libmagic to perform
-        the same.
-
-        Args:
-            input_file (Path): Path object of the input file
-
-        Returns:
-            str: MIME type of the file
-        """
-        input_file_mime = ""
-        with open(input_file, mode="rb") as input_file_obj:
-            sample_contents = input_file_obj.read(100)
-            input_file_mime = magic.from_buffer(sample_contents, mime=True)
-        self.tool.stream_log(f"Input file MIME: {input_file_mime}")
-        return input_file_mime
diff --git a/src/unstract/sdk/utils/tool_utils.py b/src/unstract/sdk/utils/tool_utils.py
index a8bdadd0..6dc277f8 100644
--- a/src/unstract/sdk/utils/tool_utils.py
+++ b/src/unstract/sdk/utils/tool_utils.py
@@ -1,7 +1,10 @@
 import json
 from hashlib import md5, sha256
+from pathlib import Path
 from typing import Any
 
+import magic
+
 from unstract.sdk.constants import FileReaderSettings
 
 
@@ -75,3 +78,20 @@ def json_to_str(json_to_dump: dict[str, Any]) -> str:
         """
         compact_json = json.dumps(json_to_dump, separators=(",", ":"))
         return compact_json
+
+    @staticmethod
+    def get_file_mime_type(input_file: Path) -> str:
+        """Gets the MIME type of a file by running libmagic on a sample of
+        its first 100 bytes.
+
+        Args:
+            input_file (Path): Path object of the input file
+
+        Returns:
+            str: MIME type reported by libmagic for the sample
+        """
+        input_file_mime = ""
+        with open(input_file, mode="rb") as input_file_obj:
+            sample_contents = input_file_obj.read(100)
+            input_file_mime = magic.from_buffer(sample_contents, mime=True)
+        return input_file_mime

From 01d612d8d8425b7353db5d749b66fac1d624c82f Mon Sep 17 00:00:00 2001
From: Chandrasekharan M <chandrasekharan@zipstack.com>
Date: Sat, 24 Feb 2024 02:48:12 +0530
Subject: [PATCH 2/3] Lock file update

---
 pdm.lock | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pdm.lock b/pdm.lock
index f82b49c3..acb65633 100644
--- a/pdm.lock
+++ b/pdm.lock
@@ -5,7 +5,7 @@
 groups = ["default", "docs", "lint", "test"]
 strategy = ["cross_platform", "inherit_metadata"]
 lock_version = "4.4.1"
-content_hash = "sha256:a8125f72370ebdf9f5dc8b24aa1dbe72dad9dc31f4346be953476ebc9de23df6"
+content_hash = "sha256:26b7a0ea88fc5cac1d0b896c20041b3d0fc9c257f1c12a0e318fc2f0a494b5ec"
 
 [[package]]
 name = "aiohttp"
@@ -3761,7 +3761,7 @@ files = [
 
 [[package]]
 name = "unstract-adapters"
-version = "0.2.0"
+version = "0.2.1"
 requires_python = "<3.12,>=3.9"
 summary = "Unstract Adapters"
 groups = ["default"]
@@ -3789,8 +3789,8 @@ dependencies = [
     "weaviate-client==3.25.3",
 ]
 files = [
-    {file = "unstract_adapters-0.2.0-py3-none-any.whl", hash = "sha256:7a794ff1410de655f9ae42e473ef4eba48a2536458bda54e738d61a1ef8a0f7c"},
-    {file = "unstract_adapters-0.2.0.tar.gz", hash = "sha256:6617878780b2bbd3036b315d39d6f3e72b455728ce8a6b1cced666d4d34859b3"},
+    {file = "unstract_adapters-0.2.1-py3-none-any.whl", hash = "sha256:7b48707bc5c634f07d4ea2926f1b2925bcd5c6869e057839ba69c2d62d19941b"},
+    {file = "unstract_adapters-0.2.1.tar.gz", hash = "sha256:08646a232185185390a193ad12b16715d1fccc69195d2d28e4f291b5c55f8117"},
 ]
 
 [[package]]

From 4a35b93dd35fb3d9aea15b67ff336705a5fd4ac3 Mon Sep 17 00:00:00 2001
From: Chandrasekharan M <chandrasekharan@zipstack.com>
Date: Sat, 24 Feb 2024 10:52:52 +0530
Subject: [PATCH 3/3] get_file_mime_type() fix for seeking to 0

---
 src/unstract/sdk/utils/tool_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/unstract/sdk/utils/tool_utils.py b/src/unstract/sdk/utils/tool_utils.py
index 6dc277f8..45641da6 100644
--- a/src/unstract/sdk/utils/tool_utils.py
+++ b/src/unstract/sdk/utils/tool_utils.py
@@ -94,4 +94,5 @@ def get_file_mime_type(self, input_file: Path) -> str:
         with open(input_file, mode="rb") as input_file_obj:
             sample_contents = input_file_obj.read(100)
             input_file_mime = magic.from_buffer(sample_contents, mime=True)
+            input_file_obj.seek(0)
         return input_file_mime