Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions pdm.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion src/unstract/sdk/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.10.1"
__version__ = "0.11.0"


def get_sdk_version():
Expand Down
3 changes: 0 additions & 3 deletions src/unstract/sdk/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,3 @@ def __init__(
@property
def user_message(self) -> Optional[str]:
    """Expose the value held in ``self._user_message``.

    Returns:
        Optional[str]: Whatever is currently stored in ``_user_message``.
    """
    stored_message = self._user_message
    return stored_message

def __str__(self) -> str:
    """Render the object as the formatted text of its ``message`` attribute.

    Returns:
        str: ``self.message`` formatted with an empty format spec.
    """
    return "{}".format(self.message)
101 changes: 15 additions & 86 deletions src/unstract/sdk/index.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
import os
import shutil
import zipfile
from typing import Optional

import filetype
from llama_index import Document, StorageContext, VectorStoreIndex
from llama_index.node_parser import SimpleNodeParser
from llama_index.vector_stores import VectorStoreQuery, VectorStoreQueryResult
from unstract.adapters.x2text.x2text_adapter import X2TextAdapter

from unstract.sdk.constants import LogLevel, ToolEnv
from unstract.sdk.embedding import ToolEmbedding
Expand All @@ -15,12 +12,7 @@
from unstract.sdk.utils import ToolUtils
from unstract.sdk.utils.service_context import ServiceContext
from unstract.sdk.vector_db import ToolVectorDB

allowed_pdf_to_text_converters = [
"default",
"unstract_llm_whisperer",
"unstract_camelot",
]
from unstract.sdk.x2txt import X2Text


class ToolIndex:
Expand Down Expand Up @@ -106,93 +98,30 @@ def index_file(
tool_id: str,
embedding_type: str,
vector_db: str,
x2text_adapter: str,
file_path: str,
chunk_size: int,
chunk_overlap: int,
reindex: bool = False,
converter: str = "default",
file_hash: Optional[str] = None,
):
if converter not in allowed_pdf_to_text_converters:
self.tool.stream_log(
"pdf-to-text-converters must be one of "
f"{allowed_pdf_to_text_converters}",
level=LogLevel.ERROR,
)
raise SdkException(
"pdf-to-text-converters must be one of "
f"{allowed_pdf_to_text_converters}"
)

input_file_type = None
input_file_type_mime = None

# Make file content hash if not available
if not file_hash:
file_hash = ToolUtils.get_hash_from_file(file_path=file_path)
with open(file_path, mode="rb") as input_file_obj:
sample_contents = input_file_obj.read(100)
input_file_type = filetype.guess(sample_contents)

if input_file_type is None:
input_file_type_mime = "text/plain"
else:
input_file_type_mime = input_file_type.MIME

self.tool.stream_log(f"Input file type: {input_file_type_mime}")

self.tool.stream_log("Extracting text from input file")
full_text = []

if input_file_type_mime == "text/plain":
with open(file_path) as input_file_obj:
full_text.append(
{
"section": "full",
"text_contents": self._cleanup_text(
input_file_obj.read()
),
}
)

elif input_file_type_mime == "application/pdf":
raise SdkException(
"Indexing of PDF files is not supported currently"
)
# TODO: Make use of adapters to convert X2Text
# self.tool.stream_log(f"PDF to text converter: {converter}")
# if converter == "unstract_llm_whisperer" or converter == "default": # noqa
# full_text.append(
# {
# "section": "full",
# "text_contents": self._cleanup_text(
# x2txt.generate_whisper(
# input_file=file_path,
# mode="text",
# dump_text=True,
# )
# ),
# }
# )
# else:
# # TODO : Support for Camelot
# x2txt = X2Text(tool=self.tool)

elif input_file_type_mime == "application/zip":
self.tool.stream_log("Zip file extraction required")
with zipfile.ZipFile(file_path, "r") as zip_ref:
file_name_from_path = os.path.basename(file_path)
temp_directory = f"/tmp/unstract_zip/{file_name_from_path}"
# If temp_directory exists, delete it and create it again
if os.path.exists(temp_directory):
shutil.rmtree(temp_directory)
os.makedirs(temp_directory)
zip_ref.extractall(temp_directory)
else:
self.tool.stream_log(
f"Unsupported file type: {input_file_type_mime}",
level=LogLevel.ERROR,
)
raise SdkException(f"Unsupported file type: {input_file_type_mime}")
x2text = X2Text(tool=self.tool)
x2text_adapter: X2TextAdapter = x2text.get_x2text(
adapter_instance_id=x2text_adapter
)
extracted_text = x2text_adapter.process(input_file_path=file_path)
full_text.append(
{
"section": "full",
"text_contents": self._cleanup_text(extracted_text),
}
)

doc_id = ToolIndex.generate_file_id(
tool_id=tool_id,
Expand Down
23 changes: 4 additions & 19 deletions src/unstract/sdk/tool/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
from pathlib import Path
from typing import Any

import magic
from jsonschema import Draft202012Validator, ValidationError, validators

from unstract.sdk.constants import MetadataKey, PropKey
from unstract.sdk.tool.base import BaseTool
from unstract.sdk.tool.mime_types import EXT_MIME_MAP
from unstract.sdk.utils import ToolUtils


def extend_with_default(validator_class: Any) -> Any:
Expand Down Expand Up @@ -211,26 +212,10 @@ def _validate_file_type(self, input_file: Path) -> None:
)
allowed_mimes.append(EXT_MIME_MAP[ext])

input_file_mime = self._get_file_mime(input_file=input_file)
input_file_mime = ToolUtils.get_file_mime_type(input_file=input_file)
self.tool.stream_log(f"Input file MIME: {input_file_mime}")
if input_file_mime not in allowed_mimes:
self.tool.stream_error_and_exit(
f"File type of {input_file_mime} is not supported by"
" the tool, check its PROPERTIES for a list of supported types"
)

def _get_file_mime(self, input_file: Path) -> str:
    """Detect the MIME type of an input file with libmagic and log it.

    Only the first 100 bytes of the file are read and handed to
    ``magic.from_buffer``.

    Args:
        input_file (Path): Path object of the input file

    Returns:
        str: MIME type of the file
    """
    # Read just the leading bytes; the handle is closed before detection.
    with open(input_file, mode="rb") as file_handle:
        header_bytes = file_handle.read(100)
    detected_mime = magic.from_buffer(header_bytes, mime=True)
    self.tool.stream_log(f"Input file MIME: {detected_mime}")
    return detected_mime
21 changes: 21 additions & 0 deletions src/unstract/sdk/utils/tool_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import json
from hashlib import md5, sha256
from pathlib import Path
from typing import Any

import magic

from unstract.sdk.constants import FileReaderSettings


Expand Down Expand Up @@ -75,3 +78,21 @@ def json_to_str(json_to_dump: dict[str, Any]) -> str:
"""
compact_json = json.dumps(json_to_dump, separators=(",", ":"))
return compact_json

@staticmethod
def get_file_mime_type(input_file: Path) -> str:
    """Get the MIME type of an input file using libmagic.

    This is a static method, so it takes no ``self``/``cls`` argument and
    can be called as ``ToolUtils.get_file_mime_type(input_file=...)``.

    Args:
        input_file (Path): Path object of the input file

    Returns:
        str: MIME type of the file

    Raises:
        OSError: If the file cannot be opened for reading.
    """
    # Only the leading bytes are sampled for detection.
    # NOTE(review): python-magic's README suggests sampling at least 2048
    # bytes for reliable identification; 100 is kept here to preserve the
    # existing behaviour - confirm whether it should be raised.
    with open(input_file, mode="rb") as input_file_obj:
        sample_contents = input_file_obj.read(100)
    return magic.from_buffer(sample_contents, mime=True)