diff --git a/aixplain/modules/model/index_model.py b/aixplain/modules/model/index_model.py index b9420f52..c4343a4c 100644 --- a/aixplain/modules/model/index_model.py +++ b/aixplain/modules/model/index_model.py @@ -1,3 +1,6 @@ +import os +import warnings +from uuid import uuid4 from aixplain.enums import EmbeddingModel, Function, Supplier, ResponseStatus, StorageType, FunctionType from aixplain.modules.model import Model from aixplain.utils import config @@ -9,9 +12,7 @@ from aixplain.enums.splitting_options import SplittingOptions import os -from urllib.parse import urljoin -from aixplain.utils.file_utils import _request_with_retry - +DOCLING_MODEL_ID = "677bee6c6eb56331f9192a91" class IndexFilterOperator(Enum): """Enumeration of operators available for filtering index records. @@ -177,8 +178,6 @@ def __init__( model = ModelFactory.get(embedding_model) self.embedding_size = model.additional_info["embedding_size"] except Exception as e: - import warnings - warnings.warn(f"Failed to get embedding size for embedding model {embedding_model}: {e}") self.embedding_size = None @@ -231,11 +230,11 @@ def search(self, query: str, top_k: int = 10, filters: List[IndexFilter] = []) - } return self.run(data=data) - def upsert(self, documents: List[Record], splitter: Optional[Splitter] = None) -> ModelResponse: + def upsert(self, documents: List[Record] | str, splitter: Optional[Splitter] = None) -> ModelResponse: """Upsert documents into the index Args: - documents (List[Record]): List of documents to be upserted + documents (List[Record] | str): List of documents to be upserted or a file path splitter (Splitter, optional): Splitter to be applied. Defaults to None. 
def prepare_record_from_file(self, file_path: str, file_id: Optional[str] = None) -> Record:
    """Build a text ``Record`` from the content of a local file.

    The file is parsed with ``parse_file``; the parsed text becomes the record
    value and the file's base name is stored under the ``file_name`` attribute.

    Args:
        file_path (str): The path to the file to be processed.
        file_id (str, optional): The ID to assign to the record. When omitted
            (or empty), ``"<file name>_<uuid4>"`` is generated.

    Returns:
        Record: A record with ``value_type="text"`` holding the parsed content.

    Raises:
        Exception: If the file does not exist or cannot be parsed.

    Example:
        >>> record = index_model.prepare_record_from_file("/path/to/file.txt")
    """
    response = self.parse_file(file_path)
    # os.path.basename honours the platform's path separators; splitting on "/"
    # left the whole path in place of the file name for Windows-style paths.
    file_name = os.path.basename(file_path)
    if not file_id:
        file_id = f"{file_name}_{uuid4()}"
    return Record(value=response.data, value_type="text", id=file_id, attributes={"file_name": file_name})


@staticmethod
def parse_file(file_path: str) -> ModelResponse:
    """Extract the text content of a local file.

    ``.txt`` files (extension matched case-insensitively) are read locally as
    UTF-8. Any other file is passed to the model identified by
    ``DOCLING_MODEL_ID``. A warning is emitted when the extracted content is
    empty.

    Args:
        file_path (str): The path to the file to be parsed.

    Returns:
        ModelResponse: The response containing the parsed file content.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist. This is a subclass
            of ``Exception``, so existing ``except Exception`` handlers still
            catch it.
        Exception: If the parsing model cannot be loaded or fails to run. The
            original error is chained as ``__cause__``.

    Example:
        >>> response = IndexModel.parse_file("/path/to/file.pdf")
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File {file_path} does not exist")

    if file_path.lower().endswith(".txt"):
        # Explicit encoding: the locale default (e.g. cp1252 on Windows) fails on
        # or garbles non-ASCII text such as emoji.
        with open(file_path, "r", encoding="utf-8") as file:
            data = file.read()
        if not data:
            warnings.warn(f"File {file_path} is empty")
        return ModelResponse(status=ResponseStatus.SUCCESS, data=data, completed=True)

    try:
        # NOTE(review): imported lazily, presumably to avoid a circular import
        # at module load time - confirm before hoisting.
        from aixplain.factories import ModelFactory

        model = ModelFactory.get(DOCLING_MODEL_ID)
        response = model.run(file_path)
        is_empty = not response.data
    except Exception as e:
        # Keep the message callers match on, but chain the cause for debugging.
        raise Exception(f"Failed to parse file: {e}") from e

    # Warn outside the try so a warning promoted to an error is not re-wrapped
    # as a parse failure.
    if is_empty:
        warnings.warn(f"File {file_path} is empty")
    return response
def test_index_model_with_txt_file():
    """Upserting a local .txt file path indexes its text and makes it searchable."""
    from pathlib import Path
    from uuid import uuid4

    from aixplain.factories import IndexFactory
    from aixplain.factories.index_factory.utils import AirParams

    txt_path = Path(__file__).parent / "data" / "test_input.txt"

    # Fresh index backed by the OpenAI Ada 002 embedding model.
    index_model = IndexFactory.create(
        params=AirParams(
            name=f"File Index {uuid4()}",
            description="Index for file processing",
            embedding_model=EmbeddingModel.OPENAI_ADA002,
        )
    )

    try:
        upsert_response = index_model.upsert(str(txt_path))
        assert str(upsert_response.status) == "SUCCESS"

        # The indexed text must be retrievable through search.
        search_response = index_model.search("demo")
        assert str(search_response.status) == "SUCCESS"
        assert "🤖" in search_response.data, "Robot emoji should be present in the response"

        assert index_model.count() > 0
    finally:
        # Always remove the remote index, even when an assertion fails.
        index_model.delete()


def test_index_model_with_pdf_file():
    """Upserting a local PDF file path indexes its parsed content."""
    from pathlib import Path
    from uuid import uuid4

    from aixplain.factories import IndexFactory
    from aixplain.factories.index_factory.utils import AirParams

    pdf_path = Path(__file__).parent / "data" / "test_file_parser_input.pdf"

    # Fresh index backed by the OpenAI Ada 002 embedding model.
    index_model = IndexFactory.create(
        params=AirParams(
            name=f"PDF Index {uuid4()}",
            description="Index for PDF processing",
            embedding_model=EmbeddingModel.OPENAI_ADA002,
        )
    )

    try:
        upsert_response = index_model.upsert(str(pdf_path))
        assert str(upsert_response.status) == "SUCCESS"

        # The indexed content must be retrievable through search.
        search_response = index_model.search("document")
        assert str(search_response.status) == "SUCCESS"
        assert len(search_response.data) > 0

        assert index_model.count() > 0
    finally:
        # Always remove the remote index, even when an assertion fails.
        index_model.delete()


def test_index_model_with_invalid_file():
    """Upserting a path that does not exist raises a "does not exist" error."""
    from pathlib import Path
    from uuid import uuid4

    from aixplain.factories import IndexFactory
    from aixplain.factories.index_factory.utils import AirParams

    missing_path = Path(__file__).parent / "data" / "nonexistent.pdf"

    # Fresh index backed by the OpenAI Ada 002 embedding model.
    index_model = IndexFactory.create(
        params=AirParams(
            name=f"Invalid File Index {uuid4()}",
            description="Index for invalid file testing",
            embedding_model=EmbeddingModel.OPENAI_ADA002,
        )
    )

    try:
        with pytest.raises(Exception) as e:
            index_model.upsert(str(missing_path))
        # Checked after the block: code following the raising call inside
        # `pytest.raises` would never run.
        assert "does not exist" in str(e.value)
    finally:
        # Always remove the remote index, even when an assertion fails.
        index_model.delete()
def test_parse_file_success(mocker):
    """parse_file returns the parsing model's response for a non-.txt file."""
    mock_model = mocker.Mock()
    mock_model.run.return_value = ModelResponse(status=ResponseStatus.SUCCESS, data="parsed content")

    mocker.patch("aixplain.factories.ModelFactory.get", return_value=mock_model)
    # Pretend the file exists so no real file is needed on disk.
    mocker.patch("os.path.exists", return_value=True)

    response = IndexModel.parse_file("test.pdf")

    assert isinstance(response, ModelResponse)
    assert response.status == ResponseStatus.SUCCESS
    assert response.data == "parsed content"
    mock_model.run.assert_called_once_with("test.pdf")


def test_parse_file_not_found():
    """parse_file raises with a "does not exist" message for a missing path."""
    with pytest.raises(Exception) as e:
        IndexModel.parse_file("nonexistent.pdf")
    assert str(e.value) == "File nonexistent.pdf does not exist"


def test_parse_file_error(mocker):
    """A failure while loading the parsing model is reported as a parse failure."""
    mocker.patch("os.path.exists", return_value=True)
    mocker.patch("aixplain.factories.ModelFactory.get", side_effect=Exception("Model error"))

    with pytest.raises(Exception) as e:
        IndexModel.parse_file("test.pdf")
    assert str(e.value) == "Failed to parse file: Model error"


def test_upsert_with_file_path(mocker):
    """upsert accepts a file path, parses it and posts the resulting record."""
    mock_parse_response = ModelResponse(status=ResponseStatus.SUCCESS, data="parsed content")
    mock_upsert_response = {"status": "SUCCESS"}

    parse_file_mock = mocker.patch(
        "aixplain.modules.model.index_model.IndexModel.parse_file", return_value=mock_parse_response
    )
    mocker.patch("aixplain.factories.FileFactory.check_storage_type", return_value=StorageType.TEXT)

    with requests_mock.Mocker() as mock:
        mock.post(execute_url, json=mock_upsert_response, status_code=200)
        index_model = IndexModel(id=index_id, data=data, name="name", function=Function.SEARCH)
        response = index_model.upsert("test.pdf")

    assert isinstance(response, ModelResponse)
    assert response.status == ResponseStatus.SUCCESS
    # The path handed to upsert must be the one forwarded to the parser.
    parse_file_mock.assert_called_once_with("test.pdf")


def test_upsert_with_invalid_file_path(mocker):
    """An error raised while parsing the file propagates out of upsert unchanged."""
    mocker.patch("aixplain.modules.model.index_model.IndexModel.parse_file", side_effect=Exception("File not found"))

    index_model = IndexModel(id=index_id, data=data, name="name", function=Function.SEARCH)

    with pytest.raises(Exception) as e:
        index_model.upsert("nonexistent.pdf")
    assert str(e.value) == "File not found"