Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 68 additions & 8 deletions aixplain/modules/model/index_model.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import os
import warnings
from uuid import uuid4
from aixplain.enums import EmbeddingModel, Function, Supplier, ResponseStatus, StorageType, FunctionType
from aixplain.modules.model import Model
from aixplain.utils import config
Expand All @@ -9,9 +12,7 @@
from aixplain.enums.splitting_options import SplittingOptions
import os

from urllib.parse import urljoin
from aixplain.utils.file_utils import _request_with_retry

# ID of the Docling model that `IndexModel.parse_file` fetches through
# `ModelFactory.get` to parse files that are not plain `.txt`.
DOCLING_MODEL_ID = "677bee6c6eb56331f9192a91"

class IndexFilterOperator(Enum):
"""Enumeration of operators available for filtering index records.
Expand Down Expand Up @@ -177,8 +178,6 @@ def __init__(
model = ModelFactory.get(embedding_model)
self.embedding_size = model.additional_info["embedding_size"]
except Exception as e:
import warnings

warnings.warn(f"Failed to get embedding size for embedding model {embedding_model}: {e}")
self.embedding_size = None

Expand Down Expand Up @@ -231,11 +230,11 @@ def search(self, query: str, top_k: int = 10, filters: List[IndexFilter] = []) -
}
return self.run(data=data)

def upsert(self, documents: List[Record], splitter: Optional[Splitter] = None) -> ModelResponse:
def upsert(self, documents: List[Record] | str, splitter: Optional[Splitter] = None) -> ModelResponse:
"""Upsert documents into the index

Args:
documents (List[Record]): List of documents to be upserted
documents (List[Record] | str): List of documents to be upserted or a file path
splitter (Splitter, optional): Splitter to be applied. Defaults to None.

Returns:
Expand All @@ -244,8 +243,12 @@ def upsert(self, documents: List[Record], splitter: Optional[Splitter] = None) -
Examples:
index_model.upsert([Record(value="Hello, world!", value_type="text", uri="", id="1", attributes={})])
index_model.upsert([Record(value="Hello, world!", value_type="text", uri="", id="1", attributes={})], splitter=Splitter(split=True, split_by=SplittingOptions.WORD, split_length=1, split_overlap=0))
index_model.upsert("my_file.pdf")
index_model.upsert("my_file.pdf", splitter=Splitter(split=True, split_by=SplittingOptions.WORD, split_length=400, split_overlap=50))
Splitter in the above example is optional and can be used to split the documents into smaller chunks.
"""
if isinstance(documents, str):
documents = [self.prepare_record_from_file(documents)]
# Validate documents
for doc in documents:
doc.validate()
Expand All @@ -272,7 +275,7 @@ def upsert(self, documents: List[Record], splitter: Optional[Splitter] = None) -
return response
raise Exception(f"Failed to upsert documents: {response.error_message}")

def count(self) -> float:
def count(self) -> int:
"""Get the total number of documents in the index.

Returns:
Expand Down Expand Up @@ -335,6 +338,63 @@ def delete_record(self, record_id: Text) -> ModelResponse:
return response
raise Exception(f"Failed to delete record: {response.error_message}")

def prepare_record_from_file(self, file_path: str, file_id: Optional[str] = None) -> Record:
    """Build a text Record from the parsed content of a file.

    The file is parsed with `parse_file`; the parsed data becomes the record
    value and the file's base name is stored under the `file_name` attribute.

    Args:
        file_path (str): The path to the file to be processed.
        file_id (str, optional): The ID to assign to the record. When omitted
            (or empty), an ID of the form `<file name>_<uuid4>` is generated.

    Returns:
        Record: A Record object containing the file's content and metadata.

    Raises:
        Exception: If the file does not exist or cannot be parsed
            (propagated from `parse_file`).

    Example:
        >>> record = index_model.prepare_record_from_file("/path/to/file.txt")
    """
    response = self.parse_file(file_path)
    # os.path.basename honours the platform's path separators, whereas
    # splitting on "/" returns the whole path for Windows-style paths.
    file_name = os.path.basename(file_path)
    if not file_id:
        file_id = f"{file_name}_{uuid4()}"
    return Record(value=response.data, value_type="text", id=file_id, attributes={"file_name": file_name})

@staticmethod
def parse_file(file_path: str) -> ModelResponse:
    """Parse a file into text.

    Files ending in `.txt` are read locally as UTF-8; every other file is
    sent to the Docling model identified by `DOCLING_MODEL_ID`.

    Args:
        file_path (str): The path to the file to be parsed.

    Returns:
        ModelResponse: The response containing the parsed file content.

    Raises:
        Exception: If the file does not exist or cannot be parsed. When the
            Docling model fails, the original error is chained as `__cause__`.

    Example:
        >>> response = IndexModel.parse_file("/path/to/file.pdf")
    """
    if not os.path.exists(file_path):
        raise Exception(f"File {file_path} does not exist")
    if file_path.endswith(".txt"):
        # Pin the encoding: the platform default (e.g. cp1252 on Windows)
        # would mis-decode or reject non-ASCII content such as emoji.
        with open(file_path, "r", encoding="utf-8") as file:
            data = file.read()
        if not data:
            warnings.warn(f"File {file_path} is empty")
        return ModelResponse(status=ResponseStatus.SUCCESS, data=data, completed=True)
    try:
        # Imported lazily, as in the original code, rather than at module level.
        from aixplain.factories import ModelFactory

        model = ModelFactory.get(DOCLING_MODEL_ID)
        response = model.run(file_path)
        if not response.data:
            warnings.warn(f"File {file_path} is empty")
        return response
    except Exception as e:
        # Keep the message callers match on, but chain the cause so the
        # original traceback is not lost.
        raise Exception(f"Failed to parse file: {e}") from e

def retrieve_records_with_filter(self, filter: IndexFilter) -> ModelResponse:
"""
Retrieve records from the index that match the given filter.
Expand Down
Binary file not shown.
96 changes: 96 additions & 0 deletions tests/functional/model/run_model_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,102 @@ def test_index_model_air_with_splitter(embedding_model, supplier_params):
index_model.delete()


def test_index_model_with_txt_file():
    """Upsert a local .txt file by path, then check it can be searched and counted."""
    from pathlib import Path
    from uuid import uuid4

    from aixplain.factories import IndexFactory
    from aixplain.factories.index_factory.utils import AirParams

    # Fixture file kept in the "data" folder next to this test module.
    txt_path = Path(__file__).parent / "data" / "test_input.txt"

    # Fresh index backed by the OpenAI Ada 002 embedding model.
    index_model = IndexFactory.create(
        params=AirParams(
            name=f"File Index {uuid4()}",
            description="Index for file processing",
            embedding_model=EmbeddingModel.OPENAI_ADA002,
        )
    )

    try:
        # Upserting accepts the file path as a plain string.
        upsert_response = index_model.upsert(str(txt_path))
        assert str(upsert_response.status) == "SUCCESS"

        # The indexed content must come back from a search.
        search_response = index_model.search("demo")
        assert str(search_response.status) == "SUCCESS"
        assert "🤖" in search_response.data, "Robot emoji should be present in the response"

        # At least one document must have been stored.
        assert index_model.count() > 0
    finally:
        # Always remove the temporary index.
        index_model.delete()


def test_index_model_with_pdf_file():
    """Upsert a local PDF file by path, then check it can be searched and counted."""
    from pathlib import Path
    from uuid import uuid4

    from aixplain.factories import IndexFactory
    from aixplain.factories.index_factory.utils import AirParams

    # Fixture file kept in the "data" folder next to this test module.
    pdf_path = Path(__file__).parent / "data" / "test_file_parser_input.pdf"

    # Fresh index backed by the OpenAI Ada 002 embedding model.
    index_model = IndexFactory.create(
        params=AirParams(
            name=f"PDF Index {uuid4()}",
            description="Index for PDF processing",
            embedding_model=EmbeddingModel.OPENAI_ADA002,
        )
    )

    try:
        # Upserting accepts the file path as a plain string.
        upsert_response = index_model.upsert(str(pdf_path))
        assert str(upsert_response.status) == "SUCCESS"

        # A search over the indexed PDF must return some data.
        search_response = index_model.search("document")
        assert str(search_response.status) == "SUCCESS"
        assert len(search_response.data) > 0

        # At least one document must have been stored.
        assert index_model.count() > 0
    finally:
        # Always remove the temporary index.
        index_model.delete()


def test_index_model_with_invalid_file():
    """Upserting a path that does not exist must raise a "does not exist" error."""
    from pathlib import Path
    from uuid import uuid4

    from aixplain.factories import IndexFactory
    from aixplain.factories.index_factory.utils import AirParams

    # Path to a file that is deliberately absent from the fixtures folder.
    missing_path = Path(__file__).parent / "data" / "nonexistent.pdf"

    # Fresh index backed by the OpenAI Ada 002 embedding model.
    index_model = IndexFactory.create(
        params=AirParams(
            name=f"Invalid File Index {uuid4()}",
            description="Index for invalid file testing",
            embedding_model=EmbeddingModel.OPENAI_ADA002,
        )
    )

    try:
        with pytest.raises(Exception) as exc_info:
            index_model.upsert(str(missing_path))
        # Checked after the context manager exits so the assertion actually runs.
        assert "does not exist" in str(exc_info.value)
    finally:
        # Always remove the temporary index.
        index_model.delete()

def _test_records():
from aixplain.modules.model.record import Record
from aixplain.enums import DataType
Expand Down
57 changes: 57 additions & 0 deletions tests/unit/index_model_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,3 +252,60 @@ def test_index_model_splitter():
assert splitter.split_by == "sentence"
assert splitter.split_length == 100
assert splitter.split_overlap == 0


def test_parse_file_success(mocker):
    """parse_file returns the response of the model obtained from ModelFactory.get.

    `os.path.exists` is patched to True so no real file is needed, and the
    model is a mock whose `run` yields a successful ModelResponse.
    """
    mock_model = mocker.Mock()
    mock_model.run.return_value = ModelResponse(status=ResponseStatus.SUCCESS, data="parsed content")

    mocker.patch("aixplain.factories.ModelFactory.get", return_value=mock_model)
    mocker.patch("os.path.exists", return_value=True)

    response = IndexModel.parse_file("test.pdf")

    assert isinstance(response, ModelResponse)
    assert response.status == ResponseStatus.SUCCESS
    assert response.data == "parsed content"
    # The path must be forwarded to the model untouched.
    mock_model.run.assert_called_once_with("test.pdf")


def test_parse_file_not_found():
    """parse_file must raise for a path that is not on disk."""
    missing_path = "nonexistent.pdf"
    with pytest.raises(Exception) as exc_info:
        IndexModel.parse_file(missing_path)
    # Checked after the context manager exits so the assertion actually runs.
    assert str(exc_info.value) == "File nonexistent.pdf does not exist"


def test_parse_file_error(mocker):
    """A failure while fetching the parser model is re-raised with a "Failed to parse file" prefix."""
    # Make the model lookup blow up, and pretend the file exists so the lookup is reached.
    mocker.patch("aixplain.factories.ModelFactory.get", side_effect=Exception("Model error"))
    mocker.patch("os.path.exists", return_value=True)

    with pytest.raises(Exception) as exc_info:
        IndexModel.parse_file("test.pdf")
    # Checked after the context manager exits so the assertion actually runs.
    assert str(exc_info.value) == "Failed to parse file: Model error"


def test_upsert_with_file_path(mocker):
    """upsert accepts a file path string and succeeds when parsing and the execute call succeed."""
    # Stub out file parsing and storage-type detection so no real file or service is touched.
    parsed = ModelResponse(status=ResponseStatus.SUCCESS, data="parsed content")
    mocker.patch("aixplain.modules.model.index_model.IndexModel.parse_file", return_value=parsed)
    mocker.patch("aixplain.factories.FileFactory.check_storage_type", return_value=StorageType.TEXT)

    execute_payload = {"status": "SUCCESS"}

    with requests_mock.Mocker() as http_mock:
        http_mock.post(execute_url, json=execute_payload, status_code=200)
        index_model = IndexModel(id=index_id, data=data, name="name", function=Function.SEARCH)
        response = index_model.upsert("test.pdf")

    assert isinstance(response, ModelResponse)
    assert response.status == ResponseStatus.SUCCESS


def test_upsert_with_invalid_file_path(mocker):
    """An exception raised by parse_file propagates unchanged out of upsert."""
    mocker.patch(
        "aixplain.modules.model.index_model.IndexModel.parse_file",
        side_effect=Exception("File not found"),
    )
    index_model = IndexModel(id=index_id, data=data, name="name", function=Function.SEARCH)

    with pytest.raises(Exception) as exc_info:
        index_model.upsert("nonexistent.pdf")
    # Checked after the context manager exits so the assertion actually runs.
    assert str(exc_info.value) == "File not found"