From ea18f8b46d800476a51968a52d166a044a974fe1 Mon Sep 17 00:00:00 2001 From: Ajay Kallepalli Date: Wed, 4 Jun 2025 09:15:38 -0700 Subject: [PATCH 1/2] Coverage for Hybrid search added --- .../vectorstores/test_arangodb_vector.py | 441 +++++++++++++-- .../unit_tests/vectorstores/test_arangodb.py | 532 ++++++++++++++++++ 2 files changed, 936 insertions(+), 37 deletions(-) diff --git a/libs/arangodb/tests/integration_tests/vectorstores/test_arangodb_vector.py b/libs/arangodb/tests/integration_tests/vectorstores/test_arangodb_vector.py index c6a81f4..be428f6 100644 --- a/libs/arangodb/tests/integration_tests/vectorstores/test_arangodb_vector.py +++ b/libs/arangodb/tests/integration_tests/vectorstores/test_arangodb_vector.py @@ -8,7 +8,7 @@ from arango.cursor import Cursor from langchain_core.documents import Document -from langchain_arangodb.vectorstores.arangodb_vector import ArangoVector +from langchain_arangodb.vectorstores.arangodb_vector import ArangoVector, SearchType from langchain_arangodb.vectorstores.utils import DistanceStrategy from tests.integration_tests.utils import ArangoCredentials @@ -69,9 +69,9 @@ def test_arangovector_from_texts_and_similarity_search( index_info = None indexes_raw = collection.indexes() assert indexes_raw is not None, "collection.indexes() returned None" - assert isinstance( - indexes_raw, list - ), f"collection.indexes() expected list, got {type(indexes_raw)}" + assert isinstance(indexes_raw, list), ( + f"collection.indexes() expected list, got {type(indexes_raw)}" + ) indexes: List[Dict[str, Any]] = indexes_raw for index in indexes: if index.get("name") == "test_index" and index.get("type") == "vector": @@ -121,13 +121,13 @@ def test_arangovector_euclidean_distance( collection_euclidean: StandardCollection = _collection_obj_euclidean index_info = None indexes_raw_euclidean = collection_euclidean.indexes() - assert ( - indexes_raw_euclidean is not None - ), "collection_euclidean.indexes() returned None" - assert isinstance( - indexes_raw_euclidean, list - ), f"collection_euclidean.indexes() expected list, \ + assert indexes_raw_euclidean is not None, ( + "collection_euclidean.indexes() returned None" + ) + assert isinstance(indexes_raw_euclidean, list), ( + f"collection_euclidean.indexes() expected list, \ got {type(indexes_raw_euclidean)}" + ) indexes_euclidean: List[Dict[str, Any]] = indexes_raw_euclidean for index in indexes_euclidean: if index.get("name") == "test_index" and index.get("type") == "vector": @@ -380,26 +380,26 @@ def test_arangovector_delete_documents( # Check that deleted documents are indeed gone deleted_docs_check_raw = collection_delete.get_many(ids_to_delete) - assert ( - deleted_docs_check_raw is not None - ), "collection.get_many() returned None for deleted_docs_check" - assert isinstance( - deleted_docs_check_raw, list - ), f"collection.get_many() expected list for deleted_docs_check,\ + assert deleted_docs_check_raw is not None, ( + "collection.get_many() returned None for deleted_docs_check" + ) + assert isinstance(deleted_docs_check_raw, list), ( + f"collection.get_many() expected list for deleted_docs_check,\ got {type(deleted_docs_check_raw)}" + ) deleted_docs_check: List[Dict[str, Any]] = deleted_docs_check_raw assert len(deleted_docs_check) == 0 # Check that remaining documents are still present remaining_ids_expected = ["id_keep1", "id_keep2"] remaining_docs_check_raw = collection_delete.get_many(remaining_ids_expected) - assert ( - remaining_docs_check_raw is not None - ), "collection.get_many() returned None for remaining_docs_check" - assert isinstance( - remaining_docs_check_raw, list - ), f"collection.get_many() expected list for remaining_docs_check,\ + assert remaining_docs_check_raw is not None, ( + "collection.get_many() returned None for remaining_docs_check" + ) + assert isinstance(remaining_docs_check_raw, list), ( + f"collection.get_many() expected list for remaining_docs_check,\ got {type(remaining_docs_check_raw)}" + ) remaining_docs_check: List[Dict[str, Any]] = remaining_docs_check_raw assert len(remaining_docs_check) == 2 @@ -831,9 +831,9 @@ def test_arangovector_core_functionality( # 8. Testing search by ID all_docs_cursor = collection_core.all() assert all_docs_cursor is not None, "collection.all() returned None" - assert isinstance( - all_docs_cursor, Cursor - ), f"collection.all() expected Cursor, got {type(all_docs_cursor)}" + assert isinstance(all_docs_cursor, Cursor), ( + f"collection.all() expected Cursor, got {type(all_docs_cursor)}" + ) all_ids = [doc["_key"] for doc in all_docs_cursor] assert new_ids[0] in all_ids @@ -958,9 +958,9 @@ def test_arangovector_from_existing_collection( # Check that embeddings were added to the original documents doc_data1 = collection_exist.get("doc1") assert doc_data1 is not None, "Document 'doc1' not found in collection_exist" - assert isinstance( - doc_data1, dict - ), f"Expected 'doc1' to be a dict, got {type(doc_data1)}" + assert isinstance(doc_data1, dict), ( + f"Expected 'doc1' to be a dict, got {type(doc_data1)}" + ) doc1: Dict[str, Any] = doc_data1 assert "embedding" in doc1 assert isinstance(doc1["embedding"], list) @@ -991,9 +991,9 @@ def test_arangovector_from_existing_collection( # Check that custom embeddings were added doc_data2 = collection_exist.get("doc1") assert doc_data2 is not None, "Document 'doc1' not found after custom processing" - assert isinstance( - doc_data2, dict - ), f"Expected 'doc1' after custom processing to be a dict, got {type(doc_data2)}" + assert isinstance(doc_data2, dict), ( + f"Expected 'doc1' after custom processing to be a dict, got {type(doc_data2)}" + ) doc2: Dict[str, Any] = doc_data2 assert "custom_embedding" in doc2 assert "custom_text" in doc2 @@ -1036,12 +1036,12 @@ def test_arangovector_from_existing_collection( # Check that the combined text was inserted doc_data3 = collection_exist.get("doc1") - assert ( - doc_data3 is not None - ), "Document 'doc1' not found after insert_text processing" - assert isinstance( - doc_data3, dict - ), f"Expected 'doc1' after insert_text to be a dict, got {type(doc_data3)}" + assert doc_data3 is not None, ( + "Document 'doc1' not found after insert_text processing" + ) + assert isinstance(doc_data3, dict), ( + f"Expected 'doc1' after insert_text to be a dict, got {type(doc_data3)}" + ) doc3: Dict[str, Any] = doc_data3 assert "combined_title_content" in doc3 assert "The Solar System" in doc3["combined_title_content"] @@ -1062,3 +1062,370 @@ def test_arangovector_from_existing_collection( assert len(docs) == 2 assert any(doc.id == "doc1" for doc in docs) assert any(doc.id == "doc3" for doc in docs) + + +@pytest.mark.usefixtures("clear_arangodb_database") +def test_arangovector_hybrid_search_functionality( + arangodb_credentials: ArangoCredentials, + fake_embedding_function: FakeEmbeddings, +) -> None: + """Test hybrid search functionality comparing vector vs hybrid search results.""" + client = ArangoClient(hosts=arangodb_credentials["url"]) + db = client.db( + username=arangodb_credentials["username"], + password=arangodb_credentials["password"], + ) + + # Example texts for hybrid search testing + texts = [ + "The government passed new data privacy laws affecting social media " + "companies like Meta and Twitter.", + "A new smartphone from Samsung features cutting-edge AI and a focus " + "on secure user data.", + "Meta introduces Llama 3, a state-of-the-art language model to " + "compete with OpenAI's GPT-4.", + "How to enable two-factor authentication on Facebook for better " + "account protection.", + "A study on data privacy perceptions among Gen Z social media users " + "reveals concerns over targeted advertising.", + ] + + metadatas = [ + {"source": "news", "topic": "privacy"}, + {"source": "tech", "topic": "mobile"}, + {"source": "ai", "topic": "llm"}, + {"source": "guide", "topic": "security"}, + {"source": "research", "topic": "privacy"}, + ] + + # Create vector store with hybrid search enabled + vector_store = ArangoVector.from_texts( + texts=texts, + embedding=fake_embedding_function, + metadatas=metadatas, + database=db, + collection_name="test_hybrid_collection", + search_type=SearchType.HYBRID, + rrf_search_limit=3, # Top 3 RRF Search + overwrite_index=True, + insert_text=True, # Required for hybrid search + ) + + # Create vector and keyword indexes + vector_store.create_vector_index() + vector_store.create_keyword_index() + + query = "AI data privacy" + + # Test vector search + vector_results = vector_store.similarity_search_with_score( + query=query, + k=2, + use_approx=False, + search_type=SearchType.VECTOR, + ) + + # Test hybrid search + hybrid_results = vector_store.similarity_search_with_score( + query=query, + k=2, + use_approx=False, + search_type=SearchType.HYBRID, + ) + + # Test hybrid search with higher vector weight + hybrid_results_with_higher_vector_weight = ( + vector_store.similarity_search_with_score( + query=query, + k=2, + use_approx=False, + search_type=SearchType.HYBRID, + vector_weight=1.0, + keyword_weight=0.01, + ) + ) + + # Verify all searches return expected number of results + assert len(vector_results) == 2 + assert len(hybrid_results) == 2 + assert len(hybrid_results_with_higher_vector_weight) == 2 + + # Verify that all results have scores + for doc, score in vector_results: + assert isinstance(score, float) + assert score >= 0 + + for doc, score in hybrid_results: + assert isinstance(score, float) + assert score >= 0 + + for doc, score in hybrid_results_with_higher_vector_weight: + assert isinstance(score, float) + assert score >= 0 + + # Verify that hybrid search can produce different rankings than vector search + # This tests that the RRF algorithm is working + vector_top_doc = vector_results[0][0].page_content + hybrid_top_doc = hybrid_results[0][0].page_content + + # The results may be the same or different depending on the content, + # but we should be able to verify the search executed successfully + assert vector_top_doc in texts + assert hybrid_top_doc in texts + + +@pytest.mark.usefixtures("clear_arangodb_database") +def test_arangovector_hybrid_search_with_weights( + arangodb_credentials: ArangoCredentials, + fake_embedding_function: FakeEmbeddings, +) -> None: + """Test hybrid search with different vector and keyword weights.""" + client = ArangoClient(hosts=arangodb_credentials["url"]) + db = client.db( + username=arangodb_credentials["username"], + password=arangodb_credentials["password"], + ) + + texts = [ + "machine learning algorithms for data analysis", + "deep learning neural networks", + "artificial intelligence and machine learning", + "data science and analytics", + "computer vision and image processing", + ] + + vector_store = ArangoVector.from_texts( + texts=texts, + embedding=fake_embedding_function, + database=db, + collection_name="test_weights_collection", + search_type=SearchType.HYBRID, + overwrite_index=True, + insert_text=True, + ) + + vector_store.create_vector_index() + vector_store.create_keyword_index() + + query = "machine learning" + + # Test with equal weights + equal_weight_results = vector_store.similarity_search_with_score( + query=query, + k=3, + search_type=SearchType.HYBRID, + vector_weight=1.0, + keyword_weight=1.0, + use_approx=False, + ) + + # Test with vector emphasis + vector_emphasis_results = vector_store.similarity_search_with_score( + query=query, + k=3, + search_type=SearchType.HYBRID, + vector_weight=10.0, + keyword_weight=1.0, + use_approx=False, + ) + + # Test with keyword emphasis + keyword_emphasis_results = vector_store.similarity_search_with_score( + query=query, + k=3, + search_type=SearchType.HYBRID, + vector_weight=1.0, + keyword_weight=10.0, + use_approx=False, + ) + + # Verify all searches return expected number of results + assert len(equal_weight_results) == 3 + assert len(vector_emphasis_results) == 3 + assert len(keyword_emphasis_results) == 3 + + # Verify scores are valid + for results in [ + equal_weight_results, + vector_emphasis_results, + keyword_emphasis_results, + ]: + for doc, score in results: + assert isinstance(score, float) + assert score >= 0 + + +@pytest.mark.usefixtures("clear_arangodb_database") +def test_arangovector_hybrid_search_custom_keyword_search( + arangodb_credentials: ArangoCredentials, + fake_embedding_function: FakeEmbeddings, +) -> None: + """Test hybrid search with custom keyword search clause.""" + client = ArangoClient(hosts=arangodb_credentials["url"]) + db = client.db( + username=arangodb_credentials["username"], + password=arangodb_credentials["password"], + ) + + texts = [ + "Advanced machine learning techniques", + "Basic machine learning concepts", + "Deep learning and neural networks", + "Traditional machine learning algorithms", + "Modern AI and machine learning", + ] + + metadatas = [ + {"level": "advanced", "category": "ml"}, + {"level": "basic", "category": "ml"}, + {"level": "advanced", "category": "dl"}, + {"level": "intermediate", "category": "ml"}, + {"level": "modern", "category": "ai"}, + ] + + vector_store = ArangoVector.from_texts( + texts=texts, + embedding=fake_embedding_function, + metadatas=metadatas, + database=db, + collection_name="test_custom_keyword_collection", + search_type=SearchType.HYBRID, + overwrite_index=True, + insert_text=True, + ) + + vector_store.create_vector_index() + vector_store.create_keyword_index() + + query = "machine learning" + + # Test with default keyword search + default_results = vector_store.similarity_search_with_score( + query=query, + k=3, + search_type=SearchType.HYBRID, + use_approx=False, + ) + + # Test with custom keyword search clause + custom_keyword_clause = f""" + SEARCH ANALYZER( + doc.{vector_store.text_field} IN TOKENS(@query, @analyzer), + @analyzer + ) AND doc.level == "advanced" + """ + + custom_results = vector_store.similarity_search_with_score( + query=query, + k=3, + search_type=SearchType.HYBRID, + keyword_search_clause=custom_keyword_clause, + use_approx=False, + ) + + # Verify both searches return results + assert len(default_results) >= 1 + assert len(custom_results) >= 1 + + +@pytest.mark.usefixtures("clear_arangodb_database") +def test_arangovector_keyword_index_management( + arangodb_credentials: ArangoCredentials, + fake_embedding_function: FakeEmbeddings, +) -> None: + """Test keyword index creation, retrieval, and deletion.""" + client = ArangoClient(hosts=arangodb_credentials["url"]) + db = client.db( + username=arangodb_credentials["username"], + password=arangodb_credentials["password"], + ) + + texts = ["sample text for keyword indexing"] + + vector_store = ArangoVector.from_texts( + texts=texts, + embedding=fake_embedding_function, + database=db, + collection_name="test_keyword_index", + search_type=SearchType.HYBRID, + keyword_index_name="test_keyword_view", + overwrite_index=True, + insert_text=True, + ) + + # Test keyword index creation + vector_store.create_keyword_index() + + # Test keyword index retrieval + keyword_index = vector_store.retrieve_keyword_index() + assert keyword_index is not None + assert keyword_index["name"] == "test_keyword_view" + assert keyword_index["type"] == "arangosearch" + + # Test keyword index deletion + vector_store.delete_keyword_index() + + # Verify index was deleted + deleted_index = vector_store.retrieve_keyword_index() + assert deleted_index is None + + # Test that creating index again works (idempotent) + vector_store.create_keyword_index() + recreated_index = vector_store.retrieve_keyword_index() + assert recreated_index is not None + + +@pytest.mark.usefixtures("clear_arangodb_database") +def test_arangovector_hybrid_search_error_cases( + arangodb_credentials: ArangoCredentials, + fake_embedding_function: FakeEmbeddings, +) -> None: + """Test error cases for hybrid search functionality.""" + client = ArangoClient(hosts=arangodb_credentials["url"]) + db = client.db( + username=arangodb_credentials["username"], + password=arangodb_credentials["password"], + ) + + texts = ["test text for error cases"] + + # Test creating hybrid search without insert_text should work + # but might not give meaningful results + vector_store = ArangoVector.from_texts( + texts=texts, + embedding=fake_embedding_function, + database=db, + collection_name="test_error_collection", + search_type=SearchType.HYBRID, + insert_text=True, # Required for meaningful hybrid search + overwrite_index=True, + ) + + vector_store.create_vector_index() + vector_store.create_keyword_index() + + # Test that search works even with edge case parameters + results = vector_store.similarity_search_with_score( + query="test", + k=1, + search_type=SearchType.HYBRID, + vector_weight=0.0, # Edge case: no vector weight + keyword_weight=1.0, + use_approx=False, + ) + + # Should still return results (keyword-only search) + assert len(results) >= 0 # May return 0 or more results + + # Test with zero keyword weight + results_vector_only = vector_store.similarity_search_with_score( + query="test", + k=1, + search_type=SearchType.HYBRID, + vector_weight=1.0, + keyword_weight=0.0, # Edge case: no keyword weight + use_approx=False, + ) + + # Should still return results (vector-only search) + assert len(results_vector_only) >= 0 # May return 0 or more results diff --git a/libs/arangodb/tests/unit_tests/vectorstores/test_arangodb.py b/libs/arangodb/tests/unit_tests/vectorstores/test_arangodb.py index 197181a..e1ed83a 100644 --- a/libs/arangodb/tests/unit_tests/vectorstores/test_arangodb.py +++ b/libs/arangodb/tests/unit_tests/vectorstores/test_arangodb.py @@ -648,3 +648,535 @@ def test_select_relevance_score_fn_invalid_strategy_raises_error( "Consider providing relevance_score_fn to ArangoVector constructor." ) assert str(exc_info.value) == expected_message + + +def test_init_with_hybrid_search_type(arango_vector_factory: Any) -> None: + """Test initialization with hybrid search type.""" + from langchain_arangodb.vectorstores.arangodb_vector import SearchType + + vector_store = arango_vector_factory(search_type=SearchType.HYBRID) + assert vector_store.search_type == SearchType.HYBRID + + +def test_similarity_search_hybrid(arango_vector_factory: Any) -> None: + """Test similarity search with hybrid search type.""" + from langchain_arangodb.vectorstores.arangodb_vector import SearchType + + vector_store = arango_vector_factory(search_type=SearchType.HYBRID) + + # Mock the embedding.embed_query method + mock_embedding = [0.1] * 64 + vector_store.embedding.embed_query.return_value = mock_embedding + + # Mock the similarity_search_by_vector_and_keyword method + expected_docs = [MagicMock(), MagicMock()] + with patch.object( + vector_store, + "similarity_search_by_vector_and_keyword", + return_value=expected_docs, + ) as mock_hybrid_search: + docs = vector_store.similarity_search( + query="test query", + k=2, + return_fields={"field1", "field2"}, + use_approx=True, + vector_weight=1.0, + keyword_weight=0.5, + ) + + # Verify embed_query was called with query + vector_store.embedding.embed_query.assert_called_once_with("test query") + + # Verify similarity_search_by_vector_and_keyword was called with correct parameters + mock_hybrid_search.assert_called_once_with( + query="test query", + embedding=mock_embedding, + k=2, + return_fields={"field1", "field2"}, + use_approx=True, + filter_clause="", + vector_weight=1.0, + keyword_weight=0.5, + keyword_search_clause="", + ) + + # Verify the correct documents were returned + assert docs == expected_docs + + +def test_similarity_search_with_score_hybrid(arango_vector_factory: Any) -> None: + """Test similarity search with score using hybrid search type.""" + from langchain_arangodb.vectorstores.arangodb_vector import SearchType + + vector_store = arango_vector_factory(search_type=SearchType.HYBRID) + + # Mock the embedding.embed_query method + mock_embedding = [0.1] * 64 + vector_store.embedding.embed_query.return_value = mock_embedding + + # Mock the similarity_search_by_vector_and_keyword_with_score method + expected_results = [(MagicMock(), 0.8), (MagicMock(), 0.6)] + with patch.object( + vector_store, + "similarity_search_by_vector_and_keyword_with_score", + return_value=expected_results, + ) as mock_hybrid_search_with_score: + results = vector_store.similarity_search_with_score( + query="test query", + k=2, + return_fields={"field1", "field2"}, + use_approx=True, + vector_weight=2.0, + keyword_weight=1.5, + keyword_search_clause="custom clause", + ) + query = "test query" + v_store = vector_store + v_store.embedding.embed_query.assert_called_once_with(query) + + # Verify similarity_search_by_vector_and + # _keyword_with_score was called with correct parameters + mock_hybrid_search_with_score.assert_called_once_with( + query="test query", + embedding=mock_embedding, + k=2, + return_fields={"field1", "field2"}, + use_approx=True, + filter_clause="", + vector_weight=2.0, + keyword_weight=1.5, + keyword_search_clause="custom clause", + ) + + # Verify the correct results were returned + assert results == expected_results + + +def test_similarity_search_by_vector_and_keyword(arango_vector_factory: Any) -> None: + """Test similarity_search_by_vector_and_keyword method.""" + vector_store = arango_vector_factory() + + mock_embedding = [0.1] * 64 + expected_docs = [MagicMock(), MagicMock()] + + with patch.object( + vector_store, + "similarity_search_by_vector_and_keyword_with_score", + return_value=[(expected_docs[0], 0.8), (expected_docs[1], 0.6)], + ) as mock_hybrid_search_with_score: + docs = vector_store.similarity_search_by_vector_and_keyword( + query="test query", + embedding=mock_embedding, + k=2, + return_fields={"field1"}, + use_approx=False, + filter_clause="FILTER doc.type == 'test'", + vector_weight=1.5, + keyword_weight=0.8, + keyword_search_clause="custom search", + ) + + # Verify the method was called with correct parameters + mock_hybrid_search_with_score.assert_called_once_with( + query="test query", + embedding=mock_embedding, + k=2, + return_fields={"field1"}, + use_approx=False, + filter_clause="FILTER doc.type == 'test'", + vector_weight=1.5, + keyword_weight=0.8, + keyword_search_clause="custom search", + ) + + # Verify only documents (not scores) were returned + assert docs == expected_docs + + +def test_similarity_search_by_vector_and_keyword_with_score( + arango_vector_factory: Any, +) -> None: + """Test similarity_search_by_vector_and_keyword_with_score method.""" + vector_store = arango_vector_factory() + + mock_embedding = [0.1] * 64 + mock_cursor = MagicMock() + mock_query = "test query" + mock_bind_vars = {"test": "value"} + + # Mock _build_hybrid_search_query + with patch.object( + vector_store, + "_build_hybrid_search_query", + return_value=(mock_query, mock_bind_vars), + ) as mock_build_query: + # Mock database execution + vector_store.db.aql.execute.return_value = mock_cursor + + # Mock _process_search_query + expected_results = [(MagicMock(), 0.9), (MagicMock(), 0.7)] + with patch.object( + vector_store, "_process_search_query", return_value=expected_results + ) as mock_process: + results = vector_store.similarity_search_by_vector_and_keyword_with_score( + query="test query", + embedding=mock_embedding, + k=3, + return_fields={"field1", "field2"}, + use_approx=True, + filter_clause="FILTER doc.active == true", + vector_weight=2.0, + keyword_weight=1.0, + keyword_search_clause="SEARCH doc.content", + ) + + # Verify _build_hybrid_search_query was called with correct parameters + mock_build_query.assert_called_once_with( + query="test query", + k=3, + embedding=mock_embedding, + return_fields={"field1", "field2"}, + use_approx=True, + filter_clause="FILTER doc.active == true", + vector_weight=2.0, + keyword_weight=1.0, + keyword_search_clause="SEARCH doc.content", + ) + + # Verify database query execution + vector_store.db.aql.execute.assert_called_once_with( + mock_query, bind_vars=mock_bind_vars, stream=True + ) + + # Verify _process_search_query was called + mock_process.assert_called_once_with(mock_cursor) + + # Verify results + assert results == expected_results + + +def test_build_hybrid_search_query(arango_vector_factory: Any) -> None: + """Test _build_hybrid_search_query method.""" + vector_store = arango_vector_factory( + collection_name="test_collection", + keyword_index_name="test_view", + keyword_analyzer="text_en", + rrf_constant=60, + rrf_search_limit=100, + text_field="text", + embedding_field="embedding", + ) + + # Mock retrieve_keyword_index to return None (will create index) + with patch.object(vector_store, "retrieve_keyword_index", return_value=None): + with patch.object(vector_store, "create_keyword_index") as mock_create_index: + # Mock retrieve_vector_index to return None + # (will create index for approx search) + with patch.object(vector_store, "retrieve_vector_index", return_value=None): + with patch.object( + vector_store, "create_vector_index" + ) as mock_create_vector_index: + # Mock database version for approx search + vector_store.db.version.return_value = "3.12.5" + + query, bind_vars = vector_store._build_hybrid_search_query( + query="test query", + k=5, + embedding=[0.1] * 64, + return_fields={"field1", "field2"}, + use_approx=True, + filter_clause="FILTER doc.active == true", + vector_weight=1.5, + keyword_weight=2.0, + keyword_search_clause="", + ) + + # Verify indexes were created + mock_create_index.assert_called_once() + mock_create_vector_index.assert_called_once() + + # Verify query string contains expected components + assert "FOR doc IN @@collection" in query + assert "FOR doc IN @@view" in query + assert "SEARCH ANALYZER" in query + assert "BM25(doc)" in query + assert "COLLECT doc_key = result.doc._key INTO group" in query + assert "SUM(group[*].result.score)" in query + assert "SORT rrf_score DESC" in query + + # Verify bind variables + assert bind_vars["@collection"] == "test_collection" + assert bind_vars["@view"] == "test_view" + assert bind_vars["embedding"] == [0.1] * 64 + assert bind_vars["query"] == "test query" + assert bind_vars["analyzer"] == "text_en" + assert bind_vars["rrf_constant"] == 60 + assert bind_vars["rrf_search_limit"] == 100 + + +def test_build_hybrid_search_query_with_custom_keyword_search( + arango_vector_factory: Any, +) -> None: + """Test _build_hybrid_search_query with custom keyword search clause.""" + vector_store = arango_vector_factory() + + # Mock dependencies + with patch.object( + vector_store, "retrieve_keyword_index", return_value={"name": "test_view"} + ): + with patch.object( + vector_store, "retrieve_vector_index", return_value={"name": "test_index"} + ): + vector_store.db.version.return_value = "3.12.5" + + custom_search_clause = "SEARCH doc.title IN TOKENS(@query, @analyzer)" + + query, bind_vars = vector_store._build_hybrid_search_query( + query="test query", + k=3, + embedding=[0.2] * 64, + return_fields={"title"}, + use_approx=False, + filter_clause="", + vector_weight=1.0, + keyword_weight=1.0, + keyword_search_clause=custom_search_clause, + ) + + # Verify custom keyword search clause is used + assert custom_search_clause in query + # Verify default search clause is not used + assert "doc.text IN TOKENS" not in query + + +def test_keyword_index_management(arango_vector_factory: Any) -> None: + """Test keyword index creation, retrieval, and deletion.""" + vector_store = arango_vector_factory( + keyword_index_name="test_keyword_view", + keyword_analyzer="text_en", + collection_name="test_collection", + text_field="content", + ) + + # Test retrieve_keyword_index when index exists + mock_view = {"name": "test_keyword_view", "type": "arangosearch"} + + with patch.object(vector_store, "retrieve_keyword_index", return_value=mock_view): + result = vector_store.retrieve_keyword_index() + assert result == mock_view + + # Test retrieve_keyword_index when index doesn't exist + with patch.object(vector_store, "retrieve_keyword_index", return_value=None): + result = vector_store.retrieve_keyword_index() + assert result is None + + # Test create_keyword_index + with patch.object(vector_store, "retrieve_keyword_index", return_value=None): + vector_store.create_keyword_index() + + # Verify create_view was called with correct parameters + vector_store.db.create_view.assert_called_once() + call_args = vector_store.db.create_view.call_args + assert call_args[0][0] == "test_keyword_view" + assert call_args[0][1] == "arangosearch" + + view_properties = call_args[0][2] + assert "links" in view_properties + assert "test_collection" in view_properties["links"] + assert "analyzers" in view_properties["links"]["test_collection"] + assert "text_en" in view_properties["links"]["test_collection"]["analyzers"] + + # Test create_keyword_index when index already exists (idempotent) + vector_store.db.create_view.reset_mock() + with patch.object(vector_store, "retrieve_keyword_index", return_value=mock_view): + vector_store.create_keyword_index() + + # Should not create view if it already exists + vector_store.db.create_view.assert_not_called() + + # Test delete_keyword_index + with patch.object(vector_store, "retrieve_keyword_index", return_value=mock_view): + vector_store.delete_keyword_index() + + vector_store.db.delete_view.assert_called_once_with("test_keyword_view") + + # Test delete_keyword_index when index doesn't exist + vector_store.db.delete_view.reset_mock() + with patch.object(vector_store, "retrieve_keyword_index", return_value=None): + vector_store.delete_keyword_index() + + # Should not call delete_view if view doesn't exist + vector_store.db.delete_view.assert_not_called() + + +def test_from_texts_with_hybrid_search_and_invalid_insert_text() -> None: + """Test that from_texts raises ValueError when + hybrid search is used without insert_text.""" + mock_embedding = MagicMock() + mock_embedding.embed_documents.return_value = [[0.1] * 64, [0.2] * 64] + mock_db = MagicMock() + + from langchain_arangodb.vectorstores.arangodb_vector import ArangoVector, SearchType + + with pytest.raises(ValueError) as exc_info: + ArangoVector.from_texts( + texts=["text1", "text2"], + embedding=mock_embedding, + database=mock_db, + search_type=SearchType.HYBRID, + insert_text=False, # This should cause the error + ) + + assert "insert_text must be True when search_type is HYBRID" in str(exc_info.value) + + +def test_from_texts_with_hybrid_search_valid() -> None: + """Test that from_texts works correctly with hybrid search when insert_text=True.""" + mock_embedding = MagicMock() + mock_embedding.embed_documents.return_value = [[0.1] * 64, [0.2] * 64] + mock_db = MagicMock() + mock_collection = MagicMock() + mock_db.has_collection.return_value = True + mock_db.collection.return_value = mock_collection + mock_collection.indexes.return_value = [] + + from langchain_arangodb.vectorstores.arangodb_vector import ArangoVector, SearchType + + with patch.object(ArangoVector, "add_embeddings", return_value=["id1", "id2"]): + vector_store = ArangoVector.from_texts( + texts=["text1", "text2"], + embedding=mock_embedding, + database=mock_db, + search_type=SearchType.HYBRID, + insert_text=True, # This should work + ) + + assert vector_store.search_type == SearchType.HYBRID + + +def test_from_existing_collection_with_hybrid_search_invalid() -> None: + """Test that from_existing_collection raises + error with hybrid search and insert_text=False.""" + mock_embedding = MagicMock() + mock_db = MagicMock() + + from langchain_arangodb.vectorstores.arangodb_vector import ArangoVector, SearchType + + with pytest.raises(ValueError) as exc_info: + ArangoVector.from_existing_collection( + collection_name="test_collection", + text_properties_to_embed=["title", "content"], + embedding=mock_embedding, + database=mock_db, + search_type=SearchType.HYBRID, + insert_text=False, # This should cause the error + ) + + assert "insert_text must be True when search_type is HYBRID" in str(exc_info.value) + + +def test_build_hybrid_search_query_euclidean_distance( + arango_vector_factory: Any, +) -> None: + """Test _build_hybrid_search_query with Euclidean distance strategy.""" + from langchain_arangodb.vectorstores.utils import DistanceStrategy + + vector_store = arango_vector_factory( + distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE + ) + + # Mock dependencies + with patch.object( + vector_store, "retrieve_keyword_index", return_value={"name": "test_view"} + ): + with patch.object( + vector_store, "retrieve_vector_index", return_value={"name": "test_index"} + ): + query, bind_vars = vector_store._build_hybrid_search_query( + query="test", + k=2, + embedding=[0.1] * 64, + return_fields=set(), + use_approx=False, + filter_clause="", + vector_weight=1.0, + keyword_weight=1.0, + keyword_search_clause="", + ) + + # Should use L2_DISTANCE for Euclidean distance + assert "L2_DISTANCE" in query + assert "SORT score ASC" in query # Euclidean uses ascending sort + + +def test_build_hybrid_search_query_version_check(arango_vector_factory: Any) -> None: + """Test that _build_hybrid_search_query checks + ArangoDB version for approximate search.""" + vector_store = arango_vector_factory() + + # Mock dependencies + with patch.object( + vector_store, "retrieve_keyword_index", return_value={"name": "test_view"} + ): + with patch.object(vector_store, "retrieve_vector_index", return_value=None): + # Mock old version + vector_store.db.version.return_value = "3.12.3" + + with pytest.raises(ValueError) as exc_info: + vector_store._build_hybrid_search_query( + query="test", + k=2, + embedding=[0.1] * 64, + return_fields=set(), + use_approx=True, # This should trigger the version check + filter_clause="", + vector_weight=1.0, + keyword_weight=1.0, + keyword_search_clause="", + ) + + assert ( + "Approximate Nearest Neighbor search requires ArangoDB >= 3.12.4" + in str(exc_info.value) + ) + + +def test_search_type_override_in_similarity_search(arango_vector_factory: Any) -> None: + """Test that search_type can be overridden in similarity_search method.""" + from langchain_arangodb.vectorstores.arangodb_vector import SearchType + + # Create vector store with default vector search + vector_store = arango_vector_factory(search_type=SearchType.VECTOR) + + mock_embedding = [0.1] * 64 + vector_store.embedding.embed_query.return_value = mock_embedding + + # Test overriding to hybrid search + expected_docs = [MagicMock()] + with patch.object( + vector_store, + "similarity_search_by_vector_and_keyword", + return_value=expected_docs, + ) as mock_hybrid_search: + docs = vector_store.similarity_search( + query="test", + k=1, + search_type=SearchType.HYBRID, # Override default + ) + + # Should call hybrid search method despite default being vector + mock_hybrid_search.assert_called_once() + assert docs == expected_docs + + # Test overriding to vector search + with patch.object( + vector_store, "similarity_search_by_vector", return_value=expected_docs + ) as mock_vector_search: + docs = vector_store.similarity_search( + query="test", + k=1, + search_type=SearchType.VECTOR, # Explicit vector search + ) + + mock_vector_search.assert_called_once() + assert docs == expected_docs From 964e9df5eeaea07c427c1c9430c4c860e73aa836 Mon Sep 17 00:00:00 2001 From: Ajay Kallepalli Date: Wed, 4 Jun 2025 09:27:53 -0700 Subject: [PATCH 2/2] Fix lint --- .../vectorstores/test_arangodb_vector.py | 72 +++++++++---------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/libs/arangodb/tests/integration_tests/vectorstores/test_arangodb_vector.py b/libs/arangodb/tests/integration_tests/vectorstores/test_arangodb_vector.py index be428f6..0884e75 100644 --- a/libs/arangodb/tests/integration_tests/vectorstores/test_arangodb_vector.py +++ b/libs/arangodb/tests/integration_tests/vectorstores/test_arangodb_vector.py @@ -69,9 +69,9 @@ def test_arangovector_from_texts_and_similarity_search( index_info = None indexes_raw = collection.indexes() assert indexes_raw is not None, "collection.indexes() returned None" - assert isinstance(indexes_raw, list), ( - f"collection.indexes() expected list, got {type(indexes_raw)}" - ) + assert isinstance( + indexes_raw, list + ), f"collection.indexes() expected list, got {type(indexes_raw)}" indexes: List[Dict[str, Any]] = indexes_raw for index in indexes: if index.get("name") == "test_index" and index.get("type") == "vector": @@ -121,13 +121,13 @@ def test_arangovector_euclidean_distance( collection_euclidean: StandardCollection = _collection_obj_euclidean index_info = None indexes_raw_euclidean = collection_euclidean.indexes() - assert indexes_raw_euclidean is not None, ( - "collection_euclidean.indexes() returned None" - ) - assert isinstance(indexes_raw_euclidean, list), ( - f"collection_euclidean.indexes() expected list, \ + assert ( + indexes_raw_euclidean is not None + ), "collection_euclidean.indexes() returned None" + assert isinstance( + indexes_raw_euclidean, list + ), f"collection_euclidean.indexes() expected list, \ got {type(indexes_raw_euclidean)}" - ) indexes_euclidean: List[Dict[str, Any]] = indexes_raw_euclidean for index in indexes_euclidean: if index.get("name") == "test_index" and index.get("type") == "vector": @@ -380,26 +380,26 @@ def test_arangovector_delete_documents( # Check that deleted documents are indeed gone deleted_docs_check_raw = collection_delete.get_many(ids_to_delete) - assert deleted_docs_check_raw is not None, ( - "collection.get_many() returned None for deleted_docs_check" - ) - assert isinstance(deleted_docs_check_raw, list), ( - f"collection.get_many() expected list for deleted_docs_check,\ + assert ( + deleted_docs_check_raw is not None + ), "collection.get_many() returned None for deleted_docs_check" + assert isinstance( + deleted_docs_check_raw, list + ), f"collection.get_many() expected list for deleted_docs_check,\ got {type(deleted_docs_check_raw)}" - ) deleted_docs_check: List[Dict[str, Any]] = deleted_docs_check_raw assert len(deleted_docs_check) == 0 # Check that remaining documents are still present remaining_ids_expected = ["id_keep1", "id_keep2"] remaining_docs_check_raw = collection_delete.get_many(remaining_ids_expected) - assert remaining_docs_check_raw is not None, ( - "collection.get_many() returned None for remaining_docs_check" - ) - assert isinstance(remaining_docs_check_raw, list), ( - f"collection.get_many() expected list for remaining_docs_check,\ + assert ( + remaining_docs_check_raw is not None + ), "collection.get_many() returned None for remaining_docs_check" + assert isinstance( + remaining_docs_check_raw, list + ), f"collection.get_many() expected list for remaining_docs_check,\ got {type(remaining_docs_check_raw)}" - ) remaining_docs_check: List[Dict[str, Any]] = remaining_docs_check_raw assert len(remaining_docs_check) == 2 @@ -831,9 +831,9 @@ def test_arangovector_core_functionality( # 8. Testing search by ID all_docs_cursor = collection_core.all() assert all_docs_cursor is not None, "collection.all() returned None" - assert isinstance(all_docs_cursor, Cursor), ( - f"collection.all() expected Cursor, got {type(all_docs_cursor)}" - ) + assert isinstance( + all_docs_cursor, Cursor + ), f"collection.all() expected Cursor, got {type(all_docs_cursor)}" all_ids = [doc["_key"] for doc in all_docs_cursor] assert new_ids[0] in all_ids @@ -958,9 +958,9 @@ def test_arangovector_from_existing_collection( # Check that embeddings were added to the original documents doc_data1 = collection_exist.get("doc1") assert doc_data1 is not None, "Document 'doc1' not found in collection_exist" - assert isinstance(doc_data1, dict), ( - f"Expected 'doc1' to be a dict, got {type(doc_data1)}" - ) + assert isinstance( + doc_data1, dict + ), f"Expected 'doc1' to be a dict, got {type(doc_data1)}" doc1: Dict[str, Any] = doc_data1 assert "embedding" in doc1 assert isinstance(doc1["embedding"], list) @@ -991,9 +991,9 @@ def test_arangovector_from_existing_collection( # Check that custom embeddings were added doc_data2 = collection_exist.get("doc1") assert doc_data2 is not None, "Document 'doc1' not found after custom processing" - assert isinstance(doc_data2, dict), ( - f"Expected 'doc1' after custom processing to be a dict, got {type(doc_data2)}" - ) + assert isinstance( + doc_data2, dict + ), f"Expected 'doc1' after custom processing to be a dict, got {type(doc_data2)}" doc2: Dict[str, Any] = doc_data2 assert "custom_embedding" in doc2 assert "custom_text" in doc2 @@ -1036,12 +1036,12 @@ def test_arangovector_from_existing_collection( # Check that the combined text was inserted doc_data3 = collection_exist.get("doc1") - assert doc_data3 is not None, ( - "Document 'doc1' not found after insert_text processing" - ) - assert isinstance(doc_data3, dict), ( - f"Expected 'doc1' after insert_text to be a dict, got {type(doc_data3)}" - ) + assert ( + doc_data3 is not None + ), "Document 'doc1' not found after insert_text processing" + assert isinstance( + doc_data3, dict + ), f"Expected 'doc1' after insert_text to be a dict, got {type(doc_data3)}" doc3: Dict[str, Any] = doc_data3 assert "combined_title_content" in doc3 assert "The Solar System" in doc3["combined_title_content"]