roll bugfix into update pr

airbytehq · Sep 26, 2023 · 96003e6 · 96003e6
1 parent ef0c9db
commit 96003e6
Show file tree

Hide file tree

Showing 3 changed files with 22 additions and 4 deletions.
diff --git a/airbyte-integrations/connectors/destination-milvus/destination_milvus/indexer.py b/airbyte-integrations/connectors/destination-milvus/destination_milvus/indexer.py
@@ -51,6 +51,7 @@ def _create_client(self):
 
         self._collection = Collection(self.config.collection)
         self._collection.load()
+        self._primary_key = next((field["name"] for field in self._collection.describe()["fields"] if field["is_primary"]), None)
 
     def check(self) -> Optional[str]:
         deployment_mode = os.environ.get("DEPLOYMENT_MODE", "")
@@ -94,6 +95,18 @@ def _delete_for_filter(self, expr: str) -> None:
             self._collection.delete(expr=f"{id_field} in [{id_list_expr}]")
             page = iterator.next()
 
+    def _normalize(self, metadata: dict) -> dict:
+        result = {}
+
+        for key, value in metadata.items():
+            normalized_key = key
+            # the primary key can't be set directly with auto_id, so we prefix it with an underscore
+            if key == self._primary_key:
+                normalized_key = f"_{key}"
+            result[normalized_key] = value
+
+        return result
+
     def index(self, document_chunks: List[Chunk], delete_ids: List[str]) -> None:
         if len(delete_ids) > 0:
             id_list_expr = ", ".join([f'"{id}"' for id in delete_ids])
@@ -102,5 +115,7 @@ def index(self, document_chunks: List[Chunk], delete_ids: List[str]) -> None:
         entities = []
         for i in range(len(document_chunks)):
             chunk = document_chunks[i]
-            entities.append({**chunk.metadata, self.config.vector_field: chunk.embedding, self.config.text_field: chunk.page_content})
+            entities.append(
+                {**self._normalize(chunk.metadata), self.config.vector_field: chunk.embedding, self.config.text_field: chunk.page_content}
+            )
         self._collection.insert(entities)
diff --git a/airbyte-integrations/connectors/destination-milvus/unit_tests/indexer_test.py b/airbyte-integrations/connectors/destination-milvus/unit_tests/indexer_test.py
@@ -130,10 +130,11 @@ def test_pre_sync_does_not_call_delete(self):
         self.milvus_indexer._collection.delete.assert_not_called()
 
     def test_index_calls_insert(self):
-        self.milvus_indexer.index([Mock(metadata={"key": "value"}, page_content="some content", embedding=[1,2,3])], [])
+        self.milvus_indexer._primary_key = "id"
+        self.milvus_indexer.index([Mock(metadata={"key": "value", "id": 5}, page_content="some content", embedding=[1,2,3])], [])
 
         self.milvus_indexer._collection.insert.assert_called_with(
-            [{"key": "value", "vector": [1,2,3], "text": "some content"}]
+            [{"key": "value", "vector": [1,2,3], "text": "some content", "_id": 5}]
         )
 
     def test_index_calls_delete(self):

diff --git a/docs/integrations/destinations/milvus.md b/docs/integrations/destinations/milvus.md
@@ -61,6 +61,8 @@ To get started, create a new collection in your Milvus instance. Make sure that
 * The primary key field is set to [auto_id](https://milvus.io/docs/create_collection.md)
 * There is a vector field with the correct dimensionality (1536 for OpenAI, 1024 for Cohere) and [a configured index](https://milvus.io/docs/build_index.md)
 
+If the record contains a field with the same name as the primary key, it will be prefixed with an underscore so Milvus can control the primary key internally.
+
 ### Setting up a collection
 
 When using the Zilliz cloud, this can be done using the UI - in this case only the collection name and the vector dimensionality need to be configured; the vector field with index will be automatically created under the name `vector`. Using the REST API, the following command will create the index:
@@ -103,5 +105,5 @@ vector_store.similarity_search("test")
 
 | Version | Date       | Pull Request                                                  | Subject                                                                                                                                              |
 |:--------| :--------- |:--------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------|
-| 0.0.2   | 2023-08-25 | [#30689](https://github.com/airbytehq/airbyte/pull/30689)     | Update CDK to support azure OpenAI embeddings and text splitting options | 
+| 0.0.2   | 2023-08-25 | [#30689](https://github.com/airbytehq/airbyte/pull/30689)     | Update CDK to support azure OpenAI embeddings and text splitting options, make sure primary key field is not accidentally set | 
 | 0.0.1   | 2023-08-12 | [#29442](https://github.com/airbytehq/airbyte/pull/29442)     | Milvus connector with some embedders  |