Skip to content

Commit

Permalink
roll bugfix into update pr
Browse files Browse the repository at this point in the history
  • Loading branch information
Joe Reuter committed Sep 26, 2023
1 parent ef0c9db commit 96003e6
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def _create_client(self):

self._collection = Collection(self.config.collection)
self._collection.load()
self._primary_key = next((field["name"] for field in self._collection.describe()["fields"] if field["is_primary"]), None)

def check(self) -> Optional[str]:
deployment_mode = os.environ.get("DEPLOYMENT_MODE", "")
Expand Down Expand Up @@ -94,6 +95,18 @@ def _delete_for_filter(self, expr: str) -> None:
self._collection.delete(expr=f"{id_field} in [{id_list_expr}]")
page = iterator.next()

def _normalize(self, metadata: dict) -> dict:
    """Return a copy of ``metadata`` that is safe to insert as entity fields.

    A key equal to ``self._primary_key`` is renamed by prefixing it with an
    underscore; every other key is copied through unchanged. The input
    mapping is not modified and key order is preserved.

    Args:
        metadata: Mapping of field names to values for a single chunk.

    Returns:
        A new dict with the primary-key field (if present) renamed to
        ``_<primary key>``. If ``self._primary_key`` matches no key
        (for example when it is ``None``), the result is a plain shallow copy.
    """
    # the primary key can't be set directly with auto_id, so we prefix it with an underscore
    return {
        (f"_{key}" if key == self._primary_key else key): value
        for key, value in metadata.items()
    }

def index(self, document_chunks: List[Chunk], delete_ids: List[str]) -> None:
if len(delete_ids) > 0:
id_list_expr = ", ".join([f'"{id}"' for id in delete_ids])
Expand All @@ -102,5 +115,7 @@ def index(self, document_chunks: List[Chunk], delete_ids: List[str]) -> None:
entities = []
for i in range(len(document_chunks)):
chunk = document_chunks[i]
entities.append({**chunk.metadata, self.config.vector_field: chunk.embedding, self.config.text_field: chunk.page_content})
entities.append(
{**self._normalize(chunk.metadata), self.config.vector_field: chunk.embedding, self.config.text_field: chunk.page_content}
)
self._collection.insert(entities)
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,11 @@ def test_pre_sync_does_not_call_delete(self):
self.milvus_indexer._collection.delete.assert_not_called()

def test_index_calls_insert(self):
self.milvus_indexer.index([Mock(metadata={"key": "value"}, page_content="some content", embedding=[1,2,3])], [])
self.milvus_indexer._primary_key = "id"
self.milvus_indexer.index([Mock(metadata={"key": "value", "id": 5}, page_content="some content", embedding=[1,2,3])], [])

self.milvus_indexer._collection.insert.assert_called_with(
[{"key": "value", "vector": [1,2,3], "text": "some content"}]
[{"key": "value", "vector": [1,2,3], "text": "some content", "_id": 5}]
)

def test_index_calls_delete(self):
Expand Down
4 changes: 3 additions & 1 deletion docs/integrations/destinations/milvus.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ To get started, create a new collection in your Milvus instance. Make sure that
* The primary key field is set to [auto_id](https://milvus.io/docs/create_collection.md)
* There is a vector field with the correct dimensionality (1536 for OpenAI, 1024 for Cohere) and [a configured index](https://milvus.io/docs/build_index.md)

If the record contains a field with the same name as the primary key, it will be prefixed with an underscore so Milvus can control the primary key internally.

### Setting up a collection

When using the Zilliz cloud, this can be done using the UI - in this case only the collection name and the vector dimensionality need to be configured; the vector field with index will be automatically created under the name `vector`. Using the REST API, the following command will create the index:
Expand Down Expand Up @@ -103,5 +105,5 @@ vector_store.similarity_search("test")

| Version | Date | Pull Request | Subject |
|:--------| :--------- |:--------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------|
| 0.0.2 | 2023-08-25 | [#30689](https://github.com/airbytehq/airbyte/pull/30689) | Update CDK to support azure OpenAI embeddings and text splitting options |
| 0.0.2 | 2023-08-25 | [#30689](https://github.com/airbytehq/airbyte/pull/30689) | Update CDK to support azure OpenAI embeddings and text splitting options, make sure primary key field is not accidentally set |
| 0.0.1 | 2023-08-12 | [#29442](https://github.com/airbytehq/airbyte/pull/29442) | Milvus connector with some embedders |

0 comments on commit 96003e6

Please sign in to comment.