From f1fac856c9b4166969ca281cac9c667fe6dbd61c Mon Sep 17 00:00:00 2001 From: Sounak Chakraborty Date: Fri, 3 Nov 2023 18:42:10 -0700 Subject: [PATCH 1/7] - In Like Command prohibit vdb_index being copied. --- deeplake/core/dataset/dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deeplake/core/dataset/dataset.py b/deeplake/core/dataset/dataset.py index 465630e230..c95990d4b3 100644 --- a/deeplake/core/dataset/dataset.py +++ b/deeplake/core/dataset/dataset.py @@ -1168,6 +1168,8 @@ def create_tensor_like( del meta["version"] del meta["name"] del meta["links"] + if "vdb_indexes" in meta: + del meta["vdb_indexes"] meta["dtype"] = np.dtype(meta["typestr"]) if meta["typestr"] else meta["dtype"] destination_tensor = self._create_tensor( From 419b629fb06deb9a08da423e277b96f44c35343b Mon Sep 17 00:00:00 2001 From: Sounak Chakraborty Date: Mon, 6 Nov 2023 11:29:39 -0800 Subject: [PATCH 2/7] - NOOP Index maintenance when no rows got updated. --- deeplake/core/index_maintenance.py | 6 ++++-- deeplake/core/tensor.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/deeplake/core/index_maintenance.py b/deeplake/core/index_maintenance.py index a0d3e50f36..4df4ba92ef 100644 --- a/deeplake/core/index_maintenance.py +++ b/deeplake/core/index_maintenance.py @@ -134,7 +134,9 @@ def index_operation_type_vectorstore( if not below_threshold: return INDEX_OP_TYPE.CREATE_INDEX else: - if ( + if changed_data_len == 0: + return INDEX_OP_TYPE.NOOP + elif ( not index_regeneration and check_index_params(self) and check_incr_threshold(len(self.dataset), changed_data_len) @@ -157,7 +159,7 @@ def index_operation_type_dataset( if not below_threshold: return INDEX_OP_TYPE.CREATE_INDEX - if not check_vdb_indexes(self): + if not check_vdb_indexes(self) or changed_data_len == 0: return INDEX_OP_TYPE.NOOP if index_delete: diff --git a/deeplake/core/tensor.py b/deeplake/core/tensor.py index 65aaf8c382..19ff6ad74b 100644 --- a/deeplake/core/tensor.py +++ b/deeplake/core/tensor.py @@ -350,9 +350,10 @@ def extend( Raises: TensorDtypeMismatchError: Dtype for array must be equal to or castable to this tensor's dtype. """ + old_dataset_length = self.num_samples self._extend(samples, progressbar=progressbar, ignore_errors=ignore_errors) if index_maintenance.validate_embedding_tensor(self): - row_ids = list(range(self.num_samples, self.num_samples + len(samples))) + row_ids = list(range(old_dataset_length, old_dataset_length + len(samples))) index_maintenance.index_operation_dataset( # TODO: this might pick the wrong tensor when we support self.dataset, # index for multiple tensors in the future dml_type=_INDEX_OPERATION_MAPPING["ADD"], From 154ecf340c4b3091910cca44c87c3dc81f25e09e Mon Sep 17 00:00:00 2001 From: Sounak Chakraborty Date: Tue, 14 Nov 2023 19:23:45 +0530 Subject: [PATCH 3/7] - Test Case Update --- .../vectorstore/test_deeplake_vectorstore.py | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/deeplake/core/vectorstore/test_deeplake_vectorstore.py b/deeplake/core/vectorstore/test_deeplake_vectorstore.py index 828d0690c8..27855ebbc2 100644 --- a/deeplake/core/vectorstore/test_deeplake_vectorstore.py +++ b/deeplake/core/vectorstore/test_deeplake_vectorstore.py @@ -1777,6 +1777,61 @@ def test_vdb_index_incr_maint_tensor_append(local_path, capsys, hub_cloud_dev_to vector_store.delete_by_path(local_path, token=ds.token) +@requires_libdeeplake +def test_vdb_index_like(local_path, capsys, hub_cloud_dev_token): + number_of_data = 1000 + texts, embeddings, ids, metadatas, _ = utils.create_data( + number_of_data=number_of_data, embedding_dim=EMBEDDING_DIM + ) + + # initialize vector store object with vdb index threshold as 200. + vector_store = DeepLakeVectorStore( + path=local_path, + overwrite=True, + verbose=True, + exec_option="compute_engine", + index_params={"threshold": 200, "distance_metric": "L2"}, + token=hub_cloud_dev_token, + ) + + vector_store.add(embedding=embeddings, text=texts, id=ids, metadata=metadatas) + + assert len(vector_store) == number_of_data + assert set(vector_store.dataset.tensors) == set( + [ + "embedding", + "id", + "metadata", + "text", + ] + ) + assert set(vector_store.tensors()) == set( + [ + "embedding", + "id", + "metadata", + "text", + ] + ) + + # Check if the index is recreated properly. + ds = vector_store.dataset + es = ds.embedding.get_vdb_indexes() + assert len(es) == 1 + assert es[0]["id"] == "hnsw_1" + assert es[0]["distance"] == "l2_norm" + assert es[0]["type"] == "hnsw" + + ds = deeplake.load(path=local_path, read_only=True) + + ds2 = deeplake.like('mem://dummy', ds, overwrite=True) + + + for tensor in ds2.tensors: + ds2[tensor].extend(ds[tensor].data()['value']) + + vector_store.delete_by_path(local_path, token=ds.token) + def assert_vectorstore_structure(vector_store, number_of_data): assert len(vector_store) == number_of_data @@ -2597,3 +2652,5 @@ def test_vectorstore_factory(hub_cloud_dev_token, hub_cloud_path, runtime): assert isinstance(db, DeepMemoryVectorStore) else: assert isinstance(db, DeepLakeVectorStore) + + From 1d886a0991b34a3c45accb455719a54b1ef4d173 Mon Sep 17 00:00:00 2001 From: Sounak Chakraborty Date: Tue, 14 Nov 2023 19:24:22 +0530 Subject: [PATCH 4/7] - Test Case Update --- deeplake/core/vectorstore/test_deeplake_vectorstore.py | 1 - 1 file changed, 1 deletion(-) diff --git a/deeplake/core/vectorstore/test_deeplake_vectorstore.py b/deeplake/core/vectorstore/test_deeplake_vectorstore.py index 27855ebbc2..0d4526c4ba 100644 --- a/deeplake/core/vectorstore/test_deeplake_vectorstore.py +++ b/deeplake/core/vectorstore/test_deeplake_vectorstore.py @@ -1826,7 +1826,6 @@ def test_vdb_index_like(local_path, capsys, hub_cloud_dev_token): ds2 = deeplake.like('mem://dummy', ds, overwrite=True) - for tensor in ds2.tensors: ds2[tensor].extend(ds[tensor].data()['value']) From 962098aa6d395afd62d9d9da0852163b86a890f1 Mon Sep 17 00:00:00 2001 From: Sounak Chakraborty Date: Wed, 15 Nov 2023 23:26:49 +0530 Subject: [PATCH 5/7] - Test Case Update --- deeplake/core/vectorstore/test_deeplake_vectorstore.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/deeplake/core/vectorstore/test_deeplake_vectorstore.py b/deeplake/core/vectorstore/test_deeplake_vectorstore.py index 0d4526c4ba..bd80aec96b 100644 --- a/deeplake/core/vectorstore/test_deeplake_vectorstore.py +++ b/deeplake/core/vectorstore/test_deeplake_vectorstore.py @@ -2651,5 +2651,3 @@ def test_vectorstore_factory(hub_cloud_dev_token, hub_cloud_path, runtime): assert isinstance(db, DeepMemoryVectorStore) else: assert isinstance(db, DeepLakeVectorStore) - - From bd3c6b034b2b01500da243eb302af4af8b47a949 Mon Sep 17 00:00:00 2001 From: Sounak Chakraborty Date: Thu, 16 Nov 2023 09:40:01 +0530 Subject: [PATCH 6/7] - Black Formatting. --- deeplake/core/vectorstore/test_deeplake_vectorstore.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/deeplake/core/vectorstore/test_deeplake_vectorstore.py b/deeplake/core/vectorstore/test_deeplake_vectorstore.py index 8dd866467d..fe5c4cc857 100644 --- a/deeplake/core/vectorstore/test_deeplake_vectorstore.py +++ b/deeplake/core/vectorstore/test_deeplake_vectorstore.py @@ -1761,6 +1761,7 @@ def test_vdb_index_incr_maint_tensor_append(local_path, capsys, hub_cloud_dev_to vector_store.delete_by_path(local_path, token=ds.token) + @requires_libdeeplake def test_vdb_index_like(local_path, capsys, hub_cloud_dev_token): number_of_data = 1000 @@ -1808,10 +1809,10 @@ def test_vdb_index_like(local_path, capsys, hub_cloud_dev_token): ds = deeplake.load(path=local_path, read_only=True) - ds2 = deeplake.like('mem://dummy', ds, overwrite=True) + ds2 = deeplake.like("mem://dummy", ds, overwrite=True) for tensor in ds2.tensors: - ds2[tensor].extend(ds[tensor].data()['value']) + ds2[tensor].extend(ds[tensor].data()["value"]) vector_store.delete_by_path(local_path, token=ds.token) From e18763734aa68f9fed3ccc7c070b1b47423c76ae Mon Sep 17 00:00:00 2001 From: Sounak Chakraborty Date: Thu, 16 Nov 2023 12:54:14 +0530 Subject: [PATCH 7/7] - Update token in test case. --- deeplake/core/vectorstore/test_deeplake_vectorstore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deeplake/core/vectorstore/test_deeplake_vectorstore.py b/deeplake/core/vectorstore/test_deeplake_vectorstore.py index fe5c4cc857..abb7faddf0 100644 --- a/deeplake/core/vectorstore/test_deeplake_vectorstore.py +++ b/deeplake/core/vectorstore/test_deeplake_vectorstore.py @@ -1814,7 +1814,7 @@ def test_vdb_index_like(local_path, capsys, hub_cloud_dev_token): for tensor in ds2.tensors: ds2[tensor].extend(ds[tensor].data()["value"]) - vector_store.delete_by_path(local_path, token=ds.token) + vector_store.delete_by_path(local_path, token=hub_cloud_dev_token) def assert_vectorstore_structure(vector_store, number_of_data):