Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid VDB Index copy with 'like' API #2687

Merged
Merged 11 commits on Nov 21, 2023.
2 changes: 2 additions & 0 deletions deeplake/core/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1168,6 +1168,8 @@ def create_tensor_like(
del meta["version"]
del meta["name"]
del meta["links"]
if "vdb_indexes" in meta:
del meta["vdb_indexes"]
meta["dtype"] = np.dtype(meta["typestr"]) if meta["typestr"] else meta["dtype"]

destination_tensor = self._create_tensor(
Expand Down
6 changes: 4 additions & 2 deletions deeplake/core/index_maintenance.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,9 @@
if not below_threshold:
return INDEX_OP_TYPE.CREATE_INDEX
else:
if (
sounakr marked this conversation as resolved.
Show resolved Hide resolved
if changed_data_len == 0:
return INDEX_OP_TYPE.NOOP

Check warning on line 138 in deeplake/core/index_maintenance.py

View check run for this annotation

Codecov / codecov/patch

deeplake/core/index_maintenance.py#L138

Added line #L138 was not covered by tests
elif (
not index_regeneration
and check_index_params(self)
and check_incr_threshold(len(self.dataset), changed_data_len)
Expand All @@ -157,7 +159,7 @@
if not below_threshold:
return INDEX_OP_TYPE.CREATE_INDEX

if not check_vdb_indexes(self):
if not check_vdb_indexes(self) or changed_data_len == 0:
return INDEX_OP_TYPE.NOOP

if index_delete:
Expand Down
3 changes: 2 additions & 1 deletion deeplake/core/tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,9 +350,10 @@ def extend(
Raises:
TensorDtypeMismatchError: Dtype for array must be equal to or castable to this tensor's dtype.
"""
old_dataset_length = self.num_samples
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this change really necessary?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, in order to keep track of the initial records.

self._extend(samples, progressbar=progressbar, ignore_errors=ignore_errors)
if index_maintenance.validate_embedding_tensor(self):
row_ids = list(range(self.num_samples, self.num_samples + len(samples)))
row_ids = list(range(old_dataset_length, old_dataset_length + len(samples)))
index_maintenance.index_operation_dataset( # TODO: this might pick the wrong tensor when we support
self.dataset, # index for multiple tensors in the future
dml_type=_INDEX_OPERATION_MAPPING["ADD"],
Expand Down
54 changes: 54 additions & 0 deletions deeplake/core/vectorstore/test_deeplake_vectorstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -1789,6 +1789,60 @@ def test_vdb_index_incr_maint_tensor_append(local_path, capsys, hub_cloud_dev_to

vector_store.delete_by_path(local_path, token=ds.token)

@requires_libdeeplake
def test_vdb_index_like(local_path, capsys, hub_cloud_dev_token):
    """Regression test for the 'like' API: copying a dataset with ``deeplake.like``
    must not carry over (or choke on) the source dataset's VDB index metadata.

    Steps exercised:
      1. Build a vector store with an HNSW index (threshold 200, L2 metric).
      2. Verify the store's contents and that exactly one VDB index was created.
      3. Re-open the dataset read-only, clone its structure via ``deeplake.like``
         into an in-memory dataset, and extend every tensor with the source data
         (this would fail if the index metadata were copied into the clone).
      4. Clean up the on-disk store.
    """
    number_of_data = 1000
    texts, embeddings, ids, metadatas, _ = utils.create_data(
        number_of_data=number_of_data, embedding_dim=EMBEDDING_DIM
    )

    # initialize vector store object with vdb index threshold as 200.
    # 1000 rows > 200 threshold, so index creation is triggered on add().
    vector_store = DeepLakeVectorStore(
        path=local_path,
        overwrite=True,
        verbose=True,
        exec_option="compute_engine",
        index_params={"threshold": 200, "distance_metric": "L2"},
        token=hub_cloud_dev_token,
    )

    vector_store.add(embedding=embeddings, text=texts, id=ids, metadata=metadatas)

    # Sanity-check row count and the expected tensor layout of the store.
    assert len(vector_store) == number_of_data
    assert set(vector_store.dataset.tensors) == set(
        [
            "embedding",
            "id",
            "metadata",
            "text",
        ]
    )
    assert set(vector_store.tensors()) == set(
        [
            "embedding",
            "id",
            "metadata",
            "text",
        ]
    )

    # Check if the index is recreated properly.
    # Exactly one HNSW index with l2_norm distance should exist on `embedding`.
    ds = vector_store.dataset
    es = ds.embedding.get_vdb_indexes()
    assert len(es) == 1
    assert es[0]["id"] == "hnsw_1"
    assert es[0]["distance"] == "l2_norm"
    assert es[0]["type"] == "hnsw"

    # Re-open read-only so `like` clones from a clean, committed view.
    ds = deeplake.load(path=local_path, read_only=True)

    # Clone the structure in-memory; must not copy the VDB index metadata.
    ds2 = deeplake.like('mem://dummy', ds, overwrite=True)

    # Extending the clone exercises the code path that previously tripped on
    # stale vdb_indexes entries copied by create_tensor_like.
    for tensor in ds2.tensors:
        ds2[tensor].extend(ds[tensor].data()['value'])

    vector_store.delete_by_path(local_path, token=ds.token)


def assert_vectorstore_structure(vector_store, number_of_data):
assert len(vector_store) == number_of_data
Expand Down