
Avoid VDB Index copy with 'like' API #2687

Merged · 11 commits · Nov 21, 2023
deeplake/core/dataset/dataset.py: 2 additions & 0 deletions
```diff
@@ -1168,6 +1168,8 @@ def create_tensor_like(
         del meta["version"]
         del meta["name"]
         del meta["links"]
+        if "vdb_indexes" in meta:
+            del meta["vdb_indexes"]
         meta["dtype"] = np.dtype(meta["typestr"]) if meta["typestr"] else meta["dtype"]

         destination_tensor = self._create_tensor(
```
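The two added lines treat the VDB index as instance-specific state, like `version`, `name`, and `links`, rather than part of the copyable schema. A minimal sketch of that idea, using a plain `dict` in place of deeplake's tensor-meta object (the helper name and sample meta are illustrative, not library API):

```python
# Sketch only: `meta` is a plain dict standing in for the tensor meta.
def copyable_meta(meta: dict) -> dict:
    meta = dict(meta)  # shallow copy; leave the source tensor's meta untouched
    for key in ("version", "name", "links"):
        meta.pop(key, None)
    # An HNSW index is built from the source tensor's rows, so it would be
    # stale (and expensive to copy) in a fresh dataset; drop it and let the
    # destination decide whether to build its own.
    meta.pop("vdb_indexes", None)
    return meta

src = {"name": "embedding", "dtype": "float32", "links": {}, "version": "3.8.0",
       "vdb_indexes": [{"id": "hnsw_1", "type": "hnsw"}]}
assert copyable_meta(src) == {"dtype": "float32"}
```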
deeplake/core/index_maintenance.py: 1 addition & 1 deletion
```diff
@@ -118,7 +118,7 @@ def index_operation_type_dataset(self, num_rows, changed_data_len):
     if not below_threshold:
         return INDEX_OP_TYPE.CREATE_INDEX

-    if not check_vdb_indexes(self):
+    if not check_vdb_indexes(self) or changed_data_len == 0:
         return INDEX_OP_TYPE.NOOP

     return INDEX_OP_TYPE.INCREMENTAL_INDEX
```
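Restated outside the library, the decision now has three outcomes; the enum values and the free-standing signature below are stand-ins for the real `index_operation_type_dataset`, which derives `below_threshold` and `check_vdb_indexes(self)` from the dataset itself:

```python
from enum import Enum, auto

class INDEX_OP_TYPE(Enum):
    NOOP = auto()
    CREATE_INDEX = auto()
    INCREMENTAL_INDEX = auto()

def index_op(below_threshold: bool, has_vdb_indexes: bool, changed_data_len: int) -> INDEX_OP_TYPE:
    if not below_threshold:
        return INDEX_OP_TYPE.CREATE_INDEX  # dataset outgrew the threshold: full build
    # New guard: zero changed rows (e.g. a dataset populated via `like` before
    # any appends) is a no-op instead of falling through to an incremental update.
    if not has_vdb_indexes or changed_data_len == 0:
        return INDEX_OP_TYPE.NOOP
    return INDEX_OP_TYPE.INCREMENTAL_INDEX

assert index_op(True, True, 0) is INDEX_OP_TYPE.NOOP
assert index_op(True, True, 10) is INDEX_OP_TYPE.INCREMENTAL_INDEX
```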
deeplake/core/vectorstore/test_deeplake_vectorstore.py: 55 additions & 0 deletions
```diff
@@ -1762,6 +1762,61 @@ def test_vdb_index_incr_maint_tensor_append(local_path, capsys, hub_cloud_dev_token):
     vector_store.delete_by_path(local_path, token=ds.token)


+@requires_libdeeplake
+def test_vdb_index_like(local_path, capsys, hub_cloud_dev_token):
+    number_of_data = 1000
+    texts, embeddings, ids, metadatas, _ = utils.create_data(
+        number_of_data=number_of_data, embedding_dim=EMBEDDING_DIM
+    )
+
+    # initialize vector store object with vdb index threshold as 200.
+    vector_store = DeepLakeVectorStore(
+        path=local_path,
+        overwrite=True,
+        verbose=True,
+        exec_option="compute_engine",
+        index_params={"threshold": 200, "distance_metric": "L2"},
+        token=hub_cloud_dev_token,
+    )
+
+    vector_store.add(embedding=embeddings, text=texts, id=ids, metadata=metadatas)
+
+    assert len(vector_store) == number_of_data
+    assert set(vector_store.dataset.tensors) == set(
+        [
+            "embedding",
+            "id",
+            "metadata",
+            "text",
+        ]
+    )
+    assert set(vector_store.tensors()) == set(
+        [
+            "embedding",
+            "id",
+            "metadata",
+            "text",
+        ]
+    )
+
+    # Check if the index is recreated properly.
+    ds = vector_store.dataset
+    es = ds.embedding.get_vdb_indexes()
+    assert len(es) == 1
+    assert es[0]["id"] == "hnsw_1"
+    assert es[0]["distance"] == "l2_norm"
+    assert es[0]["type"] == "hnsw"
+
+    ds = deeplake.load(path=local_path, read_only=True)
+
+    ds2 = deeplake.like("mem://dummy", ds, overwrite=True)
+
+    for tensor in ds2.tensors:
+        ds2[tensor].extend(ds[tensor].data()["value"])
+
+    vector_store.delete_by_path(local_path, token=hub_cloud_dev_token)
+
+
 def assert_vectorstore_structure(vector_store, number_of_data):
     assert len(vector_store) == number_of_data
     assert set(vector_store.dataset.tensors) == {
```
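Taken together, the behavior the new test pins down is that `deeplake.like` now produces a copy whose tensors carry no VDB index, so the copy can be filled without touching a stale index. A usage sketch under those assumptions (paths are placeholders, and `get_vdb_indexes()` returning an empty list on an unindexed tensor is inferred from the test above, not confirmed API documentation):

```python
import deeplake

src = deeplake.load(path="./indexed_dataset", read_only=True)  # "embedding" carries an HNSW index
dst = deeplake.like("mem://dummy", src, overwrite=True)        # copies the schema, not the data

# With this PR the index is not copied along with the tensor meta...
assert dst.embedding.get_vdb_indexes() == []  # assumption: fresh tensor has no index entries

# ...so bulk-filling the copy is a plain append, with no incremental
# index maintenance triggered for rows the old index never covered.
for tensor in dst.tensors:
    dst[tensor].extend(src[tensor].data()["value"])
```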