From b1a4746928a243fd134433d3052e3329c441e033 Mon Sep 17 00:00:00 2001 From: AdkSarsen Date: Thu, 23 Nov 2023 22:28:57 +0600 Subject: [PATCH 01/16] Refactoring deepmemory --- .pre-commit-config.yaml | 10 + deeplake/client/client.py | 2 +- deeplake/client/config.py | 9 + deeplake/client/managed/__init__.py | 0 deeplake/client/managed/managed_client.py | 186 ++++++++ deeplake/client/managed/models.py | 30 ++ deeplake/client/utils.py | 2 +- deeplake/core/sample.py | 2 +- deeplake/core/storage/azure.py | 4 +- deeplake/core/vectorstore/__init__.py | 4 +- .../vectorstore/dataset_handlers/__init__.py | 3 + .../client_side_dataset_handler.py | 399 ++++++++++++++++++ .../dataset_handlers/dataset_handler.py | 15 + .../dataset_handlers/dataset_handler_base.py | 216 ++++++++++ .../managed_side_dataset_handler.py | 304 +++++++++++++ .../core/vectorstore/deep_memory/__init__.py | 1 + .../{ => deep_memory}/deep_memory.py | 124 ++++-- .../{ => deep_memory}/test_deepmemory.py | 14 +- .../core/vectorstore/deeplake_vectorstore.py | 376 +++-------------- .../vectorstore/deepmemory_vectorstore.py | 54 --- .../core/vectorstore/embeddings/__init__.py | 0 .../vectorstore/{ => embeddings}/embedder.py | 0 .../{ => embeddings}/test_embedder.py | 5 +- .../vectorstore/test_deeplake_vectorstore.py | 312 ++++++++------ .../vectorstore/unsupported_deep_memory.py | 39 -- .../vector_search/dataset/dataset.py | 7 + .../vector_search/dataset/test_dataset.py | 2 +- .../core/vectorstore/vector_search/utils.py | 22 +- .../core/vectorstore/vectorstore_factory.py | 29 -- .../integrations/huggingface/huggingface.py | 2 +- deeplake/requirements/tests.txt | 4 +- deeplake/util/check_latest_version.py | 2 +- deeplake/visualizer/tests/test_visualizer.py | 2 +- docs/source/deeplake.VectorStore.rst | 11 +- .../deeplake.core.vectorstore.deep_memory.rst | 2 +- docs/source/deeplake.core.vectorstore.rst | 11 - 36 files changed, 1567 insertions(+), 638 deletions(-) create mode 100644 .pre-commit-config.yaml create mode 100644 deeplake/client/managed/__init__.py create mode 100644 deeplake/client/managed/managed_client.py create mode 100644 deeplake/client/managed/models.py create mode 100644 deeplake/core/vectorstore/dataset_handlers/__init__.py create mode 100644 deeplake/core/vectorstore/dataset_handlers/client_side_dataset_handler.py create mode 100644 deeplake/core/vectorstore/dataset_handlers/dataset_handler.py create mode 100644 deeplake/core/vectorstore/dataset_handlers/dataset_handler_base.py create mode 100644 deeplake/core/vectorstore/dataset_handlers/managed_side_dataset_handler.py create mode 100644 deeplake/core/vectorstore/deep_memory/__init__.py rename deeplake/core/vectorstore/{ => deep_memory}/deep_memory.py (88%) rename deeplake/core/vectorstore/{ => deep_memory}/test_deepmemory.py (98%) delete mode 100644 deeplake/core/vectorstore/deepmemory_vectorstore.py create mode 100644 deeplake/core/vectorstore/embeddings/__init__.py rename deeplake/core/vectorstore/{ => embeddings}/embedder.py (100%) rename deeplake/core/vectorstore/{ => embeddings}/test_embedder.py (96%) delete mode 100644 deeplake/core/vectorstore/unsupported_deep_memory.py delete mode 100644 deeplake/core/vectorstore/vectorstore_factory.py delete mode 100644 docs/source/deeplake.core.vectorstore.rst diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000..fa00d5a7a3 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,10 @@ +repos: + - repo: https://github.com/psf/black + rev: 23.11.0 + hooks: + - id: black + + - 
repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.7.0 + hooks: + - id: mypy diff --git a/deeplake/client/client.py b/deeplake/client/client.py index a0aed3c043..416179d288 100644 --- a/deeplake/client/client.py +++ b/deeplake/client/client.py @@ -1,5 +1,5 @@ import deeplake -import requests +import requests # type: ignore import textwrap from typing import Any, Optional, Dict, List, Union from deeplake.util.exceptions import ( diff --git a/deeplake/client/config.py b/deeplake/client/config.py index 326a4f5e5b..b01b20d7b4 100644 --- a/deeplake/client/config.py +++ b/deeplake/client/config.py @@ -31,3 +31,12 @@ DEEPLAKE_AUTH_TOKEN = "ACTIVELOOP_TOKEN" ORG_PERMISSION_SUFFIX = "/api/organizations/{}/features/dataset_query" + +# ManagedService Endpoints +INIT_VECTORSTORE_SUFFIX = "/api/dlserver/vectorstore/init" +GET_VECTORSTORE_SUMMARY_SUFFIX = "/api/dlserver/vectorstore/{}/{}/summary" +DELETE_VECTORSTORE_SUFFIX = "/api/dlserver/vectorstore" + +VECTORSTORE_SEARCH_SUFFIX = "/api/dlserver/vectorstore/search" +VECTORSTORE_ADD_SUFFIX = "/api/dlserver/vectorstore/add" +VECTORSTORE_REMOVE_ROWS_SUFFIX = "/api/dlserver/vectorstore/remove" diff --git a/deeplake/client/managed/__init__.py b/deeplake/client/managed/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/deeplake/client/managed/managed_client.py b/deeplake/client/managed/managed_client.py new file mode 100644 index 0000000000..0b4089ea68 --- /dev/null +++ b/deeplake/client/managed/managed_client.py @@ -0,0 +1,186 @@ +import numpy as np +from typing import Callable, Dict, List, Any, Optional, Union + +from deeplake.client.client import DeepLakeBackendClient +from deeplake.client.utils import ( + check_response_status, +) +from deeplake.client.config import ( + GET_VECTORSTORE_SUMMARY_SUFFIX, + INIT_VECTORSTORE_SUFFIX, + DELETE_VECTORSTORE_SUFFIX, + VECTORSTORE_ADD_SUFFIX, + VECTORSTORE_REMOVE_ROWS_SUFFIX, + VECTORSTORE_SEARCH_SUFFIX, +) + +from deeplake.client.managed.models import ( + VectorStoreSummaryResponse, + VectorStoreInitResponse, + VectorStoreSearchResponse, + VectorStoreAddResponse, +) + + +class ManagedServiceClient(DeepLakeBackendClient): + def _preprocess_embedding(self, embedding: Union[List[float], np.ndarray, None]): + if embedding is not None and isinstance(embedding, np.ndarray): + return embedding.tolist() + return embedding + + def init_vectorstore( + self, + path: str, + overwrite: Optional[bool] = None, + tensor_params: Optional[List[Dict[str, Any]]] = None, + ): + response = self.request( + method="POST", + relative_url=INIT_VECTORSTORE_SUFFIX, + json={ + "dataset": path, + "overwrite": overwrite, + "tensor_params": tensor_params, + }, + ) + data = response.json() + + return VectorStoreInitResponse( + status_code=response.status_code, + path=data["path"], + summary=data["summary"], + length=data["length"], + tensors=data["tensors"], + exists=data.get("exists", False), + ) + + def delete_vectorstore(self, path: str, force: bool = False): + response = self.request( + method="DELETE", + relative_url=DELETE_VECTORSTORE_SUFFIX, + json={"dataset": path, "force": force}, + ) + check_response_status(response) + + def get_vectorstore_summary(self, path: str): + org_id, dataset_id = path[6:].split("/") + response = self.request( + method="GET", + relative_url=GET_VECTORSTORE_SUMMARY_SUFFIX.format(org_id, dataset_id), + ) + check_response_status(response) + data = response.json() + + return VectorStoreSummaryResponse( + status_code=response.status_code, + summary=data["summary"], + 
length=data["length"], + tensors=data["tensors"], + ) + + def vectorstore_search( + self, + path: str, + embedding: Optional[Union[List[float], np.ndarray]] = None, + k: int = 4, + distance_metric: Optional[str] = None, + query: Optional[str] = None, + filter: Optional[Dict[str, str]] = None, + embedding_tensor: str = "embedding", + return_tensors: Optional[List[str]] = None, + deep_memory: bool = False, + ): + response = self.request( + method="POST", + relative_url=VECTORSTORE_SEARCH_SUFFIX, + json={ + "dataset": path, + "embedding": self._preprocess_embedding(embedding), + "k": k, + "distance_metric": distance_metric, + "query": query, + "filter": filter, + "embedding_tensor": embedding_tensor, + "return_tensors": return_tensors, + "deep_memory": deep_memory, + }, + ) + check_response_status(response) + data = response.json() + + return VectorStoreSearchResponse( + status_code=response.status_code, + length=data["length"], + data=data["data"], + ) + + def vectorstore_add( + self, + path: str, + processed_tensors: List[Dict[str, List[Any]]], + rate_limiter: Optional[Dict[str, Any]] = None, + batch_byte_size: Optional[int] = None, + return_ids: bool = False, + ): + rest_api_tensors = [] + for tensor in processed_tensors: + for key, value in tensor.items(): + tensor[key] = self._preprocess_embedding(value) + rest_api_tensors.append(tensor) + + response = self.request( + method="POST", + relative_url=VECTORSTORE_ADD_SUFFIX, + json={ + "dataset": path, + "data": rest_api_tensors, + "rate_limiter": rate_limiter, + "batch_byte_size": batch_byte_size, + "return_ids": return_ids, + }, + ) + check_response_status(response) + data = response.json().get("result", {}) + + return VectorStoreAddResponse( + status_code=response.status_code, ids=data.get("ids") + ) + + def vectorstore_remove_rows( + self, + path: str, + indices: Optional[List[int]] = None, + ids: Optional[List[str]] = None, + filter: Optional[Dict[str, str]] = None, + query: Optional[str] = None, + delete_all: bool = False, + ): + response = self.request( + method="POST", + relative_url=VECTORSTORE_REMOVE_ROWS_SUFFIX, + json={ + "dataset": path, + "indices": indices, + "ids": ids, + "filter": filter, + "query": query, + "delete_all": delete_all, + }, + ) + check_response_status(response) + + def vectorstore_update_embeddings( + self, + path: str, + row_ids: List[str], + ids: List[str], + filter: Union[Dict, Callable], + query: str, + embedding_function: Union[Callable, List[Callable]], + embedding_source_tensor: Union[str, List[str]], + embedding_tensor: Union[str, List[str]], + ): + """ + TODO: implement + """ + pass diff --git a/deeplake/client/managed/models.py b/deeplake/client/managed/models.py new file mode 100644 index 0000000000..99c4b61785 --- /dev/null +++ b/deeplake/client/managed/models.py @@ -0,0 +1,30 @@ +from typing import NamedTuple, Dict, List, Optional, Any + + +class VectorStoreSummaryResponse(NamedTuple): + status_code: int + summary: str + length: int + tensors: List[ + Dict[str, Any] + ] # Same format as `tensor_params` in `init_vectorstore` + + +class VectorStoreInitResponse(NamedTuple): + status_code: int + path: str + summary: str + length: int + tensors: List[Dict[str, Any]] + exists: bool + + +class VectorStoreSearchResponse(NamedTuple): + status_code: int + length: int + data: Dict[str, List[Any]] + + +class VectorStoreAddResponse(NamedTuple): + status_code: int + ids: Optional[List[str]] = None diff --git a/deeplake/client/utils.py b/deeplake/client/utils.py index f364ff08e7..84bd289055 100644 --- 
a/deeplake/client/utils.py +++ b/deeplake/client/utils.py @@ -1,6 +1,6 @@ import os import json -import requests +import requests # type: ignore import textwrap from pathlib import Path from typing import Dict, List, Any, Union, Optional diff --git a/deeplake/core/sample.py b/deeplake/core/sample.py index 3ec1defc42..b3fe31a519 100644 --- a/deeplake/core/sample.py +++ b/deeplake/core/sample.py @@ -1,4 +1,4 @@ -import requests +import requests # type: ignore from deeplake.core.compression import ( compress_array, decompress_array, diff --git a/deeplake/core/storage/azure.py b/deeplake/core/storage/azure.py index 187d552e90..90e1ee8759 100644 --- a/deeplake/core/storage/azure.py +++ b/deeplake/core/storage/azure.py @@ -12,8 +12,8 @@ from concurrent import futures try: - from azure.identity import DefaultAzureCredential - from azure.storage.blob import ( + from azure.identity import DefaultAzureCredential # type: ignore + from azure.storage.blob import ( # type: ignore BlobServiceClient, BlobSasPermissions, ContainerSasPermissions, diff --git a/deeplake/core/vectorstore/__init__.py b/deeplake/core/vectorstore/__init__.py index ce8a4dc86b..e6ad58f51d 100644 --- a/deeplake/core/vectorstore/__init__.py +++ b/deeplake/core/vectorstore/__init__.py @@ -11,8 +11,6 @@ from deeplake.core.vectorstore.vector_search.indra.search_algorithm import ( search as indra_search_algorithm, ) -from deeplake.core.vectorstore.vectorstore_factory import ( - vectorstore_factory as VectorStore, -) +from deeplake.core.vectorstore.deeplake_vectorstore import VectorStore DeepLakeVectorStore = VectorStore diff --git a/deeplake/core/vectorstore/dataset_handlers/__init__.py b/deeplake/core/vectorstore/dataset_handlers/__init__.py new file mode 100644 index 0000000000..82ee3ca7c7 --- /dev/null +++ b/deeplake/core/vectorstore/dataset_handlers/__init__.py @@ -0,0 +1,3 @@ +from deeplake.core.vectorstore.dataset_handlers.dataset_handler import ( + get_dataset_handler, +) diff --git a/deeplake/core/vectorstore/dataset_handlers/client_side_dataset_handler.py b/deeplake/core/vectorstore/dataset_handlers/client_side_dataset_handler.py new file mode 100644 index 0000000000..42b7589ad4 --- /dev/null +++ b/deeplake/core/vectorstore/dataset_handlers/client_side_dataset_handler.py @@ -0,0 +1,399 @@ +import logging +import pathlib +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np + +import deeplake +from deeplake.client.utils import read_token +from deeplake.constants import ( + DEFAULT_VECTORSTORE_DISTANCE_METRIC, + _INDEX_OPERATION_MAPPING, +) +from deeplake.core import index_maintenance +from deeplake.core.dataset import Dataset +from deeplake.core.vectorstore import utils +from deeplake.core.vectorstore.dataset_handlers.dataset_handler_base import DHBase +from deeplake.core.vectorstore.deep_memory.deep_memory import ( + use_deep_memory, + DeepMemory, +) +from deeplake.core.vectorstore.vector_search import dataset as dataset_utils +from deeplake.core.vectorstore.vector_search import vector_search +from deeplake.util.bugout_reporter import feature_report_path +from deeplake.util.exceptions import DeepMemoryWaitingListError + + +class ClientSideDH(DHBase): + def __init__( + self, + path: Union[str, pathlib.Path], + dataset: Dataset, + tensor_params: List[Dict[str, object]], + embedding_function: Any, + read_only: bool, + ingestion_batch_size: int, + index_params: Dict[str, Union[int, str]], + num_workers: int, + exec_option: str, + token: str, + overwrite: bool, + verbose: bool, + runtime: Dict, + creds: 
Union[Dict, str], + org_id: str, + logger: logging.Logger, + branch: str, + **kwargs: Any, + ): + super().__init__( + path=path, + dataset=dataset, + tensor_params=tensor_params, + embedding_function=embedding_function, + read_only=read_only, + ingestion_batch_size=ingestion_batch_size, + index_params=index_params, + num_workers=num_workers, + exec_option=exec_option, + token=token, + overwrite=overwrite, + verbose=True, + runtime=runtime, + creds=creds, + org_id=org_id, + logger=logger, + **kwargs, + ) + + self.index_params = utils.parse_index_params(index_params) + kwargs["index_params"] = self.index_params + self.dataset = dataset or dataset_utils.create_or_load_dataset( + tensor_params=tensor_params, + dataset_path=self.path, + token=self.token, + creds=self.creds, + logger=self.logger, + read_only=read_only, + exec_option=exec_option, + embedding_function=embedding_function, + overwrite=overwrite, + runtime=runtime, + org_id=self.org_id, + branch=branch, + **kwargs, + ) + self.verbose = verbose + self.tensor_params = tensor_params + self.distance_metric_index = index_maintenance.index_operation_vectorstore(self) + self.deep_memory = DeepMemory( + dataset_or_path=self.path, + token=self.token, + logger=self.logger, + embedding_function=self.embedding_function, + creds=self.creds, + ) + + def add( + self, + embedding_function: Union[Callable, List[Callable]], + embedding_data: Union[List, List[List]], + embedding_tensor: Union[str, List[str]], + return_ids: bool, + rate_limiter: Dict, + **tensors, + ): + feature_report_path( + path=self.bugout_reporting_path, + feature_name="vs.add", + parameters={ + "tensors": list(tensors.keys()) if tensors else None, + "embedding_tensor": embedding_tensor, + "return_ids": return_ids, + "embedding_function": True if embedding_function is not None else False, + "embedding_data": True if embedding_data is not None else False, + }, + token=self.token, + username=self.username, + ) + ( + embedding_function, + embedding_data, + embedding_tensor, + tensors, + ) = utils.parse_tensors_kwargs( + tensors, + embedding_function, + embedding_data, + embedding_tensor, + ) + + ( + embedding_function, + embedding_data, + embedding_tensor, + tensors, + ) = utils.parse_add_arguments( + dataset=self.dataset, + initial_embedding_function=self.embedding_function, + embedding_function=embedding_function, + embedding_data=embedding_data, + embedding_tensor=embedding_tensor, + **tensors, + ) + + processed_tensors, id_ = dataset_utils.preprocess_tensors( + embedding_data, embedding_tensor, self.dataset, **tensors + ) + + assert id_ is not None + + dataset_utils.extend_or_ingest_dataset( + processed_tensors=processed_tensors, + dataset=self.dataset, + embedding_function=embedding_function, + embedding_data=embedding_data, + embedding_tensor=embedding_tensor, + rate_limiter=rate_limiter, + logger=self.logger, + ) + + if self.verbose: + self.dataset.summary() + + if return_ids: + return id_ + return None + + @use_deep_memory + def search( + self, + embedding_data: Union[str, List[str]], + embedding_function: Callable, + embedding: Union[List[float], np.ndarray], + k: int, + distance_metric: str, + query: str, + filter: Union[Dict, Callable], + exec_option: str, + embedding_tensor: str, + return_tensors: List[str], + return_view: bool, + deep_memory: bool, + ) -> Union[Dict, Dataset]: + feature_report_path( + path=self.bugout_reporting_path, + feature_name="vs.search", + parameters={ + "embedding_data": True if embedding_data is not None else False, + "embedding_function": True 
if embedding_function is not None else False, + "k": k, + "distance_metric": distance_metric, + "query": query[0:100] if query is not None else False, + "filter": True if filter is not None else False, + "exec_option": exec_option, + "embedding_tensor": embedding_tensor, + "embedding": True if embedding is not None else False, + "return_tensors": return_tensors, + "return_view": return_view, + }, + token=self.token, + username=self.username, + ) + + if exec_option is None and self.exec_option != "python" and callable(filter): + self.logger.warning( + 'Switching exec_option to "python" (runs on client) because filter is specified as a function. ' + f'To continue using the original exec_option "{self.exec_option}", please specify the filter as a dictionary or use the "query" parameter to specify a TQL query.' + ) + exec_option = "python" + + exec_option = exec_option or self.exec_option + + if deep_memory and not self.deep_memory: + raise DeepMemoryWaitingListError() + + utils.parse_search_args( + embedding_data=embedding_data, + embedding_function=embedding_function, + initial_embedding_function=self.embedding_function, + embedding=embedding, + k=k, + distance_metric=distance_metric, + query=query, + filter=filter, + exec_option=exec_option, + embedding_tensor=embedding_tensor, + return_tensors=return_tensors, + ) + + return_tensors = utils.parse_return_tensors( + self.dataset, return_tensors, embedding_tensor, return_view + ) + embedding_function = utils.create_embedding_function(embedding_function) + query_emb: Optional[Union[List[float], np.ndarray[Any, Any]]] = None + if query is None: + query_emb = dataset_utils.get_embedding( + embedding, + embedding_data, + embedding_function=embedding_function or self.embedding_function, + ) + + if self.distance_metric_index: + distance_metric = index_maintenance.parse_index_distance_metric_from_params( + self.logger, self.distance_metric_index, distance_metric + ) + + distance_metric = distance_metric or DEFAULT_VECTORSTORE_DISTANCE_METRIC + + return vector_search.search( + query=query, + logger=self.logger, + filter=filter, + query_embedding=query_emb, + k=k, + distance_metric=distance_metric, + exec_option=exec_option, + deeplake_dataset=self.dataset, + embedding_tensor=embedding_tensor, + return_tensors=return_tensors, + return_view=return_view, + deep_memory=deep_memory, + token=self.token, + org_id=self.org_id, + ) + + def delete( + self, + row_ids: List[int], + ids: List[str], + filter: Union[Dict, Callable], + query: str, + exec_option: str, + delete_all: bool, + ) -> bool: + feature_report_path( + path=self.bugout_reporting_path, + feature_name="vs.delete", + parameters={ + "ids": True if ids is not None else False, + "row_ids": True if row_ids is not None else False, + "query": query[0:100] if query is not None else False, + "filter": True if filter is not None else False, + "exec_option": exec_option, + "delete_all": delete_all, + }, + token=self.token, + username=self.username, + ) + + if not row_ids: + row_ids = ( + dataset_utils.search_row_ids( + dataset=self.dataset, + search_fn=self.search, + ids=ids, + filter=filter, + query=query, + select_all=delete_all, + exec_option=exec_option or self.exec_option, + ) + or [] + ) + + ( + self.dataset, + dataset_deleted, + ) = dataset_utils.delete_all_samples_if_specified( + self.dataset, + delete_all, + ) + + self.dataset.pop_multiple(row_ids) + + return True + + def update_embedding( + self, + row_ids: List[str], + ids: List[str], + filter: Union[Dict, Callable], + query: str, + exec_option: 
str, + embedding_function: Union[Callable, List[Callable]], + embedding_source_tensor: Union[str, List[str]], + embedding_tensor: Union[str, List[str]], + ): + feature_report_path( + path=self.bugout_reporting_path, + feature_name="vs.delete", + parameters={ + "ids": True if ids is not None else False, + "row_ids": True if row_ids is not None else False, + "query": query[0:100] if query is not None else False, + "filter": True if filter is not None else False, + "exec_option": exec_option, + }, + token=self.token, + username=self.username, + ) + + ( + embedding_function, + embedding_source_tensor, + embedding_tensor, + ) = utils.parse_update_arguments( + dataset=self.dataset, + embedding_function=embedding_function, + initial_embedding_function=self.embedding_function, + embedding_source_tensor=embedding_source_tensor, + embedding_tensor=embedding_tensor, + ) + + if not row_ids: + row_ids = dataset_utils.search_row_ids( + dataset=self.dataset, + search_fn=self.search, + ids=ids, + filter=filter, + query=query, + exec_option=exec_option or self.exec_option, + ) + + embedding_tensor_data = utils.convert_embedding_source_tensor_to_embeddings( + dataset=self.dataset, + embedding_source_tensor=embedding_source_tensor, + embedding_tensor=embedding_tensor, + embedding_function=embedding_function, + row_ids=row_ids, + ) + + self.dataset[row_ids].update(embedding_tensor_data) + + def commit(self, allow_empty: bool = True) -> None: + """Commits the Vector Store. + + Args: + allow_empty (bool): Whether to allow empty commits. Defaults to True. + """ + self.dataset.commit(allow_empty=allow_empty) + + def checkout(self, branch: str = "main") -> None: + """Checkout the Vector Store to a specific branch. + + Args: + branch (str): Branch name to checkout. Defaults to "main". 
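
For reference, a minimal end-to-end sketch of the client-side path after this refactor, assuming the default tensor layout (`text`, `metadata`, `embedding`, `id`) and a stand-in embedding function; the local path and the printed result keys are illustrative rather than guaranteed by this patch:

```python
import numpy as np
from deeplake.core.vectorstore import VectorStore

def embed(texts):
    # Stand-in embedding function: one deterministic 16-d vector per text.
    rng = np.random.default_rng(0)
    return [rng.random(16).tolist() for _ in texts]

# With a local path, get_dataset_handler resolves to ClientSideDH, which owns
# the deeplake dataset and runs add/search on the client.
vs = VectorStore(path="./example_vectorstore", embedding_function=embed, overwrite=True)

vs.add(
    text=["first document", "second document"],
    metadata=[{"source": "a"}, {"source": "b"}],
    embedding_data=["first document", "second document"],
    embedding_tensor="embedding",
)

results = vs.search(embedding_data="first document", k=1)
print(results.keys())  # e.g. text / metadata / id / score, depending on return_tensors
```
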
+ """ + self.dataset.checkout(branch) + + def tensors(self): + """Returns the list of tensors present in the dataset""" + return self.dataset.tensors + + def summary(self): + """Prints a summary of the dataset""" + return self.dataset.summary() + + def __len__(self): + """Length of the dataset""" + return len(self.dataset) diff --git a/deeplake/core/vectorstore/dataset_handlers/dataset_handler.py b/deeplake/core/vectorstore/dataset_handlers/dataset_handler.py new file mode 100644 index 0000000000..9b9d2f760e --- /dev/null +++ b/deeplake/core/vectorstore/dataset_handlers/dataset_handler.py @@ -0,0 +1,15 @@ +from deeplake.core.vectorstore.dataset_handlers.client_side_dataset_handler import ( + ClientSideDH, +) +from deeplake.core.vectorstore.dataset_handlers.managed_side_dataset_handler import ( + ManagedSideDH, +) + + +def get_dataset_handler(*args, **kwargs): + runtime = kwargs.get("runtime", None) + if runtime and runtime.get("tensor_db", True): + # TODO: change to ManagedSideDH when it's ready + return ClientSideDH(*args, **kwargs) + else: + return ClientSideDH(*args, **kwargs) diff --git a/deeplake/core/vectorstore/dataset_handlers/dataset_handler_base.py b/deeplake/core/vectorstore/dataset_handlers/dataset_handler_base.py new file mode 100644 index 0000000000..0dcb88f625 --- /dev/null +++ b/deeplake/core/vectorstore/dataset_handlers/dataset_handler_base.py @@ -0,0 +1,216 @@ +import logging +import pathlib +from abc import abstractmethod, ABC +from typing import Optional, Any, List, Dict, Union, Callable +import jwt + +import numpy as np + +from deeplake.util.path import convert_pathlib_to_string_if_needed +from deeplake.api import dataset +from deeplake.core.dataset import Dataset +from deeplake.constants import ( + DEFAULT_VECTORSTORE_TENSORS, + MAX_BYTES_PER_MINUTE, + TARGET_BYTE_SIZE, +) +from deeplake.client.utils import read_token +from deeplake.core.vectorstore import utils +from deeplake.util.bugout_reporter import ( + feature_report_path, +) +from deeplake.util.path import get_path_type + + +class DHBase(ABC): + """Base class for dataset handlers.""" + + def __init__( + self, + path: Union[str, pathlib.Path], + dataset: Dataset, + tensor_params: List[Dict[str, object]], + embedding_function: Any, + read_only: bool, + ingestion_batch_size: int, + index_params: Dict[str, Union[int, str]], + num_workers: int, + exec_option: str, + token: str, + overwrite: bool, + verbose: bool, + runtime: Dict, + creds: Union[Dict, str], + org_id: str, + logger: logging.Logger, + **kwargs: Any, + ): + try: + from indra import api # type: ignore + + self.indra_installed = True + except Exception: # pragma: no cover + self.indra_installed = False # pragma: no cover + + self._exec_option = exec_option + + self.path: Optional[str] = None + self.dataset = dataset + if dataset and path: + raise ValueError( + "Only one of `dataset` or path should be provided to the dataset handler." 
+ ) + elif not dataset and not path: + raise ValueError("Either `dataset` or path should be provided.") + elif path: + self.path = convert_pathlib_to_string_if_needed(path) + else: + self.dataset = dataset + self.path = dataset.path + + self._token = token + self.logger = logger + self.org_id = org_id if get_path_type(self.path) == "local" else None + self.bugout_reporting_path = self.path or dataset.path + + feature_report_path( + self.bugout_reporting_path, + "vs.initialize", + { + "tensor_params": "default" + if tensor_params is not None + else tensor_params, + "embedding_function": True if embedding_function is not None else False, + "num_workers": num_workers, + "overwrite": overwrite, + "read_only": read_only, + "ingestion_batch_size": ingestion_batch_size, + "index_params": index_params, + "exec_option": exec_option, + "token": self.token, + "verbose": verbose, + "runtime": runtime, + "path": self.path, + }, + token=self.token, + username=self.username, + ) + + self.ingestion_batch_size = ingestion_batch_size + self.index_params = utils.parse_index_params(index_params) + kwargs["index_params"] = self.index_params + self.num_workers = num_workers + self.creds = creds or {} + self.embedding_function = utils.create_embedding_function(embedding_function) + + @property + def token(self): + return self._token or read_token(from_env=True) + + @property + def exec_option(self) -> str: + return utils.parse_exec_option( + self.dataset, self._exec_option, self.indra_installed, self.username + ) + + @property + def username(self) -> str: + username = "public" + if self.token is not None: + try: + username = jwt.decode(self.token, options={"verify_signature": False})[ + "id" + ] + except Exception: + pass + return username + + @abstractmethod + def add( + self, + embedding_function: Union[Callable, List[Callable]], + embedding_data: Union[List, List[List]], + embedding_tensor: Union[str, List[str]], + return_ids: bool, + rate_limiter: Dict, + **tensors, + ): + pass + + @abstractmethod + def search( + self, + embedding_data: Union[str, List[str], None] = None, + embedding_function: Optional[Callable] = None, + embedding: Optional[Union[List[float], np.ndarray]] = None, + k: int = 4, + distance_metric: Optional[str] = None, + query: Optional[str] = None, + filter: Optional[Union[Dict, Callable]] = None, + exec_option: Optional[str] = None, + embedding_tensor: str = "embedding", + return_tensors: Optional[List[str]] = None, + return_view: bool = False, + deep_memory: bool = False, + ) -> Union[Dict, Dataset]: + pass + + @abstractmethod + def delete( + self, + row_ids: List[int], + ids: List[str], + filter: Union[Dict, Callable], + query: str, + exec_option: str, + delete_all: bool, + ) -> bool: + pass + + @abstractmethod + def update_embedding( + self, + row_ids: List[str], + ids: List[str], + filter: Union[Dict, Callable], + query: str, + exec_option: str, + embedding_function: Union[Callable, List[Callable]], + embedding_source_tensor: Union[str, List[str]], + embedding_tensor: Union[str, List[str]], + ): + pass + + @abstractmethod + def tensors(self): + pass + + @abstractmethod + def summary(self): + pass + + @abstractmethod + def __len__(self): + pass + + def commit(self, allow_empty: bool = True) -> None: + """Commits the Vector Store. + + Args: + allow_empty (bool): Whether to allow empty commits. Defaults to True. + + Raises: + NotImplementedError: This method is not implemented by the base class. 
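
A standalone sketch of the dispatch pattern that `get_dataset_handler` and `DHBase` set up: the facade asks a factory for a handler and only ever calls the abstract interface. The names below are illustrative; in the patch both branches currently return `ClientSideDH`, with the managed branch left as a TODO.

```python
from abc import ABC, abstractmethod
from typing import Dict, Optional


class HandlerBase(ABC):
    """Abstract contract, analogous to DHBase."""

    @abstractmethod
    def add(self, **tensors): ...

    @abstractmethod
    def search(self, **kwargs): ...


class LocalHandler(HandlerBase):
    def add(self, **tensors):
        return "ingested on the client"

    def search(self, **kwargs):
        return "searched on the client"


class ManagedHandler(HandlerBase):
    def add(self, **tensors):
        return "forwarded to the managed service"

    def search(self, **kwargs):
        return "forwarded to the managed service"


def get_handler(runtime: Optional[Dict] = None) -> HandlerBase:
    # Mirrors get_dataset_handler: the `runtime` dict decides which backend
    # implementation the VectorStore facade will delegate to.
    if runtime and runtime.get("tensor_db", False):
        return ManagedHandler()
    return LocalHandler()


print(get_handler({"tensor_db": True}).search(k=4))
```
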
+ """ + raise NotImplementedError() + + def checkout(self, branch: str = "main") -> None: + """Checkout the Vector Store to a specific branch. + + Args: + branch (str): Branch name to checkout. Defaults to "main". + + Raises: + NotImplementedError: This method is not implemented by the base class. + """ + raise NotImplementedError() diff --git a/deeplake/core/vectorstore/dataset_handlers/managed_side_dataset_handler.py b/deeplake/core/vectorstore/dataset_handlers/managed_side_dataset_handler.py new file mode 100644 index 0000000000..0e5f60a636 --- /dev/null +++ b/deeplake/core/vectorstore/dataset_handlers/managed_side_dataset_handler.py @@ -0,0 +1,304 @@ +import logging +import pathlib +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np + +from deeplake.client.managed.managed_client import ManagedServiceClient +from deeplake.client.utils import read_token +from deeplake.constants import MAX_BYTES_PER_MINUTE, TARGET_BYTE_SIZE +from deeplake.core.dataset import Dataset +from deeplake.core.vectorstore.dataset_handlers.dataset_handler_base import DHBase +from deeplake.core.vectorstore.deep_memory.deep_memory import ( + DeepMemory, + use_deep_memory, +) +from deeplake.core.vectorstore import utils +from deeplake.util.bugout_reporter import feature_report_path +from deeplake.util.path import convert_pathlib_to_string_if_needed, get_path_type + + +class ManagedSideDH(DHBase): + def __init__( + self, + path: Union[str, pathlib.Path], + dataset: Dataset, + tensor_params: List[Dict[str, object]], + embedding_function: Any, + read_only: bool, + ingestion_batch_size: int, + index_params: Dict[str, Union[int, str]], + num_workers: int, + exec_option: str, + token: str, + overwrite: bool, + verbose: bool, + runtime: Dict, + creds: Union[Dict, str], + org_id: str, + logger: logging.Logger, + branch: str, + **kwargs: Any, + ): + if embedding_function is not None: + raise NotImplementedError( + "ManagedVectorStore does not support embedding_function for now." + ) + + super().__init__( + path=path, + dataset=dataset, + tensor_params=tensor_params, + embedding_function=embedding_function, + read_only=read_only, + ingestion_batch_size=ingestion_batch_size, + index_params=index_params, + num_workers=num_workers, + exec_option=exec_option, + token=token, + overwrite=overwrite, + verbose=True, + runtime=runtime, + creds=creds, + org_id=org_id, + logger=logger, + **kwargs, + ) + if get_path_type(self.path) != "hub": + raise ValueError( + "ManagedVectorStore can only be initialized with a Deep Lake Cloud path." 
+            )
+        self.client = ManagedServiceClient(token=self.token)
+        self.client.init_vectorstore(
+            path=self.bugout_reporting_path,
+            overwrite=overwrite,
+            tensor_params=tensor_params,
+        )
+
+        self.deep_memory = DeepMemory(
+            dataset_or_path=self.path,
+            token=self.token,
+            logger=self.logger,
+            embedding_function=self.embedding_function,
+            creds=self.creds,
+        )
+
+    def add(
+        self,
+        embedding_function: Union[Callable, List[Callable]],
+        embedding_data: Union[List, List[List]],
+        embedding_tensor: Union[str, List[str]],
+        return_ids: bool,
+        rate_limiter: Dict,
+        **tensors,
+    ) -> Optional[List[str]]:
+        feature_report_path(
+            path=self.bugout_reporting_path,
+            feature_name="vs.add",
+            parameters={
+                "tensors": list(tensors.keys()) if tensors else None,
+                "embedding_tensor": embedding_tensor,
+                "return_ids": return_ids,
+                "embedding_function": True if embedding_function is not None else False,
+                "embedding_data": True if embedding_data is not None else False,
+                "managed": True,
+            },
+            token=self.token,
+            username=self.username,
+        )
+
+        if embedding_function is not None or embedding_data is not None:
+            raise NotImplementedError(
+                "Embedding function is not supported for ManagedVectorStore. Please send precalculated embeddings."
+            )
+
+        (
+            embedding_function,
+            embedding_data,
+            embedding_tensor,
+            tensors,
+        ) = utils.parse_tensors_kwargs(
+            tensors, embedding_function, embedding_data, embedding_tensor
+        )
+
+        processed_tensors = {
+            t: tensors[t].tolist() if isinstance(tensors[t], np.ndarray) else tensors[t]
+            for t in tensors
+        }
+        utils.check_length_of_each_tensor(processed_tensors)
+
+        response = self.client.vectorstore_add(
+            path=self.path,
+            processed_tensors=processed_tensors,
+            rate_limiter=rate_limiter,
+            return_ids=return_ids,
+        )
+
+        if return_ids:
+            return response.ids
+
+    @use_deep_memory
+    def search(
+        self,
+        embedding_data: Union[str, List[str]],
+        embedding_function: Optional[Callable],
+        embedding: Union[List[float], np.ndarray],
+        k: int,
+        distance_metric: str,
+        query: str,
+        filter: Union[Dict, Callable],
+        embedding_tensor: str,
+        return_tensors: List[str],
+        return_view: bool,
+        deep_memory: bool,
+        exec_option: Optional[str] = "tensor_db",
+    ) -> Union[Dict, Dataset]:
+        feature_report_path(
+            path=self.bugout_reporting_path,
+            feature_name="vs.search",
+            parameters={
+                "embedding_data": True if embedding_data is not None else False,
+                "embedding_function": True if embedding_function is not None else False,
+                "k": k,
+                "distance_metric": distance_metric,
+                "query": query[0:100] if query is not None else False,
+                "filter": True if filter is not None else False,
+                "embedding_tensor": embedding_tensor,
+                "embedding": True if embedding is not None else False,
+                "return_tensors": return_tensors,
+                "return_view": return_view,
+                "managed": True,
+            },
+            token=self.token,
+            username=self.username,
+        )
+
+        if exec_option != "tensor_db":
+            raise ValueError("Managed vector store only supports tensor_db execution.")
+
+        if embedding_data is not None or embedding_function is not None:
+            raise NotImplementedError(
+                "ManagedVectorStore does not support embedding_function search. Please pass a precalculated embedding."
+            )
+
+        if filter is not None and not isinstance(filter, dict):
+            raise NotImplementedError(
+                "Only Filter Dictionary is supported for the ManagedVectorStore."
+            )
+
+        if return_view:
+            raise NotImplementedError(
+                "return_view is not supported for the ManagedVectorStore."
+ ) + + response = self.client.vectorstore_search( + path=self.path, + embedding=embedding, + k=k, + distance_metric=distance_metric, + query=query, + filter=filter, + embedding_tensor=embedding_tensor, + return_tensors=return_tensors, + deep_memory=deep_memory, + ) + return response.data + + def delete( + self, + row_ids: List[int], + ids: List[str], + filter: Union[Dict, Callable], + query: str, + exec_option: str, + delete_all: bool, + ) -> bool: + feature_report_path( + path=self.bugout_reporting_path, + feature_name="vs.delete", + parameters={ + "ids": True if ids is not None else False, + "row_ids": True if row_ids is not None else False, + "query": query[0:100] if query is not None else False, + "filter": True if filter is not None else False, + "delete_all": delete_all, + "managed": True, + }, + token=self.token, + username=self.username, + ) + + if filter is not None and not isinstance(filter, dict): + raise NotImplementedError( + "Only Filter Dictionary is supported for the ManagedVectorStore." + ) + + if exec_option is not None and exec_option != "tensor_db": + raise ValueError("Manged db vectorstore only supports tensor_db execution.") + + self.client.vectorstore_remove_rows( + path=self.bugout_reporting_path, + indices=row_ids, + ids=ids, + filter=filter, + query=query, + delete_all=delete_all, + ) + return True + + def update_embedding( + self, + row_ids: List[str], + ids: List[str], + filter: Union[Dict, Callable], + query: str, + exec_option: str, + embedding_function: Union[Callable, List[Callable]], + embedding_source_tensor: Union[str, List[str]], + embedding_tensor: Union[str, List[str]], + ): + feature_report_path( + path=self.bugout_reporting_path, + feature_name="vs.delete", + parameters={ + "ids": True if ids is not None else False, + "row_ids": True if row_ids is not None else False, + "query": query[0:100] if query is not None else False, + "filter": True if filter is not None else False, + "managed": True, + }, + token=self.token, + username=self.username, + ) + + if filter is not None and not isinstance(filter, dict): + raise NotImplementedError( + "Only Filter Dictionary is supported for the ManagedVectorStore." 
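
For context, a sketch of driving the managed-service REST client directly, using the method signatures introduced above. The token, the `hub://` path, the tensor names, and the 4-dimensional embeddings are placeholders; only precalculated embeddings can be sent, as the handler enforces.

```python
from deeplake.client.managed.managed_client import ManagedServiceClient

client = ManagedServiceClient(token="<ACTIVELOOP_TOKEN>")  # assumes a valid token
path = "hub://my_org/my_vectorstore"

# Create (or load) the managed vector store.
info = client.init_vectorstore(path=path, overwrite=False, tensor_params=None)
print(info.exists, [t["name"] for t in info.tensors])

# Rows are sent as plain lists; numpy embeddings are converted by the client.
client.vectorstore_add(
    path=path,
    processed_tensors=[
        {
            "text": ["hello world"],
            "metadata": [{"source": "demo"}],
            "embedding": [[0.1, 0.2, 0.3, 0.4]],
        }
    ],
    return_ids=False,
)

hits = client.vectorstore_search(path=path, embedding=[0.1, 0.2, 0.3, 0.4], k=4)
print(hits.length, hits.data)
```
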
+ ) + + self.client.vectorstore_update_embeddings( + path=self.bugout_reporting_path, + embedding_function=embedding_function, + embedding_source_tensor=embedding_source_tensor, + embedding_tensor=embedding_tensor, + row_ids=row_ids, + ids=ids, + filter=filter, + query=query, + ) + + def _get_summary(self): + """Returns a summary of the Managed Vector Store.""" + return self.client.get_vectorstore_summary(self.path) + + def tensors(self): + """Returns the list of tensors present in the dataset""" + return [t["name"] for t in self._get_summary().tensors] + + def summary(self): + """Prints a summary of the dataset""" + print(self._get_summary().summary) + + def __len__(self): + """Length of the dataset""" + return self._get_summary().length diff --git a/deeplake/core/vectorstore/deep_memory/__init__.py b/deeplake/core/vectorstore/deep_memory/__init__.py new file mode 100644 index 0000000000..ab8af09e53 --- /dev/null +++ b/deeplake/core/vectorstore/deep_memory/__init__.py @@ -0,0 +1 @@ +from deeplake.core.vectorstore.deep_memory.deep_memory import DeepMemory diff --git a/deeplake/core/vectorstore/deep_memory.py b/deeplake/core/vectorstore/deep_memory/deep_memory.py similarity index 88% rename from deeplake/core/vectorstore/deep_memory.py rename to deeplake/core/vectorstore/deep_memory/deep_memory.py index 363fc87c38..2656b8f3d3 100644 --- a/deeplake/core/vectorstore/deep_memory.py +++ b/deeplake/core/vectorstore/deep_memory/deep_memory.py @@ -10,6 +10,7 @@ import deeplake from deeplake.enterprise.dataloader import indra_available from deeplake.util.exceptions import ( + DeepMemoryWaitingListError, IncorrectRelevanceTypeError, IncorrectQueriesTypeError, ) @@ -18,21 +19,42 @@ DEFAULT_QUERIES_VECTORSTORE_TENSORS, DEFAULT_MEMORY_CACHE_SIZE, DEFAULT_LOCAL_CACHE_SIZE, + DEFAULT_DEEPMEMORY_DISTANCE_METRIC, ) from deeplake.util.storage import get_storage_and_cache_chain from deeplake.core.dataset import Dataset from deeplake.core.dataset.deeplake_cloud_dataset import DeepLakeCloudDataset -from deeplake.core.vectorstore.deeplake_vectorstore import VectorStore from deeplake.client.client import DeepMemoryBackendClient from deeplake.client.utils import JobResponseStatusSchema from deeplake.util.bugout_reporter import ( feature_report_path, ) -from deeplake.util.dataset import try_flushing from deeplake.util.path import get_path_type from deeplake.util.version_control import load_meta +def use_deep_memory(func): + def wrapper(self, *args, **kwargs): + use_deep_memory = kwargs.get("deep_memory") + distance_metric = kwargs.get("distance_metric") + + if use_deep_memory and distance_metric is None: + kwargs["distance_metric"] = DEFAULT_DEEPMEMORY_DISTANCE_METRIC + + return func(self, *args, **kwargs) + + return wrapper + + +def access_control(func): + def wrapper(self, *args, **kwargs): + if self.client is None: + raise DeepMemoryWaitingListError() + return func(self, *args, **kwargs) + + return wrapper + + class Relevance(BaseModel): data: List[List[Tuple[str, int]]] @@ -56,18 +78,16 @@ def validate_relevance_and_queries(relevance, queries): class DeepMemory: def __init__( self, - dataset: Dataset, - client: DeepMemoryBackendClient, + dataset_or_path: Union[Dataset, str], logger: logging.Logger, embedding_function: Optional[Any] = None, token: Optional[str] = None, - creds: Optional[Dict[str, Any]] = None, + creds: Optional[Union[Dict, str]] = None, ): """Based Deep Memory class to train and evaluate models on DeepMemory managed service. Args: - dataset (Dataset): deeplake dataset object. 
- client (DeepMemoryBackendClient): Client to interact with the DeepMemory managed service. Defaults to None. + dataset_or_path (Union[Dataset, str]): deeplake dataset object or path. logger (logging.Logger): Logger object. embedding_function (Optional[Any], optional): Embedding funtion class used to convert queries/documents to embeddings. Defaults to None. token (Optional[str], optional): API token for the DeepMemory managed service. Defaults to None. @@ -75,24 +95,34 @@ def __init__( Raises: ImportError: if indra is not installed + ValueError: if incorrect type is specified for `dataset_or_path` """ + if isinstance(dataset_or_path, Dataset): + self.path = dataset_or_path.path + elif isinstance(dataset_or_path, str): + self.path = dataset_or_path + else: + raise ValueError( + "dataset_or_path should be a Dataset object or a string path" + ) + feature_report_path( - path=dataset.path, + path=self.path, feature_name="dm.initialize", parameters={ "embedding_function": True if embedding_function is not None else False, - "client": client, "token": token, }, token=token, ) - self.dataset = dataset + self.token = token self.embedding_function = embedding_function - self.client = client + self.client = self._get_dm_client() self.creds = creds or {} self.logger = logger + @access_control def train( self, queries: List[str], @@ -123,9 +153,11 @@ def train( Raises: ValueError: if embedding_function is not specified either during initialization or during training. """ + from deeplake.core.vectorstore.deeplake_vectorstore import VectorStore + self.logger.info("Starting DeepMemory training job") feature_report_path( - path=self.dataset.path, + path=self.path, feature_name="dm.train", parameters={ "queries": queries, @@ -134,11 +166,10 @@ def train( }, token=token or self.token, ) - validate_relevance_and_queries(relevance=relevance, queries=queries) # TODO: Support for passing query_embeddings directly without embedding function - corpus_path = self.dataset.path + corpus_path = self.path queries_path = corpus_path + "_queries" if embedding_function is None and self.embedding_function is None: @@ -183,6 +214,7 @@ def train( ) return response["job_id"] + @access_control def cancel(self, job_id: str): """Cancel a training job on DeepMemory managed service. @@ -196,7 +228,7 @@ def cancel(self, job_id: str): bool: True if job was cancelled successfully, False otherwise. """ feature_report_path( - path=self.dataset.path, + path=self.path, feature_name="dm.cancel", parameters={ "job_id": job_id, @@ -205,6 +237,7 @@ def cancel(self, job_id: str): ) return self.client.cancel_job(job_id=job_id) + @access_control def delete(self, job_id: str): """Delete a training job on DeepMemory managed service. @@ -218,7 +251,7 @@ def delete(self, job_id: str): bool: True if job was deleted successfully, False otherwise. """ feature_report_path( - path=self.dataset.path, + path=self.path, feature_name="dm.delete", parameters={ "job_id": job_id, @@ -227,6 +260,7 @@ def delete(self, job_id: str): ) return self.client.delete_job(job_id=job_id) + @access_control def status(self, job_id: str): """Get the status of a training job on DeepMemory managed service. @@ -246,7 +280,7 @@ def status(self, job_id: str): job_id (str): job_id of the training job. 
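
Taken together, `train`, `status`, and `list_jobs` support a workflow like the sketch below. It assumes a Deep Lake Cloud corpus and a deep-memory-enabled account (otherwise `access_control` raises `DeepMemoryWaitingListError`), and uses a stand-in `embed` function; each relevance entry pairs a corpus `id` value with a relevance score.

```python
from deeplake.core.vectorstore import VectorStore

def embed(texts):
    # Stand-in embedding function used for queries during training/evaluation.
    return [[float(len(t))] * 16 for t in texts]

vs = VectorStore(path="hub://my_org/my_corpus", token="<ACTIVELOOP_TOKEN>")

queries = ["what is deep memory?"]
relevance = [[("doc-123", 1)]]  # ids refer to the `id` tensor of the corpus

job_id = vs.deep_memory.train(
    queries=queries,
    relevance=relevance,
    embedding_function=embed,
)
vs.deep_memory.status(job_id)      # prints the progress report for this job
print(vs.deep_memory.list_jobs())  # tabular summary of all jobs for this corpus
```
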
""" feature_report_path( - path=self.dataset.path, + path=self.path, feature_name="dm.status", parameters={ "job_id": job_id, @@ -255,11 +289,11 @@ def status(self, job_id: str): ) _, storage = get_storage_and_cache_chain( - path=self.dataset.path, + path=self.path, db_engine={"tensor_db": True}, read_only=False, creds=self.creds, - token=self.dataset.token, + token=self.token, memory_cache_size=DEFAULT_MEMORY_CACHE_SIZE, local_cache_size=DEFAULT_LOCAL_CACHE_SIZE, ) @@ -278,10 +312,11 @@ def status(self, job_id: str): improvement = None self.client.check_status(job_id=job_id, recall=recall, improvement=improvement) + @access_control def list_jobs(self, debug=False): """List all training jobs on DeepMemory managed service.""" feature_report_path( - path=self.dataset.path, + path=self.path, feature_name="dm.list_jobs", parameters={ "debug": debug, @@ -289,18 +324,18 @@ def list_jobs(self, debug=False): token=self.token, ) _, storage = get_storage_and_cache_chain( - path=self.dataset.path, + path=self.path, db_engine={"tensor_db": True}, read_only=False, creds=self.creds, - token=self.dataset.token, + token=self.token, memory_cache_size=DEFAULT_MEMORY_CACHE_SIZE, local_cache_size=DEFAULT_LOCAL_CACHE_SIZE, ) loaded_dataset = DeepLakeCloudDataset(storage=storage) response = self.client.list_jobs( - dataset_path=self.dataset.path, + dataset_path=self.path, ) response_status_schema = JobResponseStatusSchema(response=response) @@ -338,6 +373,7 @@ def list_jobs(self, debug=False): ) return reposnse_str + @access_control def evaluate( self, relevance: List[List[Tuple[str, int]]], @@ -407,9 +443,8 @@ def evaluate( ImportError: If `indra` is not installed. ValueError: If no embedding_function is provided either during initialization or evaluation. """ - feature_report_path( - path=self.dataset.path, + path=self.path, feature_name="dm.evaluate", parameters={ "relevance": relevance, @@ -421,7 +456,7 @@ def evaluate( }, token=self.token, ) - try_flushing(self.dataset) + try: from indra import api # type: ignore @@ -431,16 +466,17 @@ def evaluate( if not INDRA_INSTALLED: raise ImportError( - "The C++ library is not installed. The library should be installed using `pip install deeplake`, but if you want to install it separately, you may run `pip install libdeeplake`" + "indra is not installed. Please install indra to use this functionality with: pip install `deeplake[enterprise]`" ) + validate_relevance_and_queries(relevance=relevance, queries=queries) + from indra import api # type: ignore - indra_dataset = api.dataset(self.dataset.path, token=self.token) + indra_dataset = api.dataset(self.path, token=self.token) api.tql.prepare_deepmemory_metrics(indra_dataset) parsed_qvs_params = parse_queries_params(qvs_params) - validate_relevance_and_queries(relevance=relevance, queries=queries) start = time() query_embs: Union[List[np.ndarray], List[List[float]]] @@ -455,7 +491,7 @@ def evaluate( if embedding_function is None: raise ValueError( - "Embedding function should be specified either during initialization or during evaluation." + "Embedding function should be specifed either during initialization or during evaluation." 
) query_embs = embedding_function(queries) @@ -498,7 +534,7 @@ def evaluate( return recalls self.queries_dataset = deeplake.empty( - self.dataset.path + "_eval_queries", + self.path + "_eval_queries", token=self.token, creds=self.creds, overwrite=True, @@ -518,6 +554,25 @@ def evaluate( self.queries_dataset.commit() return recalls + def _get_dm_client(self): + path = self.path + path_type = get_path_type(path) + + dm_client = DeepMemoryBackendClient(token=self.token) + user_profile = dm_client.get_user_profile() + + if path_type == "hub": + # TODO: add support for windows + dataset_id = path[6:].split("/")[0] + else: + # TODO: change user_profile to user_id + dataset_id = user_profile["name"] + + deepmemory_is_available = dm_client.deepmemory_is_available(dataset_id) + if deepmemory_is_available: + return dm_client + return None + def _get_jobs(self, response): jobs = None if response is not None and len(response) > 0: @@ -628,11 +683,8 @@ def _get_best_model(embedding: Any, job_id: str, latest_job: bool = False): best_recall = 0 best_delta = 0 if latest_job: - try: - best_recall = info["deepmemory/model.npy"]["recall@10"] - best_delta = info["deepmemory/model.npy"]["delta"] - except KeyError: - pass + best_recall = info["deepmemory/model.npy"]["recall@10"] + best_delta = info["deepmemory/model.npy"]["delta"] for job, value in info.items(): if job_id in job: diff --git a/deeplake/core/vectorstore/test_deepmemory.py b/deeplake/core/vectorstore/deep_memory/test_deepmemory.py similarity index 98% rename from deeplake/core/vectorstore/test_deepmemory.py rename to deeplake/core/vectorstore/deep_memory/test_deepmemory.py index 9be27a9210..0a458974fa 100644 --- a/deeplake/core/vectorstore/test_deepmemory.py +++ b/deeplake/core/vectorstore/deep_memory/test_deepmemory.py @@ -6,15 +6,17 @@ import deeplake from deeplake import VectorStore +from deeplake.core.vectorstore.deep_memory.deep_memory import DeepMemory from deeplake.tests.common import requires_libdeeplake -from deeplake.core.vectorstore.unsupported_deep_memory import ( - DeepMemory as UnsupportedDeepMemory, -) from deeplake.util.exceptions import ( DeepMemoryWaitingListError, IncorrectQueriesTypeError, IncorrectRelevanceTypeError, ) +from deeplake.util.exceptions import DeepMemoryWaitingListError + + +logger = logging.getLogger(__name__) class DummyEmbedder: @@ -580,8 +582,10 @@ def test_deepmemory_search_on_local_datasets( @pytest.mark.skipif(sys.platform == "win32", reason="Does not run on Windows") @pytest.mark.slow @requires_libdeeplake -def test_unsupported_deepmemory_users(): - dm = UnsupportedDeepMemory() +def test_unsupported_deepmemory_users(local_ds): + dm = DeepMemory( + dataset_or_path=local_ds, logger=logger, embedding_function=DummyEmbedder + ) with pytest.raises(DeepMemoryWaitingListError): dm.train( queries=[], diff --git a/deeplake/core/vectorstore/deeplake_vectorstore.py b/deeplake/core/vectorstore/deeplake_vectorstore.py index c29f9606a4..a57dcac768 100644 --- a/deeplake/core/vectorstore/deeplake_vectorstore.py +++ b/deeplake/core/vectorstore/deeplake_vectorstore.py @@ -1,37 +1,19 @@ import logging import pathlib from typing import Optional, Any, List, Dict, Union, Callable -import jwt import numpy as np import deeplake -from deeplake.core import index_maintenance -from deeplake.core.distance_type import DistanceType -from deeplake.util.exceptions import DeepMemoryWaitingListError -from deeplake.util.path import convert_pathlib_to_string_if_needed - -from deeplake.api import dataset from deeplake.core.dataset import 
Dataset +from deeplake.core.vectorstore.dataset_handlers import get_dataset_handler +from deeplake.core.vectorstore.deep_memory import DeepMemory from deeplake.constants import ( DEFAULT_VECTORSTORE_TENSORS, MAX_BYTES_PER_MINUTE, TARGET_BYTE_SIZE, - DEFAULT_VECTORSTORE_DISTANCE_METRIC, - DEFAULT_DEEPMEMORY_DISTANCE_METRIC, - _INDEX_OPERATION_MAPPING, -) -from deeplake.client.utils import read_token -from deeplake.core.vectorstore import utils -from deeplake.core.vectorstore.vector_search import vector_search -from deeplake.core.vectorstore.vector_search import dataset as dataset_utils -from deeplake.core.vectorstore.vector_search import filter as filter_utils -from deeplake.util.bugout_reporter import ( - feature_report_path, ) -from deeplake.util.path import get_path_type -from deeplake.core.vectorstore.unsupported_deep_memory import DeepMemory - +from deeplake.util.bugout_reporter import feature_report_path logger = logging.getLogger(__name__) @@ -41,7 +23,8 @@ class VectorStore: def __init__( self, - path: Union[str, pathlib.Path], + path: Optional[Union[str, pathlib.Path]] = None, + dataset: Optional[Dataset] = None, tensor_params: List[Dict[str, object]] = DEFAULT_VECTORSTORE_TENSORS, embedding_function: Optional[Any] = None, read_only: Optional[bool] = None, @@ -124,87 +107,35 @@ def __init__( Danger: Setting ``overwrite`` to ``True`` will delete all of your data if the Vector Store exists! Be very careful when setting this parameter. """ - try: - from indra import api # type: ignore - - self.indra_installed = True - except Exception: # pragma: no cover - self.indra_installed = False # pragma: no cover - - self._token = token - self.path = convert_pathlib_to_string_if_needed(path) - self.logger = logger - self.org_id = org_id if get_path_type(self.path) == "local" else None - - feature_report_path( - path, - "vs.initialize", - { - "tensor_params": "default" - if tensor_params is not None - else tensor_params, - "embedding_function": True if embedding_function is not None else False, - "num_workers": num_workers, - "overwrite": overwrite, - "read_only": read_only, - "ingestion_batch_size": ingestion_batch_size, - "index_params": index_params, - "exec_option": exec_option, - "token": self.token, - "verbose": verbose, - "runtime": runtime, - }, - token=self.token, - username=self.username, - ) - - self.ingestion_batch_size = ingestion_batch_size - self.index_params = utils.parse_index_params(index_params) - kwargs["index_params"] = self.index_params - self.num_workers = num_workers - self.creds = creds or {} - self.embedding_function = utils.create_embedding_function(embedding_function) - - self.dataset = dataset_utils.create_or_load_dataset( - tensor_params, - path, - self.token, - self.creds, - self.logger, - read_only, - exec_option, - embedding_function, - overwrite, - runtime, - self.org_id, - branch, + self.dataset_handler = get_dataset_handler( + path=path, + dataset=dataset, + tensor_params=tensor_params, + embedding_function=embedding_function, + read_only=read_only, + ingestion_batch_size=ingestion_batch_size, + index_params=index_params, + num_workers=num_workers, + exec_option=exec_option, + token=token, + overwrite=overwrite, + verbose=verbose, + runtime=runtime, + creds=creds, + org_id=org_id, + logger=logger, + branch=branch, **kwargs, ) - self._exec_option = exec_option - self.verbose = verbose - self.tensor_params = tensor_params - self.distance_metric_index = index_maintenance.index_operation_vectorstore( - self, - ) - self.deep_memory = DeepMemory() - - @property - 
def token(self): - return self._token or read_token(from_env=True) - @property - def exec_option(self) -> str: - return utils.parse_exec_option( - self.dataset, self._exec_option, self.indra_installed, self.username + self.deep_memory = DeepMemory( + dataset_or_path=self.dataset_handler.path, + token=self.dataset_handler.token, + logger=logger, + embedding_function=embedding_function, + creds=self.dataset_handler.creds, ) - @property - def username(self) -> str: - username = "public" - if self.token is not None: - username = jwt.decode(self.token, options={"verify_signature": False})["id"] - return username - def add( self, embedding_function: Optional[Union[Callable, List[Callable]]] = None, @@ -284,69 +215,15 @@ def add( Returns: Optional[List[str]]: List of ids if ``return_ids`` is set to True. Otherwise, None. """ - - feature_report_path( - path=self.path, - feature_name="vs.add", - parameters={ - "tensors": list(tensors.keys()) if tensors else None, - "embedding_tensor": embedding_tensor, - "return_ids": return_ids, - "embedding_function": True if embedding_function is not None else False, - "embedding_data": True if embedding_data is not None else False, - }, - token=self.token, - username=self.username, - ) - ( - embedding_function, - embedding_data, - embedding_tensor, - tensors, - ) = utils.parse_tensors_kwargs( - tensors, - embedding_function, - embedding_data, - embedding_tensor, - ) - - ( - embedding_function, - embedding_data, - embedding_tensor, - tensors, - ) = utils.parse_add_arguments( - dataset=self.dataset, - initial_embedding_function=self.embedding_function, - embedding_function=embedding_function, - embedding_data=embedding_data, - embedding_tensor=embedding_tensor, - **tensors, - ) - - processed_tensors, id_ = dataset_utils.preprocess_tensors( - embedding_data, embedding_tensor, self.dataset, **tensors - ) - - assert id_ is not None - - dataset_utils.extend_or_ingest_dataset( - processed_tensors=processed_tensors, - dataset=self.dataset, + return self.dataset_handler.add( embedding_function=embedding_function, embedding_data=embedding_data, embedding_tensor=embedding_tensor, + return_ids=return_ids, rate_limiter=rate_limiter, - logger=self.logger, + **tensors, ) - if self.verbose: - self.dataset.summary() - - if return_ids: - return id_ - return None - def search( self, embedding_data: Union[str, List[str], None] = None, @@ -423,43 +300,9 @@ def search( Returns: Dict: Dictionary where keys are tensor names and values are the results of the search """ - - feature_report_path( - path=self.path, - feature_name="vs.search", - parameters={ - "embedding_data": True if embedding_data is not None else False, - "embedding_function": True if embedding_function is not None else False, - "k": k, - "distance_metric": distance_metric, - "query": query[0:100] if query is not None else False, - "filter": True if filter is not None else False, - "exec_option": exec_option, - "embedding_tensor": embedding_tensor, - "embedding": True if embedding is not None else False, - "return_tensors": return_tensors, - "return_view": return_view, - }, - token=self.token, - username=self.username, - ) - - if exec_option is None and self.exec_option != "python" and callable(filter): - self.logger.warning( - 'Switching exec_option to "python" (runs on client) because filter is specified as a function. ' - f'To continue using the original exec_option "{self.exec_option}", please specify the filter as a dictionary or use the "query" parameter to specify a TQL query.' 
- ) - exec_option = "python" - - exec_option = exec_option or self.exec_option - - if deep_memory and not self.deep_memory: - raise DeepMemoryWaitingListError() - - utils.parse_search_args( + return self.dataset_handler.search( embedding_data=embedding_data, embedding_function=embedding_function, - initial_embedding_function=self.embedding_function, embedding=embedding, k=k, distance_metric=distance_metric, @@ -468,47 +311,13 @@ def search( exec_option=exec_option, embedding_tensor=embedding_tensor, return_tensors=return_tensors, - ) - - return_tensors = utils.parse_return_tensors( - self.dataset, return_tensors, embedding_tensor, return_view - ) - embedding_function = utils.create_embedding_function(embedding_function) - query_emb: Optional[Union[List[float], np.ndarray[Any, Any]]] = None - if query is None: - query_emb = dataset_utils.get_embedding( - embedding, - embedding_data, - embedding_function=embedding_function or self.embedding_function, - ) - - if self.distance_metric_index: - distance_metric = index_maintenance.parse_index_distance_metric_from_params( - logger, self.distance_metric_index, distance_metric - ) - - distance_metric = distance_metric or DEFAULT_VECTORSTORE_DISTANCE_METRIC - - return vector_search.search( - query=query, - logger=self.logger, - filter=filter, - query_embedding=query_emb, - k=k, - distance_metric=distance_metric, - exec_option=exec_option, - deeplake_dataset=self.dataset, - embedding_tensor=embedding_tensor, - return_tensors=return_tensors, return_view=return_view, deep_memory=deep_memory, - token=self.token, - org_id=self.org_id, ) def delete( self, - row_ids: Optional[List[str]] = None, + row_ids: Optional[List[int]] = None, ids: Optional[List[str]] = None, filter: Optional[Union[Dict, Callable]] = None, query: Optional[str] = None, @@ -532,7 +341,7 @@ def delete( Args: ids (Optional[List[str]]): List of unique ids. Defaults to None. - row_ids (Optional[List[str]]): List of absolute row indices from the dataset. Defaults to None. + row_ids (Optional[List[int]]): List of absolute row indices from the dataset. Defaults to None. filter (Union[Dict, Callable], optional): Filter for finding samples for deletion. - ``Dict`` - Key-value search on tensors of htype json, evaluated on an AND basis (a sample must satisfy all key-value filters to be True) Dict = {"tensor_name_1": {"key": value}, "tensor_name_2": {"key": value}} - ``Function`` - Any function that is compatible with `deeplake.filter`. @@ -553,47 +362,15 @@ def delete( ValueError: If neither ``ids``, ``filter``, ``query``, nor ``delete_all`` are specified, or if an invalid ``exec_option`` is provided. 
""" - feature_report_path( - path=self.path, - feature_name="vs.delete", - parameters={ - "ids": True if ids is not None else False, - "row_ids": True if row_ids is not None else False, - "query": query[0:100] if query is not None else False, - "filter": True if filter is not None else False, - "exec_option": exec_option, - "delete_all": delete_all, - }, - token=self.token, - username=self.username, - ) - - if not row_ids: - row_ids = ( - dataset_utils.search_row_ids( - dataset=self.dataset, - search_fn=self.search, - ids=ids, - filter=filter, - query=query, - select_all=delete_all, - exec_option=exec_option or self.exec_option, - ) - or [] - ) - - ( - self.dataset, - dataset_deleted, - ) = dataset_utils.delete_all_samples_if_specified( - self.dataset, - delete_all, + return self.dataset_handler.delete( + row_ids=row_ids, + ids=ids, + filter=filter, + query=query, + exec_option=exec_option, + delete_all=delete_all, ) - self.dataset.pop_multiple(row_ids) - - return True - def update_embedding( self, row_ids: Optional[List[str]] = None, @@ -650,52 +427,17 @@ def update_embedding( embedding_source_tensor (Union[str, List[str]], optional): Name of tensor with data that needs to be converted to embeddings. Defaults to `text`. embedding_tensor (Optional[Union[str, List[str]]], optional): Name of the tensor with embeddings. Defaults to None. """ - feature_report_path( - path=self.path, - feature_name="vs.delete", - parameters={ - "ids": True if ids is not None else False, - "row_ids": True if row_ids is not None else False, - "query": query[0:100] if query is not None else False, - "filter": True if filter is not None else False, - "exec_option": exec_option, - }, - token=self.token, - username=self.username, - ) - - ( - embedding_function, - embedding_source_tensor, - embedding_tensor, - ) = utils.parse_update_arguments( - dataset=self.dataset, + self.dataset_handler.update_embedding( + row_ids=row_ids, + ids=ids, + filter=filter, + query=query, + exec_option=exec_option, embedding_function=embedding_function, - initial_embedding_function=self.embedding_function, - embedding_source_tensor=embedding_source_tensor, - embedding_tensor=embedding_tensor, - ) - - if not row_ids: - row_ids = dataset_utils.search_row_ids( - dataset=self.dataset, - search_fn=self.search, - ids=ids, - filter=filter, - query=query, - exec_option=exec_option or self.exec_option, - ) - - embedding_tensor_data = utils.convert_embedding_source_tensor_to_embeddings( - dataset=self.dataset, embedding_source_tensor=embedding_source_tensor, embedding_tensor=embedding_tensor, - embedding_function=embedding_function, - row_ids=row_ids, ) - self.dataset[row_ids].update(embedding_tensor_data) - @staticmethod def delete_by_path( path: Union[str, pathlib.Path], @@ -717,8 +459,6 @@ def delete_by_path( Danger: This method permanently deletes all of your data if the Vector Store exists! Be very careful when using this method. """ - token = token or read_token(from_env=True) - feature_report_path( path, "vs.delete_by_path", @@ -738,7 +478,7 @@ def commit(self, allow_empty: bool = True) -> None: Args: allow_empty (bool): Whether to allow empty commits. Defaults to True. """ - self.dataset.commit(allow_empty=allow_empty) + self.dataset_handler.commit(allow_empty=allow_empty) def checkout(self, branch: str = "main") -> None: """Checkout the Vector Store to a specific branch. @@ -746,19 +486,29 @@ def checkout(self, branch: str = "main") -> None: Args: branch (str): Branch name to checkout. Defaults to "main". 
""" - self.dataset.checkout(branch) + self.dataset_handler.checkout(branch) def tensors(self): """Returns the list of tensors present in the dataset""" - return self.dataset.tensors + return self.dataset_handler.tensors() def summary(self): """Prints a summary of the dataset""" - return self.dataset.summary() + return self.dataset_handler.summary() + + @property + def dataset(self): + """Returns the dataset""" + try: + return self.dataset_handler.dataset + except AttributeError: + raise AttributeError( + "Acessing the dataset is not available for managed Vector Store." + ) def __len__(self): """Length of the dataset""" - return len(self.dataset) + return len(self.dataset_handler) DeepLakeVectorStore = VectorStore diff --git a/deeplake/core/vectorstore/deepmemory_vectorstore.py b/deeplake/core/vectorstore/deepmemory_vectorstore.py deleted file mode 100644 index fd5351e50e..0000000000 --- a/deeplake/core/vectorstore/deepmemory_vectorstore.py +++ /dev/null @@ -1,54 +0,0 @@ -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np - -from deeplake.core.dataset import Dataset -from deeplake.core.vectorstore.deeplake_vectorstore import VectorStore -from deeplake.core.vectorstore.deep_memory import DeepMemory -from deeplake.constants import DEFAULT_DEEPMEMORY_DISTANCE_METRIC - - -class DeepMemoryVectorStore(VectorStore): - def __init__(self, client, *arg, **kwargs): - super().__init__(*arg, **kwargs) - self.deep_memory = DeepMemory( - self.dataset, - token=self.token, - embedding_function=self.embedding_function, - client=client, - creds=self.creds, - logger=self.logger, - ) - - def search( - self, - embedding_data: Union[str, List[str], None] = None, - embedding_function: Optional[Callable] = None, - embedding: Optional[Union[List[float], np.ndarray]] = None, - k: int = 4, - distance_metric: Optional[str] = None, - query: Optional[str] = None, - filter: Optional[Union[Dict, Callable]] = None, - exec_option: Optional[str] = None, - embedding_tensor: str = "embedding", - return_tensors: Optional[List[str]] = None, - return_view: bool = False, - deep_memory: bool = False, - ) -> Union[Dict, Dataset]: - if deep_memory and not distance_metric: - distance_metric = DEFAULT_DEEPMEMORY_DISTANCE_METRIC - - return super().search( - embedding_data=embedding_data, - embedding_function=embedding_function, - embedding=embedding, - k=k, - distance_metric=distance_metric, - query=query, - filter=filter, - exec_option=exec_option, - embedding_tensor=embedding_tensor, - return_tensors=return_tensors, - return_view=return_view, - deep_memory=deep_memory, - ) diff --git a/deeplake/core/vectorstore/embeddings/__init__.py b/deeplake/core/vectorstore/embeddings/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/deeplake/core/vectorstore/embedder.py b/deeplake/core/vectorstore/embeddings/embedder.py similarity index 100% rename from deeplake/core/vectorstore/embedder.py rename to deeplake/core/vectorstore/embeddings/embedder.py diff --git a/deeplake/core/vectorstore/test_embedder.py b/deeplake/core/vectorstore/embeddings/test_embedder.py similarity index 96% rename from deeplake/core/vectorstore/test_embedder.py rename to deeplake/core/vectorstore/embeddings/test_embedder.py index bd9dc74acb..981657d707 100644 --- a/deeplake/core/vectorstore/test_embedder.py +++ b/deeplake/core/vectorstore/embeddings/test_embedder.py @@ -6,7 +6,10 @@ import numpy as np from deeplake.constants import MAX_BYTES_PER_MINUTE, TARGET_BYTE_SIZE -from deeplake.core.vectorstore.embedder import 
DeepLakeEmbedder, chunk_by_bytes +from deeplake.core.vectorstore.embeddings.embedder import ( + DeepLakeEmbedder, + chunk_by_bytes, +) EMBEDDING_DIM = 15 diff --git a/deeplake/core/vectorstore/test_deeplake_vectorstore.py b/deeplake/core/vectorstore/test_deeplake_vectorstore.py index abb7faddf0..6d3ead64f7 100644 --- a/deeplake/core/vectorstore/test_deeplake_vectorstore.py +++ b/deeplake/core/vectorstore/test_deeplake_vectorstore.py @@ -13,9 +13,7 @@ DeepLakeVectorStore, VectorStore, ) -from deeplake.core.vectorstore.deepmemory_vectorstore import DeepMemoryVectorStore -from deeplake.core.vectorstore.embedder import DeepLakeEmbedder -from deeplake.core.vectorstore.vectorstore_factory import vectorstore_factory +from deeplake.core.vectorstore.embeddings.embedder import DeepLakeEmbedder from deeplake.core.vectorstore import utils from deeplake.tests.common import requires_libdeeplake from deeplake.constants import ( @@ -261,7 +259,7 @@ def test_search_basic(local_path, hub_cloud_dev_token): token=hub_cloud_dev_token, ) - assert vector_store.exec_option == "compute_engine" + assert vector_store.dataset_handler.exec_option == "compute_engine" vector_store.add(embedding=embeddings, text=texts, metadata=metadatas) @@ -287,7 +285,13 @@ def test_search_basic(local_path, hub_cloud_dev_token): assert len(data_p["text"]) == 1 assert ( - sum([tensor in data_p.keys() for tensor in vector_store.dataset.tensors]) == 2 + sum( + [ + tensor in data_p.keys() + for tensor in vector_store.dataset_handler.dataset.tensors + ] + ) + == 2 ) # One for each return_tensors assert len(data_p.keys()) == 3 # One for each return_tensors + score @@ -297,7 +301,7 @@ def test_search_basic(local_path, hub_cloud_dev_token): read_only=True, token=hub_cloud_dev_token, ) - assert vector_store_cloud.exec_option == "compute_engine" + assert vector_store_cloud.dataset_handler.exec_option == "compute_engine" # Use indra implementation to search the data data_ce = vector_store_cloud.search( @@ -307,21 +311,26 @@ def test_search_basic(local_path, hub_cloud_dev_token): ) assert len(data_ce["text"]) == 2 assert ( - sum([tensor in data_ce.keys() for tensor in vector_store_cloud.dataset.tensors]) + sum( + [ + tensor in data_ce.keys() + for tensor in vector_store_cloud.dataset_handler.dataset.tensors + ] + ) == 2 ) # One for each return_tensors assert len(data_ce.keys()) == 3 # One for each return_tensors + score with pytest.raises(ValueError): vector_store_cloud.search( - query=f"SELECT * WHERE id=='{vector_store_cloud.dataset.id[0].numpy()[0]}'", + query=f"SELECT * WHERE id=='{vector_store_cloud.dataset_handler.dataset.id[0].numpy()[0]}'", embedding=query_embedding, k=2, return_tensors=["id", "text"], ) # Run a full custom query - test_text = vector_store_cloud.dataset.text[0].data()["value"] + test_text = vector_store_cloud.dataset_handler.dataset.text[0].data()["value"] data_q = vector_store_cloud.search( query=f"select * where text == '{test_text}'", ) @@ -329,9 +338,12 @@ def test_search_basic(local_path, hub_cloud_dev_token): assert len(data_q["text"]) == 1 assert data_q["text"][0] == test_text assert sum( - [tensor in data_q.keys() for tensor in vector_store_cloud.dataset.tensors] + [ + tensor in data_q.keys() + for tensor in vector_store_cloud.dataset_handler.dataset.tensors + ] ) == len( - vector_store_cloud.dataset.tensors + vector_store_cloud.dataset_handler.dataset.tensors ) # One for each tensor - embedding + score # Run a filter query using a json @@ -342,7 +354,13 @@ def test_search_basic(local_path, hub_cloud_dev_token): ) 
assert len(data_e_j["text"]) == 1 assert ( - sum([tensor in data_e_j.keys() for tensor in vector_store.dataset.tensors]) == 2 + sum( + [ + tensor in data_e_j.keys() + for tensor in vector_store.dataset_handler.dataset.tensors + ] + ) + == 2 ) # One for each return_tensors assert len(data_e_j.keys()) == 2 @@ -357,7 +375,13 @@ def filter_fn(x): ) assert len(data_e_f["text"]) == 1 assert ( - sum([tensor in data_e_f.keys() for tensor in vector_store.dataset.tensors]) == 2 + sum( + [ + tensor in data_e_f.keys() + for tensor in vector_store.dataset_handler.dataset.tensors + ] + ) + == 2 ) # One for each return_tensors assert len(data_e_f.keys()) == 2 @@ -368,8 +392,10 @@ def filter_fn(x): k=2, return_tensors=["id", "text"], filter={ - "metadata": vector_store_cloud.dataset.metadata[0].data()["value"], - "text": vector_store_cloud.dataset.text[0].data()["value"], + "metadata": vector_store_cloud.dataset_handler.dataset.metadata[0].data()[ + "value" + ], + "text": vector_store_cloud.dataset_handler.dataset.text[0].data()["value"], }, ) assert len(data_ce_f["text"]) == 1 @@ -377,7 +403,7 @@ def filter_fn(x): sum( [ tensor in data_ce_f.keys() - for tensor in vector_store_cloud.dataset.tensors + for tensor in vector_store_cloud.dataset_handler.dataset.tensors ] ) == 2 @@ -417,7 +443,7 @@ def filter_fn(x): path=local_path, overwrite=True, token=hub_cloud_dev_token, exec_option=None ) - assert vector_store_none_exec.exec_option == "compute_engine" + assert vector_store_none_exec.dataset_handler.exec_option == "compute_engine" # Check that filter_fn with cloud dataset (and therefore "compute_engine" exec option) switches to "python" automatically. with pytest.warns(None): @@ -470,7 +496,7 @@ def filter_fn(x): ) vector_store = DeepLakeVectorStore(path="mem://xyz") - assert vector_store.exec_option == "python" + assert vector_store.dataset_handler.exec_option == "python" vector_store.add(embedding=embeddings, text=texts, metadata=metadatas) data = vector_store.search( @@ -524,7 +550,7 @@ def filter_fn(x): vector_store = DeepLakeVectorStore( path="mem://xyz", embedding_function=openai_embeddings ) - assert vector_store.exec_option == "python" + assert vector_store.dataset_handler.exec_option == "python" vector_store.add(embedding=embeddings, text=texts, metadata=metadatas) result = vector_store.search(embedding_data=["dummy"]) assert len(result) == 4 @@ -540,7 +566,7 @@ def test_index_basic(local_path, hub_cloud_dev_token): token=hub_cloud_dev_token, ) - assert vector_store.distance_metric_index is None + assert vector_store.dataset_handler.distance_metric_index is None # Then test behavior when index is added vector_store = VectorStore( @@ -548,7 +574,7 @@ def test_index_basic(local_path, hub_cloud_dev_token): ) vector_store.add(embedding=embeddings, text=texts, metadata=metadatas) - es = vector_store.dataset.embedding.get_vdb_indexes() + es = vector_store.dataset_handler.dataset.embedding.get_vdb_indexes() assert ( es[0]["distance"] == METRIC_TO_INDEX_METRIC[DEFAULT_VECTORSTORE_DISTANCE_METRIC] @@ -556,25 +582,44 @@ def test_index_basic(local_path, hub_cloud_dev_token): # Then test behavior when index is added previously and the dataset is reloaded vector_store = VectorStore(path=local_path, token=hub_cloud_dev_token) - es = vector_store.dataset.embedding.get_vdb_indexes() + es = vector_store.dataset_handler.dataset.embedding.get_vdb_indexes() assert ( es[0]["distance"] == METRIC_TO_INDEX_METRIC[DEFAULT_VECTORSTORE_DISTANCE_METRIC] ) # Test index with sample updates - pre_update_index = 
vector_store.dataset.embedding.get_vdb_indexes()[0] + pre_update_index = vector_store.dataset_handler.dataset.embedding.get_vdb_indexes()[ + 0 + ] vector_store.add( embedding=[embeddings[0]], text=[texts[0]], metadata=[metadatas[0]] ) - post_update_index = vector_store.dataset.embedding.get_vdb_indexes()[0] + post_update_index = ( + vector_store.dataset_handler.dataset.embedding.get_vdb_indexes()[0] + ) assert pre_update_index == post_update_index + # Test index with sample deletion + pre_delete_index = vector_store.dataset_handler.dataset.embedding.get_vdb_indexes()[ + 0 + ] + vector_store.delete(row_ids=[len(vector_store) - 1]) + post_delete_index = ( + vector_store.dataset_handler.dataset.embedding.get_vdb_indexes()[0] + ) + + assert pre_delete_index == post_delete_index + # Test index with sample updating - pre_update_index = vector_store.dataset.embedding.get_vdb_indexes()[0] + pre_update_index = vector_store.dataset_handler.dataset.embedding.get_vdb_indexes()[ + 0 + ] vector_store.update_embedding(row_ids=[0], embedding_function=embedding_fn) - post_update_index = vector_store.dataset.embedding.get_vdb_indexes()[0] + post_update_index = ( + vector_store.dataset_handler.dataset.embedding.get_vdb_indexes()[0] + ) assert pre_update_index == post_update_index @@ -644,7 +689,7 @@ def test_search_quantitative(distance_metric, hub_cloud_dev_token): filter={"metadata": {"abcdefg": 28}}, ) - test_id = vector_store.dataset.id[0].data()["value"] + test_id = vector_store.dataset_handler.dataset.id[0].data()["value"] data_ce_q = vector_store.search( query=f"select * where id == '{test_id}'", @@ -675,7 +720,7 @@ def test_search_managed(hub_cloud_dev_token): exec_option="tensor_db", ) - assert "vectordb/" in vector_store.dataset.base_storage.path + assert "vectordb/" in vector_store.dataset_handler.dataset.base_storage.path assert len(data_ce["score"]) == len(data_db["score"]) assert all( @@ -708,23 +753,26 @@ def test_delete(local_path, hub_cloud_dev_token): assert_vectorstore_structure(vector_store, 10) # delete the data in the dataset by id: - print(len(vector_store.dataset)) + print(len(vector_store.dataset_handler.dataset)) vector_store.delete(row_ids=[4, 8, 9]) - assert len(vector_store.dataset) == NUMBER_OF_DATA - 3 + assert len(vector_store.dataset_handler.dataset) == NUMBER_OF_DATA - 3 vector_store.delete(filter={"metadata": {"abc": 1}}) - assert len(vector_store.dataset) == NUMBER_OF_DATA - 4 + assert len(vector_store.dataset_handler.dataset) == NUMBER_OF_DATA - 4 vector_store.delete(ids=["7"]) - assert len(vector_store.dataset) == NUMBER_OF_DATA - 5 + assert len(vector_store.dataset_handler.dataset) == NUMBER_OF_DATA - 5 with pytest.raises(ValueError): vector_store.delete() - tensors_before_delete = vector_store.dataset.tensors + tensors_before_delete = vector_store.dataset_handler.dataset.tensors vector_store.delete(delete_all=True) - assert len(vector_store.dataset) == 0 - assert vector_store.dataset.tensors.keys() == tensors_before_delete.keys() + assert len(vector_store.dataset_handler.dataset) == 0 + assert ( + vector_store.dataset_handler.dataset.tensors.keys() + == tensors_before_delete.keys() + ) vector_store.delete_by_path(local_path) dirs = os.listdir("./") @@ -752,7 +800,7 @@ def test_delete(local_path, hub_cloud_dev_token): # delete the data in the dataset by id: vector_store_b.delete(row_ids=[0]) - assert len(vector_store_b.dataset) == NUMBER_OF_DATA - 1 + assert len(vector_store_b.dataset_handler.dataset) == NUMBER_OF_DATA - 1 ds = deeplake.empty(local_path, overwrite=True) 
ds.create_tensor("id", htype="text") @@ -810,21 +858,25 @@ def assert_updated_vector_store( if callable(embedding_function) and isinstance(embedding_tensor, str): np.testing.assert_array_equal( - vector_store.dataset[embedding_tensor][row_ids].numpy(), + vector_store.dataset_handler.dataset[embedding_tensor][row_ids].numpy(), new_embeddings, ) if callable(embedding_function) and isinstance(embedding_tensor, list): for i in range(len(embedding_tensor)): np.testing.assert_array_equal( - vector_store.dataset[embedding_tensor[i]][row_ids].numpy(), + vector_store.dataset_handler.dataset[embedding_tensor[i]][ + row_ids + ].numpy(), new_embeddings[i], ) if isinstance(embedding_function, list) and isinstance(embedding_tensor, list): for i in range(len(embedding_tensor)): np.testing.assert_array_equal( - vector_store.dataset[embedding_tensor[i]][row_ids].numpy(), + vector_store.dataset_handler.dataset[embedding_tensor[i]][ + row_ids + ].numpy(), new_embeddings[i], ) @@ -1210,7 +1262,7 @@ def test_vdb_index_creation(local_path, capsys, hub_cloud_dev_token): vector_store.add(embedding=embeddings, text=texts, id=ids, metadata=metadatas) assert len(vector_store) == number_of_data - assert set(vector_store.dataset.tensors) == set( + assert set(vector_store.dataset_handler.dataset.tensors) == set( [ "embedding", "id", @@ -1228,7 +1280,7 @@ def test_vdb_index_creation(local_path, capsys, hub_cloud_dev_token): ) # Check if the index is recreated properly. - ds = vector_store.dataset + ds = vector_store.dataset_handler.dataset es = ds.embedding.get_vdb_indexes() assert len(es) == 1 assert es[0]["id"] == "hnsw_1" @@ -1281,7 +1333,7 @@ def test_vdb_index_incr_maint(local_path, capsys, hub_cloud_dev_token): vector_store.add(embedding=emb4, text=txt4, id=ids4, metadata=md4) assert len(vector_store) == number_of_data - assert set(vector_store.dataset.tensors) == set( + assert set(vector_store.dataset_handler.dataset.tensors) == set( [ "embedding", "id", @@ -1299,7 +1351,7 @@ def test_vdb_index_incr_maint(local_path, capsys, hub_cloud_dev_token): ) # Check if the index is recreated properly. - ds = vector_store.dataset + ds = vector_store.dataset_handler.dataset es = ds.embedding.get_vdb_indexes() assert len(es) == 1 assert es[0]["id"] == "hnsw_1" @@ -1373,13 +1425,13 @@ def test_vdb_index_incr_maint_extend(local_path, capsys, hub_cloud_dev_token): ) vector_store.add(embedding=emb1, text=txt1, id=ids1, metadata=md1) - ds = vector_store.dataset + ds = vector_store.dataset_handler.dataset ds.extend({"embedding": emb2, "text": txt2, "id": ids2, "metadata": md2}) ds.extend({"embedding": emb3, "text": txt3, "id": ids3, "metadata": md3}) ds.extend({"embedding": emb4, "text": txt4, "id": ids4, "metadata": md4}) assert len(vector_store) == number_of_data - assert set(vector_store.dataset.tensors) == set( + assert set(vector_store.dataset_handler.dataset.tensors) == set( [ "embedding", "id", @@ -1397,7 +1449,7 @@ def test_vdb_index_incr_maint_extend(local_path, capsys, hub_cloud_dev_token): ) # Check if the index is recreated properly. 
- ds = vector_store.dataset + ds = vector_store.dataset_handler.dataset es = ds.embedding.get_vdb_indexes() assert len(es) == 1 assert es[0]["id"] == "hnsw_1" @@ -1474,14 +1526,14 @@ def test_vdb_index_incr_maint_append_pop(local_path, capsys, hub_cloud_dev_token token=hub_cloud_dev_token, ) - ds = vector_store.dataset + ds = vector_store.dataset_handler.dataset ds.append({"embedding": emb1, "text": txt1, "id": ids1, "metadata": md1}) ds.append({"embedding": emb2, "text": txt2, "id": ids2, "metadata": md2}) ds.append({"embedding": emb3, "text": txt3, "id": ids3, "metadata": md3}) ds.append({"embedding": emb4, "text": txt4, "id": ids4, "metadata": md4}) # assert len(vector_store) == number_of_data - assert set(vector_store.dataset.tensors) == set( + assert set(vector_store.dataset_handler.dataset.tensors) == set( [ "embedding", "id", @@ -1583,13 +1635,13 @@ def test_vdb_index_incr_maint_update(local_path, capsys, hub_cloud_dev_token): ) vector_store.add(embedding=emb1, text=txt1, id=ids1, metadata=md1) - ds = vector_store.dataset + ds = vector_store.dataset_handler.dataset ds.append({"embedding": emb2, "text": txt2, "id": ids2, "metadata": md2}) ds.append({"embedding": emb3, "text": txt3, "id": ids3, "metadata": md3}) ds.append({"embedding": emb4, "text": txt4, "id": ids4, "metadata": md4}) # assert len(vector_store) == number_of_data - assert set(vector_store.dataset.tensors) == set( + assert set(vector_store.dataset_handler.dataset.tensors) == set( [ "embedding", "id", @@ -1700,7 +1752,7 @@ def test_vdb_index_incr_maint_tensor_append(local_path, capsys, hub_cloud_dev_to ) vector_store.add(embedding=emb1, text=txt1, id=ids1, metadata=md1) - ds = vector_store.dataset + ds = vector_store.dataset_handler.dataset ds.embedding.append(emb2) ds.embedding.append(emb3) @@ -1708,7 +1760,7 @@ def test_vdb_index_incr_maint_tensor_append(local_path, capsys, hub_cloud_dev_to # ds.embedding[104] = emb5 # assert len(vector_store) == number_of_data - assert set(vector_store.dataset.tensors) == set( + assert set(vector_store.dataset_handler.dataset.tensors) == set( [ "embedding", "id", @@ -1819,7 +1871,7 @@ def test_vdb_index_like(local_path, capsys, hub_cloud_dev_token): def assert_vectorstore_structure(vector_store, number_of_data): assert len(vector_store) == number_of_data - assert set(vector_store.dataset.tensors) == { + assert set(vector_store.dataset_handler.dataset.tensors) == { "embedding", "id", "metadata", @@ -1831,14 +1883,14 @@ def assert_vectorstore_structure(vector_store, number_of_data): "metadata", "text", } - assert vector_store.dataset.embedding.htype == "embedding" - assert vector_store.dataset.id.htype == "text" - assert vector_store.dataset.metadata.htype == "json" - assert vector_store.dataset.text.htype == "text" - assert vector_store.dataset.embedding.dtype == "float32" - assert vector_store.dataset.id.dtype == "str" - assert vector_store.dataset.metadata.dtype == "str" - assert vector_store.dataset.text.dtype == "str" + assert vector_store.dataset_handler.dataset.embedding.htype == "embedding" + assert vector_store.dataset_handler.dataset.id.htype == "text" + assert vector_store.dataset_handler.dataset.metadata.htype == "json" + assert vector_store.dataset_handler.dataset.text.htype == "text" + assert vector_store.dataset_handler.dataset.embedding.dtype == "float32" + assert vector_store.dataset_handler.dataset.id.dtype == "str" + assert vector_store.dataset_handler.dataset.metadata.dtype == "str" + assert vector_store.dataset_handler.dataset.text.dtype == "str" @pytest.mark.slow 
@@ -1918,10 +1970,10 @@ def test_ingestion_images(local_path): ids = vector_store.add(image=images, embedding=embeddings, return_ids=True) - assert "image" in vector_store.dataset.tensors - assert "embedding" in vector_store.dataset.tensors - assert len(vector_store.dataset.image[0].numpy().shape) == 3 - assert len(vector_store.dataset.image[1].numpy().shape) == 3 + assert "image" in vector_store.dataset_handler.dataset.tensors + assert "embedding" in vector_store.dataset_handler.dataset.tensors + assert len(vector_store.dataset_handler.dataset.image[0].numpy().shape) == 3 + assert len(vector_store.dataset_handler.dataset.image[1].numpy().shape) == 3 assert len(ids) == 10 @@ -1937,7 +1989,7 @@ def test_parse_add_arguments(local_path): with pytest.raises(ValueError): # Throw error because embedding_function requires embed_data_from utils.parse_add_arguments( - dataset=deeplake_vector_store.dataset, + dataset=deeplake_vector_store.dataset_handler.dataset, initial_embedding_function=embedding_fn, embedding_function=embedding_fn, embeding_tensor="embedding", @@ -1949,7 +2001,7 @@ def test_parse_add_arguments(local_path): with pytest.raises(ValueError): # Throw error because embedding function is not specified anywhere utils.parse_add_arguments( - dataset=deeplake_vector_store.dataset, + dataset=deeplake_vector_store.dataset_handler.dataset, embedding_data=texts, embeding_tensor="embedding", text=texts, @@ -1960,7 +2012,7 @@ def test_parse_add_arguments(local_path): with pytest.raises(ValueError): # Throw error because data is not specified for all tensors utils.parse_add_arguments( - dataset=deeplake_vector_store.dataset, + dataset=deeplake_vector_store.dataset_handler.dataset, text=texts, id=ids, metadata=metadatas, @@ -1974,7 +2026,7 @@ def test_parse_add_arguments(local_path): embed_data_from, tensors, ) = utils.parse_add_arguments( - dataset=deeplake_vector_store.dataset, + dataset=deeplake_vector_store.dataset_handler.dataset, initial_embedding_function=embedding_fn, embedding_tensor="embedding", text=texts, @@ -1989,7 +2041,7 @@ def test_parse_add_arguments(local_path): embed_data_from, tensors, ) = utils.parse_add_arguments( - dataset=deeplake_vector_store.dataset, + dataset=deeplake_vector_store.dataset_handler.dataset, initial_embedding_function=embedding_fn_dp, text=texts, id=ids, @@ -2009,7 +2061,7 @@ def test_parse_add_arguments(local_path): with pytest.raises(ValueError): # initial embedding function specified and embeding_tensor is not specified utils.parse_add_arguments( - dataset=deeplake_vector_store.dataset, + dataset=deeplake_vector_store.dataset_handler.dataset, initial_embedding_function=embedding_fn_dp, embedding_data=texts, text=texts, @@ -2021,7 +2073,7 @@ def test_parse_add_arguments(local_path): with pytest.raises(ValueError): # Throw error because embedding_function and embedding are specified utils.parse_add_arguments( - dataset=deeplake_vector_store.dataset, + dataset=deeplake_vector_store.dataset_handler.dataset, initial_embedding_function=embedding_fn_dp, embedding_function=embedding_fn_dp, embedding_data=texts, @@ -2035,7 +2087,7 @@ def test_parse_add_arguments(local_path): with pytest.raises(ValueError): # initial_embedding_function is specified and embeding_tensor, embed_data_from and embedding is specified. 
utils.parse_add_arguments( - dataset=deeplake_vector_store.dataset, + dataset=deeplake_vector_store.dataset_handler.dataset, initial_embedding_function=embedding_fn_dp, embedding_tensor="embedding", embedding_data=texts, @@ -2048,7 +2100,7 @@ def test_parse_add_arguments(local_path): with pytest.raises(ValueError): # initial_embedding_function is not specified and embeding_tensor, embed_data_from and embedding is specified. utils.parse_add_arguments( - dataset=deeplake_vector_store.dataset, + dataset=deeplake_vector_store.dataset_handler.dataset, embeding_tensor="embedding", embedding_data=texts, text=texts, @@ -2059,7 +2111,7 @@ def test_parse_add_arguments(local_path): with pytest.raises(ValueError): utils.parse_add_arguments( - dataset=deeplake_vector_store.dataset, + dataset=deeplake_vector_store.dataset_handler.dataset, embedding_function=embedding_fn_dp, initial_embedding_function=embedding_fn_dp, embedding_data=texts, @@ -2076,7 +2128,7 @@ def test_parse_add_arguments(local_path): embedding_tensors, tensors, ) = utils.parse_add_arguments( - dataset=deeplake_vector_store.dataset, + dataset=deeplake_vector_store.dataset_handler.dataset, embedding_function=embedding_fn2_dp, embedding_data=texts, embedding_tensor="embedding", @@ -2097,7 +2149,7 @@ def test_parse_add_arguments(local_path): embedding_tensors, tensors, ) = utils.parse_add_arguments( - dataset=deeplake_vector_store.dataset, + dataset=deeplake_vector_store.dataset_handler.dataset, embedding_function=embedding_fn2_dp, embedding_data="text", embedding_tensor="embedding", @@ -2131,7 +2183,7 @@ def test_parse_add_arguments(local_path): # There are two embedding but an embedding_tensor is not specified, so it's not clear where to add the embedding data with pytest.raises(ValueError): utils.parse_add_arguments( - dataset=deeplake_vector_store.dataset, + dataset=deeplake_vector_store.dataset_handler.dataset, embedding_function=embedding_fn2_dp, embedding_data="text", text=texts, @@ -2154,7 +2206,7 @@ def test_parse_add_arguments(local_path): # There is no embedding tensor, so it's not clear where to add the embedding data with pytest.raises(ValueError): utils.parse_add_arguments( - dataset=deeplake_vector_store.dataset, + dataset=deeplake_vector_store.dataset_handler.dataset, embedding_function=embedding_fn2_dp, embedding_data=texts, text=texts, @@ -2183,7 +2235,7 @@ def test_parse_add_arguments(local_path): embedding_tensors, tensors, ) = utils.parse_add_arguments( - dataset=deeplake_vector_store.dataset, + dataset=deeplake_vector_store.dataset_handler.dataset, embedding_function=embedding_fn2_dp, embedding_data=texts, text=texts, @@ -2204,7 +2256,7 @@ def test_parse_add_arguments(local_path): embedding_tensor, tensors, ) = utils.parse_add_arguments( - dataset=deeplake_vector_store.dataset, + dataset=deeplake_vector_store.dataset_handler.dataset, initial_embedding_function=embedding_fn_dp, text=texts, id=ids, @@ -2227,7 +2279,7 @@ def test_parse_add_arguments(local_path): embedding_tensor, tensors, ) = utils.parse_add_arguments( - dataset=deeplake_vector_store.dataset, + dataset=deeplake_vector_store.dataset_handler.dataset, text=texts, id=ids, metadata=metadatas, @@ -2241,7 +2293,7 @@ def test_parse_add_arguments(local_path): embedding_tensor, tensors, ) = utils.parse_add_arguments( - dataset=deeplake_vector_store.dataset, + dataset=deeplake_vector_store.dataset_handler.dataset, text=texts, id=ids, metadata=metadatas, @@ -2255,7 +2307,7 @@ def test_parse_add_arguments(local_path): embedding_tensor, tensors, ) = 
utils.parse_add_arguments( - dataset=deeplake_vector_store.dataset, + dataset=deeplake_vector_store.dataset_handler.dataset, text=texts, id=ids, metadata=metadatas, @@ -2388,7 +2440,9 @@ def test_multiple_embeddings(local_path): ) # test with initial embedding function - vector_store.embedding_function = DeepLakeEmbedder(embedding_function=embedding_fn) + vector_store.dataset_handler.embedding_function = DeepLakeEmbedder( + embedding_function=embedding_fn + ) vector_store.add( text=texts, embedding_data=[texts, texts], @@ -2411,11 +2465,11 @@ def test_multiple_embeddings(local_path): embedding_2=(embedding_fn3, 25 * _texts), ) - assert len(vector_store.dataset) == 50040 - assert len(vector_store.dataset.embedding_1) == 50040 - assert len(vector_store.dataset.embedding_2) == 50040 - assert len(vector_store.dataset.id) == 50040 - assert len(vector_store.dataset.text) == 50040 + assert len(vector_store.dataset_handler.dataset) == 50040 + assert len(vector_store.dataset_handler.dataset.embedding_1) == 50040 + assert len(vector_store.dataset_handler.dataset.embedding_2) == 50040 + assert len(vector_store.dataset_handler.dataset.id) == 50040 + assert len(vector_store.dataset_handler.dataset.text) == 50040 def test_extend_none(local_path): @@ -2434,11 +2488,11 @@ def test_extend_none(local_path): ) vector_store.add(text=texts, embedding=None, id=ids, metadata=None) - assert len(vector_store.dataset) == 10 - assert len(vector_store.dataset.text) == 10 - assert len(vector_store.dataset.embedding) == 10 - assert len(vector_store.dataset.id) == 10 - assert len(vector_store.dataset.metadata) == 10 + assert len(vector_store.dataset_handler.dataset) == 10 + assert len(vector_store.dataset_handler.dataset.text) == 10 + assert len(vector_store.dataset_handler.dataset.embedding) == 10 + assert len(vector_store.dataset_handler.dataset.id) == 10 + assert len(vector_store.dataset_handler.dataset.metadata) == 10 def test_query_dim(local_path): @@ -2472,9 +2526,9 @@ def test_embeddings_only(local_path): embedding_1=(embedding_fn, texts), embedding_2=(embedding_fn3, texts) ) - assert len(vector_store.dataset) == 10 - assert len(vector_store.dataset.embedding_1) == 10 - assert len(vector_store.dataset.embedding_2) == 10 + assert len(vector_store.dataset_handler.dataset) == 10 + assert len(vector_store.dataset_handler.dataset.embedding_1) == 10 + assert len(vector_store.dataset_handler.dataset.embedding_2) == 10 def test_uuid_fix(local_path): @@ -2484,12 +2538,14 @@ def test_uuid_fix(local_path): vector_store.add(text=texts, id=ids, embedding=embeddings, metadata=metadatas) - assert vector_store.dataset.id.data()["value"] == list(map(str, ids)) + assert vector_store.dataset_handler.dataset.id.data()["value"] == list( + map(str, ids) + ) def test_read_only(): db = VectorStore("hub://davitbun/twitter-algorithm") - assert db.dataset.read_only == True + assert db.dataset_handler.dataset.read_only == True def test_delete_by_path_wrong_path(): @@ -2500,26 +2556,26 @@ def test_delete_by_path_wrong_path(): @requires_libdeeplake def test_exec_option_with_auth(local_path, hub_cloud_path, hub_cloud_dev_token): db = VectorStore(path=local_path) - assert db.exec_option == "python" + assert db.dataset_handler.exec_option == "python" db = VectorStore( path=local_path, token=hub_cloud_dev_token, ) - assert db.exec_option == "compute_engine" + assert db.dataset_handler.exec_option == "compute_engine" db = VectorStore( path=hub_cloud_path, token=hub_cloud_dev_token, ) - assert db.exec_option == "compute_engine" + assert 
db.dataset_handler.exec_option == "compute_engine" db = VectorStore( path=hub_cloud_path + "_tensor_db", token=hub_cloud_dev_token, runtime={"tensor_db": True}, ) - assert db.exec_option == "tensor_db" + assert db.dataset_handler.exec_option == "tensor_db" @requires_libdeeplake @@ -2538,19 +2594,19 @@ def test_exec_option_cli( db = VectorStore( path=local_path, ) - assert db.exec_option == "compute_engine" + assert db.dataset_handler.exec_option == "compute_engine" # hub cloud dataset and logged in with cli db = VectorStore( path=hub_cloud_path, ) - assert db.exec_option == "compute_engine" + assert db.dataset_handler.exec_option == "compute_engine" # hub cloud dataset and logged in with cli db = VectorStore( path="mem://abc", ) - assert db.exec_option == "python" + assert db.dataset_handler.exec_option == "python" # logging out with cli runner.invoke(logout) @@ -2559,7 +2615,7 @@ def test_exec_option_cli( db = VectorStore( path=local_path, ) - assert db.exec_option == "python" + assert db.dataset_handler.exec_option == "python" # Check whether after logging out exec_option changes to python # logging in with cli token @@ -2567,10 +2623,10 @@ def test_exec_option_cli( db = VectorStore( path=local_path, ) - assert db.exec_option == "compute_engine" + assert db.dataset_handler.exec_option == "compute_engine" # logging out with cli runner.invoke(logout) - assert db.exec_option == "python" + assert db.dataset_handler.exec_option == "python" # Check whether after logging out when token specified exec_option doesn't change # logging in with cli token @@ -2579,10 +2635,10 @@ def test_exec_option_cli( path=local_path, token=hub_cloud_dev_token, ) - assert db.exec_option == "compute_engine" + assert db.dataset_handler.exec_option == "compute_engine" # logging out with cli runner.invoke(logout) - assert db.exec_option == "compute_engine" + assert db.dataset_handler.exec_option == "compute_engine" @requires_libdeeplake @@ -2609,30 +2665,26 @@ def test_exec_option_with_connected_datasets( runner.invoke(login, f"-t {hub_cloud_dev_token}") assert db.exec_option == "python" - db.dataset.connect( + db.dataset_handler.dataset.connect( creds_key=hub_cloud_dev_managed_creds_key, dest_path=hub_cloud_path, token=hub_cloud_dev_token, ) - db.dataset.add_creds_key(hub_cloud_dev_managed_creds_key, managed=True) + db.dataset_handler.dataset.add_creds_key( + hub_cloud_dev_managed_creds_key, managed=True + ) assert db.exec_option == "compute_engine" -@pytest.mark.slow -@pytest.mark.parametrize( - "runtime", - ["runtime", None], - indirect=True, -) -@pytest.mark.skipif(sys.platform == "win32", reason="Does not run on Windows") -def test_vectorstore_factory(hub_cloud_dev_token, hub_cloud_path, runtime): - db = vectorstore_factory( - path=hub_cloud_path, - runtime=runtime, - token=hub_cloud_dev_token, +def test_dataset_init_param(local_ds): + local_ds.create_tensor("text", htype="text") + local_ds.create_tensor("embedding", htype="embedding") + local_ds.create_tensor("id", htype="text") + local_ds.create_tensor("metadata", htype="json") + + db = VectorStore( + dataset=local_ds, ) - if runtime is not None: - assert isinstance(db, DeepMemoryVectorStore) - else: - assert isinstance(db, DeepLakeVectorStore) + db.add(text=texts, embedding=embeddings, id=ids, metadata=metadatas) + assert len(db) == 10 diff --git a/deeplake/core/vectorstore/unsupported_deep_memory.py b/deeplake/core/vectorstore/unsupported_deep_memory.py deleted file mode 100644 index 44408486bc..0000000000 --- 
a/deeplake/core/vectorstore/unsupported_deep_memory.py +++ /dev/null @@ -1,39 +0,0 @@ -from deeplake.util.exceptions import DeepMemoryWaitingListError - -from typing import List, Tuple, Optional, Callable, Union, Dict, Any - -import numpy as np - - -class DeepMemory: - """This the class that raises exceptions for users that don't have access to Deep Memory""" - - def __init__(*args, **kwargs): - # Just some initialization to make sure that the class is not empty - pass - - def train( - self, - queries: List[str], - relevance: List[List[Tuple[str, int]]], - embedding_function: Optional[Callable[[str], np.ndarray]] = None, - token: Optional[str] = None, - ) -> str: - raise DeepMemoryWaitingListError() - - def status(self, job_id: str): - raise DeepMemoryWaitingListError() - - def list_jobs(self, debug=False): - raise DeepMemoryWaitingListError() - - def evaluate( - self, - relevance: List[List[Tuple[str, int]]], - queries: List[str], - embedding_function: Optional[Callable[..., List[np.ndarray]]] = None, - embedding: Optional[Union[List[np.ndarray], List[List[float]]]] = None, - top_k: List[int] = [1, 3, 5, 10, 50, 100], - qvs_params: Optional[Dict[str, Any]] = None, - ) -> Dict[str, Dict[str, float]]: - raise DeepMemoryWaitingListError() diff --git a/deeplake/core/vectorstore/vector_search/dataset/dataset.py b/deeplake/core/vectorstore/vector_search/dataset/dataset.py index 0b3fbc7014..2450718767 100644 --- a/deeplake/core/vectorstore/vector_search/dataset/dataset.py +++ b/deeplake/core/vectorstore/vector_search/dataset/dataset.py @@ -555,11 +555,18 @@ def extend_or_ingest_dataset( def convert_id_to_row_id(ids, dataset, search_fn, query, exec_option, filter): if ids is None: delete_view = search_fn( + embedding_data=None, + embedding_function=None, + embedding=None, + distance_metric=None, + embedding_tensor=None, filter=filter, query=query, exec_option=exec_option, + return_tensors=False, return_view=True, k=int(1e9), + deep_memory=False, ) else: diff --git a/deeplake/core/vectorstore/vector_search/dataset/test_dataset.py b/deeplake/core/vectorstore/vector_search/dataset/test_dataset.py index d123fe1073..2775ac9b4a 100644 --- a/deeplake/core/vectorstore/vector_search/dataset/test_dataset.py +++ b/deeplake/core/vectorstore/vector_search/dataset/test_dataset.py @@ -9,7 +9,7 @@ from deeplake.core.vectorstore import utils from deeplake.core.vectorstore.vector_search import dataset as dataset_utils from deeplake.core.vectorstore import DeepLakeVectorStore -from deeplake.core.vectorstore.embedder import DeepLakeEmbedder +from deeplake.core.vectorstore.embeddings.embedder import DeepLakeEmbedder from deeplake.constants import ( DEFAULT_VECTORSTORE_DEEPLAKE_PATH, DEFAULT_VECTORSTORE_TENSORS, diff --git a/deeplake/core/vectorstore/vector_search/utils.py b/deeplake/core/vectorstore/vector_search/utils.py index 17c5bfc87d..d1a31e8ce4 100644 --- a/deeplake/core/vectorstore/vector_search/utils.py +++ b/deeplake/core/vectorstore/vector_search/utils.py @@ -9,7 +9,7 @@ from deeplake.util.warnings import always_warn from deeplake.client.utils import read_token from deeplake.core.dataset import DeepLakeCloudDataset, Dataset -from deeplake.core.vectorstore.embedder import DeepLakeEmbedder +from deeplake.core.vectorstore.embeddings.embedder import DeepLakeEmbedder from deeplake.client.client import DeepLakeBackendClient from deeplake.util.path import get_path_type @@ -637,3 +637,23 @@ def create_embedding_function(embedding_function): embedding_function=embedding_function, ) return None + + +def 
create_and_load_vectorstore(): + from deeplake import VectorStore + + db = VectorStore( + path="local_path", + overwrite=True, + ) + + texts, embeddings, ids, metadata, _ = create_data( + number_of_data=100, embedding_dim=1536, metadata_key="abc" + ) + db.add( + text=texts, + embedding=embeddings, + id=ids, + metadata=metadata, + ) + return db diff --git a/deeplake/core/vectorstore/vectorstore_factory.py b/deeplake/core/vectorstore/vectorstore_factory.py deleted file mode 100644 index f132d4c0c7..0000000000 --- a/deeplake/core/vectorstore/vectorstore_factory.py +++ /dev/null @@ -1,29 +0,0 @@ -from deeplake.core.vectorstore.deeplake_vectorstore import VectorStore -from deeplake.core.vectorstore.deepmemory_vectorstore import DeepMemoryVectorStore -from deeplake.client.client import DeepMemoryBackendClient -from deeplake.util.path import get_path_type - - -def vectorstore_factory( - path, - *args, - **kwargs, -): - path_type = get_path_type(path) - - dm_client = DeepMemoryBackendClient(token=kwargs.get("token")) - user_profile = dm_client.get_user_profile() - - if path_type == "hub": - # TODO: add support for windows - dataset_id = path[6:].split("/")[0] - else: - # TODO: change user_profile to user_id - dataset_id = user_profile["name"] - - deepmemory_is_available = dm_client.deepmemory_is_available(dataset_id) - if deepmemory_is_available: - return DeepMemoryVectorStore( - path=path, client=dm_client, org_id=dataset_id, *args, **kwargs - ) - return VectorStore(path=path, *args, **kwargs) diff --git a/deeplake/integrations/huggingface/huggingface.py b/deeplake/integrations/huggingface/huggingface.py index ddf91ee0fb..e2515abb9b 100644 --- a/deeplake/integrations/huggingface/huggingface.py +++ b/deeplake/integrations/huggingface/huggingface.py @@ -9,7 +9,7 @@ def _is_seq_convertible(seq): - from datasets import Sequence + from datasets import Sequence # type: ignore if isinstance(seq, Sequence): feature = seq.feature diff --git a/deeplake/requirements/tests.txt b/deeplake/requirements/tests.txt index 7300df0788..d2f66c77a8 100644 --- a/deeplake/requirements/tests.txt +++ b/deeplake/requirements/tests.txt @@ -8,9 +8,11 @@ pytest-profiling pytest-threadleak coverage -mypy +mypy==1.7.0 black darglint +pre-commit + typing_extensions>=3.10.0.0 types-requests diff --git a/deeplake/util/check_latest_version.py b/deeplake/util/check_latest_version.py index 50c50cc043..42c806d677 100644 --- a/deeplake/util/check_latest_version.py +++ b/deeplake/util/check_latest_version.py @@ -1,7 +1,7 @@ import json import os import warnings -import requests +import requests # type: ignore from deeplake.client.config import HUB_PYPI_VERSION_PATH from deeplake.core.fast_forwarding import version_compare import time diff --git a/deeplake/visualizer/tests/test_visualizer.py b/deeplake/visualizer/tests/test_visualizer.py index cc2c33ba16..194167aa43 100644 --- a/deeplake/visualizer/tests/test_visualizer.py +++ b/deeplake/visualizer/tests/test_visualizer.py @@ -1,5 +1,5 @@ import numpy as np -import requests +import requests # type: ignore from deeplake.visualizer.visualizer import visualizer from deeplake.tests.dataset_fixtures import * import deeplake as dp diff --git a/docs/source/deeplake.VectorStore.rst b/docs/source/deeplake.VectorStore.rst index 9eb9953fb7..1e50656b04 100644 --- a/docs/source/deeplake.VectorStore.rst +++ b/docs/source/deeplake.VectorStore.rst @@ -1,7 +1,8 @@ deeplake.VectorStore --------------------- +==================== +.. currentmodule:: deeplake.core.vectorstore.deeplake_vectorstore -.. 
autoclass:: deeplake.core.vectorstore.deeplake_vectorstore.VectorStore - :members: - :show-inheritance: - :special-members: __init__ \ No newline at end of file +.. autoclass:: VectorStore() + :members: + + .. automethod:: __init__ \ No newline at end of file diff --git a/docs/source/deeplake.core.vectorstore.deep_memory.rst b/docs/source/deeplake.core.vectorstore.deep_memory.rst index 280680adad..f6bdd85fb6 100644 --- a/docs/source/deeplake.core.vectorstore.deep_memory.rst +++ b/docs/source/deeplake.core.vectorstore.deep_memory.rst @@ -2,7 +2,7 @@ deeplake.core.vectorstore.deep_memory ========================= .. currentmodule:: deeplake.core.vectorstore.deep_memory -VectorStore +DeepMemory ~~~~~~~~~~~ .. autoclass:: DeepMemory() :members: diff --git a/docs/source/deeplake.core.vectorstore.rst b/docs/source/deeplake.core.vectorstore.rst deleted file mode 100644 index aac34e0e88..0000000000 --- a/docs/source/deeplake.core.vectorstore.rst +++ /dev/null @@ -1,11 +0,0 @@ -deeplake.core.vectorstore -========================= -.. currentmodule:: deeplake.core.vectorstore - -VectorStore -~~~~~~~~~~~ -.. autodata:: VectorStore() - :members: - - .. automethod:: __init__ - .. automethod:: __len__ From afd1264d5068e4d6f4cf051f2b9890541b1c1afe Mon Sep 17 00:00:00 2001 From: AdkSarsen Date: Thu, 30 Nov 2023 21:23:53 +0600 Subject: [PATCH 02/16] ux improvements --- .../client_side_dataset_handler.py | 19 +- .../dataset_handlers/dataset_handler.py | 13 +- .../dataset_handlers/dataset_handler_base.py | 4 +- .../managed_side_dataset_handler.py | 3 +- .../vectorstore/deep_memory/deep_memory.py | 63 ++++- .../deep_memory/test_deepmemory.py | 93 ++++--- .../core/vectorstore/deeplake_vectorstore.py | 16 +- .../core/vectorstore/dev_helpers/__init__.py | 0 .../dev_helpers/vectorstore_tools.py | 106 ++++++++ .../vectorstore/test_deeplake_vectorstore.py | 234 ++++++++++++++++- .../vector_search/indra/search_algorithm.py | 242 ++++++++++++------ .../vector_search/indra/vector_search.py | 2 + .../vector_search/python/vector_search.py | 6 + .../core/vectorstore/vector_search/utils.py | 29 +-- .../vector_search/vector_search.py | 3 + deeplake/requirements/tests.txt | 1 + deeplake/tests/path_fixtures.py | 45 +++- 17 files changed, 707 insertions(+), 172 deletions(-) create mode 100644 deeplake/core/vectorstore/dev_helpers/__init__.py create mode 100644 deeplake/core/vectorstore/dev_helpers/vectorstore_tools.py diff --git a/deeplake/core/vectorstore/dataset_handlers/client_side_dataset_handler.py b/deeplake/core/vectorstore/dataset_handlers/client_side_dataset_handler.py index 42b7589ad4..16c7b861e3 100644 --- a/deeplake/core/vectorstore/dataset_handlers/client_side_dataset_handler.py +++ b/deeplake/core/vectorstore/dataset_handlers/client_side_dataset_handler.py @@ -87,7 +87,8 @@ def __init__( self.tensor_params = tensor_params self.distance_metric_index = index_maintenance.index_operation_vectorstore(self) self.deep_memory = DeepMemory( - dataset_or_path=self.path, + dataset=self.dataset, + path=path, token=self.token, logger=self.logger, embedding_function=self.embedding_function, @@ -180,6 +181,7 @@ def search( return_tensors: List[str], return_view: bool, deep_memory: bool, + return_tql: bool, ) -> Union[Dict, Dataset]: feature_report_path( path=self.bugout_reporting_path, @@ -196,6 +198,7 @@ def search( "embedding": True if embedding is not None else False, "return_tensors": return_tensors, "return_view": return_view, + "return_tql": return_tql, }, token=self.token, username=self.username, @@ -261,6 +264,7 @@ def 
search( deep_memory=deep_memory, token=self.token, org_id=self.org_id, + return_tql=return_tql, ) def delete( @@ -338,6 +342,14 @@ def update_embedding( username=self.username, ) + if row_ids and ids: + raise ValueError("Only one of row_ids and ids can be specified.") + elif row_ids and filter: + raise ValueError("Only one of row_ids and filter can be specified.") + + if filter and query: + raise ValueError("Only one of filter and query can be specified.") + ( embedding_function, embedding_source_tensor, @@ -378,13 +390,14 @@ def commit(self, allow_empty: bool = True) -> None: """ self.dataset.commit(allow_empty=allow_empty) - def checkout(self, branch: str = "main") -> None: + def checkout(self, branch: str, create: bool) -> None: """Checkout the Vector Store to a specific branch. Args: branch (str): Branch name to checkout. Defaults to "main". + create (bool): Whether to create the branch if it does not exist. Defaults to False. """ - self.dataset.checkout(branch) + self.dataset.checkout(branch, create=create) def tensors(self): """Returns the list of tensors present in the dataset""" diff --git a/deeplake/core/vectorstore/dataset_handlers/dataset_handler.py b/deeplake/core/vectorstore/dataset_handlers/dataset_handler.py index 9b9d2f760e..9d2a0d91ca 100644 --- a/deeplake/core/vectorstore/dataset_handlers/dataset_handler.py +++ b/deeplake/core/vectorstore/dataset_handlers/dataset_handler.py @@ -7,9 +7,10 @@ def get_dataset_handler(*args, **kwargs): - runtime = kwargs.get("runtime", None) - if runtime and runtime.get("tensor_db", True): - # TODO: change to ManagedSideDH when it's ready - return ClientSideDH(*args, **kwargs) - else: - return ClientSideDH(*args, **kwargs) + # TODO: Use this logic when managed db will be ready + # runtime = kwargs.get("runtime", None) + # if runtime and runtime.get("tensor_db", True): + # return ClientSideDH(*args, **kwargs) + # else: + # return ClientSideDH(*args, **kwargs) + return ClientSideDH(*args, **kwargs) diff --git a/deeplake/core/vectorstore/dataset_handlers/dataset_handler_base.py b/deeplake/core/vectorstore/dataset_handlers/dataset_handler_base.py index 0dcb88f625..366369549b 100644 --- a/deeplake/core/vectorstore/dataset_handlers/dataset_handler_base.py +++ b/deeplake/core/vectorstore/dataset_handlers/dataset_handler_base.py @@ -152,6 +152,7 @@ def search( return_tensors: Optional[List[str]] = None, return_view: bool = False, deep_memory: bool = False, + return_tql_query: bool = False, ) -> Union[Dict, Dataset]: pass @@ -204,11 +205,12 @@ def commit(self, allow_empty: bool = True) -> None: """ raise NotImplementedError() - def checkout(self, branch: str = "main") -> None: + def checkout(self, branch: str, create: bool) -> None: """Checkout the Vector Store to a specific branch. Args: branch (str): Branch name to checkout. Defaults to "main". + create (bool): Whether to create the branch if it does not exist. Defaults to False. Raises: NotImplementedError: This method is not implemented by the base class. 
diff --git a/deeplake/core/vectorstore/dataset_handlers/managed_side_dataset_handler.py b/deeplake/core/vectorstore/dataset_handlers/managed_side_dataset_handler.py index 0e5f60a636..c34969128a 100644 --- a/deeplake/core/vectorstore/dataset_handlers/managed_side_dataset_handler.py +++ b/deeplake/core/vectorstore/dataset_handlers/managed_side_dataset_handler.py @@ -76,7 +76,8 @@ def __init__( ) self.deep_memory = DeepMemory( - dataset_or_path=self.path, + dataset=None, + path=path, token=self.token, logger=self.logger, embedding_function=self.embedding_function, diff --git a/deeplake/core/vectorstore/deep_memory/deep_memory.py b/deeplake/core/vectorstore/deep_memory/deep_memory.py index 2656b8f3d3..8a4f3f4216 100644 --- a/deeplake/core/vectorstore/deep_memory/deep_memory.py +++ b/deeplake/core/vectorstore/deep_memory/deep_memory.py @@ -1,4 +1,5 @@ import logging +import pathlib import uuid from collections import defaultdict from pydantic import BaseModel, ValidationError @@ -8,13 +9,12 @@ import numpy as np import deeplake -from deeplake.enterprise.dataloader import indra_available from deeplake.util.exceptions import ( DeepMemoryWaitingListError, IncorrectRelevanceTypeError, IncorrectQueriesTypeError, ) -from deeplake.util.remove_cache import get_base_storage +from deeplake.util.path import convert_pathlib_to_string_if_needed from deeplake.constants import ( DEFAULT_QUERIES_VECTORSTORE_TENSORS, DEFAULT_MEMORY_CACHE_SIZE, @@ -30,7 +30,6 @@ feature_report_path, ) from deeplake.util.path import get_path_type -from deeplake.util.version_control import load_meta def use_deep_memory(func): @@ -78,7 +77,8 @@ def validate_relevance_and_queries(relevance, queries): class DeepMemory: def __init__( self, - dataset_or_path: Union[Dataset, str], + dataset: Dataset, + path: Union[str, pathlib.Path], logger: logging.Logger, embedding_function: Optional[Any] = None, token: Optional[str] = None, @@ -87,7 +87,7 @@ def __init__( """Based Deep Memory class to train and evaluate models on DeepMemory managed service. Args: - dataset_or_path (Union[Dataset, str]): deeplake dataset object or path. + dataset (Dataset): deeplake dataset object or path. logger (logging.Logger): Logger object. embedding_function (Optional[Any], optional): Embedding funtion class used to convert queries/documents to embeddings. Defaults to None. token (Optional[str], optional): API token for the DeepMemory managed service. Defaults to None. @@ -97,14 +97,8 @@ def __init__( ImportError: if indra is not installed ValueError: if incorrect type is specified for `dataset_or_path` """ - if isinstance(dataset_or_path, Dataset): - self.path = dataset_or_path.path - elif isinstance(dataset_or_path, str): - self.path = dataset_or_path - else: - raise ValueError( - "dataset_or_path should be a Dataset object or a string path" - ) + self.dataset = dataset + self.path = convert_pathlib_to_string_if_needed(path) feature_report_path( path=self.path, @@ -143,7 +137,8 @@ def train( relevance (List[List[Tuple[str, int]]]): List of relevant documents for each query with their respective relevance score. The outer list corresponds to the queries and the inner list corresponds to the doc_id, relevence_score pair for each query. doc_id is the document id in the corpus dataset. It is stored in the `id` tensor of the corpus dataset. - relevence_score is the relevance score of the document for the query. The range is between 0 and 1, where 0 stands for not relevant and 1 stands for relevant. 
+ relevence_score is the relevance score of the document for the query. The value is either 0 and 1, where 0 stands for not relevant (unknown relevance) + and 1 stands for relevant. Currently, only values of 1 contribute to the training, and there is no reason to provide examples with relevance of 0. embedding_function (Optional[Callable[[str], np.ndarray]], optional): Embedding funtion used to convert queries to embeddings. Defaults to None. token (str, optional): API token for the DeepMemory managed service. Defaults to None. @@ -554,6 +549,46 @@ def evaluate( self.queries_dataset.commit() return recalls + @access_control + def get_model(self): + """Get the name of the model currently being used by DeepMemory managed service.""" + return self.dataset.embedding.info["deepmemory"]["model.npy"]["job_id"] + + @access_control + def set_model(self, model_name: str): + """Set model.npy to use `model_name` instead of default model + Args: + model_name (str): name of the model to use + """ + + if "npy" not in model_name: + model_name += ".npy" + + # verify model_name + self._verify_model_name(model_name) + + # set model.npy to use `model_name` instead of default model + self._set_model_npy(model_name) + + def _verify_model_name(self, model_name: str): + if model_name not in self.dataset.embedding.info["deepmemory"]: + raise ValueError( + "Invalid model name. Please choose from the following models: " + + ", ".join(self.dataset.embedding.info["deepmemory"].keys()) + ) + + def _set_model_npy(self, model_name: str): + # get new model.npy + new_model_npy = self.dataset.embedding.info["deepmemory"][model_name] + + # get old deepmemory dictionary and update it: + old_deepmemory = self.dataset.embedding.info["deepmemory"] + new_deepmemory = old_deepmemory.copy() + new_deepmemory.update({"model.npy": new_model_npy}) + + # assign new deepmemory dictionary to the dataset: + self.dataset.embedding.info["deepmemory"] = new_deepmemory + def _get_dm_client(self): path = self.path path_type = get_path_type(path) diff --git a/deeplake/core/vectorstore/deep_memory/test_deepmemory.py b/deeplake/core/vectorstore/deep_memory/test_deepmemory.py index 0a458974fa..d18a1b85d1 100644 --- a/deeplake/core/vectorstore/deep_memory/test_deepmemory.py +++ b/deeplake/core/vectorstore/deep_memory/test_deepmemory.py @@ -3,6 +3,7 @@ import pytest import sys from time import sleep +from unittest.mock import MagicMock import deeplake from deeplake import VectorStore @@ -528,35 +529,6 @@ def test_deepmemory_status(capsys, job_id, corpus_query_pair_path, hub_cloud_dev assert status.out[511:] == output_str[511:] -@pytest.mark.slow -@pytest.mark.skipif(sys.platform == "win32", reason="Does not run on Windows") -def test_deepmemory_search( - corpus_query_relevances_copy, - testing_relevance_query_deepmemory, - hub_cloud_dev_token, -): - corpus, _, _, _ = corpus_query_relevances_copy - relevance, query_embedding = testing_relevance_query_deepmemory - - db = VectorStore( - path=corpus, - runtime={"tensor_db": True}, - token=hub_cloud_dev_token, - ) - - output = db.search( - embedding=query_embedding, deep_memory=True, return_tensors=["id"] - ) - - assert len(output["id"]) == 4 - assert relevance in output["id"] - - output = db.search(embedding=query_embedding) - assert len(output["id"]) == 4 - assert relevance not in output["id"] - # TODO: add some logging checks - - @pytest.mark.skipif(sys.platform == "win32", reason="Does not run on Windows") @pytest.mark.slow @requires_libdeeplake @@ -660,3 +632,66 @@ def 
test_not_supported_training_args(corpus_query_relevances_copy, hub_cloud_dev queries=queries, relevance="relevances", ) + + +def test_deepmemory_v2_set_model_should_set_model_for_all_subsequent_loads( + local_dmv2_dataset, + hub_cloud_dev_token, +): + # Setiing model should set model for all subsequent loads + db = VectorStore(path=local_dmv2_dataset, token=hub_cloud_dev_token) + assert db.deep_memory.get_model() == "655f86e8ab93e7fc5067a3ac_2" + + # ensure after setting model, get model returns specified model + db.deep_memory.set_model("655f86e8ab93e7fc5067a3ac_1") + + assert ( + db.dataset.embedding.info["deepmemory"]["model.npy"]["job_id"] + == "655f86e8ab93e7fc5067a3ac_1" + ) + assert db.deep_memory.get_model() == "655f86e8ab93e7fc5067a3ac_1" + + # ensure after setting model, reloading the dataset returns the same model + db = VectorStore(path=local_dmv2_dataset, token=hub_cloud_dev_token) + assert db.deep_memory.get_model() == "655f86e8ab93e7fc5067a3ac_1" + + +@pytest.mark.slow +@pytest.mark.skipif(sys.platform == "win32", reason="Does not run on Windows") +def test_deepmemory_search_should_contain_correct_answer( + corpus_query_relevances_copy, + testing_relevance_query_deepmemory, + hub_cloud_dev_token, +): + corpus, _, _, _ = corpus_query_relevances_copy + relevance, query_embedding = testing_relevance_query_deepmemory + + db = VectorStore( + path=corpus, + token=hub_cloud_dev_token, + ) + + output = db.search( + embedding=query_embedding, deep_memory=True, return_tensors=["id"] + ) + assert len(output["id"]) == 4 + assert relevance in output["id"] + + +@pytest.mark.slow +@pytest.mark.skipif(sys.platform == "win32", reason="Does not run on Windows") +def test_deeplake_search_should_not_contain_correct_answer( + corpus_query_relevances_copy, + testing_relevance_query_deepmemory, + hub_cloud_dev_token, +): + corpus, _, _, _ = corpus_query_relevances_copy + relevance, query_embedding = testing_relevance_query_deepmemory + + db = VectorStore( + path=corpus, + token=hub_cloud_dev_token, + ) + output = db.search(embedding=query_embedding) + assert len(output["id"]) == 4 + assert relevance not in output["id"] diff --git a/deeplake/core/vectorstore/deeplake_vectorstore.py b/deeplake/core/vectorstore/deeplake_vectorstore.py index a57dcac768..9512869775 100644 --- a/deeplake/core/vectorstore/deeplake_vectorstore.py +++ b/deeplake/core/vectorstore/deeplake_vectorstore.py @@ -128,13 +128,7 @@ def __init__( **kwargs, ) - self.deep_memory = DeepMemory( - dataset_or_path=self.dataset_handler.path, - token=self.dataset_handler.token, - logger=logger, - embedding_function=embedding_function, - creds=self.dataset_handler.creds, - ) + self.deep_memory = self.dataset_handler.deep_memory def add( self, @@ -238,6 +232,7 @@ def search( return_tensors: Optional[List[str]] = None, return_view: bool = False, deep_memory: bool = False, + return_tql: bool = False, ) -> Union[Dict, Dataset]: """VectorStore search method that combines embedding search, metadata search, and custom TQL search. @@ -288,6 +283,7 @@ def search( return_view (bool): Return a Deep Lake dataset view that satisfied the search parameters, instead of a dictionary with data. Defaults to False. If ``True`` return_tensors is set to "*" beucase data is lazy-loaded and there is no cost to including all tensors in the view. deep_memory (bool): Whether to use the Deep Memory model for improving search results. Defaults to False if deep_memory is not specified in the Vector Store initialization. 
If True, the distance metric is set to "deepmemory_distance", which represents the metric with which the model was trained. The search is performed using the Deep Memory model. If False, the distance metric is set to "COS" or whatever distance metric user specifies. + return_tql (bool): Whether to return the TQL query string used for the search. Defaults to False. .. # noqa: DAR101 @@ -313,6 +309,7 @@ def search( return_tensors=return_tensors, return_view=return_view, deep_memory=deep_memory, + return_tql=return_tql, ) def delete( @@ -480,13 +477,14 @@ def commit(self, allow_empty: bool = True) -> None: """ self.dataset_handler.commit(allow_empty=allow_empty) - def checkout(self, branch: str = "main") -> None: + def checkout(self, branch: str = "main", create=False) -> None: """Checkout the Vector Store to a specific branch. Args: branch (str): Branch name to checkout. Defaults to "main". + create (bool): Whether to create the branch if it doesn't exist. Defaults to False. """ - self.dataset_handler.checkout(branch) + self.dataset_handler.checkout(branch, create=create) def tensors(self): """Returns the list of tensors present in the dataset""" diff --git a/deeplake/core/vectorstore/dev_helpers/__init__.py b/deeplake/core/vectorstore/dev_helpers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/deeplake/core/vectorstore/dev_helpers/vectorstore_tools.py b/deeplake/core/vectorstore/dev_helpers/vectorstore_tools.py new file mode 100644 index 0000000000..11a5cd8896 --- /dev/null +++ b/deeplake/core/vectorstore/dev_helpers/vectorstore_tools.py @@ -0,0 +1,106 @@ +import deeplake +from typing import Dict, List, Optional, Tuple +from deeplake.core.vectorstore.vector_search.utils import create_data + + +def create_and_load_vectorstore(): + from deeplake import VectorStore + + db = VectorStore( + path="local_path", + overwrite=True, + ) + + texts, embeddings, ids, metadata, _ = create_data( + number_of_data=100, embedding_dim=1536, metadata_key="abc" + ) + db.add( + text=texts, + embedding=embeddings, + id=ids, + metadata=metadata, + ) + return db + + +def train_deepmemory_model( + dataset_name: str = f"hub://activeloop-test/scifact", + corpus: Optional[Dict] = None, + relevenace: Optional[List[List[Tuple[str, int]]]] = None, + queries: Optional[List[str]] = None, + token: Optional[str] = None, + overwrite: bool = False, + enviroment: str = "staging", +): + from deeplake import VectorStore + from langchain.embeddings.openai import OpenAIEmbeddings # type: ignore + + if enviroment == "staging": + deeplake.client.config.USE_STAGING_ENVIRONMENT = True + elif enviroment == "dev": + deeplake.client.config.USE_DEV_ENVIRONMENT = True + + embedding_function = OpenAIEmbeddings() + if corpus is None: + if ( + not deeplake.exists(dataset_name, token=token, creds={}) + or overwrite == True + ): + deeplake.deepcopy( + f"hub://activeloop-test/deepmemory_test_corpus", + dataset_name, + token=token, + overwrite=True, + runtime={"tensor_db": True}, + ) + + db = VectorStore( + dataset_name, + token=token, + embedding_function=embedding_function, + ) + else: + db = VectorStore( + dataset_name, + token=token, + overwrite=True, + embedding_function=embedding_function, + ) + db.add(**corpus) + + query_vs = None + + if relevenace is None: + query_vs = VectorStore( + path=f"hub://activeloop-test/deepmemory_test_queries", + runtime={"tensor_db": True}, + token=token, + ) + relevance = query_vs.dataset.metadata.data()["value"] + + if queries is None: + if not query_vs: + query_vs = VectorStore( + 
path=f"hub://activeloop-test/deepmemory_test_queries", + runtime={"tensor_db": True}, + token=token, + ) + queries = query_vs.dataset.text.data()["value"] + + db.deep_memory.train( + relevance=relevance, + queries=queries, + ) + return db + + +def set_backend(backend="prod"): + if backend == "staging": + deeplake.client.config.USE_STAGING_ENVIRONMENT = True + deeplake.client.config.USE_DEV_ENVIRONMENT = False + elif backend == "dev": + deeplake.client.config.USE_DEV_ENVIRONMENT = True + deeplake.client.config.USE_STAGING_ENVIRONMENT = False + else: + deeplake.client.config.USE_DEV_ENVIRONMENT = False + deeplake.client.config.USE_STAGING_ENVIRONMENT = False diff --git a/deeplake/core/vectorstore/test_deeplake_vectorstore.py b/deeplake/core/vectorstore/test_deeplake_vectorstore.py index 6d3ead64f7..a1b9dfc623 100644 --- a/deeplake/core/vectorstore/test_deeplake_vectorstore.py +++ b/deeplake/core/vectorstore/test_deeplake_vectorstore.py @@ -4,6 +4,7 @@ from math import isclose from functools import partial from typing import List +from unittest.mock import patch import numpy as np import pytest @@ -881,15 +882,47 @@ def assert_updated_vector_store( ) +# TODO: refactor this method: +# 1. Split this method into multiple methods +# 2. use create_and_populate_vs, update_vs_with_new_emb_fn to make these tests more readable +# 3. create one fixture for these nested fixtures @requires_libdeeplake @pytest.mark.parametrize( - "ds, vector_store_hash_ids, vector_store_row_ids, vector_store_filters, vector_store_query", + "ds, vector_store_hash_ids, vector_store_row_ids, vector_store_filters, vector_store_query, hub_cloud_dev_token", [ - ("local_auth_ds", "vector_store_hash_ids", None, None, None), - ("local_auth_ds", None, "vector_store_row_ids", None, None), - ("local_auth_ds", None, None, "vector_store_filter_udf", None), - ("local_auth_ds", None, None, "vector_store_filters", None), - ("hub_cloud_ds", None, None, None, "vector_store_query"), + ( + "local_auth_ds", + "vector_store_hash_ids", + None, + None, + None, + "hub_cloud_dev_token", + ), + ( + "local_auth_ds", + None, + "vector_store_row_ids", + None, + None, + "hub_cloud_dev_token", + ), + ( + "local_auth_ds", + None, + None, + "vector_store_filter_udf", + None, + "hub_cloud_dev_token", + ), + ( + "local_auth_ds", + None, + None, + "vector_store_filters", + None, + "hub_cloud_dev_token", + ), + ("hub_cloud_ds", None, None, None, "vector_store_query", "hub_cloud_dev_token"), ], indirect=True, ) @@ -903,6 +936,7 @@ def test_update_embedding( vector_store_filters, vector_store_query, init_embedding_function, + hub_cloud_dev_token, ): if vector_store_filters == "filter_udf": vector_store_filters = filter_udf @@ -919,7 +953,7 @@ def test_update_embedding( exec_option="compute_engine", embedding_function=init_embedding_function, index_params={"threshold": 10}, - token=ds.token, + token=hub_cloud_dev_token, ) # add data to the dataset: @@ -1242,6 +1276,102 @@ def test_update_embedding( vector_store.delete_by_path(path + "_multi", token=ds.token) +def create_and_populate_vs( + path, + token=None, + overwrite=True, + verbose=False, + exec_option="compute_engine", + index_params={"threshold": -1}, + number_of_data=NUMBER_OF_DATA, + runtime=None, +): + # if runtime specified and tensor_db is enabled, then set exec_option to None + if runtime and runtime.get("tensor_db", False): + exec_option = None + + vector_store = DeepLakeVectorStore( + path=path, + overwrite=overwrite, + verbose=verbose, + exec_option=exec_option, + index_params=index_params, + 
token=token, + runtime=runtime, + ) + + utils.create_data(number_of_data=number_of_data, embedding_dim=EMBEDDING_DIM) + + # add data to the dataset: + metadatas[1:6] = [{"a": 1} for _ in range(5)] + vector_store.add(id=ids, embedding=embeddings, text=texts, metadata=metadatas) + return vector_store + + +def update_vs_with_new_emb_fn( + vs, + new_embedding_value, + ids=None, + row_ids=None, + filter=None, + query=None, + embedding_source_tensor="text", + embedding_tensor="embedding", +): + embedding_fn = get_embedding_function(embedding_value=new_embedding_value) + + vs.update_embedding( + ids=ids, + row_ids=row_ids, + filter=filter, + query=query, + embedding_function=embedding_fn, + embedding_source_tensor=embedding_source_tensor, + embedding_tensor=embedding_tensor, + ) + + +def test_update_embedding_ids_and_row_ids_specified( + local_path, + vector_store_hash_ids, + vector_store_row_ids, + hub_cloud_dev_token, +): + vector_store = create_and_populate_vs( + local_path, + token=hub_cloud_dev_token, + ) + + with pytest.raises(ValueError): + update_vs_with_new_emb_fn( + vector_store, + new_embedding_value=100, + ids=vector_store_hash_ids, + row_ids=vector_store_row_ids, + ) + + +@requires_libdeeplake +def test_update_embedding_query_and_filter_specified( + local_path, + vector_store_filters, + vector_store_query, + hub_cloud_dev_token, +): + vector_store = create_and_populate_vs( + local_path, + token=hub_cloud_dev_token, + ) + + with pytest.raises(ValueError): + update_vs_with_new_emb_fn( + vector_store, + new_embedding_value=100, + filter=vector_store_filters, + query=vector_store_query, + ) + + @requires_libdeeplake def test_vdb_index_creation(local_path, capsys, hub_cloud_dev_token): number_of_data = 1000 @@ -2688,3 +2818,93 @@ def test_dataset_init_param(local_ds): db.add(text=texts, embedding=embeddings, id=ids, metadata=metadatas) assert len(db) == 10 + + +def test_vs_commit(local_path): + # TODO: add index params, when index will support commit + db = create_and_populate_vs( + local_path, number_of_data=NUMBER_OF_DATA, index_params=None + ) + db.checkout("branch_1", create=True) + db.commit("commit_1") + db.add(text=texts, embedding=embeddings, id=ids, metadata=metadatas) + assert len(db) == 2 * NUMBER_OF_DATA + + db.checkout("main") + assert len(db) == NUMBER_OF_DATA + + +def test_vs_init_when_both_dataset_and_path_is_specified(local_path): + with pytest.raises(ValueError): + VectorStore( + path=local_path, + dataset=deeplake.empty(local_path, overwrite=True), + ) + + +def test_vs_init_when_both_dataset_and_path_are_not_specified(): + with pytest.raises(ValueError): + VectorStore() + + +def test_vs_init_with_emptyt_token(local_path): + with patch("deeplake.client.config.DEEPLAKE_AUTH_TOKEN", ""): + db = VectorStore( + path=local_path, + ) + + assert db.dataset_handler.username == "public" + + +@pytest.fixture +def mock_search_managed(mocker): + # Replace SearchManaged with a mock + mock_class = mocker.patch( + "deeplake.core.vectorstore.vector_search.indra.search_algorithm.SearchManaged" + ) + return mock_class + + +@pytest.fixture +def mock_search_indra(mocker): + # Replace SearchIndra with a mock + mock_class = mocker.patch( + "deeplake.core.vectorstore.vector_search.indra.search_algorithm.SearchIndra" + ) + return mock_class + + +@pytest.mark.slow +def test_db_search_with_managed_db_should_instantiate_SearchManaged_class( + mock_search_managed, hub_cloud_path, hub_cloud_dev_token +): + # using interaction test to ensure that the search managed class is executed + db = 
create_and_populate_vs( + hub_cloud_path, + runtime={"tensor_db": True}, + token=hub_cloud_dev_token, + ) + + # Perform the search + db.search(embedding=query_embedding) + + # Assert that SearchManaged was instantiated + mock_search_managed.assert_called() + + +@pytest.mark.slow +@requires_libdeeplake +def test_db_search_should_instantiate_SearchIndra_class( + mock_search_indra, hub_cloud_path, hub_cloud_dev_token +): + # using interaction test to ensure that the search indra class is executed + db = create_and_populate_vs( + hub_cloud_path, + token=hub_cloud_dev_token, + ) + + # Perform the search + db.search(embedding=query_embedding) + + # Assert that SearchIndra was instantiated + mock_search_indra.assert_called() diff --git a/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py b/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py index 71a8a1f353..133a572fe6 100644 --- a/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py +++ b/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py @@ -1,4 +1,5 @@ import numpy as np +from abc import ABC, abstractmethod from typing import Union, Dict, List, Optional from deeplake.core.vectorstore.vector_search.indra import query @@ -9,6 +10,157 @@ from deeplake.enterprise.util import raise_indra_installation_error +class SearchBasic(ABC): + def __init__( + self, + deeplake_dataset: DeepLakeDataset, + org_id: Optional[str] = None, + token: Optional[str] = None, + runtime: Optional[Dict] = None, + deep_memory: bool = False, + ): + """Base class for all search algorithms. + Args: + deeplake_dataset (DeepLakeDataset): DeepLake dataset object. + org_id (Optional[str], optional): Organization ID, is needed only for local datasets. Defaults to None. + token (Optional[str], optional): Token used for authentication. Defaults to None. + runtime (Optional[Dict], optional): Whether to run query on managed_db or indra. Defaults to None. + deep_memory (bool): Use DeepMemory for the search. Defaults to False. 
+ """ + self.deeplake_dataset = deeplake_dataset + self.org_id = org_id + self.token = token + self.runtime = runtime + self.deep_memory = deep_memory + + def run( + self, + tql_string: str, + return_view: bool, + return_tql: bool, + distance_metric: str, + k: int, + query_embedding: Union[np.ndarray, List[float]], + embedding_tensor: str, + tql_filter: str, + return_tensors: List[str], + ): + tql_query = self._create_tql_string( + tql_string, + distance_metric, + k, + query_embedding, + embedding_tensor, + tql_filter, + return_tensors, + ) + view = self._get_view( + tql_query, + runtime=self.runtime, + ) + + if return_view: + return view + + return_data = self._collect_return_data(view) + + if return_tql: + return {"data": return_data, "tql": tql_query} + return return_data + + @abstractmethod + def _collect_return_data( + self, + view: DeepLakeDataset, + ): + pass + + @staticmethod + def _create_tql_string( + tql_string: str, + distance_metric: str, + k: int, + query_embedding: Union[np.ndarray, List[float]], + embedding_tensor: str, + tql_filter: str, + return_tensors: List[str], + ): + """Creates TQL query string for the vector search.""" + if tql_string: + return tql_string + else: + return query.parse_query( + distance_metric, + k, + query_embedding, + embedding_tensor, + tql_filter, + return_tensors, + ) + + @abstractmethod + def _get_view(self, tql_query: str, runtime: Optional[Dict] = None): + pass + + +class SearchIndra(SearchBasic): + def _get_view(self, tql_query, runtime: Optional[Dict] = None): + indra_dataset = self._get_indra_dataset() + indra_view = indra_dataset.query(tql_query) + view = DeepLakeQueryDataset( + deeplake_ds=self.deeplake_dataset, indra_ds=indra_view + ) + view._tql_query = tql_query + return view + + def _get_indra_dataset(self): + try: + from indra import api # type: ignore + + INDRA_INSTALLED = True + except ImportError: + INDRA_INSTALLED = False + pass + + if not INDRA_INSTALLED: + raise raise_indra_installation_error(indra_import_error=None) + + if self.deeplake_dataset.libdeeplake_dataset is not None: + indra_dataset = self.deeplake_dataset.libdeeplake_dataset + else: + if self.org_id is not None: + self.deeplake_dataset.org_id = self.org_id + if self.token is not None: + self.deeplake_dataset.set_token(self.token) + + indra_dataset = dataset_to_libdeeplake(self.deeplake_dataset) + return indra_dataset + + def _collect_return_data( + self, + view: DeepLakeDataset, + ): + return_data = {} + for tensor in view.tensors: + return_data[tensor] = utils.parse_tensor_return(view[tensor]) + return return_data + + +class SearchManaged(SearchBasic): + def _get_view(self, tql_query, runtime: Optional[Dict] = None): + view, data = self.deeplake_dataset.query( + tql_query, runtime=runtime, return_data=True + ) + self.data = data + return view + + def _collect_return_data( + self, + view: DeepLakeDataset, + ): + return self.data + + def search( query_embedding: np.ndarray, distance_metric: str, @@ -23,6 +175,7 @@ def search( deep_memory: bool = False, token: Optional[str] = None, org_id: Optional[str] = None, + return_tql: bool = False, ) -> Union[Dict, DeepLakeDataset]: """Generalized search algorithm that uses indra. It combines vector search and other TQL queries. @@ -40,6 +193,7 @@ def search( deep_memory (bool): Use DeepMemory for the search. Defaults to False. token (Optional[str], optional): Token used for authentication. Defaults to None. org_id (Optional[str], optional): Organization ID, is needed only for local datasets. Defaults to None. 
+ return_tql (bool): Return TQL query used for the search. Defaults to False. Raises: ValueError: If both tql_string and tql_filter are specified. @@ -48,76 +202,20 @@ def search( Returns: Union[Dict, DeepLakeDataset]: Dictionary where keys are tensor names and values are the results of the search, or a Deep Lake dataset view. """ - try: - from indra import api # type: ignore - - INDRA_INSTALLED = True - except ImportError: - INDRA_INSTALLED = False - pass - - if tql_string: - tql_query = tql_string + searcher: SearchBasic + if runtime and runtime.get("db_engine", False): + searcher = SearchManaged(deeplake_dataset, org_id, token, runtime=runtime) else: - tql_query = query.parse_query( - distance_metric, - k, - query_embedding, - embedding_tensor, - tql_filter, - return_tensors, - ) - - if deep_memory: - if not INDRA_INSTALLED: - raise raise_indra_installation_error(indra_import_error=None) - - if deeplake_dataset.libdeeplake_dataset is not None: - indra_dataset = deeplake_dataset.libdeeplake_dataset - else: - if org_id is not None: - deeplake_dataset.org_id = org_id - if token is not None: - deeplake_dataset.set_token(token) - - indra_dataset = dataset_to_libdeeplake(deeplake_dataset) - api.tql.prepare_deepmemory_metrics(indra_dataset) - - indra_view = indra_dataset.query(tql_query) - - view = DeepLakeQueryDataset(deeplake_ds=deeplake_dataset, indra_ds=indra_view) - view._tql_query = tql_query - - if return_view: - return view - - return_data = {} - for tensor in view.tensors: - return_data[tensor] = utils.parse_tensor_return(view[tensor]) - elif runtime and runtime.get("db_engine", False): - view, data = deeplake_dataset.query( - tql_query, runtime=runtime, return_data=True - ) - if return_view: - return view - - return_data = data - else: - if not INDRA_INSTALLED: - raise raise_indra_installation_error( - indra_import_error=None - ) # pragma: no cover - - view = deeplake_dataset.query( - tql_query, - runtime=runtime, - ) - - if return_view: - return view - - return_data = {} - for tensor in view.tensors: - return_data[tensor] = utils.parse_tensor_return(view[tensor]) - - return return_data + searcher = SearchIndra(deeplake_dataset, org_id, token) + + return searcher.run( + tql_string=tql_string, + return_view=return_view, + return_tql=return_tql, + distance_metric=distance_metric, + k=k, + query_embedding=query_embedding, + embedding_tensor=embedding_tensor, + tql_filter=tql_filter, + return_tensors=return_tensors, + ) diff --git a/deeplake/core/vectorstore/vector_search/indra/vector_search.py b/deeplake/core/vectorstore/vector_search/indra/vector_search.py index b0e80cb019..6b94fcb599 100644 --- a/deeplake/core/vectorstore/vector_search/indra/vector_search.py +++ b/deeplake/core/vectorstore/vector_search/indra/vector_search.py @@ -21,6 +21,7 @@ def vector_search( deep_memory, token, org_id, + return_tql, ) -> Union[Dict, DeepLakeDataset]: try: from indra import api # type: ignore @@ -58,4 +59,5 @@ def vector_search( deep_memory=deep_memory, token=token, org_id=org_id, + return_tql=return_tql, ) diff --git a/deeplake/core/vectorstore/vector_search/python/vector_search.py b/deeplake/core/vectorstore/vector_search/python/vector_search.py index e632b5e5a1..cfbf186b48 100644 --- a/deeplake/core/vectorstore/vector_search/python/vector_search.py +++ b/deeplake/core/vectorstore/vector_search/python/vector_search.py @@ -21,12 +21,18 @@ def vector_search( deep_memory, token, org_id, + return_tql, ) -> Union[Dict, DeepLakeDataset]: if query is not None: raise NotImplementedError( f"User-specified 
TQL queries are not supported for exec_option={exec_option} " ) + if return_tql: + raise NotImplementedError( + f"return_tql is not supported for exec_option={exec_option}" + ) + view = filter_utils.attribute_based_filtering_python(dataset, filter) return_data = {} diff --git a/deeplake/core/vectorstore/vector_search/utils.py b/deeplake/core/vectorstore/vector_search/utils.py index d1a31e8ce4..7447cb87d2 100644 --- a/deeplake/core/vectorstore/vector_search/utils.py +++ b/deeplake/core/vectorstore/vector_search/utils.py @@ -1,8 +1,11 @@ import functools -import time import types +import random +import string from abc import ABC, abstractmethod +from typing import Optional, List, Dict, Tuple +import deeplake from deeplake.constants import MB, DEFAULT_VECTORSTORE_INDEX_PARAMS, TARGET_BYTE_SIZE from deeplake.enterprise.util import raise_indra_installation_error from deeplake.util.exceptions import TensorDoesNotExistError @@ -15,10 +18,6 @@ import numpy as np -import jwt -import random -import string -from typing import Optional, List, Dict EXEC_OPTION_TO_RUNTIME: Dict[str, Optional[Dict]] = { "compute_engine": None, @@ -637,23 +636,3 @@ def create_embedding_function(embedding_function): embedding_function=embedding_function, ) return None - - -def create_and_load_vectorstore(): - from deeplake import VectorStore - - db = VectorStore( - path="local_path", - overwrite=True, - ) - - texts, embeddings, ids, metadata, _ = create_data( - number_of_data=100, embedding_dim=1536, metadata_key="abc" - ) - db.add( - text=texts, - embedding=embeddings, - id=ids, - metadata=metadata, - ) - return db diff --git a/deeplake/core/vectorstore/vector_search/vector_search.py b/deeplake/core/vectorstore/vector_search/vector_search.py index 4606fd1257..bf7a3fdc38 100644 --- a/deeplake/core/vectorstore/vector_search/vector_search.py +++ b/deeplake/core/vectorstore/vector_search/vector_search.py @@ -31,6 +31,7 @@ def search( deep_memory: bool = False, token: Optional[str] = None, org_id: Optional[str] = None, + return_tql: bool = False, ) -> Union[Dict, DeepLakeDataset]: """Searching function Args: @@ -53,6 +54,7 @@ def search( deep_memory (bool): Use DeepMemory for the search. Defaults to False. token (Optional[str], optional): Token used for authentication. Defaults to None. org_id (Optional[str], optional): Organization ID, is needed only for local datasets. Defaults to None. + return_tql (bool): Return TQL query used for the search. Defaults to False. 
""" return EXEC_OPTION_TO_SEARCH_TYPE[exec_option]( query=query, @@ -69,4 +71,5 @@ def search( deep_memory=deep_memory, token=token, org_id=org_id, + return_tql=return_tql, ) diff --git a/deeplake/requirements/tests.txt b/deeplake/requirements/tests.txt index d2f66c77a8..373fa63000 100644 --- a/deeplake/requirements/tests.txt +++ b/deeplake/requirements/tests.txt @@ -2,6 +2,7 @@ pytest pytest-cases pytest-benchmark pytest-cov +pytest-mock pytest-timeout pytest-rerunfailures pytest-profiling diff --git a/deeplake/tests/path_fixtures.py b/deeplake/tests/path_fixtures.py index 1e5dc5874c..cbda99035f 100644 --- a/deeplake/tests/path_fixtures.py +++ b/deeplake/tests/path_fixtures.py @@ -716,27 +716,32 @@ def dataframe_ingestion_data(): @pytest.fixture def vector_store_hash_ids(request): - return [f"{i}" for i in range(5)] + if getattr(request, "param", True): + return [f"{i}" for i in range(5)] @pytest.fixture def vector_store_row_ids(request): - return [i for i in range(5)] + if getattr(request, "param", True): + return [i for i in range(5)] @pytest.fixture def vector_store_filter_udf(request): - return "filter_udf" + if getattr(request, "param", True): + return "filter_udf" @pytest.fixture def vector_store_filters(request): - return {"a": 1} + if getattr(request, "param", True): + return {"a": 1} @pytest.fixture def vector_store_query(request): - return "select * where metadata=={'a': 1}" + if getattr(request, "param", True): + return "select * where metadata=={'a': 1}" @pytest.fixture @@ -784,3 +789,33 @@ def precomputed_jobs_list(): with open(os.path.join(parent, "precomputed_jobs_list.txt"), "r") as f: jobs = f.read() return jobs + + +@pytest.fixture +def local_dmv2_dataset(request, hub_cloud_dev_token): + dmv2_path = f"hub://{HUB_CLOUD_DEV_USERNAME}/dmv2" + + local_cache_path = ".deeplake_tests_cache/" + if not os.path.exists(local_cache_path): + os.mkdir(local_cache_path) + + dataset_cache_path = local_cache_path + "dmv2" + if not os.path.exists(dataset_cache_path): + deeplake.deepcopy( + dmv2_path, + dataset_cache_path, + token=hub_cloud_dev_token, + overwrite=True, + ) + + corpus = _get_storage_path(request, LOCAL) + + deeplake.deepcopy( + dataset_cache_path, + corpus, + token=hub_cloud_dev_token, + overwrite=True, + ) + yield corpus + + delete_if_exists(corpus, hub_cloud_dev_token) From 0e495e43718ae0208ebe2f2decb570ed787d2214 Mon Sep 17 00:00:00 2001 From: AdkSarsen Date: Tue, 5 Dec 2023 16:20:16 +0600 Subject: [PATCH 03/16] removing duplicate arguments --- deeplake/client/managed/__init__.py | 0 deeplake/client/managed/managed_client.py | 186 ----------- deeplake/client/managed/models.py | 30 -- .../managed_side_dataset_handler.py | 305 ------------------ .../vectorstore/deep_memory/deep_memory.py | 7 - .../deep_memory/test_deepmemory.py | 29 ++ .../vectorstore/test_deeplake_vectorstore.py | 67 ++-- deeplake/tests/path_fixtures.py | 4 - 8 files changed, 63 insertions(+), 565 deletions(-) delete mode 100644 deeplake/client/managed/__init__.py delete mode 100644 deeplake/client/managed/managed_client.py delete mode 100644 deeplake/client/managed/models.py delete mode 100644 deeplake/core/vectorstore/dataset_handlers/managed_side_dataset_handler.py diff --git a/deeplake/client/managed/__init__.py b/deeplake/client/managed/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/deeplake/client/managed/managed_client.py b/deeplake/client/managed/managed_client.py deleted file mode 100644 index 0b4089ea68..0000000000 --- a/deeplake/client/managed/managed_client.py 
+++ /dev/null @@ -1,186 +0,0 @@ -import numpy as np -from typing import Callable, Dict, List, Any, Optional, Union - -from deeplake.client.client import DeepLakeBackendClient -from deeplake.client.utils import ( - check_response_status, -) -from deeplake.client.config import ( - GET_VECTORSTORE_SUMMARY_SUFFIX, - INIT_VECTORSTORE_SUFFIX, - DELETE_VECTORSTORE_SUFFIX, - VECTORSTORE_ADD_SUFFIX, - VECTORSTORE_REMOVE_ROWS_SUFFIX, - VECTORSTORE_SEARCH_SUFFIX, -) - -from deeplake.client.managed.models import ( - VectorStoreSummaryResponse, - VectorStoreInitResponse, - VectorStoreSearchResponse, - VectorStoreAddResponse, -) - - -class ManagedServiceClient(DeepLakeBackendClient): - def _preprocess_embedding(self, embedding: Union[List[float], np.ndarray, None]): - if embedding is not None and isinstance(embedding, np.ndarray): - return embedding.tolist() - return embedding - - def init_vectorstore( - self, - path: str, - overwrite: Optional[bool] = None, - tensor_params: Optional[List[Dict[str, Any]]] = None, - ): - response = self.request( - method="POST", - relative_url=INIT_VECTORSTORE_SUFFIX, - json={ - "dataset": path, - "overwrite": overwrite, - "tensor_params": tensor_params, - }, - ) - data = response.json() - - return VectorStoreInitResponse( - status_code=response.status_code, - path=data["path"], - summary=data["summary"], - length=data["length"], - tensors=data["tensors"], - exists=data.get("exists", False), - ) - - def delete_vectorstore(self, path: str, force: bool = False): - response = self.request( - method="DELETE", - relative_url=DELETE_VECTORSTORE_SUFFIX, - json={"dataset": path, "force": force}, - ) - check_response_status(response) - - def get_vectorstore_summary(self, path: str): - org_id, dataset_id = path[6:].split("/") - response = self.request( - method="GET", - relative_url=GET_VECTORSTORE_SUMMARY_SUFFIX.format(org_id, dataset_id), - ) - check_response_status(response) - data = response.json() - - return VectorStoreSummaryResponse( - status_code=response.status_code, - summary=data["summary"], - length=data["length"], - tensors=data["tensors"], - ) - - def vectorstore_search( - self, - path: str, - embedding: Optional[Union[List[float], np.ndarray]] = None, - k: int = 4, - distance_metric: Optional[str] = None, - query: Optional[str] = None, - filter: Optional[Dict[str, str]] = None, - embedding_tensor: str = "embedding", - return_tensors: Optional[List[str]] = None, - deep_memory: bool = False, - ): - response = self.request( - method="POST", - relative_url=VECTORSTORE_SEARCH_SUFFIX, - json={ - "dataset": path, - "embedding": self._preprocess_embedding(embedding), - "k": k, - "distance_metric": distance_metric, - "query": query, - "filter": filter, - "embedding_tensor": embedding_tensor, - "return_tensors": return_tensors, - "deep_memory": deep_memory, - }, - ) - check_response_status(response) - data = response.json() - - return VectorStoreSearchResponse( - status_code=response.status_code, - length=data["length"], - data=data["data"], - ) - - def vectorstore_add( - self, - path: str, - processed_tensors: List[Dict[str, List[Any]]], - rate_limiter: Optional[Dict[str, Any]] = None, - batch_byte_size: Optional[int] = None, - return_ids: bool = False, - ): - rest_api_tensors = [] - for tensor in processed_tensors: - for key, value in tensor.items(): - tensor[key] = self._preprocess_embedding(value) - rest_api_tensors.append(tensor) - - response = self.request( - method="POST", - relative_url=VECTORSTORE_ADD_SUFFIX, - json={ - "dataset": path, - "data": rest_api_tensors, - 
"rate_limiter": rate_limiter, - "batch_byte_size": batch_byte_size, - "return_ids": return_ids, - }, - ) - check_response_status(response) - data = response.json().get("result", {}) - - return VectorStoreAddResponse( - status_code=response.status_code, ids=data.get("ids") - ) - - def vectorstore_remove_rows( - self, - path: str, - indices: Optional[List[int]] = None, - ids: Optional[List[str]] = None, - filter: Optional[Dict[str, str]] = None, - query: Optional[str] = None, - delete_all: bool = False, - ): - response = self.request( - method="POST", - relative_url=VECTORSTORE_REMOVE_ROWS_SUFFIX, - json={ - "dataset": path, - "indices": indices, - "ids": ids, - "filter": filter, - "query": query, - "delete_all": delete_all, - }, - ) - check_response_status(response) - - def vectorstore_update_embeddings( - self, - path: str, - row_ids: List[str], - ids: List[str], - filter: Union[Dict, Callable], - query: str, - embedding_function: Union[Callable, List[Callable]], - embedding_source_tensor: Union[str, List[str]], - embedding_tensor: Union[str, List[str]], - ): - """ - TODO: implement - """ - pass diff --git a/deeplake/client/managed/models.py b/deeplake/client/managed/models.py deleted file mode 100644 index 99c4b61785..0000000000 --- a/deeplake/client/managed/models.py +++ /dev/null @@ -1,30 +0,0 @@ -from typing import NamedTuple, Dict, List, Optional, Any - - -class VectorStoreSummaryResponse(NamedTuple): - status_code: int - summary: str - length: int - tensors: List[ - Dict[str, Any] - ] # Same format as `tensor_params` in `init_vectorstore` - - -class VectorStoreInitResponse(NamedTuple): - status_code: int - path: str - summary: str - length: int - tensors: List[Dict[str, Any]] - exists: bool - - -class VectorStoreSearchResponse(NamedTuple): - status_code: int - length: int - data: Dict[str, List[Any]] - - -class VectorStoreAddResponse(NamedTuple): - status_code: int - ids: Optional[List[str]] = None diff --git a/deeplake/core/vectorstore/dataset_handlers/managed_side_dataset_handler.py b/deeplake/core/vectorstore/dataset_handlers/managed_side_dataset_handler.py deleted file mode 100644 index c34969128a..0000000000 --- a/deeplake/core/vectorstore/dataset_handlers/managed_side_dataset_handler.py +++ /dev/null @@ -1,305 +0,0 @@ -import logging -import pathlib -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np - -from deeplake.client.managed.managed_client import ManagedServiceClient -from deeplake.client.utils import read_token -from deeplake.constants import MAX_BYTES_PER_MINUTE, TARGET_BYTE_SIZE -from deeplake.core.dataset import Dataset -from deeplake.core.vectorstore.dataset_handlers.dataset_handler_base import DHBase -from deeplake.core.vectorstore.deep_memory.deep_memory import ( - DeepMemory, - use_deep_memory, -) -from deeplake.core.vectorstore import utils -from deeplake.util.bugout_reporter import feature_report_path -from deeplake.util.path import convert_pathlib_to_string_if_needed, get_path_type - - -class ManagedSideDH(DHBase): - def __init__( - self, - path: Union[str, pathlib.Path], - dataset: Dataset, - tensor_params: List[Dict[str, object]], - embedding_function: Any, - read_only: bool, - ingestion_batch_size: int, - index_params: Dict[str, Union[int, str]], - num_workers: int, - exec_option: str, - token: str, - overwrite: bool, - verbose: bool, - runtime: Dict, - creds: Union[Dict, str], - org_id: str, - logger: logging.Logger, - branch: str, - **kwargs: Any, - ): - if embedding_function is not None: - raise NotImplementedError( - 
"ManagedVectorStore does not support embedding_function for now." - ) - - super().__init__( - path=path, - dataset=dataset, - tensor_params=tensor_params, - embedding_function=embedding_function, - read_only=read_only, - ingestion_batch_size=ingestion_batch_size, - index_params=index_params, - num_workers=num_workers, - exec_option=exec_option, - token=token, - overwrite=overwrite, - verbose=True, - runtime=runtime, - creds=creds, - org_id=org_id, - logger=logger, - **kwargs, - ) - if get_path_type(self.path) != "hub": - raise ValueError( - "ManagedVectorStore can only be initialized with a Deep Lake Cloud path." - ) - self.client = ManagedServiceClient(token=self.token) - self.client.init_vectorstore( - path=self.bugout_reporting_path, - overwrite=overwrite, - tensor_params=tensor_params, - ) - - self.deep_memory = DeepMemory( - dataset=None, - path=path, - token=self.token, - logger=self.logger, - embedding_function=self.embedding_function, - creds=self.creds, - ) - - def add( - self, - embedding_function: Union[Callable, List[Callable]], - embedding_data: Union[List, List[List]], - embedding_tensor: Union[str, List[str]], - return_ids: bool, - rate_limiter: Dict, - **tensors, - ) -> Optional[List[str]]: - feature_report_path( - path=self.bugout_reporting_path, - feature_name="vs.add", - parameters={ - "tensors": list(tensors.keys()) if tensors else None, - "embedding_tensor": embedding_tensor, - "return_ids": return_ids, - "embedding_function": True if embedding_function is not None else False, - "embedding_data": True if embedding_data is not None else False, - "managed": True, - }, - token=self.token, - username=self.username, - ) - - if embedding_function is not None or embedding_data is not None: - raise NotImplementedError( - "Embedding function is not supported for ManagedVectorStore. Please send precaculated embeddings." 
- ) - - ( - embedding_function, - embedding_data, - embedding_tensor, - tensors, - ) = utils.parse_tensors_kwargs( - tensors, embedding_function, embedding_data, embedding_tensor - ) - - processed_tensors = { - t: tensors[t].tolist() if isinstance(tensors[t], np.ndarray) else tensors[t] - for t in tensors - } - utils.check_length_of_each_tensor(processed_tensors) - - response = self.client.vectorstore_add( - path=self.path, - processed_tensors=processed_tensors, - rate_limiter=rate_limiter, - return_ids=return_ids, - ) - - if return_ids: - return response.ids - - @use_deep_memory - def search( - self, - embedding_data: Union[str, List[str]], - embedding_function: Optional[Callable], - embedding: Union[List[float], np.ndarray], - k: int, - distance_metric: str, - query: str, - filter: Union[Dict, Callable], - embedding_tensor: str, - return_tensors: List[str], - return_view: bool, - deep_memory: bool, - exec_option: Optional[str] = "tensor_db", - ) -> Union[Dict, Dataset]: - feature_report_path( - path=self.bugout_reporting_path, - feature_name="vs.search", - parameters={ - "embedding_data": True if embedding_data is not None else False, - "embedding_function": True if embedding_function is not None else False, - "k": k, - "distance_metric": distance_metric, - "query": query[0:100] if query is not None else False, - "filter": True if filter is not None else False, - "embedding_tensor": embedding_tensor, - "embedding": True if embedding is not None else False, - "return_tensors": return_tensors, - "return_view": return_view, - "managed": True, - }, - token=self.token, - username=self.username, - ) - - if exec_option != "tensor_db": - raise ValueError("Manged db vectorstore only supports tensor_db execution.") - - if embedding_data is not None or embedding_function is not None: - raise NotImplementedError( - "ManagedVectorStore does not support embedding_function search. Please pass a precalculated embedding." - ) - - if filter is not None and not isinstance(filter, dict): - raise NotImplementedError( - "Only Filter Dictionary is supported for the ManagedVectorStore." - ) - - if return_view: - raise NotImplementedError( - "return_view is not supported for the ManagedVectorStore." - ) - - response = self.client.vectorstore_search( - path=self.path, - embedding=embedding, - k=k, - distance_metric=distance_metric, - query=query, - filter=filter, - embedding_tensor=embedding_tensor, - return_tensors=return_tensors, - deep_memory=deep_memory, - ) - return response.data - - def delete( - self, - row_ids: List[int], - ids: List[str], - filter: Union[Dict, Callable], - query: str, - exec_option: str, - delete_all: bool, - ) -> bool: - feature_report_path( - path=self.bugout_reporting_path, - feature_name="vs.delete", - parameters={ - "ids": True if ids is not None else False, - "row_ids": True if row_ids is not None else False, - "query": query[0:100] if query is not None else False, - "filter": True if filter is not None else False, - "delete_all": delete_all, - "managed": True, - }, - token=self.token, - username=self.username, - ) - - if filter is not None and not isinstance(filter, dict): - raise NotImplementedError( - "Only Filter Dictionary is supported for the ManagedVectorStore." 
- ) - - if exec_option is not None and exec_option != "tensor_db": - raise ValueError("Manged db vectorstore only supports tensor_db execution.") - - self.client.vectorstore_remove_rows( - path=self.bugout_reporting_path, - indices=row_ids, - ids=ids, - filter=filter, - query=query, - delete_all=delete_all, - ) - return True - - def update_embedding( - self, - row_ids: List[str], - ids: List[str], - filter: Union[Dict, Callable], - query: str, - exec_option: str, - embedding_function: Union[Callable, List[Callable]], - embedding_source_tensor: Union[str, List[str]], - embedding_tensor: Union[str, List[str]], - ): - feature_report_path( - path=self.bugout_reporting_path, - feature_name="vs.delete", - parameters={ - "ids": True if ids is not None else False, - "row_ids": True if row_ids is not None else False, - "query": query[0:100] if query is not None else False, - "filter": True if filter is not None else False, - "managed": True, - }, - token=self.token, - username=self.username, - ) - - if filter is not None and not isinstance(filter, dict): - raise NotImplementedError( - "Only Filter Dictionary is supported for the ManagedVectorStore." - ) - - self.client.vectorstore_update_embeddings( - path=self.bugout_reporting_path, - embedding_function=embedding_function, - embedding_source_tensor=embedding_source_tensor, - embedding_tensor=embedding_tensor, - row_ids=row_ids, - ids=ids, - filter=filter, - query=query, - ) - - def _get_summary(self): - """Returns a summary of the Managed Vector Store.""" - return self.client.get_vectorstore_summary(self.path) - - def tensors(self): - """Returns the list of tensors present in the dataset""" - return [t["name"] for t in self._get_summary().tensors] - - def summary(self): - """Prints a summary of the dataset""" - print(self._get_summary().summary) - - def __len__(self): - """Length of the dataset""" - return self._get_summary().length diff --git a/deeplake/core/vectorstore/deep_memory/deep_memory.py b/deeplake/core/vectorstore/deep_memory/deep_memory.py index e33d3ec872..1222a784f0 100644 --- a/deeplake/core/vectorstore/deep_memory/deep_memory.py +++ b/deeplake/core/vectorstore/deep_memory/deep_memory.py @@ -21,7 +21,6 @@ DEFAULT_MEMORY_CACHE_SIZE, DEFAULT_LOCAL_CACHE_SIZE, DEFAULT_DEEPMEMORY_DISTANCE_METRIC, - DEFAULT_DEEPMEMORY_DISTANCE_METRIC, ) from deeplake.util.storage import get_storage_and_cache_chain from deeplake.core.dataset import Dataset @@ -153,8 +152,6 @@ def train( """ from deeplake.core.vectorstore.deeplake_vectorstore import VectorStore - from deeplake.core.vectorstore.deeplake_vectorstore import VectorStore - self.logger.info("Starting DeepMemory training job") feature_report_path( path=self.path, @@ -170,7 +167,6 @@ def train( # TODO: Support for passing query_embeddings directly without embedding function corpus_path = self.path - corpus_path = self.path queries_path = corpus_path + "_queries" if embedding_function is None and self.embedding_function is None: @@ -325,13 +321,11 @@ def list_jobs(self, debug=False): token=self.token, ) _, storage = get_storage_and_cache_chain( - path=self.path, path=self.path, db_engine={"tensor_db": True}, read_only=False, creds=self.creds, token=self.token, - token=self.token, memory_cache_size=DEFAULT_MEMORY_CACHE_SIZE, local_cache_size=DEFAULT_LOCAL_CACHE_SIZE, ) @@ -339,7 +333,6 @@ def list_jobs(self, debug=False): response = self.client.list_jobs( dataset_path=self.path, - dataset_path=self.path, ) response_status_schema = JobResponseStatusSchema(response=response) diff --git 
a/deeplake/core/vectorstore/deep_memory/test_deepmemory.py b/deeplake/core/vectorstore/deep_memory/test_deepmemory.py index d18a1b85d1..9778b0f413 100644 --- a/deeplake/core/vectorstore/deep_memory/test_deepmemory.py +++ b/deeplake/core/vectorstore/deep_memory/test_deepmemory.py @@ -529,6 +529,35 @@ def test_deepmemory_status(capsys, job_id, corpus_query_pair_path, hub_cloud_dev assert status.out[511:] == output_str[511:] +@pytest.mark.slow +@pytest.mark.skipif(sys.platform == "win32", reason="Does not run on Windows") +def test_deepmemory_search( + corpus_query_relevances_copy, + testing_relevance_query_deepmemory, + hub_cloud_dev_token, +): + corpus, _, _, _ = corpus_query_relevances_copy + relevance, query_embedding = testing_relevance_query_deepmemory + + db = VectorStore( + path=corpus, + runtime={"tensor_db": True}, + token=hub_cloud_dev_token, + ) + + output = db.search( + embedding=query_embedding, deep_memory=True, return_tensors=["id"] + ) + + assert len(output["id"]) == 4 + assert relevance in output["id"] + + output = db.search(embedding=query_embedding) + assert len(output["id"]) == 4 + assert relevance not in output["id"] + # TODO: add some logging checks + + @pytest.mark.skipif(sys.platform == "win32", reason="Does not run on Windows") @pytest.mark.slow @requires_libdeeplake diff --git a/deeplake/core/vectorstore/test_deeplake_vectorstore.py b/deeplake/core/vectorstore/test_deeplake_vectorstore.py index 55afd3b62e..66b56cb8a6 100644 --- a/deeplake/core/vectorstore/test_deeplake_vectorstore.py +++ b/deeplake/core/vectorstore/test_deeplake_vectorstore.py @@ -882,10 +882,6 @@ def assert_updated_vector_store( ) -# TODO: refactor this method: -# 1. Split this method into multiple methods -# 2. use create_and_populate_vs, update_vs_with_new_emb_fn to make these tests more readable -# 3. create one fixture for these nested fixtures # TODO: refactor this method: # 1. Split this method into multiple methods (1 test per 1 behavior) # 2. 
use create_and_populate_vs to make these tests more readable @@ -1329,67 +1325,72 @@ def create_and_populate_vs( return vector_store -def update_vs_with_new_emb_fn( - vs, - new_embedding_value, - ids=None, - row_ids=None, - filter=None, - query=None, - embedding_source_tensor="text", - embedding_tensor="embedding", +def test_update_embedding_row_ids_and_ids_specified_should_throw_exception( + local_path, + vector_store_hash_ids, + vector_store_row_ids, + hub_cloud_dev_token, ): - embedding_fn = get_embedding_function(embedding_value=new_embedding_value) - - vs.update_embedding( - ids=ids, - row_ids=row_ids, - filter=filter, - query=query, - embedding_function=embedding_fn, - embedding_source_tensor=embedding_source_tensor, - embedding_tensor=embedding_tensor, + # specifying both row_ids and ids during update embedding should throw an exception + # initializing vectorstore and populating it: + vector_store = create_and_populate_vs( + local_path, + token=hub_cloud_dev_token, ) + embedding_fn = get_embedding_function() + # calling update_embedding with both ids and row_ids being specified + with pytest.raises(ValueError): + vector_store.update_embedding( + ids=vector_store_hash_ids, + row_ids=vector_store_row_ids, + embedding_function=embedding_fn, + ) -def test_update_embedding_ids_and_row_ids_specified( + +def test_update_embedding_row_ids_and_filter_specified_should_throw_exception( local_path, - vector_store_hash_ids, + vector_store_filters, vector_store_row_ids, hub_cloud_dev_token, ): + # specifying both row_ids and filter during update embedding should throw an exception + # initializing vectorstore and populating it: vector_store = create_and_populate_vs( local_path, token=hub_cloud_dev_token, ) + embedding_fn = get_embedding_function() with pytest.raises(ValueError): - update_vs_with_new_emb_fn( - vector_store, - new_embedding_value=100, - ids=vector_store_hash_ids, + vector_store.update_embedding( row_ids=vector_store_row_ids, + filter=vector_store_filters, + embedding_function=embedding_fn, ) @requires_libdeeplake -def test_update_embedding_query_and_filter_specified( +def test_update_embedding_query_and_filter_specified_should_throw_exception( local_path, vector_store_filters, vector_store_query, hub_cloud_dev_token, ): + # initializing vectorstore and populating it: vector_store = create_and_populate_vs( local_path, token=hub_cloud_dev_token, ) + embedding_fn = get_embedding_function() + + # calling update_embedding with both query and filter being specified with pytest.raises(ValueError): - update_vs_with_new_emb_fn( - vector_store, - new_embedding_value=100, + vector_store.update_embedding( filter=vector_store_filters, query=vector_store_query, + embedding_function=embedding_fn, ) diff --git a/deeplake/tests/path_fixtures.py b/deeplake/tests/path_fixtures.py index 90b027497f..179e4724b3 100644 --- a/deeplake/tests/path_fixtures.py +++ b/deeplake/tests/path_fixtures.py @@ -718,16 +718,12 @@ def dataframe_ingestion_data(): def vector_store_hash_ids(request): if getattr(request, "param", True): return [f"{i}" for i in range(5)] - if getattr(request, "param", True): - return [f"{i}" for i in range(5)] @pytest.fixture def vector_store_row_ids(request): if getattr(request, "param", True): return [i for i in range(5)] - if getattr(request, "param", True): - return [i for i in range(5)] @pytest.fixture From 32870d00f727ff23da9a22da184a71b78e840372 Mon Sep 17 00:00:00 2001 From: AdkSarsen Date: Tue, 5 Dec 2023 16:41:44 +0600 Subject: [PATCH 04/16] Adding return_tql to ClientSideDH --- 
.../vectorstore/dataset_handlers/client_side_dataset_handler.py | 2 ++ .../core/vectorstore/vector_search/python/test_vector_search.py | 1 + 2 files changed, 3 insertions(+) diff --git a/deeplake/core/vectorstore/dataset_handlers/client_side_dataset_handler.py b/deeplake/core/vectorstore/dataset_handlers/client_side_dataset_handler.py index f587d8b5fc..e248c5b35d 100644 --- a/deeplake/core/vectorstore/dataset_handlers/client_side_dataset_handler.py +++ b/deeplake/core/vectorstore/dataset_handlers/client_side_dataset_handler.py @@ -168,6 +168,7 @@ def search( return_tensors: List[str], return_view: bool, deep_memory: bool, + return_tql: bool, ) -> Union[Dict, Dataset]: feature_report_path( path=self.bugout_reporting_path, @@ -210,6 +211,7 @@ def search( exec_option=exec_option, embedding_tensor=embedding_tensor, return_tensors=return_tensors, + return_tql=return_tql, ) return_tensors = utils.parse_return_tensors( diff --git a/deeplake/core/vectorstore/vector_search/python/test_vector_search.py b/deeplake/core/vectorstore/vector_search/python/test_vector_search.py index da18ec955f..9c9b8b2974 100644 --- a/deeplake/core/vectorstore/vector_search/python/test_vector_search.py +++ b/deeplake/core/vectorstore/vector_search/python/test_vector_search.py @@ -29,6 +29,7 @@ def test_vector_search(): deep_memory=False, token=None, org_id=None, + return_tql=False, ) assert len(data["score"]) == 10 From 1269ea912a9bc3cb39b67514d123aae38303563f Mon Sep 17 00:00:00 2001 From: AdkSarsen Date: Tue, 5 Dec 2023 17:25:30 +0600 Subject: [PATCH 05/16] mypy fix --- .../core/vectorstore/vector_search/indra/search_algorithm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py b/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py index 133a572fe6..d93e09242e 100644 --- a/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py +++ b/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py @@ -80,7 +80,7 @@ def _create_tql_string( tql_string: str, distance_metric: str, k: int, - query_embedding: Union[np.ndarray, List[float]], + query_embedding: np.ndarray, embedding_tensor: str, tql_filter: str, return_tensors: List[str], From cc66a2586c120c78fa640edeb6325992c53e00a1 Mon Sep 17 00:00:00 2001 From: AdkSarsen Date: Tue, 5 Dec 2023 17:56:29 +0600 Subject: [PATCH 06/16] fixing failing fast tests --- deeplake/core/vectorstore/vector_search/dataset/dataset.py | 1 + .../vectorstore/vector_search/python/test_vector_search.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/deeplake/core/vectorstore/vector_search/dataset/dataset.py b/deeplake/core/vectorstore/vector_search/dataset/dataset.py index 2450718767..ffa3f88b0d 100644 --- a/deeplake/core/vectorstore/vector_search/dataset/dataset.py +++ b/deeplake/core/vectorstore/vector_search/dataset/dataset.py @@ -567,6 +567,7 @@ def convert_id_to_row_id(ids, dataset, search_fn, query, exec_option, filter): return_view=True, k=int(1e9), deep_memory=False, + return_tql=False, ) else: diff --git a/deeplake/core/vectorstore/vector_search/python/test_vector_search.py b/deeplake/core/vectorstore/vector_search/python/test_vector_search.py index 9c9b8b2974..2c8f1ac4ba 100644 --- a/deeplake/core/vectorstore/vector_search/python/test_vector_search.py +++ b/deeplake/core/vectorstore/vector_search/python/test_vector_search.py @@ -50,6 +50,7 @@ def test_vector_search(): deep_memory=False, token=None, org_id=None, + return_tql=False, ) data = vector_search.vector_search( @@ 
-67,6 +68,7 @@ def test_vector_search(): deep_memory=False, token=None, org_id=None, + return_tql=False, ) assert len(data) == 10 @@ -88,4 +90,5 @@ def test_vector_search(): deep_memory=False, token=None, org_id=None, + return_tql=False, ) From 11633efd4c94895520a44eb5deb0327d517e34ef Mon Sep 17 00:00:00 2001 From: AdkSarsen Date: Tue, 5 Dec 2023 22:32:02 +0600 Subject: [PATCH 07/16] fixing mypy --- deeplake/core/vectorstore/deep_memory/deep_memory.py | 1 - deeplake/core/vectorstore/deep_memory/test_deepmemory.py | 4 +--- .../core/vectorstore/vector_search/indra/search_algorithm.py | 2 +- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/deeplake/core/vectorstore/deep_memory/deep_memory.py b/deeplake/core/vectorstore/deep_memory/deep_memory.py index 1222a784f0..56aed4df48 100644 --- a/deeplake/core/vectorstore/deep_memory/deep_memory.py +++ b/deeplake/core/vectorstore/deep_memory/deep_memory.py @@ -97,7 +97,6 @@ def __init__( Raises: ImportError: if indra is not installed - ValueError: if incorrect type is specified for `dataset_or_path` """ self.dataset = dataset self.path = convert_pathlib_to_string_if_needed(path) diff --git a/deeplake/core/vectorstore/deep_memory/test_deepmemory.py b/deeplake/core/vectorstore/deep_memory/test_deepmemory.py index 9778b0f413..8a0d7189ab 100644 --- a/deeplake/core/vectorstore/deep_memory/test_deepmemory.py +++ b/deeplake/core/vectorstore/deep_memory/test_deepmemory.py @@ -584,9 +584,7 @@ def test_deepmemory_search_on_local_datasets( @pytest.mark.slow @requires_libdeeplake def test_unsupported_deepmemory_users(local_ds): - dm = DeepMemory( - dataset_or_path=local_ds, logger=logger, embedding_function=DummyEmbedder - ) + dm = DeepMemory(path=local_ds, logger=logger, embedding_function=DummyEmbedder) with pytest.raises(DeepMemoryWaitingListError): dm.train( queries=[], diff --git a/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py b/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py index d93e09242e..bfea83783f 100644 --- a/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py +++ b/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py @@ -40,7 +40,7 @@ def run( return_tql: bool, distance_metric: str, k: int, - query_embedding: Union[np.ndarray, List[float]], + query_embedding: np.ndarray, embedding_tensor: str, tql_filter: str, return_tensors: List[str], From 33ee33ef4e4d2670429988a6b9f7a39f63fd91f9 Mon Sep 17 00:00:00 2001 From: AdkSarsen Date: Wed, 6 Dec 2023 09:44:28 +0600 Subject: [PATCH 08/16] deleted vectorstools --- .../deep_memory/test_deepmemory.py | 7 +- .../core/vectorstore/dev_helpers/__init__.py | 0 .../dev_helpers/vectorstore_tools.py | 106 ------------------ .../core/vectorstore/vector_search/utils.py | 6 +- 4 files changed, 10 insertions(+), 109 deletions(-) delete mode 100644 deeplake/core/vectorstore/dev_helpers/__init__.py delete mode 100644 deeplake/core/vectorstore/dev_helpers/vectorstore_tools.py diff --git a/deeplake/core/vectorstore/deep_memory/test_deepmemory.py b/deeplake/core/vectorstore/deep_memory/test_deepmemory.py index 8a0d7189ab..1fa6fb1f0a 100644 --- a/deeplake/core/vectorstore/deep_memory/test_deepmemory.py +++ b/deeplake/core/vectorstore/deep_memory/test_deepmemory.py @@ -584,7 +584,12 @@ def test_deepmemory_search_on_local_datasets( @pytest.mark.slow @requires_libdeeplake def test_unsupported_deepmemory_users(local_ds): - dm = DeepMemory(path=local_ds, logger=logger, embedding_function=DummyEmbedder) + dm = DeepMemory( + path=local_ds, + dataset=None, + 
logger=logger, + embedding_function=DummyEmbedder, + ) with pytest.raises(DeepMemoryWaitingListError): dm.train( queries=[], diff --git a/deeplake/core/vectorstore/dev_helpers/__init__.py b/deeplake/core/vectorstore/dev_helpers/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/deeplake/core/vectorstore/dev_helpers/vectorstore_tools.py b/deeplake/core/vectorstore/dev_helpers/vectorstore_tools.py deleted file mode 100644 index 11a5cd8896..0000000000 --- a/deeplake/core/vectorstore/dev_helpers/vectorstore_tools.py +++ /dev/null @@ -1,106 +0,0 @@ -import deeplake -from typing import Dict, List, Optional, Tuple -from deeplake.core.vectorstore.vector_search.utils import create_data - - -def create_and_load_vectorstore(): - from deeplake import VectorStore - - db = VectorStore( - path="local_path", - overwrite=True, - ) - - texts, embeddings, ids, metadata, _ = create_data( - number_of_data=100, embedding_dim=1536, metadata_key="abc" - ) - db.add( - text=texts, - embedding=embeddings, - id=ids, - metadata=metadata, - ) - return db - - -def train_deepmemory_model( - dataset_name: str = f"hub://activeloop-test/scifact", - corpus: Optional[Dict] = None, - relevenace: Optional[List[List[Tuple[str, int]]]] = None, - queries: Optional[List[str]] = None, - token: Optional[str] = None, - overwrite: bool = False, - enviroment: str = "staging", -): - from deeplake import VectorStore - from langchain.embeddings.openai import OpenAIEmbeddings # type: ignore - - if enviroment == "staging": - deeplake.client.config.USE_STAGING_ENVIRONMENT = True - elif enviroment == "dev": - deeplake.client.config.USE_DEV_ENVIRONMENT = True - - embedding_function = OpenAIEmbeddings() - if corpus is None: - if ( - not deeplake.exists(dataset_name, token=token, creds={}) - or overwrite == True - ): - deeplake.deepcopy( - f"hub://activeloop-test/deepmemory_test_corpus", - dataset_name, - token=token, - overwrite=True, - runtime={"tensor_db": True}, - ) - - db = VectorStore( - dataset_name, - token=token, - embedding_function=embedding_function, - ) - else: - db = VectorStore( - dataset_name, - token=token, - overwrite=True, - embedding_function=embedding_function, - ) - db.add(**corpus) - - query_vs = None - - if relevenace is None: - query_vs = VectorStore( - path=f"hub://activeloop-test/deepmemory_test_queries", - runtime={"tensor_db": True}, - token=token, - ) - relevance = query_vs.dataset.metadata.data()["value"] - - if queries is None: - if not query_vs: - query_vs = VectorStore( - path=f"hub://activeloop-test/deepmemory_test_queries", - runtime={"tensor_db": True}, - token=token, - ) - queries = query_vs.dataset.text.data()["value"] - - db.deep_memory.train( - relevance=relevance, - queries=queries, - ) - return db - - -def set_backend(backend="prod"): - if backend == "staging": - deeplake.client.config.USE_STAGING_ENVIRONMENT = True - deeplake.client.config.USE_DEV_ENVIRONMENT = False - elif backend == "dev": - deeplake.client.config.USE_DEV_ENVIRONMENT = True - deeplake.client.config.USE_STAGING_ENVIRONMENT = False - else: - deeplake.client.config.USE_DEV_ENVIRONMENT = False - deeplake.client.config.USE_STAGING_ENVIRONMENT = False diff --git a/deeplake/core/vectorstore/vector_search/utils.py b/deeplake/core/vectorstore/vector_search/utils.py index 7447cb87d2..6927a33571 100644 --- a/deeplake/core/vectorstore/vector_search/utils.py +++ b/deeplake/core/vectorstore/vector_search/utils.py @@ -190,11 +190,13 @@ def generate_json(value, key): return {key: value} -def create_data(number_of_data, 
embedding_dim=100, metadata_key="abc"): +def create_data( + number_of_data, embedding_dim=100, metadata_key="abc", string_length=1000 +): embeddings = np.random.uniform( low=-10, high=10, size=(number_of_data, embedding_dim) ).astype(np.float32) - texts = [generate_random_string(1000) for i in range(number_of_data)] + texts = [generate_random_string(string_length) for i in range(number_of_data)] ids = [f"{i}" for i in range(number_of_data)] metadata = [generate_json(i, metadata_key) for i in range(number_of_data)] images = ["deeplake/tests/dummy_data/images/car.jpg" for i in range(number_of_data)] From a35ab3ef29472bf6531fb4d342c5b5873e4b4e8d Mon Sep 17 00:00:00 2001 From: AdkSarsen Date: Wed, 6 Dec 2023 09:56:53 +0600 Subject: [PATCH 09/16] removing unecessary parameters --- .../dataset_handlers/client_side_dataset_handler.py | 1 - deeplake/core/vectorstore/deeplake_vectorstore.py | 1 - .../core/vectorstore/vector_search/indra/search_algorithm.py | 1 - deeplake/core/vectorstore/vector_search/indra/vector_search.py | 1 - deeplake/core/vectorstore/vector_search/vector_search.py | 3 --- 5 files changed, 7 deletions(-) diff --git a/deeplake/core/vectorstore/dataset_handlers/client_side_dataset_handler.py b/deeplake/core/vectorstore/dataset_handlers/client_side_dataset_handler.py index e248c5b35d..d0e881c920 100644 --- a/deeplake/core/vectorstore/dataset_handlers/client_side_dataset_handler.py +++ b/deeplake/core/vectorstore/dataset_handlers/client_side_dataset_handler.py @@ -245,7 +245,6 @@ def search( embedding_tensor=embedding_tensor, return_tensors=return_tensors, return_view=return_view, - deep_memory=deep_memory, token=self.token, org_id=self.org_id, ) diff --git a/deeplake/core/vectorstore/deeplake_vectorstore.py b/deeplake/core/vectorstore/deeplake_vectorstore.py index 1c511a1f0d..08f9554928 100644 --- a/deeplake/core/vectorstore/deeplake_vectorstore.py +++ b/deeplake/core/vectorstore/deeplake_vectorstore.py @@ -322,7 +322,6 @@ def search( embedding_tensor=embedding_tensor, return_tensors=return_tensors, return_view=return_view, - deep_memory=deep_memory, return_tql=return_tql, ) diff --git a/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py b/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py index bfea83783f..8fe74c5798 100644 --- a/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py +++ b/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py @@ -172,7 +172,6 @@ def search( runtime: dict, return_tensors: List[str], return_view: bool = False, - deep_memory: bool = False, token: Optional[str] = None, org_id: Optional[str] = None, return_tql: bool = False, diff --git a/deeplake/core/vectorstore/vector_search/indra/vector_search.py b/deeplake/core/vectorstore/vector_search/indra/vector_search.py index 6b94fcb599..ba67dafff2 100644 --- a/deeplake/core/vectorstore/vector_search/indra/vector_search.py +++ b/deeplake/core/vectorstore/vector_search/indra/vector_search.py @@ -56,7 +56,6 @@ def vector_search( runtime=runtime, return_tensors=return_tensors, return_view=return_view, - deep_memory=deep_memory, token=token, org_id=org_id, return_tql=return_tql, diff --git a/deeplake/core/vectorstore/vector_search/vector_search.py b/deeplake/core/vectorstore/vector_search/vector_search.py index bf7a3fdc38..4e2b079676 100644 --- a/deeplake/core/vectorstore/vector_search/vector_search.py +++ b/deeplake/core/vectorstore/vector_search/vector_search.py @@ -28,7 +28,6 @@ def search( query_embedding: Optional[Union[List[float], np.ndarray]] = None, 
embedding_tensor: str = "embedding", return_view: bool = False, - deep_memory: bool = False, token: Optional[str] = None, org_id: Optional[str] = None, return_tql: bool = False, @@ -51,7 +50,6 @@ def search( return_tensors (Optional[List[str]], optional): List of tensors to return data for. embedding_tensor (str): name of the tensor in the dataset with `htype="embedding"`. Defaults to "embedding". return_view (Bool): Return a Deep Lake dataset view that satisfied the search parameters, instead of a dictinary with data. Defaults to False. - deep_memory (bool): Use DeepMemory for the search. Defaults to False. token (Optional[str], optional): Token used for authentication. Defaults to None. org_id (Optional[str], optional): Organization ID, is needed only for local datasets. Defaults to None. return_tql (bool): Return TQL query used for the search. Defaults to False. @@ -68,7 +66,6 @@ def search( k=k, return_tensors=return_tensors, return_view=return_view, - deep_memory=deep_memory, token=token, org_id=org_id, return_tql=return_tql, From 62e62ced84e03619bcbbff6102e1bbb330f0b2e0 Mon Sep 17 00:00:00 2001 From: AdkSarsen Date: Wed, 6 Dec 2023 10:08:11 +0600 Subject: [PATCH 10/16] removing deepmemory parameter --- deeplake/core/vectorstore/deeplake_vectorstore.py | 1 + deeplake/core/vectorstore/test_deeplake_vectorstore.py | 10 ++++++++++ .../vectorstore/vector_search/indra/vector_search.py | 1 - 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/deeplake/core/vectorstore/deeplake_vectorstore.py b/deeplake/core/vectorstore/deeplake_vectorstore.py index 08f9554928..61cd5a7ecc 100644 --- a/deeplake/core/vectorstore/deeplake_vectorstore.py +++ b/deeplake/core/vectorstore/deeplake_vectorstore.py @@ -323,6 +323,7 @@ def search( return_tensors=return_tensors, return_view=return_view, return_tql=return_tql, + deep_memory=deep_memory, ) def delete( diff --git a/deeplake/core/vectorstore/test_deeplake_vectorstore.py b/deeplake/core/vectorstore/test_deeplake_vectorstore.py index 66b56cb8a6..0417902a7c 100644 --- a/deeplake/core/vectorstore/test_deeplake_vectorstore.py +++ b/deeplake/core/vectorstore/test_deeplake_vectorstore.py @@ -2961,3 +2961,13 @@ def test_db_search_should_instantiate_SearchIndra_class( # Assert that SearchIndra was instantiated mock_search_indra.assert_called() + + +def returning_tql_for_exec_option_python_should_throw_exception(local_path): + db = VectorStore( + path=local_path, + ) + db.add(text=texts, embedding=embeddings, id=ids, metadata=metadatas) + + with pytest.raises(ValueError): + db.search(embedding=query_embedding, return_tql=True) diff --git a/deeplake/core/vectorstore/vector_search/indra/vector_search.py b/deeplake/core/vectorstore/vector_search/indra/vector_search.py index ba67dafff2..96992567c6 100644 --- a/deeplake/core/vectorstore/vector_search/indra/vector_search.py +++ b/deeplake/core/vectorstore/vector_search/indra/vector_search.py @@ -18,7 +18,6 @@ def vector_search( k, return_tensors, return_view, - deep_memory, token, org_id, return_tql, From e2dd88b90053588c2ae2053006b33169babc6564 Mon Sep 17 00:00:00 2001 From: AdkSarsen Date: Wed, 6 Dec 2023 11:49:20 +0600 Subject: [PATCH 11/16] increasing code coverage --- .../client_side_dataset_handler.py | 2 +- .../vectorstore/deep_memory/deep_memory.py | 2 +- .../deep_memory/test_deepmemory.py | 26 ++++++++++- .../vectorstore/test_deeplake_vectorstore.py | 25 ++++++++++- deeplake/tests/path_fixtures.py | 44 ++++++++++++++++++- 5 files changed, 93 insertions(+), 6 deletions(-) diff --git 
a/deeplake/core/vectorstore/dataset_handlers/client_side_dataset_handler.py b/deeplake/core/vectorstore/dataset_handlers/client_side_dataset_handler.py index d0e881c920..6445db99b1 100644 --- a/deeplake/core/vectorstore/dataset_handlers/client_side_dataset_handler.py +++ b/deeplake/core/vectorstore/dataset_handlers/client_side_dataset_handler.py @@ -211,7 +211,6 @@ def search( exec_option=exec_option, embedding_tensor=embedding_tensor, return_tensors=return_tensors, - return_tql=return_tql, ) return_tensors = utils.parse_return_tensors( @@ -245,6 +244,7 @@ def search( embedding_tensor=embedding_tensor, return_tensors=return_tensors, return_view=return_view, + return_tql=return_tql, token=self.token, org_id=self.org_id, ) diff --git a/deeplake/core/vectorstore/deep_memory/deep_memory.py b/deeplake/core/vectorstore/deep_memory/deep_memory.py index 56aed4df48..62c653fed8 100644 --- a/deeplake/core/vectorstore/deep_memory/deep_memory.py +++ b/deeplake/core/vectorstore/deep_memory/deep_memory.py @@ -174,7 +174,7 @@ def train( ) if embedding_function is None and self.embedding_function is not None: - embedding_function = self.embedding_function.embed_documents + embedding_function = self.embedding_function runtime = None if get_path_type(corpus_path) == "hub": diff --git a/deeplake/core/vectorstore/deep_memory/test_deepmemory.py b/deeplake/core/vectorstore/deep_memory/test_deepmemory.py index 1fa6fb1f0a..db86f7a8d0 100644 --- a/deeplake/core/vectorstore/deep_memory/test_deepmemory.py +++ b/deeplake/core/vectorstore/deep_memory/test_deepmemory.py @@ -41,9 +41,9 @@ def test_deepmemory_init(hub_cloud_path, hub_cloud_dev_token): assert db.deep_memory is not None -def embedding_fn(texts): +def embedding_fn(texts, embedding_dim=1536): return [ - np.random.uniform(low=-10, high=10, size=(1536)).astype(np.float32) + np.random.uniform(low=-10, high=10, size=(embedding_dim)).astype(np.float32) for _ in range(len(texts)) ] @@ -727,3 +727,25 @@ def test_deeplake_search_should_not_contain_correct_answer( output = db.search(embedding=query_embedding) assert len(output["id"]) == 4 assert relevance not in output["id"] + + +@pytest.mark.slow +@pytest.mark.flaky(reruns=3) +@pytest.mark.skipif(sys.platform == "win32", reason="Does not run on Windows") +def test_deepmemory_train_with_embedding_function_specified_in_constructor_should_not_throw_any_exception( + deepmemory_small_dataset_copy, + hub_cloud_dev_token, +): + corpus, queries, relevances, _ = deepmemory_small_dataset_copy + + db = VectorStore( + path=corpus, + runtime={"tensor_db": True}, + token=hub_cloud_dev_token, + embedding_function=embedding_fn, + ) + + job_id = db.deep_memory.train( + queries=queries, + relevance=relevances, + ) diff --git a/deeplake/core/vectorstore/test_deeplake_vectorstore.py b/deeplake/core/vectorstore/test_deeplake_vectorstore.py index 0417902a7c..e5beef7914 100644 --- a/deeplake/core/vectorstore/test_deeplake_vectorstore.py +++ b/deeplake/core/vectorstore/test_deeplake_vectorstore.py @@ -2969,5 +2969,28 @@ def returning_tql_for_exec_option_python_should_throw_exception(local_path): ) db.add(text=texts, embedding=embeddings, id=ids, metadata=metadatas) - with pytest.raises(ValueError): + with pytest.raises(NotImplementedError): db.search(embedding=query_embedding, return_tql=True) + + +def test_returning_tql_for_exec_option_compute_engine_should_return_correct_tql( + local_path, +): + db = VectorStore( + path=local_path, + ) + + texts, embeddings, ids, metadatas, _ = utils.create_data( + number_of_data=10, embedding_dim=3 + ) + 
+ db.add(text=texts, embedding=embeddings, id=ids, metadata=metadatas) + + query_embedding = np.zeros(3, dtype=np.float32) + output = db.search(embedding=query_embedding, return_tql=True) + + assert output["tql"] == ( + "select text, metadata, id, score from " + "(select *, COSINE_SIMILARITY(embedding, ARRAY[0.0, 0.0, 0.0]) as score " + "order by COSINE_SIMILARITY(embedding, ARRAY[0.0, 0.0, 0.0]) DESC limit 4)" + ) diff --git a/deeplake/tests/path_fixtures.py b/deeplake/tests/path_fixtures.py index 179e4724b3..e192175c90 100644 --- a/deeplake/tests/path_fixtures.py +++ b/deeplake/tests/path_fixtures.py @@ -798,7 +798,7 @@ def precomputed_jobs_list(): def local_dmv2_dataset(request, hub_cloud_dev_token): dmv2_path = f"hub://{HUB_CLOUD_DEV_USERNAME}/dmv2" - local_cache_path = ".deeplake_tests_cache/" + local_cache_path = ".deepmemory_tests_cache/" if not os.path.exists(local_cache_path): os.mkdir(local_cache_path) @@ -822,3 +822,45 @@ def local_dmv2_dataset(request, hub_cloud_dev_token): yield corpus delete_if_exists(corpus, hub_cloud_dev_token) + + +@pytest.fixture +def deepmemory_small_dataset_copy(request, hub_cloud_dev_token): + dm_path = f"hub://{HUB_CLOUD_DEV_USERNAME}/tiny_dm_dataset" + queries_path = f"hub://{HUB_CLOUD_DEV_USERNAME}/queries_vs" + + local_cache_path = ".deepmemory_tests_cache/" + if not os.path.exists(local_cache_path): + os.mkdir(local_cache_path) + + dataset_cache_path = local_cache_path + "tiny_dm_queries" + if not os.path.exists(dataset_cache_path): + deeplake.deepcopy( + queries_path, + dataset_cache_path, + token=hub_cloud_dev_token, + overwrite=True, + ) + + corpus = _get_storage_path(request, HUB_CLOUD) + query_vs = VectorStore( + path=dataset_cache_path, + ) + queries = query_vs.dataset.text.data()["value"] + relevance = query_vs.dataset.metadata.data()["value"] + relevance = [rel["relevance"] for rel in relevance] + + deeplake.deepcopy( + dm_path, + corpus, + token=hub_cloud_dev_token, + overwrite=True, + runtime={"tensor_db": True}, + ) + + queries_path = corpus + "_eval_queries" + + yield corpus, queries, relevance, queries_path + + delete_if_exists(corpus, hub_cloud_dev_token) + delete_if_exists(queries_path, hub_cloud_dev_token) From bdd08eabc1bc69b7ee721857b6895454875ab58a Mon Sep 17 00:00:00 2001 From: AdkSarsen Date: Wed, 6 Dec 2023 11:56:54 +0600 Subject: [PATCH 12/16] increasing code coverage --- .../vectorstore/deep_memory/deep_memory.py | 6 ++-- .../deep_memory/test_deepmemory.py | 33 +++++++++++++++++++ 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/deeplake/core/vectorstore/deep_memory/deep_memory.py b/deeplake/core/vectorstore/deep_memory/deep_memory.py index 62c653fed8..4d9370a727 100644 --- a/deeplake/core/vectorstore/deep_memory/deep_memory.py +++ b/deeplake/core/vectorstore/deep_memory/deep_memory.py @@ -480,10 +480,8 @@ def evaluate( if embedding is not None: query_embs = embedding else: - if self.embedding_function is not None: - embedding_function = ( - embedding_function or self.embedding_function.embed_documents - ) + if self.embedding_function is not None and embedding_function is None: + embedding_function = self.embedding_function if embedding_function is None: raise ValueError( diff --git a/deeplake/core/vectorstore/deep_memory/test_deepmemory.py b/deeplake/core/vectorstore/deep_memory/test_deepmemory.py index db86f7a8d0..25330a824a 100644 --- a/deeplake/core/vectorstore/deep_memory/test_deepmemory.py +++ b/deeplake/core/vectorstore/deep_memory/test_deepmemory.py @@ -749,3 +749,36 @@ def 
test_deepmemory_train_with_embedding_function_specified_in_constructor_shoul queries=queries, relevance=relevances, ) + + +@pytest.mark.slow +@pytest.mark.flaky(reruns=3) +@pytest.mark.skipif(sys.platform == "win32", reason="Does not run on Windows") +def test_deepmemory_evaluate_with_embedding_function_specified_in_constructor_should_not_throw_any_exception( + corpus_query_pair_path, + hub_cloud_dev_token, +): + corpus, queries = corpus_query_pair_path + + db = VectorStore( + path=corpus, + runtime={"tensor_db": True}, + token=hub_cloud_dev_token, + embedding_function=embedding_fn, + ) + + queries_vs = VectorStore( + path=queries, + runtime={"tensor_db": True}, + token=hub_cloud_dev_token, + embedding_function=embedding_fn, + ) + + queries = queries_vs.dataset[:10].text.data()["value"] + relevance = queries_vs.dataset[:10].metadata.data()["value"] + relevance = [rel["relevance"] for rel in relevance] + + recall = db.deep_memory.evaluate( + queries=queries, + relevance=relevance, + ) From 9019a9b9734e65969f49a4f3387ccec4e3dd3cf5 Mon Sep 17 00:00:00 2001 From: AdkSarsen Date: Wed, 6 Dec 2023 17:09:08 +0600 Subject: [PATCH 13/16] fixing failing tests --- deeplake/core/vectorstore/vector_search/python/vector_search.py | 1 - 1 file changed, 1 deletion(-) diff --git a/deeplake/core/vectorstore/vector_search/python/vector_search.py b/deeplake/core/vectorstore/vector_search/python/vector_search.py index cfbf186b48..b1ecf1885b 100644 --- a/deeplake/core/vectorstore/vector_search/python/vector_search.py +++ b/deeplake/core/vectorstore/vector_search/python/vector_search.py @@ -18,7 +18,6 @@ def vector_search( k, return_tensors, return_view, - deep_memory, token, org_id, return_tql, From 46abb68dcf4a32507fcb0b17569f02b9460fd79e Mon Sep 17 00:00:00 2001 From: AdkSarsen Date: Wed, 6 Dec 2023 17:18:59 +0600 Subject: [PATCH 14/16] darglint fix --- .../core/vectorstore/vector_search/indra/search_algorithm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py b/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py index 8fe74c5798..35b77ffbae 100644 --- a/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py +++ b/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py @@ -189,7 +189,6 @@ def search( runtime (dict): Runtime parameters for the query. return_tensors (List[str]): List of tensors to return data for. return_view (bool): Return a Deep Lake dataset view that satisfied the search parameters, instead of a dictinary with data. Defaults to False. - deep_memory (bool): Use DeepMemory for the search. Defaults to False. token (Optional[str], optional): Token used for authentication. Defaults to None. org_id (Optional[str], optional): Organization ID, is needed only for local datasets. Defaults to None. return_tql (bool): Return TQL query used for the search. Defaults to False. 
From f68c2be9a2c811147df625c618a4a4a01454c9bd Mon Sep 17 00:00:00 2001 From: AdkSarsen Date: Wed, 6 Dec 2023 22:49:01 +0600 Subject: [PATCH 15/16] fixing failing tests --- deeplake/core/vectorstore/deep_memory/test_deepmemory.py | 2 +- deeplake/core/vectorstore/test_deeplake_vectorstore.py | 2 ++ .../core/vectorstore/vector_search/python/test_vector_search.py | 1 - 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/deeplake/core/vectorstore/deep_memory/test_deepmemory.py b/deeplake/core/vectorstore/deep_memory/test_deepmemory.py index 25330a824a..cda3232dce 100644 --- a/deeplake/core/vectorstore/deep_memory/test_deepmemory.py +++ b/deeplake/core/vectorstore/deep_memory/test_deepmemory.py @@ -433,7 +433,7 @@ def test_deepmemory_evaluate_with_embedding_func_in_init( path=corpus, runtime={"tensor_db": True}, token=hub_cloud_dev_token, - embedding_function=DummyEmbedder, + embedding_function=embedding_fn, ) recall = db.deep_memory.evaluate( queries=queries, diff --git a/deeplake/core/vectorstore/test_deeplake_vectorstore.py b/deeplake/core/vectorstore/test_deeplake_vectorstore.py index e5beef7914..5c25721af2 100644 --- a/deeplake/core/vectorstore/test_deeplake_vectorstore.py +++ b/deeplake/core/vectorstore/test_deeplake_vectorstore.py @@ -2975,9 +2975,11 @@ def returning_tql_for_exec_option_python_should_throw_exception(local_path): def test_returning_tql_for_exec_option_compute_engine_should_return_correct_tql( local_path, + hub_cloud_dev_token, ): db = VectorStore( path=local_path, + token=hub_cloud_dev_token, ) texts, embeddings, ids, metadatas, _ = utils.create_data( diff --git a/deeplake/core/vectorstore/vector_search/python/test_vector_search.py b/deeplake/core/vectorstore/vector_search/python/test_vector_search.py index 2c8f1ac4ba..7ca352e5ae 100644 --- a/deeplake/core/vectorstore/vector_search/python/test_vector_search.py +++ b/deeplake/core/vectorstore/vector_search/python/test_vector_search.py @@ -26,7 +26,6 @@ def test_vector_search(): k=10, return_tensors=[], return_view=False, - deep_memory=False, token=None, org_id=None, return_tql=False, From 056beade1da2e9799f1c2ab7f6900b30a1458538 Mon Sep 17 00:00:00 2001 From: AdkSarsen Date: Thu, 7 Dec 2023 00:17:09 +0600 Subject: [PATCH 16/16] fixing python vector search error --- .../vectorstore/vector_search/python/test_vector_search.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/deeplake/core/vectorstore/vector_search/python/test_vector_search.py b/deeplake/core/vectorstore/vector_search/python/test_vector_search.py index 7ca352e5ae..848ca6a9a5 100644 --- a/deeplake/core/vectorstore/vector_search/python/test_vector_search.py +++ b/deeplake/core/vectorstore/vector_search/python/test_vector_search.py @@ -46,7 +46,6 @@ def test_vector_search(): k=10, return_tensors=[], return_view=False, - deep_memory=False, token=None, org_id=None, return_tql=False, @@ -64,7 +63,6 @@ def test_vector_search(): k=10, return_tensors=[], return_view=True, - deep_memory=False, token=None, org_id=None, return_tql=False, @@ -86,7 +84,6 @@ def test_vector_search(): k=10, return_tensors=[], return_view=True, - deep_memory=False, token=None, org_id=None, return_tql=False,
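
A few usage sketches for the behaviours this series touches follow. First, the reworked update_embedding tests from patch 03 pin down which row selectors may be combined. Below is a minimal sketch of that contract, assuming a local store and a toy embedding function; the path is a placeholder, not part of the patch.

    import numpy as np
    import pytest
    from deeplake import VectorStore

    def embedding_fn(texts):
        # Toy embedder: one random 1536-dim vector per input text.
        return [np.random.rand(1536).astype(np.float32) for _ in texts]

    db = VectorStore(path="./tmp_vectorstore", overwrite=True)
    db.add(
        text=["first", "second"],
        embedding=embedding_fn(["first", "second"]),
        id=["0", "1"],
        metadata=[{"abc": 0}, {"abc": 1}],
    )

    # ids and row_ids are mutually exclusive selectors, and so are query and
    # filter; combining either pair raises ValueError before any update runs.
    with pytest.raises(ValueError):
        db.update_embedding(ids=["0"], row_ids=[0], embedding_function=embedding_fn)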
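
The duplicated getattr(request, "param", True) blocks dropped from path_fixtures.py in the same patch were unreachable after the first return. For readers unfamiliar with the pattern, here is a minimal sketch of how such a fixture is driven through indirect parametrization; the fixture and test names are illustrative only.

    import pytest

    @pytest.fixture
    def vector_store_row_ids(request):
        # request.param is set only when a test parametrizes the fixture
        # indirectly; otherwise the default (True) applies and ids are returned.
        if getattr(request, "param", True):
            return [i for i in range(5)]

    @pytest.mark.parametrize("vector_store_row_ids", [True], indirect=True)
    def test_with_row_ids(vector_store_row_ids):
        assert vector_store_row_ids == [0, 1, 2, 3, 4]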
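
The return_tql flag threaded through the dataset handlers and search paths in patches 04-15 lets a caller inspect the TQL that a compute-engine search generates. A sketch of the intended call pattern follows; the path and token are placeholders, it assumes libdeeplake (the compute engine) is available, and the plain-Python execution path is expected to reject the flag (the updated test expects NotImplementedError).

    import numpy as np
    from deeplake import VectorStore

    db = VectorStore(path="./tmp_vectorstore", overwrite=True, token="...")
    texts = ["first", "second", "third"]
    embeddings = [np.zeros(3, dtype=np.float32) + i for i in range(3)]
    db.add(
        text=texts,
        embedding=embeddings,
        id=["0", "1", "2"],
        metadata=[{"abc": i} for i in range(3)],
    )

    query_embedding = np.zeros(3, dtype=np.float32)

    # Compute-engine search: the result dict carries the generated query under
    # the "tql" key, e.g. "select text, metadata, id, score from (select *,
    # COSINE_SIMILARITY(embedding, ARRAY[...]) as score ... limit 4)".
    result = db.search(
        embedding=query_embedding, return_tql=True, exec_option="compute_engine"
    )
    print(result["tql"])

    # The plain-Python execution path has no TQL to return, so requesting it
    # should raise; the updated tests expect NotImplementedError here.
    try:
        db.search(embedding=query_embedding, return_tql=True, exec_option="python")
    except NotImplementedError:
        pass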
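
Patch 08 gives the create_data test helper a string_length parameter in place of the hard-coded 1000-character texts. A short sketch of the helper as the tests consume it:

    from deeplake.core.vectorstore.vector_search import utils

    # Five synthetic rows: 3-dim float32 embeddings, 20-character texts,
    # string ids "0".."4", metadata of the form {"abc": i}, and image paths.
    texts, embeddings, ids, metadata, images = utils.create_data(
        number_of_data=5, embedding_dim=3, metadata_key="abc", string_length=20
    )

    assert len(texts) == 5
    assert len(embeddings) == 5
    assert ids == ["0", "1", "2", "3", "4"]
    assert metadata[0] == {"abc": 0}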
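
Finally, patches 11-12 make DeepMemory.train and evaluate use the embedding function stored on the vector store directly as a callable, instead of expecting a LangChain-style object with .embed_documents. A sketch of the flow the new tests exercise; the hub path, token and relevance labels are placeholders, and Deep Memory requires a managed (tensor_db) dataset.

    import numpy as np
    from deeplake import VectorStore

    def embedding_fn(texts, embedding_dim=1536):
        # Plain callable; no .embed_documents wrapper is needed any more.
        return [
            np.random.uniform(low=-10, high=10, size=(embedding_dim,)).astype(np.float32)
            for _ in texts
        ]

    db = VectorStore(
        path="hub://my-org/my-corpus",   # placeholder dataset path
        runtime={"tensor_db": True},
        token="...",                     # placeholder token
        embedding_function=embedding_fn,
    )

    queries = ["what is deep memory?", "how is it trained?"]
    relevance = [[("doc-0", 1)], [("doc-1", 1)]]  # (corpus id, relevance) pairs

    # Because the constructor already holds embedding_fn, neither call needs an
    # explicit embedding_function argument; both fall back to the stored callable.
    job_id = db.deep_memory.train(queries=queries, relevance=relevance)
    recall = db.deep_memory.evaluate(queries=queries, relevance=relevance)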