Merge branch 'main' into deepmemory_ux_improvements
AdkSarsen committed Dec 5, 2023
2 parents afd1264 + 432d71d commit 59a4fac
Showing 14 changed files with 148 additions and 68 deletions.
2 changes: 1 addition & 1 deletion deeplake/__init__.py
@@ -87,7 +87,7 @@
]


__version__ = "3.8.9"
__version__ = "3.8.10"
warn_if_update_required(__version__)
__encoded_version__ = np.array(__version__)
config = {"s3": Config(max_pool_connections=50, connect_timeout=300, read_timeout=300)}
16 changes: 9 additions & 7 deletions deeplake/core/dataset/dataset.py
@@ -1355,7 +1355,7 @@ def __setattr__(self, name: str, value):

def __iter__(self):
dataset_read(self)
for i in range(len(self)):
for i in range(self.__len__(warn=False)):
yield self.__getitem__(
i, is_iteration=not isinstance(self.index.values[0], list)
)
@@ -2150,7 +2150,9 @@ def pytorch(
)

if progressbar:
dataloader = tqdm(dataloader, desc=self.path, total=len(self) // batch_size)
dataloader = tqdm(
dataloader, desc=self.path, total=self.__len__(warn=False) // batch_size
)
dataset_read(self)
return dataloader

@@ -3054,7 +3056,6 @@ def _append_or_extend(
"""
tensors = self.tensors
new_row_ids = list(range(len(self), len(self) + len(sample)))
if isinstance(sample, Dataset):
sample = sample.tensors
if not isinstance(sample, dict):
@@ -3177,7 +3178,8 @@ def extend(
raise ValueError(
f"Incoming samples are not of equal lengths. Incoming sample sizes: {sizes}"
)
new_row_ids = list(range(len(self), len(self) + n))
len_ds = self.__len__(warn=False)
new_row_ids = list(range(len_ds, len_ds + n))
[f() for f in list(self._update_hooks.values())]
if extend:
if ignore_errors:
@@ -3247,7 +3249,7 @@ def append(
>>> ds.append({"data": [1, 2, 3, 4], "labels":[0, 1, 2, 3]})
"""
new_row_ids = [len(self)]
new_row_ids = [self.__len__(warn=False)]
self._append_or_extend(
sample,
extend=False,
@@ -3344,7 +3346,7 @@ def get_sample_from_engine(
index_maintenance.index_operation_dataset(
self,
dml_type=_INDEX_OPERATION_MAPPING["UPDATE"],
rowids=list(self.index.values[0].indices(len(self))),
rowids=list(self.index.values[0].indices(self.__len__(warn=False))),
)
except Exception as e:
for k, v in saved.items():
Expand All @@ -3361,7 +3363,7 @@ def get_sample_from_engine(
index_maintenance.index_operation_dataset(
self,
dml_type=_INDEX_OPERATION_MAPPING["UPDATE"],
rowids=list(self.index.values[0].indices(len(self))),
rowids=list(self.index.values[0].indices(self.__len__(warn=False))),
)
raise e
finally:
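The recurring edit in this file replaces user-facing len(self) with self.__len__(warn=False) at internal call sites. Below is a minimal sketch of the pattern, assuming (as the hunks above suggest) that Dataset.__len__ accepts a warn flag that silences the unequal-tensor-length warning; TinyDataset and its fields are hypothetical stand-ins, not deeplake API.

import logging

logger = logging.getLogger(__name__)


class TinyDataset:
    # Hypothetical stand-in for deeplake's Dataset, illustrating only the
    # __len__(warn=False) pattern applied throughout this diff.
    def __init__(self, tensors):
        self.tensors = tensors  # e.g. {"images": [...], "labels": [...]}

    def __len__(self, warn: bool = True):
        lengths = [len(t) for t in self.tensors.values()]
        if warn and len(set(lengths)) > 1:
            logger.warning("Tensors have unequal lengths; reporting the minimum.")
        return min(lengths, default=0)

    def __iter__(self):
        # Internal call site: warn=False keeps iteration from re-emitting the
        # warning on every pass; a user-facing len(ds) still warns.
        for i in range(self.__len__(warn=False)):
            yield {name: t[i] for name, t in self.tensors.items()}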
2 changes: 1 addition & 1 deletion deeplake/core/dataset/deeplake_query_dataset.py
@@ -291,7 +291,7 @@ def sample_indices(self):
return t.indra_tensor.indexes
except RuntimeError:
pass
return range(self.num_samples)
return range(len(self))

def _tensors(
self, include_hidden: bool = True, include_disabled=True
9 changes: 8 additions & 1 deletion deeplake/core/dataset/deeplake_query_tensor.py
@@ -171,7 +171,14 @@ def shape_interval(self):

@property
def ndim(self):
return len(self.shape)
ndim = len(self.indra_tensor.min_shape) + 1
if self.is_sequence:
ndim += 1
if self.index:
for idx in self.index.values:
if not idx.subscriptable():
ndim -= 1
return ndim

@property
def meta(self):
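The rewritten ndim derives the rank from stored shape metadata instead of len(self.shape). A rough worked example of the same arithmetic follows, assuming min_shape is the per-sample shape (the +1 accounting for the leading row axis); all values are hypothetical.

# A non-sequence tensor whose per-sample min_shape is (512, 512, 3):
min_shape = (512, 512, 3)
is_sequence = False
integer_indices = 1  # e.g. tensor[5]; a non-subscriptable index drops an axis

ndim = len(min_shape) + 1  # per-sample axes plus the leading row axis -> 4
if is_sequence:
    ndim += 1              # sequences would add a step axis
ndim -= integer_indices    # each integer index collapses one axis
print(ndim)                # 3: a single 512x512x3 sample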
1 change: 0 additions & 1 deletion deeplake/core/tests/test_deeplake_indra_dataset.py
@@ -288,7 +288,6 @@ def test_query_tensors_polygon_htype_consistency(local_auth_ds_generator):
orig = ds.polygon[i].numpy()
new = view.polygon[i].numpy()

assert type(orig) == type(new)
for i, j in zip(orig, new):
assert np.all(i == j)

deeplake/core/vectorstore/dataset_handlers/client_side_dataset_handler.py
@@ -4,24 +4,19 @@

import numpy as np

import deeplake
from deeplake.client.utils import read_token
from deeplake.constants import (
DEFAULT_VECTORSTORE_DISTANCE_METRIC,
_INDEX_OPERATION_MAPPING,
)
from deeplake.core import index_maintenance
from deeplake.core.dataset import Dataset
from deeplake.core.vectorstore import utils
from deeplake.core.vectorstore.dataset_handlers.dataset_handler_base import DHBase
from deeplake.core.vectorstore.deep_memory.deep_memory import (
use_deep_memory,
DeepMemory,
)
from deeplake.core.vectorstore.vector_search import dataset as dataset_utils
from deeplake.core.vectorstore.vector_search import vector_search
from deeplake.util.bugout_reporter import feature_report_path
from deeplake.util.exceptions import DeepMemoryWaitingListError


class ClientSideDH(DHBase):
@@ -86,14 +81,6 @@ def __init__(
self.verbose = verbose
self.tensor_params = tensor_params
self.distance_metric_index = index_maintenance.index_operation_vectorstore(self)
self.deep_memory = DeepMemory(
dataset=self.dataset,
path=path,
token=self.token,
logger=self.logger,
embedding_function=self.embedding_function,
creds=self.creds,
)

def add(
self,
@@ -181,7 +168,6 @@ def search(
return_tensors: List[str],
return_view: bool,
deep_memory: bool,
return_tql: bool,
) -> Union[Dict, Dataset]:
feature_report_path(
path=self.bugout_reporting_path,
@@ -198,7 +184,6 @@
"embedding": True if embedding is not None else False,
"return_tensors": return_tensors,
"return_view": return_view,
"return_tql": return_tql,
},
token=self.token,
username=self.username,
@@ -213,9 +198,6 @@

exec_option = exec_option or self.exec_option

if deep_memory and not self.deep_memory:
raise DeepMemoryWaitingListError()

utils.parse_search_args(
embedding_data=embedding_data,
embedding_function=embedding_function,
@@ -264,7 +246,6 @@ def search(
deep_memory=deep_memory,
token=self.token,
org_id=self.org_id,
return_tql=return_tql,
)

def delete(
3 changes: 0 additions & 3 deletions deeplake/core/vectorstore/dataset_handlers/dataset_handler.py
@@ -1,9 +1,6 @@
from deeplake.core.vectorstore.dataset_handlers.client_side_dataset_handler import (
ClientSideDH,
)
from deeplake.core.vectorstore.dataset_handlers.managed_side_dataset_handler import (
ManagedSideDH,
)


def get_dataset_handler(*args, **kwargs):
deeplake/core/vectorstore/dataset_handlers/dataset_handler_base.py
@@ -7,13 +7,7 @@
import numpy as np

from deeplake.util.path import convert_pathlib_to_string_if_needed
from deeplake.api import dataset
from deeplake.core.dataset import Dataset
from deeplake.constants import (
DEFAULT_VECTORSTORE_TENSORS,
MAX_BYTES_PER_MINUTE,
TARGET_BYTE_SIZE,
)
from deeplake.client.utils import read_token
from deeplake.core.vectorstore import utils
from deeplake.util.bugout_reporter import (
@@ -152,7 +146,6 @@ def search(
return_tensors: Optional[List[str]] = None,
return_view: bool = False,
deep_memory: bool = False,
return_tql_query: bool = False,
) -> Union[Dict, Dataset]:
pass

27 changes: 18 additions & 9 deletions deeplake/core/vectorstore/deep_memory/deep_memory.py
@@ -10,6 +10,7 @@

import deeplake
from deeplake.util.exceptions import (
    DeepMemoryWaitingListError,
IncorrectRelevanceTypeError,
IncorrectQueriesTypeError,
@@ -20,6 +21,7 @@
DEFAULT_MEMORY_CACHE_SIZE,
DEFAULT_LOCAL_CACHE_SIZE,
    DEFAULT_DEEPMEMORY_DISTANCE_METRIC,
)
from deeplake.util.storage import get_storage_and_cache_chain
from deeplake.core.dataset import Dataset
@@ -32,6 +34,15 @@
from deeplake.util.path import get_path_type


def access_control(func):
def wrapper(self, *args, **kwargs):
if self.client is None:
raise DeepMemoryWaitingListError()
return func(self, *args, **kwargs)

return wrapper


def use_deep_memory(func):
def wrapper(self, *args, **kwargs):
use_deep_memory = kwargs.get("deep_memory")
@@ -45,15 +56,6 @@ def wrapper(self, *args, **kwargs):
return wrapper


def access_control(func):
def wrapper(self, *args, **kwargs):
if self.client is None:
raise DeepMemoryWaitingListError()
return func(self, *args, **kwargs)

return wrapper


class Relevance(BaseModel):
data: List[List[Tuple[str, int]]]

@@ -88,6 +90,7 @@ def __init__(
Args:
dataset (Dataset): deeplake dataset object or path.
path (Union[str, pathlib.Path]): Path to the dataset.
logger (logging.Logger): Logger object.
embedding_function (Optional[Any], optional): Embedding function class used to convert queries/documents to embeddings. Defaults to None.
token (Optional[str], optional): API token for the DeepMemory managed service. Defaults to None.
@@ -150,6 +153,8 @@ def train(
"""
        from deeplake.core.vectorstore.deeplake_vectorstore import VectorStore

self.logger.info("Starting DeepMemory training job")
feature_report_path(
path=self.path,
@@ -165,6 +170,7 @@

# TODO: Support for passing query_embeddings directly without embedding function
        corpus_path = self.path
queries_path = corpus_path + "_queries"

if embedding_function is None and self.embedding_function is None:
@@ -319,18 +325,21 @@ def list_jobs(self, debug=False):
token=self.token,
)
_, storage = get_storage_and_cache_chain(
            path=self.path,
db_engine={"tensor_db": True},
read_only=False,
creds=self.creds,
            token=self.token,
memory_cache_size=DEFAULT_MEMORY_CACHE_SIZE,
local_cache_size=DEFAULT_LOCAL_CACHE_SIZE,
)
loaded_dataset = DeepLakeCloudDataset(storage=storage)

response = self.client.list_jobs(
            dataset_path=self.path,
)

response_status_schema = JobResponseStatusSchema(response=response)
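access_control now sits at module level ahead of use_deep_memory and raises the waiting-list error whenever the managed-service client is missing. Here is a standalone sketch of that gating; the exception and class below are illustrative stand-ins, not the real deeplake objects.

class DeepMemoryWaitingListError(Exception):
    # Stand-in for deeplake.util.exceptions.DeepMemoryWaitingListError.
    pass


def access_control(func):
    # Same shape as the decorator above: refuse to run any DeepMemory
    # method until a managed-service client exists.
    def wrapper(self, *args, **kwargs):
        if self.client is None:
            raise DeepMemoryWaitingListError()
        return func(self, *args, **kwargs)

    return wrapper


class FakeDeepMemory:
    def __init__(self, client=None):
        self.client = client

    @access_control
    def train(self, queries):
        return f"training on {len(queries)} queries"


FakeDeepMemory(client=object()).train(["q"])  # runs
try:
    FakeDeepMemory(client=None).train(["q"])  # blocked: no client yet
except DeepMemoryWaitingListError:
    print("account has not cleared the Deep Memory waiting list")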
16 changes: 15 additions & 1 deletion deeplake/core/vectorstore/deeplake_vectorstore.py
@@ -8,12 +8,16 @@
from deeplake.core.dataset import Dataset
from deeplake.core.vectorstore.dataset_handlers import get_dataset_handler
from deeplake.core.vectorstore.deep_memory import DeepMemory
from deeplake.constants import (
DEFAULT_VECTORSTORE_TENSORS,
MAX_BYTES_PER_MINUTE,
TARGET_BYTE_SIZE,
)
from deeplake.util.bugout_reporter import feature_report_path
from deeplake.util.exceptions import DeepMemoryWaitingListError


logger = logging.getLogger(__name__)

@@ -128,7 +132,14 @@ def __init__(
**kwargs,
)

self.deep_memory = self.dataset_handler.deep_memory
self.deep_memory = DeepMemory(
dataset=self.dataset_handler.dataset,
path=self.dataset_handler.path,
token=self.dataset_handler.token,
logger=logger,
embedding_function=embedding_function,
creds=self.dataset_handler.creds,
)

def add(
self,
@@ -296,6 +307,9 @@ def search(
Returns:
Dict: Dictionary where keys are tensor names and values are the results of the search
"""
if deep_memory and not self.deep_memory:
raise DeepMemoryWaitingListError()

return self.dataset_handler.search(
embedding_data=embedding_data,
embedding_function=embedding_function,
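With this change the VectorStore, not the client-side dataset handler, constructs DeepMemory and performs the waiting-list check before delegating search. A condensed sketch of the new control flow, with stand-in names throughout:

class DeepMemoryWaitingListError(Exception):  # stand-in, as in the sketch above
    pass


class FakeVectorStore:
    def __init__(self, dataset_handler, deep_memory=None):
        self.dataset_handler = dataset_handler
        # Ownership moved here from ClientSideDH: the store holds the
        # DeepMemory object (None when access is unavailable).
        self.deep_memory = deep_memory

    def search(self, query, deep_memory=False):
        # The gate formerly inside ClientSideDH.search now runs here, so
        # every handler type shares a single check.
        if deep_memory and not self.deep_memory:
            raise DeepMemoryWaitingListError()
        return self.dataset_handler.search(query, deep_memory=deep_memory)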