Merge pull request #2457 from activeloopai/data_fix
[AL-2335] Data fix
levongh committed Jul 20, 2023
2 parents 388e04c + ba27861 commit dfb4a8f
Showing 5 changed files with 79 additions and 18 deletions.
28 changes: 22 additions & 6 deletions deeplake/core/dataset/deeplake_query_dataset.py
@@ -33,11 +33,19 @@


class DeepLakeQueryDataset(Dataset):
-    def __init__(self, deeplake_ds, indra_ds, group_index=None, enabled_tensors=None):
+    def __init__(
+        self,
+        deeplake_ds,
+        indra_ds,
+        group_index=None,
+        enabled_tensors=None,
+        index: Optional[Index] = None,
+    ):
self.deeplake_ds = deeplake_ds
self.indra_ds = indra_ds
self.group_index = group_index or deeplake_ds.group_index
self.enabled_tensors = enabled_tensors or deeplake_ds.enabled_tensors
+        self._index = index or self.deeplake_ds.index

@property
def meta(self):
@@ -63,7 +71,9 @@ def _get_tensor_from_root(self, fullpath):
except:
pass
indra_tensor = tensor
-                return DeepLakeQueryTensor(deeplake_tensor, indra_tensor)
+                return DeepLakeQueryTensor(
+                    deeplake_tensor, indra_tensor, index=self.index
+                )

def pytorch(
self,
@@ -113,6 +123,7 @@ def __getitem__(
ret = DeepLakeQueryDataset(
deeplake_ds=self.deeplake_ds,
indra_ds=self.indra_ds,
+                    index=self.index,
group_index=posixpath.join(self.group_index, item),
)
elif "/" in item:
@@ -144,6 +155,7 @@ def __getitem__(
deeplake_ds=self.deeplake_ds,
indra_ds=self.indra_ds,
enabled_tensors=enabled_tensors,
+                    index=self.index,
)
elif isinstance(item, tuple) and len(item) and isinstance(item[0], str):
ret = self
@@ -160,6 +172,7 @@ def __getitem__(
ret = DeepLakeQueryDataset(
deeplake_ds=self.deeplake_ds,
indra_ds=self.indra_ds[item],
+                index=self.index[item],
)
else:
raise InvalidKeyTypeError(item)
@@ -258,7 +271,7 @@ def no_view_dataset(self):

@property
def index(self):
-        return self.deeplake_ds.index
+        return self._index

def _tensors(
self, include_hidden: bool = True, include_disabled=True
@@ -274,10 +287,12 @@
for t in indra_tensors:
if t.name in original_keys:
original_tensors[t.name] = DeepLakeQueryTensor(
-                    original_tensors[t.name], t
+                    original_tensors[t.name], t, index=self.index
)
else:
-                original_tensors[t.name] = DeepLakeQueryTensor(None, t)
+                original_tensors[t.name] = DeepLakeQueryTensor(
+                    None, t, index=self.index
+                )
return original_tensors

def __str__(self):
@@ -305,7 +320,8 @@ def copy(self, *args, **kwargs):
)

def __del__(self):
-        self.indra_ds = None
+        """Leaving the implementation empty, as at the moment indra dataset deletion is taken care of elsewhere."""
+        pass

def random_split(self, lengths: Sequence[Union[int, float]]):
if math.isclose(sum(lengths), 1) and sum(lengths) <= 1:
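
The heart of this file's change: a DeepLakeQueryDataset view now carries its own Index (defaulting to the source dataset's) and composes it on every `__getitem__`, instead of always reporting `deeplake_ds.index`. Below is a minimal sketch of that pattern using hypothetical stand-in classes (`FakeIndex`, `QueryDatasetView`), not the real Deep Lake types:

```python
class FakeIndex:
    """Stand-in for deeplake.core.index.Index (hypothetical, simplified)."""

    def __init__(self, item=slice(None)):
        self.item = item

    def __getitem__(self, item):
        # The real Index composes entries; here we only record the latest item.
        return FakeIndex(item)

    def __repr__(self):
        return f"FakeIndex({self.item!r})"


class QueryDatasetView:
    """Mirrors the new DeepLakeQueryDataset behavior: the view stores an index
    at construction time and derives a new one on every __getitem__, instead of
    always returning the source dataset's index."""

    def __init__(self, source_index, index=None):
        self._index = index or source_index  # same default as the PR's __init__

    @property
    def index(self):
        return self._index  # previously: return self.deeplake_ds.index

    def __getitem__(self, item):
        return QueryDatasetView(self._index, index=self.index[item])


view = QueryDatasetView(FakeIndex())
print(view.index)      # FakeIndex(slice(None, None, None))
print(view[3].index)   # FakeIndex(3) -- the slice now travels with the view
```
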
21 changes: 17 additions & 4 deletions deeplake/core/dataset/deeplake_query_tensor.py
@@ -16,6 +16,7 @@ def __init__(
self,
deeplake_tensor,
indra_tensor,
+        index: Optional[Index] = None,
is_iteration: bool = False,
):
self.deeplake_tensor = deeplake_tensor
@@ -30,6 +31,8 @@ def __init__(

self.first_dim = None

+        self._index = index or Index(self.indra_tensor.index)

def __getattr__(self, key):
try:
return getattr(self.deeplake_tensor, key)
@@ -57,6 +60,7 @@ def __getitem__(
return DeepLakeQueryTensor(
self.deeplake_tensor,
indra_tensor,
+            index=self.index[item],
is_iteration=is_iteration,
)

@@ -74,15 +78,15 @@ def numpy(

def text(self, fetch_chunks: bool = False):
"""Return text data. Only applicable for tensors with 'text' base htype."""
-        if len(self.indra_tensor) == 1:
+        if self.ndim == 1:
return self.indra_tensor.bytes().decode()
return list(
self.indra_tensor[i].bytes().decode() for i in range(len(self.indra_tensor))
)

def dict(self, fetch_chunks: bool = False):
"""Return json data. Only applicable for tensors with 'json' base htype."""
-        if len(self.indra_tensor) == 1:
+        if self.ndim == 1:
return json.loads(self.indra_tensor.bytes().decode())
return list(
json.loads(self.indra_tensor[i].bytes().decode())
@@ -127,10 +131,19 @@ def min_shape(self):

@property
def shape(self):
-        return self.indra_tensor.shape
+        if (
+            not self.indra_tensor.is_sequence
+            and len(self.indra_tensor) == 1
+            and self.index.values[0].subscriptable()
+        ):
+            return (len(self.indra_tensor), *self.indra_tensor.shape)
+        else:
+            return self.indra_tensor.shape

@property
def index(self):
+        if self._index is not None:
+            return self._index
return Index(self.indra_tensor.indexes)

@property
@@ -139,7 +152,7 @@ def shape_interval(self):

@property
def ndim(self):
-        return len(self.max_shape)
+        return len(self.shape)

@property
def meta(self):
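
For readability, here is a hedged restatement of the new `shape` branch as a plain function. `is_sequence`, `length`, `row_shape`, and `subscriptable` stand in for the `indra_tensor` and Index attributes used in the diff, and the `ds[:1]` vs `ds[0]` framing is an assumption about what `subscriptable()` distinguishes; note that `ndim` now follows this `shape` rather than `max_shape`.

```python
def query_tensor_shape(
    is_sequence: bool, length: int, row_shape: tuple, subscriptable: bool
) -> tuple:
    """Single-row, non-sequence tensors reached through a subscriptable index
    (roughly ds[:1].tensor rather than ds[0].tensor) keep a leading length
    dimension; every other case reports the underlying shape unchanged."""
    if not is_sequence and length == 1 and subscriptable:
        return (length, *row_shape)
    return row_shape


# ds[:1].image stays batched -> (1, 10, 10, 3); ds[0].image stays (10, 10, 3).
assert query_tensor_shape(False, 1, (10, 10, 3), True) == (1, 10, 10, 3)
assert query_tensor_shape(False, 1, (10, 10, 3), False) == (10, 10, 3)
```
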
14 changes: 7 additions & 7 deletions deeplake/core/tests/test_deeplake_indra_dataset.py
@@ -242,13 +242,13 @@ def test_sequences_accessing_data(local_auth_ds_generator):

deeplake_indra_ds = deeplake_ds.query("SELECT * GROUP BY label")
assert len(deeplake_indra_ds) == 2
-    assert deeplake_indra_ds.image.shape == [2, None, None, 10, 3]
-    assert deeplake_indra_ds[0].image.shape == [101, 10, 10, 3]
-    assert deeplake_indra_ds[0, 0].image.shape == [10, 10, 3]
+    assert deeplake_indra_ds.image.shape == (2, None, None, 10, 3)
+    assert deeplake_indra_ds[0].image.shape == (101, 10, 10, 3)
+    assert deeplake_indra_ds[0, 0].image.shape == (10, 10, 3)
assert deeplake_indra_ds[0].image.numpy().shape == (101, 10, 10, 3)
-    assert deeplake_indra_ds[1].image.shape == [99, None, 10, 3]
-    assert deeplake_indra_ds[1, 0].image.shape == [10, 10, 3]
-    assert deeplake_indra_ds[1, 98].image.shape == [20, 10, 3]
+    assert deeplake_indra_ds[1].image.shape == (99, None, 10, 3)
+    assert deeplake_indra_ds[1, 0].image.shape == (10, 10, 3)
+    assert deeplake_indra_ds[1, 98].image.shape == (20, 10, 3)
assert deeplake_indra_ds[1].image.numpy().shape == (99,)
assert deeplake_indra_ds[1].image.numpy()[0].shape == (10, 10, 3)
assert deeplake_indra_ds[1].image.numpy()[98].shape == (20, 10, 3)
@@ -335,7 +335,7 @@ def test_virtual_tensors(local_auth_ds_generator):
"json",
"num_labels",
]
-    assert deeplake_indra_ds.text[0].data() == {"value": "Hello 0"}
+    assert deeplake_indra_ds.text[0].data() == deeplake_ds.text[0].data()
assert deeplake_indra_ds.json[0].data() == {"value": '{"key": "val"}'}
assert deeplake_ds.json[0].data() == {"value": '{"key": "val"}'}

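
Context for the assertions above: `shape` now compares equal to a tuple, and `None` marks dimensions that vary across the grouped samples. The helper below is not Deep Lake code, only an illustration of that convention using the shapes from the test:

```python
def combined_shape(shapes):
    """Collapse per-sample shapes into one, using None where samples disagree."""
    first, *rest = shapes
    merged = list(first)
    for shape in rest:
        for i, dim in enumerate(shape):
            if merged[i] != dim:
                merged[i] = None
    return (len(shapes), *merged)


# Two groups: 101 frames of 10x10x3, and 99 frames whose first frame dimension
# varies -> matches the asserted (2, None, None, 10, 3).
assert combined_shape([(101, 10, 10, 3), (99, None, 10, 3)]) == (2, None, None, 10, 3)
```
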
32 changes: 32 additions & 0 deletions deeplake/core/vectorstore/vector_search/indra/test_indra.py
@@ -6,6 +6,7 @@
from deeplake.core.vectorstore.vector_search.indra.tql_distance_metrics import (
get_tql_distance_metric,
)
+from deeplake.core.vectorstore.deeplake_vectorstore import VectorStore

array = "ARRAY[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]"
METRIC_FUNC_TO_METRIC_STRING = {
@@ -61,3 +62,34 @@ def test_tql_metric_to_tql_str(metric, limit=10):
metric, 10, query_embedding, embedding_tensor, "", ["*"]
)
assert parsed_query == METRIC_FUNC_TO_QUERY_STRING[metric]


+def test_search_resulting_shapes():
+    vector_store = VectorStore("hub://activeloop/paul_graham_essay", read_only=True)
+    search_text = "What I Worked On"
+
+    def filter_fn(x):
+        return search_text in x["text"].data()["value"]
+
+    embedding = vector_store.dataset.embedding[0].numpy()
+    embedding_str = "ARRAY[{}]".format(", ".join(map(str, embedding)))
+    TQL_QUERY = f"select * from (select *, L2_NORM(embedding-{embedding_str}) as score where contains(text, '{search_text}')) order by score ASC limit 4"
+
+    view = vector_store.dataset.filter(filter_fn)
+    view_value = view.text.data(aslist=True)["value"]
+    view_value_0 = view[0].text.data(aslist=True)["value"]
+
+    view1 = vector_store.dataset.query(
+        f"select * where contains(text, '{search_text}')"
+    )
+    view1_value = view1.text.data(aslist=True)["value"]
+    view1_value_0 = view1[0].text.data(aslist=True)["value"]
+
+    view2 = vector_store.dataset.query(TQL_QUERY)
+    view2_value = view2.text.data(aslist=True)["value"]
+    view2.text.summary()
+    assert len(view2.text) == len(view2) == 1
+    view2_value_0 = view2[0].text.data(aslist=True)["value"]
+
+    assert view_value == view1_value == view2_value
+    assert view_value_0 == view1_value_0 == view2_value_0
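
The new test interpolates a numpy embedding into TQL as an `ARRAY[...]` literal. The snippet below isolates that step with a fabricated embedding (the test reads the real one via `vector_store.dataset.embedding[0].numpy()`); `to_tql_array` is a hypothetical helper, not part of the Deep Lake API:

```python
import numpy as np


def to_tql_array(embedding: np.ndarray) -> str:
    """Render a 1-D embedding as the ARRAY[...] literal TQL expects."""
    return "ARRAY[{}]".format(", ".join(map(str, embedding.tolist())))


embedding = np.array([1.0, 2.0, 3.0])  # fabricated; the test uses dataset.embedding[0]
tql = (
    "select * from (select *, "
    f"L2_NORM(embedding-{to_tql_array(embedding)}) as score "
    "where contains(text, 'What I Worked On')) "
    "order by score ASC limit 4"
)
print(tql)
```
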
2 changes: 1 addition & 1 deletion setup.py
@@ -70,7 +70,7 @@ def libdeeplake_availabe():
extras_require["all"] = [req_map[r] for r in all_extras]

if libdeeplake_availabe():
libdeeplake = "libdeeplake==0.0.60"
libdeeplake = "libdeeplake==0.0.61"
extras_require["enterprise"] = [libdeeplake, "pyjwt"]
extras_require["all"].append(libdeeplake)

