Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tag htype #2676

Merged
merged 13 commits into from
Dec 1, 2023
1 change: 1 addition & 0 deletions deeplake/api/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -1211,6 +1211,7 @@ def test_htypes_list():
"point_cloud",
"polygon",
"segment_mask",
"tag",
"text",
"video",
]
Expand Down
15 changes: 15 additions & 0 deletions deeplake/api/tests/test_tag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import numpy as np
import deeplake
import pytest


@pytest.mark.parametrize("chunk_compression", [None, "lz4"])
def test_tag(memory_ds, chunk_compression):
with memory_ds as ds:
ds.create_tensor("abc", htype="tag", chunk_compression=chunk_compression)
ds.abc.append("a")
ds.abc.append(["a", "b"])
ds.abc.extend(["a", ["b", "c"]])

np.testing.assert_array_equal(ds.abc.shapes(), np.array([[1], [2], [1], [2]]))
assert ds.abc.data()["value"] == [["a"], ["a", "b"], ["a"], ["b", "c"]]
4 changes: 2 additions & 2 deletions deeplake/core/chunk/base_chunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def __init__(
else (len(tensor_meta.max_shape) if tensor_meta.max_shape else None)
)
self.is_text_like = (
self.htype in {"json", "list", "text"} or self.tensor_meta.is_link
self.htype in {"json", "list", "text", "tag"} or self.tensor_meta.is_link
)

self.compression = compression
Expand Down Expand Up @@ -328,7 +328,7 @@ def serialize_sample(
incoming_sample = incoming_sample.path
if incoming_sample is None:
htype = "text" if self.tensor_meta.is_link else self.htype
empty_mapping = {"text": "", "list": [], "json": {}}
empty_mapping = {"text": "", "list": [], "json": {}, "tag": []}
incoming_sample = empty_mapping[htype]

if isinstance(incoming_sample, Sample):
Expand Down
10 changes: 7 additions & 3 deletions deeplake/core/chunk_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ def is_data_cachable(self):
tensor_meta = self.tensor_meta
return (
self.chunk_class == UncompressedChunk
and tensor_meta.htype not in ["text", "json", "list", "polygon"]
and tensor_meta.htype not in ["text", "json", "list", "polygon", "tag"]
and tensor_meta.max_shape
and (tensor_meta.max_shape == tensor_meta.min_shape)
and (np.prod(tensor_meta.max_shape) < 20)
Expand Down Expand Up @@ -746,6 +746,10 @@ def _sanitize_samples(
p if isinstance(p, Polygons) else Polygons(p, dtype=tensor_meta.dtype)
for p in samples
]
elif tensor_meta.htype == "tag":
samples = [
sample if isinstance(sample, list) else [sample] for sample in samples
]
return samples, verified_samples

def _convert_class_labels(self, samples):
Expand Down Expand Up @@ -3055,7 +3059,7 @@ def get_empty_sample(self):
raise ValueError("This tensor has no samples, cannot get empty sample.")
htype = self.tensor_meta.htype
dtype = self.tensor_meta.dtype
if htype in ("text", "json", "list"):
if htype in ("text", "json", "list", "tag"):
return get_empty_text_like_sample(htype)
ndim = len(self.tensor_meta.max_shape)
if self.is_sequence:
Expand All @@ -3066,7 +3070,7 @@ def get_empty_sample(self):
@property
def is_text_like(self):
return (
self.tensor_meta.htype in {"text", "json", "list"}
self.tensor_meta.htype in {"text", "json", "list", "tag"}
or self.tensor_meta.is_link
)

Expand Down
2 changes: 1 addition & 1 deletion deeplake/core/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -858,7 +858,7 @@ def _create_tensor(
"nifti",
):
self._create_sample_info_tensor(name)
if create_shape_tensor and htype not in ("text", "json"):
if create_shape_tensor and htype not in ("text", "json", "tag"):
self._create_sample_shape_tensor(name, htype=htype)
if create_id_tensor:
self._create_sample_id_tensor(name)
Expand Down
4 changes: 2 additions & 2 deletions deeplake/core/meta/tensor_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,7 @@ def _validate_required_htype_overwrites(htype: str, htype_overwrite: dict):
)

if htype_overwrite["dtype"] is not None:
if htype in ("json", "list"):
if htype in ("json", "list", "tag"):
validate_json_schema(htype_overwrite["dtype"])
else:
_raise_if_condition(
Expand All @@ -385,7 +385,7 @@ def _format_values(htype: str, htype_overwrite: dict):

dtype = htype_overwrite["dtype"]
if dtype is not None:
if htype in ("json", "list", "intrinsics"):
if htype in ("json", "list", "tag", "intrinsics"):
if getattr(dtype, "__module__", None) == "typing":
htype_overwrite["dtype"] = str(dtype)
else:
Expand Down
2 changes: 1 addition & 1 deletion deeplake/core/sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def buffer(self):

@property
def is_text_like(self):
return self.htype in {"text", "list", "json"}
return self.htype in {"text", "list", "json", "tag"}

@property
def dtype(self):
Expand Down
6 changes: 3 additions & 3 deletions deeplake/core/serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -452,7 +452,7 @@ def check_sample_shape(shape, num_dims):


def text_to_bytes(sample, dtype, htype):
if htype in ("json", "list"):
if htype in ("json", "list", "tag"):
if isinstance(sample, np.ndarray):
if htype == "list":
sample = list(sample) if sample.dtype == object else sample.tolist()
Expand All @@ -463,7 +463,7 @@ def text_to_bytes(sample, dtype, htype):
sample = list(sample)
validate_json_object(sample, dtype)
byts = json.dumps(sample, cls=HubJsonEncoder).encode()
shape = (len(sample),) if htype == "list" else (1,)
shape = (len(sample),) if htype in ("list", "tag") else (1,)
else: # htype == "text":
if isinstance(sample, np.ndarray) and sample.size == 1:
sample = str(sample.reshape(()))
Expand All @@ -480,7 +480,7 @@ def bytes_to_text(buffer, htype):
arr = np.empty(1, dtype=object)
arr[0] = json.loads(bytes.decode(buffer), cls=HubJsonDecoder)
return arr
elif htype == "list":
elif htype in ("list", "tag"):
lst = json.loads(bytes.decode(buffer), cls=HubJsonDecoder)
arr = np.empty(len(lst), dtype=object)
arr[:] = lst
Expand Down
10 changes: 5 additions & 5 deletions deeplake/core/tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -570,7 +570,7 @@ def ndim(self) -> int:
@property
def dtype(self) -> Optional[np.dtype]:
"""Dtype of the tensor."""
if self.base_htype in ("json", "list"):
if self.base_htype in ("json", "list", "tag"):
return np.dtype(str)
if self.meta.dtype:
return np.dtype(self.meta.typestr or self.meta.dtype)
Expand Down Expand Up @@ -977,7 +977,7 @@ def data(self, aslist: bool = False, fetch_chunks: bool = False) -> Any:
return {"value": self.text(fetch_chunks=fetch_chunks)}
if htype == "json":
return {"value": self.dict(fetch_chunks=fetch_chunks)}
if htype == "list":
if htype in ("list", "tag"):
return {"value": self.list(fetch_chunks=fetch_chunks)}
if self.htype == "video":
data = {}
Expand Down Expand Up @@ -1441,9 +1441,9 @@ def dict(self, fetch_chunks: bool = False):
return self._extract_value("json", fetch_chunks=fetch_chunks)

def list(self, fetch_chunks: bool = False):
"""Return list data. Only applicable for tensors with 'list' base htype."""
if self.base_htype != "list":
raise Exception("Only supported for list tensors.")
"""Return list data. Only applicable for tensors with 'list' or 'tag' base htype."""
if self.base_htype not in ("list", "tag"):
raise Exception("Only supported for list and tag tensors.")

if self.ndim == 1:
return list(self.numpy(fetch_chunks=fetch_chunks))
Expand Down
29 changes: 29 additions & 0 deletions deeplake/enterprise/test_pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,35 @@ def test_string_tensors(local_auth_ds):
np.testing.assert_array_equal(batch["strings"], f"string{idx}")


@pytest.mark.slow
@requires_torch
@requires_libdeeplake
@pytest.mark.flaky
def test_tag_tensors(local_auth_ds):
with local_auth_ds as ds:
ds.create_tensor("tags", htype="tag")
ds.tags.extend(
[
f"tag{idx}" if idx % 2 == 0 else [f"tag{idx}", f"tag{idx}"]
for idx in range(5)
]
)

ptds = ds.pytorch(batch_size=1)
for idx, batch in enumerate(ptds):
if idx % 2 == 0:
np.testing.assert_array_equal(batch["tags"], [[f"tag{idx}"]])
else:
np.testing.assert_array_equal(batch["tags"], [[f"tag{idx}", f"tag{idx}"]])

ptds2 = ds.pytorch(batch_size=None)
for idx, batch in enumerate(ptds2):
if idx % 2 == 0:
np.testing.assert_array_equal(batch["tags"], [f"tag{idx}"])
else:
np.testing.assert_array_equal(batch["tags"], [f"tag{idx}", f"tag{idx}"])


@pytest.mark.xfail(raises=NotImplementedError, strict=True)
def test_pytorch_large():
raise NotImplementedError
Expand Down
9 changes: 9 additions & 0 deletions deeplake/htype.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ class htype:
IMAGE_RGB = "image.rgb"
IMAGE_GRAY = "image.gray"
CLASS_LABEL = "class_label"
TAG = "tag"
BBOX = "bbox"
BBOX_3D = "bbox.3d"
VIDEO = "video"
Expand Down Expand Up @@ -95,6 +96,7 @@ class htype:
},
htype.LIST: {"dtype": "List"},
htype.TEXT: {"dtype": "str"},
htype.TAG: {"dtype": "List"},
htype.DICOM: {"sample_compression": "dcm"},
htype.NIFTI: {},
htype.POINT_CLOUD: {"dtype": "float32"},
Expand Down Expand Up @@ -135,6 +137,11 @@ def CLASS_LABEL(shape, dtype):
constraints.ndim_error("class_label", len(shape))
)

@staticmethod
def TAG(shape, dtype):
if dtype.name != "str":
raise IncompatibleHtypeError(constraints.dtype_error("tag", dtype))

@staticmethod
def BBOX(shape, dtype):
if len(shape) not in (2, 3):
Expand Down Expand Up @@ -185,6 +192,7 @@ def POINT(shape, dtype):
HTYPE_CONSTRAINTS: Dict[str, Callable] = {
htype.IMAGE: constraints.IMAGE,
htype.CLASS_LABEL: constraints.CLASS_LABEL,
htype.TAG: constraints.TAG,
htype.BBOX: constraints.BBOX,
htype.BBOX_3D: constraints.BBOX_3D,
htype.EMBEDDING: constraints.EMBEDDING,
Expand All @@ -211,6 +219,7 @@ def POINT(shape, dtype):
htype.IMAGE_GRAY: _image_compressions,
htype.VIDEO: VIDEO_COMPRESSIONS[:],
htype.AUDIO: AUDIO_COMPRESSIONS[:],
htype.TAG: BYTE_COMPRESSIONS[:],
htype.TEXT: BYTE_COMPRESSIONS[:],
htype.LIST: BYTE_COMPRESSIONS[:],
htype.JSON: BYTE_COMPRESSIONS[:],
Expand Down
15 changes: 13 additions & 2 deletions deeplake/integrations/pytorch/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@
)

if isinstance(elem, np.ndarray) and elem.size > 0 and isinstance(elem[0], str):
batch = [it[0] for it in batch]
if elem.dtype == object:
return [it.tolist() for it in batch]

Check warning on line 26 in deeplake/integrations/pytorch/common.py

View check run for this annotation

Codecov / codecov/patch

deeplake/integrations/pytorch/common.py#L25-L26

Added lines #L25 - L26 were not covered by tests
else:
batch = [it[0] for it in batch]

Check warning on line 28 in deeplake/integrations/pytorch/common.py

View check run for this annotation

Codecov / codecov/patch

deeplake/integrations/pytorch/common.py#L28

Added line #L28 was not covered by tests
elif isinstance(elem, (tuple, list)) and len(elem) > 0 and isinstance(elem[0], str):
batch = [it[0] for it in batch]
elif isinstance(elem, Polygons):
Expand Down Expand Up @@ -52,7 +55,10 @@
if isinstance(data, IterableOrderedDict):
return IterableOrderedDict((k, convert_fn(v)) for k, v in data.items())
if isinstance(data, np.ndarray) and data.size > 0 and isinstance(data[0], str):
data = data[0]
if data.dtype == object:
return data.tolist()

Check warning on line 59 in deeplake/integrations/pytorch/common.py

View check run for this annotation

Codecov / codecov/patch

deeplake/integrations/pytorch/common.py#L58-L59

Added lines #L58 - L59 were not covered by tests
else:
data = data[0]

Check warning on line 61 in deeplake/integrations/pytorch/common.py

View check run for this annotation

Codecov / codecov/patch

deeplake/integrations/pytorch/common.py#L61

Added line #L61 was not covered by tests
elif isinstance(data, Polygons):
data = data.numpy()

Expand Down Expand Up @@ -85,6 +91,7 @@
jpeg_png_compressed_tensors = []
json_tensors = []
list_tensors = []
tag_tensors = []
supported_image_compressions = {"png", "jpeg"}
for tensor_name in tensors:
tensor = dataset._get_tensor_from_root(tensor_name)
Expand All @@ -101,13 +108,17 @@
json_tensors.append(tensor_name)
elif meta.htype == "list":
list_tensors.append(tensor_name)
elif meta.htype == "tag":
tag_tensors.append(tensor_name)

if verbose and (json_tensors or list_tensors):
json_list_tensors = set(json_tensors + list_tensors)
warnings.warn(
f"The following tensors have json or list htype: {json_list_tensors}. Collation of these tensors will fail by default. Ensure that these tensors are either transformed by specifying a transform or a custom collate_fn is specified to handle them."
)

list_tensors += tag_tensors

return jpeg_png_compressed_tensors, json_tensors, list_tensors


Expand Down
27 changes: 27 additions & 0 deletions deeplake/integrations/tests/test_pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,33 @@ def test_string_tensors(local_ds):
np.testing.assert_array_equal(batch["strings"], f"string{idx}")


@requires_torch
@pytest.mark.flaky
def test_tag_tensors(local_ds):
with local_ds:
local_ds.create_tensor("tags", htype="tag")
local_ds.tags.extend(
[
f"tag{idx}" if idx % 2 == 0 else [f"tag{idx}", f"tag{idx}"]
for idx in range(5)
]
)

ptds = local_ds.pytorch(batch_size=1)
for idx, batch in enumerate(ptds):
if idx % 2 == 0:
np.testing.assert_array_equal(batch["tags"], [[f"tag{idx}"]])
else:
np.testing.assert_array_equal(batch["tags"], [[f"tag{idx}", f"tag{idx}"]])

ptds2 = local_ds.pytorch(batch_size=None)
for idx, batch in enumerate(ptds2):
if idx % 2 == 0:
np.testing.assert_array_equal(batch["tags"], [f"tag{idx}"])
else:
np.testing.assert_array_equal(batch["tags"], [f"tag{idx}", f"tag{idx}"])


@pytest.mark.slow
@requires_torch
@pytest.mark.flaky
Expand Down
2 changes: 1 addition & 1 deletion deeplake/util/casting.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def get_empty_text_like_sample(htype: str):
return ""
elif htype == "json":
return {}
elif htype == "list":
elif htype == "list" or htype == "tag":
return []
else:
raise ValueError(
Expand Down
2 changes: 2 additions & 0 deletions deeplake/util/htype.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
raise ValueError(
"Can't create a linked tensor with a generic htype, you need to specify htype, for example link[image]"
)
if is_sequence and htype == HTYPE.TAG:
raise ValueError("Htype sequence[tag] is not supported")

Check warning on line 31 in deeplake/util/htype.py

View check run for this annotation

Codecov / codecov/patch

deeplake/util/htype.py#L31

Added line #L31 was not covered by tests
return is_sequence, is_link, htype


Expand Down