NPZ replacement format (only) #1047

Merged · 97 commits · Jul 14, 2021

Commits
dbd83e3
initial
farizrahman4u Jul 7, 2021
557df26
format
farizrahman4u Jul 7, 2021
95ce176
typo
farizrahman4u Jul 7, 2021
950ac7c
typo
farizrahman4u Jul 7, 2021
b55fc38
bug fix
farizrahman4u Jul 7, 2021
b0f7d88
some docs and fix 1D shapes
verbose-void Jul 7, 2021
3f34045
Merge branch 'fr_optimizations' of github.com:activeloopai/Hub into f…
verbose-void Jul 7, 2021
0c83842
Merge branch 'refactor/2.0/chunk-engine' of github.com:activeloopai/H…
verbose-void Jul 7, 2021
64c3037
Merge branch 'refactor/2.0/chunk-engine' of https://www.github.com/ac…
farizrahman4u Jul 7, 2021
2281692
add assertion for easy debugging
farizrahman4u Jul 7, 2021
4039fe9
Merge branch 'fr_optimizations' of https://www.github.com/activeloopa…
farizrahman4u Jul 7, 2021
26cd377
one off
farizrahman4u Jul 7, 2021
fadc940
segfault fix
farizrahman4u Jul 7, 2021
035bacc
smol fixes
farizrahman4u Jul 7, 2021
1eddf74
add clear cache to memory test in api and fix return in `decode`
verbose-void Jul 7, 2021
3a409e1
add a better exception for pointer GC
verbose-void Jul 7, 2021
0f58f6e
merge conf fix + infer chunk byte size
farizrahman4u Jul 8, 2021
ef11252
smol fix
farizrahman4u Jul 8, 2021
2fb777a
Merge branch 'fr_optimizations' of https://www.github.com/activeloopa…
farizrahman4u Jul 8, 2021
57d2da7
all fix
farizrahman4u Jul 8, 2021
4dde08a
chuunk id optims init
farizrahman4u Jul 8, 2021
df03495
debug msgs
farizrahman4u Jul 8, 2021
71ee06c
fix refcounting bug
farizrahman4u Jul 8, 2021
44b5ade
ren shards->data
farizrahman4u Jul 8, 2021
6f086e2
faster buff load
farizrahman4u Jul 8, 2021
31aa04d
save 1 memcpy
farizrahman4u Jul 8, 2021
e98b008
indexing
farizrahman4u Jul 8, 2021
3b68c57
cache data len
farizrahman4u Jul 8, 2021
2d91772
cache _num_chunks
farizrahman4u Jul 8, 2021
dcea7cc
chunk engine updates cache size
verbose-void Jul 9, 2021
04276f0
rename `remove` -> `remove_from_dirty`
verbose-void Jul 9, 2021
bad4186
Merge branch 'optimize/uploads' into fr_optimizations
verbose-void Jul 9, 2021
36002e2
remove some `sum`s
verbose-void Jul 9, 2021
fea211d
optims for seq access
farizrahman4u Jul 9, 2021
c9bc44a
Merge branch 'fr_optimizations' of https://www.github.com/activeloopa…
farizrahman4u Jul 9, 2021
1dfb3c2
cache entry
farizrahman4u Jul 9, 2021
c8a9931
10s upload speedup
verbose-void Jul 9, 2021
16d5fb9
fix mypy
verbose-void Jul 9, 2021
96280b7
Merge branch 'optimize/uploads' of github.com:activeloopai/Hub into f…
verbose-void Jul 9, 2021
121753d
load chunk ID encoder
verbose-void Jul 9, 2021
7c2221e
mypass binsearch
farizrahman4u Jul 9, 2021
60b73d3
format
farizrahman4u Jul 9, 2021
07456b5
merge conf fix
farizrahman4u Jul 9, 2021
8cb1bce
rem debug line
farizrahman4u Jul 9, 2021
91596e4
fr_optimizations_2
farizrahman4u Jul 10, 2021
9c3c6d5
Merge pull request #1039 from activeloopai/fr_optimizations_2
farizrahman4u Jul 10, 2021
2f28314
optimize tensor iteration
farizrahman4u Jul 11, 2021
7fd0193
dsiter
farizrahman4u Jul 11, 2021
b63c565
fix test
farizrahman4u Jul 11, 2021
ae3c17d
fix test
farizrahman4u Jul 11, 2021
069a9f6
ds iter fixes
farizrahman4u Jul 11, 2021
d0306a5
tests
farizrahman4u Jul 11, 2021
ec4516b
test
farizrahman4u Jul 11, 2021
f6d71f0
format
farizrahman4u Jul 11, 2021
1a107ca
pytorch training optims
farizrahman4u Jul 11, 2021
0cefcd2
rem bad checks
farizrahman4u Jul 11, 2021
318d496
format
farizrahman4u Jul 12, 2021
b1591c4
format + smoll change in encoding format
farizrahman4u Jul 12, 2021
8f475a9
minimize searchsorted calls
farizrahman4u Jul 12, 2021
a7dd7f1
refac chunk_id.py
farizrahman4u Jul 13, 2021
746201c
more refacc
farizrahman4u Jul 13, 2021
440a0b7
encode_*->serialize_*
farizrahman4u Jul 13, 2021
ceae226
Update hub/core/chunk.py
farizrahman4u Jul 13, 2021
88ab4bb
docstring
farizrahman4u Jul 13, 2021
f3c891c
Merge branch 'fr_optimizations' of https://www.github.com/activeloopa…
farizrahman4u Jul 13, 2021
3f3af34
Merge branch 'main' into fr_optimizations
farizrahman4u Jul 13, 2021
4b29507
docstring
farizrahman4u Jul 13, 2021
5093220
Merge branch 'fr_optimizations' of https://www.github.com/activeloopa…
farizrahman4u Jul 13, 2021
fb46618
merge main
farizrahman4u Jul 13, 2021
5b38608
rm comments
farizrahman4u Jul 13, 2021
4f25b21
rm unused import
farizrahman4u Jul 13, 2021
82ce5be
revert dataset.py
farizrahman4u Jul 13, 2021
35d3a4a
revert tensor.py
farizrahman4u Jul 13, 2021
cb4ea21
revert test_api.py
farizrahman4u Jul 13, 2021
868004e
revert ChunkEngine.numpy
farizrahman4u Jul 13, 2021
c7a1321
revert read_sample_from_chunk
farizrahman4u Jul 13, 2021
6519986
rem unreachable
farizrahman4u Jul 13, 2021
0ddc61b
remove iter logic
farizrahman4u Jul 13, 2021
609ea67
revert pytorch.py
farizrahman4u Jul 13, 2021
7823060
revert dataset.py
farizrahman4u Jul 13, 2021
390151b
reverts
farizrahman4u Jul 13, 2021
77ca4f7
revert tensor.py
farizrahman4u Jul 13, 2021
e3ab3bf
reverts
farizrahman4u Jul 13, 2021
0e5e84c
fixes
farizrahman4u Jul 13, 2021
0ee485d
add chunk size tests
verbose-void Jul 13, 2021
7df6b7c
Merge branch 'fr_serialization' of github.com:activeloopai/Hub into f…
verbose-void Jul 13, 2021
d16d550
fixes
farizrahman4u Jul 14, 2021
a2fa181
Merge branch 'fr_serialization' of https://www.github.com/activeloopa…
farizrahman4u Jul 14, 2021
1b0973a
fixes
farizrahman4u Jul 14, 2021
5895438
rem assert
farizrahman4u Jul 14, 2021
f15d71c
test chunk sizes on memds only
farizrahman4u Jul 14, 2021
0d7e9f2
Update hub/core/serialize.py
farizrahman4u Jul 14, 2021
8310e36
Update hub/core/serialize.py
farizrahman4u Jul 14, 2021
3a8ccc8
Update hub/core/serialize.py
farizrahman4u Jul 14, 2021
d9a846b
Update hub/core/serialize.py
farizrahman4u Jul 14, 2021
8c3b83b
rem assertions
farizrahman4u Jul 14, 2021
7007c59
Merge branch 'fr_serialization' of https://www.github.com/activeloopa…
farizrahman4u Jul 14, 2021
121 changes: 121 additions & 0 deletions hub/api/tests/test_chunk_sizes.py
@@ -0,0 +1,121 @@
import numpy as np
from hub.constants import KB


def _update_chunk_sizes(ds, max_chunk_size: int):
"""Updates all chunk sizes for tensors that already exist in `ds`. If
more tensors are created after calling this method, those tensors will NOT have
the same chunk size.
"""

# TODO: set / update chunk sizes API (to replace this function)

min_chunk_size = max_chunk_size // 2

for tensor in ds.tensors.values():
chunk_engine = tensor.chunk_engine

chunk_engine.max_chunk_size = max_chunk_size
chunk_engine.min_chunk_size = min_chunk_size


def _assert_num_chunks(tensor, expected_num_chunks):
chunk_engine = tensor.chunk_engine
actual_num_chunks = chunk_engine.chunk_id_encoder.num_chunks
assert actual_num_chunks == expected_num_chunks


def _create_tensors(ds):
images = ds.create_tensor("images", htype="image", sample_compression=None)
labels = ds.create_tensor("labels", htype="class_label")
return images, labels


def _append_tensors(images, labels):
for i in range(100):
x = np.ones((28, 28), dtype=np.uint8) * i
y = np.uint32(i)

images.append(x)
labels.append(y)


def _extend_tensors(images, labels):
images.extend(np.ones((100, 28, 28), dtype=np.uint8))
labels.extend(np.ones(100, dtype=np.uint32))


def test_append(memory_ds):
ds = memory_ds
images, labels = _create_tensors(ds)
_update_chunk_sizes(ds, 32 * KB)

_append_tensors(images, labels)

_assert_num_chunks(labels, 1)
_assert_num_chunks(images, 5)

_append_tensors(images, labels)

_assert_num_chunks(labels, 1)
_assert_num_chunks(images, 10)

_append_tensors(images, labels)

_assert_num_chunks(labels, 1)
_assert_num_chunks(images, 15)

assert len(ds) == 300


def test_extend(memory_ds):
ds = memory_ds
images, labels = _create_tensors(ds)

_update_chunk_sizes(ds, 32 * KB)

_extend_tensors(images, labels)

_assert_num_chunks(labels, 1)
_assert_num_chunks(images, 5)

_extend_tensors(images, labels)

_assert_num_chunks(labels, 1)
_assert_num_chunks(images, 10)

_extend_tensors(images, labels)

_assert_num_chunks(labels, 1)
_assert_num_chunks(images, 15)

assert len(ds) == 300


def test_extend_and_append(memory_ds):
ds = memory_ds
images, labels = _create_tensors(ds)

_update_chunk_sizes(ds, 32 * KB)

_extend_tensors(images, labels)

_assert_num_chunks(labels, 1)
_assert_num_chunks(images, 5)

_append_tensors(images, labels)

_assert_num_chunks(labels, 1)
_assert_num_chunks(images, 10)

_extend_tensors(images, labels)

_assert_num_chunks(labels, 1)
_assert_num_chunks(images, 15)

_append_tensors(images, labels)

_assert_num_chunks(labels, 1)
_assert_num_chunks(images, 20)

assert len(ds) == 400
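
For context on the counts asserted above: each 28×28 uint8 image is 784 bytes and each uint32 label is 4 bytes, so image data dominates. The sketch below is plain Python arithmetic, independent of Hub; the 20-samples-per-chunk figure is an inference from the expected counts, not something read out of the chunk engine.

# Back-of-the-envelope check of the chunk counts asserted in the tests above.
KB = 1024
max_chunk_size = 32 * KB
min_chunk_size = max_chunk_size // 2      # 16 KB, mirroring _update_chunk_sizes

image_nbytes = 28 * 28                    # one uint8 image -> 784 bytes
label_nbytes = 4                          # one uint32 label -> 4 bytes

# Labels: even 400 appends total only 1,600 bytes, far below 16 KB,
# which is why every test expects the labels tensor to stay at 1 chunk.
assert 400 * label_nbytes < min_chunk_size

# Images: 20 samples fit under the 16 KB threshold, 21 do not ...
assert 20 * image_nbytes <= min_chunk_size < 21 * image_nbytes

# ... so 100 images per batch -> 5 chunks, matching _assert_num_chunks(images, 5).
print(100 * image_nbytes / 5)             # 15680.0 bytes per chunk (~15.3 KB)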
3 changes: 1 addition & 2 deletions hub/constants.py
@@ -40,10 +40,9 @@

CHUNKS_FOLDER = "chunks"

CHUNK_EXTENSION = "npz"
ENCODED_CHUNK_NAMES_FOLDER = "chunks_index"
# unsharded naming will help with backwards compatibility
ENCODED_CHUNK_NAMES_FILENAME = f"unsharded.{CHUNK_EXTENSION}"
ENCODED_CHUNK_NAMES_FILENAME = f"unsharded"

ENCODING_DTYPE = np.uint32
# calculate the number of bits to shift right when converting a 128-bit uuid into `ENCODING_DTYPE`
39 changes: 17 additions & 22 deletions hub/core/chunk.py
@@ -8,6 +8,8 @@
from hub.core.meta.encode.shape import ShapeEncoder
from hub.core.meta.encode.byte_positions import BytePositionsEncoder

from hub.core.serialize import serialize_chunk, deserialize_chunk, infer_chunk_num_bytes


class Chunk(Cachable):
def __init__(
@@ -108,31 +110,24 @@ def update_headers(self, incoming_num_bytes: int, sample_shape: Tuple[int]):

def __len__(self):
"""Calculates the number of bytes `tobytes` will be without having to call `tobytes`. Used by `LRUCache` to determine if this chunk can be cached."""

shape_nbytes = self.shapes_encoder.nbytes
range_nbytes = self.byte_positions_encoder.nbytes
error_bytes = 32 # to account for any extra delimiters/metadata that `np.savez` may create in excess
return shape_nbytes + range_nbytes + self.num_data_bytes + error_bytes
return infer_chunk_num_bytes(
hub.__version__,
self.shapes_encoder.array,
self.byte_positions_encoder.array,
len_data=len(self._data),
)

def tobytes(self) -> memoryview:
out = BytesIO()

# TODO: for fault tolerance, we should have a chunk store the ID for the next chunk
# TODO: in case the index chunk meta gets pwned (especially during a potentially failed transform job merge)

np.savez(
out,
version=hub.__encoded_version__,
shapes=self.shapes_encoder.array,
byte_positions=self.byte_positions_encoder.array,
data=np.frombuffer(self.memoryview_data, dtype=np.uint8),
return serialize_chunk(
hub.__version__,
self.shapes_encoder.array,
self.byte_positions_encoder.array,
[self._data],
)
out.seek(0)
return out.getbuffer()

@classmethod
def frombuffer(cls, buffer: bytes):
bio = BytesIO(buffer)
npz = np.load(bio)
data = memoryview(npz["data"].tobytes())
return cls(npz["shapes"], npz["byte_positions"], data=data)
if not buffer:
return cls()
version, shapes, byte_positions, data = deserialize_chunk(buffer)
return cls(shapes, byte_positions, data=data)
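
The `serialize_chunk`, `deserialize_chunk`, and `infer_chunk_num_bytes` helpers used above live in `hub/core/serialize.py`, which is not shown in this diff. As a rough illustration of the kind of headerized binary layout that can replace `np.savez` here (version string, the two encoder arrays, then raw sample bytes), a minimal standalone sketch follows; the field order, byte widths, and function names are assumptions for illustration, not the actual Hub format.

# Minimal standalone sketch of a headerized chunk layout of the kind this PR
# introduces in hub/core/serialize.py. Field order and widths are illustrative
# assumptions, not the actual Hub format.
import numpy as np

ENCODING_DTYPE = np.uint32


def serialize_chunk_sketch(version, shapes, byte_positions, data):
    """Concatenate: version header, the two encoder arrays, raw sample bytes."""
    version_bytes = version.encode("ascii")
    parts = [bytes([len(version_bytes)]), version_bytes]
    for arr in (shapes, byte_positions):
        arr = np.ascontiguousarray(arr, dtype=ENCODING_DTYPE)
        # store (nrows, ncols) so the 2-D encoder array can be rebuilt on load
        parts.append(np.asarray(arr.shape, dtype=np.uint32).tobytes())
        parts.append(arr.tobytes())
    parts.append(bytes(data))
    return b"".join(parts)


def deserialize_chunk_sketch(buffer):
    """Inverse of serialize_chunk_sketch."""
    offset = 1 + buffer[0]
    version = bytes(buffer[1:offset]).decode("ascii")
    arrays = []
    for _ in range(2):
        nrows, ncols = np.frombuffer(buffer, dtype=np.uint32, count=2, offset=offset)
        offset += 8
        count = int(nrows) * int(ncols)
        arr = np.frombuffer(buffer, dtype=ENCODING_DTYPE, count=count, offset=offset)
        arrays.append(arr.reshape(int(nrows), int(ncols)))
        offset += count * 4
    shapes, byte_positions = arrays
    return version, shapes, byte_positions, memoryview(buffer)[offset:]

Round-tripping, for example, serialize_chunk_sketch("2.0.0", np.zeros((1, 3), dtype=np.uint32), np.zeros((1, 3), dtype=np.uint32), b"abc") through deserialize_chunk_sketch recovers the version, both arrays, and a view over the trailing b"abc" bytes without copying them.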
24 changes: 13 additions & 11 deletions hub/core/meta/encode/chunk_id.py
@@ -6,6 +6,8 @@
from typing import Optional, Tuple
import numpy as np
from uuid import uuid4
from hub.core.serialize import serialize_chunkids, deserialize_chunkids


# these constants are for accessing the data layout. see the `ChunkIdEncoder` docstring.
CHUNK_ID_INDEX = 0
@@ -71,13 +73,11 @@ def __init__(self):
self._encoded_ids = None

def tobytes(self) -> memoryview:
bio = BytesIO()
np.savez(
bio,
version=hub.__encoded_version__,
ids=self._encoded_ids,
)
return bio.getbuffer()
if self._encoded_ids is None:
return serialize_chunkids(
hub.__version__, [np.array([], dtype=ENCODING_DTYPE)]
)
return serialize_chunkids(hub.__version__, [self._encoded_ids])

@staticmethod
def name_from_id(id: ENCODING_DTYPE) -> str:
@@ -102,9 +102,11 @@ def get_name_for_chunk(self, chunk_index: int) -> str:
@classmethod
def frombuffer(cls, buffer: bytes):
instance = cls()
bio = BytesIO(buffer)
npz = np.load(bio)
instance._encoded_ids = npz["ids"]
if not buffer:
return instance
version, ids = deserialize_chunkids(buffer)
if ids.nbytes:
instance._encoded_ids = ids
return instance

@property
@@ -117,7 +119,7 @@ def num_chunks(self) -> int:
def num_samples(self) -> int:
if self._encoded_ids is None:
return 0
return int(self._encoded_ids[-1, LAST_INDEX_INDEX] + 1)
return int(self._encoded_ids[-1, LAST_INDEX_INDEX]) + 1

def generate_chunk_id(self) -> ENCODING_DTYPE:
"""Generates a random 64bit chunk ID using uuid4. Also prepares this ID to have samples registered to it.
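
A small, hypothetical round trip tying the above together (assuming a Hub build with this PR merged; the calls and the empty-buffer behavior are taken from the diff rather than verified against a released version):

# Hypothetical round trip for the empty-state handling added above.
from hub.core.meta.encode.chunk_id import ChunkIdEncoder

enc = ChunkIdEncoder()
assert enc.num_samples == 0

# An encoder with no registered chunks still serializes (as an empty ids array) ...
buf = enc.tobytes()

# ... and an empty buffer deserializes into a fresh, empty encoder.
assert ChunkIdEncoder.frombuffer(b"").num_samples == 0

# Deserializing the serialized empty encoder is also empty, because
# `frombuffer` only keeps `ids` when `ids.nbytes` is non-zero.
assert ChunkIdEncoder.frombuffer(bytes(buf)).num_samples == 0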