From dbd83e33ee013b9ae1b1cfe739ae3d5bc776c26a Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 7 Jul 2021 07:29:40 +0530 Subject: [PATCH 01/79] initial --- hub/core/chunk.py | 69 ++++++++------ hub/core/chunk_engine.py | 4 +- hub/core/lowlevel.py | 191 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 236 insertions(+), 28 deletions(-) create mode 100644 hub/core/lowlevel.py diff --git a/hub/core/chunk.py b/hub/core/chunk.py index 95c1f941ff..05cbc4b9a8 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -8,6 +8,8 @@ from hub.core.meta.encode.shape import ShapeEncoder from hub.core.meta.encode.byte_positions import BytePositionsEncoder +from hub.core.lowlevel import encode, decode, malloc, _write_pybytes + class Chunk(Cachable): def __init__( @@ -44,17 +46,46 @@ def __init__( self.shapes_encoder = ShapeEncoder(encoded_shapes) self.byte_positions_encoder = BytePositionsEncoder(encoded_byte_positions) - self._data: Union[memoryview, bytearray] = data or bytearray() + self._data: List[memoryview] = [] if data is None else [data] + + + @property def memoryview_data(self): - if isinstance(self._data, memoryview): - return self._data - return memoryview(self._data) + # deprecated + if len(self._data) == 1: + return self._data[0] + ptr = malloc(sum(map(len,self._data))) + for data in self._data: + ptr = _write_pybytes(ptr, data) + return memoryview(ptr.bytes) + + def _get_2d_idx(self, idx): + i = 0 + while len(self._data[i]) <= idx: + i += 1 + idx -= len(self._data[i]) + return i, idx + + def view(self, start, end): + if len(self._data) == 1: + return self._data[0][start: end] + start2d = self._get_2d_idx(start) + end2d = self._get_2d_idx(end) + byts = [] + byts.append(self._data[start2d[0]][start2d[1]:]) + for i in range(start2d[0] + 1, end2d[0]): + byts.append(self._data[i]) + byts.append(self._data[end2d[0]][:end2d[1]]) + ptr = malloc(end - start) + for byt in byts: + ptr = _write_pybytes(ptr, byt) + return memoryview(ptr.bytes) @property def num_data_bytes(self): - return len(self._data) + return sum(map(len, self._data)) def is_under_min_space(self, min_data_bytes_target: int) -> bool: """If this chunk's data is less than `min_data_bytes_target`, returns True.""" @@ -84,11 +115,11 @@ def append_sample(self, buffer: memoryview, max_data_bytes: int, shape: Tuple[in ) # `_data` will be a `memoryview` if `frombuffer` is called. 
- if isinstance(self._data, memoryview): - self._data = bytearray(self._data) + # if isinstance(self._data, memoryview): + # self._data = bytearray(self._data) # note: incoming_num_bytes can be 0 (empty sample) - self._data += buffer + self._data.append(buffer) self.update_headers(incoming_num_bytes, shape) def update_headers(self, incoming_num_bytes: int, sample_shape: Tuple[int]): @@ -116,24 +147,10 @@ def __len__(self): return shape_nbytes + range_nbytes + self.num_data_bytes + error_bytes def tobytes(self) -> memoryview: - out = BytesIO() - - # TODO: for fault tolerance, we should have a chunk store the ID for the next chunk - # TODO: in case the index chunk meta gets pwned (especially during a potentially failed transform job merge) + return encode(hub.__version__, self.shapes.encoder.array, self.byte_positions_encoder.array, self._data) - np.savez( - out, - version=hub.__encoded_version__, - shapes=self.shapes_encoder.array, - byte_positions=self.byte_positions_encoder.array, - data=np.frombuffer(self.memoryview_data, dtype=np.uint8), - ) - out.seek(0) - return out.getbuffer() @classmethod - def frombuffer(cls, buffer: bytes): - bio = BytesIO(buffer) - npz = np.load(bio) - data = memoryview(npz["data"].tobytes()) - return cls(npz["shapes"], npz["byte_positions"], data=data) + def frombuffer(cls, buffer: bytes) -> "Chunk": + version, shapes, byte_positions, data = decode(buffer) + return cls(shapes, byte_position, data=data) diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index 51eb5c4747..b695dc0a9d 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -365,12 +365,12 @@ def read_sample_from_chunk( enc = self.chunk_id_encoder - buffer = chunk.memoryview_data + # buffer = chunk.memoryview_data local_sample_index = enc.get_local_sample_index(global_sample_index) shape = chunk.shapes_encoder[local_sample_index] sb, eb = chunk.byte_positions_encoder[local_sample_index] - buffer = buffer[sb:eb] + buffer = chunk.view(sb, eb) if expect_compressed: sample = decompress_array(buffer, shape) else: diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py new file mode 100644 index 0000000000..94e361019d --- /dev/null +++ b/hub/core/lowlevel.py @@ -0,0 +1,191 @@ +import numpy as np +import ctypes +from collections import namedtuple +from typing import Tuple, List, Union, Optional +import hub + + +class Pointer(object): + __slots__ = ("address", "size", "_c_array") + + def __init__(self, address: Optional[int] = None, size: Optional[int] = None, c_array: Optional[ctypes.Array] = None) -> None: + if c_array is None: + if address is None or size is None: + raise ValueError("Expected c_array or address and size args.") + self.address = address + self.size = size + self._set_c_array() + else: + self._c_array = c_array + self.address = ctypes.addressof(c_array) + self.size = len(c_array) + + def _set_c_array(self) -> None: + self._c_array = (ctypes.c_byte * self.size).from_address(self.address) + + def __add__(self, i: int) -> "Pointer": + assert i >= 0 + assert i <= self.size + return Pointer(self.address + i, self.size - i) + + def __iadd__(self, i: int) -> "Pointer": + assert i >= 0 + assert i <= self.size + self.address += i + self.size -= i + self._set_c_array() + return self + + def __setitem__(self, idx: int, byte: int) -> None: + self._c_array[idx] = byte + + def __getitem__(self, idx: int) -> int: + return self._c_array[idx] + + @property + def memoryview(self): + return memoryview(self._c_array) + + @property + def bytes(self): + return bytes(self._c_array) 
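+
+    # Illustrative usage sketch: pointer arithmetic yields a new `Pointer`
+    # into the same memory, so the original object must stay alive or the
+    # underlying ctypes array may be garbage collected from under it.
+    #
+    #     p = Pointer(c_array=(ctypes.c_byte * 4)(*b"abcd"))
+    #     q = p + 1                    # one byte in; `p` is unchanged
+    #     assert q.bytes == b"bcd" and q[0] == ord("b")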
+ + def __len__(self): + return self.size + + +def malloc(size: int) -> Pointer: + return Pointer(c_array=(ctypes.c_byte * size)()) + + +def memcpy(dest: Pointer, src:Pointer, count=None) -> None: + if count is None: + count = src.size + ctypes.memmove(dest.address, src.address, count) + + +def _write_pybytes(ptr: Pointer, byts: bytes) -> Pointer: + ptr2 = Pointer(c_array=(ctypes.c_byte * len(buff))(*buff)) + memcpy(ptr, ptr2) + ptr += len(byts) + return ptr + + +def _ndarray_to_ptr(arr: np.ndarray) -> Pointer: + return Pointer(arr.__array_interface__['data'][0], arr.itemsize * arr.size) + + +def encode(version: str, shape_info: np.ndarray, byte_positions: np.ndarray, data: List[bytes]) -> memoryview: + # NOTE: Assumption: version string contains ascii characters only (ord(c) < 128) + # NOTE: Assumption: len(version) < 256 + assert len(version) < 256 + assert max((map(ord, version))) < 128 + version_slice_size = 1 + len(version) + shape_info_data_size = shape_info.itemsize * shape_info.size + shape_info_slice_size = 4 + 4 + shape_info_data_size + byte_positions_data_size = byte_positions.itemsize * byte_positions.size + byte_positions_slice_size = 4 + 4 + byte_positions_data_size + data_slice_size = sum(map(len, data)) + flatbuff = malloc(version_slice_size + shape_info_slice_size + byte_positions_slice_size + data_slice_size) + ptr = flatbuff + 0 + + # write version + ptr[0] = len(version) + ptr += 1 + for c in version: + ptr[0] = ord(c) + ptr += 1 + + # write shape info + ptr = _write_pybytes(ptr, np.int32(shape_info.shape[0]).tobytes()) + ptr = _write_pybytes(ptr, np.int32(shape_info.shape[1]).tobytes()) + memcpy(ptr, _ndarray_to_ptr(shape_info)) + ptr += shape_info_data_size + + # write byte positions + ptr = _write_pybytes(ptr, np.int32(byte_positions.shape[0]).tobytes()) + ptr = _write_pybytes(ptr, np.int32(byte_positions.shape[1]).tobytes()) + memcpy(ptr, _ndarray_to_ptr(byte_positions)) + ptr += byte_positions_data_size + + # write actual data + for d in data: + ptr = _write_pybytes(ptr, d) + + assert ptr.size == 0 + + return flatbuff.memoryview + + +def decode(buff: Union[bytes, Pointer]) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: + if isinstance(buff, bytes): + buff = Pointer(c_array=(ctypes.c_byte * len(buff))(*buff)) + copy = True + else: + copy = False + ptr = buff + 0 + + # read version + len_version = ptr[0] + version = '' + ptr += 1 + for i in range(len_version): + version += chr(ptr[i]) + ptr += len_version + + # read shape info + shape_info_dtype = np.dtype(hub.core.meta.encode.shape.SHAPE_ENCODING_DTYPE) + shape_info_shape = np.frombuffer(ptr.memoryview[:8], dtype=np.int32) + ptr += 8 + shape_info_data_size = int(np.prod(shape_info_shape) * shape_info_dtype.itemsize) + shape_info = np.frombuffer(ptr.memoryview[:shape_info_data_size], dtype=shape_info_dtype).reshape(shape_info_shape) + if copy: + shape_info = shape_info.copy() + ptr += shape_info_data_size + + # read byte positions + byte_positions_dtype = np.dtype(hub.core.meta.encode.byte_positions.POSITION_ENCODING_DTYPE) + byte_positions_shape = np.frombuffer(ptr.memoryview[:8], dtype=np.int32) + ptr += 8 + byte_positions_data_size = int(np.prod(byte_positions_shape) * byte_positions_dtype.itemsize) + byte_positions = np.frombuffer(ptr.memoryview[:byte_positions_data_size], dtype=byte_positions_dtype).reshape(byte_positions_shape) + if copy: + byte_positions = byte_positions.copy() + ptr += byte_positions_data_size + if copy: + data = memoryview(ptr.bytes) + else: + data = ptr.memoryview + return version, 
shape_info, byte_positions, ptr.memoryview + + +def test(): + version = hub.__version__ + shape_info = np.cast[hub.core.meta.encode.shape.SHAPE_ENCODING_DTYPE](np.random.randint(100 ,size=(17, 63))) + byte_positions = np.cast[hub.core.meta.encode.byte_positions.POSITION_ENCODING_DTYPE](np.random.randint(100 ,size=(31, 79))) + data = [ + b'1234' * 7, + b'abcdefg' * 8, + b'qwertyuiop' * 9 + ] + encoded = bytes(encode(version, shape_info, byte_positions, data)) + + # from bytes + decoded = decode(encoded) + version2, shape_info2, byte_positions2, data2 = decoded + assert version2 == version + np.testing.assert_array_equal(shape_info, shape_info2) + np.testing.assert_array_equal(byte_positions, byte_positions2) + assert b''.join(data) == bytes(data2) + + # from pointer + buff = Pointer(c_array=(ctypes.c_byte * len(encoded))(*encoded)) + decoded = decode(buff) + version2, shape_info2, byte_positions2, data2 = decoded + assert version2 == version + np.testing.assert_array_equal(shape_info, shape_info2) + np.testing.assert_array_equal(byte_positions, byte_positions2) + assert b''.join(data) == bytes(data2) + +if __name__ == "__main__": + test() \ No newline at end of file From 557df265e006aaa96e56b947d8fb942addcf3cfc Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 7 Jul 2021 07:32:02 +0530 Subject: [PATCH 02/79] format --- hub/core/chunk.py | 19 +++++++------ hub/core/lowlevel.py | 65 ++++++++++++++++++++++++++++++-------------- 2 files changed, 54 insertions(+), 30 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index 05cbc4b9a8..defd6fbcd3 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -48,15 +48,12 @@ def __init__( self._data: List[memoryview] = [] if data is None else [data] - - - @property def memoryview_data(self): # deprecated if len(self._data) == 1: return self._data[0] - ptr = malloc(sum(map(len,self._data))) + ptr = malloc(sum(map(len, self._data))) for data in self._data: ptr = _write_pybytes(ptr, data) return memoryview(ptr.bytes) @@ -70,14 +67,14 @@ def _get_2d_idx(self, idx): def view(self, start, end): if len(self._data) == 1: - return self._data[0][start: end] + return self._data[0][start:end] start2d = self._get_2d_idx(start) end2d = self._get_2d_idx(end) byts = [] - byts.append(self._data[start2d[0]][start2d[1]:]) + byts.append(self._data[start2d[0]][start2d[1] :]) for i in range(start2d[0] + 1, end2d[0]): byts.append(self._data[i]) - byts.append(self._data[end2d[0]][:end2d[1]]) + byts.append(self._data[end2d[0]][: end2d[1]]) ptr = malloc(end - start) for byt in byts: ptr = _write_pybytes(ptr, byt) @@ -147,8 +144,12 @@ def __len__(self): return shape_nbytes + range_nbytes + self.num_data_bytes + error_bytes def tobytes(self) -> memoryview: - return encode(hub.__version__, self.shapes.encoder.array, self.byte_positions_encoder.array, self._data) - + return encode( + hub.__version__, + self.shapes.encoder.array, + self.byte_positions_encoder.array, + self._data, + ) @classmethod def frombuffer(cls, buffer: bytes) -> "Chunk": diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 94e361019d..79cb4798bf 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -8,7 +8,12 @@ class Pointer(object): __slots__ = ("address", "size", "_c_array") - def __init__(self, address: Optional[int] = None, size: Optional[int] = None, c_array: Optional[ctypes.Array] = None) -> None: + def __init__( + self, + address: Optional[int] = None, + size: Optional[int] = None, + c_array: Optional[ctypes.Array] = None, + ) -> None: if c_array is None: if 
address is None or size is None: raise ValueError("Expected c_array or address and size args.") @@ -58,7 +63,7 @@ def malloc(size: int) -> Pointer: return Pointer(c_array=(ctypes.c_byte * size)()) -def memcpy(dest: Pointer, src:Pointer, count=None) -> None: +def memcpy(dest: Pointer, src: Pointer, count=None) -> None: if count is None: count = src.size ctypes.memmove(dest.address, src.address, count) @@ -72,10 +77,12 @@ def _write_pybytes(ptr: Pointer, byts: bytes) -> Pointer: def _ndarray_to_ptr(arr: np.ndarray) -> Pointer: - return Pointer(arr.__array_interface__['data'][0], arr.itemsize * arr.size) + return Pointer(arr.__array_interface__["data"][0], arr.itemsize * arr.size) -def encode(version: str, shape_info: np.ndarray, byte_positions: np.ndarray, data: List[bytes]) -> memoryview: +def encode( + version: str, shape_info: np.ndarray, byte_positions: np.ndarray, data: List[bytes] +) -> memoryview: # NOTE: Assumption: version string contains ascii characters only (ord(c) < 128) # NOTE: Assumption: len(version) < 256 assert len(version) < 256 @@ -86,7 +93,12 @@ def encode(version: str, shape_info: np.ndarray, byte_positions: np.ndarray, dat byte_positions_data_size = byte_positions.itemsize * byte_positions.size byte_positions_slice_size = 4 + 4 + byte_positions_data_size data_slice_size = sum(map(len, data)) - flatbuff = malloc(version_slice_size + shape_info_slice_size + byte_positions_slice_size + data_slice_size) + flatbuff = malloc( + version_slice_size + + shape_info_slice_size + + byte_positions_slice_size + + data_slice_size + ) ptr = flatbuff + 0 # write version @@ -117,7 +129,9 @@ def encode(version: str, shape_info: np.ndarray, byte_positions: np.ndarray, dat return flatbuff.memoryview -def decode(buff: Union[bytes, Pointer]) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: +def decode( + buff: Union[bytes, Pointer] +) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: if isinstance(buff, bytes): buff = Pointer(c_array=(ctypes.c_byte * len(buff))(*buff)) copy = True @@ -127,7 +141,7 @@ def decode(buff: Union[bytes, Pointer]) -> Tuple[str, np.ndarray, np.ndarray, me # read version len_version = ptr[0] - version = '' + version = "" ptr += 1 for i in range(len_version): version += chr(ptr[i]) @@ -138,17 +152,25 @@ def decode(buff: Union[bytes, Pointer]) -> Tuple[str, np.ndarray, np.ndarray, me shape_info_shape = np.frombuffer(ptr.memoryview[:8], dtype=np.int32) ptr += 8 shape_info_data_size = int(np.prod(shape_info_shape) * shape_info_dtype.itemsize) - shape_info = np.frombuffer(ptr.memoryview[:shape_info_data_size], dtype=shape_info_dtype).reshape(shape_info_shape) + shape_info = np.frombuffer( + ptr.memoryview[:shape_info_data_size], dtype=shape_info_dtype + ).reshape(shape_info_shape) if copy: shape_info = shape_info.copy() ptr += shape_info_data_size # read byte positions - byte_positions_dtype = np.dtype(hub.core.meta.encode.byte_positions.POSITION_ENCODING_DTYPE) + byte_positions_dtype = np.dtype( + hub.core.meta.encode.byte_positions.POSITION_ENCODING_DTYPE + ) byte_positions_shape = np.frombuffer(ptr.memoryview[:8], dtype=np.int32) ptr += 8 - byte_positions_data_size = int(np.prod(byte_positions_shape) * byte_positions_dtype.itemsize) - byte_positions = np.frombuffer(ptr.memoryview[:byte_positions_data_size], dtype=byte_positions_dtype).reshape(byte_positions_shape) + byte_positions_data_size = int( + np.prod(byte_positions_shape) * byte_positions_dtype.itemsize + ) + byte_positions = np.frombuffer( + ptr.memoryview[:byte_positions_data_size], 
dtype=byte_positions_dtype + ).reshape(byte_positions_shape) if copy: byte_positions = byte_positions.copy() ptr += byte_positions_data_size @@ -161,13 +183,13 @@ def decode(buff: Union[bytes, Pointer]) -> Tuple[str, np.ndarray, np.ndarray, me def test(): version = hub.__version__ - shape_info = np.cast[hub.core.meta.encode.shape.SHAPE_ENCODING_DTYPE](np.random.randint(100 ,size=(17, 63))) - byte_positions = np.cast[hub.core.meta.encode.byte_positions.POSITION_ENCODING_DTYPE](np.random.randint(100 ,size=(31, 79))) - data = [ - b'1234' * 7, - b'abcdefg' * 8, - b'qwertyuiop' * 9 - ] + shape_info = np.cast[hub.core.meta.encode.shape.SHAPE_ENCODING_DTYPE]( + np.random.randint(100, size=(17, 63)) + ) + byte_positions = np.cast[ + hub.core.meta.encode.byte_positions.POSITION_ENCODING_DTYPE + ](np.random.randint(100, size=(31, 79))) + data = [b"1234" * 7, b"abcdefg" * 8, b"qwertyuiop" * 9] encoded = bytes(encode(version, shape_info, byte_positions, data)) # from bytes @@ -176,7 +198,7 @@ def test(): assert version2 == version np.testing.assert_array_equal(shape_info, shape_info2) np.testing.assert_array_equal(byte_positions, byte_positions2) - assert b''.join(data) == bytes(data2) + assert b"".join(data) == bytes(data2) # from pointer buff = Pointer(c_array=(ctypes.c_byte * len(encoded))(*encoded)) @@ -185,7 +207,8 @@ def test(): assert version2 == version np.testing.assert_array_equal(shape_info, shape_info2) np.testing.assert_array_equal(byte_positions, byte_positions2) - assert b''.join(data) == bytes(data2) + assert b"".join(data) == bytes(data2) + if __name__ == "__main__": - test() \ No newline at end of file + test() From 95ce176bc546e9442b9bda6df6531450c0d6ed4f Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 7 Jul 2021 07:44:00 +0530 Subject: [PATCH 03/79] typo --- hub/core/chunk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index defd6fbcd3..b5d962b950 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -146,7 +146,7 @@ def __len__(self): def tobytes(self) -> memoryview: return encode( hub.__version__, - self.shapes.encoder.array, + self.shapes_encoder.array, self.byte_positions_encoder.array, self._data, ) From 950ac7c6df6af56518f7216d7fad2b5dd8537a23 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 7 Jul 2021 07:46:43 +0530 Subject: [PATCH 04/79] typo --- hub/core/lowlevel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 79cb4798bf..0a7c1ba671 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -70,7 +70,7 @@ def memcpy(dest: Pointer, src: Pointer, count=None) -> None: def _write_pybytes(ptr: Pointer, byts: bytes) -> Pointer: - ptr2 = Pointer(c_array=(ctypes.c_byte * len(buff))(*buff)) + ptr2 = Pointer(c_array=(ctypes.c_byte * len(byts))(*byts)) memcpy(ptr, ptr2) ptr += len(byts) return ptr From b55fc388b88f2c281fda4c023611b59431e55ec9 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 7 Jul 2021 08:39:56 +0530 Subject: [PATCH 05/79] bug fix --- hub/core/chunk.py | 5 +++-- hub/core/lowlevel.py | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index b5d962b950..6071d75038 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -75,10 +75,11 @@ def view(self, start, end): for i in range(start2d[0] + 1, end2d[0]): byts.append(self._data[i]) byts.append(self._data[end2d[0]][: end2d[1]]) - ptr = malloc(end - start) + buff = malloc(end - start) + ptr = buff + 0 for byt 
in byts: ptr = _write_pybytes(ptr, byt) - return memoryview(ptr.bytes) + return memoryview(buff.bytes) @property def num_data_bytes(self): diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 0a7c1ba671..2600431190 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -87,6 +87,8 @@ def encode( # NOTE: Assumption: len(version) < 256 assert len(version) < 256 assert max((map(ord, version))) < 128 + assert shape_info.ndim == 2 + assert byte_positions.ndim == 2 version_slice_size = 1 + len(version) shape_info_data_size = shape_info.itemsize * shape_info.size shape_info_slice_size = 4 + 4 + shape_info_data_size From b0f7d88fac6773204b96782a768f0d2c3d86d179 Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Tue, 6 Jul 2021 20:11:32 -0700 Subject: [PATCH 06/79] some docs and fix 1D shapes --- hub/core/chunk.py | 52 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index b5d962b950..478ed67c0a 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -1,7 +1,7 @@ from hub.util.exceptions import FullChunkError import hub from hub.core.storage.cachable import Cachable -from typing import Sequence, Tuple, Union +from typing import List, Sequence, Tuple, Union import numpy as np from io import BytesIO @@ -58,28 +58,50 @@ def memoryview_data(self): ptr = _write_pybytes(ptr, data) return memoryview(ptr.bytes) - def _get_2d_idx(self, idx): + def _get_2d_idx(self, byte_index: int) -> Tuple[int, int]: + """Converts `byte_index`, which is an index for a flattened stream of bytes, into a 2D index that can + be used for a list of byte streams of varying lengths. Used for accessing `self._data`, which is a list + of `memoryview`s. + + Args: + byte_index (int): Index over a flattened stream of bytes. + + Returns: + Tuple[int, int]: 2D index to be used to access `self._data`. 
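+
+        Example (illustrative):
+            If `self._data` holds views of lengths `[4, 6]`, flattened
+            byte index 7 falls in the second view at offset 3, so this
+            returns `(1, 3)`.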
+ """ + i = 0 - while len(self._data[i]) <= idx: + while len(self._data[i]) <= byte_index: i += 1 - idx -= len(self._data[i]) - return i, idx + byte_index -= len(self._data[i]) + return i, byte_index - def view(self, start, end): + def view(self, start_byte: int, end_byte: int): if len(self._data) == 1: - return self._data[0][start:end] - start2d = self._get_2d_idx(start) - end2d = self._get_2d_idx(end) + return self._data[0][start_byte:end_byte] + + start2d = self._get_2d_idx(start_byte) + end2d = self._get_2d_idx(end_byte) + + # TODO: document this + # builds a list of memoryviews that contain the pieces we need for the output view byts = [] byts.append(self._data[start2d[0]][start2d[1] :]) for i in range(start2d[0] + 1, end2d[0]): byts.append(self._data[i]) byts.append(self._data[end2d[0]][: end2d[1]]) - ptr = malloc(end - start) + + ptr = malloc(end_byte - start_byte) + for byt in byts: ptr = _write_pybytes(ptr, byt) + return memoryview(ptr.bytes) + @property + def num_samples(self): + return self.shapes_encoder.num_samples + @property def num_data_bytes(self): return sum(map(len, self._data)) @@ -139,11 +161,14 @@ def __len__(self): shape_nbytes = self.shapes_encoder.nbytes range_nbytes = self.byte_positions_encoder.nbytes - error_bytes = 32 # to account for any extra delimeters/stuff that `np.savez` may create in excess + error_bytes = 32 # TODO: calculate these bytes actually return shape_nbytes + range_nbytes + self.num_data_bytes + error_bytes def tobytes(self) -> memoryview: + if self.num_samples == 0: + return memoryview(bytes()) + return encode( hub.__version__, self.shapes_encoder.array, @@ -153,5 +178,8 @@ def tobytes(self) -> memoryview: @classmethod def frombuffer(cls, buffer: bytes) -> "Chunk": + if len(buffer) == 0: + return cls() + version, shapes, byte_positions, data = decode(buffer) - return cls(shapes, byte_position, data=data) + return cls(shapes, byte_positions, data=data) From 22816920e0c04a8904342de8b3f4f9e91eca0f0c Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 7 Jul 2021 09:24:32 +0530 Subject: [PATCH 07/79] add assertion for easy debugging --- hub/core/chunk.py | 5 ++--- hub/core/lowlevel.py | 16 ++++++---------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index fb7838719d..8aa29a21ec 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -69,11 +69,11 @@ def _get_2d_idx(self, byte_index: int) -> Tuple[int, int]: Returns: Tuple[int, int]: 2D index to be used to access `self._data`. 
""" - + assert byte_index < sum(map(len, self._data)) i = 0 while len(self._data[i]) <= byte_index: - i += 1 byte_index -= len(self._data[i]) + i += 1 return i, byte_index def view(self, start_byte: int, end_byte: int): @@ -178,6 +178,5 @@ def tobytes(self) -> memoryview: def frombuffer(cls, buffer: bytes) -> "Chunk": if len(buffer) == 0: return cls() - version, shapes, byte_positions, data = decode(buffer) return cls(shapes, byte_positions, data=data) diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 2600431190..85a341e3ab 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -132,9 +132,9 @@ def encode( def decode( - buff: Union[bytes, Pointer] + buff: Union[bytes, Pointer, memoryview] ) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: - if isinstance(buff, bytes): + if not isinstance(buff, Pointer): buff = Pointer(c_array=(ctypes.c_byte * len(buff))(*buff)) copy = True else: @@ -150,7 +150,7 @@ def decode( ptr += len_version # read shape info - shape_info_dtype = np.dtype(hub.core.meta.encode.shape.SHAPE_ENCODING_DTYPE) + shape_info_dtype = np.dtype(hub.constants.ENCODING_DTYPE) shape_info_shape = np.frombuffer(ptr.memoryview[:8], dtype=np.int32) ptr += 8 shape_info_data_size = int(np.prod(shape_info_shape) * shape_info_dtype.itemsize) @@ -162,9 +162,7 @@ def decode( ptr += shape_info_data_size # read byte positions - byte_positions_dtype = np.dtype( - hub.core.meta.encode.byte_positions.POSITION_ENCODING_DTYPE - ) + byte_positions_dtype = np.dtype(hub.constants.ENCODING_DTYPE) byte_positions_shape = np.frombuffer(ptr.memoryview[:8], dtype=np.int32) ptr += 8 byte_positions_data_size = int( @@ -185,12 +183,10 @@ def decode( def test(): version = hub.__version__ - shape_info = np.cast[hub.core.meta.encode.shape.SHAPE_ENCODING_DTYPE]( + shape_info = np.cast[hub.constants.ENCODING_DTYPE]( np.random.randint(100, size=(17, 63)) ) - byte_positions = np.cast[ - hub.core.meta.encode.byte_positions.POSITION_ENCODING_DTYPE - ](np.random.randint(100, size=(31, 79))) + byte_positions = np.cast[hub.constants.ENCODING_DTYPE](np.random.randint(100, size=(31, 79))) data = [b"1234" * 7, b"abcdefg" * 8, b"qwertyuiop" * 9] encoded = bytes(encode(version, shape_info, byte_positions, data)) From 26cd3772ed83418b74fec96a8d02c27b3f8f4e21 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 7 Jul 2021 09:28:25 +0530 Subject: [PATCH 08/79] one off --- hub/core/chunk.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index 8aa29a21ec..b84fbd9221 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -81,7 +81,7 @@ def view(self, start_byte: int, end_byte: int): return self._data[0][start_byte:end_byte] start2d = self._get_2d_idx(start_byte) - end2d = self._get_2d_idx(end_byte) + end2d = self._get_2d_idx(end_byte - 1) # TODO: document this # builds a list of memoryviews that contain the pieces we need for the output view @@ -89,7 +89,7 @@ def view(self, start_byte: int, end_byte: int): byts.append(self._data[start2d[0]][start2d[1] :]) for i in range(start2d[0] + 1, end2d[0]): byts.append(self._data[i]) - byts.append(self._data[end2d[0]][: end2d[1]]) + byts.append(self._data[end2d[0]][: end2d[1] + 1]) buff = malloc(end_byte - start_byte) ptr = buff + 0 for byt in byts: From fadc940d8e51ae1f6f902e3a4424714edc9fdfd7 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 7 Jul 2021 09:49:25 +0530 Subject: [PATCH 09/79] segfault fix --- hub/core/chunk.py | 4 ++-- hub/core/lowlevel.py | 4 +++- 2 files changed, 5 insertions(+), 3 
deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index b84fbd9221..4a4dcdf28f 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -90,10 +90,10 @@ def view(self, start_byte: int, end_byte: int): for i in range(start2d[0] + 1, end2d[0]): byts.append(self._data[i]) byts.append(self._data[end2d[0]][: end2d[1] + 1]) - buff = malloc(end_byte - start_byte) + buff = malloc(sum(map(len, byts))) ptr = buff + 0 for byt in byts: - ptr = _write_pybytes(ptr, byt) + ptr = _write_pybytes(ptr, byt.cast("B")) return memoryview(buff.bytes) @property diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 85a341e3ab..145f1eeea0 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -135,7 +135,9 @@ def decode( buff: Union[bytes, Pointer, memoryview] ) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: if not isinstance(buff, Pointer): - buff = Pointer(c_array=(ctypes.c_byte * len(buff))(*buff)) + ptr = Pointer(c_array=(ctypes.c_byte * len(buff))()) + _write_pybytes(ptr, buff) + buff = ptr copy = True else: copy = False From 035bacc03f4d22801ff1ab446a65945bbbf8e5f5 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 7 Jul 2021 10:09:41 +0530 Subject: [PATCH 10/79] smol fixes --- hub/core/lowlevel.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 145f1eeea0..6cf8ecabd1 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -55,6 +55,10 @@ def memoryview(self): def bytes(self): return bytes(self._c_array) + @property + def bytearray(self): + return bytearray(self._c_array) + def __len__(self): return self.size @@ -128,7 +132,7 @@ def encode( assert ptr.size == 0 - return flatbuff.memoryview + return flatbuff.bytes def decode( @@ -180,7 +184,7 @@ def decode( data = memoryview(ptr.bytes) else: data = ptr.memoryview - return version, shape_info, byte_positions, ptr.memoryview + return version, shape_info, byte_positions, data def test(): From 1eddf740b6ee758d4bfbfc3d24b71bbb369f3aff Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Tue, 6 Jul 2021 21:44:59 -0700 Subject: [PATCH 11/79] add clear cache to memory test in api and fix return in `decode` --- hub/api/tests/test_api.py | 1 + hub/core/lowlevel.py | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 2210352dfc..212094d244 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -246,6 +246,7 @@ def test_sequence_samples(ds: Dataset): tensor.append([1, 2, 3]) tensor.extend([[4, 5, 6]]) + ds.clear_cache() assert len(tensor) == 2 diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 145f1eeea0..00894261fe 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -180,7 +180,7 @@ def decode( data = memoryview(ptr.bytes) else: data = ptr.memoryview - return version, shape_info, byte_positions, ptr.memoryview + return version, shape_info, byte_positions, data def test(): @@ -188,7 +188,9 @@ def test(): shape_info = np.cast[hub.constants.ENCODING_DTYPE]( np.random.randint(100, size=(17, 63)) ) - byte_positions = np.cast[hub.constants.ENCODING_DTYPE](np.random.randint(100, size=(31, 79))) + byte_positions = np.cast[hub.constants.ENCODING_DTYPE]( + np.random.randint(100, size=(31, 79)) + ) data = [b"1234" * 7, b"abcdefg" * 8, b"qwertyuiop" * 9] encoded = bytes(encode(version, shape_info, byte_positions, data)) From 3a409e17e058ee5fc7a45a780644175f14b2ee65 Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Tue, 6 Jul 2021 22:06:29 -0700 
Subject: [PATCH 12/79] add a better exception for pointer GC --- hub/core/lowlevel.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 00894261fe..cee97a442f 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -70,7 +70,14 @@ def memcpy(dest: Pointer, src: Pointer, count=None) -> None: def _write_pybytes(ptr: Pointer, byts: bytes) -> Pointer: - ptr2 = Pointer(c_array=(ctypes.c_byte * len(byts))(*byts)) + try: + ptr2 = Pointer(c_array=(ctypes.c_byte * len(byts))(*byts)) + except NotImplementedError: + # TODO: exceptions.py + raise Exception( + "Reference for pointer was garbage collected. Maybe because the cache killed it?" + ) + memcpy(ptr, ptr2) ptr += len(byts) return ptr From ef112524a3d14c421908175c540c476969fc1af7 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 8 Jul 2021 12:39:25 +0530 Subject: [PATCH 13/79] smol fix --- hub/core/lowlevel.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index e2c2600187..10ffe62a23 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -76,8 +76,7 @@ def memcpy(dest: Pointer, src: Pointer, count=None) -> None: def _write_pybytes(ptr: Pointer, byts: bytes) -> Pointer: ptr2 = Pointer(c_array=(ctypes.c_byte * len(byts))(*byts)) memcpy(ptr, ptr2) - ptr += len(byts) - return ptr + return ptr + len(byts) def _ndarray_to_ptr(arr: np.ndarray) -> Pointer: @@ -117,13 +116,13 @@ def encode( ptr = _write_pybytes(ptr, np.int32(shape_info.shape[0]).tobytes()) ptr = _write_pybytes(ptr, np.int32(shape_info.shape[1]).tobytes()) memcpy(ptr, _ndarray_to_ptr(shape_info)) - ptr += shape_info_data_size + ptr += shape_info.nbytes # write byte positions ptr = _write_pybytes(ptr, np.int32(byte_positions.shape[0]).tobytes()) ptr = _write_pybytes(ptr, np.int32(byte_positions.shape[1]).tobytes()) memcpy(ptr, _ndarray_to_ptr(byte_positions)) - ptr += byte_positions_data_size + ptr += byte_positions.nbytes # write actual data for d in data: From 57d2da7c0cd45c1be3b2dff7641a503861d501e0 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 8 Jul 2021 13:27:48 +0530 Subject: [PATCH 14/79] all fix --- hub/core/chunk.py | 35 ++++++++++++++++++++++++----------- hub/core/lowlevel.py | 31 +++++++++++++++++++++---------- 2 files changed, 45 insertions(+), 21 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index cce8581e30..67d352dc61 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -69,32 +69,45 @@ def _get_2d_idx(self, byte_index: int) -> Tuple[int, int]: Returns: Tuple[int, int]: 2D index to be used to access `self._data`. 
""" - assert byte_index < sum(map(len, self._data)) i = 0 - while len(self._data[i]) <= byte_index: - byte_index -= len(self._data[i]) - i += 1 + data = self._data + while True: + try: + num_data_i = len(data[i]) + except IndexError: # slightly faster than checking i < len(self._data) in a loop + return i - 1, len(data[i - 1]) + byte_index + if num_data_i <= byte_index: + byte_index -= num_data_i + i += 1 + else: + break return i, byte_index def view(self, start_byte: int, end_byte: int): if len(self._data) == 1: return self._data[0][start_byte:end_byte] - start2d = self._get_2d_idx(start_byte) - end2d = self._get_2d_idx(end_byte - 1) + start2dx, start2dy = self._get_2d_idx(start_byte) + end2dx, end2dy = self._get_2d_idx(end_byte) + if start2dx == end2dx: + # Indexing to the same inner chunk, this would be fast + buff = malloc(end2dy - start2dy) + _write_pybytes(buff, self._data[start2dx][start2dy:end2dy]) + return buff.memoryview # TODO: document this # builds a list of memoryviews that contain the pieces we need for the output view + byts = [] - byts.append(self._data[start2d[0]][start2d[1] :]) - for i in range(start2d[0] + 1, end2d[0]): + byts.append(self._data[start2dx][start2dy:]) + for i in range(start2dx + 1, end2dx): byts.append(self._data[i]) - byts.append(self._data[end2d[0]][: end2d[1] + 1]) + byts.append(self._data[end2dx][:end2dy]) buff = malloc(sum(map(len, byts))) ptr = buff + 0 for byt in byts: ptr = _write_pybytes(ptr, byt.cast("B")) - return memoryview(buff.bytes) + return buff.memoryview @property def num_samples(self): @@ -160,7 +173,7 @@ def __len__(self): hub.__version__, self.shapes_encoder.array, self.byte_positions_encoder.array, - self._data + self._data, ) def tobytes(self) -> memoryview: diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 3a0da9533b..1ea8505f4e 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -1,7 +1,7 @@ import numpy as np import ctypes from collections import namedtuple -from typing import Tuple, List, Union, Optional +from typing import Tuple, Sequence, Union, Optional import hub @@ -89,7 +89,13 @@ def _write_pybytes(ptr: Pointer, byts: bytes) -> Pointer: def _ndarray_to_ptr(arr: np.ndarray) -> Pointer: return Pointer(arr.__array_interface__["data"][0], arr.itemsize * arr.size) -def _infer_num_bytes(version: str, shape_info: np.ndarray, byte_positions: np.ndarray, data: List[bytes]): + +def _infer_num_bytes( + version: str, + shape_info: np.ndarray, + byte_positions: np.ndarray, + data: Union[Sequence[bytes], Sequence[memoryview]], +): # NOTE: Assumption: version string contains ascii characters only (ord(c) < 128) # NOTE: Assumption: len(version) < 256 assert len(version) < 256 @@ -100,16 +106,23 @@ def _infer_num_bytes(version: str, shape_info: np.ndarray, byte_positions: np.nd # shape_info_slice_size = 4 + 4 + shape_info.nbytes # byte_positions_slice_size = 4 + 4 + byte_positions.nbytes # data_slice_size = sum(map(len, data)) - return len(version) + shape_info.nbytes + byte_positions.nbytes + sum(map(len, data)) + 17 + return ( + len(version) + + shape_info.nbytes + + byte_positions.nbytes + + sum(map(len, data)) + + 17 + ) + def encode( - version: str, shape_info: np.ndarray, byte_positions: np.ndarray, data: List[bytes] + version: str, + shape_info: np.ndarray, + byte_positions: np.ndarray, + data: Union[Sequence[bytes], Sequence[memoryview]], ) -> memoryview: - - flatbuff = malloc( - _infer_num_bytes(version, shape_info, byte_positions, data) - ) + flatbuff = malloc(_infer_num_bytes(version, shape_info, 
byte_positions, data)) ptr = flatbuff + 0 # write version @@ -135,8 +148,6 @@ def encode( for d in data: ptr = _write_pybytes(ptr, d) - assert ptr.size == 0 - return flatbuff.bytes From 4dde08a711709d57595f552522aa1af97ed46b48 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 8 Jul 2021 16:07:05 +0530 Subject: [PATCH 15/79] chuunk id optims init --- hub/core/chunk.py | 8 +- hub/core/lowlevel.py | 55 ++++++++++++-- hub/core/meta/encode/chunk_id.py | 121 +++++++++++++++++++------------ 3 files changed, 126 insertions(+), 58 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index 67d352dc61..d5c008b928 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -8,7 +8,7 @@ from hub.core.meta.encode.shape import ShapeEncoder from hub.core.meta.encode.byte_positions import BytePositionsEncoder -from hub.core.lowlevel import encode, decode, malloc, _write_pybytes, _infer_num_bytes +from hub.core.lowlevel import encode_chunk, decode_chunk, malloc, _write_pybytes, _infer_chunk_num_bytes class Chunk(Cachable): @@ -169,7 +169,7 @@ def update_headers(self, incoming_num_bytes: int, sample_shape: Tuple[int]): def __len__(self): """Calculates the number of bytes `tobytes` will be without having to call `tobytes`. Used by `LRUCache` to determine if this chunk can be cached.""" - return _infer_num_bytes( + return _infer_chunk_num_bytes( hub.__version__, self.shapes_encoder.array, self.byte_positions_encoder.array, @@ -180,7 +180,7 @@ def tobytes(self) -> memoryview: if self.num_samples == 0: return memoryview(bytes()) - return encode( + return encode_chunk( hub.__version__, self.shapes_encoder.array, self.byte_positions_encoder.array, @@ -191,5 +191,5 @@ def tobytes(self) -> memoryview: def frombuffer(cls, buffer: bytes) -> "Chunk": if len(buffer) == 0: return cls() - version, shapes, byte_positions, data = decode(buffer) + version, shapes, byte_positions, data = decode_chunk(buffer) return cls(shapes, byte_positions, data=data) diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 1ea8505f4e..43d7323367 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -90,7 +90,7 @@ def _ndarray_to_ptr(arr: np.ndarray) -> Pointer: return Pointer(arr.__array_interface__["data"][0], arr.itemsize * arr.size) -def _infer_num_bytes( +def _infer_chunk_num_bytes( version: str, shape_info: np.ndarray, byte_positions: np.ndarray, @@ -115,14 +115,14 @@ def _infer_num_bytes( ) -def encode( +def encode_chunk( version: str, shape_info: np.ndarray, byte_positions: np.ndarray, data: Union[Sequence[bytes], Sequence[memoryview]], ) -> memoryview: - flatbuff = malloc(_infer_num_bytes(version, shape_info, byte_positions, data)) + flatbuff = malloc(_infer_chunk_num_bytes(version, shape_info, byte_positions, data)) ptr = flatbuff + 0 # write version @@ -148,10 +148,10 @@ def encode( for d in data: ptr = _write_pybytes(ptr, d) - return flatbuff.bytes + return memoryview(flatbuff.bytes) -def decode( +def decode_chunk( buff: Union[bytes, Pointer, memoryview] ) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: if not isinstance(buff, Pointer): @@ -202,6 +202,45 @@ def decode( data = ptr.memoryview return version, shape_info, byte_positions, data +def encode_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: + len_version = len(version) + flatbuff = malloc( + 1 + len_version + sum([x.nbytes for x in ids]) + ) + + # Write version + ptr = flatbuff + 0 + ptr[0] = len_version + ptr += 1 + + for i, c in enumerate(version): + ptr[i] = ord(c) + + ptr += len_version + + for arr in ids: + 
memcpy(ptr, _ndarray_to_ptr(arr)) + ptr += arr.nbytes + + return memoryview(flatbuff.bytes) + +def decode_chunkids(buff: bytes) -> Tuple[str, np.ndarray]: + ptr = Pointer(c_array=(ctypes.c_byte * len(buff))()) + _write_pybytes(ptr, buff) + buff = ptr + + # Read version + len_version = ptr[0] + ptr += 1 + version = "" + for i in range(len_version): + version += chr(ptr[i]) + ptr += len_version + + # Read chunk ids + ids = np.frombuffer(ptr.memoryview, dtype=hub.constants.ENCODING_DTYPE).reshape(-1, 2).copy() + + return version, ids def test(): version = hub.__version__ @@ -212,10 +251,10 @@ def test(): np.random.randint(100, size=(31, 79)) ) data = [b"1234" * 7, b"abcdefg" * 8, b"qwertyuiop" * 9] - encoded = bytes(encode(version, shape_info, byte_positions, data)) + encoded = bytes(encode_chunk(version, shape_info, byte_positions, data)) # from bytes - decoded = decode(encoded) + decoded = decode_chunk(encoded) version2, shape_info2, byte_positions2, data2 = decoded assert version2 == version np.testing.assert_array_equal(shape_info, shape_info2) @@ -224,7 +263,7 @@ def test(): # from pointer buff = Pointer(c_array=(ctypes.c_byte * len(encoded))(*encoded)) - decoded = decode(buff) + decoded = decode_chunk(buff) version2, shape_info2, byte_positions2, data2 = decoded assert version2 == version np.testing.assert_array_equal(shape_info, shape_info2) diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 92c737aa68..a0cdd4872a 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -3,9 +3,11 @@ import hub from hub.core.storage.cachable import Cachable from io import BytesIO -from typing import Optional, Tuple +from typing import Optional, Tuple, Union import numpy as np from uuid import uuid4 +from hub.core.lowlevel import encode_chunkids, decode_chunkids + # these constants are for accessing the data layout. see the `ChunkIdEncoder` docstring. CHUNK_ID_INDEX = 0 @@ -67,17 +69,35 @@ def __init__(self): Then, you get the left-most column and that is your chunk ID! """ - - self._encoded_ids = None + self._shards: List[np.ndarray] = [] + self._buffer: List[List(int, int)] = [] + + def _flush_buffer(self): + if self._buffer: + self._shards.append(np.array(self._buffer, dtype=ENCODING_DTYPE)) + self._buffer.clear() + + def _get_2d_idx(self, idx: int) -> Tuple[int, int]: + i = 0 + data = self._shards + while True: + try: + num_data_i = len(data[i]) + except IndexError: # slightly faster than checking i < len(self._data) in a loop + return -1, idx + if num_data_i <= idx: + idx -= num_data_i + i += 1 + else: + break + return i, idx def tobytes(self) -> memoryview: - bio = BytesIO() - np.savez( - bio, - version=hub.__encoded_version__, - ids=self._encoded_ids, + self._flush_buffer() + return encode_chunkids( + hub.__version__, + self._shards ) - return bio.getbuffer() @staticmethod def name_from_id(id: ENCODING_DTYPE) -> str: @@ -95,29 +115,41 @@ def id_from_name(name: str) -> ENCODING_DTYPE: def get_name_for_chunk(self, chunk_index: int) -> str: """Gets the name for the chunk at index `chunk_index`. 
If you need to get the name for a chunk from a sample index, instead use `__getitem__`, then `name_from_id`.""" - - chunk_id = self._encoded_ids[:, CHUNK_ID_INDEX][chunk_index] + chunk_id = self.get_entry(chunk_index)[CHUNK_ID_INDEX] return ChunkIdEncoder.name_from_id(chunk_id) @classmethod def frombuffer(cls, buffer: bytes): + version, ids = decode_chunkids(buffer) instance = cls() - bio = BytesIO(buffer) - npz = np.load(bio) - instance._encoded_ids = npz["ids"] + instance._shards = [ids] return instance @property def num_chunks(self) -> int: - if self._encoded_ids is None: - return 0 - return len(self._encoded_ids) + return sum(map(len, self._shards)) + len(self._buffer) + + def get_entry(self, idx): + x, y = self._get_2d_idx(idx) + return self._buffer[y] if x < 0 else self._shards[x][y] + + @property + def last_entry(self) -> int: + if self._buffer: + return self._buffer[-1] + if self._shards: + return self._shards[-1][-1] + + @property + def last_index(self) -> int: + last_entry = self.last_entry + if not last_entry: + return -1 + return last_entry[LAST_INDEX_INDEX] @property def num_samples(self) -> int: - if self._encoded_ids is None: - return 0 - return int(self._encoded_ids[-1, LAST_INDEX_INDEX] + 1) + return self.last_index + 1 def generate_chunk_id(self) -> ENCODING_DTYPE: """Generates a random 64bit chunk ID using uuid4. Also prepares this ID to have samples registered to it. @@ -128,21 +160,10 @@ def generate_chunk_id(self) -> ENCODING_DTYPE: """ id = ENCODING_DTYPE(uuid4().int >> UUID_SHIFT_AMOUNT) - - if self.num_samples == 0: - self._encoded_ids = np.array([[id, -1]], dtype=ENCODING_DTYPE) - - else: - last_index = self.num_samples - 1 - - new_entry = np.array( - [[id, last_index]], - dtype=ENCODING_DTYPE, - ) - self._encoded_ids = np.concatenate([self._encoded_ids, new_entry]) - + self._buffer.append([id, self.last_index]) return id + def register_samples_to_last_chunk_id(self, num_samples: int): """Registers samples to the chunk ID that was generated last with the `generate_chunk_id` method. This method should be called at least once per chunk created. @@ -171,12 +192,12 @@ def register_samples_to_last_chunk_id(self, num_samples: int): "Cannot register 0 num_samples (signifying a partial sample continuing the last chunk) when no last chunk exists." ) - current_entry = self._encoded_ids[-1] + current_entry = self.last_entry # this operation will trigger an overflow for the first addition, so supress the warning - np.seterr(over="ignore") - current_entry[LAST_INDEX_INDEX] += ENCODING_DTYPE(num_samples) - np.seterr(over="warn") + # np.seterr(over="ignore") + self.last_entry[LAST_INDEX_INDEX] += num_samples + # np.seterr(over="warn") def get_local_sample_index(self, global_sample_index: int) -> int: """Converts `global_sample_index` into a new index that is relative to the chunk the sample belongs to. @@ -206,19 +227,23 @@ def get_local_sample_index(self, global_sample_index: int) -> int: int: local index value between 0 and the amount of samples the chunk contains - 1. 
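 
         Example (illustrative):
             With two chunks holding samples 0-9 and 10-24, global sample
             index 15 lies in the second chunk, at local index 15 - 10 = 5.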
""" - _, chunk_index = self.__getitem__(global_sample_index, return_chunk_index=True) # type: ignore + _, (shard_index, chunk_index) = self.get(global_sample_index, return_chunk_index=True) # type: ignore - if chunk_index == 0: + if not shard_index and not chunk_index: return global_sample_index - current_entry = self._encoded_ids[chunk_index - 1] # type: ignore + # current_entry = self._encoded_ids[chunk_index - 1] + current_entry = self._shards[shard_index][chunk_index - 1] # buffer already flushed by get() call last_num_samples = current_entry[LAST_INDEX_INDEX] + 1 - return int(global_sample_index - last_num_samples) + return global_sample_index - int(last_num_samples) + + def __getitem__(self, sample_index: int) -> int: + return self.get(sample_index) - def __getitem__( + def get( self, sample_index: int, return_chunk_index: bool = False - ) -> Tuple[ENCODING_DTYPE, Optional[int]]: + ) -> Union[int, Tuple[int, Tuple[int, int]]]: """Get the ID for the chunk that `sample_index` is stored in. To get the name of the chunk, use `name_from_id`. @@ -242,11 +267,15 @@ def __getitem__( if sample_index < 0: sample_index = (self.num_samples) + sample_index - idx = np.searchsorted(self._encoded_ids[:, LAST_INDEX_INDEX], sample_index) - id = self._encoded_ids[idx, CHUNK_ID_INDEX] + self._flush_buffer() + last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._shards] + shard_idx = bp.searchsorted(last_idxs, sample_index) + shard = self._shards[shard_idx] + idx = np.searchsorted(shard[:, LAST_INDEX_INDEX], sample_index) + id = shard[idx, CHUNK_ID_INDEX] chunk_index = idx if return_chunk_index: - return id, chunk_index + return id, (shard_idx, chunk_index) return id From df034956b110a683456fc1e3cdc67b4a854d1a92 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 8 Jul 2021 16:49:50 +0530 Subject: [PATCH 16/79] debug msgs --- hub/core/meta/encode/chunk_id.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index a0cdd4872a..5ea3d79c0f 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -70,7 +70,7 @@ def __init__(self): """ self._shards: List[np.ndarray] = [] - self._buffer: List[List(int, int)] = [] + self._buffer: List[List[int]] = [] def _flush_buffer(self): if self._buffer: @@ -143,7 +143,7 @@ def last_entry(self) -> int: @property def last_index(self) -> int: last_entry = self.last_entry - if not last_entry: + if last_entry is None: return -1 return last_entry[LAST_INDEX_INDEX] @@ -184,7 +184,7 @@ def register_samples_to_last_chunk_id(self, num_samples: int): if self.num_samples == 0: raise ChunkIdEncoderError( - "Cannot register samples because no chunk IDs exist." + f"Cannot register samples because no chunk IDs exist. 
{self._buffer}, {self._shards}" ) if num_samples == 0 and self.num_chunks < 2: @@ -196,7 +196,7 @@ def register_samples_to_last_chunk_id(self, num_samples: int): # this operation will trigger an overflow for the first addition, so supress the warning # np.seterr(over="ignore") - self.last_entry[LAST_INDEX_INDEX] += num_samples + self.last_entry[LAST_INDEX_INDEX] += ENCODING_DTYPE(num_samples) # np.seterr(over="warn") def get_local_sample_index(self, global_sample_index: int) -> int: @@ -269,7 +269,7 @@ def get( self._flush_buffer() last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._shards] - shard_idx = bp.searchsorted(last_idxs, sample_index) + shard_idx = np.searchsorted(last_idxs, sample_index) shard = self._shards[shard_idx] idx = np.searchsorted(shard[:, LAST_INDEX_INDEX], sample_index) id = shard[idx, CHUNK_ID_INDEX] From 71ee06c80116bd9187ec31593d71e91791527c17 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 8 Jul 2021 19:13:28 +0530 Subject: [PATCH 17/79] fix refcounting bug --- hub/core/chunk.py | 8 +++- hub/core/lowlevel.py | 64 ++++++++++++++++++++++++-------- hub/core/meta/encode/chunk_id.py | 64 ++++++++++++++++++++------------ 3 files changed, 97 insertions(+), 39 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index d5c008b928..d001421cfb 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -8,7 +8,13 @@ from hub.core.meta.encode.shape import ShapeEncoder from hub.core.meta.encode.byte_positions import BytePositionsEncoder -from hub.core.lowlevel import encode_chunk, decode_chunk, malloc, _write_pybytes, _infer_chunk_num_bytes +from hub.core.lowlevel import ( + encode_chunk, + decode_chunk, + malloc, + _write_pybytes, + _infer_chunk_num_bytes, +) class Chunk(Cachable): diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 43d7323367..76ca91a77b 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -1,12 +1,12 @@ import numpy as np import ctypes from collections import namedtuple -from typing import Tuple, Sequence, Union, Optional +from typing import Tuple, Sequence, Union, Optional, List import hub class Pointer(object): - __slots__ = ("address", "size", "_c_array") + __slots__ = ("address", "size", "_c_array", "_refs") def __init__( self, @@ -14,6 +14,7 @@ def __init__( size: Optional[int] = None, c_array: Optional[ctypes.Array] = None, ) -> None: + self._refs: List[ctypes.Array] = [] if c_array is None: if address is None or size is None: raise ValueError("Expected c_array or address and size args.") @@ -26,12 +27,18 @@ def __init__( self.size = len(c_array) def _set_c_array(self) -> None: + try: + self._refs.append(self._c_array) + except AttributeError: + pass self._c_array = (ctypes.c_byte * self.size).from_address(self.address) def __add__(self, i: int) -> "Pointer": assert i >= 0 assert i <= self.size - return Pointer(self.address + i, self.size - i) + ret = Pointer(self.address + i, self.size - i) + ret._refs.append(self._c_array) + return ret def __iadd__(self, i: int) -> "Pointer": assert i >= 0 @@ -155,9 +162,13 @@ def decode_chunk( buff: Union[bytes, Pointer, memoryview] ) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: if not isinstance(buff, Pointer): - ptr = Pointer(c_array=(ctypes.c_byte * len(buff))()) - _write_pybytes(ptr, buff) - buff = ptr + try: + buff = Pointer(c_array=(ctypes.c_byte * len(buff))(*buff)) + except NotImplementedError: + # TODO: exceptions.py + raise Exception( + "Reference for pointer was garbage collected. Maybe because the cache killed it?" 
+ ) copy = True else: copy = False @@ -202,11 +213,10 @@ def decode_chunk( data = ptr.memoryview return version, shape_info, byte_positions, data + def encode_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: len_version = len(version) - flatbuff = malloc( - 1 + len_version + sum([x.nbytes for x in ids]) - ) + flatbuff = malloc(1 + len_version + sum([x.nbytes for x in ids])) # Write version ptr = flatbuff + 0 @@ -224,10 +234,15 @@ def encode_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: return memoryview(flatbuff.bytes) + def decode_chunkids(buff: bytes) -> Tuple[str, np.ndarray]: - ptr = Pointer(c_array=(ctypes.c_byte * len(buff))()) - _write_pybytes(ptr, buff) - buff = ptr + try: + ptr = Pointer(c_array=(ctypes.c_byte * len(buff))(*buff)) + except NotImplementedError: + # TODO: exceptions.py + raise Exception( + "Reference for pointer was garbage collected. Maybe because the cache killed it?" + ) # Read version len_version = ptr[0] @@ -235,14 +250,20 @@ def decode_chunkids(buff: bytes) -> Tuple[str, np.ndarray]: version = "" for i in range(len_version): version += chr(ptr[i]) + ptr += len_version # Read chunk ids - ids = np.frombuffer(ptr.memoryview, dtype=hub.constants.ENCODING_DTYPE).reshape(-1, 2).copy() + ids = ( + np.frombuffer(ptr.memoryview, dtype=hub.constants.ENCODING_DTYPE) + .reshape(-1, 2) + .copy() + ) return version, ids -def test(): + +def test_chunk_encoding(): version = hub.__version__ shape_info = np.cast[hub.constants.ENCODING_DTYPE]( np.random.randint(100, size=(17, 63)) @@ -271,5 +292,18 @@ def test(): assert b"".join(data) == bytes(data2) +def test_chunkids_encoding(): + version = hub.__version__ + shards = [ + np.cast[hub.constants.ENCODING_DTYPE](np.random.randint(100, size=(100, 2))) + ] + encoded = encode_chunkids(version, shards) + decoded = decode_chunkids(encoded) + version2, ids = decoded + assert version2 == version + np.testing.assert_array_equal(np.concatenate(shards), ids) + + if __name__ == "__main__": - test() + test_chunk_encoding() + test_chunkids_encoding() diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 5ea3d79c0f..a0eff31206 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -3,7 +3,7 @@ import hub from hub.core.storage.cachable import Cachable from io import BytesIO -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Union, List import numpy as np from uuid import uuid4 from hub.core.lowlevel import encode_chunkids, decode_chunkids @@ -15,7 +15,7 @@ class ChunkIdEncoder(Cachable): - def __init__(self): + def __init__(self, ids=None): """Custom compressor that allows reading of chunk IDs from a sample index without decompressing. Chunk IDs: @@ -69,8 +69,8 @@ def __init__(self): Then, you get the left-most column and that is your chunk ID! 
""" - self._shards: List[np.ndarray] = [] self._buffer: List[List[int]] = [] + self._shards: List[np.ndarray] = [] if ids is None else [ids] def _flush_buffer(self): if self._buffer: @@ -94,10 +94,13 @@ def _get_2d_idx(self, idx: int) -> Tuple[int, int]: def tobytes(self) -> memoryview: self._flush_buffer() - return encode_chunkids( - hub.__version__, - self._shards - ) + encoded = encode_chunkids(hub.__version__, self._shards) + decoded = decode_chunkids(encoded)[1] + if self._shards: + np.testing.assert_array_equal( + decoded, np.concatenate(self._shards), err_msg=str(bytes(encoded)) + ) + return encoded @staticmethod def name_from_id(id: ENCODING_DTYPE) -> str: @@ -121,9 +124,7 @@ def get_name_for_chunk(self, chunk_index: int) -> str: @classmethod def frombuffer(cls, buffer: bytes): version, ids = decode_chunkids(buffer) - instance = cls() - instance._shards = [ids] - return instance + return cls(ids) @property def num_chunks(self) -> int: @@ -134,11 +135,12 @@ def get_entry(self, idx): return self._buffer[y] if x < 0 else self._shards[x][y] @property - def last_entry(self) -> int: + def last_entry(self) -> Union[np.ndarray, List[int]]: if self._buffer: return self._buffer[-1] if self._shards: return self._shards[-1][-1] + return None @property def last_index(self) -> int: @@ -149,7 +151,15 @@ def last_index(self) -> int: @property def num_samples(self) -> int: - return self.last_index + 1 + if self._buffer: + return self._buffer[-1][LAST_INDEX_INDEX] + 1 + elif self._shards: + return int(self._shards[-1][-1, LAST_INDEX_INDEX] + 1) + return 0 + + @property + def empty(self) -> bool: + return not self._buffer and not self._shards def generate_chunk_id(self) -> ENCODING_DTYPE: """Generates a random 64bit chunk ID using uuid4. Also prepares this ID to have samples registered to it. @@ -158,12 +168,10 @@ def generate_chunk_id(self) -> ENCODING_DTYPE: Returns: ENCODING_DTYPE: The random chunk ID. """ - id = ENCODING_DTYPE(uuid4().int >> UUID_SHIFT_AMOUNT) self._buffer.append([id, self.last_index]) return id - def register_samples_to_last_chunk_id(self, num_samples: int): """Registers samples to the chunk ID that was generated last with the `generate_chunk_id` method. This method should be called at least once per chunk created. @@ -182,7 +190,7 @@ def register_samples_to_last_chunk_id(self, num_samples: int): f"Cannot register negative num samples. Got: {num_samples}" ) - if self.num_samples == 0: + if self.empty: raise ChunkIdEncoderError( f"Cannot register samples because no chunk IDs exist. {self._buffer}, {self._shards}" ) @@ -192,12 +200,14 @@ def register_samples_to_last_chunk_id(self, num_samples: int): "Cannot register 0 num_samples (signifying a partial sample continuing the last chunk) when no last chunk exists." ) - current_entry = self.last_entry - - # this operation will trigger an overflow for the first addition, so supress the warning - # np.seterr(over="ignore") - self.last_entry[LAST_INDEX_INDEX] += ENCODING_DTYPE(num_samples) - # np.seterr(over="warn") + last_entry = self.last_entry + if self._buffer: + last_entry[LAST_INDEX_INDEX] += num_samples + else: + err = np.geterr()["over"] + np.seterr(over="ignore") + last_entry[LAST_INDEX_INDEX] += ENCODING_DTYPE(num_samples) + np.seterr(over=err) def get_local_sample_index(self, global_sample_index: int) -> int: """Converts `global_sample_index` into a new index that is relative to the chunk the sample belongs to. 
@@ -232,14 +242,22 @@ def get_local_sample_index(self, global_sample_index: int) -> int: if not shard_index and not chunk_index: return global_sample_index + if chunk_index: + chunk_index -= 1 + else: + shard_index -= 1 + chunk_index = len(self._shards[shard_index]) - 1 + # current_entry = self._encoded_ids[chunk_index - 1] - current_entry = self._shards[shard_index][chunk_index - 1] # buffer already flushed by get() call + current_entry = self._shards[shard_index][ + chunk_index + ] # buffer already flushed by get() call last_num_samples = current_entry[LAST_INDEX_INDEX] + 1 return global_sample_index - int(last_num_samples) def __getitem__(self, sample_index: int) -> int: - return self.get(sample_index) + return self.get(sample_index) # type: ignore def get( self, sample_index: int, return_chunk_index: bool = False From 44b5ade3cf9ecde973f2f39814cf3128e2e668de Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 8 Jul 2021 19:42:27 +0530 Subject: [PATCH 18/79] ren shards->data --- hub/core/meta/encode/chunk_id.py | 36 ++++++++++++++++---------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index a0eff31206..7fa56739f9 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -70,16 +70,16 @@ def __init__(self, ids=None): """ self._buffer: List[List[int]] = [] - self._shards: List[np.ndarray] = [] if ids is None else [ids] + self._data: List[np.ndarray] = [] if ids is None else [ids] def _flush_buffer(self): if self._buffer: - self._shards.append(np.array(self._buffer, dtype=ENCODING_DTYPE)) + self._data.append(np.array(self._buffer, dtype=ENCODING_DTYPE)) self._buffer.clear() def _get_2d_idx(self, idx: int) -> Tuple[int, int]: i = 0 - data = self._shards + data = self._data while True: try: num_data_i = len(data[i]) @@ -94,11 +94,11 @@ def _get_2d_idx(self, idx: int) -> Tuple[int, int]: def tobytes(self) -> memoryview: self._flush_buffer() - encoded = encode_chunkids(hub.__version__, self._shards) + encoded = encode_chunkids(hub.__version__, self._data) decoded = decode_chunkids(encoded)[1] - if self._shards: + if self._data: np.testing.assert_array_equal( - decoded, np.concatenate(self._shards), err_msg=str(bytes(encoded)) + decoded, np.concatenate(self._data), err_msg=str(bytes(encoded)) ) return encoded @@ -128,18 +128,18 @@ def frombuffer(cls, buffer: bytes): @property def num_chunks(self) -> int: - return sum(map(len, self._shards)) + len(self._buffer) + return sum(map(len, self._data)) + len(self._buffer) def get_entry(self, idx): x, y = self._get_2d_idx(idx) - return self._buffer[y] if x < 0 else self._shards[x][y] + return self._buffer[y] if x < 0 else self._data[x][y] @property def last_entry(self) -> Union[np.ndarray, List[int]]: if self._buffer: return self._buffer[-1] - if self._shards: - return self._shards[-1][-1] + if self._data: + return self._data[-1][-1] return None @property @@ -153,13 +153,13 @@ def last_index(self) -> int: def num_samples(self) -> int: if self._buffer: return self._buffer[-1][LAST_INDEX_INDEX] + 1 - elif self._shards: - return int(self._shards[-1][-1, LAST_INDEX_INDEX] + 1) + elif self._data: + return int(self._data[-1][-1, LAST_INDEX_INDEX] + 1) return 0 @property def empty(self) -> bool: - return not self._buffer and not self._shards + return not self._buffer and not self._data def generate_chunk_id(self) -> ENCODING_DTYPE: """Generates a random 64bit chunk ID using uuid4. Also prepares this ID to have samples registered to it. 
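Throughout these hunks a flat chunk index is translated into a (block, offset) pair, with a negative block index meaning "still in the write buffer" (see `get_entry` above: `self._buffer[y] if x < 0 else self._data[x][y]`). A hedged standalone sketch of that translation (hypothetical function, not from the patch):

    from typing import Sequence, Tuple

    def flat_to_2d(
        blocks: Sequence[Sequence[int]], buffer: Sequence[int], idx: int
    ) -> Tuple[int, int]:
        """Map a flat index over flushed blocks + trailing buffer to (block, offset)."""
        for i, block in enumerate(blocks):
            if idx < len(block):
                return i, idx
            idx -= len(block)
        return -1, idx  # negative block index: offset falls into the buffer

    # e.g. blocks of lengths 3 and 2, then a 2-element buffer:
    assert flat_to_2d([[0, 1, 2], [3, 4]], [5, 6], 4) == (1, 1)
    assert flat_to_2d([[0, 1, 2], [3, 4]], [5, 6], 6) == (-1, 1)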
@@ -192,7 +192,7 @@ def register_samples_to_last_chunk_id(self, num_samples: int): if self.empty: raise ChunkIdEncoderError( - f"Cannot register samples because no chunk IDs exist. {self._buffer}, {self._shards}" + f"Cannot register samples because no chunk IDs exist. {self._buffer}, {self._data}" ) if num_samples == 0 and self.num_chunks < 2: @@ -246,10 +246,10 @@ def get_local_sample_index(self, global_sample_index: int) -> int: chunk_index -= 1 else: shard_index -= 1 - chunk_index = len(self._shards[shard_index]) - 1 + chunk_index = len(self._data[shard_index]) - 1 # current_entry = self._encoded_ids[chunk_index - 1] - current_entry = self._shards[shard_index][ + current_entry = self._data[shard_index][ chunk_index ] # buffer already flushed by get() call last_num_samples = current_entry[LAST_INDEX_INDEX] + 1 @@ -286,9 +286,9 @@ def get( sample_index = (self.num_samples) + sample_index self._flush_buffer() - last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._shards] + last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._data] shard_idx = np.searchsorted(last_idxs, sample_index) - shard = self._shards[shard_idx] + shard = self._data[shard_idx] idx = np.searchsorted(shard[:, LAST_INDEX_INDEX], sample_index) id = shard[idx, CHUNK_ID_INDEX] chunk_index = idx From 6f086e25d3e7a77d56efe7d442b76b84b98a49a6 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Fri, 9 Jul 2021 00:10:06 +0530 Subject: [PATCH 19/79] faster buff load --- hub/core/lowlevel.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 76ca91a77b..5b95593c7c 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -81,15 +81,7 @@ def memcpy(dest: Pointer, src: Pointer, count=None) -> None: def _write_pybytes(ptr: Pointer, byts: bytes) -> Pointer: - try: - ptr2 = Pointer(c_array=(ctypes.c_byte * len(byts))(*byts)) - except NotImplementedError: - # TODO: exceptions.py - raise Exception( - "Reference for pointer was garbage collected. Maybe because the cache killed it?" - ) - - memcpy(ptr, ptr2) + memcpy(ptr, _ndarray_to_ptr(np.frombuffer(byts, dtype=np.byte))) return ptr + len(byts) From 31aa04d2bbfe45c9d439c7f2ea445e339fd25166 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Fri, 9 Jul 2021 00:20:55 +0530 Subject: [PATCH 20/79] save 1 memcpy --- hub/core/lowlevel.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 5b95593c7c..a83a153940 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -89,6 +89,9 @@ def _ndarray_to_ptr(arr: np.ndarray) -> Pointer: return Pointer(arr.__array_interface__["data"][0], arr.itemsize * arr.size) +def _pybytes_to_c_array(byts: bytes) -> Pointer: + return Pointer(np.frombuffer(byts, dtype=np.byte).__array_interface__["data"][0], len(byts)) + def _infer_chunk_num_bytes( version: str, shape_info: np.ndarray, @@ -154,13 +157,7 @@ def decode_chunk( buff: Union[bytes, Pointer, memoryview] ) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: if not isinstance(buff, Pointer): - try: - buff = Pointer(c_array=(ctypes.c_byte * len(buff))(*buff)) - except NotImplementedError: - # TODO: exceptions.py - raise Exception( - "Reference for pointer was garbage collected. Maybe because the cache killed it?" 
- ) + buff = _pybytes_to_c_array(buff) copy = True else: copy = False @@ -228,13 +225,7 @@ def encode_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: def decode_chunkids(buff: bytes) -> Tuple[str, np.ndarray]: - try: - ptr = Pointer(c_array=(ctypes.c_byte * len(buff))(*buff)) - except NotImplementedError: - # TODO: exceptions.py - raise Exception( - "Reference for pointer was garbage collected. Maybe because the cache killed it?" - ) + ptr = _pybytes_to_c_array(buff) # Read version len_version = ptr[0] From e98b008438990e214b85daccdae97de37206c0d8 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Fri, 9 Jul 2021 01:26:46 +0530 Subject: [PATCH 21/79] indexing --- hub/core/lowlevel.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index a83a153940..c5e17268d7 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -51,8 +51,28 @@ def __iadd__(self, i: int) -> "Pointer": def __setitem__(self, idx: int, byte: int) -> None: self._c_array[idx] = byte - def __getitem__(self, idx: int) -> int: - return self._c_array[idx] + def __getitem__(self, idx: Union[int, slice]) -> Union[int, "Pointer"]: + if isinstance(idx, int): + return self._c_array[idx] + elif isinstance(idx, slice): + assert idx.step is None + start = idx.start + end = idx.stop + n = self.size + if start is None: + start = 0 + elif start < 0: + start += n + if end is None: + end = n + elif end < 0: + end += n + assert start >= 0 and start < n + assert end >= start and end <= n + ret = Pointer(self.address + start, end - start) + ret._refs.append(self) + return ret + @property def memoryview(self): @@ -80,7 +100,7 @@ def memcpy(dest: Pointer, src: Pointer, count=None) -> None: ctypes.memmove(dest.address, src.address, count) -def _write_pybytes(ptr: Pointer, byts: bytes) -> Pointer: +def _write_pybytes(ptr: Pointer, byts: Union[bytes, memoryview]) -> Pointer: memcpy(ptr, _ndarray_to_ptr(np.frombuffer(byts, dtype=np.byte))) return ptr + len(byts) @@ -148,6 +168,8 @@ def encode_chunk( # write actual data for d in data: + if isinstance(d, Pointer): + d = d.memoryview ptr = _write_pybytes(ptr, d) return memoryview(flatbuff.bytes) From 3b68c5740425804b13e148580b2bf03d1d8d69d4 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Fri, 9 Jul 2021 01:59:38 +0530 Subject: [PATCH 22/79] cache data len --- hub/core/chunk.py | 12 +++++++++++- hub/core/chunk_engine.py | 21 ++++++++++++--------- hub/core/lowlevel.py | 26 ++++++++++++++------------ 3 files changed, 37 insertions(+), 22 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index d001421cfb..91f7d1f0e0 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -53,6 +53,7 @@ def __init__( self.byte_positions_encoder = BytePositionsEncoder(encoded_byte_positions) self._data: List[memoryview] = [] if data is None else [data] + self._len_data = len(self._data) @property def memoryview_data(self): @@ -156,6 +157,7 @@ def append_sample(self, buffer: memoryview, max_data_bytes: int, shape: Tuple[in # note: incoming_num_bytes can be 0 (empty sample) self._data.append(buffer) + self._len_data += len(buffer) self.update_headers(incoming_num_bytes, shape) def update_headers(self, incoming_num_bytes: int, sample_shape: Tuple[int]): @@ -179,7 +181,14 @@ def __len__(self): hub.__version__, self.shapes_encoder.array, self.byte_positions_encoder.array, - self._data, + len_data=self._len_data, + ) + return ( + 17 + + len(hub.__version__) + + self.shapes_encoder.array.nbytes + + 
self.byte_positions_encoder.array.nbytes + + self._len_data ) def tobytes(self) -> memoryview: @@ -191,6 +200,7 @@ def tobytes(self) -> memoryview: self.shapes_encoder.array, self.byte_positions_encoder.array, self._data, + self._len_data, ) @classmethod diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index e53e23cea3..67a8ae7d43 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -270,8 +270,17 @@ def _create_new_chunk(self): def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): """Formats a batch of `samples` and feeds them into `_append_bytes`.""" - + uniform = False if isinstance(samples, np.ndarray): + uniform = True + elif isinstance(samples, Sequence): + if is_uniform_sequence(samples): + uniform = True + if not isinstance(samples[0], np.ndarray): + samples = np.array(samples) + else: + raise TypeError(f"Unsupported type for extending. Got: {type(samples)}") + if uniform: compression = self.tensor_meta.sample_compression if compression == UNCOMPRESSED: buffers = [] @@ -297,15 +306,9 @@ def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): for sample_object in sample_objects: self.append(sample_object) - - elif isinstance(samples, Sequence): - if is_uniform_sequence(samples): - self.extend(np.array(samples)) - else: - for sample in samples: - self.append(sample) else: - raise TypeError(f"Unsupported type for extending. Got: {type(samples)}") + for sample in samples: + self.append(sample) self.cache.maybe_flush() diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index c5e17268d7..69db5ae9b7 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -57,7 +57,7 @@ def __getitem__(self, idx: Union[int, slice]) -> Union[int, "Pointer"]: elif isinstance(idx, slice): assert idx.step is None start = idx.start - end = idx.stop + end = idx.stop n = self.size if start is None: start = 0 @@ -73,7 +73,6 @@ def __getitem__(self, idx: Union[int, slice]) -> Union[int, "Pointer"]: ret._refs.append(self) return ret - @property def memoryview(self): return memoryview(self._c_array) @@ -110,13 +109,17 @@ def _ndarray_to_ptr(arr: np.ndarray) -> Pointer: def _pybytes_to_c_array(byts: bytes) -> Pointer: - return Pointer(np.frombuffer(byts, dtype=np.byte).__array_interface__["data"][0], len(byts)) + return Pointer( + np.frombuffer(byts, dtype=np.byte).__array_interface__["data"][0], len(byts) + ) + def _infer_chunk_num_bytes( version: str, shape_info: np.ndarray, byte_positions: np.ndarray, - data: Union[Sequence[bytes], Sequence[memoryview]], + data: Optional[Union[Sequence[bytes], Sequence[memoryview]]] = None, + len_data: Optional[int] = None, ): # NOTE: Assumption: version string contains ascii characters only (ord(c) < 128) # NOTE: Assumption: len(version) < 256 @@ -128,13 +131,9 @@ def _infer_chunk_num_bytes( # shape_info_slice_size = 4 + 4 + shape_info.nbytes # byte_positions_slice_size = 4 + 4 + byte_positions.nbytes # data_slice_size = sum(map(len, data)) - return ( - len(version) - + shape_info.nbytes - + byte_positions.nbytes - + sum(map(len, data)) - + 17 - ) + if len_data is None: + len_data = sum(map(len, data)) + return len(version) + shape_info.nbytes + byte_positions.nbytes + len_data + 17 def encode_chunk( @@ -142,9 +141,12 @@ def encode_chunk( shape_info: np.ndarray, byte_positions: np.ndarray, data: Union[Sequence[bytes], Sequence[memoryview]], + len_data: Optional[int], ) -> memoryview: - flatbuff = malloc(_infer_chunk_num_bytes(version, shape_info, byte_positions, data)) + flatbuff = malloc( + 
_infer_chunk_num_bytes(version, shape_info, byte_positions, data, len_data) + ) ptr = flatbuff + 0 # write version From 2d9177239dc8cd1ac7382d342b269362c18c6042 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Fri, 9 Jul 2021 02:14:32 +0530 Subject: [PATCH 23/79] cache _num_chunks --- hub/core/lowlevel.py | 14 +++++++------- hub/core/meta/encode/chunk_id.py | 4 +++- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 69db5ae9b7..8ae4c201ef 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -70,7 +70,7 @@ def __getitem__(self, idx: Union[int, slice]) -> Union[int, "Pointer"]: assert start >= 0 and start < n assert end >= start and end <= n ret = Pointer(self.address + start, end - start) - ret._refs.append(self) + ret._refs.append(self._c_array) return ret @property @@ -120,7 +120,7 @@ def _infer_chunk_num_bytes( byte_positions: np.ndarray, data: Optional[Union[Sequence[bytes], Sequence[memoryview]]] = None, len_data: Optional[int] = None, -): +) -> int: # NOTE: Assumption: version string contains ascii characters only (ord(c) < 128) # NOTE: Assumption: len(version) < 256 assert len(version) < 256 @@ -132,7 +132,7 @@ def _infer_chunk_num_bytes( # byte_positions_slice_size = 4 + 4 + byte_positions.nbytes # data_slice_size = sum(map(len, data)) if len_data is None: - len_data = sum(map(len, data)) + len_data = sum(map(len, data)) # type: ignore return len(version) + shape_info.nbytes + byte_positions.nbytes + len_data + 17 @@ -188,11 +188,11 @@ def decode_chunk( ptr = buff + 0 # read version - len_version = ptr[0] + len_version: int = ptr[0] # type: ignore version = "" ptr += 1 for i in range(len_version): - version += chr(ptr[i]) + version += chr(ptr[i]) # type: ignore ptr += len_version # read shape info @@ -252,11 +252,11 @@ def decode_chunkids(buff: bytes) -> Tuple[str, np.ndarray]: ptr = _pybytes_to_c_array(buff) # Read version - len_version = ptr[0] + len_version: int = ptr[0] # type: ignore ptr += 1 version = "" for i in range(len_version): - version += chr(ptr[i]) + version += chr(ptr[i]) # type: ignore ptr += len_version diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 7fa56739f9..829c760d02 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -71,6 +71,7 @@ def __init__(self, ids=None): """ self._buffer: List[List[int]] = [] self._data: List[np.ndarray] = [] if ids is None else [ids] + self._num_chunks = sum(map(len, self._data)) def _flush_buffer(self): if self._buffer: @@ -128,7 +129,7 @@ def frombuffer(cls, buffer: bytes): @property def num_chunks(self) -> int: - return sum(map(len, self._data)) + len(self._buffer) + return self._num_chunks def get_entry(self, idx): x, y = self._get_2d_idx(idx) @@ -170,6 +171,7 @@ def generate_chunk_id(self) -> ENCODING_DTYPE: """ id = ENCODING_DTYPE(uuid4().int >> UUID_SHIFT_AMOUNT) self._buffer.append([id, self.last_index]) + self._num_chunks += 1 return id def register_samples_to_last_chunk_id(self, num_samples: int): From dcea7cc2a18697da30e4b7e1c20b3f521fe2d0b5 Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Fri, 9 Jul 2021 10:32:20 -0700 Subject: [PATCH 24/79] chunk engine updates cache size --- hub/core/chunk_engine.py | 5 +---- hub/core/storage/lru_cache.py | 13 +++++++++++-- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index a791ab088a..f93e2cb47a 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ 
-200,10 +200,7 @@ def _append_bytes(self, buffer: memoryview, shape: Tuple[int], dtype: np.dtype): self._append_to_new_chunk(buffer, shape) self.chunk_id_encoder.register_samples_to_last_chunk_id(num_samples) - - # TODO implement tests for cache size compute - if self.last_chunk is not None: - self.cache[self.last_chunk_key] = self.last_chunk + self.cache.update_used_cache_for_path(self.last_chunk_key, len(self.last_chunk)) def _try_appending_to_last_chunk( self, buffer: memoryview, shape: Tuple[int] diff --git a/hub/core/storage/lru_cache.py b/hub/core/storage/lru_cache.py index 6d05367d1a..19fa02d503 100644 --- a/hub/core/storage/lru_cache.py +++ b/hub/core/storage/lru_cache.py @@ -37,6 +37,15 @@ def __init__( self.dirty_keys: Set[str] = set() # keys present in cache but not next_storage self.cache_used = 0 + def update_used_cache_for_path(self, path: str, new_size: int): + if new_size < 0: + raise ValueError(f"`new_size` must be >= 0. Got: {new_size}") + if path in self.lru_sizes: + old_size = self.lru_sizes[path] + self.cache_used -= old_size + self.cache_used += new_size + self.lru_sizes[path] = new_size + def flush(self): """Writes data from cache_storage to next_storage. Only the dirty keys are written. This is a cascading function and leads to data being written to the final storage in case of a chained cache. @@ -248,8 +257,8 @@ def _insert_in_cache(self, path: str, value: Union[bytes, Cachable]): self.check_readonly() self._free_up_space(len(value)) self.cache_storage[path] = value # type: ignore - self.cache_used += len(value) - self.lru_sizes[path] = len(value) + + self.update_used_cache_for_path(path, len(value)) def _list_keys(self): """Helper function that lists all the objects present in the cache and the underlying storage. From 04276f0ad0ca793fdf885e42c2fc7b632921303f Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Fri, 9 Jul 2021 10:45:15 -0700 Subject: [PATCH 25/79] rename `remove` -> `remove_from_dirty` --- hub/core/storage/lru_cache.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/hub/core/storage/lru_cache.py b/hub/core/storage/lru_cache.py index 19fa02d503..9e439e20b1 100644 --- a/hub/core/storage/lru_cache.py +++ b/hub/core/storage/lru_cache.py @@ -201,24 +201,24 @@ def __iter__(self): """ yield from self._list_keys() - def _forward(self, path, remove=False): + def _forward(self, path, remove_from_dirty=False): """Forward the value at a given path to the next storage, and un-marks its key. - If the value at the path is Cachable, it will only be un-dirtied if remove=True. + If the value at the path is Cachable, it will only be un-dirtied if remove_from_dirty=True. """ - self._forward_value(path, self.cache_storage[path], remove) + self._forward_value(path, self.cache_storage[path], remove_from_dirty) - def _forward_value(self, path, value, remove=False): + def _forward_value(self, path, value, remove_from_dirty=False): """Forwards a path-value pair to the next storage, and un-marks its key. Args: path (str): the path to the object relative to the root of the provider. value (bytes, Cachable): the value to send to the next storage. - remove (bool, optional): cachable values are not un-marked automatically, + remove_from_dirty (bool, optional): cachable values are not un-marked automatically, as they are externally mutable. Set this to True to un-mark them anyway. 
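        Note: a Cachable value therefore remains in `dirty_keys` after being forwarded, since it may still be mutated while in cache; eviction (see `_pop_from_cache` below) forwards with `remove_from_dirty=True` to fully un-mark it.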
""" cachable = isinstance(value, Cachable) - if not cachable or remove: + if not cachable or remove_from_dirty: self.dirty_keys.discard(path) if cachable: @@ -240,7 +240,7 @@ def _pop_from_cache(self): """Helper function that pops the least recently used key, value pair from the cache""" key, itemsize = self.lru_sizes.popitem(last=False) if key in self.dirty_keys: - self._forward(key, remove=True) + self._forward(key, remove_from_dirty=True) del self.cache_storage[key] self.cache_used -= itemsize From 36002e2ff76e9ce21b9d40b344b20170e39a4087 Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Fri, 9 Jul 2021 11:10:10 -0700 Subject: [PATCH 26/79] remove some `sum`s --- hub/core/chunk.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index 91f7d1f0e0..7ef7ad045d 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -52,15 +52,19 @@ def __init__( self.shapes_encoder = ShapeEncoder(encoded_shapes) self.byte_positions_encoder = BytePositionsEncoder(encoded_byte_positions) - self._data: List[memoryview] = [] if data is None else [data] - self._len_data = len(self._data) + self._data: List[memoryview] = [] + self._len_data: int = 0 + + if data is not None: + self._data.append(data) + self._len_data += len(data) @property def memoryview_data(self): # deprecated if len(self._data) == 1: return self._data[0] - ptr = malloc(sum(map(len, self._data))) + ptr = malloc(self.num_data_bytes) for data in self._data: ptr = _write_pybytes(ptr, data) return memoryview(ptr.bytes) @@ -122,7 +126,7 @@ def num_samples(self): @property def num_data_bytes(self): - return sum(map(len, self._data)) + return self._len_data def is_under_min_space(self, min_data_bytes_target: int) -> bool: """If this chunk's data is less than `min_data_bytes_target`, returns True.""" @@ -181,14 +185,14 @@ def __len__(self): hub.__version__, self.shapes_encoder.array, self.byte_positions_encoder.array, - len_data=self._len_data, + len_data=self.num_data_bytes, ) return ( 17 + len(hub.__version__) + self.shapes_encoder.array.nbytes + self.byte_positions_encoder.array.nbytes - + self._len_data + + self.num_data_bytes ) def tobytes(self) -> memoryview: @@ -200,7 +204,7 @@ def tobytes(self) -> memoryview: self.shapes_encoder.array, self.byte_positions_encoder.array, self._data, - self._len_data, + self.num_data_bytes, ) @classmethod From fea211d3f525e41d5e157b690434e4f6d387aa4f Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Fri, 9 Jul 2021 23:43:06 +0530 Subject: [PATCH 27/79] optims for seq access --- hub/core/meta/encode/chunk_id.py | 108 +++++++++++++++++++++++++------ 1 file changed, 89 insertions(+), 19 deletions(-) diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 829c760d02..779a268d8a 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -73,9 +73,15 @@ def __init__(self, ids=None): self._data: List[np.ndarray] = [] if ids is None else [ids] self._num_chunks = sum(map(len, self._data)) + self._prev_sample_index: Optional[int] = None + self._prev_chunk_index: Optional[Tuple[int, int]] = None + self._prev_chunk_id: Optional[int] = None + def _flush_buffer(self): if self._buffer: self._data.append(np.array(self._buffer, dtype=ENCODING_DTYPE)) + if self._prev_chunk_index and self._prev_chunk_index[0] < 0: + self._prev_chunk_index = (len(self._data) - 1, self._prev_chunk_index[1]) self._buffer.clear() def _get_2d_idx(self, idx: int) -> Tuple[int, int]: @@ -131,10 +137,35 @@ def 
frombuffer(cls, buffer: bytes): def num_chunks(self) -> int: return self._num_chunks - def get_entry(self, idx): + def get_entry(self, idx: int): x, y = self._get_2d_idx(idx) return self._buffer[y] if x < 0 else self._data[x][y] + def _get_entry_2d(self, x: int, y: int): + return self._buffer[y] if x < 0 else self._data[x][y] + + def _decr_2d(self, x, y): + if x < 0: + if y: + return x, y - 1 + return len(self._data) - 1, len(self._data[-1]) - 1 + if y: + return x, y - 1 + if x: + x -= 1 + return x, len(self._data[x]) - 1 + raise IndexError() + + def _incr_2d(self, x: int, y: int): + if x < 0: + return x, y + 1 + # assert y < len(self._data[x]) + if y == len(self._data[x]) - 1: + if x == len(self._data) - 1: + return -1, 0 + return x + 1, 0 + return x, y + 1 + @property def last_entry(self) -> Union[np.ndarray, List[int]]: if self._buffer: @@ -244,16 +275,20 @@ def get_local_sample_index(self, global_sample_index: int) -> int: if not shard_index and not chunk_index: return global_sample_index - if chunk_index: - chunk_index -= 1 + if shard_index < 0: + if chunk_index: + current_entry = self._buffer[chunk_index - 1] + else: + current_entry = self._data[-1][-1] else: - shard_index -= 1 - chunk_index = len(self._data[shard_index]) - 1 - - # current_entry = self._encoded_ids[chunk_index - 1] - current_entry = self._data[shard_index][ - chunk_index - ] # buffer already flushed by get() call + if chunk_index: + chunk_index -= 1 + else: + shard_index -= 1 + chunk_index = len(self._data[shard_index]) - 1 + current_entry = self._data[shard_index][ + chunk_index + ] last_num_samples = current_entry[LAST_INDEX_INDEX] + 1 return global_sample_index - int(last_num_samples) @@ -287,15 +322,50 @@ def get( if sample_index < 0: sample_index = (self.num_samples) + sample_index - self._flush_buffer() - last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._data] - shard_idx = np.searchsorted(last_idxs, sample_index) - shard = self._data[shard_idx] - idx = np.searchsorted(shard[:, LAST_INDEX_INDEX], sample_index) - id = shard[idx, CHUNK_ID_INDEX] - chunk_index = idx + chunk_id = None + if self._prev_chunk_index: + # Optimization for sequential look up + prev_chunk_index = self._prev_chunk_index + # if sample_index == self._prev_sample_index: + # if return_chunk_index: + # return self._prev_chunk_id, prev_chunk_index + # return self._prev_chunk_id + curr_entry = self._get_entry_2d(*prev_chunk_index) + if sample_index <= curr_entry[LAST_INDEX_INDEX]: + if any(prev_chunk_index): + prev_entry = self._get_entry_2d(*(self._decr_2d(*prev_chunk_index))) + if sample_index > prev_entry[LAST_INDEX_INDEX]: + chunk_id = self._prev_chunk_id + else: + chunk_id = self._prev_chunk_id + if chunk_id is not None: + self._prev_sample_index = sample_index + if return_chunk_index: + return chunk_id, prev_chunk_index + return chunk_id + + try: + chunk_index = self._incr_2d(*prev_chunk_index) + next_entry = self._get_entry_2d(*chunk_index) + if sample_index <= next_entry[LAST_INDEX_INDEX]: + chunk_id = next_entry[CHUNK_ID_INDEX] + except IndexError: + pass + + if chunk_id is None: + self._flush_buffer() + last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._data] + shard_index = np.searchsorted(last_idxs, sample_index) + shard = self._data[shard_index] + idx = np.searchsorted(shard[:, LAST_INDEX_INDEX], sample_index) + chunk_id = shard[idx, CHUNK_ID_INDEX] + chunk_index = (shard_index, idx) + + self._prev_sample_index = sample_index + self._prev_chunk_index = chunk_index + self._prev_chunk_id = chunk_id if 
return_chunk_index: - return id, (shard_idx, chunk_index) + return chunk_id, chunk_index - return id + return chunk_id From 1dfb3c21bb30de0c067645c4814489517f73ac72 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sat, 10 Jul 2021 00:05:37 +0530 Subject: [PATCH 28/79] cache entry --- hub/core/meta/encode/chunk_id.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 779a268d8a..a0fb6ba20c 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -76,6 +76,7 @@ def __init__(self, ids=None): self._prev_sample_index: Optional[int] = None self._prev_chunk_index: Optional[Tuple[int, int]] = None self._prev_chunk_id: Optional[int] = None + self._prev_entry: Optional[Union[np.ndarray, List[int]]] = None def _flush_buffer(self): if self._buffer: @@ -330,7 +331,7 @@ def get( # if return_chunk_index: # return self._prev_chunk_id, prev_chunk_index # return self._prev_chunk_id - curr_entry = self._get_entry_2d(*prev_chunk_index) + curr_entry = self._prev_entry if sample_index <= curr_entry[LAST_INDEX_INDEX]: if any(prev_chunk_index): prev_entry = self._get_entry_2d(*(self._decr_2d(*prev_chunk_index))) @@ -346,9 +347,9 @@ def get( try: chunk_index = self._incr_2d(*prev_chunk_index) - next_entry = self._get_entry_2d(*chunk_index) - if sample_index <= next_entry[LAST_INDEX_INDEX]: - chunk_id = next_entry[CHUNK_ID_INDEX] + current_entry = self._get_entry_2d(*chunk_index) + if sample_index <= current_entry[LAST_INDEX_INDEX]: + chunk_id = current_entry[CHUNK_ID_INDEX] except IndexError: pass @@ -358,7 +359,8 @@ def get( shard_index = np.searchsorted(last_idxs, sample_index) shard = self._data[shard_index] idx = np.searchsorted(shard[:, LAST_INDEX_INDEX], sample_index) - chunk_id = shard[idx, CHUNK_ID_INDEX] + current_entry = shard[idx] + chunk_id = current_entry[CHUNK_ID_INDEX] chunk_index = (shard_index, idx) self._prev_sample_index = sample_index From c8a993167194fe711a40c72c230f12250baf7bb6 Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Fri, 9 Jul 2021 12:33:42 -0700 Subject: [PATCH 29/79] 10s upload speedup --- hub/constants.py | 3 +-- hub/core/chunk_engine.py | 30 +++++++++++++++++++----------- hub/util/keys.py | 4 +--- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/hub/constants.py b/hub/constants.py index 3e3591a47f..ad94561856 100644 --- a/hub/constants.py +++ b/hub/constants.py @@ -37,10 +37,9 @@ CHUNKS_FOLDER = "chunks" -CHUNK_EXTENSION = "npz" ENCODED_CHUNK_NAMES_FOLDER = "chunks_index" # unsharded naming will help with backwards compatibility -ENCODED_CHUNK_NAMES_FILENAME = f"unsharded.{CHUNK_EXTENSION}" +ENCODED_CHUNK_NAMES_FILENAME = f"unsharded" ENCODING_DTYPE = np.uint32 # caclulate the number of bits to shift right when converting a 128-bit uuid into `ENCODING_DTYPE` diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index 22b4ce7dd7..27b72311da 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -114,6 +114,8 @@ def __init__( # only the last chunk may be less than this self.min_chunk_size = self.max_chunk_size // 2 + self._last_chunk = None + @property def chunk_id_encoder(self) -> ChunkIdEncoder: """Gets the chunk id encoder from cache, if one is not found it creates a blank encoder. 
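The upload speedup in this commit comes largely from pinning the tail chunk on the engine (`self._last_chunk`) so the hot append path stops resolving the last-chunk key through the LRU cache on every call. A rough standalone sketch of the idea (hypothetical names, not the actual engine API):

    from typing import Dict, Optional

    class TailPinnedWriter:
        """Sketch: keep a direct reference to the chunk being appended to."""

        def __init__(self) -> None:
            self.store: Dict[str, bytearray] = {}
            self._last_chunk: Optional[bytearray] = None

        def _new_chunk(self, name: str) -> bytearray:
            chunk = bytearray()
            self.store[name] = chunk
            self._last_chunk = chunk  # pin: the next append skips the store lookup
            return chunk

        def append(self, name: str, payload: bytes) -> None:
            chunk = self._last_chunk
            if chunk is None:
                chunk = self._new_chunk(name)
            chunk += payload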
@@ -159,18 +161,16 @@ def num_samples(self) -> int: return 0 return self.chunk_id_encoder.num_samples - @property - def last_chunk(self) -> Optional[Chunk]: + def get_last_chunk(self) -> Optional[Chunk]: if self.num_chunks == 0: return None - return self.cache.get_cachable(self.last_chunk_key, Chunk) - - @property - def last_chunk_key(self) -> str: last_chunk_name = self.chunk_id_encoder.get_name_for_chunk(-1) last_chunk_key = get_chunk_key(self.key, last_chunk_name) - return last_chunk_key + + self._last_chunk = self.cache.get_cachable(last_chunk_key, Chunk) + self._last_chunk.key = last_chunk_key + return self._last_chunk @property def tensor_meta(self): @@ -200,7 +200,9 @@ def _append_bytes(self, buffer: memoryview, shape: Tuple[int], dtype: np.dtype): self._append_to_new_chunk(buffer, shape) self.chunk_id_encoder.register_samples_to_last_chunk_id(num_samples) - self.cache.update_used_cache_for_path(self.last_chunk_key, len(self.last_chunk)) + self.cache.update_used_cache_for_path( + self._last_chunk.key, len(self._last_chunk) + ) def _try_appending_to_last_chunk( self, buffer: memoryview, shape: Tuple[int] @@ -216,7 +218,7 @@ def _try_appending_to_last_chunk( bool: True if `buffer` was successfully written to the last chunk, otherwise False. """ - last_chunk = self.last_chunk + last_chunk = self._last_chunk if last_chunk is None: return False @@ -261,12 +263,16 @@ def _create_new_chunk(self): chunk_id = self.chunk_id_encoder.generate_chunk_id() chunk = Chunk() chunk_name = ChunkIdEncoder.name_from_id(chunk_id) - chunk_key = get_chunk_key(self.key, chunk_name) - self.cache[chunk_key] = chunk + chunk.key = get_chunk_key(self.key, chunk_name) + self.cache[chunk.key] = chunk + self._last_chunk = chunk return chunk def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): """Formats a batch of `samples` and feeds them into `_append_bytes`.""" + + self.get_last_chunk() + uniform = False if isinstance(samples, np.ndarray): uniform = True @@ -312,6 +318,8 @@ def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): def append(self, sample: SampleValue): """Formats a single `sample` (compresseses/decompresses if applicable) and feeds it into `_append_bytes`.""" + self.get_last_chunk() + if isinstance(sample, Sample): # has to decompress to read the array's shape and dtype # might be able to optimize this away diff --git a/hub/util/keys.py b/hub/util/keys.py index 95d9a7069a..f10fb6649e 100644 --- a/hub/util/keys.py +++ b/hub/util/keys.py @@ -5,9 +5,7 @@ def get_chunk_key(key: str, chunk_name: str) -> str: - return posixpath.join( - key, constants.CHUNKS_FOLDER, f"{chunk_name}.{constants.CHUNK_EXTENSION}" - ) + return posixpath.join(key, constants.CHUNKS_FOLDER, f"{chunk_name}") def get_dataset_meta_key() -> str: From 16d5fb9f9f60ceec8f496678304ff86f5bd4942e Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Fri, 9 Jul 2021 12:37:35 -0700 Subject: [PATCH 30/79] fix mypy --- hub/core/chunk_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index f93e2cb47a..b7483aef52 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -200,7 +200,7 @@ def _append_bytes(self, buffer: memoryview, shape: Tuple[int], dtype: np.dtype): self._append_to_new_chunk(buffer, shape) self.chunk_id_encoder.register_samples_to_last_chunk_id(num_samples) - self.cache.update_used_cache_for_path(self.last_chunk_key, len(self.last_chunk)) + self.cache.update_used_cache_for_path(self.last_chunk_key, 
len(self.last_chunk)) # type: ignore def _try_appending_to_last_chunk( self, buffer: memoryview, shape: Tuple[int] From 121753de9863e397d7b3bc733c6b6c44f3d79ace Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Fri, 9 Jul 2021 14:19:48 -0700 Subject: [PATCH 31/79] load chunk ID encoder --- hub/core/chunk.py | 8 ++++---- hub/core/chunk_engine.py | 37 +++++++++++++++---------------------- 2 files changed, 19 insertions(+), 26 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index 7ef7ad045d..7c5a918ca3 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -53,11 +53,11 @@ def __init__( self.byte_positions_encoder = BytePositionsEncoder(encoded_byte_positions) self._data: List[memoryview] = [] - self._len_data: int = 0 + self._num_data_bytes: int = 0 # replaces: sum(map(len, self._data)) if data is not None: self._data.append(data) - self._len_data += len(data) + self._num_data_bytes += len(data) @property def memoryview_data(self): @@ -126,7 +126,7 @@ def num_samples(self): @property def num_data_bytes(self): - return self._len_data + return self._num_data_bytes def is_under_min_space(self, min_data_bytes_target: int) -> bool: """If this chunk's data is less than `min_data_bytes_target`, returns True.""" @@ -161,7 +161,7 @@ def append_sample(self, buffer: memoryview, max_data_bytes: int, shape: Tuple[in # note: incoming_num_bytes can be 0 (empty sample) self._data.append(buffer) - self._len_data += len(buffer) + self._num_data_bytes += len(buffer) self.update_headers(incoming_num_bytes, shape) def update_headers(self, incoming_num_bytes: int, sample_shape: Tuple[int]): diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index 6cbf3b1dd5..5d5cf21977 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -114,10 +114,10 @@ def __init__( # only the last chunk may be less than this self.min_chunk_size = self.max_chunk_size // 2 - self._last_chunk = None + self.get_chunk_id_encoder() + self.get_last_chunk() - @property - def chunk_id_encoder(self) -> ChunkIdEncoder: + def get_chunk_id_encoder(self) -> ChunkIdEncoder: """Gets the chunk id encoder from cache, if one is not found it creates a blank encoder. For more information on what `ChunkIdEncoder` is used for, see the `__init__` docstring. @@ -130,46 +130,39 @@ def chunk_id_encoder(self) -> ChunkIdEncoder: """ key = get_chunk_id_encoder_key(self.key) - if not self.chunk_id_encoder_exists: + if key in self.cache: + self.chunk_id_encoder = self.cache.get_cachable(key, ChunkIdEncoder) + else: # 1 because we always update the meta information before writing the samples (to account for potentially corrupted data in the future) if self.tensor_meta.length > 1: raise CorruptedMetaError( f"Tensor length is {self.tensor_meta.length}, but could not find the chunk id encoder." 
) - enc = ChunkIdEncoder() - self.cache[key] = enc - return enc + self.chunk_id_encoder = ChunkIdEncoder() + self.cache[key] = self.chunk_id_encoder - enc = self.cache.get_cachable(key, ChunkIdEncoder) - return enc - - @property - def chunk_id_encoder_exists(self) -> bool: - return get_chunk_id_encoder_key(self.key) in self.cache + return self.chunk_id_encoder @property def num_chunks(self) -> int: - if not self.chunk_id_encoder_exists: - return 0 return self.chunk_id_encoder.num_chunks @property def num_samples(self) -> int: - if not self.chunk_id_encoder_exists: - return 0 return self.chunk_id_encoder.num_samples def get_last_chunk(self) -> Optional[Chunk]: if self.num_chunks == 0: - return None + self._last_chunk = None + else: + last_chunk_name = self.chunk_id_encoder.get_name_for_chunk(-1) + last_chunk_key = get_chunk_key(self.key, last_chunk_name) - last_chunk_name = self.chunk_id_encoder.get_name_for_chunk(-1) - last_chunk_key = get_chunk_key(self.key, last_chunk_name) + self._last_chunk = self.cache.get_cachable(last_chunk_key, Chunk) + self._last_chunk.key = last_chunk_key - self._last_chunk = self.cache.get_cachable(last_chunk_key, Chunk) - self._last_chunk.key = last_chunk_key return self._last_chunk @property From 7c2221e4aa9ca6b06f877a85347a7707960bd6e2 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sat, 10 Jul 2021 05:10:28 +0530 Subject: [PATCH 32/79] mypass binsearch --- hub/core/chunk_engine.py | 16 +- hub/core/meta/encode/chunk_id.py | 241 ++++++++++++------ .../encode/tests/test_chunk_id_encoder.py | 2 +- 3 files changed, 167 insertions(+), 92 deletions(-) diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index 22b4ce7dd7..30e2d61991 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -200,7 +200,7 @@ def _append_bytes(self, buffer: memoryview, shape: Tuple[int], dtype: np.dtype): self._append_to_new_chunk(buffer, shape) self.chunk_id_encoder.register_samples_to_last_chunk_id(num_samples) - self.cache.update_used_cache_for_path(self.last_chunk_key, len(self.last_chunk)) + self.cache.update_used_cache_for_path(self.last_chunk_key, len(self.last_chunk)) # type: ignore def _try_appending_to_last_chunk( self, buffer: memoryview, shape: Tuple[int] @@ -345,27 +345,23 @@ def numpy( last_shape = None samples = [] - for global_sample_index in index.values[0].indices(length): - chunk_id = enc[global_sample_index] + for chunk_id, local_sample_index in enc.iter(index.values[0].value): chunk_name = ChunkIdEncoder.name_from_id(chunk_id) chunk_key = get_chunk_key(self.key, chunk_name) chunk = self.cache.get_cachable(chunk_key, Chunk) - sample = self.read_sample_from_chunk(global_sample_index, chunk) + sample = self.read_sample_from_chunk(chunk, local_sample_index) shape = sample.shape - if not aslist and last_shape is not None: if shape != last_shape: raise DynamicTensorNumpyError(self.key, index, "shape") - samples.append(sample) last_shape = shape - return _format_samples(samples, index, aslist) def read_sample_from_chunk( - self, global_sample_index: int, chunk: Chunk + self, chunk: Chunk, local_sample_index: int ) -> np.ndarray: - """Read a sample from a chunk, converts the global index into a local index. Handles decompressing if applicable.""" + """Read a sample from a chunk, given the local index. 
Handles decompressing if applicable.""" expect_compressed = self.tensor_meta.sample_compression != UNCOMPRESSED dtype = self.tensor_meta.dtype @@ -373,7 +369,7 @@ def read_sample_from_chunk( enc = self.chunk_id_encoder # buffer = chunk.memoryview_data - local_sample_index = enc.get_local_sample_index(global_sample_index) + # local_sample_index = enc.get_local_sample_index(global_sample_index) shape = chunk.shapes_encoder[local_sample_index] sb, eb = chunk.byte_positions_encoder[local_sample_index] diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index a0fb6ba20c..a2d73e5595 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -7,6 +7,8 @@ import numpy as np from uuid import uuid4 from hub.core.lowlevel import encode_chunkids, decode_chunkids +from hub.core.index import IndexEntry +import math # these constants are for accessing the data layout. see the `ChunkIdEncoder` docstring. @@ -73,16 +75,9 @@ def __init__(self, ids=None): self._data: List[np.ndarray] = [] if ids is None else [ids] self._num_chunks = sum(map(len, self._data)) - self._prev_sample_index: Optional[int] = None - self._prev_chunk_index: Optional[Tuple[int, int]] = None - self._prev_chunk_id: Optional[int] = None - self._prev_entry: Optional[Union[np.ndarray, List[int]]] = None - def _flush_buffer(self): if self._buffer: self._data.append(np.array(self._buffer, dtype=ENCODING_DTYPE)) - if self._prev_chunk_index and self._prev_chunk_index[0] < 0: - self._prev_chunk_index = (len(self._data) - 1, self._prev_chunk_index[1]) self._buffer.clear() def _get_2d_idx(self, idx: int) -> Tuple[int, int]: @@ -145,7 +140,7 @@ def get_entry(self, idx: int): def _get_entry_2d(self, x: int, y: int): return self._buffer[y] if x < 0 else self._data[x][y] - def _decr_2d(self, x, y): + def _decr_2d(self, x: int, y: int) -> Tuple[int, int]: if x < 0: if y: return x, y - 1 @@ -157,7 +152,7 @@ def _decr_2d(self, x, y): return x, len(self._data[x]) - 1 raise IndexError() - def _incr_2d(self, x: int, y: int): + def _incr_2d(self, x: int, y: int) -> Tuple[int, int]: if x < 0: return x, y + 1 # assert y < len(self._data[x]) @@ -167,6 +162,13 @@ def _incr_2d(self, x: int, y: int): return x + 1, 0 return x, y + 1 + def _is_origin(self, x: int, y: int) -> bool: + if not x and not y: + return True + if x < 0 and not self._data and not y: + return True + return False + @property def last_entry(self) -> Union[np.ndarray, List[int]]: if self._buffer: @@ -180,12 +182,12 @@ def last_index(self) -> int: last_entry = self.last_entry if last_entry is None: return -1 - return last_entry[LAST_INDEX_INDEX] + return int(last_entry[LAST_INDEX_INDEX]) @property def num_samples(self) -> int: if self._buffer: - return self._buffer[-1][LAST_INDEX_INDEX] + 1 + return int(self._buffer[-1][LAST_INDEX_INDEX] + 1) elif self._data: return int(self._data[-1][-1, LAST_INDEX_INDEX] + 1) return 0 @@ -218,7 +220,6 @@ def register_samples_to_last_chunk_id(self, num_samples: int): ChunkIdEncoderError: Must call `generate_chunk_id` before registering samples. ChunkIdEncoderError: `num_samples` can only be 0 if it is able to be a sample continuation accross chunks. """ - if num_samples < 0: raise ValueError( f"Cannot register negative num samples. Got: {num_samples}" @@ -271,34 +272,16 @@ def get_local_sample_index(self, global_sample_index: int) -> int: int: local index value between 0 and the amount of samples the chunk contains - 1. 
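            For example (illustrative): if the first chunk holds samples 0-29, global index 30 falls in the next chunk and maps to local index 0.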
""" - _, (shard_index, chunk_index) = self.get(global_sample_index, return_chunk_index=True) # type: ignore - - if not shard_index and not chunk_index: - return global_sample_index - - if shard_index < 0: - if chunk_index: - current_entry = self._buffer[chunk_index - 1] - else: - current_entry = self._data[-1][-1] - else: - if chunk_index: - chunk_index -= 1 - else: - shard_index -= 1 - chunk_index = len(self._data[shard_index]) - 1 - current_entry = self._data[shard_index][ - chunk_index - ] - last_num_samples = current_entry[LAST_INDEX_INDEX] + 1 - - return global_sample_index - int(last_num_samples) + return self.get(global_sample_index, return_local_sample_index=True)[1] def __getitem__(self, sample_index: int) -> int: return self.get(sample_index) # type: ignore def get( - self, sample_index: int, return_chunk_index: bool = False + self, + sample_index: int, + return_chunk_index: bool = False, + return_local_sample_index: bool = False, ) -> Union[int, Tuple[int, Tuple[int, int]]]: """Get the ID for the chunk that `sample_index` is stored in. To get the name of the chunk, use `name_from_id`. @@ -323,51 +306,147 @@ def get( if sample_index < 0: sample_index = (self.num_samples) + sample_index - chunk_id = None - if self._prev_chunk_index: - # Optimization for sequential look up - prev_chunk_index = self._prev_chunk_index - # if sample_index == self._prev_sample_index: - # if return_chunk_index: - # return self._prev_chunk_id, prev_chunk_index - # return self._prev_chunk_id - curr_entry = self._prev_entry - if sample_index <= curr_entry[LAST_INDEX_INDEX]: - if any(prev_chunk_index): - prev_entry = self._get_entry_2d(*(self._decr_2d(*prev_chunk_index))) - if sample_index > prev_entry[LAST_INDEX_INDEX]: - chunk_id = self._prev_chunk_id - else: - chunk_id = self._prev_chunk_id - if chunk_id is not None: - self._prev_sample_index = sample_index - if return_chunk_index: - return chunk_id, prev_chunk_index - return chunk_id - - try: - chunk_index = self._incr_2d(*prev_chunk_index) - current_entry = self._get_entry_2d(*chunk_index) - if sample_index <= current_entry[LAST_INDEX_INDEX]: - chunk_id = current_entry[CHUNK_ID_INDEX] - except IndexError: - pass - - if chunk_id is None: + self._flush_buffer() + last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._data] + shard_index = np.searchsorted(last_idxs, sample_index) + shard = self._data[shard_index] + idx = np.searchsorted(shard[:, LAST_INDEX_INDEX], sample_index) + current_entry = shard[idx] + chunk_id = current_entry[CHUNK_ID_INDEX] + chunk_index = (shard_index, idx) + ret = [chunk_id] + if return_chunk_index: + ret.append(chunk_index) + if return_local_sample_index: + if any(chunk_index): + prev_entry = self._get_entry_2d(*self._decr_2d(*chunk_index)) + local_sample_index = ( + sample_index - int(prev_entry[LAST_INDEX_INDEX]) - 1 + ) + else: + local_sample_index = sample_index + ret.append(local_sample_index) + + return tuple(ret) + + def iter(self, index: Union[int, slice, tuple] = slice(None)): + if isinstance(index, int): + yield self.get(index, return_local_sample_index=True) + elif isinstance(index, slice): + start = 0 if index.start is None else index.start + stop = self.num_samples if index.stop is None else index.stop + step = 1 if index.step is None else index.step + assert isinstance(start, int) + assert isinstance(stop, int) + assert isinstance(step, int) + assert step != 0 + if step > 0: + total = math.ceil((stop - start) / step) + forward = True + else: + step = -step + total = math.ceil((stop - start) / step) + start, 
stop = stop - 1, start + forward = False + if not total: + return + n = 0 self._flush_buffer() - last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._data] - shard_index = np.searchsorted(last_idxs, sample_index) + chunk_id, (shard_index, chunk_index), local_sample_index = self.get( + start, return_chunk_index=True, return_local_sample_index=True + ) shard = self._data[shard_index] - idx = np.searchsorted(shard[:, LAST_INDEX_INDEX], sample_index) - current_entry = shard[idx] - chunk_id = current_entry[CHUNK_ID_INDEX] - chunk_index = (shard_index, idx) - - self._prev_sample_index = sample_index - self._prev_chunk_index = chunk_index - self._prev_chunk_id = chunk_id - - if return_chunk_index: - return chunk_id, chunk_index - - return chunk_id + yield chunk_id, local_sample_index + n += 1 + if n == total: + return + ctr = Counter(step) + if forward: + last_index = int(shard[chunk_index, LAST_INDEX_INDEX]) + for i in range(local_sample_index + 1, last_index + 1): + if ctr(): + yield chunk_id, i + n += 1 + if n == total: + return + for chunk_index in range(chunk_index + 1, len(shard)): + entry = shard[chunk_index] + chunk_id = entry[CHUNK_ID_INDEX] + new_last_index = int(entry[LAST_INDEX_INDEX]) + for i in range(new_last_index - last_index): + if ctr(): + yield chunk_id, i + n += 1 + if n == total: + return + last_index = new_last_index + for shard_index in range(shard_index + 1, len(self._data)): + shard = self._data[shard_index] + for entry in shard: + chunk_id = entry[CHUNK_ID_INDEX] + new_last_index = int(entry[LAST_INDEX_INDEX]) + for i in range(new_last_index - last_index): + if ctr(): + yield chunk_id, i + n += 1 + if n == total: + return + last_index = new_last_index + else: + last_index = int(shard[chunk_index, LAST_INDEX_INDEX]) + for local_sample_index in range(local_sample_index - 1, -1, -1): + if ctr(): + yield chunk_id, local_sample_index + n += 1 + if n == total: + return + for chunk_index in range(chunk_index - 1, -1, -1): + entry = shard[chunk_index] + chunk_id = entry[CHUNK_ID_INDEX] + last_index = entry[LAST_INDEX_INDEX] + if chunk_index: + last_index -= shard[chunk_id - 1, LAST_INDEX_INDEX] + elif shard_index: + last_index -= self._data[shard_index - 1][-1, LAST_INDEX_INDEX] + for local_sample_index in range(last_index, -1, -1): + if ctr(): + yield chunk_id, local_sample_index + n += 1 + if n == total: + return + for shard_index in range(shard_index - 1, -1, -1): + shard = self._data[shard_index] + for chunk_index in range(len(shard) - 1, -1, -1): + entry = shard[chunk_index] + chunk_id = entry[CHUNK_ID_INDEX] + last_index = entry[LAST_INDEX_INDEX] + if chunk_index: + last_index -= shard[chunk_id - 1, LAST_INDEX_INDEX] + elif shard_index: + last_index -= self._data[shard_index - 1][ + -1, LAST_INDEX_INDEX + ] + for local_sample_index in range(last_index, -1, -1): + if ctr(): + yield chunk_id, local_sample_index + n += 1 + if n == total: + return + elif isinstance(index, tuple): + for i in index: + # Random access + yield self.get(i, return_local_sample_index=True) + + +class Counter: + # TODO: refac this + def __init__(self, n: int) -> None: + self.n = n + self.i = 0 + + def __call__(self): + self.i += 1 + if self.i == self.n: + self.i = 0 + return True + return False diff --git a/hub/core/meta/encode/tests/test_chunk_id_encoder.py b/hub/core/meta/encode/tests/test_chunk_id_encoder.py index aa891f27fb..d34a9bd20a 100644 --- a/hub/core/meta/encode/tests/test_chunk_id_encoder.py +++ b/hub/core/meta/encode/tests/test_chunk_id_encoder.py @@ -49,7 +49,7 @@ def 
test_trivial(): # test local indexing assert enc.get_local_sample_index(0) == 0 - assert enc.get_local_sample_index(1) == 1 + assert enc.get_local_sample_index(1) == 1, (enc._data, enc._buffer) assert enc.get_local_sample_index(29) == 29 assert enc.get_local_sample_index(30) == 0 assert enc.get_local_sample_index(31) == 0 From 60b73d3f9edadd6a1fe693aa7b7977ebe9a9794a Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sat, 10 Jul 2021 05:14:59 +0530 Subject: [PATCH 33/79] format --- hub/core/meta/encode/chunk_id.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index a2d73e5595..0f24f64899 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -272,7 +272,7 @@ def get_local_sample_index(self, global_sample_index: int) -> int: int: local index value between 0 and the amount of samples the chunk contains - 1. """ - return self.get(global_sample_index, return_local_sample_index=True)[1] + return self.get(global_sample_index, return_local_sample_index=True)[1] # type: ignore def __getitem__(self, sample_index: int) -> int: return self.get(sample_index) # type: ignore @@ -282,7 +282,7 @@ def get( sample_index: int, return_chunk_index: bool = False, return_local_sample_index: bool = False, - ) -> Union[int, Tuple[int, Tuple[int, int]]]: + ) -> Union[int, Tuple[int, Tuple[int, int]], Tuple[int, Tuple[int, int], int], Tuple[int, int]]: """Get the ID for the chunk that `sample_index` is stored in. To get the name of the chunk, use `name_from_id`. @@ -327,7 +327,7 @@ def get( local_sample_index = sample_index ret.append(local_sample_index) - return tuple(ret) + return tuple(ret) # type: ignore def iter(self, index: Union[int, slice, tuple] = slice(None)): if isinstance(index, int): @@ -352,7 +352,7 @@ def iter(self, index: Union[int, slice, tuple] = slice(None)): return n = 0 self._flush_buffer() - chunk_id, (shard_index, chunk_index), local_sample_index = self.get( + chunk_id, (shard_index, chunk_index), local_sample_index = self.get( # type: ignore start, return_chunk_index=True, return_local_sample_index=True ) shard = self._data[shard_index] From 8cb1bce7a3e9ece12c6fadf8c027c6e1d9e78b4e Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sat, 10 Jul 2021 05:25:58 +0530 Subject: [PATCH 34/79] rem debug line --- hub/core/meta/encode/tests/test_chunk_id_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/core/meta/encode/tests/test_chunk_id_encoder.py b/hub/core/meta/encode/tests/test_chunk_id_encoder.py index d34a9bd20a..aa891f27fb 100644 --- a/hub/core/meta/encode/tests/test_chunk_id_encoder.py +++ b/hub/core/meta/encode/tests/test_chunk_id_encoder.py @@ -49,7 +49,7 @@ def test_trivial(): # test local indexing assert enc.get_local_sample_index(0) == 0 - assert enc.get_local_sample_index(1) == 1, (enc._data, enc._buffer) + assert enc.get_local_sample_index(1) == 1 assert enc.get_local_sample_index(29) == 29 assert enc.get_local_sample_index(30) == 0 assert enc.get_local_sample_index(31) == 0 From 91596e494e7ce7e8fa389f50cfe45f296f58060a Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sat, 10 Jul 2021 06:19:18 +0530 Subject: [PATCH 35/79] fr_optimizations_2 --- hub/core/chunk_engine.py | 2 +- hub/core/meta/encode/chunk_id.py | 42 ++++++++++++++++++++++++++------ hub/integrations/pytorch.py | 8 +++--- 3 files changed, 39 insertions(+), 13 deletions(-) diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index 3038bb8f5a..99dd8b4a9e 
100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -195,7 +195,7 @@ def _append_bytes(self, buffer: memoryview, shape: Tuple[int], dtype: np.dtype): self.chunk_id_encoder.register_samples_to_last_chunk_id(num_samples) last_chunk = self._last_chunk - key = last_chunk.key + key = last_chunk.key # type: ignore self.cache.update_used_cache_for_path(key, len(last_chunk)) # type: ignore def _try_appending_to_last_chunk( diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 0f24f64899..8166d754cc 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -75,9 +75,16 @@ def __init__(self, ids=None): self._data: List[np.ndarray] = [] if ids is None else [ids] self._num_chunks = sum(map(len, self._data)) + self._prev_sample_index: Optional[int] = None + self._prev_chunk_id: Optional[int] = None + self._prev_chunk_index: Optional[Tuple[int, int]] = None + self._prev_entry: Optional[Union[np.ndarray, List[int]]] = None + def _flush_buffer(self): if self._buffer: self._data.append(np.array(self._buffer, dtype=ENCODING_DTYPE)) + if self._prev_chunk_index and self._prev_chunk_index[0] < 0: + self._prev_chunk_index = (len(self._data) -1, self._prev_chunk_index[1]) self._buffer.clear() def _get_2d_idx(self, idx: int) -> Tuple[int, int]: @@ -306,14 +313,33 @@ def get( if sample_index < 0: sample_index = (self.num_samples) + sample_index - self._flush_buffer() - last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._data] - shard_index = np.searchsorted(last_idxs, sample_index) - shard = self._data[shard_index] - idx = np.searchsorted(shard[:, LAST_INDEX_INDEX], sample_index) - current_entry = shard[idx] - chunk_id = current_entry[CHUNK_ID_INDEX] - chunk_index = (shard_index, idx) + if self._prev_sample_index is not None and sample_index == self._prev_sample_index + 1: + if sample_index > self._prev_entry[LAST_INDEX_INDEX]: + chunk_index = self._incr_2d(*self._prev_chunk_index) + current_entry = self._get_entry_2d(*chunk_index) + chunk_id = current_entry[CHUNK_ID_INDEX] + self._prev_entry = current_entry + self._prev_chunk_id = chunk_id + else: + chunk_id = self._prev_chunk_id + chunk_index = self._prev_chunk_index + else: + self._flush_buffer() + last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._data] + shard_index = np.searchsorted(last_idxs, sample_index) + shard = self._data[shard_index] + idx = np.searchsorted(shard[:, LAST_INDEX_INDEX], sample_index) + current_entry = shard[idx] + chunk_id = current_entry[CHUNK_ID_INDEX] + chunk_index = (shard_index, idx) + self._prev_entry = current_entry + self._prev_chunk_id = chunk_id + + self._prev_sample_index = sample_index + self._prev_chunk_index = chunk_index + + if not return_chunk_index and not return_local_sample_index: + return chunk_id ret = [chunk_id] if return_chunk_index: ret.append(chunk_index) diff --git a/hub/integrations/pytorch.py b/hub/integrations/pytorch.py index a1b90a912f..2d6a2f360d 100644 --- a/hub/integrations/pytorch.py +++ b/hub/integrations/pytorch.py @@ -243,10 +243,10 @@ def _generate_shared_memory_names(self, chunk_names: Set[str]): ls.append(f"al_{self.last_chunk_num_generated}") return ls - def _numpy_from_chunk(self, index: int, key: str, chunk): + def _numpy_from_chunk(self, chunk, key: str, local_index: int): """Takes a list of chunks and returns a numpy array from it""" chunk_engine = self.all_chunk_engines[key] - value = chunk_engine.read_sample_from_chunk(index, chunk) + value = chunk_engine.read_sample_from_chunk(chunk, 
local_index) # typecast if incompatible with pytorch if value.dtype == "uint16": @@ -279,14 +279,14 @@ def _get_data_from_chunks( actual_index = self.index_offset + i # TODO change this once it returns list/set of str chunk_engine = self.all_chunk_engines[key] - chunk_id = chunk_engine.chunk_id_encoder[actual_index] + chunk_id, local_index = chunk_engine.chunk_id_encoder.get(actual_index, return_local_sample_index=True) chunk_name = chunk_engine.chunk_id_encoder.name_from_id(chunk_id) # type: ignore if chunk_name not in chunk_map: self.last_index_meta[key] = i - 1 return chunk = chunk_map[chunk_name] self.all_index_value_maps[key][i] = self._numpy_from_chunk( - actual_index, key, chunk + chunk, key, local_index ) self.last_index_meta[key] = len(self) - 1 From 2f28314164b564a7afce9bc46887c548f6559db2 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sun, 11 Jul 2021 14:01:36 +0530 Subject: [PATCH 36/79] optimize tensor iteration --- hub/api/tensor.py | 15 +++++++++++++-- hub/core/chunk_engine.py | 9 ++++++--- hub/core/meta/encode/chunk_id.py | 20 ++++++++++++++------ hub/integrations/pytorch.py | 4 +++- 4 files changed, 36 insertions(+), 12 deletions(-) diff --git a/hub/api/tensor.py b/hub/api/tensor.py index 1f24783900..3af85e05e3 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -43,6 +43,8 @@ def __init__( self.chunk_engine = ChunkEngine(self.key, self.storage) + self._sample: Optional[Tuple(int, int)] = None + def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): """Extends the end of the tensor by appending multiple elements from a sequence. Accepts a sequence, a single batched numpy array, or a sequence of `hub.load` outputs, which can be used to load files. See examples down below. @@ -71,6 +73,7 @@ def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): The length should be equal to the number of samples to add. """ self.chunk_engine.extend(samples) + self._sample = None def append( self, @@ -192,8 +195,12 @@ def __setitem__(self, item: Union[int, slice], value: np.ndarray): raise NotImplementedError("Tensor update not currently supported!") def __iter__(self): - for i in range(len(self)): - yield self[i] + for i, (chunk_id, local_sample_index) in enumerate( + self.chunk_engine.chunk_id_encoder.iter(self.index.values[0].value) + ): + tensor_i = Tensor(self.key, self.storage, index=self.index[i]) + tensor_i._sample = chunk_id, local_sample_index + yield tensor_i def numpy(self, aslist=False) -> Union[np.ndarray, List[np.ndarray]]: """Computes the contents of the tensor in numpy format. @@ -209,6 +216,10 @@ def numpy(self, aslist=False) -> Union[np.ndarray, List[np.ndarray]]: Returns: A numpy array containing the data represented by this tensor. 
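The `_sample` field introduced in this patch is the key to the speedup: `__iter__` now yields tensor views that already carry their `(chunk_id, local_sample_index)` pair, so the `numpy()` fast path just below can skip the chunk ID encoder entirely. A minimal sketch of the idea, using toy stand-in classes rather than the real hub API:

import numpy as np

class TinyTensor:
    """Toy tensor whose samples live in a list of numpy 'chunks'."""

    def __init__(self, chunks, hint=None):
        self.chunks = chunks  # list of arrays, one row per sample
        self._sample = hint   # optional (chunk_index, local_index) fast path

    def __iter__(self):
        # Yield views that already know where their sample lives,
        # so reading them back needs no index lookup at all.
        for ci, chunk in enumerate(self.chunks):
            for li in range(len(chunk)):
                yield TinyTensor(self.chunks, hint=(ci, li))

    def numpy(self):
        ci, li = self._sample
        return self.chunks[ci][li]

t = TinyTensor([np.zeros((3, 2)), np.ones((2, 2))])
assert [v.numpy().tolist() for v in t][3] == [1.0, 1.0]

The real tensor also has to invalidate the hint on `extend` (as the hunk below does with `self._sample = None`), since appending samples changes which chunk holds what.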
""" + if self._sample: + chunk_id, local_sample_index = self._sample + chunk = self.chunk_engine.get_chunk_from_id(chunk_id) + return self.chunk_engine.read_sample_from_chunk(chunk, local_sample_index) return self.chunk_engine.numpy(self.index, aslist=aslist) diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index 99dd8b4a9e..b50eadb850 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -348,9 +348,7 @@ def numpy( samples = [] for chunk_id, local_sample_index in enc.iter(index.values[0].value): - chunk_name = ChunkIdEncoder.name_from_id(chunk_id) - chunk_key = get_chunk_key(self.key, chunk_name) - chunk = self.cache.get_cachable(chunk_key, Chunk) + chunk = self.get_chunk_from_id(chunk_id) sample = self.read_sample_from_chunk(chunk, local_sample_index) shape = sample.shape if not aslist and last_shape is not None: @@ -360,6 +358,11 @@ def numpy( last_shape = shape return _format_samples(samples, index, aslist) + def get_chunk_from_id(self, chunk_id: int) -> Chunk: + chunk_name = ChunkIdEncoder.name_from_id(chunk_id) + chunk_key = get_chunk_key(self.key, chunk_name) + return self.cache.get_cachable(chunk_key, Chunk) + def read_sample_from_chunk( self, chunk: Chunk, local_sample_index: int ) -> np.ndarray: diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 8166d754cc..1472e5d053 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -84,7 +84,7 @@ def _flush_buffer(self): if self._buffer: self._data.append(np.array(self._buffer, dtype=ENCODING_DTYPE)) if self._prev_chunk_index and self._prev_chunk_index[0] < 0: - self._prev_chunk_index = (len(self._data) -1, self._prev_chunk_index[1]) + self._prev_chunk_index = (len(self._data) - 1, self._prev_chunk_index[1]) self._buffer.clear() def _get_2d_idx(self, idx: int) -> Tuple[int, int]: @@ -289,7 +289,12 @@ def get( sample_index: int, return_chunk_index: bool = False, return_local_sample_index: bool = False, - ) -> Union[int, Tuple[int, Tuple[int, int]], Tuple[int, Tuple[int, int], int], Tuple[int, int]]: + ) -> Union[ + int, + Tuple[int, Tuple[int, int]], + Tuple[int, Tuple[int, int], int], + Tuple[int, int], + ]: """Get the ID for the chunk that `sample_index` is stored in. To get the name of the chunk, use `name_from_id`. 
@@ -313,16 +318,19 @@ def get( if sample_index < 0: sample_index = (self.num_samples) + sample_index - if self._prev_sample_index is not None and sample_index == self._prev_sample_index + 1: - if sample_index > self._prev_entry[LAST_INDEX_INDEX]: - chunk_index = self._incr_2d(*self._prev_chunk_index) + if ( + self._prev_sample_index is not None + and sample_index == self._prev_sample_index + 1 + ): + if sample_index > self._prev_entry[LAST_INDEX_INDEX]: # type: ignore + chunk_index = self._incr_2d(*self._prev_chunk_index) # type: ignore current_entry = self._get_entry_2d(*chunk_index) chunk_id = current_entry[CHUNK_ID_INDEX] self._prev_entry = current_entry self._prev_chunk_id = chunk_id else: chunk_id = self._prev_chunk_id - chunk_index = self._prev_chunk_index + chunk_index = self._prev_chunk_index # type: ignore else: self._flush_buffer() last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._data] diff --git a/hub/integrations/pytorch.py b/hub/integrations/pytorch.py index 2d6a2f360d..61eddb504c 100644 --- a/hub/integrations/pytorch.py +++ b/hub/integrations/pytorch.py @@ -279,7 +279,9 @@ def _get_data_from_chunks( actual_index = self.index_offset + i # TODO change this once it returns list/set of str chunk_engine = self.all_chunk_engines[key] - chunk_id, local_index = chunk_engine.chunk_id_encoder.get(actual_index, return_local_sample_index=True) + chunk_id, local_index = chunk_engine.chunk_id_encoder.get( + actual_index, return_local_sample_index=True + ) chunk_name = chunk_engine.chunk_id_encoder.name_from_id(chunk_id) # type: ignore if chunk_name not in chunk_map: self.last_index_meta[key] = i - 1 From 7fd01931b5727fe1b2fcddfb7e4c3ffcc9f816f8 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sun, 11 Jul 2021 15:43:51 +0530 Subject: [PATCH 37/79] dsiter --- hub/api/dataset.py | 17 +++++++++++++---- hub/api/tensor.py | 12 ++++++++++++ hub/api/tests/test_api.py | 27 +++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 4 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 1c839b1385..6bb123e95a 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -38,6 +38,7 @@ def __init__( storage: Optional[StorageProvider] = None, public: Optional[bool] = True, token: Optional[str] = None, + _tensors: Optional[Dict[str, Tensor]] = None ): """Initializes a new or existing dataset. 
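Note what the pytorch loader change above buys: a single `get(..., return_local_sample_index=True)` call now returns both the chunk ID and the sample's position inside that chunk, instead of two separate encoder walks. The local index falls out of the same table row, as the global index minus the previous row's last index, minus one. A rough standalone sketch, again assuming a single flat `(chunk_id, last_global_index)` table:

import bisect

def chunk_and_local(entries, g):
    # entries: [(chunk_id, last_global_index), ...] sorted by last index
    lasts = [last for _, last in entries]
    row = bisect.bisect_left(lasts, g)
    prev_last = entries[row - 1][1] if row > 0 else -1
    return entries[row][0], g - prev_last - 1

entries = [(101, 9), (102, 19)]  # chunk 101 holds samples 0..9, 102 holds 10..19
assert chunk_and_local(entries, 0) == (101, 0)
assert chunk_and_local(entries, 10) == (102, 0)
assert chunk_and_local(entries, 19) == (102, 9)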
@@ -89,7 +90,7 @@ def __init__( self.storage.autoflush = True self.index = index or Index() - self.tensors: Dict[str, Tensor] = {} + self.tensors: Dict[str, Tensor] = _tensors if _tensors else {} self._token = token @@ -213,6 +214,14 @@ def __setattr__(self, name: str, value): def __iter__(self): for i in range(len(self)): yield self[i] + return + tensor_names = list(self.tensors) + tensors_sliced = [t[self.index][:len(self)] for t in self.tensors.values()] + num_tensors = len(tensor_names) + for tensors in zip(*tensors_sliced): + tensors = {tensor_names[i]: tensors[i] for i in range(num_tensors)} + ds = Dataset(read_only=True, storage=self.storage, _tensors=tensors) + yield ds def _load_meta(self): meta_key = get_dataset_meta_key() @@ -220,9 +229,9 @@ def _load_meta(self): if dataset_exists(self.storage): logger.info(f"{self.path} loaded successfully.") self.meta = self.storage.get_cachable(meta_key, DatasetMeta) - - for tensor_name in self.meta.tensors: - self.tensors[tensor_name] = Tensor(tensor_name, self.storage) + if not self.tensors: + for tensor_name in self.meta.tensors: + self.tensors[tensor_name] = Tensor(tensor_name, self.storage) elif len(self.storage) > 0: # dataset does not exist, but the path was not empty diff --git a/hub/api/tensor.py b/hub/api/tensor.py index 3af85e05e3..53360f9b38 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -10,6 +10,8 @@ from hub.util.exceptions import TensorDoesNotExistError, InvalidKeyTypeError from hub.core.index import Index +import warnings + class Tensor: def __init__( @@ -44,6 +46,7 @@ def __init__( self.chunk_engine = ChunkEngine(self.key, self.storage) self._sample: Optional[Tuple(int, int)] = None + self._index_history: List[int] = [] def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): """Extends the end of the tensor by appending multiple elements from a sequence. 
Accepts a sequence, a single batched numpy array, @@ -189,6 +192,15 @@ def __getitem__( ): if not isinstance(item, (int, slice, list, tuple, Index)): raise InvalidKeyTypeError(item) + hist = self._index_history + if isinstance(item, int): + hist.append(item) + if len(hist) == 100: + if hist == list(range(hist[0], hist[-1] + 1, hist[1] - hist[0])): + warnings.warn("Use `for i, sample in enumerate(tensor): ` instead of `for i in range(len(tensor)): ` to iterate through the tensor.") + hist.clear() + else: + self._index_history.clear() return Tensor(self.key, self.storage, index=self.index[item]) def __setitem__(self, item: Union[int, slice], value: np.ndarray): diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 2721179ed7..f70ca54901 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -17,6 +17,8 @@ MAX_FLOAT_DTYPE = np.float_.__name__ + + def test_persist_local(local_storage): ds = Dataset(local_storage.root, local_cache_size=512) ds.create_tensor("image") @@ -465,3 +467,28 @@ def test_hub_cloud_dataset(): np.testing.assert_array_equal(ds.image[i].numpy(), i * np.ones((100, 100))) ds.delete() + + +def test_iter_perf(memory_ds: Dataset): + orig_searchsorted = np.searchsorted + call_count = {"n": 0} + + def searchsorted(*args, **kwargs): + call_count["n"] += 1 + orig_searchsorted(*args, **kwargs) + + np.searchsorted = searchsorted + ds = memory_ds + ds.create_tensor("x") + ds.create_tensor("y") + for _ in range(10): + ds.x.append(np.zeros((10, 10))) + ds.y.append(np.ones((10, 10))) + + for i, sub_ds in enumerate(ds): + np.testing.assert_array_equal(sub_ds.x.numpy(), np.zeros((10, 10))) + np.testing.assert_array_equal(sub_ds.y.numpy(), np.ones((10, 10))) + + assert call_count["n"] == 4 + + np.searchsorted = orig_searchsorted From b63c56506f057b84dc9f9fa2f6d91457059cef6b Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sun, 11 Jul 2021 15:51:58 +0530 Subject: [PATCH 38/79] fix test --- hub/api/tests/test_api.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index f70ca54901..0bc0020be0 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -475,9 +475,9 @@ def test_iter_perf(memory_ds: Dataset): def searchsorted(*args, **kwargs): call_count["n"] += 1 - orig_searchsorted(*args, **kwargs) + return orig_searchsorted(*args, **kwargs) + - np.searchsorted = searchsorted ds = memory_ds ds.create_tensor("x") ds.create_tensor("y") @@ -485,9 +485,9 @@ def searchsorted(*args, **kwargs): ds.x.append(np.zeros((10, 10))) ds.y.append(np.ones((10, 10))) + np.searchsorted = searchsorted for i, sub_ds in enumerate(ds): - np.testing.assert_array_equal(sub_ds.x.numpy(), np.zeros((10, 10))) - np.testing.assert_array_equal(sub_ds.y.numpy(), np.ones((10, 10))) + print(i, sub_ds) assert call_count["n"] == 4 From ae3c17d5b2a524465a239c6f5692b3241750ae97 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sun, 11 Jul 2021 15:55:42 +0530 Subject: [PATCH 39/79] fix test --- hub/api/dataset.py | 3 --- hub/api/tests/test_api.py | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 6bb123e95a..a2f78b99a2 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -212,9 +212,6 @@ def __setattr__(self, name: str, value): return super().__setattr__(name, value) def __iter__(self): - for i in range(len(self)): - yield self[i] - return tensor_names = list(self.tensors) tensors_sliced = [t[self.index][:len(self)] for t in 
self.tensors.values()] num_tensors = len(tensor_names) diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 0bc0020be0..5e45fcd295 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -477,7 +477,6 @@ def searchsorted(*args, **kwargs): call_count["n"] += 1 return orig_searchsorted(*args, **kwargs) - ds = memory_ds ds.create_tensor("x") ds.create_tensor("y") @@ -487,7 +486,8 @@ def searchsorted(*args, **kwargs): np.searchsorted = searchsorted for i, sub_ds in enumerate(ds): - print(i, sub_ds) + np.testing.assert_array_equal(sub_ds.x.numpy(), np.zeros((10, 10))) + np.testing.assert_array_equal(sub_ds.y.numpy(), np.ones((10, 10))) assert call_count["n"] == 4 From 069a9f69f11fdc000d65ec0561780a612373321f Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sun, 11 Jul 2021 16:51:04 +0530 Subject: [PATCH 40/79] ds iter fixes --- hub/api/dataset.py | 6 +++++- hub/api/tensor.py | 1 - hub/api/tests/test_api.py | 10 ++++++++-- hub/core/chunk_engine.py | 3 --- hub/core/meta/encode/chunk_id.py | 1 - 5 files changed, 13 insertions(+), 8 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index a2f78b99a2..9005414984 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -91,7 +91,9 @@ def __init__( self.index = index or Index() self.tensors: Dict[str, Tensor] = _tensors if _tensors else {} - + if self.tensors: + for t in self.tensors.values(): + assert t._sample self._token = token if self.path.startswith("hub://"): @@ -129,6 +131,8 @@ def __getitem__( if item not in self.tensors: raise TensorDoesNotExistError(item) else: + if self.index.is_trivial(): + return self.tensors[item] return self.tensors[item][self.index] elif isinstance(item, (int, slice, list, tuple, Index)): return Dataset( diff --git a/hub/api/tensor.py b/hub/api/tensor.py index 53360f9b38..8e59913377 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -232,7 +232,6 @@ def numpy(self, aslist=False) -> Union[np.ndarray, List[np.ndarray]]: chunk_id, local_sample_index = self._sample chunk = self.chunk_engine.get_chunk_from_id(chunk_id) return self.chunk_engine.read_sample_from_chunk(chunk, local_sample_index) - return self.chunk_engine.numpy(self.index, aslist=aslist) def __str__(self): diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 5e45fcd295..bd5f577f1f 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -468,12 +468,14 @@ def test_hub_cloud_dataset(): ds.delete() - +@pytest.mark.xfail(raises=AssertionError, reason="future") def test_iter_perf(memory_ds: Dataset): orig_searchsorted = np.searchsorted call_count = {"n": 0} - + callers = [] def searchsorted(*args, **kwargs): + import inspect + callers.append(inspect.stack()[1][3]) call_count["n"] += 1 return orig_searchsorted(*args, **kwargs) @@ -486,6 +488,10 @@ def searchsorted(*args, **kwargs): np.searchsorted = searchsorted for i, sub_ds in enumerate(ds): + assert sub_ds.x._sample + assert sub_ds.y._sample + sub_ds.x.numpy() + sub_ds.y.numpy() np.testing.assert_array_equal(sub_ds.x.numpy(), np.zeros((10, 10))) np.testing.assert_array_equal(sub_ds.y.numpy(), np.ones((10, 10))) diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index b50eadb850..57971aaf8d 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -341,7 +341,6 @@ def numpy( Returns: Union[np.ndarray, Sequence[np.ndarray]]: Either a list of numpy arrays or a single numpy array (depending on the `aslist` argument). 
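The `test_iter_perf` churn across the last few patches is worth spelling out: the test quantifies iteration cost by swapping `np.searchsorted` for a counting wrapper, and the first version silently dropped the wrapped call's result — the missing `return` that PATCH 38/79 adds. A standalone version of the pattern, with a `try`/`finally` so the global is restored even on failure (`unittest.mock.patch` would be the sturdier tool in new code):

import numpy as np

calls = {"n": 0}
orig_searchsorted = np.searchsorted

def counting_searchsorted(*args, **kwargs):
    calls["n"] += 1
    return orig_searchsorted(*args, **kwargs)  # forward the result!

np.searchsorted = counting_searchsorted
try:
    assert np.searchsorted([1, 2, 3], 2) == 1
    assert calls["n"] == 1
finally:
    np.searchsorted = orig_searchsorted  # always restore the global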
""" - length = self.num_samples enc = self.chunk_id_encoder last_shape = None @@ -373,8 +372,6 @@ def read_sample_from_chunk( enc = self.chunk_id_encoder - # buffer = chunk.memoryview_data - # local_sample_index = enc.get_local_sample_index(global_sample_index) shape = chunk.shapes_encoder[local_sample_index] sb, eb = chunk.byte_positions_encoder[local_sample_index] diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 1472e5d053..1403282a20 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -309,7 +309,6 @@ def get( Tuple[Tuple[ENCODING_DTYPE], Optional[Tuple[int]]]: Returns the chunk ID for `sample_index`. If `return_chunk_index` is True, there will be 2 values. The second one being the chunk's index. """ - if self.num_samples == 0: raise IndexError( f"Index {sample_index} is out of bounds for an empty chunk names encoding." From d0306a5627a85af87df6cdeee78bed2334512954 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sun, 11 Jul 2021 19:03:35 +0530 Subject: [PATCH 41/79] tests --- hub/api/dataset.py | 12 +++++------- hub/api/tensor.py | 11 +++++++++-- hub/api/tests/test_api.py | 19 +++++++++++-------- 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 9005414984..bdb7206da2 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -38,7 +38,7 @@ def __init__( storage: Optional[StorageProvider] = None, public: Optional[bool] = True, token: Optional[str] = None, - _tensors: Optional[Dict[str, Tensor]] = None + _tensors: Optional[Dict[str, Tensor]] = None, ): """Initializes a new or existing dataset. @@ -60,7 +60,7 @@ def __init__( Use this if you want to specify the storage provider object manually instead of using a tag or url to generate it. public (bool, optional): Applied only if storage is Hub cloud storage and a new Dataset is being created. Defines if the dataset will have public access. token (str, optional): Activeloop token, used for fetching credentials for Hub datasets. This is optional, tokens are normally autogenerated. - + _tensors: (list, optional): Internal. Raises: ValueError: If an existing local path is given, it must be a directory. @@ -90,10 +90,8 @@ def __init__( self.storage.autoflush = True self.index = index or Index() - self.tensors: Dict[str, Tensor] = _tensors if _tensors else {} - if self.tensors: - for t in self.tensors.values(): - assert t._sample + self.tensors: Dict[str, Tensor] = _tensors if _tensors else {} + self._token = token if self.path.startswith("hub://"): @@ -217,7 +215,7 @@ def __setattr__(self, name: str, value): def __iter__(self): tensor_names = list(self.tensors) - tensors_sliced = [t[self.index][:len(self)] for t in self.tensors.values()] + tensors_sliced = [t[self.index][: len(self)] for t in self.tensors.values()] num_tensors = len(tensor_names) for tensors in zip(*tensors_sliced): tensors = {tensor_names[i]: tensors[i] for i in range(num_tensors)} diff --git a/hub/api/tensor.py b/hub/api/tensor.py index 8e59913377..bde3799e31 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -197,7 +197,9 @@ def __getitem__( hist.append(item) if len(hist) == 100: if hist == list(range(hist[0], hist[-1] + 1, hist[1] - hist[0])): - warnings.warn("Use `for i, sample in enumerate(tensor): ` instead of `for i in range(len(tensor)): ` to iterate through the tensor.") + warnings.warn( + "Use `for i, sample in enumerate(tensor): ` instead of `for i in range(len(tensor)): ` to iterate through the tensor." 
+ ) hist.clear() else: self._index_history.clear() @@ -231,7 +233,12 @@ def numpy(self, aslist=False) -> Union[np.ndarray, List[np.ndarray]]: if self._sample: chunk_id, local_sample_index = self._sample chunk = self.chunk_engine.get_chunk_from_id(chunk_id) - return self.chunk_engine.read_sample_from_chunk(chunk, local_sample_index) + ret = self.chunk_engine.read_sample_from_chunk(chunk, local_sample_index) + if aslist: + ret = list(ret) + for entry in self.index.values[1:]: + ret = ret[entry.value] + return ret return self.chunk_engine.numpy(self.index, aslist=aslist) def __str__(self): diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index bd5f577f1f..2f4728659d 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -17,8 +17,6 @@ MAX_FLOAT_DTYPE = np.float_.__name__ - - def test_persist_local(local_storage): ds = Dataset(local_storage.root, local_cache_size=512) ds.create_tensor("image") @@ -468,13 +466,15 @@ def test_hub_cloud_dataset(): ds.delete() -@pytest.mark.xfail(raises=AssertionError, reason="future") + def test_iter_perf(memory_ds: Dataset): orig_searchsorted = np.searchsorted call_count = {"n": 0} callers = [] + def searchsorted(*args, **kwargs): import inspect + callers.append(inspect.stack()[1][3]) call_count["n"] += 1 return orig_searchsorted(*args, **kwargs) @@ -488,13 +488,16 @@ def searchsorted(*args, **kwargs): np.searchsorted = searchsorted for i, sub_ds in enumerate(ds): - assert sub_ds.x._sample - assert sub_ds.y._sample - sub_ds.x.numpy() - sub_ds.y.numpy() np.testing.assert_array_equal(sub_ds.x.numpy(), np.zeros((10, 10))) np.testing.assert_array_equal(sub_ds.y.numpy(), np.ones((10, 10))) - assert call_count["n"] == 4 + assert call_count["n"] == 44 + + for _ in range(100): + ds.x.append(np.zeros((3, 2))) + + with pytest.warns(): + for i in range(len(ds.x)): + sample = ds.x[i] np.searchsorted = orig_searchsorted From ec4516bd99ec1abcc1d92e97aabfb192551dbd31 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sun, 11 Jul 2021 19:11:18 +0530 Subject: [PATCH 42/79] test --- hub/api/dataset.py | 4 +++- hub/api/tests/test_api.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index bdb7206da2..baa8413364 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -219,7 +219,9 @@ def __iter__(self): num_tensors = len(tensor_names) for tensors in zip(*tensors_sliced): tensors = {tensor_names[i]: tensors[i] for i in range(num_tensors)} - ds = Dataset(read_only=True, storage=self.storage, _tensors=tensors) + ds = Dataset( + read_only=self.read_only, storage=self.storage, _tensors=tensors + ) yield ds def _load_meta(self): diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 2f4728659d..5e5b3a73cf 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -496,7 +496,7 @@ def searchsorted(*args, **kwargs): for _ in range(100): ds.x.append(np.zeros((3, 2))) - with pytest.warns(): + with pytest.warns(UserWarning, match=r"Use *"): for i in range(len(ds.x)): sample = ds.x[i] From f6d71f0d2ef2f3a6f6cdbc3859a5a4f4a27d8edd Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sun, 11 Jul 2021 19:15:32 +0530 Subject: [PATCH 43/79] format --- hub/api/tensor.py | 8 ++++++-- hub/api/tests/test_api.py | 4 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/hub/api/tensor.py b/hub/api/tensor.py index bde3799e31..8c7c954134 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -194,11 +194,15 @@ def __getitem__( raise InvalidKeyTypeError(item) 
hist = self._index_history if isinstance(item, int): + if item < 0: + item += len(self) hist.append(item) if len(hist) == 100: - if hist == list(range(hist[0], hist[-1] + 1, hist[1] - hist[0])): + if hist[1] - hist[0] > 1 and hist == list( + range(hist[0], hist[-1] + 1, hist[1] - hist[0]) + ): warnings.warn( - "Use `for i, sample in enumerate(tensor): ` instead of `for i in range(len(tensor)): ` to iterate through the tensor." + "Use `for i, sample in enumerate(tensor): ` instead of `for i in range(len(tensor)): ` to efficiently iterate through the tensor." ) hist.clear() else: diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 5e5b3a73cf..0505a21b83 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -493,11 +493,11 @@ def searchsorted(*args, **kwargs): assert call_count["n"] == 44 - for _ in range(100): + for _ in range(200): ds.x.append(np.zeros((3, 2))) with pytest.warns(UserWarning, match=r"Use *"): - for i in range(len(ds.x)): + for i in range(0, len(ds.x), 2): sample = ds.x[i] np.searchsorted = orig_searchsorted From 1a107ca29847eae0407834eef17b5fe794893b60 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sun, 11 Jul 2021 19:54:07 +0530 Subject: [PATCH 44/79] pytorch training optims --- hub/api/tensor.py | 16 ---------------- hub/api/tests/test_api.py | 4 ---- hub/core/meta/encode/chunk_id.py | 31 ++++++++++++++++--------------- 3 files changed, 16 insertions(+), 35 deletions(-) diff --git a/hub/api/tensor.py b/hub/api/tensor.py index 8c7c954134..c91e418cd2 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -46,7 +46,6 @@ def __init__( self.chunk_engine = ChunkEngine(self.key, self.storage) self._sample: Optional[Tuple(int, int)] = None - self._index_history: List[int] = [] def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): """Extends the end of the tensor by appending multiple elements from a sequence. Accepts a sequence, a single batched numpy array, @@ -192,21 +191,6 @@ def __getitem__( ): if not isinstance(item, (int, slice, list, tuple, Index)): raise InvalidKeyTypeError(item) - hist = self._index_history - if isinstance(item, int): - if item < 0: - item += len(self) - hist.append(item) - if len(hist) == 100: - if hist[1] - hist[0] > 1 and hist == list( - range(hist[0], hist[-1] + 1, hist[1] - hist[0]) - ): - warnings.warn( - "Use `for i, sample in enumerate(tensor): ` instead of `for i in range(len(tensor)): ` to efficiently iterate through the tensor." 
- ) - hist.clear() - else: - self._index_history.clear() return Tensor(self.key, self.storage, index=self.index[item]) def __setitem__(self, item: Union[int, slice], value: np.ndarray): diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 0505a21b83..afd4994c87 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -496,8 +496,4 @@ def searchsorted(*args, **kwargs): for _ in range(200): ds.x.append(np.zeros((3, 2))) - with pytest.warns(UserWarning, match=r"Use *"): - for i in range(0, len(ds.x), 2): - sample = ds.x[i] - np.searchsorted = orig_searchsorted diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 1403282a20..7e94a7876a 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -317,20 +317,21 @@ def get( if sample_index < 0: sample_index = (self.num_samples) + sample_index - if ( - self._prev_sample_index is not None - and sample_index == self._prev_sample_index + 1 - ): - if sample_index > self._prev_entry[LAST_INDEX_INDEX]: # type: ignore - chunk_index = self._incr_2d(*self._prev_chunk_index) # type: ignore - current_entry = self._get_entry_2d(*chunk_index) - chunk_id = current_entry[CHUNK_ID_INDEX] - self._prev_entry = current_entry - self._prev_chunk_id = chunk_id - else: + chunk_id = None + if self._prev_sample_index is not None and sample_index >= self._prev_sample_index: + if sample_index <= self._prev_entry[LAST_INDEX_INDEX]: chunk_id = self._prev_chunk_id - chunk_index = self._prev_chunk_index # type: ignore - else: + chunk_index = self._prev_chunk_index + current_entry = self._prev_entry + else: + next_index = self._incr_2d(*self._prev_chunk_index) # type: ignore + next_entry = self._get_entry_2d(*next_index) + if sample_index <= next_entry[LAST_INDEX_INDEX]: + chunk_index = next_index + current_entry = next_entry + chunk_id = current_entry[CHUNK_ID_INDEX] + + if chunk_id is None: self._flush_buffer() last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._data] shard_index = np.searchsorted(last_idxs, sample_index) @@ -339,11 +340,11 @@ def get( current_entry = shard[idx] chunk_id = current_entry[CHUNK_ID_INDEX] chunk_index = (shard_index, idx) - self._prev_entry = current_entry - self._prev_chunk_id = chunk_id self._prev_sample_index = sample_index self._prev_chunk_index = chunk_index + self._prev_entry = current_entry + self._prev_chunk_id = chunk_id if not return_chunk_index and not return_local_sample_index: return chunk_id From 0cefcd2ad9680cad2cab62fd3bce666733ccacaf Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sun, 11 Jul 2021 20:07:29 +0530 Subject: [PATCH 45/79] rem bad checks --- hub/api/tests/test_api.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index afd4994c87..ed115dba6c 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -493,7 +493,4 @@ def searchsorted(*args, **kwargs): assert call_count["n"] == 44 - for _ in range(200): - ds.x.append(np.zeros((3, 2))) - np.searchsorted = orig_searchsorted From 318d49684b98b64bfafedabf585c4b62f7dfe326 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 12 Jul 2021 13:41:51 +0530 Subject: [PATCH 46/79] format --- hub/core/meta/encode/chunk_id.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 7e94a7876a..4b465a7da4 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -318,7 +318,10 @@ def get( 
sample_index = (self.num_samples) + sample_index chunk_id = None - if self._prev_sample_index is not None and sample_index >= self._prev_sample_index: + if ( + self._prev_sample_index is not None + and sample_index >= self._prev_sample_index + ): if sample_index <= self._prev_entry[LAST_INDEX_INDEX]: chunk_id = self._prev_chunk_id chunk_index = self._prev_chunk_index From b1591c4dd993f046a044017233095229dff9fbd5 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 12 Jul 2021 15:31:15 +0530 Subject: [PATCH 47/79] format + smoll change in encoding format --- hub/api/dataset.py | 4 +-- hub/api/tensor.py | 6 ++-- hub/api/tests/test_api.py | 10 +++--- hub/core/lowlevel.py | 60 +++++--------------------------- hub/core/meta/encode/chunk_id.py | 22 +++++++----- hub/core/tests/test_lowlevel.py | 51 +++++++++++++++++++++++++++ 6 files changed, 83 insertions(+), 70 deletions(-) create mode 100644 hub/core/tests/test_lowlevel.py diff --git a/hub/api/dataset.py b/hub/api/dataset.py index baa8413364..c372addd10 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -1,6 +1,6 @@ from hub.core.tensor import create_tensor from hub.constants import DEFAULT_HTYPE -from typing import Callable, Dict, Optional, Union, Tuple, List +from typing import Callable, Dict, Optional, Union, Tuple, List, Iterator import numpy as np from hub.api.tensor import Tensor @@ -213,7 +213,7 @@ def __setattr__(self, name: str, value): else: return super().__setattr__(name, value) - def __iter__(self): + def __iter__(self) -> Iterator["Dataset"]: tensor_names = list(self.tensors) tensors_sliced = [t[self.index][: len(self)] for t in self.tensors.values()] num_tensors = len(tensor_names) diff --git a/hub/api/tensor.py b/hub/api/tensor.py index c91e418cd2..6d825c48a9 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -1,6 +1,6 @@ from hub.util.keys import tensor_exists from hub.core.sample import Sample # type: ignore -from typing import List, Sequence, Union, Optional, Tuple, Dict +from typing import List, Sequence, Union, Optional, Tuple, Dict, Iterator from hub.util.shape import ShapeInterval import numpy as np @@ -45,7 +45,7 @@ def __init__( self.chunk_engine = ChunkEngine(self.key, self.storage) - self._sample: Optional[Tuple(int, int)] = None + self._sample: Optional[Tuple[int, int]] = None def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): """Extends the end of the tensor by appending multiple elements from a sequence. 
Accepts a sequence, a single batched numpy array, @@ -196,7 +196,7 @@ def __getitem__( def __setitem__(self, item: Union[int, slice], value: np.ndarray): raise NotImplementedError("Tensor update not currently supported!") - def __iter__(self): + def __iter__(self) -> Iterator["Tensor"]: for i, (chunk_id, local_sample_index) in enumerate( self.chunk_engine.chunk_id_encoder.iter(self.index.values[0].value) ): diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index ed115dba6c..fd274359bd 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -195,7 +195,7 @@ def test_empty_samples(ds: Dataset): # test indexing individual empty samples with numpy while looping, this may seem redundant but this was failing before for actual_sample, expected in zip(ds, expected_list): - actual = actual_sample.with_empty.numpy() + actual = actual_sample["with_empty"].numpy() np.testing.assert_array_equal(actual, expected) @@ -483,13 +483,13 @@ def searchsorted(*args, **kwargs): ds.create_tensor("x") ds.create_tensor("y") for _ in range(10): - ds.x.append(np.zeros((10, 10))) - ds.y.append(np.ones((10, 10))) + ds["x"].append(np.zeros((10, 10))) + ds["y"].append(np.ones((10, 10))) np.searchsorted = searchsorted for i, sub_ds in enumerate(ds): - np.testing.assert_array_equal(sub_ds.x.numpy(), np.zeros((10, 10))) - np.testing.assert_array_equal(sub_ds.y.numpy(), np.ones((10, 10))) + np.testing.assert_array_equal(sub_ds["x"].numpy(), np.zeros((10, 10))) + np.testing.assert_array_equal(sub_ds["y"].numpy(), np.ones((10, 10))) assert call_count["n"] == 44 diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 8ae4c201ef..4dfd9dc848 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -129,11 +129,11 @@ def _infer_chunk_num_bytes( # assert byte_positions.ndim == 2 # version_slice_size = 1 + len(version) # shape_info_slice_size = 4 + 4 + shape_info.nbytes - # byte_positions_slice_size = 4 + 4 + byte_positions.nbytes + # byte_positions_slice_size = 4 + byte_positions.nbytes # data_slice_size = sum(map(len, data)) if len_data is None: len_data = sum(map(len, data)) # type: ignore - return len(version) + shape_info.nbytes + byte_positions.nbytes + len_data + 17 + return len(version) + shape_info.nbytes + byte_positions.nbytes + len_data + 13 def encode_chunk( @@ -141,9 +141,12 @@ def encode_chunk( shape_info: np.ndarray, byte_positions: np.ndarray, data: Union[Sequence[bytes], Sequence[memoryview]], - len_data: Optional[int], + len_data: Optional[int] = None, ) -> memoryview: + if len_data is None: + len_data = sum(map(len, data)) + flatbuff = malloc( _infer_chunk_num_bytes(version, shape_info, byte_positions, data, len_data) ) @@ -164,7 +167,6 @@ def encode_chunk( # write byte positions ptr = _write_pybytes(ptr, np.int32(byte_positions.shape[0]).tobytes()) - ptr = _write_pybytes(ptr, np.int32(byte_positions.shape[1]).tobytes()) memcpy(ptr, _ndarray_to_ptr(byte_positions)) ptr += byte_positions.nbytes @@ -209,8 +211,8 @@ def decode_chunk( # read byte positions byte_positions_dtype = np.dtype(hub.constants.ENCODING_DTYPE) - byte_positions_shape = np.frombuffer(ptr.memoryview[:8], dtype=np.int32) - ptr += 8 + byte_positions_shape = (int(np.frombuffer(ptr.memoryview[:4], dtype=np.int32)), 3) + ptr += 4 byte_positions_data_size = int( np.prod(byte_positions_shape) * byte_positions_dtype.itemsize ) @@ -268,49 +270,3 @@ def decode_chunkids(buff: bytes) -> Tuple[str, np.ndarray]: ) return version, ids - - -def test_chunk_encoding(): - version = hub.__version__ - shape_info = 
np.cast[hub.constants.ENCODING_DTYPE]( - np.random.randint(100, size=(17, 63)) - ) - byte_positions = np.cast[hub.constants.ENCODING_DTYPE]( - np.random.randint(100, size=(31, 79)) - ) - data = [b"1234" * 7, b"abcdefg" * 8, b"qwertyuiop" * 9] - encoded = bytes(encode_chunk(version, shape_info, byte_positions, data)) - - # from bytes - decoded = decode_chunk(encoded) - version2, shape_info2, byte_positions2, data2 = decoded - assert version2 == version - np.testing.assert_array_equal(shape_info, shape_info2) - np.testing.assert_array_equal(byte_positions, byte_positions2) - assert b"".join(data) == bytes(data2) - - # from pointer - buff = Pointer(c_array=(ctypes.c_byte * len(encoded))(*encoded)) - decoded = decode_chunk(buff) - version2, shape_info2, byte_positions2, data2 = decoded - assert version2 == version - np.testing.assert_array_equal(shape_info, shape_info2) - np.testing.assert_array_equal(byte_positions, byte_positions2) - assert b"".join(data) == bytes(data2) - - -def test_chunkids_encoding(): - version = hub.__version__ - shards = [ - np.cast[hub.constants.ENCODING_DTYPE](np.random.randint(100, size=(100, 2))) - ] - encoded = encode_chunkids(version, shards) - decoded = decode_chunkids(encoded) - version2, ids = decoded - assert version2 == version - np.testing.assert_array_equal(np.concatenate(shards), ids) - - -if __name__ == "__main__": - test_chunk_encoding() - test_chunkids_encoding() diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 4b465a7da4..f5368fed62 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -3,7 +3,7 @@ import hub from hub.core.storage.cachable import Cachable from io import BytesIO -from typing import Optional, Tuple, Union, List +from typing import Optional, Tuple, Union, List, Iterable import numpy as np from uuid import uuid4 from hub.core.lowlevel import encode_chunkids, decode_chunkids @@ -322,7 +322,7 @@ def get( self._prev_sample_index is not None and sample_index >= self._prev_sample_index ): - if sample_index <= self._prev_entry[LAST_INDEX_INDEX]: + if sample_index <= self._prev_entry[LAST_INDEX_INDEX]: # type: ignore chunk_id = self._prev_chunk_id chunk_index = self._prev_chunk_index current_entry = self._prev_entry @@ -355,8 +355,8 @@ def get( if return_chunk_index: ret.append(chunk_index) if return_local_sample_index: - if any(chunk_index): - prev_entry = self._get_entry_2d(*self._decr_2d(*chunk_index)) + if any(chunk_index): # type: ignore + prev_entry = self._get_entry_2d(*self._decr_2d(*chunk_index)) # type: ignore local_sample_index = ( sample_index - int(prev_entry[LAST_INDEX_INDEX]) - 1 ) @@ -366,13 +366,19 @@ def get( return tuple(ret) # type: ignore - def iter(self, index: Union[int, slice, tuple] = slice(None)): + def iter( + self, index: Union[int, slice, tuple] = slice(None) + ) -> Iterable[Tuple[int, int]]: if isinstance(index, int): - yield self.get(index, return_local_sample_index=True) + yield self.get(index, return_local_sample_index=True) # type: ignore elif isinstance(index, slice): start = 0 if index.start is None else index.start stop = self.num_samples if index.stop is None else index.stop step = 1 if index.step is None else index.step + if start < 0: + start += self.num_samples + if stop < 0: + stop += self.num_samples assert isinstance(start, int) assert isinstance(stop, int) assert isinstance(step, int) @@ -392,12 +398,12 @@ def iter(self, index: Union[int, slice, tuple] = slice(None)): chunk_id, (shard_index, chunk_index), local_sample_index = 
self.get( # type: ignore start, return_chunk_index=True, return_local_sample_index=True ) - shard = self._data[shard_index] yield chunk_id, local_sample_index n += 1 if n == total: return ctr = Counter(step) + shard = self._data[shard_index] if forward: last_index = int(shard[chunk_index, LAST_INDEX_INDEX]) for i in range(local_sample_index + 1, last_index + 1): @@ -472,7 +478,7 @@ def iter(self, index: Union[int, slice, tuple] = slice(None)): elif isinstance(index, tuple): for i in index: # Random access - yield self.get(i, return_local_sample_index=True) + yield self.get(i, return_local_sample_index=True) # type: ignore class Counter: diff --git a/hub/core/tests/test_lowlevel.py b/hub/core/tests/test_lowlevel.py new file mode 100644 index 0000000000..5381aba075 --- /dev/null +++ b/hub/core/tests/test_lowlevel.py @@ -0,0 +1,51 @@ +from hub.core.lowlevel import ( + Pointer, + encode_chunk, + decode_chunk, + encode_chunkids, + decode_chunkids, +) +import numpy as np +import ctypes +import hub + + +def test_chunk_encoding(): + version = hub.__version__ + shape_info = np.cast[hub.constants.ENCODING_DTYPE]( + np.random.randint(100, size=(17, 63)) + ) + byte_positions = np.cast[hub.constants.ENCODING_DTYPE]( + np.random.randint(100, size=(31, 3)) + ) + data = [b"1234" * 7, b"abcdefg" * 8, b"qwertyuiop" * 9] + encoded = bytes(encode_chunk(version, shape_info, byte_positions, data)) + + # from bytes + decoded = decode_chunk(encoded) + version2, shape_info2, byte_positions2, data2 = decoded + assert version2 == version + np.testing.assert_array_equal(shape_info, shape_info2) + np.testing.assert_array_equal(byte_positions, byte_positions2) + assert b"".join(data) == bytes(data2) + + # from pointer + buff = Pointer(c_array=(ctypes.c_byte * len(encoded))(*encoded)) + decoded = decode_chunk(buff) + version2, shape_info2, byte_positions2, data2 = decoded + assert version2 == version + np.testing.assert_array_equal(shape_info, shape_info2) + np.testing.assert_array_equal(byte_positions, byte_positions2) + assert b"".join(data) == bytes(data2) + + +def test_chunkids_encoding(): + version = hub.__version__ + shards = [ + np.cast[hub.constants.ENCODING_DTYPE](np.random.randint(100, size=(100, 2))) + ] + encoded = encode_chunkids(version, shards) + decoded = decode_chunkids(encoded) + version2, ids = decoded + assert version2 == version + np.testing.assert_array_equal(np.concatenate(shards), ids) From 8f475a913b342ca781399643e7d2e8dc7213393e Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Tue, 13 Jul 2021 00:23:50 +0530 Subject: [PATCH 48/79] minimize searchsorted calls --- hub/api/tests/test_api.py | 2 +- hub/core/meta/encode/chunk_id.py | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index fd274359bd..64749dfe95 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -491,6 +491,6 @@ def searchsorted(*args, **kwargs): np.testing.assert_array_equal(sub_ds["x"].numpy(), np.zeros((10, 10))) np.testing.assert_array_equal(sub_ds["y"].numpy(), np.ones((10, 10))) - assert call_count["n"] == 44 + assert call_count["n"] == 40 np.searchsorted = orig_searchsorted diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index f5368fed62..fc51755159 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -383,6 +383,10 @@ def iter( assert isinstance(stop, int) assert isinstance(step, int) assert step != 0 + if start < 0: + start += self.num_samples + 
if stop < 0: + stop += self.num_samples if step > 0: total = math.ceil((stop - start) / step) forward = True @@ -395,15 +399,23 @@ def iter( return n = 0 self._flush_buffer() - chunk_id, (shard_index, chunk_index), local_sample_index = self.get( # type: ignore - start, return_chunk_index=True, return_local_sample_index=True - ) + if start: + chunk_id, (shard_index, chunk_index), local_sample_index = self.get( # type: ignore + start, return_chunk_index=True, return_local_sample_index=True + ) + shard = self._data[shard_index] + else: + shard_index = 0 + chunk_index = 0 + shard = self._data[0] + local_sample_index = 0 + chunk_id = shard[0, CHUNK_ID_INDEX] yield chunk_id, local_sample_index n += 1 if n == total: return ctr = Counter(step) - shard = self._data[shard_index] + if forward: last_index = int(shard[chunk_index, LAST_INDEX_INDEX]) for i in range(local_sample_index + 1, last_index + 1): From a7dd7f173fb57e0ba6e7bbec2bf07a8b24e94de2 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Tue, 13 Jul 2021 14:16:06 +0530 Subject: [PATCH 49/79] refac chunk_id.py --- hub/core/meta/encode/chunk_id.py | 221 +++++++++++++++++-------------- 1 file changed, 122 insertions(+), 99 deletions(-) diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index fc51755159..fec2fc58e0 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -366,38 +366,130 @@ def get( return tuple(ret) # type: ignore + def _preproc_slice(self, index: slice) -> Tuple[int, int, int, int, bool]: + start = 0 if index.start is None else index.start + stop = self.num_samples if index.stop is None else index.stop + step = 1 if index.step is None else index.step + assert isinstance(start, int) + assert isinstance(stop, int) + assert isinstance(step, int) + if start < 0: + start += self.num_samples + if stop < 0: + stop += self.num_samples + assert step != 0 + if step > 0: + total = math.ceil((stop - start) / step) + forward = True + else: + step = -step + total = math.ceil((stop - start) / step) + start, stop = stop - 1, start + forward = False + return start, stop, step, total, forward + + def _iter_forward( + self, + chunk_id: int, + shard_index: int, + chunk_index: int, + local_sample_index: int, + total: int, + step: int, + ) -> Iterable[Tuple[int, int]]: + n = 0 + ctr = Counter(step) + shard = self._data[shard_index] + last_index = int(shard[chunk_index, LAST_INDEX_INDEX]) + for i in range(local_sample_index + 1, last_index + 1): + if ctr(): + yield chunk_id, i + n += 1 + if n == total: + return + for chunk_index in range(chunk_index + 1, len(shard)): + entry = shard[chunk_index] + chunk_id = entry[CHUNK_ID_INDEX] + new_last_index = int(entry[LAST_INDEX_INDEX]) + for i in range(new_last_index - last_index): + if ctr(): + yield chunk_id, i + n += 1 + if n == total: + return + last_index = new_last_index + for shard_index in range(shard_index + 1, len(self._data)): + shard = self._data[shard_index] + for entry in shard: + chunk_id = entry[CHUNK_ID_INDEX] + new_last_index = int(entry[LAST_INDEX_INDEX]) + for i in range(new_last_index - last_index): + if ctr(): + yield chunk_id, i + n += 1 + if n == total: + return + last_index = new_last_index + + def _iter_reverse( + self, + chunk_id: int, + shard_index: int, + chunk_index: int, + local_sample_index: int, + total: int, + step: int, + ) -> Iterable[Tuple[int, int]]: + n = 0 + ctr = Counter(step) + shard = self._data[shard_index] + last_index = int(shard[chunk_index, LAST_INDEX_INDEX]) + for local_sample_index in 
range(local_sample_index - 1, -1, -1): + if ctr(): + yield chunk_id, local_sample_index + n += 1 + if n == total: + return + for chunk_index in range(chunk_index - 1, -1, -1): + entry = shard[chunk_index] + chunk_id = entry[CHUNK_ID_INDEX] + last_index = entry[LAST_INDEX_INDEX] + if chunk_index: + last_index -= shard[chunk_id - 1, LAST_INDEX_INDEX] + elif shard_index: + last_index -= self._data[shard_index - 1][-1, LAST_INDEX_INDEX] + for local_sample_index in range(last_index, -1, -1): + if ctr(): + yield chunk_id, local_sample_index + n += 1 + if n == total: + return + for shard_index in range(shard_index - 1, -1, -1): + shard = self._data[shard_index] + for chunk_index in range(len(shard) - 1, -1, -1): + entry = shard[chunk_index] + chunk_id = entry[CHUNK_ID_INDEX] + last_index = entry[LAST_INDEX_INDEX] + if chunk_index: + last_index -= shard[chunk_id - 1, LAST_INDEX_INDEX] + elif shard_index: + last_index -= self._data[shard_index - 1][-1, LAST_INDEX_INDEX] + for local_sample_index in range(last_index, -1, -1): + if ctr(): + yield chunk_id, local_sample_index + n += 1 + if n == total: + return + def iter( self, index: Union[int, slice, tuple] = slice(None) ) -> Iterable[Tuple[int, int]]: if isinstance(index, int): yield self.get(index, return_local_sample_index=True) # type: ignore elif isinstance(index, slice): - start = 0 if index.start is None else index.start - stop = self.num_samples if index.stop is None else index.stop - step = 1 if index.step is None else index.step - if start < 0: - start += self.num_samples - if stop < 0: - stop += self.num_samples - assert isinstance(start, int) - assert isinstance(stop, int) - assert isinstance(step, int) - assert step != 0 - if start < 0: - start += self.num_samples - if stop < 0: - stop += self.num_samples - if step > 0: - total = math.ceil((stop - start) / step) - forward = True - else: - step = -step - total = math.ceil((stop - start) / step) - start, stop = stop - 1, start - forward = False + start, stop, step, total, forward = self._preproc_slice(index) if not total: return - n = 0 self._flush_buffer() if start: chunk_id, (shard_index, chunk_index), local_sample_index = self.get( # type: ignore @@ -411,82 +503,13 @@ def iter( local_sample_index = 0 chunk_id = shard[0, CHUNK_ID_INDEX] yield chunk_id, local_sample_index - n += 1 - if n == total: + if total == 1: return - ctr = Counter(step) - - if forward: - last_index = int(shard[chunk_index, LAST_INDEX_INDEX]) - for i in range(local_sample_index + 1, last_index + 1): - if ctr(): - yield chunk_id, i - n += 1 - if n == total: - return - for chunk_index in range(chunk_index + 1, len(shard)): - entry = shard[chunk_index] - chunk_id = entry[CHUNK_ID_INDEX] - new_last_index = int(entry[LAST_INDEX_INDEX]) - for i in range(new_last_index - last_index): - if ctr(): - yield chunk_id, i - n += 1 - if n == total: - return - last_index = new_last_index - for shard_index in range(shard_index + 1, len(self._data)): - shard = self._data[shard_index] - for entry in shard: - chunk_id = entry[CHUNK_ID_INDEX] - new_last_index = int(entry[LAST_INDEX_INDEX]) - for i in range(new_last_index - last_index): - if ctr(): - yield chunk_id, i - n += 1 - if n == total: - return - last_index = new_last_index - else: - last_index = int(shard[chunk_index, LAST_INDEX_INDEX]) - for local_sample_index in range(local_sample_index - 1, -1, -1): - if ctr(): - yield chunk_id, local_sample_index - n += 1 - if n == total: - return - for chunk_index in range(chunk_index - 1, -1, -1): - entry = shard[chunk_index] - chunk_id = 
entry[CHUNK_ID_INDEX] - last_index = entry[LAST_INDEX_INDEX] - if chunk_index: - last_index -= shard[chunk_id - 1, LAST_INDEX_INDEX] - elif shard_index: - last_index -= self._data[shard_index - 1][-1, LAST_INDEX_INDEX] - for local_sample_index in range(last_index, -1, -1): - if ctr(): - yield chunk_id, local_sample_index - n += 1 - if n == total: - return - for shard_index in range(shard_index - 1, -1, -1): - shard = self._data[shard_index] - for chunk_index in range(len(shard) - 1, -1, -1): - entry = shard[chunk_index] - chunk_id = entry[CHUNK_ID_INDEX] - last_index = entry[LAST_INDEX_INDEX] - if chunk_index: - last_index -= shard[chunk_id - 1, LAST_INDEX_INDEX] - elif shard_index: - last_index -= self._data[shard_index - 1][ - -1, LAST_INDEX_INDEX - ] - for local_sample_index in range(last_index, -1, -1): - if ctr(): - yield chunk_id, local_sample_index - n += 1 - if n == total: - return + iter_f = self._iter_forward if forward else self._iter_reverse + for chunk_id, local_sample_index in iter_f( + chunk_id, shard_index, chunk_index, local_sample_index, total - 1, step + ): + yield chunk_id, local_sample_index elif isinstance(index, tuple): for i in index: # Random access From 746201c5eea47a977a70fc146d7539bab5765c40 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Tue, 13 Jul 2021 21:39:25 +0530 Subject: [PATCH 50/79] more refacc --- hub/core/chunk.py | 42 +-- hub/core/lowlevel.py | 272 ------------------ hub/core/meta/encode/chunk_id.py | 2 +- hub/core/serialize.py | 138 +++++++++ .../{test_lowlevel.py => test_serialize.py} | 12 +- 5 files changed, 150 insertions(+), 316 deletions(-) delete mode 100644 hub/core/lowlevel.py create mode 100644 hub/core/serialize.py rename hub/core/tests/{test_lowlevel.py => test_serialize.py} (74%) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index 7c5a918ca3..f8ae9a8066 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -8,13 +8,7 @@ from hub.core.meta.encode.shape import ShapeEncoder from hub.core.meta.encode.byte_positions import BytePositionsEncoder -from hub.core.lowlevel import ( - encode_chunk, - decode_chunk, - malloc, - _write_pybytes, - _infer_chunk_num_bytes, -) +from hub.core.serialize import encode_chunk, decode_chunk, infer_chunk_num_bytes class Chunk(Cachable): @@ -59,16 +53,6 @@ def __init__( self._data.append(data) self._num_data_bytes += len(data) - @property - def memoryview_data(self): - # deprecated - if len(self._data) == 1: - return self._data[0] - ptr = malloc(self.num_data_bytes) - for data in self._data: - ptr = _write_pybytes(ptr, data) - return memoryview(ptr.bytes) - def _get_2d_idx(self, byte_index: int) -> Tuple[int, int]: """Converts `byte_index`, which is an index for a flattened stream of bytes, into a 2D index that can be used for a list of byte streams of varying lengths. 
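The flat-offset-to-2D translation this docstring describes is what lets a chunk keep `_data` as a list of separate buffers instead of one big bytearray: a global byte index is walked down the list until it lands inside a single stream. A standalone sketch of the mapping (not the method itself):

def get_2d_idx(streams, byte_index):
    # Walk the stream lengths until byte_index falls inside stream i.
    i = 0
    while byte_index >= len(streams[i]):
        byte_index -= len(streams[i])
        i += 1
    return i, byte_index

streams = [b"abc", b"de", b"fgh"]
assert get_2d_idx(streams, 0) == (0, 0)
assert get_2d_idx(streams, 3) == (1, 0)  # first byte of the second stream
assert get_2d_idx(streams, 5) == (2, 0)

`view(start_byte, end_byte)` then only needs the two boundary translations; the whole streams in between are appended as-is, which is what the hunk below assembles into a single output buffer.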
Used for accessing `self._data`, which is a list @@ -102,9 +86,7 @@ def view(self, start_byte: int, end_byte: int): end2dx, end2dy = self._get_2d_idx(end_byte) if start2dx == end2dx: # Indexing to the same inner chunk, this would be fast - buff = malloc(end2dy - start2dy) - _write_pybytes(buff, self._data[start2dx][start2dy:end2dy]) - return buff.memoryview + return self._data[start2dx][start2dy:end2dy] # TODO: document this # builds a list of memoryviews that contain the pieces we need for the output view @@ -114,11 +96,14 @@ def view(self, start_byte: int, end_byte: int): for i in range(start2dx + 1, end2dx): byts.append(self._data[i]) byts.append(self._data[end2dx][:end2dy]) - buff = malloc(sum(map(len, byts))) - ptr = buff + 0 + + buff = np.zeros(sum(map(len, byts)), dtype=np.byte) + offset = 0 for byt in byts: - ptr = _write_pybytes(ptr, byt.cast("B")) - return buff.memoryview + n = len(byt) + buff[offset : offset + n] = byt + offset += n + return memoryview(buff.tobytes()) @property def num_samples(self): @@ -181,19 +166,12 @@ def update_headers(self, incoming_num_bytes: int, sample_shape: Tuple[int]): def __len__(self): """Calculates the number of bytes `tobytes` will be without having to call `tobytes`. Used by `LRUCache` to determine if this chunk can be cached.""" - return _infer_chunk_num_bytes( + return infer_chunk_num_bytes( hub.__version__, self.shapes_encoder.array, self.byte_positions_encoder.array, len_data=self.num_data_bytes, ) - return ( - 17 - + len(hub.__version__) - + self.shapes_encoder.array.nbytes - + self.byte_positions_encoder.array.nbytes - + self.num_data_bytes - ) def tobytes(self) -> memoryview: if self.num_samples == 0: diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py deleted file mode 100644 index 4dfd9dc848..0000000000 --- a/hub/core/lowlevel.py +++ /dev/null @@ -1,272 +0,0 @@ -import numpy as np -import ctypes -from collections import namedtuple -from typing import Tuple, Sequence, Union, Optional, List -import hub - - -class Pointer(object): - __slots__ = ("address", "size", "_c_array", "_refs") - - def __init__( - self, - address: Optional[int] = None, - size: Optional[int] = None, - c_array: Optional[ctypes.Array] = None, - ) -> None: - self._refs: List[ctypes.Array] = [] - if c_array is None: - if address is None or size is None: - raise ValueError("Expected c_array or address and size args.") - self.address = address - self.size = size - self._set_c_array() - else: - self._c_array = c_array - self.address = ctypes.addressof(c_array) - self.size = len(c_array) - - def _set_c_array(self) -> None: - try: - self._refs.append(self._c_array) - except AttributeError: - pass - self._c_array = (ctypes.c_byte * self.size).from_address(self.address) - - def __add__(self, i: int) -> "Pointer": - assert i >= 0 - assert i <= self.size - ret = Pointer(self.address + i, self.size - i) - ret._refs.append(self._c_array) - return ret - - def __iadd__(self, i: int) -> "Pointer": - assert i >= 0 - assert i <= self.size - self.address += i - self.size -= i - self._set_c_array() - return self - - def __setitem__(self, idx: int, byte: int) -> None: - self._c_array[idx] = byte - - def __getitem__(self, idx: Union[int, slice]) -> Union[int, "Pointer"]: - if isinstance(idx, int): - return self._c_array[idx] - elif isinstance(idx, slice): - assert idx.step is None - start = idx.start - end = idx.stop - n = self.size - if start is None: - start = 0 - elif start < 0: - start += n - if end is None: - end = n - elif end < 0: - end += n - assert start >= 0 and start < n - 
assert end >= start and end <= n - ret = Pointer(self.address + start, end - start) - ret._refs.append(self._c_array) - return ret - - @property - def memoryview(self): - return memoryview(self._c_array) - - @property - def bytes(self): - return bytes(self._c_array) - - @property - def bytearray(self): - return bytearray(self._c_array) - - def __len__(self): - return self.size - - -def malloc(size: int) -> Pointer: - return Pointer(c_array=(ctypes.c_byte * size)()) - - -def memcpy(dest: Pointer, src: Pointer, count=None) -> None: - if count is None: - count = src.size - ctypes.memmove(dest.address, src.address, count) - - -def _write_pybytes(ptr: Pointer, byts: Union[bytes, memoryview]) -> Pointer: - memcpy(ptr, _ndarray_to_ptr(np.frombuffer(byts, dtype=np.byte))) - return ptr + len(byts) - - -def _ndarray_to_ptr(arr: np.ndarray) -> Pointer: - return Pointer(arr.__array_interface__["data"][0], arr.itemsize * arr.size) - - -def _pybytes_to_c_array(byts: bytes) -> Pointer: - return Pointer( - np.frombuffer(byts, dtype=np.byte).__array_interface__["data"][0], len(byts) - ) - - -def _infer_chunk_num_bytes( - version: str, - shape_info: np.ndarray, - byte_positions: np.ndarray, - data: Optional[Union[Sequence[bytes], Sequence[memoryview]]] = None, - len_data: Optional[int] = None, -) -> int: - # NOTE: Assumption: version string contains ascii characters only (ord(c) < 128) - # NOTE: Assumption: len(version) < 256 - assert len(version) < 256 - assert max((map(ord, version))) < 128 - # assert shape_info.ndim == 2 - # assert byte_positions.ndim == 2 - # version_slice_size = 1 + len(version) - # shape_info_slice_size = 4 + 4 + shape_info.nbytes - # byte_positions_slice_size = 4 + byte_positions.nbytes - # data_slice_size = sum(map(len, data)) - if len_data is None: - len_data = sum(map(len, data)) # type: ignore - return len(version) + shape_info.nbytes + byte_positions.nbytes + len_data + 13 - - -def encode_chunk( - version: str, - shape_info: np.ndarray, - byte_positions: np.ndarray, - data: Union[Sequence[bytes], Sequence[memoryview]], - len_data: Optional[int] = None, -) -> memoryview: - - if len_data is None: - len_data = sum(map(len, data)) - - flatbuff = malloc( - _infer_chunk_num_bytes(version, shape_info, byte_positions, data, len_data) - ) - ptr = flatbuff + 0 - - # write version - ptr[0] = len(version) - ptr += 1 - for c in version: - ptr[0] = ord(c) - ptr += 1 - - # write shape info - ptr = _write_pybytes(ptr, np.int32(shape_info.shape[0]).tobytes()) - ptr = _write_pybytes(ptr, np.int32(shape_info.shape[1]).tobytes()) - memcpy(ptr, _ndarray_to_ptr(shape_info)) - ptr += shape_info.nbytes - - # write byte positions - ptr = _write_pybytes(ptr, np.int32(byte_positions.shape[0]).tobytes()) - memcpy(ptr, _ndarray_to_ptr(byte_positions)) - ptr += byte_positions.nbytes - - # write actual data - for d in data: - if isinstance(d, Pointer): - d = d.memoryview - ptr = _write_pybytes(ptr, d) - - return memoryview(flatbuff.bytes) - - -def decode_chunk( - buff: Union[bytes, Pointer, memoryview] -) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: - if not isinstance(buff, Pointer): - buff = _pybytes_to_c_array(buff) - copy = True - else: - copy = False - ptr = buff + 0 - - # read version - len_version: int = ptr[0] # type: ignore - version = "" - ptr += 1 - for i in range(len_version): - version += chr(ptr[i]) # type: ignore - ptr += len_version - - # read shape info - shape_info_dtype = np.dtype(hub.constants.ENCODING_DTYPE) - shape_info_shape = np.frombuffer(ptr.memoryview[:8], dtype=np.int32) - 
ptr += 8 - shape_info_data_size = int(np.prod(shape_info_shape) * shape_info_dtype.itemsize) - shape_info = np.frombuffer( - ptr.memoryview[:shape_info_data_size], dtype=shape_info_dtype - ).reshape(shape_info_shape) - if copy: - shape_info = shape_info.copy() - ptr += shape_info_data_size - - # read byte positions - byte_positions_dtype = np.dtype(hub.constants.ENCODING_DTYPE) - byte_positions_shape = (int(np.frombuffer(ptr.memoryview[:4], dtype=np.int32)), 3) - ptr += 4 - byte_positions_data_size = int( - np.prod(byte_positions_shape) * byte_positions_dtype.itemsize - ) - byte_positions = np.frombuffer( - ptr.memoryview[:byte_positions_data_size], dtype=byte_positions_dtype - ).reshape(byte_positions_shape) - if copy: - byte_positions = byte_positions.copy() - ptr += byte_positions_data_size - if copy: - data = memoryview(ptr.bytes) - else: - data = ptr.memoryview - return version, shape_info, byte_positions, data - - -def encode_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: - len_version = len(version) - flatbuff = malloc(1 + len_version + sum([x.nbytes for x in ids])) - - # Write version - ptr = flatbuff + 0 - ptr[0] = len_version - ptr += 1 - - for i, c in enumerate(version): - ptr[i] = ord(c) - - ptr += len_version - - for arr in ids: - memcpy(ptr, _ndarray_to_ptr(arr)) - ptr += arr.nbytes - - return memoryview(flatbuff.bytes) - - -def decode_chunkids(buff: bytes) -> Tuple[str, np.ndarray]: - ptr = _pybytes_to_c_array(buff) - - # Read version - len_version: int = ptr[0] # type: ignore - ptr += 1 - version = "" - for i in range(len_version): - version += chr(ptr[i]) # type: ignore - - ptr += len_version - - # Read chunk ids - ids = ( - np.frombuffer(ptr.memoryview, dtype=hub.constants.ENCODING_DTYPE) - .reshape(-1, 2) - .copy() - ) - - return version, ids diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index fec2fc58e0..7fe00a04a5 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -6,7 +6,7 @@ from typing import Optional, Tuple, Union, List, Iterable import numpy as np from uuid import uuid4 -from hub.core.lowlevel import encode_chunkids, decode_chunkids +from hub.core.serialize import encode_chunkids, decode_chunkids from hub.core.index import IndexEntry import math diff --git a/hub/core/serialize.py b/hub/core/serialize.py new file mode 100644 index 0000000000..3b0fc68158 --- /dev/null +++ b/hub/core/serialize.py @@ -0,0 +1,138 @@ +from typing import Optional, Sequence, Union, Tuple + +import hub +import ctypes +import numpy as np + + +def infer_chunk_num_bytes( + version: str, + shape_info: np.ndarray, + byte_positions: np.ndarray, + data: Optional[Union[Sequence[bytes], Sequence[memoryview]]] = None, + len_data: Optional[int] = None, +) -> int: + # NOTE: Assumption: version string contains ascii characters only (ord(c) < 128) + # NOTE: Assumption: len(version) < 256 + assert len(version) < 256 + assert max((map(ord, version))) < 128 + if len_data is None: + len_data = sum(map(len, data)) # type: ignore + return len(version) + shape_info.nbytes + byte_positions.nbytes + len_data + 13 + + +def encode_chunk( + version: str, + shape_info: np.ndarray, + byte_positions: np.ndarray, + data: Union[Sequence[bytes], Sequence[memoryview]], + len_data: Optional[int] = None, +) -> memoryview: + nbytes = infer_chunk_num_bytes(version, shape_info, byte_positions, data, len_data) + flatbuff = np.zeros(nbytes, dtype=np.byte) + + # Write version + len_version = len(version) + flatbuff[0] = len_version + 
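+    # Overall layout (13 bytes of fixed overhead plus the version string):
+    #   [len(version): 1 byte][version: ascii][shape_info.shape: 2 x int32][shape_info bytes]
+    #   [byte_positions row count: 1 x int32][byte_positions bytes][sample data]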
flatbuff[1 : 1 + len_version] = list(map(ord, version)) + offset = 1 + len_version + + # Write shape info + flatbuff[offset : offset + 8] = np.array(shape_info.shape, dtype=np.int32).view( + np.byte + ) + offset += 8 + flatbuff[offset : offset + shape_info.nbytes] = shape_info.reshape(-1).view(np.byte) + offset += shape_info.nbytes + + # Write byte positions + flatbuff[offset : offset + 4] = np.int32(byte_positions.shape[0]).view((np.byte, 4)) + offset += 4 + flatbuff[offset : offset + byte_positions.nbytes] = byte_positions.reshape(-1).view( + np.byte + ) + offset += byte_positions.nbytes + + # Write actual data + for byts in data: + n = len(byts) + flatbuff[offset : offset + n] = np.frombuffer(byts, dtype=np.byte) + offset += n + return memoryview(flatbuff.tobytes()) + + +def decode_chunk( + byts: Union[bytes, memoryview] +) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: + + enc_dtype = np.dtype(hub.constants.ENCODING_DTYPE) + + buff = np.frombuffer(byts, dtype=np.byte) + + # Read version + len_version = buff[0] + version = "".join(map(chr, buff[1 : 1 + len_version])) + offset = 1 + len_version + + # Read shape info + shape_info_shape = buff[offset : offset + 8].view(np.int32) + offset += 8 + shape_info_nbytes = np.prod(shape_info_shape) * enc_dtype.itemsize + shape_info = ( + buff[offset : offset + shape_info_nbytes] + .view(enc_dtype) + .reshape(shape_info_shape) + .copy() + ) + offset += shape_info_nbytes + + # Read byte positions + byte_positions_rows = buff[offset : offset + 4].view(np.int32)[0] + offset += 4 + byte_positions_nbytes = byte_positions_rows * 3 * enc_dtype.itemsize + byte_positions = ( + buff[offset : offset + byte_positions_nbytes] + .view(enc_dtype) + .reshape(byte_positions_rows, 3) + .copy() + ) + offset += byte_positions_nbytes + + # Read data + data = buff[offset:].copy() + + return version, shape_info, byte_positions, data + + +def encode_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: + len_version = len(version) + flatbuff = np.zeros(1 + len_version + sum([x.nbytes for x in ids]), dtype=np.byte) + + # Write version + len_version = len(version) + flatbuff[0] = len_version + flatbuff[1 : 1 + len_version] = list(map(ord, version)) + offset = 1 + len_version + + # Write ids + for arr in ids: + flatbuff[offset : offset + arr.nbytes] = arr.view(np.byte).reshape(-1) + offset += arr.nbytes + + return memoryview(flatbuff.tobytes()) + + +def decode_chunkids(byts: Union[bytes, memoryview]) -> Tuple[str, np.ndarray]: + enc_dtype = np.dtype(hub.constants.ENCODING_DTYPE) + + buff = np.frombuffer(byts, dtype=np.byte) + + # Read version + len_version = buff[0] + version = "".join(map(chr, buff[1 : 1 + len_version])) + offset = 1 + len_version + + # Read chunk ids + ids = buff[offset:].view(enc_dtype).reshape(-1, 2).copy() + + return version, ids diff --git a/hub/core/tests/test_lowlevel.py b/hub/core/tests/test_serialize.py similarity index 74% rename from hub/core/tests/test_lowlevel.py rename to hub/core/tests/test_serialize.py index 5381aba075..92fc6ddb17 100644 --- a/hub/core/tests/test_lowlevel.py +++ b/hub/core/tests/test_serialize.py @@ -1,5 +1,4 @@ -from hub.core.lowlevel import ( - Pointer, +from hub.core.serialize import ( encode_chunk, decode_chunk, encode_chunkids, @@ -29,15 +28,6 @@ def test_chunk_encoding(): np.testing.assert_array_equal(byte_positions, byte_positions2) assert b"".join(data) == bytes(data2) - # from pointer - buff = Pointer(c_array=(ctypes.c_byte * len(encoded))(*encoded)) - decoded = decode_chunk(buff) - version2, 
shape_info2, byte_positions2, data2 = decoded - assert version2 == version - np.testing.assert_array_equal(shape_info, shape_info2) - np.testing.assert_array_equal(byte_positions, byte_positions2) - assert b"".join(data) == bytes(data2) - def test_chunkids_encoding(): version = hub.__version__ From 440a0b70da08eafb98ec0d36bfaecc0f18bbbbcf Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Tue, 13 Jul 2021 21:48:11 +0530 Subject: [PATCH 51/79] encode_*->serialize_* --- hub/core/chunk.py | 6 +++--- hub/core/meta/encode/chunk_id.py | 11 +++-------- hub/core/serialize.py | 8 ++++---- hub/core/tests/test_serialize.py | 20 ++++++++++---------- 4 files changed, 20 insertions(+), 25 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index f8ae9a8066..892f1cdba0 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -8,7 +8,7 @@ from hub.core.meta.encode.shape import ShapeEncoder from hub.core.meta.encode.byte_positions import BytePositionsEncoder -from hub.core.serialize import encode_chunk, decode_chunk, infer_chunk_num_bytes +from hub.core.serialize import serialize_chunk, deserialize_chunk, infer_chunk_num_bytes class Chunk(Cachable): @@ -177,7 +177,7 @@ def tobytes(self) -> memoryview: if self.num_samples == 0: return memoryview(bytes()) - return encode_chunk( + return serialize_chunk( hub.__version__, self.shapes_encoder.array, self.byte_positions_encoder.array, @@ -189,5 +189,5 @@ def tobytes(self) -> memoryview: def frombuffer(cls, buffer: bytes) -> "Chunk": if len(buffer) == 0: return cls() - version, shapes, byte_positions, data = decode_chunk(buffer) + version, shapes, byte_positions, data = deserialize_chunk(buffer) return cls(shapes, byte_positions, data=data) diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 7fe00a04a5..69c47ca01b 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -6,7 +6,7 @@ from typing import Optional, Tuple, Union, List, Iterable import numpy as np from uuid import uuid4 -from hub.core.serialize import encode_chunkids, decode_chunkids +from hub.core.serialize import serialize_chunkids, deserialize_chunkids from hub.core.index import IndexEntry import math @@ -104,12 +104,7 @@ def _get_2d_idx(self, idx: int) -> Tuple[int, int]: def tobytes(self) -> memoryview: self._flush_buffer() - encoded = encode_chunkids(hub.__version__, self._data) - decoded = decode_chunkids(encoded)[1] - if self._data: - np.testing.assert_array_equal( - decoded, np.concatenate(self._data), err_msg=str(bytes(encoded)) - ) + encoded = serialize_chunkids(hub.__version__, self._data) return encoded @staticmethod @@ -133,7 +128,7 @@ def get_name_for_chunk(self, chunk_index: int) -> str: @classmethod def frombuffer(cls, buffer: bytes): - version, ids = decode_chunkids(buffer) + version, ids = deserialize_chunkids(buffer) return cls(ids) @property diff --git a/hub/core/serialize.py b/hub/core/serialize.py index 3b0fc68158..58726682da 100644 --- a/hub/core/serialize.py +++ b/hub/core/serialize.py @@ -21,7 +21,7 @@ def infer_chunk_num_bytes( return len(version) + shape_info.nbytes + byte_positions.nbytes + len_data + 13 -def encode_chunk( +def serialize_chunk( version: str, shape_info: np.ndarray, byte_positions: np.ndarray, @@ -61,7 +61,7 @@ def encode_chunk( return memoryview(flatbuff.tobytes()) -def decode_chunk( +def deserialize_chunk( byts: Union[bytes, memoryview] ) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: @@ -104,7 +104,7 @@ def decode_chunk( return version, shape_info, byte_positions, data -def 
encode_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: +def serialize_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: len_version = len(version) flatbuff = np.zeros(1 + len_version + sum([x.nbytes for x in ids]), dtype=np.byte) @@ -122,7 +122,7 @@ def encode_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: return memoryview(flatbuff.tobytes()) -def decode_chunkids(byts: Union[bytes, memoryview]) -> Tuple[str, np.ndarray]: +def deserialize_chunkids(byts: Union[bytes, memoryview]) -> Tuple[str, np.ndarray]: enc_dtype = np.dtype(hub.constants.ENCODING_DTYPE) buff = np.frombuffer(byts, dtype=np.byte) diff --git a/hub/core/tests/test_serialize.py b/hub/core/tests/test_serialize.py index 92fc6ddb17..543e64717a 100644 --- a/hub/core/tests/test_serialize.py +++ b/hub/core/tests/test_serialize.py @@ -1,15 +1,15 @@ from hub.core.serialize import ( - encode_chunk, - decode_chunk, - encode_chunkids, - decode_chunkids, + serialize_chunk, + deserialize_chunk, + serialize_chunkids, + deserialize_chunkids, ) import numpy as np import ctypes import hub -def test_chunk_encoding(): +def test_chunk_serialize(): version = hub.__version__ shape_info = np.cast[hub.constants.ENCODING_DTYPE]( np.random.randint(100, size=(17, 63)) @@ -18,10 +18,10 @@ def test_chunk_encoding(): np.random.randint(100, size=(31, 3)) ) data = [b"1234" * 7, b"abcdefg" * 8, b"qwertyuiop" * 9] - encoded = bytes(encode_chunk(version, shape_info, byte_positions, data)) + encoded = bytes(serialize_chunk(version, shape_info, byte_positions, data)) # from bytes - decoded = decode_chunk(encoded) + decoded = deserialize_chunk(encoded) version2, shape_info2, byte_positions2, data2 = decoded assert version2 == version np.testing.assert_array_equal(shape_info, shape_info2) @@ -29,13 +29,13 @@ def test_chunk_encoding(): assert b"".join(data) == bytes(data2) -def test_chunkids_encoding(): +def test_chunkids_serialize(): version = hub.__version__ shards = [ np.cast[hub.constants.ENCODING_DTYPE](np.random.randint(100, size=(100, 2))) ] - encoded = encode_chunkids(version, shards) - decoded = decode_chunkids(encoded) + encoded = serialize_chunkids(version, shards) + decoded = deserialize_chunkids(encoded) version2, ids = decoded assert version2 == version np.testing.assert_array_equal(np.concatenate(shards), ids) From ceae226105a3597c976b608137231708932bbf11 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Tue, 13 Jul 2021 20:25:59 +0400 Subject: [PATCH 52/79] Update hub/core/chunk.py Co-authored-by: dyllan --- hub/core/chunk.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index 892f1cdba0..dd1996781d 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -141,8 +141,6 @@ def append_sample(self, buffer: memoryview, max_data_bytes: int, shape: Tuple[in ) # `_data` will be a `memoryview` if `frombuffer` is called. 
- # if isinstance(self._data, memoryview): - # self._data = bytearray(self._data) # note: incoming_num_bytes can be 0 (empty sample) self._data.append(buffer) From 88ab4bb8a9acabf6534a413513eec9e1d9085411 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Tue, 13 Jul 2021 22:25:09 +0530 Subject: [PATCH 53/79] docstring --- hub/api/dataset.py | 14 ++++------- hub/api/tensor.py | 3 +++ hub/core/chunk.py | 5 ++-- hub/core/serialize.py | 56 ++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 65 insertions(+), 13 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index c372addd10..f49e64d59c 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -60,7 +60,6 @@ def __init__( Use this if you want to specify the storage provider object manually instead of using a tag or url to generate it. public (bool, optional): Applied only if storage is Hub cloud storage and a new Dataset is being created. Defines if the dataset will have public access. token (str, optional): Activeloop token, used for fetching credentials for Hub datasets. This is optional, tokens are normally autogenerated. - _tensors: (list, optional): Internal. Raises: ValueError: If an existing local path is given, it must be a directory. @@ -90,7 +89,7 @@ def __init__( self.storage.autoflush = True self.index = index or Index() - self.tensors: Dict[str, Tensor] = _tensors if _tensors else {} + self.tensors: Dict[str, Tensor] = {} self._token = token @@ -218,10 +217,8 @@ def __iter__(self) -> Iterator["Dataset"]: tensors_sliced = [t[self.index][: len(self)] for t in self.tensors.values()] num_tensors = len(tensor_names) for tensors in zip(*tensors_sliced): - tensors = {tensor_names[i]: tensors[i] for i in range(num_tensors)} - ds = Dataset( - read_only=self.read_only, storage=self.storage, _tensors=tensors - ) + ds = Dataset(read_only=self.read_only, storage=self.storage) + ds.tensors = {tensor_names[i]: tensors[i] for i in range(num_tensors)} yield ds def _load_meta(self): @@ -230,9 +227,8 @@ def _load_meta(self): if dataset_exists(self.storage): logger.info(f"{self.path} loaded successfully.") self.meta = self.storage.get_cachable(meta_key, DatasetMeta) - if not self.tensors: - for tensor_name in self.meta.tensors: - self.tensors[tensor_name] = Tensor(tensor_name, self.storage) + for tensor_name in self.meta.tensors: + self.tensors[tensor_name] = Tensor(tensor_name, self.storage) elif len(self.storage) > 0: # dataset does not exist, but the path was not empty diff --git a/hub/api/tensor.py b/hub/api/tensor.py index 6d825c48a9..a74e53bbe8 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -45,6 +45,9 @@ def __init__( self.chunk_engine = ChunkEngine(self.key, self.storage) + # If this tensor corresponds to a sample in a parent tensor, + # `_sample` caches the chunk id and local sample index + # for that sample. Set during iteration through the parent tensor. 
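+        # When set, `numpy()` can read the sample directly from that chunk
+        # instead of resolving it through the chunk id encoder.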
self._sample: Optional[Tuple[int, int]] = None def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): diff --git a/hub/core/chunk.py b/hub/core/chunk.py index 892f1cdba0..dc8cea2e00 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -79,6 +79,7 @@ def _get_2d_idx(self, byte_index: int) -> Tuple[int, int]: return i, byte_index def view(self, start_byte: int, end_byte: int): + """Returns a sliced view of the chunk's data""" if len(self._data) == 1: return self._data[0][start_byte:end_byte] @@ -88,9 +89,7 @@ def view(self, start_byte: int, end_byte: int): # Indexing to the same inner chunk, this would be fast return self._data[start2dx][start2dy:end2dy] - # TODO: document this - # builds a list of memoryviews that contain the pieces we need for the output view - + # build a list of memoryviews that contain the pieces we need for the output view byts = [] byts.append(self._data[start2dx][start2dy:]) for i in range(start2dx + 1, end2dx): diff --git a/hub/core/serialize.py b/hub/core/serialize.py index 58726682da..8cbaf6e3bf 100644 --- a/hub/core/serialize.py +++ b/hub/core/serialize.py @@ -12,6 +12,18 @@ def infer_chunk_num_bytes( data: Optional[Union[Sequence[bytes], Sequence[memoryview]]] = None, len_data: Optional[int] = None, ) -> int: + """Calculates the number of bytes in a chunk without serializing it. Used by `LRUCache` to determine if a chunk can be cached. + + Args: + version: (str) Version of hub library + shape_info: (numpy.ndarray) Encoded shapes info from the chunk's `ShapeEncoder` instance. + byte_positions: (numpy.ndarray) Encoded byte positions from the chunk's `BytePositionsEncoder` instance. + data: (list) `_data` field of the chunk + len_data: (int, optional) Number of bytes in the chunk + + Returns: + Length of the chunk when serialized as int + """ # NOTE: Assumption: version string contains ascii characters only (ord(c) < 128) # NOTE: Assumption: len(version) < 256 assert len(version) < 256 @@ -28,6 +40,18 @@ def serialize_chunk( data: Union[Sequence[bytes], Sequence[memoryview]], len_data: Optional[int] = None, ) -> memoryview: + """Serializes a chunk + + Args: + version: (str) Version of hub library. + shape_info: (numpy.ndarray) Encoded shapes info from the chunk's `ShapeEncoder` instance. + byte_positions: (numpy.ndarray) Encoded byte positions from the chunk's `BytePositionsEncoder` instance. + data: (list) `_data` field of the chunk. + len_data: (int, optional) Number of bytes in the chunk. + + Returns: + Serialized chunk as memoryview. + """ nbytes = infer_chunk_num_bytes(version, shape_info, byte_positions, data, len_data) flatbuff = np.zeros(nbytes, dtype=np.byte) @@ -64,7 +88,18 @@ def serialize_chunk( def deserialize_chunk( byts: Union[bytes, memoryview] ) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: - + """Deserializes a chunk + + Args: + byts: (bytes) Serialized chunk. + + Returns: + Tuple of: + hub version used to create the chunk, + encoded shapes info as numpy array, + encoded byte positions as numpy array, + chunk data as memoryview. + """ enc_dtype = np.dtype(hub.constants.ENCODING_DTYPE) buff = np.frombuffer(byts, dtype=np.byte) @@ -105,6 +140,15 @@ def deserialize_chunk( def serialize_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: + """Serializes chunk ids + + Args: + version: (str) Version of hub library. + ids: (list) Encoded chunk ids from a `ChunkIdEncoder` instance. + + Returns: + Serialized chunk ids as memoryview. 
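+
+    Example (illustrative):
+        >>> ids = [np.zeros((2, 2), dtype=hub.constants.ENCODING_DTYPE)]
+        >>> buff = serialize_chunkids(hub.__version__, ids)
+        >>> len(buff) == 1 + len(hub.__version__) + ids[0].nbytes
+        True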
+    """
     len_version = len(version)
     flatbuff = np.zeros(1 + len_version + sum([x.nbytes for x in ids]), dtype=np.byte)
 
@@ -123,6 +167,16 @@ def serialize_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview:
 
 
 def deserialize_chunkids(byts: Union[bytes, memoryview]) -> Tuple[str, np.ndarray]:
+    """Deserializes chunk ids
+
+    Args:
+        byts: (bytes) Serialized chunk ids.
+
+    Returns:
+        Tuple of:
+            hub version used to encode the chunk ids,
+            chunk ids as a numpy array.
+    """
     enc_dtype = np.dtype(hub.constants.ENCODING_DTYPE)
 
     buff = np.frombuffer(byts, dtype=np.byte)

From 4b29507b92f61fcf4f1fe8040b8e95e29c5e3a59 Mon Sep 17 00:00:00 2001
From: Fariz Rahman
Date: Tue, 13 Jul 2021 23:31:58 +0530
Subject: [PATCH 54/79] docstring

---
 hub/core/meta/encode/chunk_id.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py
index 69c47ca01b..8c171813aa 100644
--- a/hub/core/meta/encode/chunk_id.py
+++ b/hub/core/meta/encode/chunk_id.py
@@ -295,14 +295,16 @@ def get(
 
         Args:
             sample_index (int): Global index (relative to the tensor). This will be converted to the local chunk index.
-            return_chunk_index (bool): If True, 2 values are returned, the second one being the chunk's index. Defaults to False.
+            return_chunk_index (bool): If True, a tuple of 2 ints representing the chunk's index is returned along with the chunk id.
+            return_local_sample_index (bool): If True, the local index of the sample within the chunk is returned along with the chunk id.
 
         Raises:
             IndexError: If no samples exist or `sample_index` exceeds the available indices.
 
         Returns:
-            Tuple[Tuple[ENCODING_DTYPE], Optional[Tuple[int]]]: Returns the chunk ID for `sample_index`. If `return_chunk_index` is True,
-                there will be 2 values. The second one being the chunk's index.
+            Union[int, Tuple[int, Tuple[int, int]], Tuple[int, int], Tuple[int, Tuple[int, int], int]]: Returns either just the chunk id,
+                or a tuple containing the chunk id and one or both of the chunk index and local sample index, depending on the `return_chunk_index`
+                and `return_local_sample_index` arguments.
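+
+        Example (illustrative, assuming chunk 0 holds samples 0-9):
+            >>> enc.get(5)  # chunk id only
+            >>> enc.get(5, return_local_sample_index=True)  # (chunk id, 5)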
""" if self.num_samples == 0: raise IndexError( From 5b386087bdf59b926edf0f801220d4600c5cb549 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Tue, 13 Jul 2021 23:42:22 +0530 Subject: [PATCH 55/79] rm comments --- hub/api/tests/test_api.py | 2 +- hub/core/meta/encode/chunk_id.py | 1 - hub/core/tests/test_serialize.py | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 67809ad311..de53b2df8e 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -502,7 +502,7 @@ def searchsorted(*args, **kwargs): np.searchsorted = orig_searchsorted - + def test_array_interface(memory_ds: Dataset): tensor = memory_ds.create_tensor("tensor") x = np.random.random((32, 32)) diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 8c171813aa..def87fe2ae 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -157,7 +157,6 @@ def _decr_2d(self, x: int, y: int) -> Tuple[int, int]: def _incr_2d(self, x: int, y: int) -> Tuple[int, int]: if x < 0: return x, y + 1 - # assert y < len(self._data[x]) if y == len(self._data[x]) - 1: if x == len(self._data) - 1: return -1, 0 diff --git a/hub/core/tests/test_serialize.py b/hub/core/tests/test_serialize.py index 543e64717a..c799ae9191 100644 --- a/hub/core/tests/test_serialize.py +++ b/hub/core/tests/test_serialize.py @@ -20,7 +20,6 @@ def test_chunk_serialize(): data = [b"1234" * 7, b"abcdefg" * 8, b"qwertyuiop" * 9] encoded = bytes(serialize_chunk(version, shape_info, byte_positions, data)) - # from bytes decoded = deserialize_chunk(encoded) version2, shape_info2, byte_positions2, data2 = decoded assert version2 == version From 4f25b210b41456d08eebaa459079381c0e1c0e39 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Tue, 13 Jul 2021 23:43:32 +0530 Subject: [PATCH 56/79] rm unused import --- hub/core/serialize.py | 1 - 1 file changed, 1 deletion(-) diff --git a/hub/core/serialize.py b/hub/core/serialize.py index 8cbaf6e3bf..f238ceac02 100644 --- a/hub/core/serialize.py +++ b/hub/core/serialize.py @@ -1,7 +1,6 @@ from typing import Optional, Sequence, Union, Tuple import hub -import ctypes import numpy as np From 82ce5bea3c6d34f3de48234a102a95ce0d04b96d Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 03:55:39 +0530 Subject: [PATCH 57/79] revert dataset.py --- hub/api/dataset.py | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 53c96d44af..dabd61e6ee 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -1,7 +1,6 @@ -from hub.core.storage.provider import StorageProvider from hub.core.tensor import create_tensor -from typing import Callable, Dict, Optional, Union, Tuple, List, Sequence, Iterator -from hub.constants import DEFAULT_HTYPE, UNSPECIFIED +from hub.constants import DEFAULT_HTYPE +from typing import Callable, Dict, Optional, Union, Tuple, List import numpy as np from hub.api.tensor import Tensor @@ -9,6 +8,7 @@ from hub.core.meta.dataset_meta import DatasetMeta +from hub.core.typing import StorageProvider from hub.core.index import Index from hub.integrations import dataset_to_tensorflow from hub.util.keys import dataset_exists, get_dataset_meta_key, tensor_exists @@ -38,7 +38,6 @@ def __init__( storage: Optional[StorageProvider] = None, public: Optional[bool] = True, token: Optional[str] = None, - _tensors: Optional[Dict[str, Tensor]] = None, ): """Initializes a new or existing 
dataset. @@ -61,6 +60,7 @@ def __init__( public (bool, optional): Applied only if storage is Hub cloud storage and a new Dataset is being created. Defines if the dataset will have public access. token (str, optional): Activeloop token, used for fetching credentials for Hub datasets. This is optional, tokens are normally autogenerated. + Raises: ValueError: If an existing local path is given, it must be a directory. ImproperDatasetInitialization: Exactly one argument out of 'path' and 'storage' needs to be specified. @@ -128,8 +128,6 @@ def __getitem__( if item not in self.tensors: raise TensorDoesNotExistError(item) else: - if self.index.is_trivial(): - return self.tensors[item] return self.tensors[item][self.index] elif isinstance(item, (int, slice, list, tuple, Index)): return Dataset( @@ -145,8 +143,10 @@ def create_tensor( self, name: str, htype: str = DEFAULT_HTYPE, - dtype: Union[str, np.dtype, type] = UNSPECIFIED, - sample_compression: str = UNSPECIFIED, + chunk_size: int = None, + dtype: Union[str, np.dtype, type] = None, + sample_compression: str = None, + chunk_compression: str = None, **kwargs, ): """Creates a new tensor in the dataset. @@ -158,8 +158,12 @@ def create_tensor( For example, `htype="image"` would have `dtype` default to `uint8`. These defaults can be overridden by explicitly passing any of the other parameters to this function. May also modify the defaults for other parameters. + chunk_size (int): Optionally override this tensor's `chunk_size`. In short, `chunk_size` determines the + size of files (chunks) being created to represent this tensor's samples. + For more on chunking, check out `hub.core.chunk_engine.chunker`. dtype (str): Optionally override this tensor's `dtype`. All subsequent samples are required to have this `dtype`. - sample_compression (str): All samples will be compressed in the provided format. If `None`, samples are uncompressed. + sample_compression (str): Optionally override this tensor's `sample_compression`. Only used when the incoming data is uncompressed. + chunk_compression (str): Optionally override this tensor's `chunk_compression`. Currently not implemented. **kwargs: `htype` defaults can be overridden by passing any of the compatible parameters. To see all `htype`s and their correspondent arguments, check out `hub/htypes.py`. @@ -171,6 +175,10 @@ def create_tensor( NotImplementedError: If trying to override `chunk_compression`. 
""" + if chunk_compression is not None: + # TODO: implement chunk compression + update docstring + raise NotImplementedError("Chunk compression is not implemented yet!") + if tensor_exists(name, self.storage): raise TensorAlreadyExistsError(name) @@ -179,8 +187,10 @@ def create_tensor( name, self.storage, htype=htype, + chunk_size=chunk_size, dtype=dtype, sample_compression=sample_compression, + chunk_compression=chunk_compression, **kwargs, ) tensor = Tensor(name, self.storage) # type: ignore @@ -200,14 +210,9 @@ def __setattr__(self, name: str, value): else: return super().__setattr__(name, value) - def __iter__(self) -> Iterator["Dataset"]: - tensor_names = list(self.tensors) - tensors_sliced = [t[self.index][: len(self)] for t in self.tensors.values()] - num_tensors = len(tensor_names) - for tensors in zip(*tensors_sliced): - ds = Dataset(read_only=self.read_only, storage=self.storage) - ds.tensors = {tensor_names[i]: tensors[i] for i in range(num_tensors)} - yield ds + def __iter__(self): + for i in range(len(self)): + yield self[i] def _load_meta(self): meta_key = get_dataset_meta_key() @@ -215,6 +220,7 @@ def _load_meta(self): if dataset_exists(self.storage): logger.info(f"{self.path} loaded successfully.") self.meta = self.storage.get_cachable(meta_key, DatasetMeta) + for tensor_name in self.meta.tensors: self.tensors[tensor_name] = Tensor(tensor_name, self.storage) @@ -248,7 +254,6 @@ def read_only(self, value: bool): def pytorch( self, transform: Optional[Callable] = None, - tensors: Optional[Sequence[str]] = None, num_workers: int = 1, batch_size: Optional[int] = 1, drop_last: Optional[bool] = False, @@ -263,7 +268,6 @@ def pytorch( Args: transform (Callable, optional) : Transformation function to be applied to each sample. - tensors (List, optional): Optionally provide a list of tensor names in the ordering that your training script expects. For example, if you have a dataset that has "image" and "label" tensors, if `tensors=["image", "label"]`, your training script should expect each batch will be provided as a tuple of (image, label). num_workers (int): The number of workers to use for fetching data in parallel. batch_size (int, optional): Number of samples per batch to load. Default value is 1. drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size is not divisible by the batch size. 
@@ -282,7 +286,6 @@ def pytorch( return dataset_to_pytorch( self, transform, - tensors, num_workers=num_workers, batch_size=batch_size, drop_last=drop_last, From 35d3a4a8e335e32108aeb64a429f6bda7144d23c Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 03:56:13 +0530 Subject: [PATCH 58/79] revert tensor.py --- hub/api/tensor.py | 33 +++++---------------------------- 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/hub/api/tensor.py b/hub/api/tensor.py index 93786c53c6..1f24783900 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -1,6 +1,6 @@ from hub.util.keys import tensor_exists from hub.core.sample import Sample # type: ignore -from typing import List, Sequence, Union, Optional, Tuple, Dict, Iterator +from typing import List, Sequence, Union, Optional, Tuple, Dict from hub.util.shape import ShapeInterval import numpy as np @@ -10,8 +10,6 @@ from hub.util.exceptions import TensorDoesNotExistError, InvalidKeyTypeError from hub.core.index import Index -import warnings - class Tensor: def __init__( @@ -45,11 +43,6 @@ def __init__( self.chunk_engine = ChunkEngine(self.key, self.storage) - # If this tensor corresponds to a sample in a parent tensor, - # `_sample` caches the chunk id and local sample index - # for that sample. Set during iteration through the parent tensor. - self._sample: Optional[Tuple[int, int]] = None - def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): """Extends the end of the tensor by appending multiple elements from a sequence. Accepts a sequence, a single batched numpy array, or a sequence of `hub.load` outputs, which can be used to load files. See examples down below. @@ -78,7 +71,6 @@ def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): The length should be equal to the number of samples to add. """ self.chunk_engine.extend(samples) - self._sample = None def append( self, @@ -199,13 +191,9 @@ def __getitem__( def __setitem__(self, item: Union[int, slice], value: np.ndarray): raise NotImplementedError("Tensor update not currently supported!") - def __iter__(self) -> Iterator["Tensor"]: - for i, (chunk_id, local_sample_index) in enumerate( - self.chunk_engine.chunk_id_encoder.iter(self.index.values[0].value) - ): - tensor_i = Tensor(self.key, self.storage, index=self.index[i]) - tensor_i._sample = chunk_id, local_sample_index - yield tensor_i + def __iter__(self): + for i in range(len(self)): + yield self[i] def numpy(self, aslist=False) -> Union[np.ndarray, List[np.ndarray]]: """Computes the contents of the tensor in numpy format. @@ -221,15 +209,7 @@ def numpy(self, aslist=False) -> Union[np.ndarray, List[np.ndarray]]: Returns: A numpy array containing the data represented by this tensor. 
""" - if self._sample: - chunk_id, local_sample_index = self._sample - chunk = self.chunk_engine.get_chunk_from_id(chunk_id) - ret = self.chunk_engine.read_sample_from_chunk(chunk, local_sample_index) - if aslist: - ret = list(ret) - for entry in self.index.values[1:]: - ret = ret[entry.value] - return ret + return self.chunk_engine.numpy(self.index, aslist=aslist) def __str__(self): @@ -238,7 +218,4 @@ def __str__(self): index_str = "" return f"Tensor(key={repr(self.key)}{index_str})" - def __array__(self) -> np.ndarray: - return self.numpy() - __repr__ = __str__ From cb4ea21852ccc0cae11d354973140bcf840c37b2 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 03:57:13 +0530 Subject: [PATCH 59/79] revert test_api.py --- hub/api/tests/test_api.py | 89 ++++++++++----------------------------- 1 file changed, 22 insertions(+), 67 deletions(-) diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index de53b2df8e..ea28236487 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -1,3 +1,4 @@ +from hub.constants import UNCOMPRESSED import numpy as np import pytest import uuid @@ -6,10 +7,7 @@ from hub.api.dataset import Dataset from hub.core.tests.common import parametrize_all_dataset_storages from hub.tests.common import assert_array_lists_equal -from hub.util.exceptions import ( - TensorDtypeMismatchError, - TensorInvalidSampleShapeError, -) +from hub.util.exceptions import TensorDtypeMismatchError, TensorInvalidSampleShapeError from hub.client.client import HubBackendClient from hub.client.utils import has_hub_testing_creds from click.testing import CliRunner @@ -187,7 +185,8 @@ def test_empty_samples(ds: Dataset): actual_list = tensor.numpy(aslist=True) expected_list = [a1, *a2, a3, *a4] - assert tensor.meta.sample_compression is None + assert tensor.meta.sample_compression == UNCOMPRESSED + assert tensor.meta.chunk_compression == UNCOMPRESSED assert len(tensor) == 16 assert tensor.shape_interval.lower == (16, 0, 0, 2) @@ -197,7 +196,7 @@ def test_empty_samples(ds: Dataset): # test indexing individual empty samples with numpy while looping, this may seem redundant but this was failing before for actual_sample, expected in zip(ds, expected_list): - actual = actual_sample["with_empty"].numpy() + actual = actual_sample.with_empty.numpy() np.testing.assert_array_equal(actual, expected) @@ -218,27 +217,30 @@ def test_scalar_samples(ds: Dataset): tensor.append(-99) tensor.append(np.array(4)) - tensor.append(np.int16(4)) + with pytest.raises(TensorDtypeMismatchError): + tensor.append(np.int16(4)) with pytest.raises(TensorDtypeMismatchError): tensor.append(np.float32(4)) - tensor.append(np.uint8(3)) + with pytest.raises(TensorDtypeMismatchError): + tensor.append(np.uint8(3)) tensor.extend([10, 1, 4]) tensor.extend([1]) tensor.extend(np.array([1, 2, 3], dtype=MAX_INT_DTYPE)) - tensor.extend(np.array([4, 5, 33], dtype="int16")) + with pytest.raises(TensorDtypeMismatchError): + tensor.extend(np.array([4, 5, 33], dtype="int16")) - assert len(tensor) == 16 + assert len(tensor) == 11 - expected = np.array([5, 10, -99, 4, 4, 3, 10, 1, 4, 1, 1, 2, 3, 4, 5, 33]) + expected = np.array([5, 10, -99, 4, 10, 1, 4, 1, 1, 2, 3]) np.testing.assert_array_equal(tensor.numpy(), expected) assert tensor.numpy(aslist=True) == expected.tolist() - assert tensor.shape == (16,) + assert tensor.shape == (11,) # len(shape) for a scalar is `()`. 
len(shape) for [1] is `(1,)` with pytest.raises(TensorInvalidSampleShapeError): @@ -255,7 +257,6 @@ def test_sequence_samples(ds: Dataset): tensor.append([1, 2, 3]) tensor.extend([[4, 5, 6]]) - ds.clear_cache() assert len(tensor) == 2 @@ -383,7 +384,7 @@ def test_shape_property(memory_ds): def test_htype(memory_ds: Dataset): - image = memory_ds.create_tensor("image", htype="image", sample_compression="png") + image = memory_ds.create_tensor("image", htype="image") bbox = memory_ds.create_tensor("bbox", htype="bbox") label = memory_ds.create_tensor("label", htype="class_label") video = memory_ds.create_tensor("video", htype="video") @@ -426,22 +427,18 @@ def test_dtype(memory_ds: Dataset): np_dtyped_tensor.append(np.ones((10, 10), dtype=MAX_FLOAT_DTYPE)) py_dtyped_tensor.append(np.ones((10, 10), dtype=MAX_FLOAT_DTYPE)) - # test auto upcasting - np_dtyped_tensor.append(np.ones((10, 10), dtype="float32")) - py_dtyped_tensor.append(np.ones((10, 10), dtype="float32")) - - with pytest.raises(TensorDtypeMismatchError): - tensor.append(np.ones((10, 10), dtype="float64")) - - with pytest.raises(TensorDtypeMismatchError): - dtyped_tensor.append(np.ones((10, 10), dtype="uint64") * 256) - assert tensor.dtype == np.float32 assert dtyped_tensor.dtype == np.uint8 assert np_dtyped_tensor.dtype == MAX_FLOAT_DTYPE assert py_dtyped_tensor.dtype == MAX_FLOAT_DTYPE +@pytest.mark.xfail(raises=TensorDtypeMismatchError, strict=True) +def test_dtype_mismatch(memory_ds: Dataset): + tensor = memory_ds.create_tensor("tensor", dtype="float16") + tensor.append(np.ones(100, dtype="uint8")) + + @pytest.mark.xfail(raises=TypeError, strict=True) def test_fails_on_wrong_tensor_syntax(memory_ds): memory_ds.some_tensor = np.ones((28, 28)) @@ -474,55 +471,13 @@ def test_hub_cloud_dataset(): ds.delete() -def test_iter_perf(memory_ds: Dataset): - orig_searchsorted = np.searchsorted - call_count = {"n": 0} - callers = [] - - def searchsorted(*args, **kwargs): - import inspect - - callers.append(inspect.stack()[1][3]) - call_count["n"] += 1 - return orig_searchsorted(*args, **kwargs) - - ds = memory_ds - ds.create_tensor("x") - ds.create_tensor("y") - for _ in range(10): - ds["x"].append(np.zeros((10, 10))) - ds["y"].append(np.ones((10, 10))) - - np.searchsorted = searchsorted - for i, sub_ds in enumerate(ds): - np.testing.assert_array_equal(sub_ds["x"].numpy(), np.zeros((10, 10))) - np.testing.assert_array_equal(sub_ds["y"].numpy(), np.ones((10, 10))) - - assert call_count["n"] == 40 - - np.searchsorted = orig_searchsorted - - -def test_array_interface(memory_ds: Dataset): - tensor = memory_ds.create_tensor("tensor") - x = np.random.random((32, 32)) - tensor.append(x) - arr1 = np.array(tensor) - arr2 = np.array(tensor) - np.testing.assert_array_equal(x, arr1[0]) - np.testing.assert_array_equal(x, arr2[0]) - assert arr1.__array_interface__["data"][0] == arr1.__array_interface__["data"][0] - tensor.append(x) - np.testing.assert_array_equal(tensor.numpy(), np.concatenate([arr1, arr2])) - - @parametrize_all_dataset_storages def test_hub_dataset_suffix_bug(ds): # creating dataset with similar name but some suffix removed from end ds2 = Dataset(ds.path[:-1]) ds2.delete() - + def test_empty_dataset(): with CliRunner().isolated_filesystem(): ds = Dataset("test") From 868004ed8f6d267eb73fabac02d82dcbe0b1e20d Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 04:00:13 +0530 Subject: [PATCH 60/79] revert ChunkEngine.numpy --- hub/core/chunk_engine.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff 
--git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index 56b760d142..d73cb3f86c 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -343,22 +343,30 @@ def numpy( Returns: Union[np.ndarray, Sequence[np.ndarray]]: Either a list of numpy arrays or a single numpy array (depending on the `aslist` argument). """ + length = self.num_samples enc = self.chunk_id_encoder last_shape = None samples = [] - for chunk_id, local_sample_index in enc.iter(index.values[0].value): - chunk = self.get_chunk_from_id(chunk_id) - sample = self.read_sample_from_chunk(chunk, local_sample_index) + for global_sample_index in index.values[0].indices(length): + chunk_id = enc[global_sample_index] + chunk_name = ChunkIdEncoder.name_from_id(chunk_id) + chunk_key = get_chunk_key(self.key, chunk_name) + chunk = self.cache.get_cachable(chunk_key, Chunk) + sample = self.read_sample_from_chunk(global_sample_index, chunk) shape = sample.shape + if not aslist and last_shape is not None: if shape != last_shape: raise DynamicTensorNumpyError(self.key, index, "shape") + samples.append(sample) last_shape = shape + return _format_samples(samples, index, aslist) + def get_chunk_from_id(self, chunk_id: int) -> Chunk: chunk_name = ChunkIdEncoder.name_from_id(chunk_id) chunk_key = get_chunk_key(self.key, chunk_name) From c7a1321eff384cd06845d95bd433652d73168b15 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 04:01:49 +0530 Subject: [PATCH 61/79] revert read_sample_from_chunk --- hub/core/chunk_engine.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index d73cb3f86c..05e9baf9fb 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -373,15 +373,16 @@ def get_chunk_from_id(self, chunk_id: int) -> Chunk: return self.cache.get_cachable(chunk_key, Chunk) def read_sample_from_chunk( - self, chunk: Chunk, local_sample_index: int + self, global_sample_index: int, chunk: Chunk ) -> np.ndarray: - """Read a sample from a chunk, given the local index. Handles decompressing if applicable.""" + """Read a sample from a chunk, converts the global index into a local index. 
Handles decompressing if applicable.""" - expect_compressed = self.tensor_meta.sample_compression is not None + expect_compressed = self.tensor_meta.sample_compression != UNCOMPRESSED dtype = self.tensor_meta.dtype enc = self.chunk_id_encoder + local_sample_index = enc.get_local_sample_index(global_sample_index) shape = chunk.shapes_encoder[local_sample_index] sb, eb = chunk.byte_positions_encoder[local_sample_index] From 6519986f22f451a17ea6f5e40f50635e464b0535 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 04:02:51 +0530 Subject: [PATCH 62/79] rem unreachable --- hub/core/chunk_engine.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index 05e9baf9fb..c9f1348bbe 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -366,12 +366,6 @@ def numpy( return _format_samples(samples, index, aslist) - - def get_chunk_from_id(self, chunk_id: int) -> Chunk: - chunk_name = ChunkIdEncoder.name_from_id(chunk_id) - chunk_key = get_chunk_key(self.key, chunk_name) - return self.cache.get_cachable(chunk_key, Chunk) - def read_sample_from_chunk( self, global_sample_index: int, chunk: Chunk ) -> np.ndarray: From 0ddc61bb782d9d96806079d5fdb863e2d5861787 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 04:09:03 +0530 Subject: [PATCH 63/79] remove iter logic --- hub/core/meta/encode/chunk_id.py | 405 +++++-------------------------- 1 file changed, 63 insertions(+), 342 deletions(-) diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index def87fe2ae..af828ba4d7 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -3,12 +3,10 @@ import hub from hub.core.storage.cachable import Cachable from io import BytesIO -from typing import Optional, Tuple, Union, List, Iterable +from typing import Optional, Tuple import numpy as np from uuid import uuid4 from hub.core.serialize import serialize_chunkids, deserialize_chunkids -from hub.core.index import IndexEntry -import math # these constants are for accessing the data layout. see the `ChunkIdEncoder` docstring. @@ -17,7 +15,7 @@ class ChunkIdEncoder(Cachable): - def __init__(self, ids=None): + def __init__(self): """Custom compressor that allows reading of chunk IDs from a sample index without decompressing. Chunk IDs: @@ -71,41 +69,11 @@ def __init__(self, ids=None): Then, you get the left-most column and that is your chunk ID! 
""" - self._buffer: List[List[int]] = [] - self._data: List[np.ndarray] = [] if ids is None else [ids] - self._num_chunks = sum(map(len, self._data)) - - self._prev_sample_index: Optional[int] = None - self._prev_chunk_id: Optional[int] = None - self._prev_chunk_index: Optional[Tuple[int, int]] = None - self._prev_entry: Optional[Union[np.ndarray, List[int]]] = None - - def _flush_buffer(self): - if self._buffer: - self._data.append(np.array(self._buffer, dtype=ENCODING_DTYPE)) - if self._prev_chunk_index and self._prev_chunk_index[0] < 0: - self._prev_chunk_index = (len(self._data) - 1, self._prev_chunk_index[1]) - self._buffer.clear() - - def _get_2d_idx(self, idx: int) -> Tuple[int, int]: - i = 0 - data = self._data - while True: - try: - num_data_i = len(data[i]) - except IndexError: # slightly faster than checking i < len(self._data) in a loop - return -1, idx - if num_data_i <= idx: - idx -= num_data_i - i += 1 - else: - break - return i, idx + + self._encoded_ids = None def tobytes(self) -> memoryview: - self._flush_buffer() - encoded = serialize_chunkids(hub.__version__, self._data) - return encoded + return serialize_chunkids(hub.__version__, [self._encoded_ids]) @staticmethod def name_from_id(id: ENCODING_DTYPE) -> str: @@ -123,79 +91,28 @@ def id_from_name(name: str) -> ENCODING_DTYPE: def get_name_for_chunk(self, chunk_index: int) -> str: """Gets the name for the chunk at index `chunk_index`. If you need to get the name for a chunk from a sample index, instead use `__getitem__`, then `name_from_id`.""" - chunk_id = self.get_entry(chunk_index)[CHUNK_ID_INDEX] + + chunk_id = self._encoded_ids[:, CHUNK_ID_INDEX][chunk_index] return ChunkIdEncoder.name_from_id(chunk_id) @classmethod def frombuffer(cls, buffer: bytes): + instance = cls() version, ids = deserialize_chunkids(buffer) - return cls(ids) + instance._encoded_ids = ids + return instance @property def num_chunks(self) -> int: - return self._num_chunks - - def get_entry(self, idx: int): - x, y = self._get_2d_idx(idx) - return self._buffer[y] if x < 0 else self._data[x][y] - - def _get_entry_2d(self, x: int, y: int): - return self._buffer[y] if x < 0 else self._data[x][y] - - def _decr_2d(self, x: int, y: int) -> Tuple[int, int]: - if x < 0: - if y: - return x, y - 1 - return len(self._data) - 1, len(self._data[-1]) - 1 - if y: - return x, y - 1 - if x: - x -= 1 - return x, len(self._data[x]) - 1 - raise IndexError() - - def _incr_2d(self, x: int, y: int) -> Tuple[int, int]: - if x < 0: - return x, y + 1 - if y == len(self._data[x]) - 1: - if x == len(self._data) - 1: - return -1, 0 - return x + 1, 0 - return x, y + 1 - - def _is_origin(self, x: int, y: int) -> bool: - if not x and not y: - return True - if x < 0 and not self._data and not y: - return True - return False - - @property - def last_entry(self) -> Union[np.ndarray, List[int]]: - if self._buffer: - return self._buffer[-1] - if self._data: - return self._data[-1][-1] - return None - - @property - def last_index(self) -> int: - last_entry = self.last_entry - if last_entry is None: - return -1 - return int(last_entry[LAST_INDEX_INDEX]) + if self._encoded_ids is None: + return 0 + return len(self._encoded_ids) @property def num_samples(self) -> int: - if self._buffer: - return int(self._buffer[-1][LAST_INDEX_INDEX] + 1) - elif self._data: - return int(self._data[-1][-1, LAST_INDEX_INDEX] + 1) - return 0 - - @property - def empty(self) -> bool: - return not self._buffer and not self._data + if self._encoded_ids is None: + return 0 + return int(self._encoded_ids[-1, 
LAST_INDEX_INDEX] + 1) def generate_chunk_id(self) -> ENCODING_DTYPE: """Generates a random 64bit chunk ID using uuid4. Also prepares this ID to have samples registered to it. @@ -204,9 +121,21 @@ def generate_chunk_id(self) -> ENCODING_DTYPE: Returns: ENCODING_DTYPE: The random chunk ID. """ + id = ENCODING_DTYPE(uuid4().int >> UUID_SHIFT_AMOUNT) - self._buffer.append([id, self.last_index]) - self._num_chunks += 1 + + if self.num_samples == 0: + self._encoded_ids = np.array([[id, -1]], dtype=ENCODING_DTYPE) + + else: + last_index = self.num_samples - 1 + + new_entry = np.array( + [[id, last_index]], + dtype=ENCODING_DTYPE, + ) + self._encoded_ids = np.concatenate([self._encoded_ids, new_entry]) + return id def register_samples_to_last_chunk_id(self, num_samples: int): @@ -221,14 +150,15 @@ def register_samples_to_last_chunk_id(self, num_samples: int): ChunkIdEncoderError: Must call `generate_chunk_id` before registering samples. ChunkIdEncoderError: `num_samples` can only be 0 if it is able to be a sample continuation accross chunks. """ + if num_samples < 0: raise ValueError( f"Cannot register negative num samples. Got: {num_samples}" ) - if self.empty: + if self.num_samples == 0: raise ChunkIdEncoderError( - f"Cannot register samples because no chunk IDs exist. {self._buffer}, {self._data}" + "Cannot register samples because no chunk IDs exist." ) if num_samples == 0 and self.num_chunks < 2: @@ -236,14 +166,12 @@ def register_samples_to_last_chunk_id(self, num_samples: int): "Cannot register 0 num_samples (signifying a partial sample continuing the last chunk) when no last chunk exists." ) - last_entry = self.last_entry - if self._buffer: - last_entry[LAST_INDEX_INDEX] += num_samples - else: - err = np.geterr()["over"] - np.seterr(over="ignore") - last_entry[LAST_INDEX_INDEX] += ENCODING_DTYPE(num_samples) - np.seterr(over=err) + current_entry = self._encoded_ids[-1] + + # this operation will trigger an overflow for the first addition, so supress the warning + np.seterr(over="ignore") + current_entry[LAST_INDEX_INDEX] += ENCODING_DTYPE(num_samples) + np.seterr(over="warn") def get_local_sample_index(self, global_sample_index: int) -> int: """Converts `global_sample_index` into a new index that is relative to the chunk the sample belongs to. @@ -273,38 +201,34 @@ def get_local_sample_index(self, global_sample_index: int) -> int: int: local index value between 0 and the amount of samples the chunk contains - 1. """ - return self.get(global_sample_index, return_local_sample_index=True)[1] # type: ignore - - def __getitem__(self, sample_index: int) -> int: - return self.get(sample_index) # type: ignore - - def get( - self, - sample_index: int, - return_chunk_index: bool = False, - return_local_sample_index: bool = False, - ) -> Union[ - int, - Tuple[int, Tuple[int, int]], - Tuple[int, Tuple[int, int], int], - Tuple[int, int], - ]: + _, chunk_index = self.__getitem__(global_sample_index, return_chunk_index=True) # type: ignore + + if chunk_index == 0: + return global_sample_index + + current_entry = self._encoded_ids[chunk_index - 1] # type: ignore + last_num_samples = current_entry[LAST_INDEX_INDEX] + 1 + + return int(global_sample_index - last_num_samples) + + def __getitem__( + self, sample_index: int, return_chunk_index: bool = False + ) -> Tuple[ENCODING_DTYPE, Optional[int]]: """Get the ID for the chunk that `sample_index` is stored in. To get the name of the chunk, use `name_from_id`. Args: sample_index (int): Global index (relative to the tensor). 
This will be converted to the local chunk index. - return_chunk_index (bool): If True, a tuple of 2 ints representing the chunks index is returned along with the chunk id. - return_local_sample_index (bool): If True, the local index of the sample within the chunk is returned along with the chunk id. + return_chunk_index (bool): If True, 2 values are returned, the second one being the chunk's index. Defaults to False. Raises: IndexError: If no samples exist or `sample_index` exceeds the available indices. Returns: - Union[int, Tuple[int, Tuple[int, int]], Tuple[int, int], Tuple[int, Tuple[int, int], int]]: Returns either just the chunk id - or a tuple containing the chunk id and one or both of the chunk index and local sample index based on the `return_chunk_index` - and `return_local_sample_index` arguments. + Tuple[Tuple[ENCODING_DTYPE], Optional[Tuple[int]]]: Returns the chunk ID for `sample_index`. If `return_chunk_index` is True, + there will be 2 values. The second one being the chunk's index. """ + if self.num_samples == 0: raise IndexError( f"Index {sample_index} is out of bounds for an empty chunk names encoding." @@ -313,214 +237,11 @@ def get( if sample_index < 0: sample_index = (self.num_samples) + sample_index - chunk_id = None - if ( - self._prev_sample_index is not None - and sample_index >= self._prev_sample_index - ): - if sample_index <= self._prev_entry[LAST_INDEX_INDEX]: # type: ignore - chunk_id = self._prev_chunk_id - chunk_index = self._prev_chunk_index - current_entry = self._prev_entry - else: - next_index = self._incr_2d(*self._prev_chunk_index) # type: ignore - next_entry = self._get_entry_2d(*next_index) - if sample_index <= next_entry[LAST_INDEX_INDEX]: - chunk_index = next_index - current_entry = next_entry - chunk_id = current_entry[CHUNK_ID_INDEX] - - if chunk_id is None: - self._flush_buffer() - last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._data] - shard_index = np.searchsorted(last_idxs, sample_index) - shard = self._data[shard_index] - idx = np.searchsorted(shard[:, LAST_INDEX_INDEX], sample_index) - current_entry = shard[idx] - chunk_id = current_entry[CHUNK_ID_INDEX] - chunk_index = (shard_index, idx) - - self._prev_sample_index = sample_index - self._prev_chunk_index = chunk_index - self._prev_entry = current_entry - self._prev_chunk_id = chunk_id - - if not return_chunk_index and not return_local_sample_index: - return chunk_id - ret = [chunk_id] + idx = np.searchsorted(self._encoded_ids[:, LAST_INDEX_INDEX], sample_index) + id = self._encoded_ids[idx, CHUNK_ID_INDEX] + chunk_index = idx + if return_chunk_index: - ret.append(chunk_index) - if return_local_sample_index: - if any(chunk_index): # type: ignore - prev_entry = self._get_entry_2d(*self._decr_2d(*chunk_index)) # type: ignore - local_sample_index = ( - sample_index - int(prev_entry[LAST_INDEX_INDEX]) - 1 - ) - else: - local_sample_index = sample_index - ret.append(local_sample_index) - - return tuple(ret) # type: ignore - - def _preproc_slice(self, index: slice) -> Tuple[int, int, int, int, bool]: - start = 0 if index.start is None else index.start - stop = self.num_samples if index.stop is None else index.stop - step = 1 if index.step is None else index.step - assert isinstance(start, int) - assert isinstance(stop, int) - assert isinstance(step, int) - if start < 0: - start += self.num_samples - if stop < 0: - stop += self.num_samples - assert step != 0 - if step > 0: - total = math.ceil((stop - start) / step) - forward = True - else: - step = -step - total = math.ceil((stop - 
start) / step) - start, stop = stop - 1, start - forward = False - return start, stop, step, total, forward - - def _iter_forward( - self, - chunk_id: int, - shard_index: int, - chunk_index: int, - local_sample_index: int, - total: int, - step: int, - ) -> Iterable[Tuple[int, int]]: - n = 0 - ctr = Counter(step) - shard = self._data[shard_index] - last_index = int(shard[chunk_index, LAST_INDEX_INDEX]) - for i in range(local_sample_index + 1, last_index + 1): - if ctr(): - yield chunk_id, i - n += 1 - if n == total: - return - for chunk_index in range(chunk_index + 1, len(shard)): - entry = shard[chunk_index] - chunk_id = entry[CHUNK_ID_INDEX] - new_last_index = int(entry[LAST_INDEX_INDEX]) - for i in range(new_last_index - last_index): - if ctr(): - yield chunk_id, i - n += 1 - if n == total: - return - last_index = new_last_index - for shard_index in range(shard_index + 1, len(self._data)): - shard = self._data[shard_index] - for entry in shard: - chunk_id = entry[CHUNK_ID_INDEX] - new_last_index = int(entry[LAST_INDEX_INDEX]) - for i in range(new_last_index - last_index): - if ctr(): - yield chunk_id, i - n += 1 - if n == total: - return - last_index = new_last_index - - def _iter_reverse( - self, - chunk_id: int, - shard_index: int, - chunk_index: int, - local_sample_index: int, - total: int, - step: int, - ) -> Iterable[Tuple[int, int]]: - n = 0 - ctr = Counter(step) - shard = self._data[shard_index] - last_index = int(shard[chunk_index, LAST_INDEX_INDEX]) - for local_sample_index in range(local_sample_index - 1, -1, -1): - if ctr(): - yield chunk_id, local_sample_index - n += 1 - if n == total: - return - for chunk_index in range(chunk_index - 1, -1, -1): - entry = shard[chunk_index] - chunk_id = entry[CHUNK_ID_INDEX] - last_index = entry[LAST_INDEX_INDEX] - if chunk_index: - last_index -= shard[chunk_id - 1, LAST_INDEX_INDEX] - elif shard_index: - last_index -= self._data[shard_index - 1][-1, LAST_INDEX_INDEX] - for local_sample_index in range(last_index, -1, -1): - if ctr(): - yield chunk_id, local_sample_index - n += 1 - if n == total: - return - for shard_index in range(shard_index - 1, -1, -1): - shard = self._data[shard_index] - for chunk_index in range(len(shard) - 1, -1, -1): - entry = shard[chunk_index] - chunk_id = entry[CHUNK_ID_INDEX] - last_index = entry[LAST_INDEX_INDEX] - if chunk_index: - last_index -= shard[chunk_id - 1, LAST_INDEX_INDEX] - elif shard_index: - last_index -= self._data[shard_index - 1][-1, LAST_INDEX_INDEX] - for local_sample_index in range(last_index, -1, -1): - if ctr(): - yield chunk_id, local_sample_index - n += 1 - if n == total: - return - - def iter( - self, index: Union[int, slice, tuple] = slice(None) - ) -> Iterable[Tuple[int, int]]: - if isinstance(index, int): - yield self.get(index, return_local_sample_index=True) # type: ignore - elif isinstance(index, slice): - start, stop, step, total, forward = self._preproc_slice(index) - if not total: - return - self._flush_buffer() - if start: - chunk_id, (shard_index, chunk_index), local_sample_index = self.get( # type: ignore - start, return_chunk_index=True, return_local_sample_index=True - ) - shard = self._data[shard_index] - else: - shard_index = 0 - chunk_index = 0 - shard = self._data[0] - local_sample_index = 0 - chunk_id = shard[0, CHUNK_ID_INDEX] - yield chunk_id, local_sample_index - if total == 1: - return - iter_f = self._iter_forward if forward else self._iter_reverse - for chunk_id, local_sample_index in iter_f( - chunk_id, shard_index, chunk_index, local_sample_index, total - 1, 
step - ): - yield chunk_id, local_sample_index - elif isinstance(index, tuple): - for i in index: - # Random access - yield self.get(i, return_local_sample_index=True) # type: ignore - - -class Counter: - # TODO: refac this - def __init__(self, n: int) -> None: - self.n = n - self.i = 0 - - def __call__(self): - self.i += 1 - if self.i == self.n: - self.i = 0 - return True - return False + return id, chunk_index + + return id From 609ea67cd161125e0da3ffc9905873d5985fc18f Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 04:10:18 +0530 Subject: [PATCH 64/79] revert pytorch.py --- hub/integrations/pytorch/pytorch.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/hub/integrations/pytorch/pytorch.py b/hub/integrations/pytorch/pytorch.py index 95f33387f1..596073305f 100644 --- a/hub/integrations/pytorch/pytorch.py +++ b/hub/integrations/pytorch/pytorch.py @@ -253,10 +253,10 @@ def _generate_shared_memory_names(self, chunk_names: Set[str]): ls.append(f"al_{self.last_chunk_num_generated}") return ls - def _numpy_from_chunk(self, chunk, key: str, local_index: int): + def _numpy_from_chunk(self, index: int, key: str, chunk): """Takes a list of chunks and returns a numpy array from it""" chunk_engine = self.all_chunk_engines[key] - value = chunk_engine.read_sample_from_chunk(chunk, local_index) + value = chunk_engine.read_sample_from_chunk(index, chunk) # typecast if incompatible with pytorch if value.dtype == "uint16": @@ -289,16 +289,14 @@ def _get_data_from_chunks( actual_index = self.index_offset + i # TODO change this once it returns list/set of str chunk_engine = self.all_chunk_engines[key] - chunk_id, local_index = chunk_engine.chunk_id_encoder.get( - actual_index, return_local_sample_index=True - ) + chunk_id = chunk_engine.chunk_id_encoder[actual_index] chunk_name = chunk_engine.chunk_id_encoder.name_from_id(chunk_id) # type: ignore if chunk_name not in chunk_map: self.last_index_meta[key] = i - 1 return chunk = chunk_map[chunk_name] self.all_index_value_maps[key][i] = self._numpy_from_chunk( - chunk, key, local_index + actual_index, key, chunk ) self.last_index_meta[key] = len(self) - 1 From 7823060c9cf561e86d41d381672c4601d182967d Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 04:12:48 +0530 Subject: [PATCH 65/79] revert dataset.py --- hub/api/dataset.py | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index dabd61e6ee..b6a802c217 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -1,6 +1,7 @@ +from hub.core.storage.provider import StorageProvider from hub.core.tensor import create_tensor -from hub.constants import DEFAULT_HTYPE -from typing import Callable, Dict, Optional, Union, Tuple, List +from typing import Callable, Dict, Optional, Union, Tuple, List, Sequence +from hub.constants import DEFAULT_HTYPE, UNSPECIFIED import numpy as np from hub.api.tensor import Tensor @@ -8,7 +9,6 @@ from hub.core.meta.dataset_meta import DatasetMeta -from hub.core.typing import StorageProvider from hub.core.index import Index from hub.integrations import dataset_to_tensorflow from hub.util.keys import dataset_exists, get_dataset_meta_key, tensor_exists @@ -143,10 +143,8 @@ def create_tensor( self, name: str, htype: str = DEFAULT_HTYPE, - chunk_size: int = None, - dtype: Union[str, np.dtype, type] = None, - sample_compression: str = None, - chunk_compression: str = None, + dtype: Union[str, np.dtype, type] = UNSPECIFIED, + sample_compression: 
str = UNSPECIFIED, **kwargs, ): """Creates a new tensor in the dataset. @@ -158,12 +156,8 @@ def create_tensor( For example, `htype="image"` would have `dtype` default to `uint8`. These defaults can be overridden by explicitly passing any of the other parameters to this function. May also modify the defaults for other parameters. - chunk_size (int): Optionally override this tensor's `chunk_size`. In short, `chunk_size` determines the - size of files (chunks) being created to represent this tensor's samples. - For more on chunking, check out `hub.core.chunk_engine.chunker`. dtype (str): Optionally override this tensor's `dtype`. All subsequent samples are required to have this `dtype`. - sample_compression (str): Optionally override this tensor's `sample_compression`. Only used when the incoming data is uncompressed. - chunk_compression (str): Optionally override this tensor's `chunk_compression`. Currently not implemented. + sample_compression (str): All samples will be compressed in the provided format. If `None`, samples are uncompressed. **kwargs: `htype` defaults can be overridden by passing any of the compatible parameters. To see all `htype`s and their correspondent arguments, check out `hub/htypes.py`. @@ -175,10 +169,6 @@ def create_tensor( NotImplementedError: If trying to override `chunk_compression`. """ - if chunk_compression is not None: - # TODO: implement chunk compression + update docstring - raise NotImplementedError("Chunk compression is not implemented yet!") - if tensor_exists(name, self.storage): raise TensorAlreadyExistsError(name) @@ -187,10 +177,8 @@ def create_tensor( name, self.storage, htype=htype, - chunk_size=chunk_size, dtype=dtype, sample_compression=sample_compression, - chunk_compression=chunk_compression, **kwargs, ) tensor = Tensor(name, self.storage) # type: ignore @@ -254,6 +242,7 @@ def read_only(self, value: bool): def pytorch( self, transform: Optional[Callable] = None, + tensors: Optional[Sequence[str]] = None, num_workers: int = 1, batch_size: Optional[int] = 1, drop_last: Optional[bool] = False, @@ -268,6 +257,7 @@ def pytorch( Args: transform (Callable, optional) : Transformation function to be applied to each sample. + tensors (List, optional): Optionally provide a list of tensor names in the ordering that your training script expects. For example, if you have a dataset that has "image" and "label" tensors, if `tensors=["image", "label"]`, your training script should expect each batch will be provided as a tuple of (image, label). num_workers (int): The number of workers to use for fetching data in parallel. batch_size (int, optional): Number of samples per batch to load. Default value is 1. drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size is not divisible by the batch size. 
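To make the new `tensors` argument concrete, here is a usage sketch. It is illustrative only: the dataset path, tensor names, and demo values are assumptions rather than part of this patch, and it presumes PyTorch is installed so `dataset_to_pytorch` can build its loader.

    import numpy as np
    from hub.api.dataset import Dataset

    ds = Dataset("./pytorch_order_demo")  # hypothetical local path
    ds.create_tensor("image", htype="image", sample_compression=None)
    ds.create_tensor("label", htype="class_label")
    ds.image.extend(np.ones((8, 28, 28), dtype=np.uint8))
    ds.label.extend(np.zeros(8, dtype=np.uint32))

    # tensors=["image", "label"] pins the batch ordering, so each batch
    # unpacks as an (image, label) tuple regardless of creation order.
    for image, label in ds.pytorch(tensors=["image", "label"], batch_size=4):
        assert len(image) == len(label)
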
@@ -286,6 +276,7 @@ def pytorch( return dataset_to_pytorch( self, transform, + tensors, num_workers=num_workers, batch_size=batch_size, drop_last=drop_last, From 390151ba5146bb1a5b437e1eb867c5e4f26772dc Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 04:22:39 +0530 Subject: [PATCH 66/79] reverts --- hub/api/tests/test_api.py | 58 ++++++++++++++++++++------------ hub/core/chunk_engine.py | 26 +++++--------- hub/core/meta/encode/chunk_id.py | 4 +++ 3 files changed, 50 insertions(+), 38 deletions(-) diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index ea28236487..5bb9b75751 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -1,4 +1,3 @@ -from hub.constants import UNCOMPRESSED import numpy as np import pytest import uuid @@ -7,7 +6,10 @@ from hub.api.dataset import Dataset from hub.core.tests.common import parametrize_all_dataset_storages from hub.tests.common import assert_array_lists_equal -from hub.util.exceptions import TensorDtypeMismatchError, TensorInvalidSampleShapeError +from hub.util.exceptions import ( + TensorDtypeMismatchError, + TensorInvalidSampleShapeError, +) from hub.client.client import HubBackendClient from hub.client.utils import has_hub_testing_creds from click.testing import CliRunner @@ -185,8 +187,7 @@ def test_empty_samples(ds: Dataset): actual_list = tensor.numpy(aslist=True) expected_list = [a1, *a2, a3, *a4] - assert tensor.meta.sample_compression == UNCOMPRESSED - assert tensor.meta.chunk_compression == UNCOMPRESSED + assert tensor.meta.sample_compression is None assert len(tensor) == 16 assert tensor.shape_interval.lower == (16, 0, 0, 2) @@ -217,30 +218,27 @@ def test_scalar_samples(ds: Dataset): tensor.append(-99) tensor.append(np.array(4)) - with pytest.raises(TensorDtypeMismatchError): - tensor.append(np.int16(4)) + tensor.append(np.int16(4)) with pytest.raises(TensorDtypeMismatchError): tensor.append(np.float32(4)) - with pytest.raises(TensorDtypeMismatchError): - tensor.append(np.uint8(3)) + tensor.append(np.uint8(3)) tensor.extend([10, 1, 4]) tensor.extend([1]) tensor.extend(np.array([1, 2, 3], dtype=MAX_INT_DTYPE)) - with pytest.raises(TensorDtypeMismatchError): - tensor.extend(np.array([4, 5, 33], dtype="int16")) + tensor.extend(np.array([4, 5, 33], dtype="int16")) - assert len(tensor) == 11 + assert len(tensor) == 16 - expected = np.array([5, 10, -99, 4, 10, 1, 4, 1, 1, 2, 3]) + expected = np.array([5, 10, -99, 4, 4, 3, 10, 1, 4, 1, 1, 2, 3, 4, 5, 33]) np.testing.assert_array_equal(tensor.numpy(), expected) assert tensor.numpy(aslist=True) == expected.tolist() - assert tensor.shape == (11,) + assert tensor.shape == (16,) # len(shape) for a scalar is `()`. 
len(shape) for [1] is `(1,)` with pytest.raises(TensorInvalidSampleShapeError): @@ -257,6 +255,7 @@ def test_sequence_samples(ds: Dataset): tensor.append([1, 2, 3]) tensor.extend([[4, 5, 6]]) + ds.clear_cache() assert len(tensor) == 2 @@ -384,7 +383,7 @@ def test_shape_property(memory_ds): def test_htype(memory_ds: Dataset): - image = memory_ds.create_tensor("image", htype="image") + image = memory_ds.create_tensor("image", htype="image", sample_compression="png") bbox = memory_ds.create_tensor("bbox", htype="bbox") label = memory_ds.create_tensor("label", htype="class_label") video = memory_ds.create_tensor("video", htype="video") @@ -427,18 +426,22 @@ def test_dtype(memory_ds: Dataset): np_dtyped_tensor.append(np.ones((10, 10), dtype=MAX_FLOAT_DTYPE)) py_dtyped_tensor.append(np.ones((10, 10), dtype=MAX_FLOAT_DTYPE)) + # test auto upcasting + np_dtyped_tensor.append(np.ones((10, 10), dtype="float32")) + py_dtyped_tensor.append(np.ones((10, 10), dtype="float32")) + + with pytest.raises(TensorDtypeMismatchError): + tensor.append(np.ones((10, 10), dtype="float64")) + + with pytest.raises(TensorDtypeMismatchError): + dtyped_tensor.append(np.ones((10, 10), dtype="uint64") * 256) + assert tensor.dtype == np.float32 assert dtyped_tensor.dtype == np.uint8 assert np_dtyped_tensor.dtype == MAX_FLOAT_DTYPE assert py_dtyped_tensor.dtype == MAX_FLOAT_DTYPE -@pytest.mark.xfail(raises=TensorDtypeMismatchError, strict=True) -def test_dtype_mismatch(memory_ds: Dataset): - tensor = memory_ds.create_tensor("tensor", dtype="float16") - tensor.append(np.ones(100, dtype="uint8")) - - @pytest.mark.xfail(raises=TypeError, strict=True) def test_fails_on_wrong_tensor_syntax(memory_ds): memory_ds.some_tensor = np.ones((28, 28)) @@ -471,13 +474,26 @@ def test_hub_cloud_dataset(): ds.delete() +def test_array_interface(memory_ds: Dataset): + tensor = memory_ds.create_tensor("tensor") + x = np.random.random((32, 32)) + tensor.append(x) + arr1 = np.array(tensor) + arr2 = np.array(tensor) + np.testing.assert_array_equal(x, arr1[0]) + np.testing.assert_array_equal(x, arr2[0]) + assert arr1.__array_interface__["data"][0] == arr1.__array_interface__["data"][0] + tensor.append(x) + np.testing.assert_array_equal(tensor.numpy(), np.concatenate([arr1, arr2])) + + @parametrize_all_dataset_storages def test_hub_dataset_suffix_bug(ds): # creating dataset with similar name but some suffix removed from end ds2 = Dataset(ds.path[:-1]) ds2.delete() - + def test_empty_dataset(): with CliRunner().isolated_filesystem(): ds = Dataset("test") diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index c9f1348bbe..583868839f 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -267,19 +267,7 @@ def _create_new_chunk(self): def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): """Formats a batch of `samples` and feeds them into `_append_bytes`.""" - self.get_last_chunk() - - uniform = False if isinstance(samples, np.ndarray): - uniform = True - elif isinstance(samples, Sequence): - if is_uniform_sequence(samples): - uniform = True - if not isinstance(samples[0], np.ndarray): - samples = np.array(samples) - else: - raise TypeError(f"Unsupported type for extending. 
Got: {type(samples)}") - if uniform: compression = self.tensor_meta.sample_compression if compression is None: buffers = [] @@ -305,11 +293,15 @@ def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): for sample_object in sample_objects: self.append(sample_object) - else: - for sample in samples: - self.append(sample) - self.cache.maybe_flush() + elif isinstance(samples, Sequence): + if is_uniform_sequence(samples): + self.extend(np.array(samples)) + else: + for sample in samples: + self.append(sample) + else: + raise TypeError(f"Unsupported type for extending. Got: {type(samples)}") def append(self, sample: SampleValue): """Formats a single `sample` (compresseses/decompresses if applicable) and feeds it into `_append_bytes`.""" @@ -371,7 +363,7 @@ def read_sample_from_chunk( ) -> np.ndarray: """Read a sample from a chunk, converts the global index into a local index. Handles decompressing if applicable.""" - expect_compressed = self.tensor_meta.sample_compression != UNCOMPRESSED + expect_compressed = self.tensor_meta.sample_compression is not None dtype = self.tensor_meta.dtype enc = self.chunk_id_encoder diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index af828ba4d7..0f5f3cf6bd 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -73,6 +73,8 @@ def __init__(self): self._encoded_ids = None def tobytes(self) -> memoryview: + if self._encoded_ids is None: + return b"" return serialize_chunkids(hub.__version__, [self._encoded_ids]) @staticmethod @@ -98,6 +100,8 @@ def get_name_for_chunk(self, chunk_index: int) -> str: @classmethod def frombuffer(cls, buffer: bytes): instance = cls() + if not buffer: + return instance version, ids = deserialize_chunkids(buffer) instance._encoded_ids = ids return instance From 77ca4f7b6a1bac8298a888104558c6d9d9faacfb Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 04:23:16 +0530 Subject: [PATCH 67/79] revert tensor.py --- hub/api/tensor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hub/api/tensor.py b/hub/api/tensor.py index 1f24783900..7de61a8372 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -218,4 +218,7 @@ def __str__(self): index_str = "" return f"Tensor(key={repr(self.key)}{index_str})" + def __array__(self) -> np.ndarray: + return self.numpy() + __repr__ = __str__ From e3ab3bf406b15f5ce59b23481f8ac3749285973f Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 04:36:29 +0530 Subject: [PATCH 68/79] reverts --- hub/core/chunk.py | 96 +++++++---------------------------------------- 1 file changed, 14 insertions(+), 82 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index a3f25d4e88..2ec4336f2c 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -1,7 +1,7 @@ from hub.util.exceptions import FullChunkError import hub from hub.core.storage.cachable import Cachable -from typing import List, Sequence, Tuple, Union +from typing import Sequence, Tuple, Union import numpy as np from io import BytesIO @@ -10,7 +10,6 @@ from hub.core.serialize import serialize_chunk, deserialize_chunk, infer_chunk_num_bytes - class Chunk(Cachable): def __init__( self, @@ -46,71 +45,17 @@ def __init__( self.shapes_encoder = ShapeEncoder(encoded_shapes) self.byte_positions_encoder = BytePositionsEncoder(encoded_byte_positions) - self._data: List[memoryview] = [] - self._num_data_bytes: int = 0 # replaces: sum(map(len, self._data)) - - if data is not None: - self._data.append(data) - self._num_data_bytes += len(data) - - def 
_get_2d_idx(self, byte_index: int) -> Tuple[int, int]: - """Converts `byte_index`, which is an index for a flattened stream of bytes, into a 2D index that can - be used for a list of byte streams of varying lengths. Used for accessing `self._data`, which is a list - of `memoryview`s. - - Args: - byte_index (int): Index over a flattened stream of bytes. - - Returns: - Tuple[int, int]: 2D index to be used to access `self._data`. - """ - i = 0 - data = self._data - while True: - try: - num_data_i = len(data[i]) - except IndexError: # slightly faster than checking i < len(self._data) in a loop - return i - 1, len(data[i - 1]) + byte_index - if num_data_i <= byte_index: - byte_index -= num_data_i - i += 1 - else: - break - return i, byte_index - - def view(self, start_byte: int, end_byte: int): - """Returns a sliced view of the chunk's data""" - if len(self._data) == 1: - return self._data[0][start_byte:end_byte] - - start2dx, start2dy = self._get_2d_idx(start_byte) - end2dx, end2dy = self._get_2d_idx(end_byte) - if start2dx == end2dx: - # Indexing to the same inner chunk, this would be fast - return self._data[start2dx][start2dy:end2dy] - - # build a list of memoryviews that contain the pieces we need for the output view - byts = [] - byts.append(self._data[start2dx][start2dy:]) - for i in range(start2dx + 1, end2dx): - byts.append(self._data[i]) - byts.append(self._data[end2dx][:end2dy]) - - buff = np.zeros(sum(map(len, byts)), dtype=np.byte) - offset = 0 - for byt in byts: - n = len(byt) - buff[offset : offset + n] = byt - offset += n - return memoryview(buff.tobytes()) + self._data: Union[memoryview, bytearray] = data or bytearray() @property - def num_samples(self): - return self.shapes_encoder.num_samples + def memoryview_data(self): + if isinstance(self._data, memoryview): + return self._data + return memoryview(self._data) @property def num_data_bytes(self): - return self._num_data_bytes + return len(self._data) def is_under_min_space(self, min_data_bytes_target: int) -> bool: """If this chunk's data is less than `min_data_bytes_target`, returns True.""" @@ -140,10 +85,11 @@ def append_sample(self, buffer: memoryview, max_data_bytes: int, shape: Tuple[in ) # `_data` will be a `memoryview` if `frombuffer` is called. + if isinstance(self._data, memoryview): + self._data = bytearray(self._data) # note: incoming_num_bytes can be 0 (empty sample) - self._data.append(buffer) - self._num_data_bytes += len(buffer) + self._data += buffer self.update_headers(incoming_num_bytes, shape) def update_headers(self, incoming_num_bytes: int, sample_shape: Tuple[int]): @@ -163,28 +109,14 @@ def update_headers(self, incoming_num_bytes: int, sample_shape: Tuple[int]): def __len__(self): """Calculates the number of bytes `tobytes` will be without having to call `tobytes`. 
Used by `LRUCache` to determine if this chunk can be cached.""" - return infer_chunk_num_bytes( - hub.__version__, - self.shapes_encoder.array, - self.byte_positions_encoder.array, - len_data=self.num_data_bytes, - ) + return infer_chunk_num_bytes(hub.__version__, self.shapes_encoder.array, self.byte_positions_encoder.array, len_data=len(self._data)) def tobytes(self) -> memoryview: - if self.num_samples == 0: - return memoryview(bytes()) - - return serialize_chunk( - hub.__version__, - self.shapes_encoder.array, - self.byte_positions_encoder.array, - self._data, - self.num_data_bytes, - ) + return serialize_chunk(hub.__version__, self.shapes_encoder.array, self.byte_positions_encoder.array, [self._data]) @classmethod - def frombuffer(cls, buffer: bytes) -> "Chunk": - if len(buffer) == 0: + def frombuffer(cls, buffer: bytes): + if not buffer: return cls() version, shapes, byte_positions, data = deserialize_chunk(buffer) return cls(shapes, byte_positions, data=data) From 0e5e84ce15cb9afdb70fbde51034e515d6bfa716 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 05:01:35 +0530 Subject: [PATCH 69/79] fixes --- hub/core/chunk.py | 15 ++++++++++-- hub/core/chunk_engine.py | 2 +- hub/core/meta/encode/chunk_id.py | 2 +- hub/core/serialize.py | 41 ++++++++++++++++++++++---------- 4 files changed, 43 insertions(+), 17 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index 2ec4336f2c..7aa81db641 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -10,6 +10,7 @@ from hub.core.serialize import serialize_chunk, deserialize_chunk, infer_chunk_num_bytes + class Chunk(Cachable): def __init__( self, @@ -109,10 +110,20 @@ def update_headers(self, incoming_num_bytes: int, sample_shape: Tuple[int]): def __len__(self): """Calculates the number of bytes `tobytes` will be without having to call `tobytes`. 
Used by `LRUCache` to determine if this chunk can be cached.""" - return infer_chunk_num_bytes(hub.__version__, self.shapes_encoder.array, self.byte_positions_encoder.array, len_data=len(self._data)) + return infer_chunk_num_bytes( + hub.__version__, + self.shapes_encoder.array, + self.byte_positions_encoder.array, + len_data=len(self._data), + ) def tobytes(self) -> memoryview: - return serialize_chunk(hub.__version__, self.shapes_encoder.array, self.byte_positions_encoder.array, [self._data]) + return serialize_chunk( + hub.__version__, + self.shapes_encoder.array, + self.byte_positions_encoder.array, + [self._data], + ) @classmethod def frombuffer(cls, buffer: bytes): diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index 583868839f..52f94cccd4 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -372,7 +372,7 @@ def read_sample_from_chunk( shape = chunk.shapes_encoder[local_sample_index] sb, eb = chunk.byte_positions_encoder[local_sample_index] - buffer = chunk.view(sb, eb) + buffer = chunk.memoryview_data[sb:eb] if expect_compressed: sample = decompress_array(buffer, shape) else: diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 0f5f3cf6bd..4fe06eec40 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -74,7 +74,7 @@ def __init__(self): def tobytes(self) -> memoryview: if self._encoded_ids is None: - return b"" + return memoryview(b"") return serialize_chunkids(hub.__version__, [self._encoded_ids]) @staticmethod diff --git a/hub/core/serialize.py b/hub/core/serialize.py index f238ceac02..b33944d52e 100644 --- a/hub/core/serialize.py +++ b/hub/core/serialize.py @@ -61,20 +61,35 @@ def serialize_chunk( offset = 1 + len_version # Write shape info - flatbuff[offset : offset + 8] = np.array(shape_info.shape, dtype=np.int32).view( - np.byte - ) - offset += 8 - flatbuff[offset : offset + shape_info.nbytes] = shape_info.reshape(-1).view(np.byte) - offset += shape_info.nbytes + if shape_info.ndim == 1: + assert shape_info.nbytes == 0 + flatbuff[offset : offset + 8] = np.zeros(8, dtype=np.byte) + offset += 8 + else: + assert shape_info.ndim == 2 + flatbuff[offset : offset + 8] = np.array(shape_info.shape, dtype=np.int32).view( + np.byte + ) + offset += 8 + flatbuff[offset : offset + shape_info.nbytes] = shape_info.reshape(-1).view( + np.byte + ) + offset += shape_info.nbytes # Write byte positions - flatbuff[offset : offset + 4] = np.int32(byte_positions.shape[0]).view((np.byte, 4)) - offset += 4 - flatbuff[offset : offset + byte_positions.nbytes] = byte_positions.reshape(-1).view( - np.byte - ) - offset += byte_positions.nbytes + if byte_positions.ndim == 1: + assert byte_positions.nbytes == 0 + flatbuff[offset : offset + 4] = np.zeros(4, dtype=np.byte) + offset += 4 + else: + flatbuff[offset : offset + 4] = np.int32(byte_positions.shape[0]).view( + (np.byte, 4) + ) + offset += 4 + flatbuff[offset : offset + byte_positions.nbytes] = byte_positions.reshape( + -1 + ).view(np.byte) + offset += byte_positions.nbytes # Write actual data for byts in data: @@ -133,7 +148,7 @@ def deserialize_chunk( offset += byte_positions_nbytes # Read data - data = buff[offset:].copy() + data = memoryview(buff[offset:].tobytes()) return version, shape_info, byte_positions, data From 0ee485d58425004c8d50ce7be26ade9568abe6b7 Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Tue, 13 Jul 2021 16:33:25 -0700 Subject: [PATCH 70/79] add chunk size tests --- hub/api/tests/test_chunk_sizes.py | 122 
++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 hub/api/tests/test_chunk_sizes.py diff --git a/hub/api/tests/test_chunk_sizes.py b/hub/api/tests/test_chunk_sizes.py new file mode 100644 index 0000000000..da006bfdd5 --- /dev/null +++ b/hub/api/tests/test_chunk_sizes.py @@ -0,0 +1,122 @@ +import numpy as np +from hub.constants import KB +from hub.core.tests.common import parametrize_all_dataset_storages + + +def _update_chunk_sizes(ds, max_chunk_size: int): + """Updates all chunk sizes for tensors that already exist in `ds`. If + more tensors are created after calling this method, those tensors will NOT have + the same chunk size. + """ + + # TODO: set / update chunk sizes API (to replace this function) + + min_chunk_size = max_chunk_size // 2 + + for tensor in ds.tensors.values(): + chunk_engine = tensor.chunk_engine + + chunk_engine.max_chunk_size = max_chunk_size + chunk_engine.min_chunk_size = min_chunk_size + + +def _assert_num_chunks(tensor, expected_num_chunks): + chunk_engine = tensor.chunk_engine + actual_num_chunks = chunk_engine.chunk_id_encoder.num_chunks + assert actual_num_chunks == expected_num_chunks + + +def _create_tensors(ds): + images = ds.create_tensor("images", htype="image", sample_compression=None) + labels = ds.create_tensor("labels", htype="class_label") + return images, labels + + +def _append_tensors(images, labels): + for i in range(100): + x = np.ones((28, 28), dtype=np.uint8) * i + y = np.uint32(i) + + images.append(x) + labels.append(y) + + +def _extend_tensors(images, labels): + images.extend(np.ones((100, 28, 28), dtype=np.uint8)) + labels.extend(np.ones(100, dtype=np.uint32)) + + +@parametrize_all_dataset_storages +def test_append(ds): + images, labels = _create_tensors(ds) + _update_chunk_sizes(ds, 32 * KB) + + _append_tensors(images, labels) + + _assert_num_chunks(labels, 1) + _assert_num_chunks(images, 5) + + _append_tensors(images, labels) + + _assert_num_chunks(labels, 1) + _assert_num_chunks(images, 10) + + _append_tensors(images, labels) + + _assert_num_chunks(labels, 1) + _assert_num_chunks(images, 15) + + assert len(ds) == 300 + + +@parametrize_all_dataset_storages +def test_extend(ds): + images, labels = _create_tensors(ds) + + _update_chunk_sizes(ds, 32 * KB) + + _extend_tensors(images, labels) + + _assert_num_chunks(labels, 1) + _assert_num_chunks(images, 5) + + _extend_tensors(images, labels) + + _assert_num_chunks(labels, 1) + _assert_num_chunks(images, 10) + + _extend_tensors(images, labels) + + _assert_num_chunks(labels, 1) + _assert_num_chunks(images, 15) + + assert len(ds) == 300 + + +@parametrize_all_dataset_storages +def test_extend_and_append(ds): + images, labels = _create_tensors(ds) + + _update_chunk_sizes(ds, 32 * KB) + + _extend_tensors(images, labels) + + _assert_num_chunks(labels, 1) + _assert_num_chunks(images, 5) + + _append_tensors(images, labels) + + _assert_num_chunks(labels, 1) + _assert_num_chunks(images, 10) + + _extend_tensors(images, labels) + + _assert_num_chunks(labels, 1) + _assert_num_chunks(images, 15) + + _append_tensors(images, labels) + + _assert_num_chunks(labels, 1) + _assert_num_chunks(images, 20) + + assert len(ds) == 400 From d16d550108e59143287a8ef85024ff156ac4dc21 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 12:40:18 +0530 Subject: [PATCH 71/79] fixes --- hub/api/tests/test_api.py | 2 +- hub/core/meta/encode/chunk_id.py | 8 ++++-- hub/core/serialize.py | 48 +++++++++++++++++++++----------- 3 files changed, 38 insertions(+), 20 deletions(-) diff 
--git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 5bb9b75751..55fa85ddef 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -26,7 +26,7 @@ def test_persist_local(local_storage): ds.image.extend(np.ones((4, 224, 224, 3))) ds_new = Dataset(local_storage.root) - assert len(ds_new) == 4 + assert len(ds_new) == 4, (ds_new.image.chunk_engine.chunk_id_encoder._encoded_ids,) assert ds_new.image.shape == (4, 224, 224, 3) np.testing.assert_array_equal(ds_new.image.numpy(), np.ones((4, 224, 224, 3))) diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 4fe06eec40..c6a16d609f 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -74,7 +74,7 @@ def __init__(self): def tobytes(self) -> memoryview: if self._encoded_ids is None: - return memoryview(b"") + return serialize_chunkids(hub.__version__, [np.array([], dtype=ENCODING_DTYPE)]) return serialize_chunkids(hub.__version__, [self._encoded_ids]) @staticmethod @@ -103,7 +103,8 @@ def frombuffer(cls, buffer: bytes): if not buffer: return instance version, ids = deserialize_chunkids(buffer) - instance._encoded_ids = ids + if ids.nbytes: + instance._encoded_ids = ids return instance @property @@ -116,7 +117,8 @@ def num_chunks(self) -> int: def num_samples(self) -> int: if self._encoded_ids is None: return 0 - return int(self._encoded_ids[-1, LAST_INDEX_INDEX] + 1) + return int(self._encoded_ids[-1, LAST_INDEX_INDEX]) + 1 + def generate_chunk_id(self) -> ENCODING_DTYPE: """Generates a random 64bit chunk ID using uuid4. Also prepares this ID to have samples registered to it. diff --git a/hub/core/serialize.py b/hub/core/serialize.py index b33944d52e..6ee63d6fc9 100644 --- a/hub/core/serialize.py +++ b/hub/core/serialize.py @@ -96,7 +96,13 @@ def serialize_chunk( n = len(byts) flatbuff[offset : offset + n] = np.frombuffer(byts, dtype=np.byte) offset += n - return memoryview(flatbuff.tobytes()) + ret = flatbuff.tobytes() + v, s, b, d = deserialize_chunk(ret) + assert v == version + np.testing.assert_array_equal(s, shape_info) + np.testing.assert_array_equal(b, byte_positions) + assert bytes(d) == bytes(data[0]) + return bytes(ret) def deserialize_chunk( @@ -127,25 +133,31 @@ def deserialize_chunk( shape_info_shape = buff[offset : offset + 8].view(np.int32) offset += 8 shape_info_nbytes = np.prod(shape_info_shape) * enc_dtype.itemsize - shape_info = ( - buff[offset : offset + shape_info_nbytes] - .view(enc_dtype) - .reshape(shape_info_shape) - .copy() - ) - offset += shape_info_nbytes + if shape_info_nbytes == 0: + shape_info = np.array([], dtype=enc_dtype) + else: + shape_info = ( + buff[offset : offset + shape_info_nbytes] + .view(enc_dtype) + .reshape(shape_info_shape) + .copy() + ) + offset += shape_info_nbytes # Read byte positions byte_positions_rows = buff[offset : offset + 4].view(np.int32)[0] offset += 4 byte_positions_nbytes = byte_positions_rows * 3 * enc_dtype.itemsize - byte_positions = ( - buff[offset : offset + byte_positions_nbytes] - .view(enc_dtype) - .reshape(byte_positions_rows, 3) - .copy() - ) - offset += byte_positions_nbytes + if byte_positions_nbytes == 0: + byte_positions = np.array([], dtype=enc_dtype) + else: + byte_positions = ( + buff[offset : offset + byte_positions_nbytes] + .view(enc_dtype) + .reshape(byte_positions_rows, 3) + .copy() + ) + offset += byte_positions_nbytes # Read data data = memoryview(buff[offset:].tobytes()) @@ -177,7 +189,11 @@ def serialize_chunkids(version: str, ids: Sequence[np.ndarray]) -> 
memoryview: flatbuff[offset : offset + arr.nbytes] = arr.view(np.byte).reshape(-1) offset += arr.nbytes - return memoryview(flatbuff.tobytes()) + ret = memoryview(flatbuff.tobytes()) + v, ids2 = deserialize_chunkids(ret) + assert v == version + np.testing.assert_array_equal(ids[0].reshape(-1, 2), ids2) + return ret def deserialize_chunkids(byts: Union[bytes, memoryview]) -> Tuple[str, np.ndarray]: From 1b0973a7474dee284220c1ddfb36b6bb2c7f8793 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 13:00:52 +0530 Subject: [PATCH 72/79] fixes --- hub/core/chunk_engine.py | 63 ++++++++++++++++++-------------- hub/core/meta/encode/chunk_id.py | 5 ++- hub/core/serialize.py | 14 +------ 3 files changed, 40 insertions(+), 42 deletions(-) diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index 52f94cccd4..11fe63e259 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -114,10 +114,8 @@ def __init__( # only the last chunk may be less than this self.min_chunk_size = self.max_chunk_size // 2 - self.get_chunk_id_encoder() - self.get_last_chunk() - - def get_chunk_id_encoder(self) -> ChunkIdEncoder: + @property + def chunk_id_encoder(self) -> ChunkIdEncoder: """Gets the chunk id encoder from cache, if one is not found it creates a blank encoder. For more information on what `ChunkIdEncoder` is used for, see the `__init__` docstring. @@ -130,40 +128,49 @@ def get_chunk_id_encoder(self) -> ChunkIdEncoder: """ key = get_chunk_id_encoder_key(self.key) - if key in self.cache: - self.chunk_id_encoder = self.cache.get_cachable(key, ChunkIdEncoder) + if not self.chunk_id_encoder_exists: - else: # 1 because we always update the meta information before writing the samples (to account for potentially corrupted data in the future) if self.tensor_meta.length > 1: raise CorruptedMetaError( f"Tensor length is {self.tensor_meta.length}, but could not find the chunk id encoder." 
) - self.chunk_id_encoder = ChunkIdEncoder() - self.cache[key] = self.chunk_id_encoder + enc = ChunkIdEncoder() + self.cache[key] = enc + return enc - return self.chunk_id_encoder + enc = self.cache.get_cachable(key, ChunkIdEncoder) + return enc + + @property + def chunk_id_encoder_exists(self) -> bool: + return get_chunk_id_encoder_key(self.key) in self.cache @property def num_chunks(self) -> int: + if not self.chunk_id_encoder_exists: + return 0 return self.chunk_id_encoder.num_chunks @property def num_samples(self) -> int: + if not self.chunk_id_encoder_exists: + return 0 return self.chunk_id_encoder.num_samples - def get_last_chunk(self) -> Optional[Chunk]: + @property + def last_chunk(self) -> Optional[Chunk]: if self.num_chunks == 0: - self._last_chunk = None - else: - last_chunk_name = self.chunk_id_encoder.get_name_for_chunk(-1) - last_chunk_key = get_chunk_key(self.key, last_chunk_name) + return None - self._last_chunk = self.cache.get_cachable(last_chunk_key, Chunk) - self._last_chunk.key = last_chunk_key + return self.cache.get_cachable(self.last_chunk_key, Chunk) - return self._last_chunk + @property + def last_chunk_key(self) -> str: + last_chunk_name = self.chunk_id_encoder.get_name_for_chunk(-1) + last_chunk_key = get_chunk_key(self.key, last_chunk_name) + return last_chunk_key @property def tensor_meta(self): @@ -196,9 +203,9 @@ def _append_bytes(self, buffer: memoryview, shape: Tuple[int], dtype: np.dtype): self.chunk_id_encoder.register_samples_to_last_chunk_id(num_samples) - last_chunk = self._last_chunk - key = last_chunk.key # type: ignore - self.cache.update_used_cache_for_path(key, len(last_chunk)) # type: ignore + # TODO implement tests for cache size compute + if self.last_chunk is not None: + self.cache[self.last_chunk_key] = self.last_chunk def _try_appending_to_last_chunk( self, buffer: memoryview, shape: Tuple[int] @@ -214,7 +221,7 @@ def _try_appending_to_last_chunk( bool: True if `buffer` was successfully written to the last chunk, otherwise False. """ - last_chunk = self._last_chunk + last_chunk = self.last_chunk if last_chunk is None: return False @@ -259,9 +266,8 @@ def _create_new_chunk(self): chunk_id = self.chunk_id_encoder.generate_chunk_id() chunk = Chunk() chunk_name = ChunkIdEncoder.name_from_id(chunk_id) - chunk.key = get_chunk_key(self.key, chunk_name) - self.cache[chunk.key] = chunk - self._last_chunk = chunk + chunk_key = get_chunk_key(self.key, chunk_name) + self.cache[chunk_key] = chunk return chunk def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): @@ -303,11 +309,11 @@ def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): else: raise TypeError(f"Unsupported type for extending. 
Got: {type(samples)}") + self.cache.maybe_flush() + def append(self, sample: SampleValue): """Formats a single `sample` (compresseses/decompresses if applicable) and feeds it into `_append_bytes`.""" - self.get_last_chunk() - if isinstance(sample, Sample): # has to decompress to read the array's shape and dtype # might be able to optimize this away @@ -368,11 +374,12 @@ def read_sample_from_chunk( enc = self.chunk_id_encoder + buffer = chunk.memoryview_data local_sample_index = enc.get_local_sample_index(global_sample_index) shape = chunk.shapes_encoder[local_sample_index] sb, eb = chunk.byte_positions_encoder[local_sample_index] - buffer = chunk.memoryview_data[sb:eb] + buffer = buffer[sb:eb] if expect_compressed: sample = decompress_array(buffer, shape) else: diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index c6a16d609f..fd6b0ecc41 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -74,7 +74,9 @@ def __init__(self): def tobytes(self) -> memoryview: if self._encoded_ids is None: - return serialize_chunkids(hub.__version__, [np.array([], dtype=ENCODING_DTYPE)]) + return serialize_chunkids( + hub.__version__, [np.array([], dtype=ENCODING_DTYPE)] + ) return serialize_chunkids(hub.__version__, [self._encoded_ids]) @staticmethod @@ -119,7 +121,6 @@ def num_samples(self) -> int: return 0 return int(self._encoded_ids[-1, LAST_INDEX_INDEX]) + 1 - def generate_chunk_id(self) -> ENCODING_DTYPE: """Generates a random 64bit chunk ID using uuid4. Also prepares this ID to have samples registered to it. This method should be called once per chunk created. diff --git a/hub/core/serialize.py b/hub/core/serialize.py index 6ee63d6fc9..1aa7958366 100644 --- a/hub/core/serialize.py +++ b/hub/core/serialize.py @@ -96,13 +96,7 @@ def serialize_chunk( n = len(byts) flatbuff[offset : offset + n] = np.frombuffer(byts, dtype=np.byte) offset += n - ret = flatbuff.tobytes() - v, s, b, d = deserialize_chunk(ret) - assert v == version - np.testing.assert_array_equal(s, shape_info) - np.testing.assert_array_equal(b, byte_positions) - assert bytes(d) == bytes(data[0]) - return bytes(ret) + return memoryview(flatbuff.tobytes()) def deserialize_chunk( @@ -189,11 +183,7 @@ def serialize_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: flatbuff[offset : offset + arr.nbytes] = arr.view(np.byte).reshape(-1) offset += arr.nbytes - ret = memoryview(flatbuff.tobytes()) - v, ids2 = deserialize_chunkids(ret) - assert v == version - np.testing.assert_array_equal(ids[0].reshape(-1, 2), ids2) - return ret + return memoryview(flatbuff.tobytes()) def deserialize_chunkids(byts: Union[bytes, memoryview]) -> Tuple[str, np.ndarray]: From 58954381d1d5f6fc4cd5d755298f24e0a1cb46d5 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 13:38:51 +0530 Subject: [PATCH 73/79] rem assert --- hub/api/tests/test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 55fa85ddef..5bb9b75751 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -26,7 +26,7 @@ def test_persist_local(local_storage): ds.image.extend(np.ones((4, 224, 224, 3))) ds_new = Dataset(local_storage.root) - assert len(ds_new) == 4, (ds_new.image.chunk_engine.chunk_id_encoder._encoded_ids,) + assert len(ds_new) == 4 assert ds_new.image.shape == (4, 224, 224, 3) np.testing.assert_array_equal(ds_new.image.numpy(), np.ones((4, 224, 224, 3))) From f15d71c43aa7ea6a56e6b07ede7bdf93d5087906 Mon 
Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 13:56:15 +0530 Subject: [PATCH 74/79] test chunk sizes on memds only --- hub/api/tests/test_chunk_sizes.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/hub/api/tests/test_chunk_sizes.py b/hub/api/tests/test_chunk_sizes.py index da006bfdd5..2a767bc1f5 100644 --- a/hub/api/tests/test_chunk_sizes.py +++ b/hub/api/tests/test_chunk_sizes.py @@ -1,6 +1,5 @@ import numpy as np from hub.constants import KB -from hub.core.tests.common import parametrize_all_dataset_storages def _update_chunk_sizes(ds, max_chunk_size: int): @@ -46,8 +45,8 @@ def _extend_tensors(images, labels): labels.extend(np.ones(100, dtype=np.uint32)) -@parametrize_all_dataset_storages -def test_append(ds): +def test_append(memory_ds): + ds = memory_ds images, labels = _create_tensors(ds) _update_chunk_sizes(ds, 32 * KB) @@ -69,8 +68,8 @@ def test_append(ds): assert len(ds) == 300 -@parametrize_all_dataset_storages -def test_extend(ds): +def test_extend(memory_ds): + ds = memory_ds images, labels = _create_tensors(ds) _update_chunk_sizes(ds, 32 * KB) @@ -93,8 +92,8 @@ def test_extend(ds): assert len(ds) == 300 -@parametrize_all_dataset_storages -def test_extend_and_append(ds): +def test_extend_and_append(memory_ds): + ds = memory_ds images, labels = _create_tensors(ds) _update_chunk_sizes(ds, 32 * KB) From 0d7e9f257e9e75f99ba9f25708915f0a27c7e98c Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 12:37:42 +0400 Subject: [PATCH 75/79] Update hub/core/serialize.py Co-authored-by: dyllan --- hub/core/serialize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/core/serialize.py b/hub/core/serialize.py index 1aa7958366..43f0128266 100644 --- a/hub/core/serialize.py +++ b/hub/core/serialize.py @@ -39,7 +39,7 @@ def serialize_chunk( data: Union[Sequence[bytes], Sequence[memoryview]], len_data: Optional[int] = None, ) -> memoryview: - """Serializes a chunk + """Serializes a chunk's headers and data into a single byte stream. This is how the chunk will be written to the storage provider. Args: version: (str) Version of hub library. From 8310e36a2c903a90581729c4be354870b1094b2b Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 12:37:47 +0400 Subject: [PATCH 76/79] Update hub/core/serialize.py Co-authored-by: dyllan --- hub/core/serialize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/core/serialize.py b/hub/core/serialize.py index 43f0128266..b295663920 100644 --- a/hub/core/serialize.py +++ b/hub/core/serialize.py @@ -160,7 +160,7 @@ def deserialize_chunk( def serialize_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: - """Serializes chunk ids + """Serializes chunk ID encoders into a single byte stream. This is how the encoders will be written to the storage provider. Args: version: (str) Version of hub library. 
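Taken together, these docstring patches describe one recurring layout: a small ASCII version header followed by flat encoder rows. The model below is a simplified, self-contained sketch of that idea rather than a byte-exact copy of hub's wire format (the real functions use hub's `ENCODING_DTYPE` and, for chunks, additional shape headers); it also round-trips the empty-encoder case that patch 71 above guards against.

    import numpy as np

    ENC_DTYPE = np.uint32  # assumption: stand-in for hub's ENCODING_DTYPE

    def pack_ids(version: str, ids: np.ndarray) -> bytes:
        # [1-byte version length][ascii version][(chunk_id, last_index) rows]
        return bytes([len(version)]) + version.encode("ascii") + ids.astype(ENC_DTYPE).tobytes()

    def unpack_ids(buff: bytes):
        n = buff[0]
        version = buff[1 : 1 + n].decode("ascii")
        ids = np.frombuffer(buff[1 + n :], dtype=ENC_DTYPE).reshape(-1, 2)
        return version, ids

    v, ids = unpack_ids(pack_ids("2.0.0", np.array([[1234, 99]], dtype=ENC_DTYPE)))
    assert v == "2.0.0" and ids.tolist() == [[1234, 99]]
    v, empty = unpack_ids(pack_ids("2.0.0", np.empty((0, 2), dtype=ENC_DTYPE)))
    assert empty.size == 0  # an empty table still deserializes cleanly
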
From 3a8ccc840b7c8570e9ad5d7dffe3efdc00cafc00 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 12:37:53 +0400 Subject: [PATCH 77/79] Update hub/core/serialize.py Co-authored-by: dyllan --- hub/core/serialize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/core/serialize.py b/hub/core/serialize.py index b295663920..bd2f86f181 100644 --- a/hub/core/serialize.py +++ b/hub/core/serialize.py @@ -187,7 +187,7 @@ def serialize_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: def deserialize_chunkids(byts: Union[bytes, memoryview]) -> Tuple[str, np.ndarray]: - """Deserializes chunk ids + """Deserializes a chunk ID encoder from the serialized byte stream. This is how the encoder can be accessed/modified after it is read from storage. Args: byts: (bytes) Serialized chunk ids. From d9a846b89852190559d1025672dfc29e8de8925f Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 12:38:00 +0400 Subject: [PATCH 78/79] Update hub/core/serialize.py Co-authored-by: dyllan --- hub/core/serialize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/core/serialize.py b/hub/core/serialize.py index bd2f86f181..71c80a9a19 100644 --- a/hub/core/serialize.py +++ b/hub/core/serialize.py @@ -102,7 +102,7 @@ def serialize_chunk( def deserialize_chunk( byts: Union[bytes, memoryview] ) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: - """Deserializes a chunk + """Deserializes a chunk from the serialized byte stream. This is how the chunk can be accessed/modified after it is read from storage. Args: byts: (bytes) Serialized chunk. From 8c3b83b94a535da4c9b298c2c44d9c3f5985b1b5 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 14:29:30 +0530 Subject: [PATCH 79/79] rem assertions --- hub/core/serialize.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/hub/core/serialize.py b/hub/core/serialize.py index 1aa7958366..b2bd03c5a6 100644 --- a/hub/core/serialize.py +++ b/hub/core/serialize.py @@ -25,8 +25,6 @@ def infer_chunk_num_bytes( """ # NOTE: Assumption: version string contains ascii characters only (ord(c) < 128) # NOTE: Assumption: len(version) < 256 - assert len(version) < 256 - assert max((map(ord, version))) < 128 if len_data is None: len_data = sum(map(len, data)) # type: ignore return len(version) + shape_info.nbytes + byte_positions.nbytes + len_data + 13 @@ -62,11 +60,9 @@ def serialize_chunk( # Write shape info if shape_info.ndim == 1: - assert shape_info.nbytes == 0 flatbuff[offset : offset + 8] = np.zeros(8, dtype=np.byte) offset += 8 else: - assert shape_info.ndim == 2 flatbuff[offset : offset + 8] = np.array(shape_info.shape, dtype=np.int32).view( np.byte ) @@ -78,7 +74,6 @@ def serialize_chunk( # Write byte positions if byte_positions.ndim == 1: - assert byte_positions.nbytes == 0 flatbuff[offset : offset + 4] = np.zeros(4, dtype=np.byte) offset += 4 else:
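
A closing note on the encoder all this serialization work feeds: after the simplification at the top of this series, finding which chunk holds a sample is a single binary search over the encoder's last-index column. A minimal standalone sketch of that lookup follows (the table values are made up; the column constants mirror the `CHUNK_ID_INDEX` and `LAST_INDEX_INDEX` names used above):

    import numpy as np

    CHUNK_ID_INDEX, LAST_INDEX_INDEX = 0, 1

    # Each row: (chunk id, last global sample index stored in that chunk).
    encoded_ids = np.array([[111, 4], [222, 9], [333, 14]], dtype=np.uint32)

    def get_chunk_id(sample_index: int) -> int:
        # The first chunk whose last index is >= sample_index holds the sample.
        idx = np.searchsorted(encoded_ids[:, LAST_INDEX_INDEX], sample_index)
        return int(encoded_ids[idx, CHUNK_ID_INDEX])

    assert get_chunk_id(0) == 111 and get_chunk_id(4) == 111
    assert get_chunk_id(5) == 222 and get_chunk_id(14) == 333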