From dbd83e33ee013b9ae1b1cfe739ae3d5bc776c26a Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 7 Jul 2021 07:29:40 +0530 Subject: [PATCH 01/79] initial --- hub/core/chunk.py | 69 ++++++++------ hub/core/chunk_engine.py | 4 +- hub/core/lowlevel.py | 191 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 236 insertions(+), 28 deletions(-) create mode 100644 hub/core/lowlevel.py diff --git a/hub/core/chunk.py b/hub/core/chunk.py index 95c1f941ff..05cbc4b9a8 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -8,6 +8,8 @@ from hub.core.meta.encode.shape import ShapeEncoder from hub.core.meta.encode.byte_positions import BytePositionsEncoder +from hub.core.lowlevel import encode, decode, malloc, _write_pybytes + class Chunk(Cachable): def __init__( @@ -44,17 +46,46 @@ def __init__( self.shapes_encoder = ShapeEncoder(encoded_shapes) self.byte_positions_encoder = BytePositionsEncoder(encoded_byte_positions) - self._data: Union[memoryview, bytearray] = data or bytearray() + self._data: List[memoryview] = [] if data is None else [data] + + + @property def memoryview_data(self): - if isinstance(self._data, memoryview): - return self._data - return memoryview(self._data) + # deprecated + if len(self._data) == 1: + return self._data[0] + ptr = malloc(sum(map(len,self._data))) + for data in self._data: + ptr = _write_pybytes(ptr, data) + return memoryview(ptr.bytes) + + def _get_2d_idx(self, idx): + i = 0 + while len(self._data[i]) <= idx: + i += 1 + idx -= len(self._data[i]) + return i, idx + + def view(self, start, end): + if len(self._data) == 1: + return self._data[0][start: end] + start2d = self._get_2d_idx(start) + end2d = self._get_2d_idx(end) + byts = [] + byts.append(self._data[start2d[0]][start2d[1]:]) + for i in range(start2d[0] + 1, end2d[0]): + byts.append(self._data[i]) + byts.append(self._data[end2d[0]][:end2d[1]]) + ptr = malloc(end - start) + for byt in byts: + ptr = _write_pybytes(ptr, byt) + return memoryview(ptr.bytes) @property def num_data_bytes(self): - return len(self._data) + return sum(map(len, self._data)) def is_under_min_space(self, min_data_bytes_target: int) -> bool: """If this chunk's data is less than `min_data_bytes_target`, returns True.""" @@ -84,11 +115,11 @@ def append_sample(self, buffer: memoryview, max_data_bytes: int, shape: Tuple[in ) # `_data` will be a `memoryview` if `frombuffer` is called. 
- if isinstance(self._data, memoryview): - self._data = bytearray(self._data) + # if isinstance(self._data, memoryview): + # self._data = bytearray(self._data) # note: incoming_num_bytes can be 0 (empty sample) - self._data += buffer + self._data.append(buffer) self.update_headers(incoming_num_bytes, shape) def update_headers(self, incoming_num_bytes: int, sample_shape: Tuple[int]): @@ -116,24 +147,10 @@ def __len__(self): return shape_nbytes + range_nbytes + self.num_data_bytes + error_bytes def tobytes(self) -> memoryview: - out = BytesIO() - - # TODO: for fault tolerance, we should have a chunk store the ID for the next chunk - # TODO: in case the index chunk meta gets pwned (especially during a potentially failed transform job merge) + return encode(hub.__version__, self.shapes.encoder.array, self.byte_positions_encoder.array, self._data) - np.savez( - out, - version=hub.__encoded_version__, - shapes=self.shapes_encoder.array, - byte_positions=self.byte_positions_encoder.array, - data=np.frombuffer(self.memoryview_data, dtype=np.uint8), - ) - out.seek(0) - return out.getbuffer() @classmethod - def frombuffer(cls, buffer: bytes): - bio = BytesIO(buffer) - npz = np.load(bio) - data = memoryview(npz["data"].tobytes()) - return cls(npz["shapes"], npz["byte_positions"], data=data) + def frombuffer(cls, buffer: bytes) -> "Chunk": + version, shapes, byte_positions, data = decode(buffer) + return cls(shapes, byte_position, data=data) diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index 51eb5c4747..b695dc0a9d 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -365,12 +365,12 @@ def read_sample_from_chunk( enc = self.chunk_id_encoder - buffer = chunk.memoryview_data + # buffer = chunk.memoryview_data local_sample_index = enc.get_local_sample_index(global_sample_index) shape = chunk.shapes_encoder[local_sample_index] sb, eb = chunk.byte_positions_encoder[local_sample_index] - buffer = buffer[sb:eb] + buffer = chunk.view(sb, eb) if expect_compressed: sample = decompress_array(buffer, shape) else: diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py new file mode 100644 index 0000000000..94e361019d --- /dev/null +++ b/hub/core/lowlevel.py @@ -0,0 +1,191 @@ +import numpy as np +import ctypes +from collections import namedtuple +from typing import Tuple, List, Union, Optional +import hub + + +class Pointer(object): + __slots__ = ("address", "size", "_c_array") + + def __init__(self, address: Optional[int] = None, size: Optional[int] = None, c_array: Optional[ctypes.Array] = None) -> None: + if c_array is None: + if address is None or size is None: + raise ValueError("Expected c_array or address and size args.") + self.address = address + self.size = size + self._set_c_array() + else: + self._c_array = c_array + self.address = ctypes.addressof(c_array) + self.size = len(c_array) + + def _set_c_array(self) -> None: + self._c_array = (ctypes.c_byte * self.size).from_address(self.address) + + def __add__(self, i: int) -> "Pointer": + assert i >= 0 + assert i <= self.size + return Pointer(self.address + i, self.size - i) + + def __iadd__(self, i: int) -> "Pointer": + assert i >= 0 + assert i <= self.size + self.address += i + self.size -= i + self._set_c_array() + return self + + def __setitem__(self, idx: int, byte: int) -> None: + self._c_array[idx] = byte + + def __getitem__(self, idx: int) -> int: + return self._c_array[idx] + + @property + def memoryview(self): + return memoryview(self._c_array) + + @property + def bytes(self): + return bytes(self._c_array) 
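+
+    # Illustrative usage sketch: pointer arithmetic yields a new `Pointer`
+    # into the same memory, so the original object must stay alive or the
+    # underlying ctypes array may be garbage collected from under it.
+    #
+    #     p = Pointer(c_array=(ctypes.c_byte * 4)(*b"abcd"))
+    #     q = p + 1                    # one byte in; `p` is unchanged
+    #     assert q.bytes == b"bcd" and q[0] == ord("b")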
+ + def __len__(self): + return self.size + + +def malloc(size: int) -> Pointer: + return Pointer(c_array=(ctypes.c_byte * size)()) + + +def memcpy(dest: Pointer, src:Pointer, count=None) -> None: + if count is None: + count = src.size + ctypes.memmove(dest.address, src.address, count) + + +def _write_pybytes(ptr: Pointer, byts: bytes) -> Pointer: + ptr2 = Pointer(c_array=(ctypes.c_byte * len(buff))(*buff)) + memcpy(ptr, ptr2) + ptr += len(byts) + return ptr + + +def _ndarray_to_ptr(arr: np.ndarray) -> Pointer: + return Pointer(arr.__array_interface__['data'][0], arr.itemsize * arr.size) + + +def encode(version: str, shape_info: np.ndarray, byte_positions: np.ndarray, data: List[bytes]) -> memoryview: + # NOTE: Assumption: version string contains ascii characters only (ord(c) < 128) + # NOTE: Assumption: len(version) < 256 + assert len(version) < 256 + assert max((map(ord, version))) < 128 + version_slice_size = 1 + len(version) + shape_info_data_size = shape_info.itemsize * shape_info.size + shape_info_slice_size = 4 + 4 + shape_info_data_size + byte_positions_data_size = byte_positions.itemsize * byte_positions.size + byte_positions_slice_size = 4 + 4 + byte_positions_data_size + data_slice_size = sum(map(len, data)) + flatbuff = malloc(version_slice_size + shape_info_slice_size + byte_positions_slice_size + data_slice_size) + ptr = flatbuff + 0 + + # write version + ptr[0] = len(version) + ptr += 1 + for c in version: + ptr[0] = ord(c) + ptr += 1 + + # write shape info + ptr = _write_pybytes(ptr, np.int32(shape_info.shape[0]).tobytes()) + ptr = _write_pybytes(ptr, np.int32(shape_info.shape[1]).tobytes()) + memcpy(ptr, _ndarray_to_ptr(shape_info)) + ptr += shape_info_data_size + + # write byte positions + ptr = _write_pybytes(ptr, np.int32(byte_positions.shape[0]).tobytes()) + ptr = _write_pybytes(ptr, np.int32(byte_positions.shape[1]).tobytes()) + memcpy(ptr, _ndarray_to_ptr(byte_positions)) + ptr += byte_positions_data_size + + # write actual data + for d in data: + ptr = _write_pybytes(ptr, d) + + assert ptr.size == 0 + + return flatbuff.memoryview + + +def decode(buff: Union[bytes, Pointer]) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: + if isinstance(buff, bytes): + buff = Pointer(c_array=(ctypes.c_byte * len(buff))(*buff)) + copy = True + else: + copy = False + ptr = buff + 0 + + # read version + len_version = ptr[0] + version = '' + ptr += 1 + for i in range(len_version): + version += chr(ptr[i]) + ptr += len_version + + # read shape info + shape_info_dtype = np.dtype(hub.core.meta.encode.shape.SHAPE_ENCODING_DTYPE) + shape_info_shape = np.frombuffer(ptr.memoryview[:8], dtype=np.int32) + ptr += 8 + shape_info_data_size = int(np.prod(shape_info_shape) * shape_info_dtype.itemsize) + shape_info = np.frombuffer(ptr.memoryview[:shape_info_data_size], dtype=shape_info_dtype).reshape(shape_info_shape) + if copy: + shape_info = shape_info.copy() + ptr += shape_info_data_size + + # read byte positions + byte_positions_dtype = np.dtype(hub.core.meta.encode.byte_positions.POSITION_ENCODING_DTYPE) + byte_positions_shape = np.frombuffer(ptr.memoryview[:8], dtype=np.int32) + ptr += 8 + byte_positions_data_size = int(np.prod(byte_positions_shape) * byte_positions_dtype.itemsize) + byte_positions = np.frombuffer(ptr.memoryview[:byte_positions_data_size], dtype=byte_positions_dtype).reshape(byte_positions_shape) + if copy: + byte_positions = byte_positions.copy() + ptr += byte_positions_data_size + if copy: + data = memoryview(ptr.bytes) + else: + data = ptr.memoryview + return version, 
shape_info, byte_positions, ptr.memoryview + + +def test(): + version = hub.__version__ + shape_info = np.cast[hub.core.meta.encode.shape.SHAPE_ENCODING_DTYPE](np.random.randint(100 ,size=(17, 63))) + byte_positions = np.cast[hub.core.meta.encode.byte_positions.POSITION_ENCODING_DTYPE](np.random.randint(100 ,size=(31, 79))) + data = [ + b'1234' * 7, + b'abcdefg' * 8, + b'qwertyuiop' * 9 + ] + encoded = bytes(encode(version, shape_info, byte_positions, data)) + + # from bytes + decoded = decode(encoded) + version2, shape_info2, byte_positions2, data2 = decoded + assert version2 == version + np.testing.assert_array_equal(shape_info, shape_info2) + np.testing.assert_array_equal(byte_positions, byte_positions2) + assert b''.join(data) == bytes(data2) + + # from pointer + buff = Pointer(c_array=(ctypes.c_byte * len(encoded))(*encoded)) + decoded = decode(buff) + version2, shape_info2, byte_positions2, data2 = decoded + assert version2 == version + np.testing.assert_array_equal(shape_info, shape_info2) + np.testing.assert_array_equal(byte_positions, byte_positions2) + assert b''.join(data) == bytes(data2) + +if __name__ == "__main__": + test() \ No newline at end of file From 557df265e006aaa96e56b947d8fb942addcf3cfc Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 7 Jul 2021 07:32:02 +0530 Subject: [PATCH 02/79] format --- hub/core/chunk.py | 19 +++++++------ hub/core/lowlevel.py | 65 ++++++++++++++++++++++++++++++-------------- 2 files changed, 54 insertions(+), 30 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index 05cbc4b9a8..defd6fbcd3 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -48,15 +48,12 @@ def __init__( self._data: List[memoryview] = [] if data is None else [data] - - - @property def memoryview_data(self): # deprecated if len(self._data) == 1: return self._data[0] - ptr = malloc(sum(map(len,self._data))) + ptr = malloc(sum(map(len, self._data))) for data in self._data: ptr = _write_pybytes(ptr, data) return memoryview(ptr.bytes) @@ -70,14 +67,14 @@ def _get_2d_idx(self, idx): def view(self, start, end): if len(self._data) == 1: - return self._data[0][start: end] + return self._data[0][start:end] start2d = self._get_2d_idx(start) end2d = self._get_2d_idx(end) byts = [] - byts.append(self._data[start2d[0]][start2d[1]:]) + byts.append(self._data[start2d[0]][start2d[1] :]) for i in range(start2d[0] + 1, end2d[0]): byts.append(self._data[i]) - byts.append(self._data[end2d[0]][:end2d[1]]) + byts.append(self._data[end2d[0]][: end2d[1]]) ptr = malloc(end - start) for byt in byts: ptr = _write_pybytes(ptr, byt) @@ -147,8 +144,12 @@ def __len__(self): return shape_nbytes + range_nbytes + self.num_data_bytes + error_bytes def tobytes(self) -> memoryview: - return encode(hub.__version__, self.shapes.encoder.array, self.byte_positions_encoder.array, self._data) - + return encode( + hub.__version__, + self.shapes.encoder.array, + self.byte_positions_encoder.array, + self._data, + ) @classmethod def frombuffer(cls, buffer: bytes) -> "Chunk": diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 94e361019d..79cb4798bf 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -8,7 +8,12 @@ class Pointer(object): __slots__ = ("address", "size", "_c_array") - def __init__(self, address: Optional[int] = None, size: Optional[int] = None, c_array: Optional[ctypes.Array] = None) -> None: + def __init__( + self, + address: Optional[int] = None, + size: Optional[int] = None, + c_array: Optional[ctypes.Array] = None, + ) -> None: if c_array is None: if 
address is None or size is None: raise ValueError("Expected c_array or address and size args.") @@ -58,7 +63,7 @@ def malloc(size: int) -> Pointer: return Pointer(c_array=(ctypes.c_byte * size)()) -def memcpy(dest: Pointer, src:Pointer, count=None) -> None: +def memcpy(dest: Pointer, src: Pointer, count=None) -> None: if count is None: count = src.size ctypes.memmove(dest.address, src.address, count) @@ -72,10 +77,12 @@ def _write_pybytes(ptr: Pointer, byts: bytes) -> Pointer: def _ndarray_to_ptr(arr: np.ndarray) -> Pointer: - return Pointer(arr.__array_interface__['data'][0], arr.itemsize * arr.size) + return Pointer(arr.__array_interface__["data"][0], arr.itemsize * arr.size) -def encode(version: str, shape_info: np.ndarray, byte_positions: np.ndarray, data: List[bytes]) -> memoryview: +def encode( + version: str, shape_info: np.ndarray, byte_positions: np.ndarray, data: List[bytes] +) -> memoryview: # NOTE: Assumption: version string contains ascii characters only (ord(c) < 128) # NOTE: Assumption: len(version) < 256 assert len(version) < 256 @@ -86,7 +93,12 @@ def encode(version: str, shape_info: np.ndarray, byte_positions: np.ndarray, dat byte_positions_data_size = byte_positions.itemsize * byte_positions.size byte_positions_slice_size = 4 + 4 + byte_positions_data_size data_slice_size = sum(map(len, data)) - flatbuff = malloc(version_slice_size + shape_info_slice_size + byte_positions_slice_size + data_slice_size) + flatbuff = malloc( + version_slice_size + + shape_info_slice_size + + byte_positions_slice_size + + data_slice_size + ) ptr = flatbuff + 0 # write version @@ -117,7 +129,9 @@ def encode(version: str, shape_info: np.ndarray, byte_positions: np.ndarray, dat return flatbuff.memoryview -def decode(buff: Union[bytes, Pointer]) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: +def decode( + buff: Union[bytes, Pointer] +) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: if isinstance(buff, bytes): buff = Pointer(c_array=(ctypes.c_byte * len(buff))(*buff)) copy = True @@ -127,7 +141,7 @@ def decode(buff: Union[bytes, Pointer]) -> Tuple[str, np.ndarray, np.ndarray, me # read version len_version = ptr[0] - version = '' + version = "" ptr += 1 for i in range(len_version): version += chr(ptr[i]) @@ -138,17 +152,25 @@ def decode(buff: Union[bytes, Pointer]) -> Tuple[str, np.ndarray, np.ndarray, me shape_info_shape = np.frombuffer(ptr.memoryview[:8], dtype=np.int32) ptr += 8 shape_info_data_size = int(np.prod(shape_info_shape) * shape_info_dtype.itemsize) - shape_info = np.frombuffer(ptr.memoryview[:shape_info_data_size], dtype=shape_info_dtype).reshape(shape_info_shape) + shape_info = np.frombuffer( + ptr.memoryview[:shape_info_data_size], dtype=shape_info_dtype + ).reshape(shape_info_shape) if copy: shape_info = shape_info.copy() ptr += shape_info_data_size # read byte positions - byte_positions_dtype = np.dtype(hub.core.meta.encode.byte_positions.POSITION_ENCODING_DTYPE) + byte_positions_dtype = np.dtype( + hub.core.meta.encode.byte_positions.POSITION_ENCODING_DTYPE + ) byte_positions_shape = np.frombuffer(ptr.memoryview[:8], dtype=np.int32) ptr += 8 - byte_positions_data_size = int(np.prod(byte_positions_shape) * byte_positions_dtype.itemsize) - byte_positions = np.frombuffer(ptr.memoryview[:byte_positions_data_size], dtype=byte_positions_dtype).reshape(byte_positions_shape) + byte_positions_data_size = int( + np.prod(byte_positions_shape) * byte_positions_dtype.itemsize + ) + byte_positions = np.frombuffer( + ptr.memoryview[:byte_positions_data_size], 
dtype=byte_positions_dtype + ).reshape(byte_positions_shape) if copy: byte_positions = byte_positions.copy() ptr += byte_positions_data_size @@ -161,13 +183,13 @@ def decode(buff: Union[bytes, Pointer]) -> Tuple[str, np.ndarray, np.ndarray, me def test(): version = hub.__version__ - shape_info = np.cast[hub.core.meta.encode.shape.SHAPE_ENCODING_DTYPE](np.random.randint(100 ,size=(17, 63))) - byte_positions = np.cast[hub.core.meta.encode.byte_positions.POSITION_ENCODING_DTYPE](np.random.randint(100 ,size=(31, 79))) - data = [ - b'1234' * 7, - b'abcdefg' * 8, - b'qwertyuiop' * 9 - ] + shape_info = np.cast[hub.core.meta.encode.shape.SHAPE_ENCODING_DTYPE]( + np.random.randint(100, size=(17, 63)) + ) + byte_positions = np.cast[ + hub.core.meta.encode.byte_positions.POSITION_ENCODING_DTYPE + ](np.random.randint(100, size=(31, 79))) + data = [b"1234" * 7, b"abcdefg" * 8, b"qwertyuiop" * 9] encoded = bytes(encode(version, shape_info, byte_positions, data)) # from bytes @@ -176,7 +198,7 @@ def test(): assert version2 == version np.testing.assert_array_equal(shape_info, shape_info2) np.testing.assert_array_equal(byte_positions, byte_positions2) - assert b''.join(data) == bytes(data2) + assert b"".join(data) == bytes(data2) # from pointer buff = Pointer(c_array=(ctypes.c_byte * len(encoded))(*encoded)) @@ -185,7 +207,8 @@ def test(): assert version2 == version np.testing.assert_array_equal(shape_info, shape_info2) np.testing.assert_array_equal(byte_positions, byte_positions2) - assert b''.join(data) == bytes(data2) + assert b"".join(data) == bytes(data2) + if __name__ == "__main__": - test() \ No newline at end of file + test() From 95ce176bc546e9442b9bda6df6531450c0d6ed4f Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 7 Jul 2021 07:44:00 +0530 Subject: [PATCH 03/79] typo --- hub/core/chunk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index defd6fbcd3..b5d962b950 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -146,7 +146,7 @@ def __len__(self): def tobytes(self) -> memoryview: return encode( hub.__version__, - self.shapes.encoder.array, + self.shapes_encoder.array, self.byte_positions_encoder.array, self._data, ) From 950ac7c6df6af56518f7216d7fad2b5dd8537a23 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 7 Jul 2021 07:46:43 +0530 Subject: [PATCH 04/79] typo --- hub/core/lowlevel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 79cb4798bf..0a7c1ba671 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -70,7 +70,7 @@ def memcpy(dest: Pointer, src: Pointer, count=None) -> None: def _write_pybytes(ptr: Pointer, byts: bytes) -> Pointer: - ptr2 = Pointer(c_array=(ctypes.c_byte * len(buff))(*buff)) + ptr2 = Pointer(c_array=(ctypes.c_byte * len(byts))(*byts)) memcpy(ptr, ptr2) ptr += len(byts) return ptr From b55fc388b88f2c281fda4c023611b59431e55ec9 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 7 Jul 2021 08:39:56 +0530 Subject: [PATCH 05/79] bug fix --- hub/core/chunk.py | 5 +++-- hub/core/lowlevel.py | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index b5d962b950..6071d75038 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -75,10 +75,11 @@ def view(self, start, end): for i in range(start2d[0] + 1, end2d[0]): byts.append(self._data[i]) byts.append(self._data[end2d[0]][: end2d[1]]) - ptr = malloc(end - start) + buff = malloc(end - start) + ptr = buff + 0 for byt 
in byts: ptr = _write_pybytes(ptr, byt) - return memoryview(ptr.bytes) + return memoryview(buff.bytes) @property def num_data_bytes(self): diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 0a7c1ba671..2600431190 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -87,6 +87,8 @@ def encode( # NOTE: Assumption: len(version) < 256 assert len(version) < 256 assert max((map(ord, version))) < 128 + assert shape_info.ndim == 2 + assert byte_positions.ndim == 2 version_slice_size = 1 + len(version) shape_info_data_size = shape_info.itemsize * shape_info.size shape_info_slice_size = 4 + 4 + shape_info_data_size From b0f7d88fac6773204b96782a768f0d2c3d86d179 Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Tue, 6 Jul 2021 20:11:32 -0700 Subject: [PATCH 06/79] some docs and fix 1D shapes --- hub/core/chunk.py | 52 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index b5d962b950..478ed67c0a 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -1,7 +1,7 @@ from hub.util.exceptions import FullChunkError import hub from hub.core.storage.cachable import Cachable -from typing import Sequence, Tuple, Union +from typing import List, Sequence, Tuple, Union import numpy as np from io import BytesIO @@ -58,28 +58,50 @@ def memoryview_data(self): ptr = _write_pybytes(ptr, data) return memoryview(ptr.bytes) - def _get_2d_idx(self, idx): + def _get_2d_idx(self, byte_index: int) -> Tuple[int, int]: + """Converts `byte_index`, which is an index for a flattened stream of bytes, into a 2D index that can + be used for a list of byte streams of varying lengths. Used for accessing `self._data`, which is a list + of `memoryview`s. + + Args: + byte_index (int): Index over a flattened stream of bytes. + + Returns: + Tuple[int, int]: 2D index to be used to access `self._data`. 
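+
+        Example (illustrative):
+            If `self._data` holds views of lengths `[4, 6]`, flattened
+            byte index 7 falls in the second view at offset 3, so this
+            returns `(1, 3)`.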
+ """ + i = 0 - while len(self._data[i]) <= idx: + while len(self._data[i]) <= byte_index: i += 1 - idx -= len(self._data[i]) - return i, idx + byte_index -= len(self._data[i]) + return i, byte_index - def view(self, start, end): + def view(self, start_byte: int, end_byte: int): if len(self._data) == 1: - return self._data[0][start:end] - start2d = self._get_2d_idx(start) - end2d = self._get_2d_idx(end) + return self._data[0][start_byte:end_byte] + + start2d = self._get_2d_idx(start_byte) + end2d = self._get_2d_idx(end_byte) + + # TODO: document this + # builds a list of memoryviews that contain the pieces we need for the output view byts = [] byts.append(self._data[start2d[0]][start2d[1] :]) for i in range(start2d[0] + 1, end2d[0]): byts.append(self._data[i]) byts.append(self._data[end2d[0]][: end2d[1]]) - ptr = malloc(end - start) + + ptr = malloc(end_byte - start_byte) + for byt in byts: ptr = _write_pybytes(ptr, byt) + return memoryview(ptr.bytes) + @property + def num_samples(self): + return self.shapes_encoder.num_samples + @property def num_data_bytes(self): return sum(map(len, self._data)) @@ -139,11 +161,14 @@ def __len__(self): shape_nbytes = self.shapes_encoder.nbytes range_nbytes = self.byte_positions_encoder.nbytes - error_bytes = 32 # to account for any extra delimeters/stuff that `np.savez` may create in excess + error_bytes = 32 # TODO: calculate these bytes actually return shape_nbytes + range_nbytes + self.num_data_bytes + error_bytes def tobytes(self) -> memoryview: + if self.num_samples == 0: + return memoryview(bytes()) + return encode( hub.__version__, self.shapes_encoder.array, @@ -153,5 +178,8 @@ def tobytes(self) -> memoryview: @classmethod def frombuffer(cls, buffer: bytes) -> "Chunk": + if len(buffer) == 0: + return cls() + version, shapes, byte_positions, data = decode(buffer) - return cls(shapes, byte_position, data=data) + return cls(shapes, byte_positions, data=data) From 22816920e0c04a8904342de8b3f4f9e91eca0f0c Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 7 Jul 2021 09:24:32 +0530 Subject: [PATCH 07/79] add assertion for easy debugging --- hub/core/chunk.py | 5 ++--- hub/core/lowlevel.py | 16 ++++++---------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index fb7838719d..8aa29a21ec 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -69,11 +69,11 @@ def _get_2d_idx(self, byte_index: int) -> Tuple[int, int]: Returns: Tuple[int, int]: 2D index to be used to access `self._data`. 
""" - + assert byte_index < sum(map(len, self._data)) i = 0 while len(self._data[i]) <= byte_index: - i += 1 byte_index -= len(self._data[i]) + i += 1 return i, byte_index def view(self, start_byte: int, end_byte: int): @@ -178,6 +178,5 @@ def tobytes(self) -> memoryview: def frombuffer(cls, buffer: bytes) -> "Chunk": if len(buffer) == 0: return cls() - version, shapes, byte_positions, data = decode(buffer) return cls(shapes, byte_positions, data=data) diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 2600431190..85a341e3ab 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -132,9 +132,9 @@ def encode( def decode( - buff: Union[bytes, Pointer] + buff: Union[bytes, Pointer, memoryview] ) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: - if isinstance(buff, bytes): + if not isinstance(buff, Pointer): buff = Pointer(c_array=(ctypes.c_byte * len(buff))(*buff)) copy = True else: @@ -150,7 +150,7 @@ def decode( ptr += len_version # read shape info - shape_info_dtype = np.dtype(hub.core.meta.encode.shape.SHAPE_ENCODING_DTYPE) + shape_info_dtype = np.dtype(hub.constants.ENCODING_DTYPE) shape_info_shape = np.frombuffer(ptr.memoryview[:8], dtype=np.int32) ptr += 8 shape_info_data_size = int(np.prod(shape_info_shape) * shape_info_dtype.itemsize) @@ -162,9 +162,7 @@ def decode( ptr += shape_info_data_size # read byte positions - byte_positions_dtype = np.dtype( - hub.core.meta.encode.byte_positions.POSITION_ENCODING_DTYPE - ) + byte_positions_dtype = np.dtype(hub.constants.ENCODING_DTYPE) byte_positions_shape = np.frombuffer(ptr.memoryview[:8], dtype=np.int32) ptr += 8 byte_positions_data_size = int( @@ -185,12 +183,10 @@ def decode( def test(): version = hub.__version__ - shape_info = np.cast[hub.core.meta.encode.shape.SHAPE_ENCODING_DTYPE]( + shape_info = np.cast[hub.constants.ENCODING_DTYPE]( np.random.randint(100, size=(17, 63)) ) - byte_positions = np.cast[ - hub.core.meta.encode.byte_positions.POSITION_ENCODING_DTYPE - ](np.random.randint(100, size=(31, 79))) + byte_positions = np.cast[hub.constants.ENCODING_DTYPE](np.random.randint(100, size=(31, 79))) data = [b"1234" * 7, b"abcdefg" * 8, b"qwertyuiop" * 9] encoded = bytes(encode(version, shape_info, byte_positions, data)) From 26cd3772ed83418b74fec96a8d02c27b3f8f4e21 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 7 Jul 2021 09:28:25 +0530 Subject: [PATCH 08/79] one off --- hub/core/chunk.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index 8aa29a21ec..b84fbd9221 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -81,7 +81,7 @@ def view(self, start_byte: int, end_byte: int): return self._data[0][start_byte:end_byte] start2d = self._get_2d_idx(start_byte) - end2d = self._get_2d_idx(end_byte) + end2d = self._get_2d_idx(end_byte - 1) # TODO: document this # builds a list of memoryviews that contain the pieces we need for the output view @@ -89,7 +89,7 @@ def view(self, start_byte: int, end_byte: int): byts.append(self._data[start2d[0]][start2d[1] :]) for i in range(start2d[0] + 1, end2d[0]): byts.append(self._data[i]) - byts.append(self._data[end2d[0]][: end2d[1]]) + byts.append(self._data[end2d[0]][: end2d[1] + 1]) buff = malloc(end_byte - start_byte) ptr = buff + 0 for byt in byts: From fadc940d8e51ae1f6f902e3a4424714edc9fdfd7 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 7 Jul 2021 09:49:25 +0530 Subject: [PATCH 09/79] segfault fix --- hub/core/chunk.py | 4 ++-- hub/core/lowlevel.py | 4 +++- 2 files changed, 5 insertions(+), 3 
deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index b84fbd9221..4a4dcdf28f 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -90,10 +90,10 @@ def view(self, start_byte: int, end_byte: int): for i in range(start2d[0] + 1, end2d[0]): byts.append(self._data[i]) byts.append(self._data[end2d[0]][: end2d[1] + 1]) - buff = malloc(end_byte - start_byte) + buff = malloc(sum(map(len, byts))) ptr = buff + 0 for byt in byts: - ptr = _write_pybytes(ptr, byt) + ptr = _write_pybytes(ptr, byt.cast("B")) return memoryview(buff.bytes) @property diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 85a341e3ab..145f1eeea0 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -135,7 +135,9 @@ def decode( buff: Union[bytes, Pointer, memoryview] ) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: if not isinstance(buff, Pointer): - buff = Pointer(c_array=(ctypes.c_byte * len(buff))(*buff)) + ptr = Pointer(c_array=(ctypes.c_byte * len(buff))()) + _write_pybytes(ptr, buff) + buff = ptr copy = True else: copy = False From 035bacc03f4d22801ff1ab446a65945bbbf8e5f5 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 7 Jul 2021 10:09:41 +0530 Subject: [PATCH 10/79] smol fixes --- hub/core/lowlevel.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 145f1eeea0..6cf8ecabd1 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -55,6 +55,10 @@ def memoryview(self): def bytes(self): return bytes(self._c_array) + @property + def bytearray(self): + return bytearray(self._c_array) + def __len__(self): return self.size @@ -128,7 +132,7 @@ def encode( assert ptr.size == 0 - return flatbuff.memoryview + return flatbuff.bytes def decode( @@ -180,7 +184,7 @@ def decode( data = memoryview(ptr.bytes) else: data = ptr.memoryview - return version, shape_info, byte_positions, ptr.memoryview + return version, shape_info, byte_positions, data def test(): From 1eddf740b6ee758d4bfbfc3d24b71bbb369f3aff Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Tue, 6 Jul 2021 21:44:59 -0700 Subject: [PATCH 11/79] add clear cache to memory test in api and fix return in `decode` --- hub/api/tests/test_api.py | 1 + hub/core/lowlevel.py | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 2210352dfc..212094d244 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -246,6 +246,7 @@ def test_sequence_samples(ds: Dataset): tensor.append([1, 2, 3]) tensor.extend([[4, 5, 6]]) + ds.clear_cache() assert len(tensor) == 2 diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 145f1eeea0..00894261fe 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -180,7 +180,7 @@ def decode( data = memoryview(ptr.bytes) else: data = ptr.memoryview - return version, shape_info, byte_positions, ptr.memoryview + return version, shape_info, byte_positions, data def test(): @@ -188,7 +188,9 @@ def test(): shape_info = np.cast[hub.constants.ENCODING_DTYPE]( np.random.randint(100, size=(17, 63)) ) - byte_positions = np.cast[hub.constants.ENCODING_DTYPE](np.random.randint(100, size=(31, 79))) + byte_positions = np.cast[hub.constants.ENCODING_DTYPE]( + np.random.randint(100, size=(31, 79)) + ) data = [b"1234" * 7, b"abcdefg" * 8, b"qwertyuiop" * 9] encoded = bytes(encode(version, shape_info, byte_positions, data)) From 3a409e17e058ee5fc7a45a780644175f14b2ee65 Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Tue, 6 Jul 2021 22:06:29 -0700 
Subject: [PATCH 12/79] add a better exception for pointer GC --- hub/core/lowlevel.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 00894261fe..cee97a442f 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -70,7 +70,14 @@ def memcpy(dest: Pointer, src: Pointer, count=None) -> None: def _write_pybytes(ptr: Pointer, byts: bytes) -> Pointer: - ptr2 = Pointer(c_array=(ctypes.c_byte * len(byts))(*byts)) + try: + ptr2 = Pointer(c_array=(ctypes.c_byte * len(byts))(*byts)) + except NotImplementedError: + # TODO: exceptions.py + raise Exception( + "Reference for pointer was garbage collected. Maybe because the cache killed it?" + ) + memcpy(ptr, ptr2) ptr += len(byts) return ptr From ef112524a3d14c421908175c540c476969fc1af7 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 8 Jul 2021 12:39:25 +0530 Subject: [PATCH 13/79] smol fix --- hub/core/lowlevel.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index e2c2600187..10ffe62a23 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -76,8 +76,7 @@ def memcpy(dest: Pointer, src: Pointer, count=None) -> None: def _write_pybytes(ptr: Pointer, byts: bytes) -> Pointer: ptr2 = Pointer(c_array=(ctypes.c_byte * len(byts))(*byts)) memcpy(ptr, ptr2) - ptr += len(byts) - return ptr + return ptr + len(byts) def _ndarray_to_ptr(arr: np.ndarray) -> Pointer: @@ -117,13 +116,13 @@ def encode( ptr = _write_pybytes(ptr, np.int32(shape_info.shape[0]).tobytes()) ptr = _write_pybytes(ptr, np.int32(shape_info.shape[1]).tobytes()) memcpy(ptr, _ndarray_to_ptr(shape_info)) - ptr += shape_info_data_size + ptr += shape_info.nbytes # write byte positions ptr = _write_pybytes(ptr, np.int32(byte_positions.shape[0]).tobytes()) ptr = _write_pybytes(ptr, np.int32(byte_positions.shape[1]).tobytes()) memcpy(ptr, _ndarray_to_ptr(byte_positions)) - ptr += byte_positions_data_size + ptr += byte_positions.nbytes # write actual data for d in data: From 57d2da7c0cd45c1be3b2dff7641a503861d501e0 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 8 Jul 2021 13:27:48 +0530 Subject: [PATCH 14/79] all fix --- hub/core/chunk.py | 35 ++++++++++++++++++++++++----------- hub/core/lowlevel.py | 31 +++++++++++++++++++++---------- 2 files changed, 45 insertions(+), 21 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index cce8581e30..67d352dc61 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -69,32 +69,45 @@ def _get_2d_idx(self, byte_index: int) -> Tuple[int, int]: Returns: Tuple[int, int]: 2D index to be used to access `self._data`. 
""" - assert byte_index < sum(map(len, self._data)) i = 0 - while len(self._data[i]) <= byte_index: - byte_index -= len(self._data[i]) - i += 1 + data = self._data + while True: + try: + num_data_i = len(data[i]) + except IndexError: # slightly faster than checking i < len(self._data) in a loop + return i - 1, len(data[i - 1]) + byte_index + if num_data_i <= byte_index: + byte_index -= num_data_i + i += 1 + else: + break return i, byte_index def view(self, start_byte: int, end_byte: int): if len(self._data) == 1: return self._data[0][start_byte:end_byte] - start2d = self._get_2d_idx(start_byte) - end2d = self._get_2d_idx(end_byte - 1) + start2dx, start2dy = self._get_2d_idx(start_byte) + end2dx, end2dy = self._get_2d_idx(end_byte) + if start2dx == end2dx: + # Indexing to the same inner chunk, this would be fast + buff = malloc(end2dy - start2dy) + _write_pybytes(buff, self._data[start2dx][start2dy:end2dy]) + return buff.memoryview # TODO: document this # builds a list of memoryviews that contain the pieces we need for the output view + byts = [] - byts.append(self._data[start2d[0]][start2d[1] :]) - for i in range(start2d[0] + 1, end2d[0]): + byts.append(self._data[start2dx][start2dy:]) + for i in range(start2dx + 1, end2dx): byts.append(self._data[i]) - byts.append(self._data[end2d[0]][: end2d[1] + 1]) + byts.append(self._data[end2dx][:end2dy]) buff = malloc(sum(map(len, byts))) ptr = buff + 0 for byt in byts: ptr = _write_pybytes(ptr, byt.cast("B")) - return memoryview(buff.bytes) + return buff.memoryview @property def num_samples(self): @@ -160,7 +173,7 @@ def __len__(self): hub.__version__, self.shapes_encoder.array, self.byte_positions_encoder.array, - self._data + self._data, ) def tobytes(self) -> memoryview: diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 3a0da9533b..1ea8505f4e 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -1,7 +1,7 @@ import numpy as np import ctypes from collections import namedtuple -from typing import Tuple, List, Union, Optional +from typing import Tuple, Sequence, Union, Optional import hub @@ -89,7 +89,13 @@ def _write_pybytes(ptr: Pointer, byts: bytes) -> Pointer: def _ndarray_to_ptr(arr: np.ndarray) -> Pointer: return Pointer(arr.__array_interface__["data"][0], arr.itemsize * arr.size) -def _infer_num_bytes(version: str, shape_info: np.ndarray, byte_positions: np.ndarray, data: List[bytes]): + +def _infer_num_bytes( + version: str, + shape_info: np.ndarray, + byte_positions: np.ndarray, + data: Union[Sequence[bytes], Sequence[memoryview]], +): # NOTE: Assumption: version string contains ascii characters only (ord(c) < 128) # NOTE: Assumption: len(version) < 256 assert len(version) < 256 @@ -100,16 +106,23 @@ def _infer_num_bytes(version: str, shape_info: np.ndarray, byte_positions: np.nd # shape_info_slice_size = 4 + 4 + shape_info.nbytes # byte_positions_slice_size = 4 + 4 + byte_positions.nbytes # data_slice_size = sum(map(len, data)) - return len(version) + shape_info.nbytes + byte_positions.nbytes + sum(map(len, data)) + 17 + return ( + len(version) + + shape_info.nbytes + + byte_positions.nbytes + + sum(map(len, data)) + + 17 + ) + def encode( - version: str, shape_info: np.ndarray, byte_positions: np.ndarray, data: List[bytes] + version: str, + shape_info: np.ndarray, + byte_positions: np.ndarray, + data: Union[Sequence[bytes], Sequence[memoryview]], ) -> memoryview: - - flatbuff = malloc( - _infer_num_bytes(version, shape_info, byte_positions, data) - ) + flatbuff = malloc(_infer_num_bytes(version, shape_info, 
byte_positions, data)) ptr = flatbuff + 0 # write version @@ -135,8 +148,6 @@ def encode( for d in data: ptr = _write_pybytes(ptr, d) - assert ptr.size == 0 - return flatbuff.bytes From 4dde08a711709d57595f552522aa1af97ed46b48 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 8 Jul 2021 16:07:05 +0530 Subject: [PATCH 15/79] chuunk id optims init --- hub/core/chunk.py | 8 +- hub/core/lowlevel.py | 55 ++++++++++++-- hub/core/meta/encode/chunk_id.py | 121 +++++++++++++++++++------------ 3 files changed, 126 insertions(+), 58 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index 67d352dc61..d5c008b928 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -8,7 +8,7 @@ from hub.core.meta.encode.shape import ShapeEncoder from hub.core.meta.encode.byte_positions import BytePositionsEncoder -from hub.core.lowlevel import encode, decode, malloc, _write_pybytes, _infer_num_bytes +from hub.core.lowlevel import encode_chunk, decode_chunk, malloc, _write_pybytes, _infer_chunk_num_bytes class Chunk(Cachable): @@ -169,7 +169,7 @@ def update_headers(self, incoming_num_bytes: int, sample_shape: Tuple[int]): def __len__(self): """Calculates the number of bytes `tobytes` will be without having to call `tobytes`. Used by `LRUCache` to determine if this chunk can be cached.""" - return _infer_num_bytes( + return _infer_chunk_num_bytes( hub.__version__, self.shapes_encoder.array, self.byte_positions_encoder.array, @@ -180,7 +180,7 @@ def tobytes(self) -> memoryview: if self.num_samples == 0: return memoryview(bytes()) - return encode( + return encode_chunk( hub.__version__, self.shapes_encoder.array, self.byte_positions_encoder.array, @@ -191,5 +191,5 @@ def tobytes(self) -> memoryview: def frombuffer(cls, buffer: bytes) -> "Chunk": if len(buffer) == 0: return cls() - version, shapes, byte_positions, data = decode(buffer) + version, shapes, byte_positions, data = decode_chunk(buffer) return cls(shapes, byte_positions, data=data) diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 1ea8505f4e..43d7323367 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -90,7 +90,7 @@ def _ndarray_to_ptr(arr: np.ndarray) -> Pointer: return Pointer(arr.__array_interface__["data"][0], arr.itemsize * arr.size) -def _infer_num_bytes( +def _infer_chunk_num_bytes( version: str, shape_info: np.ndarray, byte_positions: np.ndarray, @@ -115,14 +115,14 @@ def _infer_num_bytes( ) -def encode( +def encode_chunk( version: str, shape_info: np.ndarray, byte_positions: np.ndarray, data: Union[Sequence[bytes], Sequence[memoryview]], ) -> memoryview: - flatbuff = malloc(_infer_num_bytes(version, shape_info, byte_positions, data)) + flatbuff = malloc(_infer_chunk_num_bytes(version, shape_info, byte_positions, data)) ptr = flatbuff + 0 # write version @@ -148,10 +148,10 @@ def encode( for d in data: ptr = _write_pybytes(ptr, d) - return flatbuff.bytes + return memoryview(flatbuff.bytes) -def decode( +def decode_chunk( buff: Union[bytes, Pointer, memoryview] ) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: if not isinstance(buff, Pointer): @@ -202,6 +202,45 @@ def decode( data = ptr.memoryview return version, shape_info, byte_positions, data +def encode_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: + len_version = len(version) + flatbuff = malloc( + 1 + len_version + sum([x.nbytes for x in ids]) + ) + + # Write version + ptr = flatbuff + 0 + ptr[0] = len_version + ptr += 1 + + for i, c in enumerate(version): + ptr[i] = ord(c) + + ptr += len_version + + for arr in ids: + 
memcpy(ptr, _ndarray_to_ptr(arr)) + ptr += arr.nbytes + + return memoryview(flatbuff.bytes) + +def decode_chunkids(buff: bytes) -> Tuple[str, np.ndarray]: + ptr = Pointer(c_array=(ctypes.c_byte * len(buff))()) + _write_pybytes(ptr, buff) + buff = ptr + + # Read version + len_version = ptr[0] + ptr += 1 + version = "" + for i in range(len_version): + version += chr(ptr[i]) + ptr += len_version + + # Read chunk ids + ids = np.frombuffer(ptr.memoryview, dtype=hub.constants.ENCODING_DTYPE).reshape(-1, 2).copy() + + return version, ids def test(): version = hub.__version__ @@ -212,10 +251,10 @@ def test(): np.random.randint(100, size=(31, 79)) ) data = [b"1234" * 7, b"abcdefg" * 8, b"qwertyuiop" * 9] - encoded = bytes(encode(version, shape_info, byte_positions, data)) + encoded = bytes(encode_chunk(version, shape_info, byte_positions, data)) # from bytes - decoded = decode(encoded) + decoded = decode_chunk(encoded) version2, shape_info2, byte_positions2, data2 = decoded assert version2 == version np.testing.assert_array_equal(shape_info, shape_info2) @@ -224,7 +263,7 @@ def test(): # from pointer buff = Pointer(c_array=(ctypes.c_byte * len(encoded))(*encoded)) - decoded = decode(buff) + decoded = decode_chunk(buff) version2, shape_info2, byte_positions2, data2 = decoded assert version2 == version np.testing.assert_array_equal(shape_info, shape_info2) diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 92c737aa68..a0cdd4872a 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -3,9 +3,11 @@ import hub from hub.core.storage.cachable import Cachable from io import BytesIO -from typing import Optional, Tuple +from typing import Optional, Tuple, Union import numpy as np from uuid import uuid4 +from hub.core.lowlevel import encode_chunkids, decode_chunkids + # these constants are for accessing the data layout. see the `ChunkIdEncoder` docstring. CHUNK_ID_INDEX = 0 @@ -67,17 +69,35 @@ def __init__(self): Then, you get the left-most column and that is your chunk ID! """ - - self._encoded_ids = None + self._shards: List[np.ndarray] = [] + self._buffer: List[List(int, int)] = [] + + def _flush_buffer(self): + if self._buffer: + self._shards.append(np.array(self._buffer, dtype=ENCODING_DTYPE)) + self._buffer.clear() + + def _get_2d_idx(self, idx: int) -> Tuple[int, int]: + i = 0 + data = self._shards + while True: + try: + num_data_i = len(data[i]) + except IndexError: # slightly faster than checking i < len(self._data) in a loop + return -1, idx + if num_data_i <= idx: + idx -= num_data_i + i += 1 + else: + break + return i, idx def tobytes(self) -> memoryview: - bio = BytesIO() - np.savez( - bio, - version=hub.__encoded_version__, - ids=self._encoded_ids, + self._flush_buffer() + return encode_chunkids( + hub.__version__, + self._shards ) - return bio.getbuffer() @staticmethod def name_from_id(id: ENCODING_DTYPE) -> str: @@ -95,29 +115,41 @@ def id_from_name(name: str) -> ENCODING_DTYPE: def get_name_for_chunk(self, chunk_index: int) -> str: """Gets the name for the chunk at index `chunk_index`. 
If you need to get the name for a chunk from a sample index, instead use `__getitem__`, then `name_from_id`.""" - - chunk_id = self._encoded_ids[:, CHUNK_ID_INDEX][chunk_index] + chunk_id = self.get_entry(chunk_index)[CHUNK_ID_INDEX] return ChunkIdEncoder.name_from_id(chunk_id) @classmethod def frombuffer(cls, buffer: bytes): + version, ids = decode_chunkids(buffer) instance = cls() - bio = BytesIO(buffer) - npz = np.load(bio) - instance._encoded_ids = npz["ids"] + instance._shards = [ids] return instance @property def num_chunks(self) -> int: - if self._encoded_ids is None: - return 0 - return len(self._encoded_ids) + return sum(map(len, self._shards)) + len(self._buffer) + + def get_entry(self, idx): + x, y = self._get_2d_idx(idx) + return self._buffer[y] if x < 0 else self._shards[x][y] + + @property + def last_entry(self) -> int: + if self._buffer: + return self._buffer[-1] + if self._shards: + return self._shards[-1][-1] + + @property + def last_index(self) -> int: + last_entry = self.last_entry + if not last_entry: + return -1 + return last_entry[LAST_INDEX_INDEX] @property def num_samples(self) -> int: - if self._encoded_ids is None: - return 0 - return int(self._encoded_ids[-1, LAST_INDEX_INDEX] + 1) + return self.last_index + 1 def generate_chunk_id(self) -> ENCODING_DTYPE: """Generates a random 64bit chunk ID using uuid4. Also prepares this ID to have samples registered to it. @@ -128,21 +160,10 @@ def generate_chunk_id(self) -> ENCODING_DTYPE: """ id = ENCODING_DTYPE(uuid4().int >> UUID_SHIFT_AMOUNT) - - if self.num_samples == 0: - self._encoded_ids = np.array([[id, -1]], dtype=ENCODING_DTYPE) - - else: - last_index = self.num_samples - 1 - - new_entry = np.array( - [[id, last_index]], - dtype=ENCODING_DTYPE, - ) - self._encoded_ids = np.concatenate([self._encoded_ids, new_entry]) - + self._buffer.append([id, self.last_index]) return id + def register_samples_to_last_chunk_id(self, num_samples: int): """Registers samples to the chunk ID that was generated last with the `generate_chunk_id` method. This method should be called at least once per chunk created. @@ -171,12 +192,12 @@ def register_samples_to_last_chunk_id(self, num_samples: int): "Cannot register 0 num_samples (signifying a partial sample continuing the last chunk) when no last chunk exists." ) - current_entry = self._encoded_ids[-1] + current_entry = self.last_entry # this operation will trigger an overflow for the first addition, so supress the warning - np.seterr(over="ignore") - current_entry[LAST_INDEX_INDEX] += ENCODING_DTYPE(num_samples) - np.seterr(over="warn") + # np.seterr(over="ignore") + self.last_entry[LAST_INDEX_INDEX] += num_samples + # np.seterr(over="warn") def get_local_sample_index(self, global_sample_index: int) -> int: """Converts `global_sample_index` into a new index that is relative to the chunk the sample belongs to. @@ -206,19 +227,23 @@ def get_local_sample_index(self, global_sample_index: int) -> int: int: local index value between 0 and the amount of samples the chunk contains - 1. 
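 
         Example (illustrative):
             With two chunks holding samples 0-9 and 10-24, global sample
             index 15 lies in the second chunk, at local index 15 - 10 = 5.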
""" - _, chunk_index = self.__getitem__(global_sample_index, return_chunk_index=True) # type: ignore + _, (shard_index, chunk_index) = self.get(global_sample_index, return_chunk_index=True) # type: ignore - if chunk_index == 0: + if not shard_index and not chunk_index: return global_sample_index - current_entry = self._encoded_ids[chunk_index - 1] # type: ignore + # current_entry = self._encoded_ids[chunk_index - 1] + current_entry = self._shards[shard_index][chunk_index - 1] # buffer already flushed by get() call last_num_samples = current_entry[LAST_INDEX_INDEX] + 1 - return int(global_sample_index - last_num_samples) + return global_sample_index - int(last_num_samples) + + def __getitem__(self, sample_index: int) -> int: + return self.get(sample_index) - def __getitem__( + def get( self, sample_index: int, return_chunk_index: bool = False - ) -> Tuple[ENCODING_DTYPE, Optional[int]]: + ) -> Union[int, Tuple[int, Tuple[int, int]]]: """Get the ID for the chunk that `sample_index` is stored in. To get the name of the chunk, use `name_from_id`. @@ -242,11 +267,15 @@ def __getitem__( if sample_index < 0: sample_index = (self.num_samples) + sample_index - idx = np.searchsorted(self._encoded_ids[:, LAST_INDEX_INDEX], sample_index) - id = self._encoded_ids[idx, CHUNK_ID_INDEX] + self._flush_buffer() + last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._shards] + shard_idx = bp.searchsorted(last_idxs, sample_index) + shard = self._shards[shard_idx] + idx = np.searchsorted(shard[:, LAST_INDEX_INDEX], sample_index) + id = shard[idx, CHUNK_ID_INDEX] chunk_index = idx if return_chunk_index: - return id, chunk_index + return id, (shard_idx, chunk_index) return id From df034956b110a683456fc1e3cdc67b4a854d1a92 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 8 Jul 2021 16:49:50 +0530 Subject: [PATCH 16/79] debug msgs --- hub/core/meta/encode/chunk_id.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index a0cdd4872a..5ea3d79c0f 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -70,7 +70,7 @@ def __init__(self): """ self._shards: List[np.ndarray] = [] - self._buffer: List[List(int, int)] = [] + self._buffer: List[List[int]] = [] def _flush_buffer(self): if self._buffer: @@ -143,7 +143,7 @@ def last_entry(self) -> int: @property def last_index(self) -> int: last_entry = self.last_entry - if not last_entry: + if last_entry is None: return -1 return last_entry[LAST_INDEX_INDEX] @@ -184,7 +184,7 @@ def register_samples_to_last_chunk_id(self, num_samples: int): if self.num_samples == 0: raise ChunkIdEncoderError( - "Cannot register samples because no chunk IDs exist." + f"Cannot register samples because no chunk IDs exist. 
{self._buffer}, {self._shards}" ) if num_samples == 0 and self.num_chunks < 2: @@ -196,7 +196,7 @@ def register_samples_to_last_chunk_id(self, num_samples: int): # this operation will trigger an overflow for the first addition, so supress the warning # np.seterr(over="ignore") - self.last_entry[LAST_INDEX_INDEX] += num_samples + self.last_entry[LAST_INDEX_INDEX] += ENCODING_DTYPE(num_samples) # np.seterr(over="warn") def get_local_sample_index(self, global_sample_index: int) -> int: @@ -269,7 +269,7 @@ def get( self._flush_buffer() last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._shards] - shard_idx = bp.searchsorted(last_idxs, sample_index) + shard_idx = np.searchsorted(last_idxs, sample_index) shard = self._shards[shard_idx] idx = np.searchsorted(shard[:, LAST_INDEX_INDEX], sample_index) id = shard[idx, CHUNK_ID_INDEX] From 71ee06c80116bd9187ec31593d71e91791527c17 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 8 Jul 2021 19:13:28 +0530 Subject: [PATCH 17/79] fix refcounting bug --- hub/core/chunk.py | 8 +++- hub/core/lowlevel.py | 64 ++++++++++++++++++++++++-------- hub/core/meta/encode/chunk_id.py | 64 ++++++++++++++++++++------------ 3 files changed, 97 insertions(+), 39 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index d5c008b928..d001421cfb 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -8,7 +8,13 @@ from hub.core.meta.encode.shape import ShapeEncoder from hub.core.meta.encode.byte_positions import BytePositionsEncoder -from hub.core.lowlevel import encode_chunk, decode_chunk, malloc, _write_pybytes, _infer_chunk_num_bytes +from hub.core.lowlevel import ( + encode_chunk, + decode_chunk, + malloc, + _write_pybytes, + _infer_chunk_num_bytes, +) class Chunk(Cachable): diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 43d7323367..76ca91a77b 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -1,12 +1,12 @@ import numpy as np import ctypes from collections import namedtuple -from typing import Tuple, Sequence, Union, Optional +from typing import Tuple, Sequence, Union, Optional, List import hub class Pointer(object): - __slots__ = ("address", "size", "_c_array") + __slots__ = ("address", "size", "_c_array", "_refs") def __init__( self, @@ -14,6 +14,7 @@ def __init__( size: Optional[int] = None, c_array: Optional[ctypes.Array] = None, ) -> None: + self._refs: List[ctypes.Array] = [] if c_array is None: if address is None or size is None: raise ValueError("Expected c_array or address and size args.") @@ -26,12 +27,18 @@ def __init__( self.size = len(c_array) def _set_c_array(self) -> None: + try: + self._refs.append(self._c_array) + except AttributeError: + pass self._c_array = (ctypes.c_byte * self.size).from_address(self.address) def __add__(self, i: int) -> "Pointer": assert i >= 0 assert i <= self.size - return Pointer(self.address + i, self.size - i) + ret = Pointer(self.address + i, self.size - i) + ret._refs.append(self._c_array) + return ret def __iadd__(self, i: int) -> "Pointer": assert i >= 0 @@ -155,9 +162,13 @@ def decode_chunk( buff: Union[bytes, Pointer, memoryview] ) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: if not isinstance(buff, Pointer): - ptr = Pointer(c_array=(ctypes.c_byte * len(buff))()) - _write_pybytes(ptr, buff) - buff = ptr + try: + buff = Pointer(c_array=(ctypes.c_byte * len(buff))(*buff)) + except NotImplementedError: + # TODO: exceptions.py + raise Exception( + "Reference for pointer was garbage collected. Maybe because the cache killed it?" 
+ ) copy = True else: copy = False @@ -202,11 +213,10 @@ def decode_chunk( data = ptr.memoryview return version, shape_info, byte_positions, data + def encode_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: len_version = len(version) - flatbuff = malloc( - 1 + len_version + sum([x.nbytes for x in ids]) - ) + flatbuff = malloc(1 + len_version + sum([x.nbytes for x in ids])) # Write version ptr = flatbuff + 0 @@ -224,10 +234,15 @@ def encode_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: return memoryview(flatbuff.bytes) + def decode_chunkids(buff: bytes) -> Tuple[str, np.ndarray]: - ptr = Pointer(c_array=(ctypes.c_byte * len(buff))()) - _write_pybytes(ptr, buff) - buff = ptr + try: + ptr = Pointer(c_array=(ctypes.c_byte * len(buff))(*buff)) + except NotImplementedError: + # TODO: exceptions.py + raise Exception( + "Reference for pointer was garbage collected. Maybe because the cache killed it?" + ) # Read version len_version = ptr[0] @@ -235,14 +250,20 @@ def decode_chunkids(buff: bytes) -> Tuple[str, np.ndarray]: version = "" for i in range(len_version): version += chr(ptr[i]) + ptr += len_version # Read chunk ids - ids = np.frombuffer(ptr.memoryview, dtype=hub.constants.ENCODING_DTYPE).reshape(-1, 2).copy() + ids = ( + np.frombuffer(ptr.memoryview, dtype=hub.constants.ENCODING_DTYPE) + .reshape(-1, 2) + .copy() + ) return version, ids -def test(): + +def test_chunk_encoding(): version = hub.__version__ shape_info = np.cast[hub.constants.ENCODING_DTYPE]( np.random.randint(100, size=(17, 63)) @@ -271,5 +292,18 @@ def test(): assert b"".join(data) == bytes(data2) +def test_chunkids_encoding(): + version = hub.__version__ + shards = [ + np.cast[hub.constants.ENCODING_DTYPE](np.random.randint(100, size=(100, 2))) + ] + encoded = encode_chunkids(version, shards) + decoded = decode_chunkids(encoded) + version2, ids = decoded + assert version2 == version + np.testing.assert_array_equal(np.concatenate(shards), ids) + + if __name__ == "__main__": - test() + test_chunk_encoding() + test_chunkids_encoding() diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 5ea3d79c0f..a0eff31206 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -3,7 +3,7 @@ import hub from hub.core.storage.cachable import Cachable from io import BytesIO -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Union, List import numpy as np from uuid import uuid4 from hub.core.lowlevel import encode_chunkids, decode_chunkids @@ -15,7 +15,7 @@ class ChunkIdEncoder(Cachable): - def __init__(self): + def __init__(self, ids=None): """Custom compressor that allows reading of chunk IDs from a sample index without decompressing. Chunk IDs: @@ -69,8 +69,8 @@ def __init__(self): Then, you get the left-most column and that is your chunk ID! 
""" - self._shards: List[np.ndarray] = [] self._buffer: List[List[int]] = [] + self._shards: List[np.ndarray] = [] if ids is None else [ids] def _flush_buffer(self): if self._buffer: @@ -94,10 +94,13 @@ def _get_2d_idx(self, idx: int) -> Tuple[int, int]: def tobytes(self) -> memoryview: self._flush_buffer() - return encode_chunkids( - hub.__version__, - self._shards - ) + encoded = encode_chunkids(hub.__version__, self._shards) + decoded = decode_chunkids(encoded)[1] + if self._shards: + np.testing.assert_array_equal( + decoded, np.concatenate(self._shards), err_msg=str(bytes(encoded)) + ) + return encoded @staticmethod def name_from_id(id: ENCODING_DTYPE) -> str: @@ -121,9 +124,7 @@ def get_name_for_chunk(self, chunk_index: int) -> str: @classmethod def frombuffer(cls, buffer: bytes): version, ids = decode_chunkids(buffer) - instance = cls() - instance._shards = [ids] - return instance + return cls(ids) @property def num_chunks(self) -> int: @@ -134,11 +135,12 @@ def get_entry(self, idx): return self._buffer[y] if x < 0 else self._shards[x][y] @property - def last_entry(self) -> int: + def last_entry(self) -> Union[np.ndarray, List[int]]: if self._buffer: return self._buffer[-1] if self._shards: return self._shards[-1][-1] + return None @property def last_index(self) -> int: @@ -149,7 +151,15 @@ def last_index(self) -> int: @property def num_samples(self) -> int: - return self.last_index + 1 + if self._buffer: + return self._buffer[-1][LAST_INDEX_INDEX] + 1 + elif self._shards: + return int(self._shards[-1][-1, LAST_INDEX_INDEX] + 1) + return 0 + + @property + def empty(self) -> bool: + return not self._buffer and not self._shards def generate_chunk_id(self) -> ENCODING_DTYPE: """Generates a random 64bit chunk ID using uuid4. Also prepares this ID to have samples registered to it. @@ -158,12 +168,10 @@ def generate_chunk_id(self) -> ENCODING_DTYPE: Returns: ENCODING_DTYPE: The random chunk ID. """ - id = ENCODING_DTYPE(uuid4().int >> UUID_SHIFT_AMOUNT) self._buffer.append([id, self.last_index]) return id - def register_samples_to_last_chunk_id(self, num_samples: int): """Registers samples to the chunk ID that was generated last with the `generate_chunk_id` method. This method should be called at least once per chunk created. @@ -182,7 +190,7 @@ def register_samples_to_last_chunk_id(self, num_samples: int): f"Cannot register negative num samples. Got: {num_samples}" ) - if self.num_samples == 0: + if self.empty: raise ChunkIdEncoderError( f"Cannot register samples because no chunk IDs exist. {self._buffer}, {self._shards}" ) @@ -192,12 +200,14 @@ def register_samples_to_last_chunk_id(self, num_samples: int): "Cannot register 0 num_samples (signifying a partial sample continuing the last chunk) when no last chunk exists." ) - current_entry = self.last_entry - - # this operation will trigger an overflow for the first addition, so supress the warning - # np.seterr(over="ignore") - self.last_entry[LAST_INDEX_INDEX] += ENCODING_DTYPE(num_samples) - # np.seterr(over="warn") + last_entry = self.last_entry + if self._buffer: + last_entry[LAST_INDEX_INDEX] += num_samples + else: + err = np.geterr()["over"] + np.seterr(over="ignore") + last_entry[LAST_INDEX_INDEX] += ENCODING_DTYPE(num_samples) + np.seterr(over=err) def get_local_sample_index(self, global_sample_index: int) -> int: """Converts `global_sample_index` into a new index that is relative to the chunk the sample belongs to. 
@@ -232,14 +242,22 @@ def get_local_sample_index(self, global_sample_index: int) -> int: if not shard_index and not chunk_index: return global_sample_index + if chunk_index: + chunk_index -= 1 + else: + shard_index -= 1 + chunk_index = len(self._shards[shard_index]) - 1 + # current_entry = self._encoded_ids[chunk_index - 1] - current_entry = self._shards[shard_index][chunk_index - 1] # buffer already flushed by get() call + current_entry = self._shards[shard_index][ + chunk_index + ] # buffer already flushed by get() call last_num_samples = current_entry[LAST_INDEX_INDEX] + 1 return global_sample_index - int(last_num_samples) def __getitem__(self, sample_index: int) -> int: - return self.get(sample_index) + return self.get(sample_index) # type: ignore def get( self, sample_index: int, return_chunk_index: bool = False From 44b5ade3cf9ecde973f2f39814cf3128e2e668de Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 8 Jul 2021 19:42:27 +0530 Subject: [PATCH 18/79] ren shards->data --- hub/core/meta/encode/chunk_id.py | 36 ++++++++++++++++---------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index a0eff31206..7fa56739f9 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -70,16 +70,16 @@ def __init__(self, ids=None): """ self._buffer: List[List[int]] = [] - self._shards: List[np.ndarray] = [] if ids is None else [ids] + self._data: List[np.ndarray] = [] if ids is None else [ids] def _flush_buffer(self): if self._buffer: - self._shards.append(np.array(self._buffer, dtype=ENCODING_DTYPE)) + self._data.append(np.array(self._buffer, dtype=ENCODING_DTYPE)) self._buffer.clear() def _get_2d_idx(self, idx: int) -> Tuple[int, int]: i = 0 - data = self._shards + data = self._data while True: try: num_data_i = len(data[i]) @@ -94,11 +94,11 @@ def _get_2d_idx(self, idx: int) -> Tuple[int, int]: def tobytes(self) -> memoryview: self._flush_buffer() - encoded = encode_chunkids(hub.__version__, self._shards) + encoded = encode_chunkids(hub.__version__, self._data) decoded = decode_chunkids(encoded)[1] - if self._shards: + if self._data: np.testing.assert_array_equal( - decoded, np.concatenate(self._shards), err_msg=str(bytes(encoded)) + decoded, np.concatenate(self._data), err_msg=str(bytes(encoded)) ) return encoded @@ -128,18 +128,18 @@ def frombuffer(cls, buffer: bytes): @property def num_chunks(self) -> int: - return sum(map(len, self._shards)) + len(self._buffer) + return sum(map(len, self._data)) + len(self._buffer) def get_entry(self, idx): x, y = self._get_2d_idx(idx) - return self._buffer[y] if x < 0 else self._shards[x][y] + return self._buffer[y] if x < 0 else self._data[x][y] @property def last_entry(self) -> Union[np.ndarray, List[int]]: if self._buffer: return self._buffer[-1] - if self._shards: - return self._shards[-1][-1] + if self._data: + return self._data[-1][-1] return None @property @@ -153,13 +153,13 @@ def last_index(self) -> int: def num_samples(self) -> int: if self._buffer: return self._buffer[-1][LAST_INDEX_INDEX] + 1 - elif self._shards: - return int(self._shards[-1][-1, LAST_INDEX_INDEX] + 1) + elif self._data: + return int(self._data[-1][-1, LAST_INDEX_INDEX] + 1) return 0 @property def empty(self) -> bool: - return not self._buffer and not self._shards + return not self._buffer and not self._data def generate_chunk_id(self) -> ENCODING_DTYPE: """Generates a random 64bit chunk ID using uuid4. Also prepares this ID to have samples registered to it. 
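Throughout these hunks a flat chunk index is translated into a (block, offset) pair, with a negative block index meaning "still in the write buffer" (see `get_entry` above: `self._buffer[y] if x < 0 else self._data[x][y]`). A hedged standalone sketch of that translation (hypothetical function, not from the patch):

    from typing import Sequence, Tuple

    def flat_to_2d(
        blocks: Sequence[Sequence[int]], buffer: Sequence[int], idx: int
    ) -> Tuple[int, int]:
        """Map a flat index over flushed blocks + trailing buffer to (block, offset)."""
        for i, block in enumerate(blocks):
            if idx < len(block):
                return i, idx
            idx -= len(block)
        return -1, idx  # negative block index: offset falls into the buffer

    # e.g. blocks of lengths 3 and 2, then a 2-element buffer:
    assert flat_to_2d([[0, 1, 2], [3, 4]], [5, 6], 4) == (1, 1)
    assert flat_to_2d([[0, 1, 2], [3, 4]], [5, 6], 6) == (-1, 1)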
@@ -192,7 +192,7 @@ def register_samples_to_last_chunk_id(self, num_samples: int): if self.empty: raise ChunkIdEncoderError( - f"Cannot register samples because no chunk IDs exist. {self._buffer}, {self._shards}" + f"Cannot register samples because no chunk IDs exist. {self._buffer}, {self._data}" ) if num_samples == 0 and self.num_chunks < 2: @@ -246,10 +246,10 @@ def get_local_sample_index(self, global_sample_index: int) -> int: chunk_index -= 1 else: shard_index -= 1 - chunk_index = len(self._shards[shard_index]) - 1 + chunk_index = len(self._data[shard_index]) - 1 # current_entry = self._encoded_ids[chunk_index - 1] - current_entry = self._shards[shard_index][ + current_entry = self._data[shard_index][ chunk_index ] # buffer already flushed by get() call last_num_samples = current_entry[LAST_INDEX_INDEX] + 1 @@ -286,9 +286,9 @@ def get( sample_index = (self.num_samples) + sample_index self._flush_buffer() - last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._shards] + last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._data] shard_idx = np.searchsorted(last_idxs, sample_index) - shard = self._shards[shard_idx] + shard = self._data[shard_idx] idx = np.searchsorted(shard[:, LAST_INDEX_INDEX], sample_index) id = shard[idx, CHUNK_ID_INDEX] chunk_index = idx From 6f086e25d3e7a77d56efe7d442b76b84b98a49a6 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Fri, 9 Jul 2021 00:10:06 +0530 Subject: [PATCH 19/79] faster buff load --- hub/core/lowlevel.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 76ca91a77b..5b95593c7c 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -81,15 +81,7 @@ def memcpy(dest: Pointer, src: Pointer, count=None) -> None: def _write_pybytes(ptr: Pointer, byts: bytes) -> Pointer: - try: - ptr2 = Pointer(c_array=(ctypes.c_byte * len(byts))(*byts)) - except NotImplementedError: - # TODO: exceptions.py - raise Exception( - "Reference for pointer was garbage collected. Maybe because the cache killed it?" - ) - - memcpy(ptr, ptr2) + memcpy(ptr, _ndarray_to_ptr(np.frombuffer(byts, dtype=np.byte))) return ptr + len(byts) From 31aa04d2bbfe45c9d439c7f2ea445e339fd25166 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Fri, 9 Jul 2021 00:20:55 +0530 Subject: [PATCH 20/79] save 1 memcpy --- hub/core/lowlevel.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 5b95593c7c..a83a153940 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -89,6 +89,9 @@ def _ndarray_to_ptr(arr: np.ndarray) -> Pointer: return Pointer(arr.__array_interface__["data"][0], arr.itemsize * arr.size) +def _pybytes_to_c_array(byts: bytes) -> Pointer: + return Pointer(np.frombuffer(byts, dtype=np.byte).__array_interface__["data"][0], len(byts)) + def _infer_chunk_num_bytes( version: str, shape_info: np.ndarray, @@ -154,13 +157,7 @@ def decode_chunk( buff: Union[bytes, Pointer, memoryview] ) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: if not isinstance(buff, Pointer): - try: - buff = Pointer(c_array=(ctypes.c_byte * len(buff))(*buff)) - except NotImplementedError: - # TODO: exceptions.py - raise Exception( - "Reference for pointer was garbage collected. Maybe because the cache killed it?" 
- ) + buff = _pybytes_to_c_array(buff) copy = True else: copy = False @@ -228,13 +225,7 @@ def encode_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: def decode_chunkids(buff: bytes) -> Tuple[str, np.ndarray]: - try: - ptr = Pointer(c_array=(ctypes.c_byte * len(buff))(*buff)) - except NotImplementedError: - # TODO: exceptions.py - raise Exception( - "Reference for pointer was garbage collected. Maybe because the cache killed it?" - ) + ptr = _pybytes_to_c_array(buff) # Read version len_version = ptr[0] From e98b008438990e214b85daccdae97de37206c0d8 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Fri, 9 Jul 2021 01:26:46 +0530 Subject: [PATCH 21/79] indexing --- hub/core/lowlevel.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index a83a153940..c5e17268d7 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -51,8 +51,28 @@ def __iadd__(self, i: int) -> "Pointer": def __setitem__(self, idx: int, byte: int) -> None: self._c_array[idx] = byte - def __getitem__(self, idx: int) -> int: - return self._c_array[idx] + def __getitem__(self, idx: Union[int, slice]) -> Union[int, "Pointer"]: + if isinstance(idx, int): + return self._c_array[idx] + elif isinstance(idx, slice): + assert idx.step is None + start = idx.start + end = idx.stop + n = self.size + if start is None: + start = 0 + elif start < 0: + start += n + if end is None: + end = n + elif end < 0: + end += n + assert start >= 0 and start < n + assert end >= start and end <= n + ret = Pointer(self.address + start, end - start) + ret._refs.append(self) + return ret + @property def memoryview(self): @@ -80,7 +100,7 @@ def memcpy(dest: Pointer, src: Pointer, count=None) -> None: ctypes.memmove(dest.address, src.address, count) -def _write_pybytes(ptr: Pointer, byts: bytes) -> Pointer: +def _write_pybytes(ptr: Pointer, byts: Union[bytes, memoryview]) -> Pointer: memcpy(ptr, _ndarray_to_ptr(np.frombuffer(byts, dtype=np.byte))) return ptr + len(byts) @@ -148,6 +168,8 @@ def encode_chunk( # write actual data for d in data: + if isinstance(d, Pointer): + d = d.memoryview ptr = _write_pybytes(ptr, d) return memoryview(flatbuff.bytes) From 3b68c5740425804b13e148580b2bf03d1d8d69d4 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Fri, 9 Jul 2021 01:59:38 +0530 Subject: [PATCH 22/79] cache data len --- hub/core/chunk.py | 12 +++++++++++- hub/core/chunk_engine.py | 21 ++++++++++++--------- hub/core/lowlevel.py | 26 ++++++++++++++------------ 3 files changed, 37 insertions(+), 22 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index d001421cfb..91f7d1f0e0 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -53,6 +53,7 @@ def __init__( self.byte_positions_encoder = BytePositionsEncoder(encoded_byte_positions) self._data: List[memoryview] = [] if data is None else [data] + self._len_data = len(self._data) @property def memoryview_data(self): @@ -156,6 +157,7 @@ def append_sample(self, buffer: memoryview, max_data_bytes: int, shape: Tuple[in # note: incoming_num_bytes can be 0 (empty sample) self._data.append(buffer) + self._len_data += len(buffer) self.update_headers(incoming_num_bytes, shape) def update_headers(self, incoming_num_bytes: int, sample_shape: Tuple[int]): @@ -179,7 +181,14 @@ def __len__(self): hub.__version__, self.shapes_encoder.array, self.byte_positions_encoder.array, - self._data, + len_data=self._len_data, + ) + return ( + 17 + + len(hub.__version__) + + self.shapes_encoder.array.nbytes + + 
self.byte_positions_encoder.array.nbytes + + self._len_data ) def tobytes(self) -> memoryview: @@ -191,6 +200,7 @@ def tobytes(self) -> memoryview: self.shapes_encoder.array, self.byte_positions_encoder.array, self._data, + self._len_data, ) @classmethod diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index e53e23cea3..67a8ae7d43 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -270,8 +270,17 @@ def _create_new_chunk(self): def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): """Formats a batch of `samples` and feeds them into `_append_bytes`.""" - + uniform = False if isinstance(samples, np.ndarray): + uniform = True + elif isinstance(samples, Sequence): + if is_uniform_sequence(samples): + uniform = True + if not isinstance(samples[0], np.ndarray): + samples = np.array(samples) + else: + raise TypeError(f"Unsupported type for extending. Got: {type(samples)}") + if uniform: compression = self.tensor_meta.sample_compression if compression == UNCOMPRESSED: buffers = [] @@ -297,15 +306,9 @@ def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): for sample_object in sample_objects: self.append(sample_object) - - elif isinstance(samples, Sequence): - if is_uniform_sequence(samples): - self.extend(np.array(samples)) - else: - for sample in samples: - self.append(sample) else: - raise TypeError(f"Unsupported type for extending. Got: {type(samples)}") + for sample in samples: + self.append(sample) self.cache.maybe_flush() diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index c5e17268d7..69db5ae9b7 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -57,7 +57,7 @@ def __getitem__(self, idx: Union[int, slice]) -> Union[int, "Pointer"]: elif isinstance(idx, slice): assert idx.step is None start = idx.start - end = idx.stop + end = idx.stop n = self.size if start is None: start = 0 @@ -73,7 +73,6 @@ def __getitem__(self, idx: Union[int, slice]) -> Union[int, "Pointer"]: ret._refs.append(self) return ret - @property def memoryview(self): return memoryview(self._c_array) @@ -110,13 +109,17 @@ def _ndarray_to_ptr(arr: np.ndarray) -> Pointer: def _pybytes_to_c_array(byts: bytes) -> Pointer: - return Pointer(np.frombuffer(byts, dtype=np.byte).__array_interface__["data"][0], len(byts)) + return Pointer( + np.frombuffer(byts, dtype=np.byte).__array_interface__["data"][0], len(byts) + ) + def _infer_chunk_num_bytes( version: str, shape_info: np.ndarray, byte_positions: np.ndarray, - data: Union[Sequence[bytes], Sequence[memoryview]], + data: Optional[Union[Sequence[bytes], Sequence[memoryview]]] = None, + len_data: Optional[int] = None, ): # NOTE: Assumption: version string contains ascii characters only (ord(c) < 128) # NOTE: Assumption: len(version) < 256 @@ -128,13 +131,9 @@ def _infer_chunk_num_bytes( # shape_info_slice_size = 4 + 4 + shape_info.nbytes # byte_positions_slice_size = 4 + 4 + byte_positions.nbytes # data_slice_size = sum(map(len, data)) - return ( - len(version) - + shape_info.nbytes - + byte_positions.nbytes - + sum(map(len, data)) - + 17 - ) + if len_data is None: + len_data = sum(map(len, data)) + return len(version) + shape_info.nbytes + byte_positions.nbytes + len_data + 17 def encode_chunk( @@ -142,9 +141,12 @@ def encode_chunk( shape_info: np.ndarray, byte_positions: np.ndarray, data: Union[Sequence[bytes], Sequence[memoryview]], + len_data: Optional[int], ) -> memoryview: - flatbuff = malloc(_infer_chunk_num_bytes(version, shape_info, byte_positions, data)) + flatbuff = malloc( + 
_infer_chunk_num_bytes(version, shape_info, byte_positions, data, len_data) + ) ptr = flatbuff + 0 # write version From 2d9177239dc8cd1ac7382d342b269362c18c6042 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Fri, 9 Jul 2021 02:14:32 +0530 Subject: [PATCH 23/79] cache _num_chunks --- hub/core/lowlevel.py | 14 +++++++------- hub/core/meta/encode/chunk_id.py | 4 +++- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 69db5ae9b7..8ae4c201ef 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -70,7 +70,7 @@ def __getitem__(self, idx: Union[int, slice]) -> Union[int, "Pointer"]: assert start >= 0 and start < n assert end >= start and end <= n ret = Pointer(self.address + start, end - start) - ret._refs.append(self) + ret._refs.append(self._c_array) return ret @property @@ -120,7 +120,7 @@ def _infer_chunk_num_bytes( byte_positions: np.ndarray, data: Optional[Union[Sequence[bytes], Sequence[memoryview]]] = None, len_data: Optional[int] = None, -): +) -> int: # NOTE: Assumption: version string contains ascii characters only (ord(c) < 128) # NOTE: Assumption: len(version) < 256 assert len(version) < 256 @@ -132,7 +132,7 @@ def _infer_chunk_num_bytes( # byte_positions_slice_size = 4 + 4 + byte_positions.nbytes # data_slice_size = sum(map(len, data)) if len_data is None: - len_data = sum(map(len, data)) + len_data = sum(map(len, data)) # type: ignore return len(version) + shape_info.nbytes + byte_positions.nbytes + len_data + 17 @@ -188,11 +188,11 @@ def decode_chunk( ptr = buff + 0 # read version - len_version = ptr[0] + len_version: int = ptr[0] # type: ignore version = "" ptr += 1 for i in range(len_version): - version += chr(ptr[i]) + version += chr(ptr[i]) # type: ignore ptr += len_version # read shape info @@ -252,11 +252,11 @@ def decode_chunkids(buff: bytes) -> Tuple[str, np.ndarray]: ptr = _pybytes_to_c_array(buff) # Read version - len_version = ptr[0] + len_version: int = ptr[0] # type: ignore ptr += 1 version = "" for i in range(len_version): - version += chr(ptr[i]) + version += chr(ptr[i]) # type: ignore ptr += len_version diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 7fa56739f9..829c760d02 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -71,6 +71,7 @@ def __init__(self, ids=None): """ self._buffer: List[List[int]] = [] self._data: List[np.ndarray] = [] if ids is None else [ids] + self._num_chunks = sum(map(len, self._data)) def _flush_buffer(self): if self._buffer: @@ -128,7 +129,7 @@ def frombuffer(cls, buffer: bytes): @property def num_chunks(self) -> int: - return sum(map(len, self._data)) + len(self._buffer) + return self._num_chunks def get_entry(self, idx): x, y = self._get_2d_idx(idx) @@ -170,6 +171,7 @@ def generate_chunk_id(self) -> ENCODING_DTYPE: """ id = ENCODING_DTYPE(uuid4().int >> UUID_SHIFT_AMOUNT) self._buffer.append([id, self.last_index]) + self._num_chunks += 1 return id def register_samples_to_last_chunk_id(self, num_samples: int): From dcea7cc2a18697da30e4b7e1c20b3f521fe2d0b5 Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Fri, 9 Jul 2021 10:32:20 -0700 Subject: [PATCH 24/79] chunk engine updates cache size --- hub/core/chunk_engine.py | 5 +---- hub/core/storage/lru_cache.py | 13 +++++++++++-- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index a791ab088a..f93e2cb47a 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ 
-200,10 +200,7 @@ def _append_bytes(self, buffer: memoryview, shape: Tuple[int], dtype: np.dtype): self._append_to_new_chunk(buffer, shape) self.chunk_id_encoder.register_samples_to_last_chunk_id(num_samples) - - # TODO implement tests for cache size compute - if self.last_chunk is not None: - self.cache[self.last_chunk_key] = self.last_chunk + self.cache.update_used_cache_for_path(self.last_chunk_key, len(self.last_chunk)) def _try_appending_to_last_chunk( self, buffer: memoryview, shape: Tuple[int] diff --git a/hub/core/storage/lru_cache.py b/hub/core/storage/lru_cache.py index 6d05367d1a..19fa02d503 100644 --- a/hub/core/storage/lru_cache.py +++ b/hub/core/storage/lru_cache.py @@ -37,6 +37,15 @@ def __init__( self.dirty_keys: Set[str] = set() # keys present in cache but not next_storage self.cache_used = 0 + def update_used_cache_for_path(self, path: str, new_size: int): + if new_size < 0: + raise ValueError(f"`new_size` must be >= 0. Got: {new_size}") + if path in self.lru_sizes: + old_size = self.lru_sizes[path] + self.cache_used -= old_size + self.cache_used += new_size + self.lru_sizes[path] = new_size + def flush(self): """Writes data from cache_storage to next_storage. Only the dirty keys are written. This is a cascading function and leads to data being written to the final storage in case of a chained cache. @@ -248,8 +257,8 @@ def _insert_in_cache(self, path: str, value: Union[bytes, Cachable]): self.check_readonly() self._free_up_space(len(value)) self.cache_storage[path] = value # type: ignore - self.cache_used += len(value) - self.lru_sizes[path] = len(value) + + self.update_used_cache_for_path(path, len(value)) def _list_keys(self): """Helper function that lists all the objects present in the cache and the underlying storage. From 04276f0ad0ca793fdf885e42c2fc7b632921303f Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Fri, 9 Jul 2021 10:45:15 -0700 Subject: [PATCH 25/79] rename `remove` -> `remove_from_dirty` --- hub/core/storage/lru_cache.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/hub/core/storage/lru_cache.py b/hub/core/storage/lru_cache.py index 19fa02d503..9e439e20b1 100644 --- a/hub/core/storage/lru_cache.py +++ b/hub/core/storage/lru_cache.py @@ -201,24 +201,24 @@ def __iter__(self): """ yield from self._list_keys() - def _forward(self, path, remove=False): + def _forward(self, path, remove_from_dirty=False): """Forward the value at a given path to the next storage, and un-marks its key. - If the value at the path is Cachable, it will only be un-dirtied if remove=True. + If the value at the path is Cachable, it will only be un-dirtied if remove_from_dirty=True. """ - self._forward_value(path, self.cache_storage[path], remove) + self._forward_value(path, self.cache_storage[path], remove_from_dirty) - def _forward_value(self, path, value, remove=False): + def _forward_value(self, path, value, remove_from_dirty=False): """Forwards a path-value pair to the next storage, and un-marks its key. Args: path (str): the path to the object relative to the root of the provider. value (bytes, Cachable): the value to send to the next storage. - remove (bool, optional): cachable values are not un-marked automatically, + remove_from_dirty (bool, optional): cachable values are not un-marked automatically, as they are externally mutable. Set this to True to un-mark them anyway. 
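        Note: a Cachable value therefore remains in `dirty_keys` after being forwarded, since it may still be mutated while in cache; eviction (see `_pop_from_cache` below) forwards with `remove_from_dirty=True` to fully un-mark it.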
""" cachable = isinstance(value, Cachable) - if not cachable or remove: + if not cachable or remove_from_dirty: self.dirty_keys.discard(path) if cachable: @@ -240,7 +240,7 @@ def _pop_from_cache(self): """Helper function that pops the least recently used key, value pair from the cache""" key, itemsize = self.lru_sizes.popitem(last=False) if key in self.dirty_keys: - self._forward(key, remove=True) + self._forward(key, remove_from_dirty=True) del self.cache_storage[key] self.cache_used -= itemsize From 36002e2ff76e9ce21b9d40b344b20170e39a4087 Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Fri, 9 Jul 2021 11:10:10 -0700 Subject: [PATCH 26/79] remove some `sum`s --- hub/core/chunk.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index 91f7d1f0e0..7ef7ad045d 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -52,15 +52,19 @@ def __init__( self.shapes_encoder = ShapeEncoder(encoded_shapes) self.byte_positions_encoder = BytePositionsEncoder(encoded_byte_positions) - self._data: List[memoryview] = [] if data is None else [data] - self._len_data = len(self._data) + self._data: List[memoryview] = [] + self._len_data: int = 0 + + if data is not None: + self._data.append(data) + self._len_data += len(data) @property def memoryview_data(self): # deprecated if len(self._data) == 1: return self._data[0] - ptr = malloc(sum(map(len, self._data))) + ptr = malloc(self.num_data_bytes) for data in self._data: ptr = _write_pybytes(ptr, data) return memoryview(ptr.bytes) @@ -122,7 +126,7 @@ def num_samples(self): @property def num_data_bytes(self): - return sum(map(len, self._data)) + return self._len_data def is_under_min_space(self, min_data_bytes_target: int) -> bool: """If this chunk's data is less than `min_data_bytes_target`, returns True.""" @@ -181,14 +185,14 @@ def __len__(self): hub.__version__, self.shapes_encoder.array, self.byte_positions_encoder.array, - len_data=self._len_data, + len_data=self.num_data_bytes, ) return ( 17 + len(hub.__version__) + self.shapes_encoder.array.nbytes + self.byte_positions_encoder.array.nbytes - + self._len_data + + self.num_data_bytes ) def tobytes(self) -> memoryview: @@ -200,7 +204,7 @@ def tobytes(self) -> memoryview: self.shapes_encoder.array, self.byte_positions_encoder.array, self._data, - self._len_data, + self.num_data_bytes, ) @classmethod From fea211d3f525e41d5e157b690434e4f6d387aa4f Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Fri, 9 Jul 2021 23:43:06 +0530 Subject: [PATCH 27/79] optims for seq access --- hub/core/meta/encode/chunk_id.py | 108 +++++++++++++++++++++++++------ 1 file changed, 89 insertions(+), 19 deletions(-) diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 829c760d02..779a268d8a 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -73,9 +73,15 @@ def __init__(self, ids=None): self._data: List[np.ndarray] = [] if ids is None else [ids] self._num_chunks = sum(map(len, self._data)) + self._prev_sample_index: Optional[int] = None + self._prev_chunk_index: Optional[Tuple[int, int]] = None + self._prev_chunk_id: Optional[int] = None + def _flush_buffer(self): if self._buffer: self._data.append(np.array(self._buffer, dtype=ENCODING_DTYPE)) + if self._prev_chunk_index and self._prev_chunk_index[0] < 0: + self._prev_chunk_index = (len(self._data) - 1, self._prev_chunk_index[1]) self._buffer.clear() def _get_2d_idx(self, idx: int) -> Tuple[int, int]: @@ -131,10 +137,35 @@ def 
frombuffer(cls, buffer: bytes): def num_chunks(self) -> int: return self._num_chunks - def get_entry(self, idx): + def get_entry(self, idx: int): x, y = self._get_2d_idx(idx) return self._buffer[y] if x < 0 else self._data[x][y] + def _get_entry_2d(self, x: int, y: int): + return self._buffer[y] if x < 0 else self._data[x][y] + + def _decr_2d(self, x, y): + if x < 0: + if y: + return x, y - 1 + return len(self._data) - 1, len(self._data[-1]) - 1 + if y: + return x, y - 1 + if x: + x -= 1 + return x, len(self._data[x]) - 1 + raise IndexError() + + def _incr_2d(self, x: int, y: int): + if x < 0: + return x, y + 1 + # assert y < len(self._data[x]) + if y == len(self._data[x]) - 1: + if x == len(self._data) - 1: + return -1, 0 + return x + 1, 0 + return x, y + 1 + @property def last_entry(self) -> Union[np.ndarray, List[int]]: if self._buffer: @@ -244,16 +275,20 @@ def get_local_sample_index(self, global_sample_index: int) -> int: if not shard_index and not chunk_index: return global_sample_index - if chunk_index: - chunk_index -= 1 + if shard_index < 0: + if chunk_index: + current_entry = self._buffer[chunk_index - 1] + else: + current_entry = self._data[-1][-1] else: - shard_index -= 1 - chunk_index = len(self._data[shard_index]) - 1 - - # current_entry = self._encoded_ids[chunk_index - 1] - current_entry = self._data[shard_index][ - chunk_index - ] # buffer already flushed by get() call + if chunk_index: + chunk_index -= 1 + else: + shard_index -= 1 + chunk_index = len(self._data[shard_index]) - 1 + current_entry = self._data[shard_index][ + chunk_index + ] last_num_samples = current_entry[LAST_INDEX_INDEX] + 1 return global_sample_index - int(last_num_samples) @@ -287,15 +322,50 @@ def get( if sample_index < 0: sample_index = (self.num_samples) + sample_index - self._flush_buffer() - last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._data] - shard_idx = np.searchsorted(last_idxs, sample_index) - shard = self._data[shard_idx] - idx = np.searchsorted(shard[:, LAST_INDEX_INDEX], sample_index) - id = shard[idx, CHUNK_ID_INDEX] - chunk_index = idx + chunk_id = None + if self._prev_chunk_index: + # Optimization for sequential look up + prev_chunk_index = self._prev_chunk_index + # if sample_index == self._prev_sample_index: + # if return_chunk_index: + # return self._prev_chunk_id, prev_chunk_index + # return self._prev_chunk_id + curr_entry = self._get_entry_2d(*prev_chunk_index) + if sample_index <= curr_entry[LAST_INDEX_INDEX]: + if any(prev_chunk_index): + prev_entry = self._get_entry_2d(*(self._decr_2d(*prev_chunk_index))) + if sample_index > prev_entry[LAST_INDEX_INDEX]: + chunk_id = self._prev_chunk_id + else: + chunk_id = self._prev_chunk_id + if chunk_id is not None: + self._prev_sample_index = sample_index + if return_chunk_index: + return chunk_id, prev_chunk_index + return chunk_id + + try: + chunk_index = self._incr_2d(*prev_chunk_index) + next_entry = self._get_entry_2d(*chunk_index) + if sample_index <= next_entry[LAST_INDEX_INDEX]: + chunk_id = next_entry[CHUNK_ID_INDEX] + except IndexError: + pass + + if chunk_id is None: + self._flush_buffer() + last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._data] + shard_index = np.searchsorted(last_idxs, sample_index) + shard = self._data[shard_index] + idx = np.searchsorted(shard[:, LAST_INDEX_INDEX], sample_index) + chunk_id = shard[idx, CHUNK_ID_INDEX] + chunk_index = (shard_index, idx) + + self._prev_sample_index = sample_index + self._prev_chunk_index = chunk_index + self._prev_chunk_id = chunk_id if 
return_chunk_index: - return id, (shard_idx, chunk_index) + return chunk_id, chunk_index - return id + return chunk_id From 1dfb3c21bb30de0c067645c4814489517f73ac72 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sat, 10 Jul 2021 00:05:37 +0530 Subject: [PATCH 28/79] cache entry --- hub/core/meta/encode/chunk_id.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 779a268d8a..a0fb6ba20c 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -76,6 +76,7 @@ def __init__(self, ids=None): self._prev_sample_index: Optional[int] = None self._prev_chunk_index: Optional[Tuple[int, int]] = None self._prev_chunk_id: Optional[int] = None + self._prev_entry: Optional[Union[np.ndarray, List[int]]] = None def _flush_buffer(self): if self._buffer: @@ -330,7 +331,7 @@ def get( # if return_chunk_index: # return self._prev_chunk_id, prev_chunk_index # return self._prev_chunk_id - curr_entry = self._get_entry_2d(*prev_chunk_index) + curr_entry = self._prev_entry if sample_index <= curr_entry[LAST_INDEX_INDEX]: if any(prev_chunk_index): prev_entry = self._get_entry_2d(*(self._decr_2d(*prev_chunk_index))) @@ -346,9 +347,9 @@ def get( try: chunk_index = self._incr_2d(*prev_chunk_index) - next_entry = self._get_entry_2d(*chunk_index) - if sample_index <= next_entry[LAST_INDEX_INDEX]: - chunk_id = next_entry[CHUNK_ID_INDEX] + current_entry = self._get_entry_2d(*chunk_index) + if sample_index <= current_entry[LAST_INDEX_INDEX]: + chunk_id = current_entry[CHUNK_ID_INDEX] except IndexError: pass @@ -358,7 +359,8 @@ def get( shard_index = np.searchsorted(last_idxs, sample_index) shard = self._data[shard_index] idx = np.searchsorted(shard[:, LAST_INDEX_INDEX], sample_index) - chunk_id = shard[idx, CHUNK_ID_INDEX] + current_entry = shard[idx] + chunk_id = current_entry[CHUNK_ID_INDEX] chunk_index = (shard_index, idx) self._prev_sample_index = sample_index From c8a993167194fe711a40c72c230f12250baf7bb6 Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Fri, 9 Jul 2021 12:33:42 -0700 Subject: [PATCH 29/79] 10s upload speedup --- hub/constants.py | 3 +-- hub/core/chunk_engine.py | 30 +++++++++++++++++++----------- hub/util/keys.py | 4 +--- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/hub/constants.py b/hub/constants.py index 3e3591a47f..ad94561856 100644 --- a/hub/constants.py +++ b/hub/constants.py @@ -37,10 +37,9 @@ CHUNKS_FOLDER = "chunks" -CHUNK_EXTENSION = "npz" ENCODED_CHUNK_NAMES_FOLDER = "chunks_index" # unsharded naming will help with backwards compatibility -ENCODED_CHUNK_NAMES_FILENAME = f"unsharded.{CHUNK_EXTENSION}" +ENCODED_CHUNK_NAMES_FILENAME = f"unsharded" ENCODING_DTYPE = np.uint32 # caclulate the number of bits to shift right when converting a 128-bit uuid into `ENCODING_DTYPE` diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index 22b4ce7dd7..27b72311da 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -114,6 +114,8 @@ def __init__( # only the last chunk may be less than this self.min_chunk_size = self.max_chunk_size // 2 + self._last_chunk = None + @property def chunk_id_encoder(self) -> ChunkIdEncoder: """Gets the chunk id encoder from cache, if one is not found it creates a blank encoder. 
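The upload speedup in this commit comes largely from pinning the tail chunk on the engine (`self._last_chunk`) so the hot append path stops resolving the last-chunk key through the LRU cache on every call. A rough standalone sketch of the idea (hypothetical names, not the actual engine API):

    from typing import Dict, Optional

    class TailPinnedWriter:
        """Sketch: keep a direct reference to the chunk being appended to."""

        def __init__(self) -> None:
            self.store: Dict[str, bytearray] = {}
            self._last_chunk: Optional[bytearray] = None

        def _new_chunk(self, name: str) -> bytearray:
            chunk = bytearray()
            self.store[name] = chunk
            self._last_chunk = chunk  # pin: the next append skips the store lookup
            return chunk

        def append(self, name: str, payload: bytes) -> None:
            chunk = self._last_chunk
            if chunk is None:
                chunk = self._new_chunk(name)
            chunk += payload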
@@ -159,18 +161,16 @@ def num_samples(self) -> int: return 0 return self.chunk_id_encoder.num_samples - @property - def last_chunk(self) -> Optional[Chunk]: + def get_last_chunk(self) -> Optional[Chunk]: if self.num_chunks == 0: return None - return self.cache.get_cachable(self.last_chunk_key, Chunk) - - @property - def last_chunk_key(self) -> str: last_chunk_name = self.chunk_id_encoder.get_name_for_chunk(-1) last_chunk_key = get_chunk_key(self.key, last_chunk_name) - return last_chunk_key + + self._last_chunk = self.cache.get_cachable(last_chunk_key, Chunk) + self._last_chunk.key = last_chunk_key + return self._last_chunk @property def tensor_meta(self): @@ -200,7 +200,9 @@ def _append_bytes(self, buffer: memoryview, shape: Tuple[int], dtype: np.dtype): self._append_to_new_chunk(buffer, shape) self.chunk_id_encoder.register_samples_to_last_chunk_id(num_samples) - self.cache.update_used_cache_for_path(self.last_chunk_key, len(self.last_chunk)) + self.cache.update_used_cache_for_path( + self._last_chunk.key, len(self._last_chunk) + ) def _try_appending_to_last_chunk( self, buffer: memoryview, shape: Tuple[int] @@ -216,7 +218,7 @@ def _try_appending_to_last_chunk( bool: True if `buffer` was successfully written to the last chunk, otherwise False. """ - last_chunk = self.last_chunk + last_chunk = self._last_chunk if last_chunk is None: return False @@ -261,12 +263,16 @@ def _create_new_chunk(self): chunk_id = self.chunk_id_encoder.generate_chunk_id() chunk = Chunk() chunk_name = ChunkIdEncoder.name_from_id(chunk_id) - chunk_key = get_chunk_key(self.key, chunk_name) - self.cache[chunk_key] = chunk + chunk.key = get_chunk_key(self.key, chunk_name) + self.cache[chunk.key] = chunk + self._last_chunk = chunk return chunk def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): """Formats a batch of `samples` and feeds them into `_append_bytes`.""" + + self.get_last_chunk() + uniform = False if isinstance(samples, np.ndarray): uniform = True @@ -312,6 +318,8 @@ def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): def append(self, sample: SampleValue): """Formats a single `sample` (compresseses/decompresses if applicable) and feeds it into `_append_bytes`.""" + self.get_last_chunk() + if isinstance(sample, Sample): # has to decompress to read the array's shape and dtype # might be able to optimize this away diff --git a/hub/util/keys.py b/hub/util/keys.py index 95d9a7069a..f10fb6649e 100644 --- a/hub/util/keys.py +++ b/hub/util/keys.py @@ -5,9 +5,7 @@ def get_chunk_key(key: str, chunk_name: str) -> str: - return posixpath.join( - key, constants.CHUNKS_FOLDER, f"{chunk_name}.{constants.CHUNK_EXTENSION}" - ) + return posixpath.join(key, constants.CHUNKS_FOLDER, f"{chunk_name}") def get_dataset_meta_key() -> str: From 16d5fb9f9f60ceec8f496678304ff86f5bd4942e Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Fri, 9 Jul 2021 12:37:35 -0700 Subject: [PATCH 30/79] fix mypy --- hub/core/chunk_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index f93e2cb47a..b7483aef52 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -200,7 +200,7 @@ def _append_bytes(self, buffer: memoryview, shape: Tuple[int], dtype: np.dtype): self._append_to_new_chunk(buffer, shape) self.chunk_id_encoder.register_samples_to_last_chunk_id(num_samples) - self.cache.update_used_cache_for_path(self.last_chunk_key, len(self.last_chunk)) + self.cache.update_used_cache_for_path(self.last_chunk_key, 
len(self.last_chunk)) # type: ignore def _try_appending_to_last_chunk( self, buffer: memoryview, shape: Tuple[int] From 121753de9863e397d7b3bc733c6b6c44f3d79ace Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Fri, 9 Jul 2021 14:19:48 -0700 Subject: [PATCH 31/79] load chunk ID encoder --- hub/core/chunk.py | 8 ++++---- hub/core/chunk_engine.py | 37 +++++++++++++++---------------------- 2 files changed, 19 insertions(+), 26 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index 7ef7ad045d..7c5a918ca3 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -53,11 +53,11 @@ def __init__( self.byte_positions_encoder = BytePositionsEncoder(encoded_byte_positions) self._data: List[memoryview] = [] - self._len_data: int = 0 + self._num_data_bytes: int = 0 # replaces: sum(map(len, self._data)) if data is not None: self._data.append(data) - self._len_data += len(data) + self._num_data_bytes += len(data) @property def memoryview_data(self): @@ -126,7 +126,7 @@ def num_samples(self): @property def num_data_bytes(self): - return self._len_data + return self._num_data_bytes def is_under_min_space(self, min_data_bytes_target: int) -> bool: """If this chunk's data is less than `min_data_bytes_target`, returns True.""" @@ -161,7 +161,7 @@ def append_sample(self, buffer: memoryview, max_data_bytes: int, shape: Tuple[in # note: incoming_num_bytes can be 0 (empty sample) self._data.append(buffer) - self._len_data += len(buffer) + self._num_data_bytes += len(buffer) self.update_headers(incoming_num_bytes, shape) def update_headers(self, incoming_num_bytes: int, sample_shape: Tuple[int]): diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index 6cbf3b1dd5..5d5cf21977 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -114,10 +114,10 @@ def __init__( # only the last chunk may be less than this self.min_chunk_size = self.max_chunk_size // 2 - self._last_chunk = None + self.get_chunk_id_encoder() + self.get_last_chunk() - @property - def chunk_id_encoder(self) -> ChunkIdEncoder: + def get_chunk_id_encoder(self) -> ChunkIdEncoder: """Gets the chunk id encoder from cache, if one is not found it creates a blank encoder. For more information on what `ChunkIdEncoder` is used for, see the `__init__` docstring. @@ -130,46 +130,39 @@ def chunk_id_encoder(self) -> ChunkIdEncoder: """ key = get_chunk_id_encoder_key(self.key) - if not self.chunk_id_encoder_exists: + if key in self.cache: + self.chunk_id_encoder = self.cache.get_cachable(key, ChunkIdEncoder) + else: # 1 because we always update the meta information before writing the samples (to account for potentially corrupted data in the future) if self.tensor_meta.length > 1: raise CorruptedMetaError( f"Tensor length is {self.tensor_meta.length}, but could not find the chunk id encoder." 
) - enc = ChunkIdEncoder() - self.cache[key] = enc - return enc + self.chunk_id_encoder = ChunkIdEncoder() + self.cache[key] = self.chunk_id_encoder - enc = self.cache.get_cachable(key, ChunkIdEncoder) - return enc - - @property - def chunk_id_encoder_exists(self) -> bool: - return get_chunk_id_encoder_key(self.key) in self.cache + return self.chunk_id_encoder @property def num_chunks(self) -> int: - if not self.chunk_id_encoder_exists: - return 0 return self.chunk_id_encoder.num_chunks @property def num_samples(self) -> int: - if not self.chunk_id_encoder_exists: - return 0 return self.chunk_id_encoder.num_samples def get_last_chunk(self) -> Optional[Chunk]: if self.num_chunks == 0: - return None + self._last_chunk = None + else: + last_chunk_name = self.chunk_id_encoder.get_name_for_chunk(-1) + last_chunk_key = get_chunk_key(self.key, last_chunk_name) - last_chunk_name = self.chunk_id_encoder.get_name_for_chunk(-1) - last_chunk_key = get_chunk_key(self.key, last_chunk_name) + self._last_chunk = self.cache.get_cachable(last_chunk_key, Chunk) + self._last_chunk.key = last_chunk_key - self._last_chunk = self.cache.get_cachable(last_chunk_key, Chunk) - self._last_chunk.key = last_chunk_key return self._last_chunk @property From 7c2221e4aa9ca6b06f877a85347a7707960bd6e2 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sat, 10 Jul 2021 05:10:28 +0530 Subject: [PATCH 32/79] mypass binsearch --- hub/core/chunk_engine.py | 16 +- hub/core/meta/encode/chunk_id.py | 241 ++++++++++++------ .../encode/tests/test_chunk_id_encoder.py | 2 +- 3 files changed, 167 insertions(+), 92 deletions(-) diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index 22b4ce7dd7..30e2d61991 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -200,7 +200,7 @@ def _append_bytes(self, buffer: memoryview, shape: Tuple[int], dtype: np.dtype): self._append_to_new_chunk(buffer, shape) self.chunk_id_encoder.register_samples_to_last_chunk_id(num_samples) - self.cache.update_used_cache_for_path(self.last_chunk_key, len(self.last_chunk)) + self.cache.update_used_cache_for_path(self.last_chunk_key, len(self.last_chunk)) # type: ignore def _try_appending_to_last_chunk( self, buffer: memoryview, shape: Tuple[int] @@ -345,27 +345,23 @@ def numpy( last_shape = None samples = [] - for global_sample_index in index.values[0].indices(length): - chunk_id = enc[global_sample_index] + for chunk_id, local_sample_index in enc.iter(index.values[0].value): chunk_name = ChunkIdEncoder.name_from_id(chunk_id) chunk_key = get_chunk_key(self.key, chunk_name) chunk = self.cache.get_cachable(chunk_key, Chunk) - sample = self.read_sample_from_chunk(global_sample_index, chunk) + sample = self.read_sample_from_chunk(chunk, local_sample_index) shape = sample.shape - if not aslist and last_shape is not None: if shape != last_shape: raise DynamicTensorNumpyError(self.key, index, "shape") - samples.append(sample) last_shape = shape - return _format_samples(samples, index, aslist) def read_sample_from_chunk( - self, global_sample_index: int, chunk: Chunk + self, chunk: Chunk, local_sample_index: int ) -> np.ndarray: - """Read a sample from a chunk, converts the global index into a local index. Handles decompressing if applicable.""" + """Read a sample from a chunk, given the local index. 
Handles decompressing if applicable.""" expect_compressed = self.tensor_meta.sample_compression != UNCOMPRESSED dtype = self.tensor_meta.dtype @@ -373,7 +369,7 @@ def read_sample_from_chunk( enc = self.chunk_id_encoder # buffer = chunk.memoryview_data - local_sample_index = enc.get_local_sample_index(global_sample_index) + # local_sample_index = enc.get_local_sample_index(global_sample_index) shape = chunk.shapes_encoder[local_sample_index] sb, eb = chunk.byte_positions_encoder[local_sample_index] diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index a0fb6ba20c..a2d73e5595 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -7,6 +7,8 @@ import numpy as np from uuid import uuid4 from hub.core.lowlevel import encode_chunkids, decode_chunkids +from hub.core.index import IndexEntry +import math # these constants are for accessing the data layout. see the `ChunkIdEncoder` docstring. @@ -73,16 +75,9 @@ def __init__(self, ids=None): self._data: List[np.ndarray] = [] if ids is None else [ids] self._num_chunks = sum(map(len, self._data)) - self._prev_sample_index: Optional[int] = None - self._prev_chunk_index: Optional[Tuple[int, int]] = None - self._prev_chunk_id: Optional[int] = None - self._prev_entry: Optional[Union[np.ndarray, List[int]]] = None - def _flush_buffer(self): if self._buffer: self._data.append(np.array(self._buffer, dtype=ENCODING_DTYPE)) - if self._prev_chunk_index and self._prev_chunk_index[0] < 0: - self._prev_chunk_index = (len(self._data) - 1, self._prev_chunk_index[1]) self._buffer.clear() def _get_2d_idx(self, idx: int) -> Tuple[int, int]: @@ -145,7 +140,7 @@ def get_entry(self, idx: int): def _get_entry_2d(self, x: int, y: int): return self._buffer[y] if x < 0 else self._data[x][y] - def _decr_2d(self, x, y): + def _decr_2d(self, x: int, y: int) -> Tuple[int, int]: if x < 0: if y: return x, y - 1 @@ -157,7 +152,7 @@ def _decr_2d(self, x, y): return x, len(self._data[x]) - 1 raise IndexError() - def _incr_2d(self, x: int, y: int): + def _incr_2d(self, x: int, y: int) -> Tuple[int, int]: if x < 0: return x, y + 1 # assert y < len(self._data[x]) @@ -167,6 +162,13 @@ def _incr_2d(self, x: int, y: int): return x + 1, 0 return x, y + 1 + def _is_origin(self, x: int, y: int) -> bool: + if not x and not y: + return True + if x < 0 and not self._data and not y: + return True + return False + @property def last_entry(self) -> Union[np.ndarray, List[int]]: if self._buffer: @@ -180,12 +182,12 @@ def last_index(self) -> int: last_entry = self.last_entry if last_entry is None: return -1 - return last_entry[LAST_INDEX_INDEX] + return int(last_entry[LAST_INDEX_INDEX]) @property def num_samples(self) -> int: if self._buffer: - return self._buffer[-1][LAST_INDEX_INDEX] + 1 + return int(self._buffer[-1][LAST_INDEX_INDEX] + 1) elif self._data: return int(self._data[-1][-1, LAST_INDEX_INDEX] + 1) return 0 @@ -218,7 +220,6 @@ def register_samples_to_last_chunk_id(self, num_samples: int): ChunkIdEncoderError: Must call `generate_chunk_id` before registering samples. ChunkIdEncoderError: `num_samples` can only be 0 if it is able to be a sample continuation accross chunks. """ - if num_samples < 0: raise ValueError( f"Cannot register negative num samples. Got: {num_samples}" @@ -271,34 +272,16 @@ def get_local_sample_index(self, global_sample_index: int) -> int: int: local index value between 0 and the amount of samples the chunk contains - 1. 
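            For example (illustrative): if the first chunk holds samples 0-29, global index 30 falls in the next chunk and maps to local index 0.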
""" - _, (shard_index, chunk_index) = self.get(global_sample_index, return_chunk_index=True) # type: ignore - - if not shard_index and not chunk_index: - return global_sample_index - - if shard_index < 0: - if chunk_index: - current_entry = self._buffer[chunk_index - 1] - else: - current_entry = self._data[-1][-1] - else: - if chunk_index: - chunk_index -= 1 - else: - shard_index -= 1 - chunk_index = len(self._data[shard_index]) - 1 - current_entry = self._data[shard_index][ - chunk_index - ] - last_num_samples = current_entry[LAST_INDEX_INDEX] + 1 - - return global_sample_index - int(last_num_samples) + return self.get(global_sample_index, return_local_sample_index=True)[1] def __getitem__(self, sample_index: int) -> int: return self.get(sample_index) # type: ignore def get( - self, sample_index: int, return_chunk_index: bool = False + self, + sample_index: int, + return_chunk_index: bool = False, + return_local_sample_index: bool = False, ) -> Union[int, Tuple[int, Tuple[int, int]]]: """Get the ID for the chunk that `sample_index` is stored in. To get the name of the chunk, use `name_from_id`. @@ -323,51 +306,147 @@ def get( if sample_index < 0: sample_index = (self.num_samples) + sample_index - chunk_id = None - if self._prev_chunk_index: - # Optimization for sequential look up - prev_chunk_index = self._prev_chunk_index - # if sample_index == self._prev_sample_index: - # if return_chunk_index: - # return self._prev_chunk_id, prev_chunk_index - # return self._prev_chunk_id - curr_entry = self._prev_entry - if sample_index <= curr_entry[LAST_INDEX_INDEX]: - if any(prev_chunk_index): - prev_entry = self._get_entry_2d(*(self._decr_2d(*prev_chunk_index))) - if sample_index > prev_entry[LAST_INDEX_INDEX]: - chunk_id = self._prev_chunk_id - else: - chunk_id = self._prev_chunk_id - if chunk_id is not None: - self._prev_sample_index = sample_index - if return_chunk_index: - return chunk_id, prev_chunk_index - return chunk_id - - try: - chunk_index = self._incr_2d(*prev_chunk_index) - current_entry = self._get_entry_2d(*chunk_index) - if sample_index <= current_entry[LAST_INDEX_INDEX]: - chunk_id = current_entry[CHUNK_ID_INDEX] - except IndexError: - pass - - if chunk_id is None: + self._flush_buffer() + last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._data] + shard_index = np.searchsorted(last_idxs, sample_index) + shard = self._data[shard_index] + idx = np.searchsorted(shard[:, LAST_INDEX_INDEX], sample_index) + current_entry = shard[idx] + chunk_id = current_entry[CHUNK_ID_INDEX] + chunk_index = (shard_index, idx) + ret = [chunk_id] + if return_chunk_index: + ret.append(chunk_index) + if return_local_sample_index: + if any(chunk_index): + prev_entry = self._get_entry_2d(*self._decr_2d(*chunk_index)) + local_sample_index = ( + sample_index - int(prev_entry[LAST_INDEX_INDEX]) - 1 + ) + else: + local_sample_index = sample_index + ret.append(local_sample_index) + + return tuple(ret) + + def iter(self, index: Union[int, slice, tuple] = slice(None)): + if isinstance(index, int): + yield self.get(index, return_local_sample_index=True) + elif isinstance(index, slice): + start = 0 if index.start is None else index.start + stop = self.num_samples if index.stop is None else index.stop + step = 1 if index.step is None else index.step + assert isinstance(start, int) + assert isinstance(stop, int) + assert isinstance(step, int) + assert step != 0 + if step > 0: + total = math.ceil((stop - start) / step) + forward = True + else: + step = -step + total = math.ceil((stop - start) / step) + start, 
stop = stop - 1, start + forward = False + if not total: + return + n = 0 self._flush_buffer() - last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._data] - shard_index = np.searchsorted(last_idxs, sample_index) + chunk_id, (shard_index, chunk_index), local_sample_index = self.get( + start, return_chunk_index=True, return_local_sample_index=True + ) shard = self._data[shard_index] - idx = np.searchsorted(shard[:, LAST_INDEX_INDEX], sample_index) - current_entry = shard[idx] - chunk_id = current_entry[CHUNK_ID_INDEX] - chunk_index = (shard_index, idx) - - self._prev_sample_index = sample_index - self._prev_chunk_index = chunk_index - self._prev_chunk_id = chunk_id - - if return_chunk_index: - return chunk_id, chunk_index - - return chunk_id + yield chunk_id, local_sample_index + n += 1 + if n == total: + return + ctr = Counter(step) + if forward: + last_index = int(shard[chunk_index, LAST_INDEX_INDEX]) + for i in range(local_sample_index + 1, last_index + 1): + if ctr(): + yield chunk_id, i + n += 1 + if n == total: + return + for chunk_index in range(chunk_index + 1, len(shard)): + entry = shard[chunk_index] + chunk_id = entry[CHUNK_ID_INDEX] + new_last_index = int(entry[LAST_INDEX_INDEX]) + for i in range(new_last_index - last_index): + if ctr(): + yield chunk_id, i + n += 1 + if n == total: + return + last_index = new_last_index + for shard_index in range(shard_index + 1, len(self._data)): + shard = self._data[shard_index] + for entry in shard: + chunk_id = entry[CHUNK_ID_INDEX] + new_last_index = int(entry[LAST_INDEX_INDEX]) + for i in range(new_last_index - last_index): + if ctr(): + yield chunk_id, i + n += 1 + if n == total: + return + last_index = new_last_index + else: + last_index = int(shard[chunk_index, LAST_INDEX_INDEX]) + for local_sample_index in range(local_sample_index - 1, -1, -1): + if ctr(): + yield chunk_id, local_sample_index + n += 1 + if n == total: + return + for chunk_index in range(chunk_index - 1, -1, -1): + entry = shard[chunk_index] + chunk_id = entry[CHUNK_ID_INDEX] + last_index = entry[LAST_INDEX_INDEX] + if chunk_index: + last_index -= shard[chunk_id - 1, LAST_INDEX_INDEX] + elif shard_index: + last_index -= self._data[shard_index - 1][-1, LAST_INDEX_INDEX] + for local_sample_index in range(last_index, -1, -1): + if ctr(): + yield chunk_id, local_sample_index + n += 1 + if n == total: + return + for shard_index in range(shard_index - 1, -1, -1): + shard = self._data[shard_index] + for chunk_index in range(len(shard) - 1, -1, -1): + entry = shard[chunk_index] + chunk_id = entry[CHUNK_ID_INDEX] + last_index = entry[LAST_INDEX_INDEX] + if chunk_index: + last_index -= shard[chunk_id - 1, LAST_INDEX_INDEX] + elif shard_index: + last_index -= self._data[shard_index - 1][ + -1, LAST_INDEX_INDEX + ] + for local_sample_index in range(last_index, -1, -1): + if ctr(): + yield chunk_id, local_sample_index + n += 1 + if n == total: + return + elif isinstance(index, tuple): + for i in index: + # Random access + yield self.get(i, return_local_sample_index=True) + + +class Counter: + # TODO: refac this + def __init__(self, n: int) -> None: + self.n = n + self.i = 0 + + def __call__(self): + self.i += 1 + if self.i == self.n: + self.i = 0 + return True + return False diff --git a/hub/core/meta/encode/tests/test_chunk_id_encoder.py b/hub/core/meta/encode/tests/test_chunk_id_encoder.py index aa891f27fb..d34a9bd20a 100644 --- a/hub/core/meta/encode/tests/test_chunk_id_encoder.py +++ b/hub/core/meta/encode/tests/test_chunk_id_encoder.py @@ -49,7 +49,7 @@ def 
test_trivial(): # test local indexing assert enc.get_local_sample_index(0) == 0 - assert enc.get_local_sample_index(1) == 1 + assert enc.get_local_sample_index(1) == 1, (enc._data, enc._buffer) assert enc.get_local_sample_index(29) == 29 assert enc.get_local_sample_index(30) == 0 assert enc.get_local_sample_index(31) == 0 From 60b73d3f9edadd6a1fe693aa7b7977ebe9a9794a Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sat, 10 Jul 2021 05:14:59 +0530 Subject: [PATCH 33/79] format --- hub/core/meta/encode/chunk_id.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index a2d73e5595..0f24f64899 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -272,7 +272,7 @@ def get_local_sample_index(self, global_sample_index: int) -> int: int: local index value between 0 and the amount of samples the chunk contains - 1. """ - return self.get(global_sample_index, return_local_sample_index=True)[1] + return self.get(global_sample_index, return_local_sample_index=True)[1] # type: ignore def __getitem__(self, sample_index: int) -> int: return self.get(sample_index) # type: ignore @@ -282,7 +282,7 @@ def get( sample_index: int, return_chunk_index: bool = False, return_local_sample_index: bool = False, - ) -> Union[int, Tuple[int, Tuple[int, int]]]: + ) -> Union[int, Tuple[int, Tuple[int, int]], Tuple[int, Tuple[int, int], int], Tuple[int, int]]: """Get the ID for the chunk that `sample_index` is stored in. To get the name of the chunk, use `name_from_id`. @@ -327,7 +327,7 @@ def get( local_sample_index = sample_index ret.append(local_sample_index) - return tuple(ret) + return tuple(ret) # type: ignore def iter(self, index: Union[int, slice, tuple] = slice(None)): if isinstance(index, int): @@ -352,7 +352,7 @@ def iter(self, index: Union[int, slice, tuple] = slice(None)): return n = 0 self._flush_buffer() - chunk_id, (shard_index, chunk_index), local_sample_index = self.get( + chunk_id, (shard_index, chunk_index), local_sample_index = self.get( # type: ignore start, return_chunk_index=True, return_local_sample_index=True ) shard = self._data[shard_index] From 8cb1bce7a3e9ece12c6fadf8c027c6e1d9e78b4e Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sat, 10 Jul 2021 05:25:58 +0530 Subject: [PATCH 34/79] rem debug line --- hub/core/meta/encode/tests/test_chunk_id_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/core/meta/encode/tests/test_chunk_id_encoder.py b/hub/core/meta/encode/tests/test_chunk_id_encoder.py index d34a9bd20a..aa891f27fb 100644 --- a/hub/core/meta/encode/tests/test_chunk_id_encoder.py +++ b/hub/core/meta/encode/tests/test_chunk_id_encoder.py @@ -49,7 +49,7 @@ def test_trivial(): # test local indexing assert enc.get_local_sample_index(0) == 0 - assert enc.get_local_sample_index(1) == 1, (enc._data, enc._buffer) + assert enc.get_local_sample_index(1) == 1 assert enc.get_local_sample_index(29) == 29 assert enc.get_local_sample_index(30) == 0 assert enc.get_local_sample_index(31) == 0 From 91596e494e7ce7e8fa389f50cfe45f296f58060a Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sat, 10 Jul 2021 06:19:18 +0530 Subject: [PATCH 35/79] fr_optimizations_2 --- hub/core/chunk_engine.py | 2 +- hub/core/meta/encode/chunk_id.py | 42 ++++++++++++++++++++++++++------ hub/integrations/pytorch.py | 8 +++--- 3 files changed, 39 insertions(+), 13 deletions(-) diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index 3038bb8f5a..99dd8b4a9e 
100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -195,7 +195,7 @@ def _append_bytes(self, buffer: memoryview, shape: Tuple[int], dtype: np.dtype): self.chunk_id_encoder.register_samples_to_last_chunk_id(num_samples) last_chunk = self._last_chunk - key = last_chunk.key + key = last_chunk.key # type: ignore self.cache.update_used_cache_for_path(key, len(last_chunk)) # type: ignore def _try_appending_to_last_chunk( diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 0f24f64899..8166d754cc 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -75,9 +75,16 @@ def __init__(self, ids=None): self._data: List[np.ndarray] = [] if ids is None else [ids] self._num_chunks = sum(map(len, self._data)) + self._prev_sample_index: Optional[int] = None + self._prev_chunk_id: Optional[int] = None + self._prev_chunk_index: Optional[Tuple[int, int]] = None + self._prev_entry: Optional[Union[np.ndarray, List[int]]] = None + def _flush_buffer(self): if self._buffer: self._data.append(np.array(self._buffer, dtype=ENCODING_DTYPE)) + if self._prev_chunk_index and self._prev_chunk_index[0] < 0: + self._prev_chunk_index = (len(self._data) -1, self._prev_chunk_index[1]) self._buffer.clear() def _get_2d_idx(self, idx: int) -> Tuple[int, int]: @@ -306,14 +313,33 @@ def get( if sample_index < 0: sample_index = (self.num_samples) + sample_index - self._flush_buffer() - last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._data] - shard_index = np.searchsorted(last_idxs, sample_index) - shard = self._data[shard_index] - idx = np.searchsorted(shard[:, LAST_INDEX_INDEX], sample_index) - current_entry = shard[idx] - chunk_id = current_entry[CHUNK_ID_INDEX] - chunk_index = (shard_index, idx) + if self._prev_sample_index is not None and sample_index == self._prev_sample_index + 1: + if sample_index > self._prev_entry[LAST_INDEX_INDEX]: + chunk_index = self._incr_2d(*self._prev_chunk_index) + current_entry = self._get_entry_2d(*chunk_index) + chunk_id = current_entry[CHUNK_ID_INDEX] + self._prev_entry = current_entry + self._prev_chunk_id = chunk_id + else: + chunk_id = self._prev_chunk_id + chunk_index = self._prev_chunk_index + else: + self._flush_buffer() + last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._data] + shard_index = np.searchsorted(last_idxs, sample_index) + shard = self._data[shard_index] + idx = np.searchsorted(shard[:, LAST_INDEX_INDEX], sample_index) + current_entry = shard[idx] + chunk_id = current_entry[CHUNK_ID_INDEX] + chunk_index = (shard_index, idx) + self._prev_entry = current_entry + self._prev_chunk_id = chunk_id + + self._prev_sample_index = sample_index + self._prev_chunk_index = chunk_index + + if not return_chunk_index and not return_local_sample_index: + return chunk_id ret = [chunk_id] if return_chunk_index: ret.append(chunk_index) diff --git a/hub/integrations/pytorch.py b/hub/integrations/pytorch.py index a1b90a912f..2d6a2f360d 100644 --- a/hub/integrations/pytorch.py +++ b/hub/integrations/pytorch.py @@ -243,10 +243,10 @@ def _generate_shared_memory_names(self, chunk_names: Set[str]): ls.append(f"al_{self.last_chunk_num_generated}") return ls - def _numpy_from_chunk(self, index: int, key: str, chunk): + def _numpy_from_chunk(self, chunk, key: str, local_index: int): """Takes a list of chunks and returns a numpy array from it""" chunk_engine = self.all_chunk_engines[key] - value = chunk_engine.read_sample_from_chunk(index, chunk) + value = chunk_engine.read_sample_from_chunk(chunk, 
local_index) # typecast if incompatible with pytorch if value.dtype == "uint16": @@ -279,14 +279,14 @@ def _get_data_from_chunks( actual_index = self.index_offset + i # TODO change this once it returns list/set of str chunk_engine = self.all_chunk_engines[key] - chunk_id = chunk_engine.chunk_id_encoder[actual_index] + chunk_id, local_index = chunk_engine.chunk_id_encoder.get(actual_index, return_local_sample_index=True) chunk_name = chunk_engine.chunk_id_encoder.name_from_id(chunk_id) # type: ignore if chunk_name not in chunk_map: self.last_index_meta[key] = i - 1 return chunk = chunk_map[chunk_name] self.all_index_value_maps[key][i] = self._numpy_from_chunk( - actual_index, key, chunk + chunk, key, local_index ) self.last_index_meta[key] = len(self) - 1 From 2f28314164b564a7afce9bc46887c548f6559db2 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sun, 11 Jul 2021 14:01:36 +0530 Subject: [PATCH 36/79] optimize tensor iteration --- hub/api/tensor.py | 15 +++++++++++++-- hub/core/chunk_engine.py | 9 ++++++--- hub/core/meta/encode/chunk_id.py | 20 ++++++++++++++------ hub/integrations/pytorch.py | 4 +++- 4 files changed, 36 insertions(+), 12 deletions(-) diff --git a/hub/api/tensor.py b/hub/api/tensor.py index 1f24783900..3af85e05e3 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -43,6 +43,8 @@ def __init__( self.chunk_engine = ChunkEngine(self.key, self.storage) + self._sample: Optional[Tuple(int, int)] = None + def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): """Extends the end of the tensor by appending multiple elements from a sequence. Accepts a sequence, a single batched numpy array, or a sequence of `hub.load` outputs, which can be used to load files. See examples down below. @@ -71,6 +73,7 @@ def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): The length should be equal to the number of samples to add. """ self.chunk_engine.extend(samples) + self._sample = None def append( self, @@ -192,8 +195,12 @@ def __setitem__(self, item: Union[int, slice], value: np.ndarray): raise NotImplementedError("Tensor update not currently supported!") def __iter__(self): - for i in range(len(self)): - yield self[i] + for i, (chunk_id, local_sample_index) in enumerate( + self.chunk_engine.chunk_id_encoder.iter(self.index.values[0].value) + ): + tensor_i = Tensor(self.key, self.storage, index=self.index[i]) + tensor_i._sample = chunk_id, local_sample_index + yield tensor_i def numpy(self, aslist=False) -> Union[np.ndarray, List[np.ndarray]]: """Computes the contents of the tensor in numpy format. @@ -209,6 +216,10 @@ def numpy(self, aslist=False) -> Union[np.ndarray, List[np.ndarray]]: Returns: A numpy array containing the data represented by this tensor. 
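The `_sample` field introduced in this patch is the key to the speedup: `__iter__` now yields tensor views that already carry their `(chunk_id, local_sample_index)` pair, so the `numpy()` fast path just below can skip the chunk ID encoder entirely. A minimal sketch of the idea, using toy stand-in classes rather than the real hub API:

import numpy as np

class TinyTensor:
    """Toy tensor whose samples live in a list of numpy 'chunks'."""

    def __init__(self, chunks, hint=None):
        self.chunks = chunks  # list of arrays, one row per sample
        self._sample = hint   # optional (chunk_index, local_index) fast path

    def __iter__(self):
        # Yield views that already know where their sample lives,
        # so reading them back needs no index lookup at all.
        for ci, chunk in enumerate(self.chunks):
            for li in range(len(chunk)):
                yield TinyTensor(self.chunks, hint=(ci, li))

    def numpy(self):
        ci, li = self._sample
        return self.chunks[ci][li]

t = TinyTensor([np.zeros((3, 2)), np.ones((2, 2))])
assert [v.numpy().tolist() for v in t][3] == [1.0, 1.0]

The real tensor also has to invalidate the hint on `extend` (as the hunk below does with `self._sample = None`), since appending samples changes which chunk holds what.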
""" + if self._sample: + chunk_id, local_sample_index = self._sample + chunk = self.chunk_engine.get_chunk_from_id(chunk_id) + return self.chunk_engine.read_sample_from_chunk(chunk, local_sample_index) return self.chunk_engine.numpy(self.index, aslist=aslist) diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index 99dd8b4a9e..b50eadb850 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -348,9 +348,7 @@ def numpy( samples = [] for chunk_id, local_sample_index in enc.iter(index.values[0].value): - chunk_name = ChunkIdEncoder.name_from_id(chunk_id) - chunk_key = get_chunk_key(self.key, chunk_name) - chunk = self.cache.get_cachable(chunk_key, Chunk) + chunk = self.get_chunk_from_id(chunk_id) sample = self.read_sample_from_chunk(chunk, local_sample_index) shape = sample.shape if not aslist and last_shape is not None: @@ -360,6 +358,11 @@ def numpy( last_shape = shape return _format_samples(samples, index, aslist) + def get_chunk_from_id(self, chunk_id: int) -> Chunk: + chunk_name = ChunkIdEncoder.name_from_id(chunk_id) + chunk_key = get_chunk_key(self.key, chunk_name) + return self.cache.get_cachable(chunk_key, Chunk) + def read_sample_from_chunk( self, chunk: Chunk, local_sample_index: int ) -> np.ndarray: diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 8166d754cc..1472e5d053 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -84,7 +84,7 @@ def _flush_buffer(self): if self._buffer: self._data.append(np.array(self._buffer, dtype=ENCODING_DTYPE)) if self._prev_chunk_index and self._prev_chunk_index[0] < 0: - self._prev_chunk_index = (len(self._data) -1, self._prev_chunk_index[1]) + self._prev_chunk_index = (len(self._data) - 1, self._prev_chunk_index[1]) self._buffer.clear() def _get_2d_idx(self, idx: int) -> Tuple[int, int]: @@ -289,7 +289,12 @@ def get( sample_index: int, return_chunk_index: bool = False, return_local_sample_index: bool = False, - ) -> Union[int, Tuple[int, Tuple[int, int]], Tuple[int, Tuple[int, int], int], Tuple[int, int]]: + ) -> Union[ + int, + Tuple[int, Tuple[int, int]], + Tuple[int, Tuple[int, int], int], + Tuple[int, int], + ]: """Get the ID for the chunk that `sample_index` is stored in. To get the name of the chunk, use `name_from_id`. 
@@ -313,16 +318,19 @@ def get( if sample_index < 0: sample_index = (self.num_samples) + sample_index - if self._prev_sample_index is not None and sample_index == self._prev_sample_index + 1: - if sample_index > self._prev_entry[LAST_INDEX_INDEX]: - chunk_index = self._incr_2d(*self._prev_chunk_index) + if ( + self._prev_sample_index is not None + and sample_index == self._prev_sample_index + 1 + ): + if sample_index > self._prev_entry[LAST_INDEX_INDEX]: # type: ignore + chunk_index = self._incr_2d(*self._prev_chunk_index) # type: ignore current_entry = self._get_entry_2d(*chunk_index) chunk_id = current_entry[CHUNK_ID_INDEX] self._prev_entry = current_entry self._prev_chunk_id = chunk_id else: chunk_id = self._prev_chunk_id - chunk_index = self._prev_chunk_index + chunk_index = self._prev_chunk_index # type: ignore else: self._flush_buffer() last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._data] diff --git a/hub/integrations/pytorch.py b/hub/integrations/pytorch.py index 2d6a2f360d..61eddb504c 100644 --- a/hub/integrations/pytorch.py +++ b/hub/integrations/pytorch.py @@ -279,7 +279,9 @@ def _get_data_from_chunks( actual_index = self.index_offset + i # TODO change this once it returns list/set of str chunk_engine = self.all_chunk_engines[key] - chunk_id, local_index = chunk_engine.chunk_id_encoder.get(actual_index, return_local_sample_index=True) + chunk_id, local_index = chunk_engine.chunk_id_encoder.get( + actual_index, return_local_sample_index=True + ) chunk_name = chunk_engine.chunk_id_encoder.name_from_id(chunk_id) # type: ignore if chunk_name not in chunk_map: self.last_index_meta[key] = i - 1 From 7fd01931b5727fe1b2fcddfb7e4c3ffcc9f816f8 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sun, 11 Jul 2021 15:43:51 +0530 Subject: [PATCH 37/79] dsiter --- hub/api/dataset.py | 17 +++++++++++++---- hub/api/tensor.py | 12 ++++++++++++ hub/api/tests/test_api.py | 27 +++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 4 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 1c839b1385..6bb123e95a 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -38,6 +38,7 @@ def __init__( storage: Optional[StorageProvider] = None, public: Optional[bool] = True, token: Optional[str] = None, + _tensors: Optional[Dict[str, Tensor]] = None ): """Initializes a new or existing dataset. 
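Note what the pytorch loader change above buys: a single `get(..., return_local_sample_index=True)` call now returns both the chunk ID and the sample's position inside that chunk, instead of two separate encoder walks. The local index falls out of the same table row, as the global index minus the previous row's last index, minus one. A rough standalone sketch, again assuming a single flat `(chunk_id, last_global_index)` table:

import bisect

def chunk_and_local(entries, g):
    # entries: [(chunk_id, last_global_index), ...] sorted by last index
    lasts = [last for _, last in entries]
    row = bisect.bisect_left(lasts, g)
    prev_last = entries[row - 1][1] if row > 0 else -1
    return entries[row][0], g - prev_last - 1

entries = [(101, 9), (102, 19)]  # chunk 101 holds samples 0..9, 102 holds 10..19
assert chunk_and_local(entries, 0) == (101, 0)
assert chunk_and_local(entries, 10) == (102, 0)
assert chunk_and_local(entries, 19) == (102, 9)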
@@ -89,7 +90,7 @@ def __init__( self.storage.autoflush = True self.index = index or Index() - self.tensors: Dict[str, Tensor] = {} + self.tensors: Dict[str, Tensor] = _tensors if _tensors else {} self._token = token @@ -213,6 +214,14 @@ def __setattr__(self, name: str, value): def __iter__(self): for i in range(len(self)): yield self[i] + return + tensor_names = list(self.tensors) + tensors_sliced = [t[self.index][:len(self)] for t in self.tensors.values()] + num_tensors = len(tensor_names) + for tensors in zip(*tensors_sliced): + tensors = {tensor_names[i]: tensors[i] for i in range(num_tensors)} + ds = Dataset(read_only=True, storage=self.storage, _tensors=tensors) + yield ds def _load_meta(self): meta_key = get_dataset_meta_key() @@ -220,9 +229,9 @@ def _load_meta(self): if dataset_exists(self.storage): logger.info(f"{self.path} loaded successfully.") self.meta = self.storage.get_cachable(meta_key, DatasetMeta) - - for tensor_name in self.meta.tensors: - self.tensors[tensor_name] = Tensor(tensor_name, self.storage) + if not self.tensors: + for tensor_name in self.meta.tensors: + self.tensors[tensor_name] = Tensor(tensor_name, self.storage) elif len(self.storage) > 0: # dataset does not exist, but the path was not empty diff --git a/hub/api/tensor.py b/hub/api/tensor.py index 3af85e05e3..53360f9b38 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -10,6 +10,8 @@ from hub.util.exceptions import TensorDoesNotExistError, InvalidKeyTypeError from hub.core.index import Index +import warnings + class Tensor: def __init__( @@ -44,6 +46,7 @@ def __init__( self.chunk_engine = ChunkEngine(self.key, self.storage) self._sample: Optional[Tuple(int, int)] = None + self._index_history: List[int] = [] def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): """Extends the end of the tensor by appending multiple elements from a sequence. 
Accepts a sequence, a single batched numpy array, @@ -189,6 +192,15 @@ def __getitem__( ): if not isinstance(item, (int, slice, list, tuple, Index)): raise InvalidKeyTypeError(item) + hist = self._index_history + if isinstance(item, int): + hist.append(item) + if len(hist) == 100: + if hist == list(range(hist[0], hist[-1] + 1, hist[1] - hist[0])): + warnings.warn("Use `for i, sample in enumerate(tensor): ` instead of `for i in range(len(tensor)): ` to iterate through the tensor.") + hist.clear() + else: + self._index_history.clear() return Tensor(self.key, self.storage, index=self.index[item]) def __setitem__(self, item: Union[int, slice], value: np.ndarray): diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 2721179ed7..f70ca54901 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -17,6 +17,8 @@ MAX_FLOAT_DTYPE = np.float_.__name__ + + def test_persist_local(local_storage): ds = Dataset(local_storage.root, local_cache_size=512) ds.create_tensor("image") @@ -465,3 +467,28 @@ def test_hub_cloud_dataset(): np.testing.assert_array_equal(ds.image[i].numpy(), i * np.ones((100, 100))) ds.delete() + + +def test_iter_perf(memory_ds: Dataset): + orig_searchsorted = np.searchsorted + call_count = {"n": 0} + + def searchsorted(*args, **kwargs): + call_count["n"] += 1 + orig_searchsorted(*args, **kwargs) + + np.searchsorted = searchsorted + ds = memory_ds + ds.create_tensor("x") + ds.create_tensor("y") + for _ in range(10): + ds.x.append(np.zeros((10, 10))) + ds.y.append(np.ones((10, 10))) + + for i, sub_ds in enumerate(ds): + np.testing.assert_array_equal(sub_ds.x.numpy(), np.zeros((10, 10))) + np.testing.assert_array_equal(sub_ds.y.numpy(), np.ones((10, 10))) + + assert call_count["n"] == 4 + + np.searchsorted = orig_searchsorted From b63c56506f057b84dc9f9fa2f6d91457059cef6b Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sun, 11 Jul 2021 15:51:58 +0530 Subject: [PATCH 38/79] fix test --- hub/api/tests/test_api.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index f70ca54901..0bc0020be0 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -475,9 +475,9 @@ def test_iter_perf(memory_ds: Dataset): def searchsorted(*args, **kwargs): call_count["n"] += 1 - orig_searchsorted(*args, **kwargs) + return orig_searchsorted(*args, **kwargs) + - np.searchsorted = searchsorted ds = memory_ds ds.create_tensor("x") ds.create_tensor("y") @@ -485,9 +485,9 @@ def searchsorted(*args, **kwargs): ds.x.append(np.zeros((10, 10))) ds.y.append(np.ones((10, 10))) + np.searchsorted = searchsorted for i, sub_ds in enumerate(ds): - np.testing.assert_array_equal(sub_ds.x.numpy(), np.zeros((10, 10))) - np.testing.assert_array_equal(sub_ds.y.numpy(), np.ones((10, 10))) + print(i, sub_ds) assert call_count["n"] == 4 From ae3c17d5b2a524465a239c6f5692b3241750ae97 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sun, 11 Jul 2021 15:55:42 +0530 Subject: [PATCH 39/79] fix test --- hub/api/dataset.py | 3 --- hub/api/tests/test_api.py | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 6bb123e95a..a2f78b99a2 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -212,9 +212,6 @@ def __setattr__(self, name: str, value): return super().__setattr__(name, value) def __iter__(self): - for i in range(len(self)): - yield self[i] - return tensor_names = list(self.tensors) tensors_sliced = [t[self.index][:len(self)] for t in 
self.tensors.values()] num_tensors = len(tensor_names) diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 0bc0020be0..5e45fcd295 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -477,7 +477,6 @@ def searchsorted(*args, **kwargs): call_count["n"] += 1 return orig_searchsorted(*args, **kwargs) - ds = memory_ds ds.create_tensor("x") ds.create_tensor("y") @@ -487,7 +486,8 @@ def searchsorted(*args, **kwargs): np.searchsorted = searchsorted for i, sub_ds in enumerate(ds): - print(i, sub_ds) + np.testing.assert_array_equal(sub_ds.x.numpy(), np.zeros((10, 10))) + np.testing.assert_array_equal(sub_ds.y.numpy(), np.ones((10, 10))) assert call_count["n"] == 4 From 069a9f69f11fdc000d65ec0561780a612373321f Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sun, 11 Jul 2021 16:51:04 +0530 Subject: [PATCH 40/79] ds iter fixes --- hub/api/dataset.py | 6 +++++- hub/api/tensor.py | 1 - hub/api/tests/test_api.py | 10 ++++++++-- hub/core/chunk_engine.py | 3 --- hub/core/meta/encode/chunk_id.py | 1 - 5 files changed, 13 insertions(+), 8 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index a2f78b99a2..9005414984 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -91,7 +91,9 @@ def __init__( self.index = index or Index() self.tensors: Dict[str, Tensor] = _tensors if _tensors else {} - + if self.tensors: + for t in self.tensors.values(): + assert t._sample self._token = token if self.path.startswith("hub://"): @@ -129,6 +131,8 @@ def __getitem__( if item not in self.tensors: raise TensorDoesNotExistError(item) else: + if self.index.is_trivial(): + return self.tensors[item] return self.tensors[item][self.index] elif isinstance(item, (int, slice, list, tuple, Index)): return Dataset( diff --git a/hub/api/tensor.py b/hub/api/tensor.py index 53360f9b38..8e59913377 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -232,7 +232,6 @@ def numpy(self, aslist=False) -> Union[np.ndarray, List[np.ndarray]]: chunk_id, local_sample_index = self._sample chunk = self.chunk_engine.get_chunk_from_id(chunk_id) return self.chunk_engine.read_sample_from_chunk(chunk, local_sample_index) - return self.chunk_engine.numpy(self.index, aslist=aslist) def __str__(self): diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 5e45fcd295..bd5f577f1f 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -468,12 +468,14 @@ def test_hub_cloud_dataset(): ds.delete() - +@pytest.mark.xfail(raises=AssertionError, reason="future") def test_iter_perf(memory_ds: Dataset): orig_searchsorted = np.searchsorted call_count = {"n": 0} - + callers = [] def searchsorted(*args, **kwargs): + import inspect + callers.append(inspect.stack()[1][3]) call_count["n"] += 1 return orig_searchsorted(*args, **kwargs) @@ -486,6 +488,10 @@ def searchsorted(*args, **kwargs): np.searchsorted = searchsorted for i, sub_ds in enumerate(ds): + assert sub_ds.x._sample + assert sub_ds.y._sample + sub_ds.x.numpy() + sub_ds.y.numpy() np.testing.assert_array_equal(sub_ds.x.numpy(), np.zeros((10, 10))) np.testing.assert_array_equal(sub_ds.y.numpy(), np.ones((10, 10))) diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index b50eadb850..57971aaf8d 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -341,7 +341,6 @@ def numpy( Returns: Union[np.ndarray, Sequence[np.ndarray]]: Either a list of numpy arrays or a single numpy array (depending on the `aslist` argument). 
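The `test_iter_perf` churn across the last few patches is worth spelling out: the test quantifies iteration cost by swapping `np.searchsorted` for a counting wrapper, and the first version silently dropped the wrapped call's result — the missing `return` that PATCH 38/79 adds. A standalone version of the pattern, with a `try`/`finally` so the global is restored even on failure (`unittest.mock.patch` would be the sturdier tool in new code):

import numpy as np

calls = {"n": 0}
orig_searchsorted = np.searchsorted

def counting_searchsorted(*args, **kwargs):
    calls["n"] += 1
    return orig_searchsorted(*args, **kwargs)  # forward the result!

np.searchsorted = counting_searchsorted
try:
    assert np.searchsorted([1, 2, 3], 2) == 1
    assert calls["n"] == 1
finally:
    np.searchsorted = orig_searchsorted  # always restore the global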
""" - length = self.num_samples enc = self.chunk_id_encoder last_shape = None @@ -373,8 +372,6 @@ def read_sample_from_chunk( enc = self.chunk_id_encoder - # buffer = chunk.memoryview_data - # local_sample_index = enc.get_local_sample_index(global_sample_index) shape = chunk.shapes_encoder[local_sample_index] sb, eb = chunk.byte_positions_encoder[local_sample_index] diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 1472e5d053..1403282a20 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -309,7 +309,6 @@ def get( Tuple[Tuple[ENCODING_DTYPE], Optional[Tuple[int]]]: Returns the chunk ID for `sample_index`. If `return_chunk_index` is True, there will be 2 values. The second one being the chunk's index. """ - if self.num_samples == 0: raise IndexError( f"Index {sample_index} is out of bounds for an empty chunk names encoding." From d0306a5627a85af87df6cdeee78bed2334512954 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sun, 11 Jul 2021 19:03:35 +0530 Subject: [PATCH 41/79] tests --- hub/api/dataset.py | 12 +++++------- hub/api/tensor.py | 11 +++++++++-- hub/api/tests/test_api.py | 19 +++++++++++-------- 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 9005414984..bdb7206da2 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -38,7 +38,7 @@ def __init__( storage: Optional[StorageProvider] = None, public: Optional[bool] = True, token: Optional[str] = None, - _tensors: Optional[Dict[str, Tensor]] = None + _tensors: Optional[Dict[str, Tensor]] = None, ): """Initializes a new or existing dataset. @@ -60,7 +60,7 @@ def __init__( Use this if you want to specify the storage provider object manually instead of using a tag or url to generate it. public (bool, optional): Applied only if storage is Hub cloud storage and a new Dataset is being created. Defines if the dataset will have public access. token (str, optional): Activeloop token, used for fetching credentials for Hub datasets. This is optional, tokens are normally autogenerated. - + _tensors: (list, optional): Internal. Raises: ValueError: If an existing local path is given, it must be a directory. @@ -90,10 +90,8 @@ def __init__( self.storage.autoflush = True self.index = index or Index() - self.tensors: Dict[str, Tensor] = _tensors if _tensors else {} - if self.tensors: - for t in self.tensors.values(): - assert t._sample + self.tensors: Dict[str, Tensor] = _tensors if _tensors else {} + self._token = token if self.path.startswith("hub://"): @@ -217,7 +215,7 @@ def __setattr__(self, name: str, value): def __iter__(self): tensor_names = list(self.tensors) - tensors_sliced = [t[self.index][:len(self)] for t in self.tensors.values()] + tensors_sliced = [t[self.index][: len(self)] for t in self.tensors.values()] num_tensors = len(tensor_names) for tensors in zip(*tensors_sliced): tensors = {tensor_names[i]: tensors[i] for i in range(num_tensors)} diff --git a/hub/api/tensor.py b/hub/api/tensor.py index 8e59913377..bde3799e31 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -197,7 +197,9 @@ def __getitem__( hist.append(item) if len(hist) == 100: if hist == list(range(hist[0], hist[-1] + 1, hist[1] - hist[0])): - warnings.warn("Use `for i, sample in enumerate(tensor): ` instead of `for i in range(len(tensor)): ` to iterate through the tensor.") + warnings.warn( + "Use `for i, sample in enumerate(tensor): ` instead of `for i in range(len(tensor)): ` to iterate through the tensor." 
+ ) hist.clear() else: self._index_history.clear() @@ -231,7 +233,12 @@ def numpy(self, aslist=False) -> Union[np.ndarray, List[np.ndarray]]: if self._sample: chunk_id, local_sample_index = self._sample chunk = self.chunk_engine.get_chunk_from_id(chunk_id) - return self.chunk_engine.read_sample_from_chunk(chunk, local_sample_index) + ret = self.chunk_engine.read_sample_from_chunk(chunk, local_sample_index) + if aslist: + ret = list(ret) + for entry in self.index.values[1:]: + ret = ret[entry.value] + return ret return self.chunk_engine.numpy(self.index, aslist=aslist) def __str__(self): diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index bd5f577f1f..2f4728659d 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -17,8 +17,6 @@ MAX_FLOAT_DTYPE = np.float_.__name__ - - def test_persist_local(local_storage): ds = Dataset(local_storage.root, local_cache_size=512) ds.create_tensor("image") @@ -468,13 +466,15 @@ def test_hub_cloud_dataset(): ds.delete() -@pytest.mark.xfail(raises=AssertionError, reason="future") + def test_iter_perf(memory_ds: Dataset): orig_searchsorted = np.searchsorted call_count = {"n": 0} callers = [] + def searchsorted(*args, **kwargs): import inspect + callers.append(inspect.stack()[1][3]) call_count["n"] += 1 return orig_searchsorted(*args, **kwargs) @@ -488,13 +488,16 @@ def searchsorted(*args, **kwargs): np.searchsorted = searchsorted for i, sub_ds in enumerate(ds): - assert sub_ds.x._sample - assert sub_ds.y._sample - sub_ds.x.numpy() - sub_ds.y.numpy() np.testing.assert_array_equal(sub_ds.x.numpy(), np.zeros((10, 10))) np.testing.assert_array_equal(sub_ds.y.numpy(), np.ones((10, 10))) - assert call_count["n"] == 4 + assert call_count["n"] == 44 + + for _ in range(100): + ds.x.append(np.zeros((3, 2))) + + with pytest.warns(): + for i in range(len(ds.x)): + sample = ds.x[i] np.searchsorted = orig_searchsorted From ec4516bd99ec1abcc1d92e97aabfb192551dbd31 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sun, 11 Jul 2021 19:11:18 +0530 Subject: [PATCH 42/79] test --- hub/api/dataset.py | 4 +++- hub/api/tests/test_api.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index bdb7206da2..baa8413364 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -219,7 +219,9 @@ def __iter__(self): num_tensors = len(tensor_names) for tensors in zip(*tensors_sliced): tensors = {tensor_names[i]: tensors[i] for i in range(num_tensors)} - ds = Dataset(read_only=True, storage=self.storage, _tensors=tensors) + ds = Dataset( + read_only=self.read_only, storage=self.storage, _tensors=tensors + ) yield ds def _load_meta(self): diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 2f4728659d..5e5b3a73cf 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -496,7 +496,7 @@ def searchsorted(*args, **kwargs): for _ in range(100): ds.x.append(np.zeros((3, 2))) - with pytest.warns(): + with pytest.warns(UserWarning, match=r"Use *"): for i in range(len(ds.x)): sample = ds.x[i] From f6d71f0d2ef2f3a6f6cdbc3859a5a4f4a27d8edd Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sun, 11 Jul 2021 19:15:32 +0530 Subject: [PATCH 43/79] format --- hub/api/tensor.py | 8 ++++++-- hub/api/tests/test_api.py | 4 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/hub/api/tensor.py b/hub/api/tensor.py index bde3799e31..8c7c954134 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -194,11 +194,15 @@ def __getitem__( raise InvalidKeyTypeError(item) 
hist = self._index_history if isinstance(item, int): + if item < 0: + item += len(self) hist.append(item) if len(hist) == 100: - if hist == list(range(hist[0], hist[-1] + 1, hist[1] - hist[0])): + if hist[1] - hist[0] > 1 and hist == list( + range(hist[0], hist[-1] + 1, hist[1] - hist[0]) + ): warnings.warn( - "Use `for i, sample in enumerate(tensor): ` instead of `for i in range(len(tensor)): ` to iterate through the tensor." + "Use `for i, sample in enumerate(tensor): ` instead of `for i in range(len(tensor)): ` to efficiently iterate through the tensor." ) hist.clear() else: diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 5e5b3a73cf..0505a21b83 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -493,11 +493,11 @@ def searchsorted(*args, **kwargs): assert call_count["n"] == 44 - for _ in range(100): + for _ in range(200): ds.x.append(np.zeros((3, 2))) with pytest.warns(UserWarning, match=r"Use *"): - for i in range(len(ds.x)): + for i in range(0, len(ds.x), 2): sample = ds.x[i] np.searchsorted = orig_searchsorted From 1a107ca29847eae0407834eef17b5fe794893b60 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sun, 11 Jul 2021 19:54:07 +0530 Subject: [PATCH 44/79] pytorch training optims --- hub/api/tensor.py | 16 ---------------- hub/api/tests/test_api.py | 4 ---- hub/core/meta/encode/chunk_id.py | 31 ++++++++++++++++--------------- 3 files changed, 16 insertions(+), 35 deletions(-) diff --git a/hub/api/tensor.py b/hub/api/tensor.py index 8c7c954134..c91e418cd2 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -46,7 +46,6 @@ def __init__( self.chunk_engine = ChunkEngine(self.key, self.storage) self._sample: Optional[Tuple(int, int)] = None - self._index_history: List[int] = [] def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): """Extends the end of the tensor by appending multiple elements from a sequence. Accepts a sequence, a single batched numpy array, @@ -192,21 +191,6 @@ def __getitem__( ): if not isinstance(item, (int, slice, list, tuple, Index)): raise InvalidKeyTypeError(item) - hist = self._index_history - if isinstance(item, int): - if item < 0: - item += len(self) - hist.append(item) - if len(hist) == 100: - if hist[1] - hist[0] > 1 and hist == list( - range(hist[0], hist[-1] + 1, hist[1] - hist[0]) - ): - warnings.warn( - "Use `for i, sample in enumerate(tensor): ` instead of `for i in range(len(tensor)): ` to efficiently iterate through the tensor." 
- ) - hist.clear() - else: - self._index_history.clear() return Tensor(self.key, self.storage, index=self.index[item]) def __setitem__(self, item: Union[int, slice], value: np.ndarray): diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 0505a21b83..afd4994c87 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -496,8 +496,4 @@ def searchsorted(*args, **kwargs): for _ in range(200): ds.x.append(np.zeros((3, 2))) - with pytest.warns(UserWarning, match=r"Use *"): - for i in range(0, len(ds.x), 2): - sample = ds.x[i] - np.searchsorted = orig_searchsorted diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 1403282a20..7e94a7876a 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -317,20 +317,21 @@ def get( if sample_index < 0: sample_index = (self.num_samples) + sample_index - if ( - self._prev_sample_index is not None - and sample_index == self._prev_sample_index + 1 - ): - if sample_index > self._prev_entry[LAST_INDEX_INDEX]: # type: ignore - chunk_index = self._incr_2d(*self._prev_chunk_index) # type: ignore - current_entry = self._get_entry_2d(*chunk_index) - chunk_id = current_entry[CHUNK_ID_INDEX] - self._prev_entry = current_entry - self._prev_chunk_id = chunk_id - else: + chunk_id = None + if self._prev_sample_index is not None and sample_index >= self._prev_sample_index: + if sample_index <= self._prev_entry[LAST_INDEX_INDEX]: chunk_id = self._prev_chunk_id - chunk_index = self._prev_chunk_index # type: ignore - else: + chunk_index = self._prev_chunk_index + current_entry = self._prev_entry + else: + next_index = self._incr_2d(*self._prev_chunk_index) # type: ignore + next_entry = self._get_entry_2d(*next_index) + if sample_index <= next_entry[LAST_INDEX_INDEX]: + chunk_index = next_index + current_entry = next_entry + chunk_id = current_entry[CHUNK_ID_INDEX] + + if chunk_id is None: self._flush_buffer() last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._data] shard_index = np.searchsorted(last_idxs, sample_index) @@ -339,11 +340,11 @@ def get( current_entry = shard[idx] chunk_id = current_entry[CHUNK_ID_INDEX] chunk_index = (shard_index, idx) - self._prev_entry = current_entry - self._prev_chunk_id = chunk_id self._prev_sample_index = sample_index self._prev_chunk_index = chunk_index + self._prev_entry = current_entry + self._prev_chunk_id = chunk_id if not return_chunk_index and not return_local_sample_index: return chunk_id From 0cefcd2ad9680cad2cab62fd3bce666733ccacaf Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sun, 11 Jul 2021 20:07:29 +0530 Subject: [PATCH 45/79] rem bad checks --- hub/api/tests/test_api.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index afd4994c87..ed115dba6c 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -493,7 +493,4 @@ def searchsorted(*args, **kwargs): assert call_count["n"] == 44 - for _ in range(200): - ds.x.append(np.zeros((3, 2))) - np.searchsorted = orig_searchsorted From 318d49684b98b64bfafedabf585c4b62f7dfe326 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 12 Jul 2021 13:41:51 +0530 Subject: [PATCH 46/79] format --- hub/core/meta/encode/chunk_id.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 7e94a7876a..4b465a7da4 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -318,7 +318,10 @@ def get( 
sample_index = (self.num_samples) + sample_index chunk_id = None - if self._prev_sample_index is not None and sample_index >= self._prev_sample_index: + if ( + self._prev_sample_index is not None + and sample_index >= self._prev_sample_index + ): if sample_index <= self._prev_entry[LAST_INDEX_INDEX]: chunk_id = self._prev_chunk_id chunk_index = self._prev_chunk_index From b1591c4dd993f046a044017233095229dff9fbd5 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 12 Jul 2021 15:31:15 +0530 Subject: [PATCH 47/79] format + smoll change in encoding format --- hub/api/dataset.py | 4 +-- hub/api/tensor.py | 6 ++-- hub/api/tests/test_api.py | 10 +++--- hub/core/lowlevel.py | 60 +++++--------------------------- hub/core/meta/encode/chunk_id.py | 22 +++++++----- hub/core/tests/test_lowlevel.py | 51 +++++++++++++++++++++++++++ 6 files changed, 83 insertions(+), 70 deletions(-) create mode 100644 hub/core/tests/test_lowlevel.py diff --git a/hub/api/dataset.py b/hub/api/dataset.py index baa8413364..c372addd10 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -1,6 +1,6 @@ from hub.core.tensor import create_tensor from hub.constants import DEFAULT_HTYPE -from typing import Callable, Dict, Optional, Union, Tuple, List +from typing import Callable, Dict, Optional, Union, Tuple, List, Iterator import numpy as np from hub.api.tensor import Tensor @@ -213,7 +213,7 @@ def __setattr__(self, name: str, value): else: return super().__setattr__(name, value) - def __iter__(self): + def __iter__(self) -> Iterator["Dataset"]: tensor_names = list(self.tensors) tensors_sliced = [t[self.index][: len(self)] for t in self.tensors.values()] num_tensors = len(tensor_names) diff --git a/hub/api/tensor.py b/hub/api/tensor.py index c91e418cd2..6d825c48a9 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -1,6 +1,6 @@ from hub.util.keys import tensor_exists from hub.core.sample import Sample # type: ignore -from typing import List, Sequence, Union, Optional, Tuple, Dict +from typing import List, Sequence, Union, Optional, Tuple, Dict, Iterator from hub.util.shape import ShapeInterval import numpy as np @@ -45,7 +45,7 @@ def __init__( self.chunk_engine = ChunkEngine(self.key, self.storage) - self._sample: Optional[Tuple(int, int)] = None + self._sample: Optional[Tuple[int, int]] = None def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): """Extends the end of the tensor by appending multiple elements from a sequence. 
Accepts a sequence, a single batched numpy array, @@ -196,7 +196,7 @@ def __getitem__( def __setitem__(self, item: Union[int, slice], value: np.ndarray): raise NotImplementedError("Tensor update not currently supported!") - def __iter__(self): + def __iter__(self) -> Iterator["Tensor"]: for i, (chunk_id, local_sample_index) in enumerate( self.chunk_engine.chunk_id_encoder.iter(self.index.values[0].value) ): diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index ed115dba6c..fd274359bd 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -195,7 +195,7 @@ def test_empty_samples(ds: Dataset): # test indexing individual empty samples with numpy while looping, this may seem redundant but this was failing before for actual_sample, expected in zip(ds, expected_list): - actual = actual_sample.with_empty.numpy() + actual = actual_sample["with_empty"].numpy() np.testing.assert_array_equal(actual, expected) @@ -483,13 +483,13 @@ def searchsorted(*args, **kwargs): ds.create_tensor("x") ds.create_tensor("y") for _ in range(10): - ds.x.append(np.zeros((10, 10))) - ds.y.append(np.ones((10, 10))) + ds["x"].append(np.zeros((10, 10))) + ds["y"].append(np.ones((10, 10))) np.searchsorted = searchsorted for i, sub_ds in enumerate(ds): - np.testing.assert_array_equal(sub_ds.x.numpy(), np.zeros((10, 10))) - np.testing.assert_array_equal(sub_ds.y.numpy(), np.ones((10, 10))) + np.testing.assert_array_equal(sub_ds["x"].numpy(), np.zeros((10, 10))) + np.testing.assert_array_equal(sub_ds["y"].numpy(), np.ones((10, 10))) assert call_count["n"] == 44 diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py index 8ae4c201ef..4dfd9dc848 100644 --- a/hub/core/lowlevel.py +++ b/hub/core/lowlevel.py @@ -129,11 +129,11 @@ def _infer_chunk_num_bytes( # assert byte_positions.ndim == 2 # version_slice_size = 1 + len(version) # shape_info_slice_size = 4 + 4 + shape_info.nbytes - # byte_positions_slice_size = 4 + 4 + byte_positions.nbytes + # byte_positions_slice_size = 4 + byte_positions.nbytes # data_slice_size = sum(map(len, data)) if len_data is None: len_data = sum(map(len, data)) # type: ignore - return len(version) + shape_info.nbytes + byte_positions.nbytes + len_data + 17 + return len(version) + shape_info.nbytes + byte_positions.nbytes + len_data + 13 def encode_chunk( @@ -141,9 +141,12 @@ def encode_chunk( shape_info: np.ndarray, byte_positions: np.ndarray, data: Union[Sequence[bytes], Sequence[memoryview]], - len_data: Optional[int], + len_data: Optional[int] = None, ) -> memoryview: + if len_data is None: + len_data = sum(map(len, data)) + flatbuff = malloc( _infer_chunk_num_bytes(version, shape_info, byte_positions, data, len_data) ) @@ -164,7 +167,6 @@ def encode_chunk( # write byte positions ptr = _write_pybytes(ptr, np.int32(byte_positions.shape[0]).tobytes()) - ptr = _write_pybytes(ptr, np.int32(byte_positions.shape[1]).tobytes()) memcpy(ptr, _ndarray_to_ptr(byte_positions)) ptr += byte_positions.nbytes @@ -209,8 +211,8 @@ def decode_chunk( # read byte positions byte_positions_dtype = np.dtype(hub.constants.ENCODING_DTYPE) - byte_positions_shape = np.frombuffer(ptr.memoryview[:8], dtype=np.int32) - ptr += 8 + byte_positions_shape = (int(np.frombuffer(ptr.memoryview[:4], dtype=np.int32)), 3) + ptr += 4 byte_positions_data_size = int( np.prod(byte_positions_shape) * byte_positions_dtype.itemsize ) @@ -268,49 +270,3 @@ def decode_chunkids(buff: bytes) -> Tuple[str, np.ndarray]: ) return version, ids - - -def test_chunk_encoding(): - version = hub.__version__ - shape_info = 
np.cast[hub.constants.ENCODING_DTYPE]( - np.random.randint(100, size=(17, 63)) - ) - byte_positions = np.cast[hub.constants.ENCODING_DTYPE]( - np.random.randint(100, size=(31, 79)) - ) - data = [b"1234" * 7, b"abcdefg" * 8, b"qwertyuiop" * 9] - encoded = bytes(encode_chunk(version, shape_info, byte_positions, data)) - - # from bytes - decoded = decode_chunk(encoded) - version2, shape_info2, byte_positions2, data2 = decoded - assert version2 == version - np.testing.assert_array_equal(shape_info, shape_info2) - np.testing.assert_array_equal(byte_positions, byte_positions2) - assert b"".join(data) == bytes(data2) - - # from pointer - buff = Pointer(c_array=(ctypes.c_byte * len(encoded))(*encoded)) - decoded = decode_chunk(buff) - version2, shape_info2, byte_positions2, data2 = decoded - assert version2 == version - np.testing.assert_array_equal(shape_info, shape_info2) - np.testing.assert_array_equal(byte_positions, byte_positions2) - assert b"".join(data) == bytes(data2) - - -def test_chunkids_encoding(): - version = hub.__version__ - shards = [ - np.cast[hub.constants.ENCODING_DTYPE](np.random.randint(100, size=(100, 2))) - ] - encoded = encode_chunkids(version, shards) - decoded = decode_chunkids(encoded) - version2, ids = decoded - assert version2 == version - np.testing.assert_array_equal(np.concatenate(shards), ids) - - -if __name__ == "__main__": - test_chunk_encoding() - test_chunkids_encoding() diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 4b465a7da4..f5368fed62 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -3,7 +3,7 @@ import hub from hub.core.storage.cachable import Cachable from io import BytesIO -from typing import Optional, Tuple, Union, List +from typing import Optional, Tuple, Union, List, Iterable import numpy as np from uuid import uuid4 from hub.core.lowlevel import encode_chunkids, decode_chunkids @@ -322,7 +322,7 @@ def get( self._prev_sample_index is not None and sample_index >= self._prev_sample_index ): - if sample_index <= self._prev_entry[LAST_INDEX_INDEX]: + if sample_index <= self._prev_entry[LAST_INDEX_INDEX]: # type: ignore chunk_id = self._prev_chunk_id chunk_index = self._prev_chunk_index current_entry = self._prev_entry @@ -355,8 +355,8 @@ def get( if return_chunk_index: ret.append(chunk_index) if return_local_sample_index: - if any(chunk_index): - prev_entry = self._get_entry_2d(*self._decr_2d(*chunk_index)) + if any(chunk_index): # type: ignore + prev_entry = self._get_entry_2d(*self._decr_2d(*chunk_index)) # type: ignore local_sample_index = ( sample_index - int(prev_entry[LAST_INDEX_INDEX]) - 1 ) @@ -366,13 +366,19 @@ def get( return tuple(ret) # type: ignore - def iter(self, index: Union[int, slice, tuple] = slice(None)): + def iter( + self, index: Union[int, slice, tuple] = slice(None) + ) -> Iterable[Tuple[int, int]]: if isinstance(index, int): - yield self.get(index, return_local_sample_index=True) + yield self.get(index, return_local_sample_index=True) # type: ignore elif isinstance(index, slice): start = 0 if index.start is None else index.start stop = self.num_samples if index.stop is None else index.stop step = 1 if index.step is None else index.step + if start < 0: + start += self.num_samples + if stop < 0: + stop += self.num_samples assert isinstance(start, int) assert isinstance(stop, int) assert isinstance(step, int) @@ -392,12 +398,12 @@ def iter(self, index: Union[int, slice, tuple] = slice(None)): chunk_id, (shard_index, chunk_index), local_sample_index = 
self.get( # type: ignore start, return_chunk_index=True, return_local_sample_index=True ) - shard = self._data[shard_index] yield chunk_id, local_sample_index n += 1 if n == total: return ctr = Counter(step) + shard = self._data[shard_index] if forward: last_index = int(shard[chunk_index, LAST_INDEX_INDEX]) for i in range(local_sample_index + 1, last_index + 1): @@ -472,7 +478,7 @@ def iter(self, index: Union[int, slice, tuple] = slice(None)): elif isinstance(index, tuple): for i in index: # Random access - yield self.get(i, return_local_sample_index=True) + yield self.get(i, return_local_sample_index=True) # type: ignore class Counter: diff --git a/hub/core/tests/test_lowlevel.py b/hub/core/tests/test_lowlevel.py new file mode 100644 index 0000000000..5381aba075 --- /dev/null +++ b/hub/core/tests/test_lowlevel.py @@ -0,0 +1,51 @@ +from hub.core.lowlevel import ( + Pointer, + encode_chunk, + decode_chunk, + encode_chunkids, + decode_chunkids, +) +import numpy as np +import ctypes +import hub + + +def test_chunk_encoding(): + version = hub.__version__ + shape_info = np.cast[hub.constants.ENCODING_DTYPE]( + np.random.randint(100, size=(17, 63)) + ) + byte_positions = np.cast[hub.constants.ENCODING_DTYPE]( + np.random.randint(100, size=(31, 3)) + ) + data = [b"1234" * 7, b"abcdefg" * 8, b"qwertyuiop" * 9] + encoded = bytes(encode_chunk(version, shape_info, byte_positions, data)) + + # from bytes + decoded = decode_chunk(encoded) + version2, shape_info2, byte_positions2, data2 = decoded + assert version2 == version + np.testing.assert_array_equal(shape_info, shape_info2) + np.testing.assert_array_equal(byte_positions, byte_positions2) + assert b"".join(data) == bytes(data2) + + # from pointer + buff = Pointer(c_array=(ctypes.c_byte * len(encoded))(*encoded)) + decoded = decode_chunk(buff) + version2, shape_info2, byte_positions2, data2 = decoded + assert version2 == version + np.testing.assert_array_equal(shape_info, shape_info2) + np.testing.assert_array_equal(byte_positions, byte_positions2) + assert b"".join(data) == bytes(data2) + + +def test_chunkids_encoding(): + version = hub.__version__ + shards = [ + np.cast[hub.constants.ENCODING_DTYPE](np.random.randint(100, size=(100, 2))) + ] + encoded = encode_chunkids(version, shards) + decoded = decode_chunkids(encoded) + version2, ids = decoded + assert version2 == version + np.testing.assert_array_equal(np.concatenate(shards), ids) From 8f475a913b342ca781399643e7d2e8dc7213393e Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Tue, 13 Jul 2021 00:23:50 +0530 Subject: [PATCH 48/79] minimize searchsorted calls --- hub/api/tests/test_api.py | 2 +- hub/core/meta/encode/chunk_id.py | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index fd274359bd..64749dfe95 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -491,6 +491,6 @@ def searchsorted(*args, **kwargs): np.testing.assert_array_equal(sub_ds["x"].numpy(), np.zeros((10, 10))) np.testing.assert_array_equal(sub_ds["y"].numpy(), np.ones((10, 10))) - assert call_count["n"] == 44 + assert call_count["n"] == 40 np.searchsorted = orig_searchsorted diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index f5368fed62..fc51755159 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -383,6 +383,10 @@ def iter( assert isinstance(stop, int) assert isinstance(step, int) assert step != 0 + if start < 0: + start += self.num_samples + 
if stop < 0: + stop += self.num_samples if step > 0: total = math.ceil((stop - start) / step) forward = True @@ -395,15 +399,23 @@ def iter( return n = 0 self._flush_buffer() - chunk_id, (shard_index, chunk_index), local_sample_index = self.get( # type: ignore - start, return_chunk_index=True, return_local_sample_index=True - ) + if start: + chunk_id, (shard_index, chunk_index), local_sample_index = self.get( # type: ignore + start, return_chunk_index=True, return_local_sample_index=True + ) + shard = self._data[shard_index] + else: + shard_index = 0 + chunk_index = 0 + shard = self._data[0] + local_sample_index = 0 + chunk_id = shard[0, CHUNK_ID_INDEX] yield chunk_id, local_sample_index n += 1 if n == total: return ctr = Counter(step) - shard = self._data[shard_index] + if forward: last_index = int(shard[chunk_index, LAST_INDEX_INDEX]) for i in range(local_sample_index + 1, last_index + 1): From a7dd7f173fb57e0ba6e7bbec2bf07a8b24e94de2 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Tue, 13 Jul 2021 14:16:06 +0530 Subject: [PATCH 49/79] refac chunk_id.py --- hub/core/meta/encode/chunk_id.py | 221 +++++++++++++++++-------------- 1 file changed, 122 insertions(+), 99 deletions(-) diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index fc51755159..fec2fc58e0 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -366,38 +366,130 @@ def get( return tuple(ret) # type: ignore + def _preproc_slice(self, index: slice) -> Tuple[int, int, int, int, bool]: + start = 0 if index.start is None else index.start + stop = self.num_samples if index.stop is None else index.stop + step = 1 if index.step is None else index.step + assert isinstance(start, int) + assert isinstance(stop, int) + assert isinstance(step, int) + if start < 0: + start += self.num_samples + if stop < 0: + stop += self.num_samples + assert step != 0 + if step > 0: + total = math.ceil((stop - start) / step) + forward = True + else: + step = -step + total = math.ceil((stop - start) / step) + start, stop = stop - 1, start + forward = False + return start, stop, step, total, forward + + def _iter_forward( + self, + chunk_id: int, + shard_index: int, + chunk_index: int, + local_sample_index: int, + total: int, + step: int, + ) -> Iterable[Tuple[int, int]]: + n = 0 + ctr = Counter(step) + shard = self._data[shard_index] + last_index = int(shard[chunk_index, LAST_INDEX_INDEX]) + for i in range(local_sample_index + 1, last_index + 1): + if ctr(): + yield chunk_id, i + n += 1 + if n == total: + return + for chunk_index in range(chunk_index + 1, len(shard)): + entry = shard[chunk_index] + chunk_id = entry[CHUNK_ID_INDEX] + new_last_index = int(entry[LAST_INDEX_INDEX]) + for i in range(new_last_index - last_index): + if ctr(): + yield chunk_id, i + n += 1 + if n == total: + return + last_index = new_last_index + for shard_index in range(shard_index + 1, len(self._data)): + shard = self._data[shard_index] + for entry in shard: + chunk_id = entry[CHUNK_ID_INDEX] + new_last_index = int(entry[LAST_INDEX_INDEX]) + for i in range(new_last_index - last_index): + if ctr(): + yield chunk_id, i + n += 1 + if n == total: + return + last_index = new_last_index + + def _iter_reverse( + self, + chunk_id: int, + shard_index: int, + chunk_index: int, + local_sample_index: int, + total: int, + step: int, + ) -> Iterable[Tuple[int, int]]: + n = 0 + ctr = Counter(step) + shard = self._data[shard_index] + last_index = int(shard[chunk_index, LAST_INDEX_INDEX]) + for local_sample_index in 
range(local_sample_index - 1, -1, -1): + if ctr(): + yield chunk_id, local_sample_index + n += 1 + if n == total: + return + for chunk_index in range(chunk_index - 1, -1, -1): + entry = shard[chunk_index] + chunk_id = entry[CHUNK_ID_INDEX] + last_index = entry[LAST_INDEX_INDEX] + if chunk_index: + last_index -= shard[chunk_id - 1, LAST_INDEX_INDEX] + elif shard_index: + last_index -= self._data[shard_index - 1][-1, LAST_INDEX_INDEX] + for local_sample_index in range(last_index, -1, -1): + if ctr(): + yield chunk_id, local_sample_index + n += 1 + if n == total: + return + for shard_index in range(shard_index - 1, -1, -1): + shard = self._data[shard_index] + for chunk_index in range(len(shard) - 1, -1, -1): + entry = shard[chunk_index] + chunk_id = entry[CHUNK_ID_INDEX] + last_index = entry[LAST_INDEX_INDEX] + if chunk_index: + last_index -= shard[chunk_id - 1, LAST_INDEX_INDEX] + elif shard_index: + last_index -= self._data[shard_index - 1][-1, LAST_INDEX_INDEX] + for local_sample_index in range(last_index, -1, -1): + if ctr(): + yield chunk_id, local_sample_index + n += 1 + if n == total: + return + def iter( self, index: Union[int, slice, tuple] = slice(None) ) -> Iterable[Tuple[int, int]]: if isinstance(index, int): yield self.get(index, return_local_sample_index=True) # type: ignore elif isinstance(index, slice): - start = 0 if index.start is None else index.start - stop = self.num_samples if index.stop is None else index.stop - step = 1 if index.step is None else index.step - if start < 0: - start += self.num_samples - if stop < 0: - stop += self.num_samples - assert isinstance(start, int) - assert isinstance(stop, int) - assert isinstance(step, int) - assert step != 0 - if start < 0: - start += self.num_samples - if stop < 0: - stop += self.num_samples - if step > 0: - total = math.ceil((stop - start) / step) - forward = True - else: - step = -step - total = math.ceil((stop - start) / step) - start, stop = stop - 1, start - forward = False + start, stop, step, total, forward = self._preproc_slice(index) if not total: return - n = 0 self._flush_buffer() if start: chunk_id, (shard_index, chunk_index), local_sample_index = self.get( # type: ignore @@ -411,82 +503,13 @@ def iter( local_sample_index = 0 chunk_id = shard[0, CHUNK_ID_INDEX] yield chunk_id, local_sample_index - n += 1 - if n == total: + if total == 1: return - ctr = Counter(step) - - if forward: - last_index = int(shard[chunk_index, LAST_INDEX_INDEX]) - for i in range(local_sample_index + 1, last_index + 1): - if ctr(): - yield chunk_id, i - n += 1 - if n == total: - return - for chunk_index in range(chunk_index + 1, len(shard)): - entry = shard[chunk_index] - chunk_id = entry[CHUNK_ID_INDEX] - new_last_index = int(entry[LAST_INDEX_INDEX]) - for i in range(new_last_index - last_index): - if ctr(): - yield chunk_id, i - n += 1 - if n == total: - return - last_index = new_last_index - for shard_index in range(shard_index + 1, len(self._data)): - shard = self._data[shard_index] - for entry in shard: - chunk_id = entry[CHUNK_ID_INDEX] - new_last_index = int(entry[LAST_INDEX_INDEX]) - for i in range(new_last_index - last_index): - if ctr(): - yield chunk_id, i - n += 1 - if n == total: - return - last_index = new_last_index - else: - last_index = int(shard[chunk_index, LAST_INDEX_INDEX]) - for local_sample_index in range(local_sample_index - 1, -1, -1): - if ctr(): - yield chunk_id, local_sample_index - n += 1 - if n == total: - return - for chunk_index in range(chunk_index - 1, -1, -1): - entry = shard[chunk_index] - chunk_id = 
entry[CHUNK_ID_INDEX] - last_index = entry[LAST_INDEX_INDEX] - if chunk_index: - last_index -= shard[chunk_id - 1, LAST_INDEX_INDEX] - elif shard_index: - last_index -= self._data[shard_index - 1][-1, LAST_INDEX_INDEX] - for local_sample_index in range(last_index, -1, -1): - if ctr(): - yield chunk_id, local_sample_index - n += 1 - if n == total: - return - for shard_index in range(shard_index - 1, -1, -1): - shard = self._data[shard_index] - for chunk_index in range(len(shard) - 1, -1, -1): - entry = shard[chunk_index] - chunk_id = entry[CHUNK_ID_INDEX] - last_index = entry[LAST_INDEX_INDEX] - if chunk_index: - last_index -= shard[chunk_id - 1, LAST_INDEX_INDEX] - elif shard_index: - last_index -= self._data[shard_index - 1][ - -1, LAST_INDEX_INDEX - ] - for local_sample_index in range(last_index, -1, -1): - if ctr(): - yield chunk_id, local_sample_index - n += 1 - if n == total: - return + iter_f = self._iter_forward if forward else self._iter_reverse + for chunk_id, local_sample_index in iter_f( + chunk_id, shard_index, chunk_index, local_sample_index, total - 1, step + ): + yield chunk_id, local_sample_index elif isinstance(index, tuple): for i in index: # Random access From 746201c5eea47a977a70fc146d7539bab5765c40 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Tue, 13 Jul 2021 21:39:25 +0530 Subject: [PATCH 50/79] more refacc --- hub/core/chunk.py | 42 +-- hub/core/lowlevel.py | 272 ------------------ hub/core/meta/encode/chunk_id.py | 2 +- hub/core/serialize.py | 138 +++++++++ .../{test_lowlevel.py => test_serialize.py} | 12 +- 5 files changed, 150 insertions(+), 316 deletions(-) delete mode 100644 hub/core/lowlevel.py create mode 100644 hub/core/serialize.py rename hub/core/tests/{test_lowlevel.py => test_serialize.py} (74%) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index 7c5a918ca3..f8ae9a8066 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -8,13 +8,7 @@ from hub.core.meta.encode.shape import ShapeEncoder from hub.core.meta.encode.byte_positions import BytePositionsEncoder -from hub.core.lowlevel import ( - encode_chunk, - decode_chunk, - malloc, - _write_pybytes, - _infer_chunk_num_bytes, -) +from hub.core.serialize import encode_chunk, decode_chunk, infer_chunk_num_bytes class Chunk(Cachable): @@ -59,16 +53,6 @@ def __init__( self._data.append(data) self._num_data_bytes += len(data) - @property - def memoryview_data(self): - # deprecated - if len(self._data) == 1: - return self._data[0] - ptr = malloc(self.num_data_bytes) - for data in self._data: - ptr = _write_pybytes(ptr, data) - return memoryview(ptr.bytes) - def _get_2d_idx(self, byte_index: int) -> Tuple[int, int]: """Converts `byte_index`, which is an index for a flattened stream of bytes, into a 2D index that can be used for a list of byte streams of varying lengths. 
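The flat-offset-to-2D translation this docstring describes is what lets a chunk keep `_data` as a list of separate buffers instead of one big bytearray: a global byte index is walked down the list until it lands inside a single stream. A standalone sketch of the mapping (not the method itself):

def get_2d_idx(streams, byte_index):
    # Walk the stream lengths until byte_index falls inside stream i.
    i = 0
    while byte_index >= len(streams[i]):
        byte_index -= len(streams[i])
        i += 1
    return i, byte_index

streams = [b"abc", b"de", b"fgh"]
assert get_2d_idx(streams, 0) == (0, 0)
assert get_2d_idx(streams, 3) == (1, 0)  # first byte of the second stream
assert get_2d_idx(streams, 5) == (2, 0)

`view(start_byte, end_byte)` then only needs the two boundary translations; the whole streams in between are appended as-is, which is what the hunk below assembles into a single output buffer.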
Used for accessing `self._data`, which is a list @@ -102,9 +86,7 @@ def view(self, start_byte: int, end_byte: int): end2dx, end2dy = self._get_2d_idx(end_byte) if start2dx == end2dx: # Indexing to the same inner chunk, this would be fast - buff = malloc(end2dy - start2dy) - _write_pybytes(buff, self._data[start2dx][start2dy:end2dy]) - return buff.memoryview + return self._data[start2dx][start2dy:end2dy] # TODO: document this # builds a list of memoryviews that contain the pieces we need for the output view @@ -114,11 +96,14 @@ def view(self, start_byte: int, end_byte: int): for i in range(start2dx + 1, end2dx): byts.append(self._data[i]) byts.append(self._data[end2dx][:end2dy]) - buff = malloc(sum(map(len, byts))) - ptr = buff + 0 + + buff = np.zeros(sum(map(len, byts)), dtype=np.byte) + offset = 0 for byt in byts: - ptr = _write_pybytes(ptr, byt.cast("B")) - return buff.memoryview + n = len(byt) + buff[offset : offset + n] = byt + offset += n + return memoryview(buff.tobytes()) @property def num_samples(self): @@ -181,19 +166,12 @@ def update_headers(self, incoming_num_bytes: int, sample_shape: Tuple[int]): def __len__(self): """Calculates the number of bytes `tobytes` will be without having to call `tobytes`. Used by `LRUCache` to determine if this chunk can be cached.""" - return _infer_chunk_num_bytes( + return infer_chunk_num_bytes( hub.__version__, self.shapes_encoder.array, self.byte_positions_encoder.array, len_data=self.num_data_bytes, ) - return ( - 17 - + len(hub.__version__) - + self.shapes_encoder.array.nbytes - + self.byte_positions_encoder.array.nbytes - + self.num_data_bytes - ) def tobytes(self) -> memoryview: if self.num_samples == 0: diff --git a/hub/core/lowlevel.py b/hub/core/lowlevel.py deleted file mode 100644 index 4dfd9dc848..0000000000 --- a/hub/core/lowlevel.py +++ /dev/null @@ -1,272 +0,0 @@ -import numpy as np -import ctypes -from collections import namedtuple -from typing import Tuple, Sequence, Union, Optional, List -import hub - - -class Pointer(object): - __slots__ = ("address", "size", "_c_array", "_refs") - - def __init__( - self, - address: Optional[int] = None, - size: Optional[int] = None, - c_array: Optional[ctypes.Array] = None, - ) -> None: - self._refs: List[ctypes.Array] = [] - if c_array is None: - if address is None or size is None: - raise ValueError("Expected c_array or address and size args.") - self.address = address - self.size = size - self._set_c_array() - else: - self._c_array = c_array - self.address = ctypes.addressof(c_array) - self.size = len(c_array) - - def _set_c_array(self) -> None: - try: - self._refs.append(self._c_array) - except AttributeError: - pass - self._c_array = (ctypes.c_byte * self.size).from_address(self.address) - - def __add__(self, i: int) -> "Pointer": - assert i >= 0 - assert i <= self.size - ret = Pointer(self.address + i, self.size - i) - ret._refs.append(self._c_array) - return ret - - def __iadd__(self, i: int) -> "Pointer": - assert i >= 0 - assert i <= self.size - self.address += i - self.size -= i - self._set_c_array() - return self - - def __setitem__(self, idx: int, byte: int) -> None: - self._c_array[idx] = byte - - def __getitem__(self, idx: Union[int, slice]) -> Union[int, "Pointer"]: - if isinstance(idx, int): - return self._c_array[idx] - elif isinstance(idx, slice): - assert idx.step is None - start = idx.start - end = idx.stop - n = self.size - if start is None: - start = 0 - elif start < 0: - start += n - if end is None: - end = n - elif end < 0: - end += n - assert start >= 0 and start < n - 
assert end >= start and end <= n - ret = Pointer(self.address + start, end - start) - ret._refs.append(self._c_array) - return ret - - @property - def memoryview(self): - return memoryview(self._c_array) - - @property - def bytes(self): - return bytes(self._c_array) - - @property - def bytearray(self): - return bytearray(self._c_array) - - def __len__(self): - return self.size - - -def malloc(size: int) -> Pointer: - return Pointer(c_array=(ctypes.c_byte * size)()) - - -def memcpy(dest: Pointer, src: Pointer, count=None) -> None: - if count is None: - count = src.size - ctypes.memmove(dest.address, src.address, count) - - -def _write_pybytes(ptr: Pointer, byts: Union[bytes, memoryview]) -> Pointer: - memcpy(ptr, _ndarray_to_ptr(np.frombuffer(byts, dtype=np.byte))) - return ptr + len(byts) - - -def _ndarray_to_ptr(arr: np.ndarray) -> Pointer: - return Pointer(arr.__array_interface__["data"][0], arr.itemsize * arr.size) - - -def _pybytes_to_c_array(byts: bytes) -> Pointer: - return Pointer( - np.frombuffer(byts, dtype=np.byte).__array_interface__["data"][0], len(byts) - ) - - -def _infer_chunk_num_bytes( - version: str, - shape_info: np.ndarray, - byte_positions: np.ndarray, - data: Optional[Union[Sequence[bytes], Sequence[memoryview]]] = None, - len_data: Optional[int] = None, -) -> int: - # NOTE: Assumption: version string contains ascii characters only (ord(c) < 128) - # NOTE: Assumption: len(version) < 256 - assert len(version) < 256 - assert max((map(ord, version))) < 128 - # assert shape_info.ndim == 2 - # assert byte_positions.ndim == 2 - # version_slice_size = 1 + len(version) - # shape_info_slice_size = 4 + 4 + shape_info.nbytes - # byte_positions_slice_size = 4 + byte_positions.nbytes - # data_slice_size = sum(map(len, data)) - if len_data is None: - len_data = sum(map(len, data)) # type: ignore - return len(version) + shape_info.nbytes + byte_positions.nbytes + len_data + 13 - - -def encode_chunk( - version: str, - shape_info: np.ndarray, - byte_positions: np.ndarray, - data: Union[Sequence[bytes], Sequence[memoryview]], - len_data: Optional[int] = None, -) -> memoryview: - - if len_data is None: - len_data = sum(map(len, data)) - - flatbuff = malloc( - _infer_chunk_num_bytes(version, shape_info, byte_positions, data, len_data) - ) - ptr = flatbuff + 0 - - # write version - ptr[0] = len(version) - ptr += 1 - for c in version: - ptr[0] = ord(c) - ptr += 1 - - # write shape info - ptr = _write_pybytes(ptr, np.int32(shape_info.shape[0]).tobytes()) - ptr = _write_pybytes(ptr, np.int32(shape_info.shape[1]).tobytes()) - memcpy(ptr, _ndarray_to_ptr(shape_info)) - ptr += shape_info.nbytes - - # write byte positions - ptr = _write_pybytes(ptr, np.int32(byte_positions.shape[0]).tobytes()) - memcpy(ptr, _ndarray_to_ptr(byte_positions)) - ptr += byte_positions.nbytes - - # write actual data - for d in data: - if isinstance(d, Pointer): - d = d.memoryview - ptr = _write_pybytes(ptr, d) - - return memoryview(flatbuff.bytes) - - -def decode_chunk( - buff: Union[bytes, Pointer, memoryview] -) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: - if not isinstance(buff, Pointer): - buff = _pybytes_to_c_array(buff) - copy = True - else: - copy = False - ptr = buff + 0 - - # read version - len_version: int = ptr[0] # type: ignore - version = "" - ptr += 1 - for i in range(len_version): - version += chr(ptr[i]) # type: ignore - ptr += len_version - - # read shape info - shape_info_dtype = np.dtype(hub.constants.ENCODING_DTYPE) - shape_info_shape = np.frombuffer(ptr.memoryview[:8], dtype=np.int32) - 
ptr += 8 - shape_info_data_size = int(np.prod(shape_info_shape) * shape_info_dtype.itemsize) - shape_info = np.frombuffer( - ptr.memoryview[:shape_info_data_size], dtype=shape_info_dtype - ).reshape(shape_info_shape) - if copy: - shape_info = shape_info.copy() - ptr += shape_info_data_size - - # read byte positions - byte_positions_dtype = np.dtype(hub.constants.ENCODING_DTYPE) - byte_positions_shape = (int(np.frombuffer(ptr.memoryview[:4], dtype=np.int32)), 3) - ptr += 4 - byte_positions_data_size = int( - np.prod(byte_positions_shape) * byte_positions_dtype.itemsize - ) - byte_positions = np.frombuffer( - ptr.memoryview[:byte_positions_data_size], dtype=byte_positions_dtype - ).reshape(byte_positions_shape) - if copy: - byte_positions = byte_positions.copy() - ptr += byte_positions_data_size - if copy: - data = memoryview(ptr.bytes) - else: - data = ptr.memoryview - return version, shape_info, byte_positions, data - - -def encode_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: - len_version = len(version) - flatbuff = malloc(1 + len_version + sum([x.nbytes for x in ids])) - - # Write version - ptr = flatbuff + 0 - ptr[0] = len_version - ptr += 1 - - for i, c in enumerate(version): - ptr[i] = ord(c) - - ptr += len_version - - for arr in ids: - memcpy(ptr, _ndarray_to_ptr(arr)) - ptr += arr.nbytes - - return memoryview(flatbuff.bytes) - - -def decode_chunkids(buff: bytes) -> Tuple[str, np.ndarray]: - ptr = _pybytes_to_c_array(buff) - - # Read version - len_version: int = ptr[0] # type: ignore - ptr += 1 - version = "" - for i in range(len_version): - version += chr(ptr[i]) # type: ignore - - ptr += len_version - - # Read chunk ids - ids = ( - np.frombuffer(ptr.memoryview, dtype=hub.constants.ENCODING_DTYPE) - .reshape(-1, 2) - .copy() - ) - - return version, ids diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index fec2fc58e0..7fe00a04a5 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -6,7 +6,7 @@ from typing import Optional, Tuple, Union, List, Iterable import numpy as np from uuid import uuid4 -from hub.core.lowlevel import encode_chunkids, decode_chunkids +from hub.core.serialize import encode_chunkids, decode_chunkids from hub.core.index import IndexEntry import math diff --git a/hub/core/serialize.py b/hub/core/serialize.py new file mode 100644 index 0000000000..3b0fc68158 --- /dev/null +++ b/hub/core/serialize.py @@ -0,0 +1,138 @@ +from typing import Optional, Sequence, Union, Tuple + +import hub +import ctypes +import numpy as np + + +def infer_chunk_num_bytes( + version: str, + shape_info: np.ndarray, + byte_positions: np.ndarray, + data: Optional[Union[Sequence[bytes], Sequence[memoryview]]] = None, + len_data: Optional[int] = None, +) -> int: + # NOTE: Assumption: version string contains ascii characters only (ord(c) < 128) + # NOTE: Assumption: len(version) < 256 + assert len(version) < 256 + assert max((map(ord, version))) < 128 + if len_data is None: + len_data = sum(map(len, data)) # type: ignore + return len(version) + shape_info.nbytes + byte_positions.nbytes + len_data + 13 + + +def encode_chunk( + version: str, + shape_info: np.ndarray, + byte_positions: np.ndarray, + data: Union[Sequence[bytes], Sequence[memoryview]], + len_data: Optional[int] = None, +) -> memoryview: + nbytes = infer_chunk_num_bytes(version, shape_info, byte_positions, data, len_data) + flatbuff = np.zeros(nbytes, dtype=np.byte) + + # Write version + len_version = len(version) + flatbuff[0] = len_version + 
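+    # Overall layout (13 bytes of fixed overhead plus the version string):
+    #   [len(version): 1 byte][version: ascii][shape_info.shape: 2 x int32][shape_info bytes]
+    #   [byte_positions row count: 1 x int32][byte_positions bytes][sample data]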
flatbuff[1 : 1 + len_version] = list(map(ord, version)) + offset = 1 + len_version + + # Write shape info + flatbuff[offset : offset + 8] = np.array(shape_info.shape, dtype=np.int32).view( + np.byte + ) + offset += 8 + flatbuff[offset : offset + shape_info.nbytes] = shape_info.reshape(-1).view(np.byte) + offset += shape_info.nbytes + + # Write byte positions + flatbuff[offset : offset + 4] = np.int32(byte_positions.shape[0]).view((np.byte, 4)) + offset += 4 + flatbuff[offset : offset + byte_positions.nbytes] = byte_positions.reshape(-1).view( + np.byte + ) + offset += byte_positions.nbytes + + # Write actual data + for byts in data: + n = len(byts) + flatbuff[offset : offset + n] = np.frombuffer(byts, dtype=np.byte) + offset += n + return memoryview(flatbuff.tobytes()) + + +def decode_chunk( + byts: Union[bytes, memoryview] +) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: + + enc_dtype = np.dtype(hub.constants.ENCODING_DTYPE) + + buff = np.frombuffer(byts, dtype=np.byte) + + # Read version + len_version = buff[0] + version = "".join(map(chr, buff[1 : 1 + len_version])) + offset = 1 + len_version + + # Read shape info + shape_info_shape = buff[offset : offset + 8].view(np.int32) + offset += 8 + shape_info_nbytes = np.prod(shape_info_shape) * enc_dtype.itemsize + shape_info = ( + buff[offset : offset + shape_info_nbytes] + .view(enc_dtype) + .reshape(shape_info_shape) + .copy() + ) + offset += shape_info_nbytes + + # Read byte positions + byte_positions_rows = buff[offset : offset + 4].view(np.int32)[0] + offset += 4 + byte_positions_nbytes = byte_positions_rows * 3 * enc_dtype.itemsize + byte_positions = ( + buff[offset : offset + byte_positions_nbytes] + .view(enc_dtype) + .reshape(byte_positions_rows, 3) + .copy() + ) + offset += byte_positions_nbytes + + # Read data + data = buff[offset:].copy() + + return version, shape_info, byte_positions, data + + +def encode_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: + len_version = len(version) + flatbuff = np.zeros(1 + len_version + sum([x.nbytes for x in ids]), dtype=np.byte) + + # Write version + len_version = len(version) + flatbuff[0] = len_version + flatbuff[1 : 1 + len_version] = list(map(ord, version)) + offset = 1 + len_version + + # Write ids + for arr in ids: + flatbuff[offset : offset + arr.nbytes] = arr.view(np.byte).reshape(-1) + offset += arr.nbytes + + return memoryview(flatbuff.tobytes()) + + +def decode_chunkids(byts: Union[bytes, memoryview]) -> Tuple[str, np.ndarray]: + enc_dtype = np.dtype(hub.constants.ENCODING_DTYPE) + + buff = np.frombuffer(byts, dtype=np.byte) + + # Read version + len_version = buff[0] + version = "".join(map(chr, buff[1 : 1 + len_version])) + offset = 1 + len_version + + # Read chunk ids + ids = buff[offset:].view(enc_dtype).reshape(-1, 2).copy() + + return version, ids diff --git a/hub/core/tests/test_lowlevel.py b/hub/core/tests/test_serialize.py similarity index 74% rename from hub/core/tests/test_lowlevel.py rename to hub/core/tests/test_serialize.py index 5381aba075..92fc6ddb17 100644 --- a/hub/core/tests/test_lowlevel.py +++ b/hub/core/tests/test_serialize.py @@ -1,5 +1,4 @@ -from hub.core.lowlevel import ( - Pointer, +from hub.core.serialize import ( encode_chunk, decode_chunk, encode_chunkids, @@ -29,15 +28,6 @@ def test_chunk_encoding(): np.testing.assert_array_equal(byte_positions, byte_positions2) assert b"".join(data) == bytes(data2) - # from pointer - buff = Pointer(c_array=(ctypes.c_byte * len(encoded))(*encoded)) - decoded = decode_chunk(buff) - version2, 
shape_info2, byte_positions2, data2 = decoded - assert version2 == version - np.testing.assert_array_equal(shape_info, shape_info2) - np.testing.assert_array_equal(byte_positions, byte_positions2) - assert b"".join(data) == bytes(data2) - def test_chunkids_encoding(): version = hub.__version__ From 440a0b70da08eafb98ec0d36bfaecc0f18bbbbcf Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Tue, 13 Jul 2021 21:48:11 +0530 Subject: [PATCH 51/79] encode_*->serialize_* --- hub/core/chunk.py | 6 +++--- hub/core/meta/encode/chunk_id.py | 11 +++-------- hub/core/serialize.py | 8 ++++---- hub/core/tests/test_serialize.py | 20 ++++++++++---------- 4 files changed, 20 insertions(+), 25 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index f8ae9a8066..892f1cdba0 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -8,7 +8,7 @@ from hub.core.meta.encode.shape import ShapeEncoder from hub.core.meta.encode.byte_positions import BytePositionsEncoder -from hub.core.serialize import encode_chunk, decode_chunk, infer_chunk_num_bytes +from hub.core.serialize import serialize_chunk, deserialize_chunk, infer_chunk_num_bytes class Chunk(Cachable): @@ -177,7 +177,7 @@ def tobytes(self) -> memoryview: if self.num_samples == 0: return memoryview(bytes()) - return encode_chunk( + return serialize_chunk( hub.__version__, self.shapes_encoder.array, self.byte_positions_encoder.array, @@ -189,5 +189,5 @@ def tobytes(self) -> memoryview: def frombuffer(cls, buffer: bytes) -> "Chunk": if len(buffer) == 0: return cls() - version, shapes, byte_positions, data = decode_chunk(buffer) + version, shapes, byte_positions, data = deserialize_chunk(buffer) return cls(shapes, byte_positions, data=data) diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 7fe00a04a5..69c47ca01b 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -6,7 +6,7 @@ from typing import Optional, Tuple, Union, List, Iterable import numpy as np from uuid import uuid4 -from hub.core.serialize import encode_chunkids, decode_chunkids +from hub.core.serialize import serialize_chunkids, deserialize_chunkids from hub.core.index import IndexEntry import math @@ -104,12 +104,7 @@ def _get_2d_idx(self, idx: int) -> Tuple[int, int]: def tobytes(self) -> memoryview: self._flush_buffer() - encoded = encode_chunkids(hub.__version__, self._data) - decoded = decode_chunkids(encoded)[1] - if self._data: - np.testing.assert_array_equal( - decoded, np.concatenate(self._data), err_msg=str(bytes(encoded)) - ) + encoded = serialize_chunkids(hub.__version__, self._data) return encoded @staticmethod @@ -133,7 +128,7 @@ def get_name_for_chunk(self, chunk_index: int) -> str: @classmethod def frombuffer(cls, buffer: bytes): - version, ids = decode_chunkids(buffer) + version, ids = deserialize_chunkids(buffer) return cls(ids) @property diff --git a/hub/core/serialize.py b/hub/core/serialize.py index 3b0fc68158..58726682da 100644 --- a/hub/core/serialize.py +++ b/hub/core/serialize.py @@ -21,7 +21,7 @@ def infer_chunk_num_bytes( return len(version) + shape_info.nbytes + byte_positions.nbytes + len_data + 13 -def encode_chunk( +def serialize_chunk( version: str, shape_info: np.ndarray, byte_positions: np.ndarray, @@ -61,7 +61,7 @@ def encode_chunk( return memoryview(flatbuff.tobytes()) -def decode_chunk( +def deserialize_chunk( byts: Union[bytes, memoryview] ) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: @@ -104,7 +104,7 @@ def decode_chunk( return version, shape_info, byte_positions, data -def 
encode_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: +def serialize_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: len_version = len(version) flatbuff = np.zeros(1 + len_version + sum([x.nbytes for x in ids]), dtype=np.byte) @@ -122,7 +122,7 @@ def encode_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: return memoryview(flatbuff.tobytes()) -def decode_chunkids(byts: Union[bytes, memoryview]) -> Tuple[str, np.ndarray]: +def deserialize_chunkids(byts: Union[bytes, memoryview]) -> Tuple[str, np.ndarray]: enc_dtype = np.dtype(hub.constants.ENCODING_DTYPE) buff = np.frombuffer(byts, dtype=np.byte) diff --git a/hub/core/tests/test_serialize.py b/hub/core/tests/test_serialize.py index 92fc6ddb17..543e64717a 100644 --- a/hub/core/tests/test_serialize.py +++ b/hub/core/tests/test_serialize.py @@ -1,15 +1,15 @@ from hub.core.serialize import ( - encode_chunk, - decode_chunk, - encode_chunkids, - decode_chunkids, + serialize_chunk, + deserialize_chunk, + serialize_chunkids, + deserialize_chunkids, ) import numpy as np import ctypes import hub -def test_chunk_encoding(): +def test_chunk_serialize(): version = hub.__version__ shape_info = np.cast[hub.constants.ENCODING_DTYPE]( np.random.randint(100, size=(17, 63)) @@ -18,10 +18,10 @@ def test_chunk_encoding(): np.random.randint(100, size=(31, 3)) ) data = [b"1234" * 7, b"abcdefg" * 8, b"qwertyuiop" * 9] - encoded = bytes(encode_chunk(version, shape_info, byte_positions, data)) + encoded = bytes(serialize_chunk(version, shape_info, byte_positions, data)) # from bytes - decoded = decode_chunk(encoded) + decoded = deserialize_chunk(encoded) version2, shape_info2, byte_positions2, data2 = decoded assert version2 == version np.testing.assert_array_equal(shape_info, shape_info2) @@ -29,13 +29,13 @@ def test_chunk_encoding(): assert b"".join(data) == bytes(data2) -def test_chunkids_encoding(): +def test_chunkids_serialize(): version = hub.__version__ shards = [ np.cast[hub.constants.ENCODING_DTYPE](np.random.randint(100, size=(100, 2))) ] - encoded = encode_chunkids(version, shards) - decoded = decode_chunkids(encoded) + encoded = serialize_chunkids(version, shards) + decoded = deserialize_chunkids(encoded) version2, ids = decoded assert version2 == version np.testing.assert_array_equal(np.concatenate(shards), ids) From ceae226105a3597c976b608137231708932bbf11 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Tue, 13 Jul 2021 20:25:59 +0400 Subject: [PATCH 52/79] Update hub/core/chunk.py Co-authored-by: dyllan --- hub/core/chunk.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index 892f1cdba0..dd1996781d 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -141,8 +141,6 @@ def append_sample(self, buffer: memoryview, max_data_bytes: int, shape: Tuple[in ) # `_data` will be a `memoryview` if `frombuffer` is called. 
- # if isinstance(self._data, memoryview): - # self._data = bytearray(self._data) # note: incoming_num_bytes can be 0 (empty sample) self._data.append(buffer) From 88ab4bb8a9acabf6534a413513eec9e1d9085411 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Tue, 13 Jul 2021 22:25:09 +0530 Subject: [PATCH 53/79] docstring --- hub/api/dataset.py | 14 ++++------- hub/api/tensor.py | 3 +++ hub/core/chunk.py | 5 ++-- hub/core/serialize.py | 56 ++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 65 insertions(+), 13 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index c372addd10..f49e64d59c 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -60,7 +60,6 @@ def __init__( Use this if you want to specify the storage provider object manually instead of using a tag or url to generate it. public (bool, optional): Applied only if storage is Hub cloud storage and a new Dataset is being created. Defines if the dataset will have public access. token (str, optional): Activeloop token, used for fetching credentials for Hub datasets. This is optional, tokens are normally autogenerated. - _tensors: (list, optional): Internal. Raises: ValueError: If an existing local path is given, it must be a directory. @@ -90,7 +89,7 @@ def __init__( self.storage.autoflush = True self.index = index or Index() - self.tensors: Dict[str, Tensor] = _tensors if _tensors else {} + self.tensors: Dict[str, Tensor] = {} self._token = token @@ -218,10 +217,8 @@ def __iter__(self) -> Iterator["Dataset"]: tensors_sliced = [t[self.index][: len(self)] for t in self.tensors.values()] num_tensors = len(tensor_names) for tensors in zip(*tensors_sliced): - tensors = {tensor_names[i]: tensors[i] for i in range(num_tensors)} - ds = Dataset( - read_only=self.read_only, storage=self.storage, _tensors=tensors - ) + ds = Dataset(read_only=self.read_only, storage=self.storage) + ds.tensors = {tensor_names[i]: tensors[i] for i in range(num_tensors)} yield ds def _load_meta(self): @@ -230,9 +227,8 @@ def _load_meta(self): if dataset_exists(self.storage): logger.info(f"{self.path} loaded successfully.") self.meta = self.storage.get_cachable(meta_key, DatasetMeta) - if not self.tensors: - for tensor_name in self.meta.tensors: - self.tensors[tensor_name] = Tensor(tensor_name, self.storage) + for tensor_name in self.meta.tensors: + self.tensors[tensor_name] = Tensor(tensor_name, self.storage) elif len(self.storage) > 0: # dataset does not exist, but the path was not empty diff --git a/hub/api/tensor.py b/hub/api/tensor.py index 6d825c48a9..a74e53bbe8 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -45,6 +45,9 @@ def __init__( self.chunk_engine = ChunkEngine(self.key, self.storage) + # If this tensor corresponds to a sample in a parent tensor, + # `_sample` caches the chunk id and local sample index + # for that sample. Set during iteration through the parent tensor. 
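+        # When set, `numpy()` can read the sample directly from that chunk
+        # instead of resolving it through the chunk id encoder.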
self._sample: Optional[Tuple[int, int]] = None def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): diff --git a/hub/core/chunk.py b/hub/core/chunk.py index 892f1cdba0..dc8cea2e00 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -79,6 +79,7 @@ def _get_2d_idx(self, byte_index: int) -> Tuple[int, int]: return i, byte_index def view(self, start_byte: int, end_byte: int): + """Returns a sliced view of the chunk's data""" if len(self._data) == 1: return self._data[0][start_byte:end_byte] @@ -88,9 +89,7 @@ def view(self, start_byte: int, end_byte: int): # Indexing to the same inner chunk, this would be fast return self._data[start2dx][start2dy:end2dy] - # TODO: document this - # builds a list of memoryviews that contain the pieces we need for the output view - + # build a list of memoryviews that contain the pieces we need for the output view byts = [] byts.append(self._data[start2dx][start2dy:]) for i in range(start2dx + 1, end2dx): diff --git a/hub/core/serialize.py b/hub/core/serialize.py index 58726682da..8cbaf6e3bf 100644 --- a/hub/core/serialize.py +++ b/hub/core/serialize.py @@ -12,6 +12,18 @@ def infer_chunk_num_bytes( data: Optional[Union[Sequence[bytes], Sequence[memoryview]]] = None, len_data: Optional[int] = None, ) -> int: + """Calculates the number of bytes in a chunk without serializing it. Used by `LRUCache` to determine if a chunk can be cached. + + Args: + version: (str) Version of hub library + shape_info: (numpy.ndarray) Encoded shapes info from the chunk's `ShapeEncoder` instance. + byte_positions: (numpy.ndarray) Encoded byte positions from the chunk's `BytePositionsEncoder` instance. + data: (list) `_data` field of the chunk + len_data: (int, optional) Number of bytes in the chunk + + Returns: + Length of the chunk when serialized as int + """ # NOTE: Assumption: version string contains ascii characters only (ord(c) < 128) # NOTE: Assumption: len(version) < 256 assert len(version) < 256 @@ -28,6 +40,18 @@ def serialize_chunk( data: Union[Sequence[bytes], Sequence[memoryview]], len_data: Optional[int] = None, ) -> memoryview: + """Serializes a chunk + + Args: + version: (str) Version of hub library. + shape_info: (numpy.ndarray) Encoded shapes info from the chunk's `ShapeEncoder` instance. + byte_positions: (numpy.ndarray) Encoded byte positions from the chunk's `BytePositionsEncoder` instance. + data: (list) `_data` field of the chunk. + len_data: (int, optional) Number of bytes in the chunk. + + Returns: + Serialized chunk as memoryview. + """ nbytes = infer_chunk_num_bytes(version, shape_info, byte_positions, data, len_data) flatbuff = np.zeros(nbytes, dtype=np.byte) @@ -64,7 +88,18 @@ def serialize_chunk( def deserialize_chunk( byts: Union[bytes, memoryview] ) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: - + """Deserializes a chunk + + Args: + byts: (bytes) Serialized chunk. + + Returns: + Tuple of: + hub version used to create the chunk, + encoded shapes info as numpy array, + encoded byte positions as numpy array, + chunk data as memoryview. + """ enc_dtype = np.dtype(hub.constants.ENCODING_DTYPE) buff = np.frombuffer(byts, dtype=np.byte) @@ -105,6 +140,15 @@ def deserialize_chunk( def serialize_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: + """Serializes chunk ids + + Args: + version: (str) Version of hub library. + ids: (list) Encoded chunk ids from a `ChunkIdEncoder` instance. + + Returns: + Serialized chunk ids as memoryview. 
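+
+    Example (illustrative):
+        >>> ids = [np.zeros((2, 2), dtype=hub.constants.ENCODING_DTYPE)]
+        >>> buff = serialize_chunkids(hub.__version__, ids)
+        >>> len(buff) == 1 + len(hub.__version__) + ids[0].nbytes
+        True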
+    """
     len_version = len(version)
     flatbuff = np.zeros(1 + len_version + sum([x.nbytes for x in ids]), dtype=np.byte)
 
@@ -123,6 +167,16 @@ def serialize_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview:
 
 
 def deserialize_chunkids(byts: Union[bytes, memoryview]) -> Tuple[str, np.ndarray]:
+    """Deserializes chunk ids
+
+    Args:
+        byts: (bytes) Serialized chunk ids.
+
+    Returns:
+        Tuple of:
+            hub version used to encode the chunk ids,
+            chunk ids as a numpy array.
+    """
     enc_dtype = np.dtype(hub.constants.ENCODING_DTYPE)
 
     buff = np.frombuffer(byts, dtype=np.byte)

From 4b29507b92f61fcf4f1fe8040b8e95e29c5e3a59 Mon Sep 17 00:00:00 2001
From: Fariz Rahman
Date: Tue, 13 Jul 2021 23:31:58 +0530
Subject: [PATCH 54/79] docstring

---
 hub/core/meta/encode/chunk_id.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py
index 69c47ca01b..8c171813aa 100644
--- a/hub/core/meta/encode/chunk_id.py
+++ b/hub/core/meta/encode/chunk_id.py
@@ -295,14 +295,16 @@ def get(
 
         Args:
             sample_index (int): Global index (relative to the tensor). This will be converted to the local chunk index.
-            return_chunk_index (bool): If True, 2 values are returned, the second one being the chunk's index. Defaults to False.
+            return_chunk_index (bool): If True, a tuple of 2 ints representing the chunk's index is returned along with the chunk id.
+            return_local_sample_index (bool): If True, the local index of the sample within the chunk is returned along with the chunk id.
 
         Raises:
             IndexError: If no samples exist or `sample_index` exceeds the available indices.
 
         Returns:
-            Tuple[Tuple[ENCODING_DTYPE], Optional[Tuple[int]]]: Returns the chunk ID for `sample_index`. If `return_chunk_index` is True,
-                there will be 2 values. The second one being the chunk's index.
+            Union[int, Tuple[int, Tuple[int, int]], Tuple[int, int], Tuple[int, Tuple[int, int], int]]: Returns either just the chunk id,
+                or a tuple containing the chunk id and one or both of the chunk index and local sample index, depending on the `return_chunk_index`
+                and `return_local_sample_index` arguments.
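+
+        Example (illustrative, assuming chunk 0 holds samples 0-9):
+            >>> enc.get(5)  # chunk id only
+            >>> enc.get(5, return_local_sample_index=True)  # (chunk id, 5)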
""" if self.num_samples == 0: raise IndexError( From 5b386087bdf59b926edf0f801220d4600c5cb549 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Tue, 13 Jul 2021 23:42:22 +0530 Subject: [PATCH 55/79] rm comments --- hub/api/tests/test_api.py | 2 +- hub/core/meta/encode/chunk_id.py | 1 - hub/core/tests/test_serialize.py | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 67809ad311..de53b2df8e 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -502,7 +502,7 @@ def searchsorted(*args, **kwargs): np.searchsorted = orig_searchsorted - + def test_array_interface(memory_ds: Dataset): tensor = memory_ds.create_tensor("tensor") x = np.random.random((32, 32)) diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 8c171813aa..def87fe2ae 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -157,7 +157,6 @@ def _decr_2d(self, x: int, y: int) -> Tuple[int, int]: def _incr_2d(self, x: int, y: int) -> Tuple[int, int]: if x < 0: return x, y + 1 - # assert y < len(self._data[x]) if y == len(self._data[x]) - 1: if x == len(self._data) - 1: return -1, 0 diff --git a/hub/core/tests/test_serialize.py b/hub/core/tests/test_serialize.py index 543e64717a..c799ae9191 100644 --- a/hub/core/tests/test_serialize.py +++ b/hub/core/tests/test_serialize.py @@ -20,7 +20,6 @@ def test_chunk_serialize(): data = [b"1234" * 7, b"abcdefg" * 8, b"qwertyuiop" * 9] encoded = bytes(serialize_chunk(version, shape_info, byte_positions, data)) - # from bytes decoded = deserialize_chunk(encoded) version2, shape_info2, byte_positions2, data2 = decoded assert version2 == version From 4f25b210b41456d08eebaa459079381c0e1c0e39 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Tue, 13 Jul 2021 23:43:32 +0530 Subject: [PATCH 56/79] rm unused import --- hub/core/serialize.py | 1 - 1 file changed, 1 deletion(-) diff --git a/hub/core/serialize.py b/hub/core/serialize.py index 8cbaf6e3bf..f238ceac02 100644 --- a/hub/core/serialize.py +++ b/hub/core/serialize.py @@ -1,7 +1,6 @@ from typing import Optional, Sequence, Union, Tuple import hub -import ctypes import numpy as np From 82ce5bea3c6d34f3de48234a102a95ce0d04b96d Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 03:55:39 +0530 Subject: [PATCH 57/79] revert dataset.py --- hub/api/dataset.py | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 53c96d44af..dabd61e6ee 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -1,7 +1,6 @@ -from hub.core.storage.provider import StorageProvider from hub.core.tensor import create_tensor -from typing import Callable, Dict, Optional, Union, Tuple, List, Sequence, Iterator -from hub.constants import DEFAULT_HTYPE, UNSPECIFIED +from hub.constants import DEFAULT_HTYPE +from typing import Callable, Dict, Optional, Union, Tuple, List import numpy as np from hub.api.tensor import Tensor @@ -9,6 +8,7 @@ from hub.core.meta.dataset_meta import DatasetMeta +from hub.core.typing import StorageProvider from hub.core.index import Index from hub.integrations import dataset_to_tensorflow from hub.util.keys import dataset_exists, get_dataset_meta_key, tensor_exists @@ -38,7 +38,6 @@ def __init__( storage: Optional[StorageProvider] = None, public: Optional[bool] = True, token: Optional[str] = None, - _tensors: Optional[Dict[str, Tensor]] = None, ): """Initializes a new or existing 
dataset. @@ -61,6 +60,7 @@ def __init__( public (bool, optional): Applied only if storage is Hub cloud storage and a new Dataset is being created. Defines if the dataset will have public access. token (str, optional): Activeloop token, used for fetching credentials for Hub datasets. This is optional, tokens are normally autogenerated. + Raises: ValueError: If an existing local path is given, it must be a directory. ImproperDatasetInitialization: Exactly one argument out of 'path' and 'storage' needs to be specified. @@ -128,8 +128,6 @@ def __getitem__( if item not in self.tensors: raise TensorDoesNotExistError(item) else: - if self.index.is_trivial(): - return self.tensors[item] return self.tensors[item][self.index] elif isinstance(item, (int, slice, list, tuple, Index)): return Dataset( @@ -145,8 +143,10 @@ def create_tensor( self, name: str, htype: str = DEFAULT_HTYPE, - dtype: Union[str, np.dtype, type] = UNSPECIFIED, - sample_compression: str = UNSPECIFIED, + chunk_size: int = None, + dtype: Union[str, np.dtype, type] = None, + sample_compression: str = None, + chunk_compression: str = None, **kwargs, ): """Creates a new tensor in the dataset. @@ -158,8 +158,12 @@ def create_tensor( For example, `htype="image"` would have `dtype` default to `uint8`. These defaults can be overridden by explicitly passing any of the other parameters to this function. May also modify the defaults for other parameters. + chunk_size (int): Optionally override this tensor's `chunk_size`. In short, `chunk_size` determines the + size of files (chunks) being created to represent this tensor's samples. + For more on chunking, check out `hub.core.chunk_engine.chunker`. dtype (str): Optionally override this tensor's `dtype`. All subsequent samples are required to have this `dtype`. - sample_compression (str): All samples will be compressed in the provided format. If `None`, samples are uncompressed. + sample_compression (str): Optionally override this tensor's `sample_compression`. Only used when the incoming data is uncompressed. + chunk_compression (str): Optionally override this tensor's `chunk_compression`. Currently not implemented. **kwargs: `htype` defaults can be overridden by passing any of the compatible parameters. To see all `htype`s and their correspondent arguments, check out `hub/htypes.py`. @@ -171,6 +175,10 @@ def create_tensor( NotImplementedError: If trying to override `chunk_compression`. 
""" + if chunk_compression is not None: + # TODO: implement chunk compression + update docstring + raise NotImplementedError("Chunk compression is not implemented yet!") + if tensor_exists(name, self.storage): raise TensorAlreadyExistsError(name) @@ -179,8 +187,10 @@ def create_tensor( name, self.storage, htype=htype, + chunk_size=chunk_size, dtype=dtype, sample_compression=sample_compression, + chunk_compression=chunk_compression, **kwargs, ) tensor = Tensor(name, self.storage) # type: ignore @@ -200,14 +210,9 @@ def __setattr__(self, name: str, value): else: return super().__setattr__(name, value) - def __iter__(self) -> Iterator["Dataset"]: - tensor_names = list(self.tensors) - tensors_sliced = [t[self.index][: len(self)] for t in self.tensors.values()] - num_tensors = len(tensor_names) - for tensors in zip(*tensors_sliced): - ds = Dataset(read_only=self.read_only, storage=self.storage) - ds.tensors = {tensor_names[i]: tensors[i] for i in range(num_tensors)} - yield ds + def __iter__(self): + for i in range(len(self)): + yield self[i] def _load_meta(self): meta_key = get_dataset_meta_key() @@ -215,6 +220,7 @@ def _load_meta(self): if dataset_exists(self.storage): logger.info(f"{self.path} loaded successfully.") self.meta = self.storage.get_cachable(meta_key, DatasetMeta) + for tensor_name in self.meta.tensors: self.tensors[tensor_name] = Tensor(tensor_name, self.storage) @@ -248,7 +254,6 @@ def read_only(self, value: bool): def pytorch( self, transform: Optional[Callable] = None, - tensors: Optional[Sequence[str]] = None, num_workers: int = 1, batch_size: Optional[int] = 1, drop_last: Optional[bool] = False, @@ -263,7 +268,6 @@ def pytorch( Args: transform (Callable, optional) : Transformation function to be applied to each sample. - tensors (List, optional): Optionally provide a list of tensor names in the ordering that your training script expects. For example, if you have a dataset that has "image" and "label" tensors, if `tensors=["image", "label"]`, your training script should expect each batch will be provided as a tuple of (image, label). num_workers (int): The number of workers to use for fetching data in parallel. batch_size (int, optional): Number of samples per batch to load. Default value is 1. drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size is not divisible by the batch size. 
@@ -282,7 +286,6 @@ def pytorch( return dataset_to_pytorch( self, transform, - tensors, num_workers=num_workers, batch_size=batch_size, drop_last=drop_last, From 35d3a4a8e335e32108aeb64a429f6bda7144d23c Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 03:56:13 +0530 Subject: [PATCH 58/79] revert tensor.py --- hub/api/tensor.py | 33 +++++---------------------------- 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/hub/api/tensor.py b/hub/api/tensor.py index 93786c53c6..1f24783900 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -1,6 +1,6 @@ from hub.util.keys import tensor_exists from hub.core.sample import Sample # type: ignore -from typing import List, Sequence, Union, Optional, Tuple, Dict, Iterator +from typing import List, Sequence, Union, Optional, Tuple, Dict from hub.util.shape import ShapeInterval import numpy as np @@ -10,8 +10,6 @@ from hub.util.exceptions import TensorDoesNotExistError, InvalidKeyTypeError from hub.core.index import Index -import warnings - class Tensor: def __init__( @@ -45,11 +43,6 @@ def __init__( self.chunk_engine = ChunkEngine(self.key, self.storage) - # If this tensor corresponds to a sample in a parent tensor, - # `_sample` caches the chunk id and local sample index - # for that sample. Set during iteration through the parent tensor. - self._sample: Optional[Tuple[int, int]] = None - def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): """Extends the end of the tensor by appending multiple elements from a sequence. Accepts a sequence, a single batched numpy array, or a sequence of `hub.load` outputs, which can be used to load files. See examples down below. @@ -78,7 +71,6 @@ def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): The length should be equal to the number of samples to add. """ self.chunk_engine.extend(samples) - self._sample = None def append( self, @@ -199,13 +191,9 @@ def __getitem__( def __setitem__(self, item: Union[int, slice], value: np.ndarray): raise NotImplementedError("Tensor update not currently supported!") - def __iter__(self) -> Iterator["Tensor"]: - for i, (chunk_id, local_sample_index) in enumerate( - self.chunk_engine.chunk_id_encoder.iter(self.index.values[0].value) - ): - tensor_i = Tensor(self.key, self.storage, index=self.index[i]) - tensor_i._sample = chunk_id, local_sample_index - yield tensor_i + def __iter__(self): + for i in range(len(self)): + yield self[i] def numpy(self, aslist=False) -> Union[np.ndarray, List[np.ndarray]]: """Computes the contents of the tensor in numpy format. @@ -221,15 +209,7 @@ def numpy(self, aslist=False) -> Union[np.ndarray, List[np.ndarray]]: Returns: A numpy array containing the data represented by this tensor. 
""" - if self._sample: - chunk_id, local_sample_index = self._sample - chunk = self.chunk_engine.get_chunk_from_id(chunk_id) - ret = self.chunk_engine.read_sample_from_chunk(chunk, local_sample_index) - if aslist: - ret = list(ret) - for entry in self.index.values[1:]: - ret = ret[entry.value] - return ret + return self.chunk_engine.numpy(self.index, aslist=aslist) def __str__(self): @@ -238,7 +218,4 @@ def __str__(self): index_str = "" return f"Tensor(key={repr(self.key)}{index_str})" - def __array__(self) -> np.ndarray: - return self.numpy() - __repr__ = __str__ From cb4ea21852ccc0cae11d354973140bcf840c37b2 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 03:57:13 +0530 Subject: [PATCH 59/79] revert test_api.py --- hub/api/tests/test_api.py | 89 ++++++++++----------------------------- 1 file changed, 22 insertions(+), 67 deletions(-) diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index de53b2df8e..ea28236487 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -1,3 +1,4 @@ +from hub.constants import UNCOMPRESSED import numpy as np import pytest import uuid @@ -6,10 +7,7 @@ from hub.api.dataset import Dataset from hub.core.tests.common import parametrize_all_dataset_storages from hub.tests.common import assert_array_lists_equal -from hub.util.exceptions import ( - TensorDtypeMismatchError, - TensorInvalidSampleShapeError, -) +from hub.util.exceptions import TensorDtypeMismatchError, TensorInvalidSampleShapeError from hub.client.client import HubBackendClient from hub.client.utils import has_hub_testing_creds from click.testing import CliRunner @@ -187,7 +185,8 @@ def test_empty_samples(ds: Dataset): actual_list = tensor.numpy(aslist=True) expected_list = [a1, *a2, a3, *a4] - assert tensor.meta.sample_compression is None + assert tensor.meta.sample_compression == UNCOMPRESSED + assert tensor.meta.chunk_compression == UNCOMPRESSED assert len(tensor) == 16 assert tensor.shape_interval.lower == (16, 0, 0, 2) @@ -197,7 +196,7 @@ def test_empty_samples(ds: Dataset): # test indexing individual empty samples with numpy while looping, this may seem redundant but this was failing before for actual_sample, expected in zip(ds, expected_list): - actual = actual_sample["with_empty"].numpy() + actual = actual_sample.with_empty.numpy() np.testing.assert_array_equal(actual, expected) @@ -218,27 +217,30 @@ def test_scalar_samples(ds: Dataset): tensor.append(-99) tensor.append(np.array(4)) - tensor.append(np.int16(4)) + with pytest.raises(TensorDtypeMismatchError): + tensor.append(np.int16(4)) with pytest.raises(TensorDtypeMismatchError): tensor.append(np.float32(4)) - tensor.append(np.uint8(3)) + with pytest.raises(TensorDtypeMismatchError): + tensor.append(np.uint8(3)) tensor.extend([10, 1, 4]) tensor.extend([1]) tensor.extend(np.array([1, 2, 3], dtype=MAX_INT_DTYPE)) - tensor.extend(np.array([4, 5, 33], dtype="int16")) + with pytest.raises(TensorDtypeMismatchError): + tensor.extend(np.array([4, 5, 33], dtype="int16")) - assert len(tensor) == 16 + assert len(tensor) == 11 - expected = np.array([5, 10, -99, 4, 4, 3, 10, 1, 4, 1, 1, 2, 3, 4, 5, 33]) + expected = np.array([5, 10, -99, 4, 10, 1, 4, 1, 1, 2, 3]) np.testing.assert_array_equal(tensor.numpy(), expected) assert tensor.numpy(aslist=True) == expected.tolist() - assert tensor.shape == (16,) + assert tensor.shape == (11,) # len(shape) for a scalar is `()`. 
len(shape) for [1] is `(1,)` with pytest.raises(TensorInvalidSampleShapeError): @@ -255,7 +257,6 @@ def test_sequence_samples(ds: Dataset): tensor.append([1, 2, 3]) tensor.extend([[4, 5, 6]]) - ds.clear_cache() assert len(tensor) == 2 @@ -383,7 +384,7 @@ def test_shape_property(memory_ds): def test_htype(memory_ds: Dataset): - image = memory_ds.create_tensor("image", htype="image", sample_compression="png") + image = memory_ds.create_tensor("image", htype="image") bbox = memory_ds.create_tensor("bbox", htype="bbox") label = memory_ds.create_tensor("label", htype="class_label") video = memory_ds.create_tensor("video", htype="video") @@ -426,22 +427,18 @@ def test_dtype(memory_ds: Dataset): np_dtyped_tensor.append(np.ones((10, 10), dtype=MAX_FLOAT_DTYPE)) py_dtyped_tensor.append(np.ones((10, 10), dtype=MAX_FLOAT_DTYPE)) - # test auto upcasting - np_dtyped_tensor.append(np.ones((10, 10), dtype="float32")) - py_dtyped_tensor.append(np.ones((10, 10), dtype="float32")) - - with pytest.raises(TensorDtypeMismatchError): - tensor.append(np.ones((10, 10), dtype="float64")) - - with pytest.raises(TensorDtypeMismatchError): - dtyped_tensor.append(np.ones((10, 10), dtype="uint64") * 256) - assert tensor.dtype == np.float32 assert dtyped_tensor.dtype == np.uint8 assert np_dtyped_tensor.dtype == MAX_FLOAT_DTYPE assert py_dtyped_tensor.dtype == MAX_FLOAT_DTYPE +@pytest.mark.xfail(raises=TensorDtypeMismatchError, strict=True) +def test_dtype_mismatch(memory_ds: Dataset): + tensor = memory_ds.create_tensor("tensor", dtype="float16") + tensor.append(np.ones(100, dtype="uint8")) + + @pytest.mark.xfail(raises=TypeError, strict=True) def test_fails_on_wrong_tensor_syntax(memory_ds): memory_ds.some_tensor = np.ones((28, 28)) @@ -474,55 +471,13 @@ def test_hub_cloud_dataset(): ds.delete() -def test_iter_perf(memory_ds: Dataset): - orig_searchsorted = np.searchsorted - call_count = {"n": 0} - callers = [] - - def searchsorted(*args, **kwargs): - import inspect - - callers.append(inspect.stack()[1][3]) - call_count["n"] += 1 - return orig_searchsorted(*args, **kwargs) - - ds = memory_ds - ds.create_tensor("x") - ds.create_tensor("y") - for _ in range(10): - ds["x"].append(np.zeros((10, 10))) - ds["y"].append(np.ones((10, 10))) - - np.searchsorted = searchsorted - for i, sub_ds in enumerate(ds): - np.testing.assert_array_equal(sub_ds["x"].numpy(), np.zeros((10, 10))) - np.testing.assert_array_equal(sub_ds["y"].numpy(), np.ones((10, 10))) - - assert call_count["n"] == 40 - - np.searchsorted = orig_searchsorted - - -def test_array_interface(memory_ds: Dataset): - tensor = memory_ds.create_tensor("tensor") - x = np.random.random((32, 32)) - tensor.append(x) - arr1 = np.array(tensor) - arr2 = np.array(tensor) - np.testing.assert_array_equal(x, arr1[0]) - np.testing.assert_array_equal(x, arr2[0]) - assert arr1.__array_interface__["data"][0] == arr1.__array_interface__["data"][0] - tensor.append(x) - np.testing.assert_array_equal(tensor.numpy(), np.concatenate([arr1, arr2])) - - @parametrize_all_dataset_storages def test_hub_dataset_suffix_bug(ds): # creating dataset with similar name but some suffix removed from end ds2 = Dataset(ds.path[:-1]) ds2.delete() - + def test_empty_dataset(): with CliRunner().isolated_filesystem(): ds = Dataset("test") From 868004ed8f6d267eb73fabac02d82dcbe0b1e20d Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 04:00:13 +0530 Subject: [PATCH 60/79] revert ChunkEngine.numpy --- hub/core/chunk_engine.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff 
--git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index 56b760d142..d73cb3f86c 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -343,22 +343,30 @@ def numpy( Returns: Union[np.ndarray, Sequence[np.ndarray]]: Either a list of numpy arrays or a single numpy array (depending on the `aslist` argument). """ + length = self.num_samples enc = self.chunk_id_encoder last_shape = None samples = [] - for chunk_id, local_sample_index in enc.iter(index.values[0].value): - chunk = self.get_chunk_from_id(chunk_id) - sample = self.read_sample_from_chunk(chunk, local_sample_index) + for global_sample_index in index.values[0].indices(length): + chunk_id = enc[global_sample_index] + chunk_name = ChunkIdEncoder.name_from_id(chunk_id) + chunk_key = get_chunk_key(self.key, chunk_name) + chunk = self.cache.get_cachable(chunk_key, Chunk) + sample = self.read_sample_from_chunk(global_sample_index, chunk) shape = sample.shape + if not aslist and last_shape is not None: if shape != last_shape: raise DynamicTensorNumpyError(self.key, index, "shape") + samples.append(sample) last_shape = shape + return _format_samples(samples, index, aslist) + def get_chunk_from_id(self, chunk_id: int) -> Chunk: chunk_name = ChunkIdEncoder.name_from_id(chunk_id) chunk_key = get_chunk_key(self.key, chunk_name) From c7a1321eff384cd06845d95bd433652d73168b15 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 04:01:49 +0530 Subject: [PATCH 61/79] revert read_sample_from_chunk --- hub/core/chunk_engine.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index d73cb3f86c..05e9baf9fb 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -373,15 +373,16 @@ def get_chunk_from_id(self, chunk_id: int) -> Chunk: return self.cache.get_cachable(chunk_key, Chunk) def read_sample_from_chunk( - self, chunk: Chunk, local_sample_index: int + self, global_sample_index: int, chunk: Chunk ) -> np.ndarray: - """Read a sample from a chunk, given the local index. Handles decompressing if applicable.""" + """Read a sample from a chunk, converts the global index into a local index. 
Handles decompressing if applicable.""" - expect_compressed = self.tensor_meta.sample_compression is not None + expect_compressed = self.tensor_meta.sample_compression != UNCOMPRESSED dtype = self.tensor_meta.dtype enc = self.chunk_id_encoder + local_sample_index = enc.get_local_sample_index(global_sample_index) shape = chunk.shapes_encoder[local_sample_index] sb, eb = chunk.byte_positions_encoder[local_sample_index] From 6519986f22f451a17ea6f5e40f50635e464b0535 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 04:02:51 +0530 Subject: [PATCH 62/79] rem unreachable --- hub/core/chunk_engine.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index 05e9baf9fb..c9f1348bbe 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -366,12 +366,6 @@ def numpy( return _format_samples(samples, index, aslist) - - def get_chunk_from_id(self, chunk_id: int) -> Chunk: - chunk_name = ChunkIdEncoder.name_from_id(chunk_id) - chunk_key = get_chunk_key(self.key, chunk_name) - return self.cache.get_cachable(chunk_key, Chunk) - def read_sample_from_chunk( self, global_sample_index: int, chunk: Chunk ) -> np.ndarray: From 0ddc61bb782d9d96806079d5fdb863e2d5861787 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 04:09:03 +0530 Subject: [PATCH 63/79] remove iter logic --- hub/core/meta/encode/chunk_id.py | 405 +++++-------------------------- 1 file changed, 63 insertions(+), 342 deletions(-) diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index def87fe2ae..af828ba4d7 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -3,12 +3,10 @@ import hub from hub.core.storage.cachable import Cachable from io import BytesIO -from typing import Optional, Tuple, Union, List, Iterable +from typing import Optional, Tuple import numpy as np from uuid import uuid4 from hub.core.serialize import serialize_chunkids, deserialize_chunkids -from hub.core.index import IndexEntry -import math # these constants are for accessing the data layout. see the `ChunkIdEncoder` docstring. @@ -17,7 +15,7 @@ class ChunkIdEncoder(Cachable): - def __init__(self, ids=None): + def __init__(self): """Custom compressor that allows reading of chunk IDs from a sample index without decompressing. Chunk IDs: @@ -71,41 +69,11 @@ def __init__(self, ids=None): Then, you get the left-most column and that is your chunk ID! 
""" - self._buffer: List[List[int]] = [] - self._data: List[np.ndarray] = [] if ids is None else [ids] - self._num_chunks = sum(map(len, self._data)) - - self._prev_sample_index: Optional[int] = None - self._prev_chunk_id: Optional[int] = None - self._prev_chunk_index: Optional[Tuple[int, int]] = None - self._prev_entry: Optional[Union[np.ndarray, List[int]]] = None - - def _flush_buffer(self): - if self._buffer: - self._data.append(np.array(self._buffer, dtype=ENCODING_DTYPE)) - if self._prev_chunk_index and self._prev_chunk_index[0] < 0: - self._prev_chunk_index = (len(self._data) - 1, self._prev_chunk_index[1]) - self._buffer.clear() - - def _get_2d_idx(self, idx: int) -> Tuple[int, int]: - i = 0 - data = self._data - while True: - try: - num_data_i = len(data[i]) - except IndexError: # slightly faster than checking i < len(self._data) in a loop - return -1, idx - if num_data_i <= idx: - idx -= num_data_i - i += 1 - else: - break - return i, idx + + self._encoded_ids = None def tobytes(self) -> memoryview: - self._flush_buffer() - encoded = serialize_chunkids(hub.__version__, self._data) - return encoded + return serialize_chunkids(hub.__version__, [self._encoded_ids]) @staticmethod def name_from_id(id: ENCODING_DTYPE) -> str: @@ -123,79 +91,28 @@ def id_from_name(name: str) -> ENCODING_DTYPE: def get_name_for_chunk(self, chunk_index: int) -> str: """Gets the name for the chunk at index `chunk_index`. If you need to get the name for a chunk from a sample index, instead use `__getitem__`, then `name_from_id`.""" - chunk_id = self.get_entry(chunk_index)[CHUNK_ID_INDEX] + + chunk_id = self._encoded_ids[:, CHUNK_ID_INDEX][chunk_index] return ChunkIdEncoder.name_from_id(chunk_id) @classmethod def frombuffer(cls, buffer: bytes): + instance = cls() version, ids = deserialize_chunkids(buffer) - return cls(ids) + instance._encoded_ids = ids + return instance @property def num_chunks(self) -> int: - return self._num_chunks - - def get_entry(self, idx: int): - x, y = self._get_2d_idx(idx) - return self._buffer[y] if x < 0 else self._data[x][y] - - def _get_entry_2d(self, x: int, y: int): - return self._buffer[y] if x < 0 else self._data[x][y] - - def _decr_2d(self, x: int, y: int) -> Tuple[int, int]: - if x < 0: - if y: - return x, y - 1 - return len(self._data) - 1, len(self._data[-1]) - 1 - if y: - return x, y - 1 - if x: - x -= 1 - return x, len(self._data[x]) - 1 - raise IndexError() - - def _incr_2d(self, x: int, y: int) -> Tuple[int, int]: - if x < 0: - return x, y + 1 - if y == len(self._data[x]) - 1: - if x == len(self._data) - 1: - return -1, 0 - return x + 1, 0 - return x, y + 1 - - def _is_origin(self, x: int, y: int) -> bool: - if not x and not y: - return True - if x < 0 and not self._data and not y: - return True - return False - - @property - def last_entry(self) -> Union[np.ndarray, List[int]]: - if self._buffer: - return self._buffer[-1] - if self._data: - return self._data[-1][-1] - return None - - @property - def last_index(self) -> int: - last_entry = self.last_entry - if last_entry is None: - return -1 - return int(last_entry[LAST_INDEX_INDEX]) + if self._encoded_ids is None: + return 0 + return len(self._encoded_ids) @property def num_samples(self) -> int: - if self._buffer: - return int(self._buffer[-1][LAST_INDEX_INDEX] + 1) - elif self._data: - return int(self._data[-1][-1, LAST_INDEX_INDEX] + 1) - return 0 - - @property - def empty(self) -> bool: - return not self._buffer and not self._data + if self._encoded_ids is None: + return 0 + return int(self._encoded_ids[-1, 
LAST_INDEX_INDEX] + 1) def generate_chunk_id(self) -> ENCODING_DTYPE: """Generates a random 64bit chunk ID using uuid4. Also prepares this ID to have samples registered to it. @@ -204,9 +121,21 @@ def generate_chunk_id(self) -> ENCODING_DTYPE: Returns: ENCODING_DTYPE: The random chunk ID. """ + id = ENCODING_DTYPE(uuid4().int >> UUID_SHIFT_AMOUNT) - self._buffer.append([id, self.last_index]) - self._num_chunks += 1 + + if self.num_samples == 0: + self._encoded_ids = np.array([[id, -1]], dtype=ENCODING_DTYPE) + + else: + last_index = self.num_samples - 1 + + new_entry = np.array( + [[id, last_index]], + dtype=ENCODING_DTYPE, + ) + self._encoded_ids = np.concatenate([self._encoded_ids, new_entry]) + return id def register_samples_to_last_chunk_id(self, num_samples: int): @@ -221,14 +150,15 @@ def register_samples_to_last_chunk_id(self, num_samples: int): ChunkIdEncoderError: Must call `generate_chunk_id` before registering samples. ChunkIdEncoderError: `num_samples` can only be 0 if it is able to be a sample continuation accross chunks. """ + if num_samples < 0: raise ValueError( f"Cannot register negative num samples. Got: {num_samples}" ) - if self.empty: + if self.num_samples == 0: raise ChunkIdEncoderError( - f"Cannot register samples because no chunk IDs exist. {self._buffer}, {self._data}" + "Cannot register samples because no chunk IDs exist." ) if num_samples == 0 and self.num_chunks < 2: @@ -236,14 +166,12 @@ def register_samples_to_last_chunk_id(self, num_samples: int): "Cannot register 0 num_samples (signifying a partial sample continuing the last chunk) when no last chunk exists." ) - last_entry = self.last_entry - if self._buffer: - last_entry[LAST_INDEX_INDEX] += num_samples - else: - err = np.geterr()["over"] - np.seterr(over="ignore") - last_entry[LAST_INDEX_INDEX] += ENCODING_DTYPE(num_samples) - np.seterr(over=err) + current_entry = self._encoded_ids[-1] + + # this operation will trigger an overflow for the first addition, so supress the warning + np.seterr(over="ignore") + current_entry[LAST_INDEX_INDEX] += ENCODING_DTYPE(num_samples) + np.seterr(over="warn") def get_local_sample_index(self, global_sample_index: int) -> int: """Converts `global_sample_index` into a new index that is relative to the chunk the sample belongs to. @@ -273,38 +201,34 @@ def get_local_sample_index(self, global_sample_index: int) -> int: int: local index value between 0 and the amount of samples the chunk contains - 1. """ - return self.get(global_sample_index, return_local_sample_index=True)[1] # type: ignore - - def __getitem__(self, sample_index: int) -> int: - return self.get(sample_index) # type: ignore - - def get( - self, - sample_index: int, - return_chunk_index: bool = False, - return_local_sample_index: bool = False, - ) -> Union[ - int, - Tuple[int, Tuple[int, int]], - Tuple[int, Tuple[int, int], int], - Tuple[int, int], - ]: + _, chunk_index = self.__getitem__(global_sample_index, return_chunk_index=True) # type: ignore + + if chunk_index == 0: + return global_sample_index + + current_entry = self._encoded_ids[chunk_index - 1] # type: ignore + last_num_samples = current_entry[LAST_INDEX_INDEX] + 1 + + return int(global_sample_index - last_num_samples) + + def __getitem__( + self, sample_index: int, return_chunk_index: bool = False + ) -> Tuple[ENCODING_DTYPE, Optional[int]]: """Get the ID for the chunk that `sample_index` is stored in. To get the name of the chunk, use `name_from_id`. Args: sample_index (int): Global index (relative to the tensor). 
This will be converted to the local chunk index. - return_chunk_index (bool): If True, a tuple of 2 ints representing the chunks index is returned along with the chunk id. - return_local_sample_index (bool): If True, the local index of the sample within the chunk is returned along with the chunk id. + return_chunk_index (bool): If True, 2 values are returned, the second one being the chunk's index. Defaults to False. Raises: IndexError: If no samples exist or `sample_index` exceeds the available indices. Returns: - Union[int, Tuple[int, Tuple[int, int]], Tuple[int, int], Tuple[int, Tuple[int, int], int]]: Returns either just the chunk id - or a tuple containing the chunk id and one or both of the chunk index and local sample index based on the `return_chunk_index` - and `return_local_sample_index` arguments. + Tuple[Tuple[ENCODING_DTYPE], Optional[Tuple[int]]]: Returns the chunk ID for `sample_index`. If `return_chunk_index` is True, + there will be 2 values. The second one being the chunk's index. """ + if self.num_samples == 0: raise IndexError( f"Index {sample_index} is out of bounds for an empty chunk names encoding." @@ -313,214 +237,11 @@ def get( if sample_index < 0: sample_index = (self.num_samples) + sample_index - chunk_id = None - if ( - self._prev_sample_index is not None - and sample_index >= self._prev_sample_index - ): - if sample_index <= self._prev_entry[LAST_INDEX_INDEX]: # type: ignore - chunk_id = self._prev_chunk_id - chunk_index = self._prev_chunk_index - current_entry = self._prev_entry - else: - next_index = self._incr_2d(*self._prev_chunk_index) # type: ignore - next_entry = self._get_entry_2d(*next_index) - if sample_index <= next_entry[LAST_INDEX_INDEX]: - chunk_index = next_index - current_entry = next_entry - chunk_id = current_entry[CHUNK_ID_INDEX] - - if chunk_id is None: - self._flush_buffer() - last_idxs = [shard[-1, LAST_INDEX_INDEX] for shard in self._data] - shard_index = np.searchsorted(last_idxs, sample_index) - shard = self._data[shard_index] - idx = np.searchsorted(shard[:, LAST_INDEX_INDEX], sample_index) - current_entry = shard[idx] - chunk_id = current_entry[CHUNK_ID_INDEX] - chunk_index = (shard_index, idx) - - self._prev_sample_index = sample_index - self._prev_chunk_index = chunk_index - self._prev_entry = current_entry - self._prev_chunk_id = chunk_id - - if not return_chunk_index and not return_local_sample_index: - return chunk_id - ret = [chunk_id] + idx = np.searchsorted(self._encoded_ids[:, LAST_INDEX_INDEX], sample_index) + id = self._encoded_ids[idx, CHUNK_ID_INDEX] + chunk_index = idx + if return_chunk_index: - ret.append(chunk_index) - if return_local_sample_index: - if any(chunk_index): # type: ignore - prev_entry = self._get_entry_2d(*self._decr_2d(*chunk_index)) # type: ignore - local_sample_index = ( - sample_index - int(prev_entry[LAST_INDEX_INDEX]) - 1 - ) - else: - local_sample_index = sample_index - ret.append(local_sample_index) - - return tuple(ret) # type: ignore - - def _preproc_slice(self, index: slice) -> Tuple[int, int, int, int, bool]: - start = 0 if index.start is None else index.start - stop = self.num_samples if index.stop is None else index.stop - step = 1 if index.step is None else index.step - assert isinstance(start, int) - assert isinstance(stop, int) - assert isinstance(step, int) - if start < 0: - start += self.num_samples - if stop < 0: - stop += self.num_samples - assert step != 0 - if step > 0: - total = math.ceil((stop - start) / step) - forward = True - else: - step = -step - total = math.ceil((stop - 
start) / step) - start, stop = stop - 1, start - forward = False - return start, stop, step, total, forward - - def _iter_forward( - self, - chunk_id: int, - shard_index: int, - chunk_index: int, - local_sample_index: int, - total: int, - step: int, - ) -> Iterable[Tuple[int, int]]: - n = 0 - ctr = Counter(step) - shard = self._data[shard_index] - last_index = int(shard[chunk_index, LAST_INDEX_INDEX]) - for i in range(local_sample_index + 1, last_index + 1): - if ctr(): - yield chunk_id, i - n += 1 - if n == total: - return - for chunk_index in range(chunk_index + 1, len(shard)): - entry = shard[chunk_index] - chunk_id = entry[CHUNK_ID_INDEX] - new_last_index = int(entry[LAST_INDEX_INDEX]) - for i in range(new_last_index - last_index): - if ctr(): - yield chunk_id, i - n += 1 - if n == total: - return - last_index = new_last_index - for shard_index in range(shard_index + 1, len(self._data)): - shard = self._data[shard_index] - for entry in shard: - chunk_id = entry[CHUNK_ID_INDEX] - new_last_index = int(entry[LAST_INDEX_INDEX]) - for i in range(new_last_index - last_index): - if ctr(): - yield chunk_id, i - n += 1 - if n == total: - return - last_index = new_last_index - - def _iter_reverse( - self, - chunk_id: int, - shard_index: int, - chunk_index: int, - local_sample_index: int, - total: int, - step: int, - ) -> Iterable[Tuple[int, int]]: - n = 0 - ctr = Counter(step) - shard = self._data[shard_index] - last_index = int(shard[chunk_index, LAST_INDEX_INDEX]) - for local_sample_index in range(local_sample_index - 1, -1, -1): - if ctr(): - yield chunk_id, local_sample_index - n += 1 - if n == total: - return - for chunk_index in range(chunk_index - 1, -1, -1): - entry = shard[chunk_index] - chunk_id = entry[CHUNK_ID_INDEX] - last_index = entry[LAST_INDEX_INDEX] - if chunk_index: - last_index -= shard[chunk_id - 1, LAST_INDEX_INDEX] - elif shard_index: - last_index -= self._data[shard_index - 1][-1, LAST_INDEX_INDEX] - for local_sample_index in range(last_index, -1, -1): - if ctr(): - yield chunk_id, local_sample_index - n += 1 - if n == total: - return - for shard_index in range(shard_index - 1, -1, -1): - shard = self._data[shard_index] - for chunk_index in range(len(shard) - 1, -1, -1): - entry = shard[chunk_index] - chunk_id = entry[CHUNK_ID_INDEX] - last_index = entry[LAST_INDEX_INDEX] - if chunk_index: - last_index -= shard[chunk_id - 1, LAST_INDEX_INDEX] - elif shard_index: - last_index -= self._data[shard_index - 1][-1, LAST_INDEX_INDEX] - for local_sample_index in range(last_index, -1, -1): - if ctr(): - yield chunk_id, local_sample_index - n += 1 - if n == total: - return - - def iter( - self, index: Union[int, slice, tuple] = slice(None) - ) -> Iterable[Tuple[int, int]]: - if isinstance(index, int): - yield self.get(index, return_local_sample_index=True) # type: ignore - elif isinstance(index, slice): - start, stop, step, total, forward = self._preproc_slice(index) - if not total: - return - self._flush_buffer() - if start: - chunk_id, (shard_index, chunk_index), local_sample_index = self.get( # type: ignore - start, return_chunk_index=True, return_local_sample_index=True - ) - shard = self._data[shard_index] - else: - shard_index = 0 - chunk_index = 0 - shard = self._data[0] - local_sample_index = 0 - chunk_id = shard[0, CHUNK_ID_INDEX] - yield chunk_id, local_sample_index - if total == 1: - return - iter_f = self._iter_forward if forward else self._iter_reverse - for chunk_id, local_sample_index in iter_f( - chunk_id, shard_index, chunk_index, local_sample_index, total - 1, 
step - ): - yield chunk_id, local_sample_index - elif isinstance(index, tuple): - for i in index: - # Random access - yield self.get(i, return_local_sample_index=True) # type: ignore - - -class Counter: - # TODO: refac this - def __init__(self, n: int) -> None: - self.n = n - self.i = 0 - - def __call__(self): - self.i += 1 - if self.i == self.n: - self.i = 0 - return True - return False + return id, chunk_index + + return id From 609ea67cd161125e0da3ffc9905873d5985fc18f Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 04:10:18 +0530 Subject: [PATCH 64/79] revert pytorch.py --- hub/integrations/pytorch/pytorch.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/hub/integrations/pytorch/pytorch.py b/hub/integrations/pytorch/pytorch.py index 95f33387f1..596073305f 100644 --- a/hub/integrations/pytorch/pytorch.py +++ b/hub/integrations/pytorch/pytorch.py @@ -253,10 +253,10 @@ def _generate_shared_memory_names(self, chunk_names: Set[str]): ls.append(f"al_{self.last_chunk_num_generated}") return ls - def _numpy_from_chunk(self, chunk, key: str, local_index: int): + def _numpy_from_chunk(self, index: int, key: str, chunk): """Takes a list of chunks and returns a numpy array from it""" chunk_engine = self.all_chunk_engines[key] - value = chunk_engine.read_sample_from_chunk(chunk, local_index) + value = chunk_engine.read_sample_from_chunk(index, chunk) # typecast if incompatible with pytorch if value.dtype == "uint16": @@ -289,16 +289,14 @@ def _get_data_from_chunks( actual_index = self.index_offset + i # TODO change this once it returns list/set of str chunk_engine = self.all_chunk_engines[key] - chunk_id, local_index = chunk_engine.chunk_id_encoder.get( - actual_index, return_local_sample_index=True - ) + chunk_id = chunk_engine.chunk_id_encoder[actual_index] chunk_name = chunk_engine.chunk_id_encoder.name_from_id(chunk_id) # type: ignore if chunk_name not in chunk_map: self.last_index_meta[key] = i - 1 return chunk = chunk_map[chunk_name] self.all_index_value_maps[key][i] = self._numpy_from_chunk( - chunk, key, local_index + actual_index, key, chunk ) self.last_index_meta[key] = len(self) - 1 From 7823060c9cf561e86d41d381672c4601d182967d Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 04:12:48 +0530 Subject: [PATCH 65/79] revert dataset.py --- hub/api/dataset.py | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index dabd61e6ee..b6a802c217 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -1,6 +1,7 @@ +from hub.core.storage.provider import StorageProvider from hub.core.tensor import create_tensor -from hub.constants import DEFAULT_HTYPE -from typing import Callable, Dict, Optional, Union, Tuple, List +from typing import Callable, Dict, Optional, Union, Tuple, List, Sequence +from hub.constants import DEFAULT_HTYPE, UNSPECIFIED import numpy as np from hub.api.tensor import Tensor @@ -8,7 +9,6 @@ from hub.core.meta.dataset_meta import DatasetMeta -from hub.core.typing import StorageProvider from hub.core.index import Index from hub.integrations import dataset_to_tensorflow from hub.util.keys import dataset_exists, get_dataset_meta_key, tensor_exists @@ -143,10 +143,8 @@ def create_tensor( self, name: str, htype: str = DEFAULT_HTYPE, - chunk_size: int = None, - dtype: Union[str, np.dtype, type] = None, - sample_compression: str = None, - chunk_compression: str = None, + dtype: Union[str, np.dtype, type] = UNSPECIFIED, + sample_compression: 
str = UNSPECIFIED, **kwargs, ): """Creates a new tensor in the dataset. @@ -158,12 +156,8 @@ def create_tensor( For example, `htype="image"` would have `dtype` default to `uint8`. These defaults can be overridden by explicitly passing any of the other parameters to this function. May also modify the defaults for other parameters. - chunk_size (int): Optionally override this tensor's `chunk_size`. In short, `chunk_size` determines the - size of files (chunks) being created to represent this tensor's samples. - For more on chunking, check out `hub.core.chunk_engine.chunker`. dtype (str): Optionally override this tensor's `dtype`. All subsequent samples are required to have this `dtype`. - sample_compression (str): Optionally override this tensor's `sample_compression`. Only used when the incoming data is uncompressed. - chunk_compression (str): Optionally override this tensor's `chunk_compression`. Currently not implemented. + sample_compression (str): All samples will be compressed in the provided format. If `None`, samples are uncompressed. **kwargs: `htype` defaults can be overridden by passing any of the compatible parameters. To see all `htype`s and their correspondent arguments, check out `hub/htypes.py`. @@ -175,10 +169,6 @@ def create_tensor( NotImplementedError: If trying to override `chunk_compression`. """ - if chunk_compression is not None: - # TODO: implement chunk compression + update docstring - raise NotImplementedError("Chunk compression is not implemented yet!") - if tensor_exists(name, self.storage): raise TensorAlreadyExistsError(name) @@ -187,10 +177,8 @@ def create_tensor( name, self.storage, htype=htype, - chunk_size=chunk_size, dtype=dtype, sample_compression=sample_compression, - chunk_compression=chunk_compression, **kwargs, ) tensor = Tensor(name, self.storage) # type: ignore @@ -254,6 +242,7 @@ def read_only(self, value: bool): def pytorch( self, transform: Optional[Callable] = None, + tensors: Optional[Sequence[str]] = None, num_workers: int = 1, batch_size: Optional[int] = 1, drop_last: Optional[bool] = False, @@ -268,6 +257,7 @@ def pytorch( Args: transform (Callable, optional) : Transformation function to be applied to each sample. + tensors (List, optional): Optionally provide a list of tensor names in the ordering that your training script expects. For example, if you have a dataset that has "image" and "label" tensors, if `tensors=["image", "label"]`, your training script should expect each batch will be provided as a tuple of (image, label). num_workers (int): The number of workers to use for fetching data in parallel. batch_size (int, optional): Number of samples per batch to load. Default value is 1. drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size is not divisible by the batch size. 
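To make the new `tensors` argument concrete, here is a usage sketch. It is illustrative only: the dataset path, tensor names, and demo values are assumptions rather than part of this patch, and it presumes PyTorch is installed so `dataset_to_pytorch` can build its loader.

    import numpy as np
    from hub.api.dataset import Dataset

    ds = Dataset("./pytorch_order_demo")  # hypothetical local path
    ds.create_tensor("image", htype="image", sample_compression=None)
    ds.create_tensor("label", htype="class_label")
    ds.image.extend(np.ones((8, 28, 28), dtype=np.uint8))
    ds.label.extend(np.zeros(8, dtype=np.uint32))

    # tensors=["image", "label"] pins the batch ordering, so each batch
    # unpacks as an (image, label) tuple regardless of creation order.
    for image, label in ds.pytorch(tensors=["image", "label"], batch_size=4):
        assert len(image) == len(label)
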
@@ -286,6 +276,7 @@ def pytorch( return dataset_to_pytorch( self, transform, + tensors, num_workers=num_workers, batch_size=batch_size, drop_last=drop_last, From 390151ba5146bb1a5b437e1eb867c5e4f26772dc Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 04:22:39 +0530 Subject: [PATCH 66/79] reverts --- hub/api/tests/test_api.py | 58 ++++++++++++++++++++------------ hub/core/chunk_engine.py | 26 +++++--------- hub/core/meta/encode/chunk_id.py | 4 +++ 3 files changed, 50 insertions(+), 38 deletions(-) diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index ea28236487..5bb9b75751 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -1,4 +1,3 @@ -from hub.constants import UNCOMPRESSED import numpy as np import pytest import uuid @@ -7,7 +6,10 @@ from hub.api.dataset import Dataset from hub.core.tests.common import parametrize_all_dataset_storages from hub.tests.common import assert_array_lists_equal -from hub.util.exceptions import TensorDtypeMismatchError, TensorInvalidSampleShapeError +from hub.util.exceptions import ( + TensorDtypeMismatchError, + TensorInvalidSampleShapeError, +) from hub.client.client import HubBackendClient from hub.client.utils import has_hub_testing_creds from click.testing import CliRunner @@ -185,8 +187,7 @@ def test_empty_samples(ds: Dataset): actual_list = tensor.numpy(aslist=True) expected_list = [a1, *a2, a3, *a4] - assert tensor.meta.sample_compression == UNCOMPRESSED - assert tensor.meta.chunk_compression == UNCOMPRESSED + assert tensor.meta.sample_compression is None assert len(tensor) == 16 assert tensor.shape_interval.lower == (16, 0, 0, 2) @@ -217,30 +218,27 @@ def test_scalar_samples(ds: Dataset): tensor.append(-99) tensor.append(np.array(4)) - with pytest.raises(TensorDtypeMismatchError): - tensor.append(np.int16(4)) + tensor.append(np.int16(4)) with pytest.raises(TensorDtypeMismatchError): tensor.append(np.float32(4)) - with pytest.raises(TensorDtypeMismatchError): - tensor.append(np.uint8(3)) + tensor.append(np.uint8(3)) tensor.extend([10, 1, 4]) tensor.extend([1]) tensor.extend(np.array([1, 2, 3], dtype=MAX_INT_DTYPE)) - with pytest.raises(TensorDtypeMismatchError): - tensor.extend(np.array([4, 5, 33], dtype="int16")) + tensor.extend(np.array([4, 5, 33], dtype="int16")) - assert len(tensor) == 11 + assert len(tensor) == 16 - expected = np.array([5, 10, -99, 4, 10, 1, 4, 1, 1, 2, 3]) + expected = np.array([5, 10, -99, 4, 4, 3, 10, 1, 4, 1, 1, 2, 3, 4, 5, 33]) np.testing.assert_array_equal(tensor.numpy(), expected) assert tensor.numpy(aslist=True) == expected.tolist() - assert tensor.shape == (11,) + assert tensor.shape == (16,) # len(shape) for a scalar is `()`. 
len(shape) for [1] is `(1,)` with pytest.raises(TensorInvalidSampleShapeError): @@ -257,6 +255,7 @@ def test_sequence_samples(ds: Dataset): tensor.append([1, 2, 3]) tensor.extend([[4, 5, 6]]) + ds.clear_cache() assert len(tensor) == 2 @@ -384,7 +383,7 @@ def test_shape_property(memory_ds): def test_htype(memory_ds: Dataset): - image = memory_ds.create_tensor("image", htype="image") + image = memory_ds.create_tensor("image", htype="image", sample_compression="png") bbox = memory_ds.create_tensor("bbox", htype="bbox") label = memory_ds.create_tensor("label", htype="class_label") video = memory_ds.create_tensor("video", htype="video") @@ -427,18 +426,22 @@ def test_dtype(memory_ds: Dataset): np_dtyped_tensor.append(np.ones((10, 10), dtype=MAX_FLOAT_DTYPE)) py_dtyped_tensor.append(np.ones((10, 10), dtype=MAX_FLOAT_DTYPE)) + # test auto upcasting + np_dtyped_tensor.append(np.ones((10, 10), dtype="float32")) + py_dtyped_tensor.append(np.ones((10, 10), dtype="float32")) + + with pytest.raises(TensorDtypeMismatchError): + tensor.append(np.ones((10, 10), dtype="float64")) + + with pytest.raises(TensorDtypeMismatchError): + dtyped_tensor.append(np.ones((10, 10), dtype="uint64") * 256) + assert tensor.dtype == np.float32 assert dtyped_tensor.dtype == np.uint8 assert np_dtyped_tensor.dtype == MAX_FLOAT_DTYPE assert py_dtyped_tensor.dtype == MAX_FLOAT_DTYPE -@pytest.mark.xfail(raises=TensorDtypeMismatchError, strict=True) -def test_dtype_mismatch(memory_ds: Dataset): - tensor = memory_ds.create_tensor("tensor", dtype="float16") - tensor.append(np.ones(100, dtype="uint8")) - - @pytest.mark.xfail(raises=TypeError, strict=True) def test_fails_on_wrong_tensor_syntax(memory_ds): memory_ds.some_tensor = np.ones((28, 28)) @@ -471,13 +474,26 @@ def test_hub_cloud_dataset(): ds.delete() +def test_array_interface(memory_ds: Dataset): + tensor = memory_ds.create_tensor("tensor") + x = np.random.random((32, 32)) + tensor.append(x) + arr1 = np.array(tensor) + arr2 = np.array(tensor) + np.testing.assert_array_equal(x, arr1[0]) + np.testing.assert_array_equal(x, arr2[0]) + assert arr1.__array_interface__["data"][0] == arr1.__array_interface__["data"][0] + tensor.append(x) + np.testing.assert_array_equal(tensor.numpy(), np.concatenate([arr1, arr2])) + + @parametrize_all_dataset_storages def test_hub_dataset_suffix_bug(ds): # creating dataset with similar name but some suffix removed from end ds2 = Dataset(ds.path[:-1]) ds2.delete() - + def test_empty_dataset(): with CliRunner().isolated_filesystem(): ds = Dataset("test") diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index c9f1348bbe..583868839f 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -267,19 +267,7 @@ def _create_new_chunk(self): def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): """Formats a batch of `samples` and feeds them into `_append_bytes`.""" - self.get_last_chunk() - - uniform = False if isinstance(samples, np.ndarray): - uniform = True - elif isinstance(samples, Sequence): - if is_uniform_sequence(samples): - uniform = True - if not isinstance(samples[0], np.ndarray): - samples = np.array(samples) - else: - raise TypeError(f"Unsupported type for extending. 
Got: {type(samples)}") - if uniform: compression = self.tensor_meta.sample_compression if compression is None: buffers = [] @@ -305,11 +293,15 @@ def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): for sample_object in sample_objects: self.append(sample_object) - else: - for sample in samples: - self.append(sample) - self.cache.maybe_flush() + elif isinstance(samples, Sequence): + if is_uniform_sequence(samples): + self.extend(np.array(samples)) + else: + for sample in samples: + self.append(sample) + else: + raise TypeError(f"Unsupported type for extending. Got: {type(samples)}") def append(self, sample: SampleValue): """Formats a single `sample` (compresseses/decompresses if applicable) and feeds it into `_append_bytes`.""" @@ -371,7 +363,7 @@ def read_sample_from_chunk( ) -> np.ndarray: """Read a sample from a chunk, converts the global index into a local index. Handles decompressing if applicable.""" - expect_compressed = self.tensor_meta.sample_compression != UNCOMPRESSED + expect_compressed = self.tensor_meta.sample_compression is not None dtype = self.tensor_meta.dtype enc = self.chunk_id_encoder diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index af828ba4d7..0f5f3cf6bd 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -73,6 +73,8 @@ def __init__(self): self._encoded_ids = None def tobytes(self) -> memoryview: + if self._encoded_ids is None: + return b"" return serialize_chunkids(hub.__version__, [self._encoded_ids]) @staticmethod @@ -98,6 +100,8 @@ def get_name_for_chunk(self, chunk_index: int) -> str: @classmethod def frombuffer(cls, buffer: bytes): instance = cls() + if not buffer: + return instance version, ids = deserialize_chunkids(buffer) instance._encoded_ids = ids return instance From 77ca4f7b6a1bac8298a888104558c6d9d9faacfb Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 04:23:16 +0530 Subject: [PATCH 67/79] revert tensor.py --- hub/api/tensor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hub/api/tensor.py b/hub/api/tensor.py index 1f24783900..7de61a8372 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -218,4 +218,7 @@ def __str__(self): index_str = "" return f"Tensor(key={repr(self.key)}{index_str})" + def __array__(self) -> np.ndarray: + return self.numpy() + __repr__ = __str__ From e3ab3bf406b15f5ce59b23481f8ac3749285973f Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 04:36:29 +0530 Subject: [PATCH 68/79] reverts --- hub/core/chunk.py | 96 +++++++---------------------------------------- 1 file changed, 14 insertions(+), 82 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index a3f25d4e88..2ec4336f2c 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -1,7 +1,7 @@ from hub.util.exceptions import FullChunkError import hub from hub.core.storage.cachable import Cachable -from typing import List, Sequence, Tuple, Union +from typing import Sequence, Tuple, Union import numpy as np from io import BytesIO @@ -10,7 +10,6 @@ from hub.core.serialize import serialize_chunk, deserialize_chunk, infer_chunk_num_bytes - class Chunk(Cachable): def __init__( self, @@ -46,71 +45,17 @@ def __init__( self.shapes_encoder = ShapeEncoder(encoded_shapes) self.byte_positions_encoder = BytePositionsEncoder(encoded_byte_positions) - self._data: List[memoryview] = [] - self._num_data_bytes: int = 0 # replaces: sum(map(len, self._data)) - - if data is not None: - self._data.append(data) - self._num_data_bytes += len(data) - - def 
_get_2d_idx(self, byte_index: int) -> Tuple[int, int]: - """Converts `byte_index`, which is an index for a flattened stream of bytes, into a 2D index that can - be used for a list of byte streams of varying lengths. Used for accessing `self._data`, which is a list - of `memoryview`s. - - Args: - byte_index (int): Index over a flattened stream of bytes. - - Returns: - Tuple[int, int]: 2D index to be used to access `self._data`. - """ - i = 0 - data = self._data - while True: - try: - num_data_i = len(data[i]) - except IndexError: # slightly faster than checking i < len(self._data) in a loop - return i - 1, len(data[i - 1]) + byte_index - if num_data_i <= byte_index: - byte_index -= num_data_i - i += 1 - else: - break - return i, byte_index - - def view(self, start_byte: int, end_byte: int): - """Returns a sliced view of the chunk's data""" - if len(self._data) == 1: - return self._data[0][start_byte:end_byte] - - start2dx, start2dy = self._get_2d_idx(start_byte) - end2dx, end2dy = self._get_2d_idx(end_byte) - if start2dx == end2dx: - # Indexing to the same inner chunk, this would be fast - return self._data[start2dx][start2dy:end2dy] - - # build a list of memoryviews that contain the pieces we need for the output view - byts = [] - byts.append(self._data[start2dx][start2dy:]) - for i in range(start2dx + 1, end2dx): - byts.append(self._data[i]) - byts.append(self._data[end2dx][:end2dy]) - - buff = np.zeros(sum(map(len, byts)), dtype=np.byte) - offset = 0 - for byt in byts: - n = len(byt) - buff[offset : offset + n] = byt - offset += n - return memoryview(buff.tobytes()) + self._data: Union[memoryview, bytearray] = data or bytearray() @property - def num_samples(self): - return self.shapes_encoder.num_samples + def memoryview_data(self): + if isinstance(self._data, memoryview): + return self._data + return memoryview(self._data) @property def num_data_bytes(self): - return self._num_data_bytes + return len(self._data) def is_under_min_space(self, min_data_bytes_target: int) -> bool: """If this chunk's data is less than `min_data_bytes_target`, returns True.""" @@ -140,10 +85,11 @@ def append_sample(self, buffer: memoryview, max_data_bytes: int, shape: Tuple[in ) # `_data` will be a `memoryview` if `frombuffer` is called. + if isinstance(self._data, memoryview): + self._data = bytearray(self._data) # note: incoming_num_bytes can be 0 (empty sample) - self._data.append(buffer) - self._num_data_bytes += len(buffer) + self._data += buffer self.update_headers(incoming_num_bytes, shape) def update_headers(self, incoming_num_bytes: int, sample_shape: Tuple[int]): @@ -163,28 +109,14 @@ def update_headers(self, incoming_num_bytes: int, sample_shape: Tuple[int]): def __len__(self): """Calculates the number of bytes `tobytes` will be without having to call `tobytes`. 
Used by `LRUCache` to determine if this chunk can be cached.""" - return infer_chunk_num_bytes( - hub.__version__, - self.shapes_encoder.array, - self.byte_positions_encoder.array, - len_data=self.num_data_bytes, - ) + return infer_chunk_num_bytes(hub.__version__, self.shapes_encoder.array, self.byte_positions_encoder.array, len_data=len(self._data)) def tobytes(self) -> memoryview: - if self.num_samples == 0: - return memoryview(bytes()) - - return serialize_chunk( - hub.__version__, - self.shapes_encoder.array, - self.byte_positions_encoder.array, - self._data, - self.num_data_bytes, - ) + return serialize_chunk(hub.__version__, self.shapes_encoder.array, self.byte_positions_encoder.array, [self._data]) @classmethod - def frombuffer(cls, buffer: bytes) -> "Chunk": - if len(buffer) == 0: + def frombuffer(cls, buffer: bytes): + if not buffer: return cls() version, shapes, byte_positions, data = deserialize_chunk(buffer) return cls(shapes, byte_positions, data=data) From 0e5e84ce15cb9afdb70fbde51034e515d6bfa716 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 05:01:35 +0530 Subject: [PATCH 69/79] fixes --- hub/core/chunk.py | 15 ++++++++++-- hub/core/chunk_engine.py | 2 +- hub/core/meta/encode/chunk_id.py | 2 +- hub/core/serialize.py | 41 ++++++++++++++++++++++---------- 4 files changed, 43 insertions(+), 17 deletions(-) diff --git a/hub/core/chunk.py b/hub/core/chunk.py index 2ec4336f2c..7aa81db641 100644 --- a/hub/core/chunk.py +++ b/hub/core/chunk.py @@ -10,6 +10,7 @@ from hub.core.serialize import serialize_chunk, deserialize_chunk, infer_chunk_num_bytes + class Chunk(Cachable): def __init__( self, @@ -109,10 +110,20 @@ def update_headers(self, incoming_num_bytes: int, sample_shape: Tuple[int]): def __len__(self): """Calculates the number of bytes `tobytes` will be without having to call `tobytes`. 
Used by `LRUCache` to determine if this chunk can be cached.""" - return infer_chunk_num_bytes(hub.__version__, self.shapes_encoder.array, self.byte_positions_encoder.array, len_data=len(self._data)) + return infer_chunk_num_bytes( + hub.__version__, + self.shapes_encoder.array, + self.byte_positions_encoder.array, + len_data=len(self._data), + ) def tobytes(self) -> memoryview: - return serialize_chunk(hub.__version__, self.shapes_encoder.array, self.byte_positions_encoder.array, [self._data]) + return serialize_chunk( + hub.__version__, + self.shapes_encoder.array, + self.byte_positions_encoder.array, + [self._data], + ) @classmethod def frombuffer(cls, buffer: bytes): diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index 583868839f..52f94cccd4 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -372,7 +372,7 @@ def read_sample_from_chunk( shape = chunk.shapes_encoder[local_sample_index] sb, eb = chunk.byte_positions_encoder[local_sample_index] - buffer = chunk.view(sb, eb) + buffer = chunk.memoryview_data[sb:eb] if expect_compressed: sample = decompress_array(buffer, shape) else: diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 0f5f3cf6bd..4fe06eec40 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -74,7 +74,7 @@ def __init__(self): def tobytes(self) -> memoryview: if self._encoded_ids is None: - return b"" + return memoryview(b"") return serialize_chunkids(hub.__version__, [self._encoded_ids]) @staticmethod diff --git a/hub/core/serialize.py b/hub/core/serialize.py index f238ceac02..b33944d52e 100644 --- a/hub/core/serialize.py +++ b/hub/core/serialize.py @@ -61,20 +61,35 @@ def serialize_chunk( offset = 1 + len_version # Write shape info - flatbuff[offset : offset + 8] = np.array(shape_info.shape, dtype=np.int32).view( - np.byte - ) - offset += 8 - flatbuff[offset : offset + shape_info.nbytes] = shape_info.reshape(-1).view(np.byte) - offset += shape_info.nbytes + if shape_info.ndim == 1: + assert shape_info.nbytes == 0 + flatbuff[offset : offset + 8] = np.zeros(8, dtype=np.byte) + offset += 8 + else: + assert shape_info.ndim == 2 + flatbuff[offset : offset + 8] = np.array(shape_info.shape, dtype=np.int32).view( + np.byte + ) + offset += 8 + flatbuff[offset : offset + shape_info.nbytes] = shape_info.reshape(-1).view( + np.byte + ) + offset += shape_info.nbytes # Write byte positions - flatbuff[offset : offset + 4] = np.int32(byte_positions.shape[0]).view((np.byte, 4)) - offset += 4 - flatbuff[offset : offset + byte_positions.nbytes] = byte_positions.reshape(-1).view( - np.byte - ) - offset += byte_positions.nbytes + if byte_positions.ndim == 1: + assert byte_positions.nbytes == 0 + flatbuff[offset : offset + 4] = np.zeros(4, dtype=np.byte) + offset += 4 + else: + flatbuff[offset : offset + 4] = np.int32(byte_positions.shape[0]).view( + (np.byte, 4) + ) + offset += 4 + flatbuff[offset : offset + byte_positions.nbytes] = byte_positions.reshape( + -1 + ).view(np.byte) + offset += byte_positions.nbytes # Write actual data for byts in data: @@ -133,7 +148,7 @@ def deserialize_chunk( offset += byte_positions_nbytes # Read data - data = buff[offset:].copy() + data = memoryview(buff[offset:].tobytes()) return version, shape_info, byte_positions, data From 0ee485d58425004c8d50ce7be26ade9568abe6b7 Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Tue, 13 Jul 2021 16:33:25 -0700 Subject: [PATCH 70/79] add chunk size tests --- hub/api/tests/test_chunk_sizes.py | 122 
++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 hub/api/tests/test_chunk_sizes.py diff --git a/hub/api/tests/test_chunk_sizes.py b/hub/api/tests/test_chunk_sizes.py new file mode 100644 index 0000000000..da006bfdd5 --- /dev/null +++ b/hub/api/tests/test_chunk_sizes.py @@ -0,0 +1,122 @@ +import numpy as np +from hub.constants import KB +from hub.core.tests.common import parametrize_all_dataset_storages + + +def _update_chunk_sizes(ds, max_chunk_size: int): + """Updates all chunk sizes for tensors that already exist in `ds`. If + more tensors are created after calling this method, those tensors will NOT have + the same chunk size. + """ + + # TODO: set / update chunk sizes API (to replace this function) + + min_chunk_size = max_chunk_size // 2 + + for tensor in ds.tensors.values(): + chunk_engine = tensor.chunk_engine + + chunk_engine.max_chunk_size = max_chunk_size + chunk_engine.min_chunk_size = min_chunk_size + + +def _assert_num_chunks(tensor, expected_num_chunks): + chunk_engine = tensor.chunk_engine + actual_num_chunks = chunk_engine.chunk_id_encoder.num_chunks + assert actual_num_chunks == expected_num_chunks + + +def _create_tensors(ds): + images = ds.create_tensor("images", htype="image", sample_compression=None) + labels = ds.create_tensor("labels", htype="class_label") + return images, labels + + +def _append_tensors(images, labels): + for i in range(100): + x = np.ones((28, 28), dtype=np.uint8) * i + y = np.uint32(i) + + images.append(x) + labels.append(y) + + +def _extend_tensors(images, labels): + images.extend(np.ones((100, 28, 28), dtype=np.uint8)) + labels.extend(np.ones(100, dtype=np.uint32)) + + +@parametrize_all_dataset_storages +def test_append(ds): + images, labels = _create_tensors(ds) + _update_chunk_sizes(ds, 32 * KB) + + _append_tensors(images, labels) + + _assert_num_chunks(labels, 1) + _assert_num_chunks(images, 5) + + _append_tensors(images, labels) + + _assert_num_chunks(labels, 1) + _assert_num_chunks(images, 10) + + _append_tensors(images, labels) + + _assert_num_chunks(labels, 1) + _assert_num_chunks(images, 15) + + assert len(ds) == 300 + + +@parametrize_all_dataset_storages +def test_extend(ds): + images, labels = _create_tensors(ds) + + _update_chunk_sizes(ds, 32 * KB) + + _extend_tensors(images, labels) + + _assert_num_chunks(labels, 1) + _assert_num_chunks(images, 5) + + _extend_tensors(images, labels) + + _assert_num_chunks(labels, 1) + _assert_num_chunks(images, 10) + + _extend_tensors(images, labels) + + _assert_num_chunks(labels, 1) + _assert_num_chunks(images, 15) + + assert len(ds) == 300 + + +@parametrize_all_dataset_storages +def test_extend_and_append(ds): + images, labels = _create_tensors(ds) + + _update_chunk_sizes(ds, 32 * KB) + + _extend_tensors(images, labels) + + _assert_num_chunks(labels, 1) + _assert_num_chunks(images, 5) + + _append_tensors(images, labels) + + _assert_num_chunks(labels, 1) + _assert_num_chunks(images, 10) + + _extend_tensors(images, labels) + + _assert_num_chunks(labels, 1) + _assert_num_chunks(images, 15) + + _append_tensors(images, labels) + + _assert_num_chunks(labels, 1) + _assert_num_chunks(images, 20) + + assert len(ds) == 400 From d16d550108e59143287a8ef85024ff156ac4dc21 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 12:40:18 +0530 Subject: [PATCH 71/79] fixes --- hub/api/tests/test_api.py | 2 +- hub/core/meta/encode/chunk_id.py | 8 ++++-- hub/core/serialize.py | 48 +++++++++++++++++++++----------- 3 files changed, 38 insertions(+), 20 deletions(-) diff 
--git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 5bb9b75751..55fa85ddef 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -26,7 +26,7 @@ def test_persist_local(local_storage): ds.image.extend(np.ones((4, 224, 224, 3))) ds_new = Dataset(local_storage.root) - assert len(ds_new) == 4 + assert len(ds_new) == 4, (ds_new.image.chunk_engine.chunk_id_encoder._encoded_ids,) assert ds_new.image.shape == (4, 224, 224, 3) np.testing.assert_array_equal(ds_new.image.numpy(), np.ones((4, 224, 224, 3))) diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index 4fe06eec40..c6a16d609f 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -74,7 +74,7 @@ def __init__(self): def tobytes(self) -> memoryview: if self._encoded_ids is None: - return memoryview(b"") + return serialize_chunkids(hub.__version__, [np.array([], dtype=ENCODING_DTYPE)]) return serialize_chunkids(hub.__version__, [self._encoded_ids]) @staticmethod @@ -103,7 +103,8 @@ def frombuffer(cls, buffer: bytes): if not buffer: return instance version, ids = deserialize_chunkids(buffer) - instance._encoded_ids = ids + if ids.nbytes: + instance._encoded_ids = ids return instance @property @@ -116,7 +117,8 @@ def num_chunks(self) -> int: def num_samples(self) -> int: if self._encoded_ids is None: return 0 - return int(self._encoded_ids[-1, LAST_INDEX_INDEX] + 1) + return int(self._encoded_ids[-1, LAST_INDEX_INDEX]) + 1 + def generate_chunk_id(self) -> ENCODING_DTYPE: """Generates a random 64bit chunk ID using uuid4. Also prepares this ID to have samples registered to it. diff --git a/hub/core/serialize.py b/hub/core/serialize.py index b33944d52e..6ee63d6fc9 100644 --- a/hub/core/serialize.py +++ b/hub/core/serialize.py @@ -96,7 +96,13 @@ def serialize_chunk( n = len(byts) flatbuff[offset : offset + n] = np.frombuffer(byts, dtype=np.byte) offset += n - return memoryview(flatbuff.tobytes()) + ret = flatbuff.tobytes() + v, s, b, d = deserialize_chunk(ret) + assert v == version + np.testing.assert_array_equal(s, shape_info) + np.testing.assert_array_equal(b, byte_positions) + assert bytes(d) == bytes(data[0]) + return bytes(ret) def deserialize_chunk( @@ -127,25 +133,31 @@ def deserialize_chunk( shape_info_shape = buff[offset : offset + 8].view(np.int32) offset += 8 shape_info_nbytes = np.prod(shape_info_shape) * enc_dtype.itemsize - shape_info = ( - buff[offset : offset + shape_info_nbytes] - .view(enc_dtype) - .reshape(shape_info_shape) - .copy() - ) - offset += shape_info_nbytes + if shape_info_nbytes == 0: + shape_info = np.array([], dtype=enc_dtype) + else: + shape_info = ( + buff[offset : offset + shape_info_nbytes] + .view(enc_dtype) + .reshape(shape_info_shape) + .copy() + ) + offset += shape_info_nbytes # Read byte positions byte_positions_rows = buff[offset : offset + 4].view(np.int32)[0] offset += 4 byte_positions_nbytes = byte_positions_rows * 3 * enc_dtype.itemsize - byte_positions = ( - buff[offset : offset + byte_positions_nbytes] - .view(enc_dtype) - .reshape(byte_positions_rows, 3) - .copy() - ) - offset += byte_positions_nbytes + if byte_positions_nbytes == 0: + byte_positions = np.array([], dtype=enc_dtype) + else: + byte_positions = ( + buff[offset : offset + byte_positions_nbytes] + .view(enc_dtype) + .reshape(byte_positions_rows, 3) + .copy() + ) + offset += byte_positions_nbytes # Read data data = memoryview(buff[offset:].tobytes()) @@ -177,7 +189,11 @@ def serialize_chunkids(version: str, ids: Sequence[np.ndarray]) -> 
memoryview: flatbuff[offset : offset + arr.nbytes] = arr.view(np.byte).reshape(-1) offset += arr.nbytes - return memoryview(flatbuff.tobytes()) + ret = memoryview(flatbuff.tobytes()) + v, ids2 = deserialize_chunkids(ret) + assert v == version + np.testing.assert_array_equal(ids[0].reshape(-1, 2), ids2) + return ret def deserialize_chunkids(byts: Union[bytes, memoryview]) -> Tuple[str, np.ndarray]: From 1b0973a7474dee284220c1ddfb36b6bb2c7f8793 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 13:00:52 +0530 Subject: [PATCH 72/79] fixes --- hub/core/chunk_engine.py | 63 ++++++++++++++++++-------------- hub/core/meta/encode/chunk_id.py | 5 ++- hub/core/serialize.py | 14 +------ 3 files changed, 40 insertions(+), 42 deletions(-) diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index 52f94cccd4..11fe63e259 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -114,10 +114,8 @@ def __init__( # only the last chunk may be less than this self.min_chunk_size = self.max_chunk_size // 2 - self.get_chunk_id_encoder() - self.get_last_chunk() - - def get_chunk_id_encoder(self) -> ChunkIdEncoder: + @property + def chunk_id_encoder(self) -> ChunkIdEncoder: """Gets the chunk id encoder from cache, if one is not found it creates a blank encoder. For more information on what `ChunkIdEncoder` is used for, see the `__init__` docstring. @@ -130,40 +128,49 @@ def get_chunk_id_encoder(self) -> ChunkIdEncoder: """ key = get_chunk_id_encoder_key(self.key) - if key in self.cache: - self.chunk_id_encoder = self.cache.get_cachable(key, ChunkIdEncoder) + if not self.chunk_id_encoder_exists: - else: # 1 because we always update the meta information before writing the samples (to account for potentially corrupted data in the future) if self.tensor_meta.length > 1: raise CorruptedMetaError( f"Tensor length is {self.tensor_meta.length}, but could not find the chunk id encoder." 
) - self.chunk_id_encoder = ChunkIdEncoder() - self.cache[key] = self.chunk_id_encoder + enc = ChunkIdEncoder() + self.cache[key] = enc + return enc - return self.chunk_id_encoder + enc = self.cache.get_cachable(key, ChunkIdEncoder) + return enc + + @property + def chunk_id_encoder_exists(self) -> bool: + return get_chunk_id_encoder_key(self.key) in self.cache @property def num_chunks(self) -> int: + if not self.chunk_id_encoder_exists: + return 0 return self.chunk_id_encoder.num_chunks @property def num_samples(self) -> int: + if not self.chunk_id_encoder_exists: + return 0 return self.chunk_id_encoder.num_samples - def get_last_chunk(self) -> Optional[Chunk]: + @property + def last_chunk(self) -> Optional[Chunk]: if self.num_chunks == 0: - self._last_chunk = None - else: - last_chunk_name = self.chunk_id_encoder.get_name_for_chunk(-1) - last_chunk_key = get_chunk_key(self.key, last_chunk_name) + return None - self._last_chunk = self.cache.get_cachable(last_chunk_key, Chunk) - self._last_chunk.key = last_chunk_key + return self.cache.get_cachable(self.last_chunk_key, Chunk) - return self._last_chunk + @property + def last_chunk_key(self) -> str: + last_chunk_name = self.chunk_id_encoder.get_name_for_chunk(-1) + last_chunk_key = get_chunk_key(self.key, last_chunk_name) + return last_chunk_key @property def tensor_meta(self): @@ -196,9 +203,9 @@ def _append_bytes(self, buffer: memoryview, shape: Tuple[int], dtype: np.dtype): self.chunk_id_encoder.register_samples_to_last_chunk_id(num_samples) - last_chunk = self._last_chunk - key = last_chunk.key # type: ignore - self.cache.update_used_cache_for_path(key, len(last_chunk)) # type: ignore + # TODO implement tests for cache size compute + if self.last_chunk is not None: + self.cache[self.last_chunk_key] = self.last_chunk def _try_appending_to_last_chunk( self, buffer: memoryview, shape: Tuple[int] @@ -214,7 +221,7 @@ def _try_appending_to_last_chunk( bool: True if `buffer` was successfully written to the last chunk, otherwise False. """ - last_chunk = self._last_chunk + last_chunk = self.last_chunk if last_chunk is None: return False @@ -259,9 +266,8 @@ def _create_new_chunk(self): chunk_id = self.chunk_id_encoder.generate_chunk_id() chunk = Chunk() chunk_name = ChunkIdEncoder.name_from_id(chunk_id) - chunk.key = get_chunk_key(self.key, chunk_name) - self.cache[chunk.key] = chunk - self._last_chunk = chunk + chunk_key = get_chunk_key(self.key, chunk_name) + self.cache[chunk_key] = chunk return chunk def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): @@ -303,11 +309,11 @@ def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]): else: raise TypeError(f"Unsupported type for extending. 
Got: {type(samples)}") + self.cache.maybe_flush() + def append(self, sample: SampleValue): """Formats a single `sample` (compresseses/decompresses if applicable) and feeds it into `_append_bytes`.""" - self.get_last_chunk() - if isinstance(sample, Sample): # has to decompress to read the array's shape and dtype # might be able to optimize this away @@ -368,11 +374,12 @@ def read_sample_from_chunk( enc = self.chunk_id_encoder + buffer = chunk.memoryview_data local_sample_index = enc.get_local_sample_index(global_sample_index) shape = chunk.shapes_encoder[local_sample_index] sb, eb = chunk.byte_positions_encoder[local_sample_index] - buffer = chunk.memoryview_data[sb:eb] + buffer = buffer[sb:eb] if expect_compressed: sample = decompress_array(buffer, shape) else: diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py index c6a16d609f..fd6b0ecc41 100644 --- a/hub/core/meta/encode/chunk_id.py +++ b/hub/core/meta/encode/chunk_id.py @@ -74,7 +74,9 @@ def __init__(self): def tobytes(self) -> memoryview: if self._encoded_ids is None: - return serialize_chunkids(hub.__version__, [np.array([], dtype=ENCODING_DTYPE)]) + return serialize_chunkids( + hub.__version__, [np.array([], dtype=ENCODING_DTYPE)] + ) return serialize_chunkids(hub.__version__, [self._encoded_ids]) @staticmethod @@ -119,7 +121,6 @@ def num_samples(self) -> int: return 0 return int(self._encoded_ids[-1, LAST_INDEX_INDEX]) + 1 - def generate_chunk_id(self) -> ENCODING_DTYPE: """Generates a random 64bit chunk ID using uuid4. Also prepares this ID to have samples registered to it. This method should be called once per chunk created. diff --git a/hub/core/serialize.py b/hub/core/serialize.py index 6ee63d6fc9..1aa7958366 100644 --- a/hub/core/serialize.py +++ b/hub/core/serialize.py @@ -96,13 +96,7 @@ def serialize_chunk( n = len(byts) flatbuff[offset : offset + n] = np.frombuffer(byts, dtype=np.byte) offset += n - ret = flatbuff.tobytes() - v, s, b, d = deserialize_chunk(ret) - assert v == version - np.testing.assert_array_equal(s, shape_info) - np.testing.assert_array_equal(b, byte_positions) - assert bytes(d) == bytes(data[0]) - return bytes(ret) + return memoryview(flatbuff.tobytes()) def deserialize_chunk( @@ -189,11 +183,7 @@ def serialize_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: flatbuff[offset : offset + arr.nbytes] = arr.view(np.byte).reshape(-1) offset += arr.nbytes - ret = memoryview(flatbuff.tobytes()) - v, ids2 = deserialize_chunkids(ret) - assert v == version - np.testing.assert_array_equal(ids[0].reshape(-1, 2), ids2) - return ret + return memoryview(flatbuff.tobytes()) def deserialize_chunkids(byts: Union[bytes, memoryview]) -> Tuple[str, np.ndarray]: From 58954381d1d5f6fc4cd5d755298f24e0a1cb46d5 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 13:38:51 +0530 Subject: [PATCH 73/79] rem assert --- hub/api/tests/test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 55fa85ddef..5bb9b75751 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -26,7 +26,7 @@ def test_persist_local(local_storage): ds.image.extend(np.ones((4, 224, 224, 3))) ds_new = Dataset(local_storage.root) - assert len(ds_new) == 4, (ds_new.image.chunk_engine.chunk_id_encoder._encoded_ids,) + assert len(ds_new) == 4 assert ds_new.image.shape == (4, 224, 224, 3) np.testing.assert_array_equal(ds_new.image.numpy(), np.ones((4, 224, 224, 3))) From f15d71c43aa7ea6a56e6b07ede7bdf93d5087906 Mon 
Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 13:56:15 +0530 Subject: [PATCH 74/79] test chunk sizes on memds only --- hub/api/tests/test_chunk_sizes.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/hub/api/tests/test_chunk_sizes.py b/hub/api/tests/test_chunk_sizes.py index da006bfdd5..2a767bc1f5 100644 --- a/hub/api/tests/test_chunk_sizes.py +++ b/hub/api/tests/test_chunk_sizes.py @@ -1,6 +1,5 @@ import numpy as np from hub.constants import KB -from hub.core.tests.common import parametrize_all_dataset_storages def _update_chunk_sizes(ds, max_chunk_size: int): @@ -46,8 +45,8 @@ def _extend_tensors(images, labels): labels.extend(np.ones(100, dtype=np.uint32)) -@parametrize_all_dataset_storages -def test_append(ds): +def test_append(memory_ds): + ds = memory_ds images, labels = _create_tensors(ds) _update_chunk_sizes(ds, 32 * KB) @@ -69,8 +68,8 @@ def test_append(ds): assert len(ds) == 300 -@parametrize_all_dataset_storages -def test_extend(ds): +def test_extend(memory_ds): + ds = memory_ds images, labels = _create_tensors(ds) _update_chunk_sizes(ds, 32 * KB) @@ -93,8 +92,8 @@ def test_extend(ds): assert len(ds) == 300 -@parametrize_all_dataset_storages -def test_extend_and_append(ds): +def test_extend_and_append(memory_ds): + ds = memory_ds images, labels = _create_tensors(ds) _update_chunk_sizes(ds, 32 * KB) From 0d7e9f257e9e75f99ba9f25708915f0a27c7e98c Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 12:37:42 +0400 Subject: [PATCH 75/79] Update hub/core/serialize.py Co-authored-by: dyllan --- hub/core/serialize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/core/serialize.py b/hub/core/serialize.py index 1aa7958366..43f0128266 100644 --- a/hub/core/serialize.py +++ b/hub/core/serialize.py @@ -39,7 +39,7 @@ def serialize_chunk( data: Union[Sequence[bytes], Sequence[memoryview]], len_data: Optional[int] = None, ) -> memoryview: - """Serializes a chunk + """Serializes a chunk's headers and data into a single byte stream. This is how the chunk will be written to the storage provider. Args: version: (str) Version of hub library. From 8310e36a2c903a90581729c4be354870b1094b2b Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 12:37:47 +0400 Subject: [PATCH 76/79] Update hub/core/serialize.py Co-authored-by: dyllan --- hub/core/serialize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/core/serialize.py b/hub/core/serialize.py index 43f0128266..b295663920 100644 --- a/hub/core/serialize.py +++ b/hub/core/serialize.py @@ -160,7 +160,7 @@ def deserialize_chunk( def serialize_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: - """Serializes chunk ids + """Serializes chunk ID encoders into a single byte stream. This is how the encoders will be written to the storage provider. Args: version: (str) Version of hub library. 
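Taken together, these docstring patches describe one recurring layout: a small ASCII version header followed by flat encoder rows. The model below is a simplified, self-contained sketch of that idea rather than a byte-exact copy of hub's wire format (the real functions use hub's `ENCODING_DTYPE` and, for chunks, additional shape headers); it also round-trips the empty-encoder case that patch 71 above guards against.

    import numpy as np

    ENC_DTYPE = np.uint32  # assumption: stand-in for hub's ENCODING_DTYPE

    def pack_ids(version: str, ids: np.ndarray) -> bytes:
        # [1-byte version length][ascii version][(chunk_id, last_index) rows]
        return bytes([len(version)]) + version.encode("ascii") + ids.astype(ENC_DTYPE).tobytes()

    def unpack_ids(buff: bytes):
        n = buff[0]
        version = buff[1 : 1 + n].decode("ascii")
        ids = np.frombuffer(buff[1 + n :], dtype=ENC_DTYPE).reshape(-1, 2)
        return version, ids

    v, ids = unpack_ids(pack_ids("2.0.0", np.array([[1234, 99]], dtype=ENC_DTYPE)))
    assert v == "2.0.0" and ids.tolist() == [[1234, 99]]
    v, empty = unpack_ids(pack_ids("2.0.0", np.empty((0, 2), dtype=ENC_DTYPE)))
    assert empty.size == 0  # an empty table still deserializes cleanly
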
From 3a8ccc840b7c8570e9ad5d7dffe3efdc00cafc00 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 12:37:53 +0400 Subject: [PATCH 77/79] Update hub/core/serialize.py Co-authored-by: dyllan --- hub/core/serialize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/core/serialize.py b/hub/core/serialize.py index b295663920..bd2f86f181 100644 --- a/hub/core/serialize.py +++ b/hub/core/serialize.py @@ -187,7 +187,7 @@ def serialize_chunkids(version: str, ids: Sequence[np.ndarray]) -> memoryview: def deserialize_chunkids(byts: Union[bytes, memoryview]) -> Tuple[str, np.ndarray]: - """Deserializes chunk ids + """Deserializes a chunk ID encoder from the serialized byte stream. This is how the encoder can be accessed/modified after it is read from storage. Args: byts: (bytes) Serialized chunk ids. From d9a846b89852190559d1025672dfc29e8de8925f Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 12:38:00 +0400 Subject: [PATCH 78/79] Update hub/core/serialize.py Co-authored-by: dyllan --- hub/core/serialize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/core/serialize.py b/hub/core/serialize.py index bd2f86f181..71c80a9a19 100644 --- a/hub/core/serialize.py +++ b/hub/core/serialize.py @@ -102,7 +102,7 @@ def serialize_chunk( def deserialize_chunk( byts: Union[bytes, memoryview] ) -> Tuple[str, np.ndarray, np.ndarray, memoryview]: - """Deserializes a chunk + """Deserializes a chunk from the serialized byte stream. This is how the chunk can be accessed/modified after it is read from storage. Args: byts: (bytes) Serialized chunk. From 8c3b83b94a535da4c9b298c2c44d9c3f5985b1b5 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 14 Jul 2021 14:29:30 +0530 Subject: [PATCH 79/79] rem assertions --- hub/core/serialize.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/hub/core/serialize.py b/hub/core/serialize.py index 1aa7958366..b2bd03c5a6 100644 --- a/hub/core/serialize.py +++ b/hub/core/serialize.py @@ -25,8 +25,6 @@ def infer_chunk_num_bytes( """ # NOTE: Assumption: version string contains ascii characters only (ord(c) < 128) # NOTE: Assumption: len(version) < 256 - assert len(version) < 256 - assert max((map(ord, version))) < 128 if len_data is None: len_data = sum(map(len, data)) # type: ignore return len(version) + shape_info.nbytes + byte_positions.nbytes + len_data + 13 @@ -62,11 +60,9 @@ def serialize_chunk( # Write shape info if shape_info.ndim == 1: - assert shape_info.nbytes == 0 flatbuff[offset : offset + 8] = np.zeros(8, dtype=np.byte) offset += 8 else: - assert shape_info.ndim == 2 flatbuff[offset : offset + 8] = np.array(shape_info.shape, dtype=np.int32).view( np.byte ) @@ -78,7 +74,6 @@ def serialize_chunk( # Write byte positions if byte_positions.ndim == 1: - assert byte_positions.nbytes == 0 flatbuff[offset : offset + 4] = np.zeros(4, dtype=np.byte) offset += 4 else:
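
A closing note on the encoder all this serialization work feeds: after the simplification at the top of this series, finding which chunk holds a sample is a single binary search over the encoder's last-index column. A minimal standalone sketch of that lookup follows (the table values are made up; the column constants mirror the `CHUNK_ID_INDEX` and `LAST_INDEX_INDEX` names used above):

    import numpy as np

    CHUNK_ID_INDEX, LAST_INDEX_INDEX = 0, 1

    # Each row: (chunk id, last global sample index stored in that chunk).
    encoded_ids = np.array([[111, 4], [222, 9], [333, 14]], dtype=np.uint32)

    def get_chunk_id(sample_index: int) -> int:
        # The first chunk whose last index is >= sample_index holds the sample.
        idx = np.searchsorted(encoded_ids[:, LAST_INDEX_INDEX], sample_index)
        return int(encoded_ids[idx, CHUNK_ID_INDEX])

    assert get_chunk_id(0) == 111 and get_chunk_id(4) == 111
    assert get_chunk_id(5) == 222 and get_chunk_id(14) == 333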