
[2.0] writing/reading fixed-shape arrays to chunks #842

Merged
merged 99 commits
May 13, 2021
Changes from 80 commits
Commits
99 commits
57c0981
method skeleton
verbose-void May 5, 2021
6ec9e66
write chunks for array to memory provider (basic)
verbose-void May 5, 2021
379fa8e
start implementing chunk writing callstack
verbose-void May 5, 2021
e0bd95b
broke up `write` function into function calls
verbose-void May 5, 2021
b86ad9d
save index map with pickle to storage provider
verbose-void May 5, 2021
92bc409
very simple reading & writing to/from chunks
verbose-void May 5, 2021
49bb3fc
new write.py (not done) with better structure
verbose-void May 6, 2021
4364b26
write index map inside `write_array`
verbose-void May 6, 2021
c4c3a2a
rename functions more precisely (chunk_and_write...)
verbose-void May 6, 2021
812b6f1
add some batched tests
verbose-void May 6, 2021
9d5a138
started batched writing
verbose-void May 6, 2021
2e2cf19
require >= 1 cache & don't compress incomplete chunks (also mark them)
verbose-void May 6, 2021
96e5fc3
remove `no_cache` tests
verbose-void May 6, 2021
4e4cc9f
read from cache before storage & check for _incomplete chunks
verbose-void May 6, 2021
4369cde
add `normalize_and_batchify` util with tests
verbose-void May 6, 2021
fa427a5
use `normalize_and_batchify_shape` & update params
verbose-void May 6, 2021
1f40bcb
use "bytes_left_in_last_chunk` instead of `last_chunk_num_bytes`
verbose-void May 7, 2021
730120d
move cache/storage functions to `storage_chain.py`
verbose-void May 7, 2021
8d95e4a
some comments for @abhinav
verbose-void May 7, 2021
32a5aa9
removed all caching code
verbose-void May 7, 2021
eeb8040
make dummy compressors (chunk-wise/sample-wise)
verbose-void May 7, 2021
ac91aec
merged write_bytes & write_array to get logic working
verbose-void May 7, 2021
8b4b12b
works for single-chunk
verbose-void May 7, 2021
0d58bc4
move getting meta to another function
verbose-void May 7, 2021
aec1602
move index_map operations to index_map.py
verbose-void May 7, 2021
85d3e5a
moved tests into separate files (fixed/dynamic) & implement them fully
verbose-void May 8, 2021
9c79d86
remove redundant tests & change var names to be shorter
verbose-void May 8, 2021
57c6919
TODO
verbose-void May 8, 2021
151b3cf
add assertion message for failure case
verbose-void May 8, 2021
2162fd1
Merge branch 'release/2.0' of github.com:activeloopai/Hub into featur…
verbose-void May 8, 2021
ac8df10
add dtypes for random arrays
verbose-void May 8, 2021
fe1dbcd
mapper yield from mapper (instead of .items())
verbose-void May 9, 2021
fcf1e43
use actual memoryprovider
verbose-void May 9, 2021
24a841f
add compression / storage provider class in vars
verbose-void May 9, 2021
bcb2fed
remove unused imports
verbose-void May 9, 2021
cb39ef0
uncommented 5 batch param
verbose-void May 9, 2021
abae085
create storage provider in util
verbose-void May 9, 2021
5775c80
formatted
verbose-void May 9, 2021
32f2dc3
Merge branch 'release/2.0' of github.com:activeloopai/Hub into featur…
verbose-void May 9, 2021
a36a199
clear storage before test (in case last one failed)
verbose-void May 9, 2021
7d3ecc5
start adding multi-batch support
verbose-void May 10, 2021
2055644
moved meta validation & updating to functions within meta.py
verbose-void May 10, 2021
fda38d1
removed write_impl, reduce test shape sizes
verbose-void May 10, 2021
8234870
rewriting chunk array method & deferring more logic to
verbose-void May 10, 2021
733603a
reduce scope (no appending) & limit tests to this scope
verbose-void May 10, 2021
b5971a7
remove dead code
verbose-void May 10, 2021
6ba4f27
TODOs & remove dead code
verbose-void May 10, 2021
1ab17e2
add write_array docstring, create some general functions & typing
verbose-void May 10, 2021
07bb887
keep track of min/max shape for tensors & validate meta in tests
verbose-void May 10, 2021
8f63e92
`read_array` & accept slice as argument
verbose-void May 10, 2021
bc686b7
allow slice to be None for reading & return all samples
verbose-void May 10, 2021
7d19b7b
remove generator of decompressed chunks
verbose-void May 10, 2021
7841a0c
fixed typing issues
verbose-void May 10, 2021
b1116c2
default value isn't None, but is slice (for read index)
verbose-void May 10, 2021
fa16cc7
moved tests/util -> tests/common
verbose-void May 10, 2021
dbe62cb
get random chunk name
verbose-void May 10, 2021
c0f3111
rename generator -> chunker (also methods) & add join_chunks tests
verbose-void May 10, 2021
2922f3c
read_array returns in docstring & added join_chunks docstring
verbose-void May 10, 2021
36e6f7d
returns docstring
verbose-void May 10, 2021
831f3ea
docstrings, variable names, & removed index/meta modules
verbose-void May 11, 2021
065f98b
removed references to deleted modules
verbose-void May 11, 2021
fceaa5d
validate meta
verbose-void May 11, 2021
c0ba66d
remove unused import
verbose-void May 11, 2021
d33f311
update read docstring
verbose-void May 11, 2021
fa1b8c1
validate index_map incomplete chunks
verbose-void May 11, 2021
8d88fc4
add tests for validating actual chunk_sizes & incomplete_chunks
verbose-void May 11, 2021
1365527
fixed typing
verbose-void May 11, 2021
11cff64
move constants, delete unused tests, fixed test case
verbose-void May 11, 2021
f20683f
change the way chunk_sizes are being asserted
verbose-void May 11, 2021
20d4acf
extend last chunk if it is incomplete
verbose-void May 11, 2021
81d81a8
added join_chunks extra test case & pass in join_chunks
verbose-void May 11, 2021
ee24503
remove print
verbose-void May 11, 2021
4fdf386
raise ValueError instead of assert False
verbose-void May 11, 2021
1c972c5
fixed mypy types for last_chunk/chunk_name
verbose-void May 11, 2021
2208911
update docstrings
verbose-void May 11, 2021
8399fb2
Merge branch 'release/2.0' of github.com:activeloopai/Hub into featur…
verbose-void May 12, 2021
1968f9d
write pytest-benchmarks for 1GB write/read
verbose-void May 12, 2021
46b8b2e
moved get_random_array to util & write tests, also write/read bench
verbose-void May 12, 2021
5da1e1f
skip benchmarks when running `pytest .`
verbose-void May 12, 2021
2ca88f9
use --benchmark-enable in circleci (override setup.cfg)
verbose-void May 12, 2021
84c5f2e
print array GBs
verbose-void May 12, 2021
56c7080
add static typing for tests/utils
verbose-void May 12, 2021
d61e78a
benchmark 16MB only
verbose-void May 12, 2021
d95fe38
separate benchmark runs from testing runs
verbose-void May 12, 2021
74ad008
broke `write_array` into 2 functions & added docstrings/minor TODOs
verbose-void May 12, 2021
bf3cecb
expose core to user
verbose-void May 13, 2021
3910d7c
remove core exposure
verbose-void May 13, 2021
c80f0b7
Merge branch 'release/2.0' of github.com:activeloopai/Hub into featur…
verbose-void May 13, 2021
1db82de
fixed list failure when key empty
verbose-void May 13, 2021
015be2d
run tests with s3 provider & clear only memory providers
verbose-void May 13, 2021
23ec487
fixed benchmark error
verbose-void May 13, 2021
54eaac7
ignore pytest_cases for mypy & add static types
verbose-void May 13, 2021
eca8355
don't clear memory after writing for benchmarks
verbose-void May 13, 2021
36b1de7
don't clear after read benchmark
verbose-void May 13, 2021
07e0e68
don't use extend, use b"".join(...)
verbose-void May 13, 2021
0d0a40a
move get_random_array -> tests/common.py
verbose-void May 13, 2021
a2765c4
remove &
verbose-void May 13, 2021
cf833dd
added args for _get_last_chunk docstring
verbose-void May 13, 2021
02c87b6
add TODO
verbose-void May 13, 2021
4 changes: 2 additions & 2 deletions .circleci/config.yml
@@ -156,15 +156,15 @@ commands:
command: |
$Env:GOOGLE_APPLICATION_CREDENTIALS = $Env:CI_GCS_PATH
setx /m GOOGLE_APPLICATION_CREDENTIALS "$Env:GOOGLE_APPLICATION_CREDENTIALS"
python3 -m pytest --cov-report=xml --cov=./ --benchmark-autosave
python3 -m pytest --cov-report=xml --cov=./ --benchmark-enable --benchmark-autosave
- when:
condition: << parameters.unix-like >>
steps:
- run:
name: "Running tests - Unix"
command: |
export GOOGLE_APPLICATION_CREDENTIALS=$HOME/.secrets/gcs.json
python3 -m pytest --cov-report=xml --cov=./ --benchmark-autosave
python3 -m pytest --cov-report=xml --cov=./ --benchmark-enable --benchmark-autosave
codecov-upload:
steps:
- codecov/upload:
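The CI change above appends `--benchmark-enable` so that benchmarks run in CircleCI even though they are skipped locally (per the "skip benchmarks when running `pytest .`" commit). A sketch of the config pairing this implies — the exact `setup.cfg` contents are an assumption, not shown in this diff:

```ini
# setup.cfg (assumed): disable pytest-benchmark by default for local runs
[tool:pytest]
addopts = --benchmark-disable

# CI invocation overrides the default:
#   python3 -m pytest --cov-report=xml --cov=./ --benchmark-enable --benchmark-autosave
```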
3 changes: 3 additions & 0 deletions hub/constants.py
@@ -1 +1,4 @@
BYTE_PADDING = b"\0"
CHUNKS_FOLDER = "chunks"
META_FILENAME = "meta.json"
INDEX_MAP_FILENAME = "index_map.json"
3 changes: 3 additions & 0 deletions hub/core/chunk_engine/__init__.py
@@ -0,0 +1,3 @@
from .chunker import generate_chunks, join_chunks
from .write import write_array
from .read import read_array
99 changes: 99 additions & 0 deletions hub/core/chunk_engine/chunker.py
@@ -0,0 +1,99 @@
import numpy as np
from typing import Generator, Optional, List

from hub.util.exceptions import ChunkSizeTooSmallError


def generate_chunks(
content_bytes: bytes,
chunk_size: int,
bytes_left_in_last_chunk: int = 0,
) -> Generator[bytes, None, None]:
"""Generator function that chunks bytes.

Chunking is the process of breaking up the input `content_bytes` into a sequence of smaller bytes objects called "chunks".
Each chunk has a size of at most `chunk_size` bytes.

Example:
content_bytes = b"1234567890123"
chunk_size = 4
yields:
b"1234"
b"5678"
b"9012"
b"3"

Args:
content_bytes (bytes): Bytes object with the data to be chunked.
chunk_size (int): Each individual chunk will be assigned this many bytes maximum.
bytes_left_in_last_chunk (int): If chunks were created previously, this should be set to
`chunk_size - len(last_chunk)`. This way, the generator's first output will contain
exactly enough bytes to fill that last chunk up to `chunk_size`.

Yields:
bytes: Chunk of the `content_bytes`. Will have length on the interval [1, `chunk_size`].

Raises:
ChunkSizeTooSmallError: If `chunk_size` <= 0
ValueError: If `bytes_left_in_last_chunk` < 0
"""

# validate inputs
if chunk_size <= 0:
raise ChunkSizeTooSmallError()
if bytes_left_in_last_chunk < 0:
raise ValueError("Bytes left in last chunk must be >= 0.")
if len(content_bytes) <= 0:
return

# yield the remainder of the last chunk (as specified by `bytes_left_in_last_chunk`)
total_bytes_yielded = 0
if bytes_left_in_last_chunk > 0:
chunk = content_bytes[:bytes_left_in_last_chunk]
yield chunk
total_bytes_yielded += bytes_left_in_last_chunk

# yield all new chunks
while total_bytes_yielded < len(content_bytes):
end = total_bytes_yielded + chunk_size
chunk = content_bytes[total_bytes_yielded:end]

yield chunk
total_bytes_yielded += len(chunk)


def join_chunks(chunks: List[bytes], start_byte: int, end_byte: int) -> bytes:
"""Given a list of bytes that represent sequential chunks, join them into one bytes object.
For more on chunking, see the `generate_chunks` function.

Example:
chunks = [b"123", b"456", b"789"]
start_byte = 1
end_byte = 2
returns:
b"2345678"

Args:
chunks (list[bytes]): Sequential list of bytes objects that represent chunks.
start_byte (int): The first chunk in the sequence will ignore the bytes before `start_byte`. If 0, all bytes are included.
end_byte (int): The last chunk in the sequence will ignore the bytes at and after `end_byte`. If `end_byte` equals the chunk's length, all bytes are included.

Notes:
Bytes are indexed using: chunk[start_byte:end_byte]. That is why `chunk[end_byte]` will not be included in `chunk[start_byte:end_byte]`.
If `len(chunks) == 1`, `start_byte`:`end_byte` will be applied to the same chunk (the first & last one).

Returns:
bytes: The chunks joined as one bytes object.
"""

joined_bytearray = bytearray()
for i, chunk in enumerate(chunks):
actual_start_byte, actual_end_byte = 0, len(chunk)

if i == 0:
actual_start_byte = start_byte
if i == len(chunks) - 1:
actual_end_byte = end_byte

joined_bytearray.extend(chunk[actual_start_byte:actual_end_byte])
return bytes(joined_bytearray)
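Taken together, the two helpers in this file round-trip: bytes chunked by `generate_chunks` can be rejoined losslessly by `join_chunks`. A minimal sketch — a standalone re-implementation mirroring the chunker logic above, not an import of the PR's module:

```python
from typing import Generator, List


def generate_chunks(
    content_bytes: bytes, chunk_size: int, bytes_left_in_last_chunk: int = 0
) -> Generator[bytes, None, None]:
    """Standalone mirror of the PR's `generate_chunks` (assumed same semantics)."""
    if chunk_size <= 0:
        raise ValueError("chunk_size must be > 0")
    if bytes_left_in_last_chunk < 0:
        raise ValueError("bytes_left_in_last_chunk must be >= 0")
    if not content_bytes:
        return
    total_bytes_yielded = 0
    if bytes_left_in_last_chunk > 0:
        # first, emit enough bytes to top off the previous (incomplete) chunk
        yield content_bytes[:bytes_left_in_last_chunk]
        total_bytes_yielded += bytes_left_in_last_chunk
    while total_bytes_yielded < len(content_bytes):
        chunk = content_bytes[total_bytes_yielded : total_bytes_yielded + chunk_size]
        yield chunk
        total_bytes_yielded += len(chunk)


def join_chunks(chunks: List[bytes], start_byte: int, end_byte: int) -> bytes:
    """Standalone mirror of the PR's `join_chunks`."""
    joined = bytearray()
    for i, chunk in enumerate(chunks):
        s = start_byte if i == 0 else 0
        e = end_byte if i == len(chunks) - 1 else len(chunk)
        joined.extend(chunk[s:e])
    return bytes(joined)


chunks = list(generate_chunks(b"1234567890123", chunk_size=4))
print(chunks)  # [b'1234', b'5678', b'9012', b'3']
# joining with start_byte=0 and end_byte=len(last chunk) restores the payload
assert join_chunks(chunks, 0, len(chunks[-1])) == b"1234567890123"
```

This matches the docstring examples: the final chunk may be shorter than `chunk_size`, and the `start_byte`/`end_byte` trimming only applies to the first and last chunks respectively.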
64 changes: 0 additions & 64 deletions hub/core/chunk_engine/generator.py

This file was deleted.

50 changes: 50 additions & 0 deletions hub/core/chunk_engine/read.py
@@ -0,0 +1,50 @@
import os
import pickle
import numpy as np

from .chunker import join_chunks
from .util import get_meta_key, get_index_map_key

from hub.core.typing import Provider


def read_array(
key: str,
storage: Provider,
array_slice: slice = slice(None),
) -> np.ndarray:
"""Read & join chunks into an array from storage.

Args:
key (str): Key for where the chunks, index_map, & meta are located in `storage` relative to its root.
storage (Provider): Provider for reading the chunks, index_map, & meta.
array_slice (slice): Slice that represents which samples to read. Defaults to a slice representing all samples.

Returns:
np.ndarray: Array containing the sample(s) in the `array_slice` slice.
"""

# TODO: don't use pickle
meta = pickle.loads(storage[get_meta_key(key)])
index_map = pickle.loads(storage[get_index_map_key(key)])

samples = []
for index_entry in index_map[array_slice]:
chunks = []
for chunk_name in index_entry["chunk_names"]:
chunk_key = os.path.join(key, "chunks", chunk_name)
chunk = storage[chunk_key]

chunks.append(chunk)

combined_bytes = join_chunks(
chunks,
index_entry["start_byte"],
index_entry["end_byte"],
)

out_array = np.frombuffer(combined_bytes, dtype=meta["dtype"])
samples.append(out_array.reshape(index_entry["shape"]))

return np.array(samples)
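The read path above is driven entirely by the index_map: each entry names the chunks holding one sample plus the byte range to keep. A sketch of that flow with plain dicts standing in for the storage provider — `join_chunks` is a mirror of the PR's helper, and the `entry`/`storage` names and key layout are illustrative, not the library's API:

```python
import numpy as np


def join_chunks(chunks, start_byte, end_byte):
    # same joining rule the read path relies on (mirror, not an import)
    joined = bytearray()
    for i, chunk in enumerate(chunks):
        s = start_byte if i == 0 else 0
        e = end_byte if i == len(chunks) - 1 else len(chunk)
        joined.extend(chunk[s:e])
    return bytes(joined)


# one sample: 3 int64 values = 24 bytes, stored across two chunks of <= 16 bytes
sample = np.arange(3, dtype="int64")
raw = sample.tobytes()
storage = {"chunks/c0": raw[:16], "chunks/c1": raw[16:]}  # stand-in for a Provider

# hypothetical index_map entry, shaped like the ones `read_array` consumes
entry = {"chunk_names": ["c0", "c1"], "start_byte": 0, "end_byte": 8, "shape": (3,)}

chunks = [storage["chunks/" + name] for name in entry["chunk_names"]]
combined = join_chunks(chunks, entry["start_byte"], entry["end_byte"])
out = np.frombuffer(combined, dtype="int64").reshape(entry["shape"])
print(out)  # [0 1 2]
```

Note how `shape` and `dtype` are metadata, not part of the chunk bytes — the chunks are opaque byte runs until the index_map and meta reassemble them.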
152 changes: 152 additions & 0 deletions hub/core/chunk_engine/tests/common.py
@@ -0,0 +1,152 @@
import numpy as np
import pickle

from hub.core.chunk_engine import write_array, read_array
from hub.core.chunk_engine.util import (
normalize_and_batchify_shape,
get_meta_key,
get_index_map_key,
get_chunk_key,
get_random_array,
)
from hub.core.storage import MappedProvider
from hub.core.typing import Provider

from typing import List, Tuple


TENSOR_KEY = "TEST_TENSOR"


STORAGE_PROVIDERS = (
MappedProvider(),
) # TODO: replace MappedProvider with MemoryProvider


CHUNK_SIZES = (
128,
4096,
16000000, # 16MB
)


DTYPES = (
"uint8",
"int64",
"float64",
"bool",
)


def get_min_shape(batch: np.ndarray) -> Tuple:
return tuple(np.minimum.reduce([sample.shape for sample in batch]))


def get_max_shape(batch: np.ndarray) -> Tuple:
return tuple(np.maximum.reduce([sample.shape for sample in batch]))


def assert_meta_is_valid(meta: dict, expected_meta: dict):
for k, v in expected_meta.items():
assert k in meta
assert v == meta[k]


def assert_chunk_sizes(key: str, index_map: List, chunk_size: int, storage: Provider):
incomplete_chunk_names = set()
complete_chunk_count = 0
total_chunks = 0
for i, entry in enumerate(index_map):
for j, chunk_name in enumerate(entry["chunk_names"]):
chunk_key = get_chunk_key(key, chunk_name)
chunk_length = len(storage[chunk_key])

# exceeding chunk_size is never acceptable
assert (
chunk_length <= chunk_size
), 'Chunk "%s" exceeded chunk_size=%i (got %i) @ [%i, %i].' % (
chunk_name,
chunk_size,
chunk_length,
i,
j,
)

if chunk_length < chunk_size:
incomplete_chunk_names.add(chunk_name)
if chunk_length == chunk_size:
complete_chunk_count += 1

total_chunks += 1

incomplete_chunk_count = len(incomplete_chunk_names)
assert (
incomplete_chunk_count <= 1
), "Incomplete chunk count should never exceed 1. Incomplete count: %i. Complete count: %i. Total: %i.\nIncomplete chunk names: %s" % (
incomplete_chunk_count,
complete_chunk_count,
total_chunks,
str(incomplete_chunk_names),
)


def run_engine_test(arrays, storage, batched, chunk_size):
storage.clear()

for i, a_in in enumerate(arrays):
write_array(
a_in,
TENSOR_KEY,
chunk_size,
storage,
batched=batched,
)

index_map_key = get_index_map_key(TENSOR_KEY)
index_map = pickle.loads(storage[index_map_key])

assert_chunk_sizes(TENSOR_KEY, index_map, chunk_size, storage)

# `write_array` implicitly normalizes/batchifies shape
a_in = normalize_and_batchify_shape(a_in, batched=batched)

a_out = read_array(TENSOR_KEY, storage)

meta_key = get_meta_key(TENSOR_KEY)
assert meta_key in storage, "Meta was not found."
meta = pickle.loads(storage[meta_key])

assert_meta_is_valid(
meta,
{
"chunk_size": chunk_size,
"length": a_in.shape[0],
"dtype": a_in.dtype.name,
"min_shape": get_min_shape(a_in),
"max_shape": get_max_shape(a_in),
},
)

assert np.array_equal(a_in, a_out), "Array not equal @ batch_index=%i." % i

storage.clear()


def benchmark_write(arrays, chunk_size, storage, batched, clear_after_write=True):
storage.clear()

for a_in in arrays:
write_array(
a_in,
TENSOR_KEY,
chunk_size,
storage,
batched=batched,
)

if clear_after_write:
storage.clear()


def benchmark_read(storage):
read_array(TENSOR_KEY, storage)
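The `get_min_shape`/`get_max_shape` helpers in this test file compute the per-dimension extremes across a batch, which the meta tracks (`min_shape`/`max_shape`) to describe dynamically-shaped tensors. A small usage sketch — the list-of-arrays batch here is illustrative:

```python
import numpy as np


def get_min_shape(batch):
    # per-dimension minimum across all sample shapes in the batch
    return tuple(np.minimum.reduce([sample.shape for sample in batch]))


def get_max_shape(batch):
    # per-dimension maximum across all sample shapes in the batch
    return tuple(np.maximum.reduce([sample.shape for sample in batch]))


batch = [np.zeros((2, 5)), np.zeros((3, 4))]
min_s = get_min_shape(batch)  # elementwise min of (2, 5) and (3, 4) -> equals (2, 4)
max_s = get_max_shape(batch)  # elementwise max -> equals (3, 5)
```

When every sample has the same shape (the fixed-shape case this PR targets), `min_shape == max_shape`.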