From 175ac3ba05316e89bb76750afc6c1c95c87310fa Mon Sep 17 00:00:00 2001
From: benchislett
Date: Thu, 3 Jun 2021 22:45:09 -0400
Subject: [PATCH 1/8] Various API changes

- Tensor metadata can now be initialized without data:
- - tensor_meta_from_array is split into default_tensor_meta and
    update_tensor_meta_with_array
- Remove Dataset.__setattr__ and the DATASET_RESERVED_ATTRIBUTES
  bookkeeping, since __setitem__ will no longer be used
- Add a meta @property to Dataset, plus setters for both Dataset.meta
  and Tensor.meta
- Add Dataset.create_tensor, which does not depend on any data being added
- Split Tensor.append into append and extend
- Remove Tensor.__setitem__, preferring append/extend until update is supported
- Resolve minor mypy typing issues
---
 hub/api/dataset.py                            | 65 +++++++------------
 hub/api/tensor.py                             | 59 ++++++++++-------
 hub/api/tests/test_api.py                     | 31 ++++++---
 .../tests/test_benchmark_chunk_engine.py      |  6 +-
 hub/core/compression/webp.py                  |  6 +-
 hub/core/meta/tensor_meta.py                  | 23 +++----
 hub/core/tensor.py                            | 23 +++----
 hub/core/tests/common.py                      |  7 +-
 hub/core/tests/test_tensor_failures.py        | 10 +--
 9 files changed, 122 insertions(+), 108 deletions(-)

diff --git a/hub/api/dataset.py b/hub/api/dataset.py
index 7a073e1327..5b2c039039 100644
--- a/hub/api/dataset.py
+++ b/hub/api/dataset.py
@@ -8,10 +8,12 @@
 from hub.core.tensor import tensor_exists
 from hub.core.dataset import dataset_exists
 from hub.core.meta.dataset_meta import read_dataset_meta, write_dataset_meta
-from hub.core.meta.tensor_meta import tensor_meta_from_array
+from hub.core.meta.tensor_meta import default_tensor_meta
 from hub.core.typing import StorageProvider
 from hub.util.index import Index
+
+from hub.constants import DEFAULT_CHUNK_SIZE
 from hub.util.path import provider_from_path
 from hub.util.exceptions import (
     InvalidKeyTypeError,
@@ -21,9 +23,6 @@
 )
 from hub.util.path import provider_from_path
 
-# Used to distinguish between attributes and items (tensors)
-DATASET_RESERVED_ATTRIBUTES = ["path", "mode", "index", "provider", "tensors"]
-
 
 class Dataset:
     def __init__(
@@ -35,12 +34,6 @@ def __init__(
     ):
         """Initialize a new or existing dataset.
 
-        Note:
-            Entries of `DATASET_RESERVED_ATTRIBUTES` cannot be used as tensor names.
-            This is to distinguish between attributes (like `ds.mode`) and tensors.
-
-            Be sure to keep `DATASET_RESERVED_ATTRIBUTES` up-to-date when changing this class.
-
         Args:
            path (str): The location of the dataset. Used to initialize the storage provider.
            mode (str): Mode in which the dataset is opened.
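
The user-visible effect of this patch: implicit tensor creation via
attribute assignment (`ds.image = array`) is replaced by an explicit
create-then-write flow. A minimal sketch of the new usage, assuming a
local path (the import path mirrors the module edited here; the tensor
name and path are illustrative):

    import numpy as np
    from hub.api.dataset import Dataset

    ds = Dataset("/tmp/example_ds")         # storage provider inferred from path
    ds.create_tensor("image")               # replaces `ds.image = array`
    ds.image.extend(np.ones((4, 28, 28)))   # batched write; axis 0 = samples
    ds.image.append(np.ones((28, 28)))      # single-sample write
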
@@ -67,11 +60,10 @@ def __init__( self.tensors: Dict[str, Tensor] = {} if dataset_exists(self.provider): - ds_meta = read_dataset_meta(self.provider) - for tensor_name in ds_meta["tensors"]: + for tensor_name in self.meta["tensors"]: self.tensors[tensor_name] = Tensor(tensor_name, self.provider) else: - write_dataset_meta(self.provider, {"tensors": []}) + self.meta = {"tensors": []} def __len__(self): """Return the greatest length of tensors""" @@ -89,43 +81,36 @@ def __getitem__(self, item: Union[str, int, slice, Index]): else: raise InvalidKeyTypeError(item) - def __setitem__(self, item: Union[slice, str], value): - if isinstance(item, str): - tensor_key = item - - if tensor_exists(tensor_key, self.provider): - raise TensorAlreadyExistsError(tensor_key) - - if isinstance(value, np.ndarray): - tensor_meta = tensor_meta_from_array(value, batched=True) + def create_tensor( + self, name, chunk_size: int = DEFAULT_CHUNK_SIZE, dtype: str = "float64" + ): + if tensor_exists(name, self.provider): + raise TensorAlreadyExistsError(name) - ds_meta = read_dataset_meta(self.provider) - ds_meta["tensors"].append(tensor_key) - write_dataset_meta(self.provider, ds_meta) + ds_meta = self.meta + ds_meta["tensors"].append(name) + self.meta = ds_meta - tensor = Tensor(tensor_key, self.provider, tensor_meta=tensor_meta) - self.tensors[tensor_key] = tensor - tensor.append(value, batched=True) + tensor_meta = default_tensor_meta(chunk_size, dtype) + tensor = Tensor(name, self.provider, tensor_meta=tensor_meta) + self.tensors[name] = tensor - return tensor - else: - raise UnsupportedTensorTypeError(item) - else: - raise InvalidKeyTypeError(item) + return tensor __getattr__ = __getitem__ - def __setattr__(self, name: str, value): - """Set the named attribute on the dataset""" - if name in DATASET_RESERVED_ATTRIBUTES: - return super().__setattr__(name, value) - else: - return self.__setitem__(name, value) - def __iter__(self): for i in range(len(self)): yield self[i] + @property + def meta(self): + return read_dataset_meta(self.provider) + + @meta.setter + def meta(self, new_meta: dict): + write_dataset_meta(self.provider, new_meta) + @staticmethod def from_path(path: str): """Create a local hub dataset from unstructured data. diff --git a/hub/api/tensor.py b/hub/api/tensor.py index 46d105e109..3cd8309091 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -8,6 +8,7 @@ add_samples_to_tensor, read_samples_from_tensor, read_tensor_meta, + write_tensor_meta, tensor_exists, ) from hub.core.typing import StorageProvider @@ -58,19 +59,45 @@ def __init__( create_tensor(self.key, self.provider, tensor_meta) - def append(self, array: np.ndarray, batched: bool): - # TODO: split into `append`/`extend` - add_samples_to_tensor( - array, - self.key, - storage=self.provider, - batched=batched, - ) + def extend(self, array: np.ndarray): + """Extend tensor by appending elements from a batched numpy array. + + Example: + >>> len(image) + 0 + >>> image.extend(np.zeros((100, 28, 28, 1))) + >>> len(image) + 100 + + Args: + array (np.ndarray): The data to add to the tensor. + The primary axis should be the number of samples to add. + """ + add_samples_to_tensor(array, self.key, storage=self.provider, batched=True) + + def append(self, array: np.ndarray): + """Append a sample to the end of the tensor. + + Example: + >>> len(image) + 0 + >>> image.append(np.zeros((28, 28, 1))) + >>> len(image) + 1 + + Args: + array (np.ndarray): The data to add to the tensor. 
+ """ + add_samples_to_tensor(array, self.key, storage=self.provider, batched=False) @property def meta(self): return read_tensor_meta(self.key, self.provider) + @meta.setter + def meta(self, new_meta: dict): + write_tensor_meta(self.key, self.provider, new_meta) + @property def shape(self): # TODO: when dynamic arrays are supported, handle `min_shape != max_shape` (right now they're always equal) @@ -84,21 +111,7 @@ def __getitem__(self, item: Union[int, slice, Index]): return Tensor(self.key, self.provider, index=self.index[item]) def __setitem__(self, item: Union[int, slice], value: np.ndarray): - sliced_self = self[item] - if sliced_self.index.item != slice(None): - raise NotImplementedError( - "Assignment to Tensor subsections not currently supported!" - ) - else: - if tensor_exists(self.key, self.provider): - raise TensorAlreadyExistsError(self.key) - - add_samples_to_tensor( - array=value, - key=self.key, - storage=self.provider, - batched=True, - ) + raise NotImplementedError("Tensor update not currently supported!") def __iter__(self): for i in range(len(self)): diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 39776057da..fbf4a0af9b 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -12,7 +12,8 @@ def test_persist_local(local_storage): pytest.skip() ds = Dataset(local_storage.root) - ds.image = np.ones((4, 4096, 4096)) + ds.create_tensor("image") + ds.image.extend(np.ones((4, 4096, 4096))) ds_new = Dataset(local_storage.root) assert len(ds_new) == 4 @@ -22,21 +23,32 @@ def test_persist_local(local_storage): @parametrize_all_dataset_storages def test_populate_dataset(ds): - assert read_dataset_meta(ds.provider) == {"tensors": []} - ds.image = np.ones((4, 28, 28)) - assert read_dataset_meta(ds.provider) == {"tensors": ["image"]} + assert ds.meta == {"tensors": []} + ds.create_tensor("image") + assert len(ds) == 0 + assert len(ds.image) == 0 + + ds.image.extend(np.ones((4, 28, 28))) assert len(ds) == 4 + assert len(ds.image) == 4 + + ds.image.append(np.ones((28, 28))) + assert len(ds.image) == 5 + + assert ds.meta == {"tensors": ["image"]} @parametrize_all_dataset_storages def test_compute_tensor(ds): - ds.image = np.ones((32, 28, 28)) + ds.create_tensor("image") + ds.image.extend(np.ones((32, 28, 28))) np.testing.assert_array_equal(ds.image.numpy(), np.ones((32, 28, 28))) @parametrize_all_dataset_storages def test_compute_tensor_slice(ds): - ds.image = np.vstack((np.arange(16),) * 8) + ds.create_tensor("image") + ds.image.extend(np.vstack((np.arange(16),) * 8)) sliced_data = ds.image[2:5].numpy() expected_data = np.vstack((np.arange(16),) * 3) @@ -46,8 +58,11 @@ def test_compute_tensor_slice(ds): @parametrize_all_dataset_storages def test_iterate_dataset(ds): labels = [1, 9, 7, 4] - ds.image = np.ones((4, 28, 28)) - ds.label = np.asarray(labels).reshape((4, 1)) + ds.create_tensor("image") + ds.create_tensor("label") + + ds.image.extend(np.ones((4, 28, 28))) + ds.label.extend(np.asarray(labels).reshape((4, 1))) for idx, sub_ds in enumerate(ds): img = sub_ds.image.numpy() diff --git a/hub/core/chunk_engine/tests/test_benchmark_chunk_engine.py b/hub/core/chunk_engine/tests/test_benchmark_chunk_engine.py index 4ea4fbcffc..6f18277d08 100644 --- a/hub/core/chunk_engine/tests/test_benchmark_chunk_engine.py +++ b/hub/core/chunk_engine/tests/test_benchmark_chunk_engine.py @@ -10,7 +10,7 @@ add_samples_to_tensor, create_tensor, ) -from hub.core.meta.tensor_meta import tensor_meta_from_array +from hub.core.meta.tensor_meta import 
default_tensor_meta
 from hub.core.tests.common import TENSOR_KEY
 from hub.core.typing import StorageProvider
 from hub.tests.common_benchmark import (
@@ -24,9 +24,7 @@ def single_benchmark_write(info, key, arrays, chunk_size, storage, batched):
 
     actual_key = "%s_%i" % (key, info["iteration"])
 
-    create_tensor(
-        actual_key, storage, tensor_meta_from_array(arrays[0], batched, chunk_size)
-    )
+    create_tensor(actual_key, storage, default_tensor_meta(chunk_size))
 
     for a_in in arrays:
         add_samples_to_tensor(
diff --git a/hub/core/compression/webp.py b/hub/core/compression/webp.py
index f7c50d786b..77f335ecb7 100644
--- a/hub/core/compression/webp.py
+++ b/hub/core/compression/webp.py
@@ -46,9 +46,9 @@ def encode_single_image(self, image: np.ndarray) -> bytes:
             Encoded data.
         """
         with BytesIO() as buffer:
-            image = Image.fromarray(image)
-            image = image.convert("RGB")
-            image.save(buffer, format=self.codec_id, quality=self.quality)
+            img = Image.fromarray(image)
+            img = img.convert("RGB")
+            img.save(buffer, format=self.codec_id, quality=self.quality)
         return buffer.getvalue()
 
     def decode_single_image(self, buf: bytes, image_shape: tuple) -> np.ndarray:
diff --git a/hub/core/meta/tensor_meta.py b/hub/core/meta/tensor_meta.py
index 0c7262bd10..a2868be607 100644
--- a/hub/core/meta/tensor_meta.py
+++ b/hub/core/meta/tensor_meta.py
@@ -17,18 +17,19 @@ def read_tensor_meta(key: str, storage: StorageProvider) -> dict:
     return pickle.loads(storage[get_tensor_meta_key(key)])
 
 
-def tensor_meta_from_array(
-    array: np.ndarray, batched: bool, chunk_size: int = DEFAULT_CHUNK_SIZE
+def default_tensor_meta(chunk_size: int = DEFAULT_CHUNK_SIZE, dtype: str = "float64"):
+    return {"chunk_size": chunk_size, "dtype": dtype, "length": 0}
+
+
+def update_tensor_meta_with_array(
+    tensor_meta: dict, array: np.ndarray, batched=False
 ) -> dict:
-    array = normalize_and_batchify_shape(array, batched=batched)
-
-    tensor_meta = {
-        "chunk_size": chunk_size,
-        "dtype": array.dtype.name,
-        "min_shape": tuple(array.shape[1:]),
-        "max_shape": tuple(array.shape[1:]),
-        # TODO: add entry in meta for which tobytes function is used and handle mismatch versions for this
-    }
+    shape = array.shape
+    if batched:
+        shape = shape[1:]
+    tensor_meta["dtype"] = str(array.dtype)
+    tensor_meta["min_shape"] = shape
+    tensor_meta["max_shape"] = shape
 
     return tensor_meta
diff --git a/hub/core/tensor.py b/hub/core/tensor.py
index 216f38747f..8c99b2a79a 100644
--- a/hub/core/tensor.py
+++ b/hub/core/tensor.py
@@ -6,6 +6,7 @@
 from hub.core.meta.tensor_meta import (
     read_tensor_meta,
     write_tensor_meta,
+    update_tensor_meta_with_array,
     validate_tensor_meta,
 )
 from hub.core.meta.index_map import read_index_map, write_index_map
@@ -37,9 +38,7 @@ def create_tensor(key: str, storage: StorageProvider, meta: dict):
     Args:
         key (str): Key for where the chunks, index_map, and meta will be located in `storage` relative to its root.
         storage (StorageProvider): StorageProvider that all tensor data is written to.
-        meta (dict): Meta for the tensor. Required properties:
-            chunk_size (int): Desired length of chunks.
-            dtype (str): Datatype for each sample.
+        meta (dict): Meta for the tensor. For required properties, see `default_tensor_meta`.
 
     Raises:
         TensorAlreadyExistsError: If a tensor defined with `key` already exists.
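
The split above means a tensor's meta now starts shape- and
dtype-agnostic and is completed on the first write. A sketch of that
lifecycle using the two helpers from this diff (printed values assume
the defaults in hub.constants):

    import numpy as np
    from hub.core.meta.tensor_meta import (
        default_tensor_meta,
        update_tensor_meta_with_array,
    )

    meta = default_tensor_meta()
    # {"chunk_size": DEFAULT_CHUNK_SIZE, "dtype": "float64", "length": 0}

    batch = np.zeros((10, 28, 28))
    meta = update_tensor_meta_with_array(meta, batch, batched=True)
    # dtype is re-derived from the array; min_shape == max_shape == (28, 28)
    # until dynamic shapes are supported (see the TODO in hub/api/tensor.py)
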
@@ -48,8 +47,6 @@ def create_tensor(key: str, storage: StorageProvider, meta: dict): if tensor_exists(key, storage): raise TensorAlreadyExistsError(key) - meta.update({"length": 0}) - validate_tensor_meta(meta) write_tensor_meta(key, storage, meta) @@ -77,15 +74,17 @@ def add_samples_to_tensor( TensorDoesNotExistError: If a tensor at `key` does not exist. A tensor must be created first using `create_tensor(...)`. """ - # TODO: split into `append` and `extend` - - array = normalize_and_batchify_shape(array, batched=batched) - if not tensor_exists(key, storage): raise TensorDoesNotExistError(key) index_map = read_index_map(key, storage) tensor_meta = read_tensor_meta(key, storage) + + array = normalize_and_batchify_shape(array, batched=batched) + + if "min_shape" not in tensor_meta: + tensor_meta = update_tensor_meta_with_array(tensor_meta, array, batched=True) + _check_array_and_tensor_are_compatible(tensor_meta, array) # TODO: get the tobytes function from meta @@ -137,10 +136,12 @@ def read_samples_from_tensor( array = sample_from_index_entry(key, storage, index_entry, meta["dtype"]) samples.append(array) + array = np.array(samples) + if isinstance(index.item, int): - samples = samples[0] + array = array.squeeze(axis=0) - return np.array(samples) + return array def _check_array_and_tensor_are_compatible(tensor_meta: dict, array: np.ndarray): diff --git a/hub/core/tests/common.py b/hub/core/tests/common.py index 169f1bb7ad..7293d99055 100644 --- a/hub/core/tests/common.py +++ b/hub/core/tests/common.py @@ -11,7 +11,7 @@ tensor_exists, read_samples_from_tensor, ) -from hub.core.meta.tensor_meta import read_tensor_meta, tensor_meta_from_array +from hub.core.meta.tensor_meta import read_tensor_meta, default_tensor_meta from hub.core.meta.index_map import read_index_map from hub.core.typing import StorageProvider @@ -139,7 +139,7 @@ def run_engine_test( key = TENSOR_KEY sample_count = 0 - create_tensor(key, storage, tensor_meta_from_array(arrays[0], batched, chunk_size)) + create_tensor(key, storage, default_tensor_meta(chunk_size)) for i, a_in in enumerate(arrays): add_samples_to_tensor( @@ -180,7 +180,8 @@ def run_engine_test( def benchmark_write( key, arrays, chunk_size, storage, batched, clear_memory_after_write=True ): - create_tensor(key, storage, tensor_meta_from_array(arrays[0], batched, chunk_size)) + + create_tensor(key, storage, default_tensor_meta(chunk_size)) for a_in in arrays: add_samples_to_tensor( diff --git a/hub/core/tests/test_tensor_failures.py b/hub/core/tests/test_tensor_failures.py index ca8e000682..4dc229b925 100644 --- a/hub/core/tests/test_tensor_failures.py +++ b/hub/core/tests/test_tensor_failures.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from hub.core.meta.tensor_meta import tensor_meta_from_array +from hub.core.meta.tensor_meta import default_tensor_meta from hub.core.tensor import add_samples_to_tensor, create_tensor from hub.tests.common import TENSOR_KEY @@ -17,7 +17,7 @@ def test_dtype_mismatch(memory_storage): a1 = np.array([1, 2, 3, 5.3], dtype=float) a2 = np.array([0, 1, 1, 0], dtype=bool) - create_tensor(TENSOR_KEY, memory_storage, tensor_meta_from_array(a1, batched=False)) + create_tensor(TENSOR_KEY, memory_storage, default_tensor_meta()) add_samples_to_tensor(a1, TENSOR_KEY, memory_storage, batched=False) add_samples_to_tensor(a2, TENSOR_KEY, memory_storage, batched=False) @@ -26,7 +26,7 @@ def test_dtype_mismatch(memory_storage): def test_shape_length_mismatch(memory_storage): a1 = np.arange(100).reshape(5, 20) a2 = np.arange(200).reshape(5, 
20, 2) - create_tensor(TENSOR_KEY, memory_storage, tensor_meta_from_array(a1, batched=False)) + create_tensor(TENSOR_KEY, memory_storage, default_tensor_meta()) add_samples_to_tensor(a1, TENSOR_KEY, memory_storage, batched=False) add_samples_to_tensor(a2, TENSOR_KEY, memory_storage, batched=False) @@ -40,8 +40,8 @@ def test_tensor_does_not_exist(memory_storage): @pytest.mark.xfail(raises=TensorAlreadyExistsError, strict=True) def test_tensor_already_exists(memory_storage): a1 = np.arange(10) - create_tensor(TENSOR_KEY, memory_storage, tensor_meta_from_array(a1, batched=False)) - create_tensor(TENSOR_KEY, memory_storage, tensor_meta_from_array(a1, batched=False)) + create_tensor(TENSOR_KEY, memory_storage, default_tensor_meta()) + create_tensor(TENSOR_KEY, memory_storage, default_tensor_meta()) @pytest.mark.xfail(raises=TensorMetaInvalidValue, strict=True) From c2a615c34ad26f386c66a8e4943e3d3aa4f021fd Mon Sep 17 00:00:00 2001 From: benchislett Date: Thu, 3 Jun 2021 23:08:55 -0400 Subject: [PATCH 2/8] Add docstring for Dataset.create_tensor --- hub/api/dataset.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 5b2c039039..665b2e1a74 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -82,8 +82,22 @@ def __getitem__(self, item: Union[str, int, slice, Index]): raise InvalidKeyTypeError(item) def create_tensor( - self, name, chunk_size: int = DEFAULT_CHUNK_SIZE, dtype: str = "float64" + self, name: str, chunk_size: int = DEFAULT_CHUNK_SIZE, dtype: str = "float64" ): + """Create a new tensor in this dataset. + + Args: + name (str): The name of the tensor to be created. + chunk_size (int): The target size for chunks in this tensor. + dtype (str): The dtype to use for this tensor. + Will be overwritten when the first sample is added. + + Returns: + The new tensor, which can also be accessed by `self[name]`. + + Raises: + TensorAlreadyExistsError: Duplicate tensors are not allowed. + """ if tensor_exists(name, self.provider): raise TensorAlreadyExistsError(name) From 5377ffce6bc75fc9b8b6de6436caa43049ad83d5 Mon Sep 17 00:00:00 2001 From: benchislett Date: Fri, 4 Jun 2021 11:17:17 -0400 Subject: [PATCH 3/8] Update docstring for from_path --- hub/api/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 665b2e1a74..fa408dcb91 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -127,10 +127,10 @@ def meta(self, new_meta: dict): @staticmethod def from_path(path: str): - """Create a local hub dataset from unstructured data. + """Create a hub dataset from unstructured data. Note: - This copies the data locally in hub format. + This copies the data into hub format. Be careful when using this with large datasets. Args: From 852baa9e7b92de73efdf885fc705a5da96c65be6 Mon Sep 17 00:00:00 2001 From: benchislett Date: Fri, 4 Jun 2021 11:29:59 -0400 Subject: [PATCH 4/8] Use descriptive-style docstrings --- hub/api/dataset.py | 6 +++--- hub/api/tensor.py | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index fa408dcb91..15eaab5402 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -32,7 +32,7 @@ def __init__( provider: Optional[StorageProvider] = None, index: Union[int, slice, Index] = None, ): - """Initialize a new or existing dataset. + """Initializes a new or existing dataset. Args: path (str): The location of the dataset. Used to initialize the storage provider. 
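
As documented in patch 2, `create_tensor` at this point takes
`chunk_size` and `dtype` directly. A hypothetical call overriding both
(the tensor name and values are illustrative only):

    from hub.constants import DEFAULT_CHUNK_SIZE

    ds.create_tensor("label", chunk_size=DEFAULT_CHUNK_SIZE, dtype="int64")
    # dtype is provisional here: per the docstring it is overwritten as
    # soon as the first sample is added

One design note: `Dataset.meta` is not cached. Every get re-reads the
meta from the storage provider and every set writes it back, which is
why `create_tensor` copies the dict, appends the new tensor name, and
reassigns it rather than mutating `self.meta` in place.
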
@@ -84,7 +84,7 @@ def __getitem__(self, item: Union[str, int, slice, Index]): def create_tensor( self, name: str, chunk_size: int = DEFAULT_CHUNK_SIZE, dtype: str = "float64" ): - """Create a new tensor in this dataset. + """Creates a new tensor in a dataset. Args: name (str): The name of the tensor to be created. @@ -127,7 +127,7 @@ def meta(self, new_meta: dict): @staticmethod def from_path(path: str): - """Create a hub dataset from unstructured data. + """Creates a hub dataset from unstructured data. Note: This copies the data into hub format. diff --git a/hub/api/tensor.py b/hub/api/tensor.py index 3cd8309091..74f1348fbe 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -25,7 +25,7 @@ def __init__( tensor_meta: dict = None, index: Union[int, slice, Index] = None, ): - """Initialize a new tensor. + """Initializes a new tensor. Note: This operation does not create a new tensor in the storage provider, @@ -60,7 +60,7 @@ def __init__( create_tensor(self.key, self.provider, tensor_meta) def extend(self, array: np.ndarray): - """Extend tensor by appending elements from a batched numpy array. + """Extends a tensor by appending elements from a batched numpy array. Example: >>> len(image) @@ -76,7 +76,7 @@ def extend(self, array: np.ndarray): add_samples_to_tensor(array, self.key, storage=self.provider, batched=True) def append(self, array: np.ndarray): - """Append a sample to the end of the tensor. + """Appends a sample to the end of a tensor. Example: >>> len(image) @@ -104,7 +104,7 @@ def shape(self): return self.meta["max_shape"] def __len__(self): - """Return the length of the primary axis.""" + """Returns the length of the primary axis of a tensor.""" return self.meta["length"] def __getitem__(self, item: Union[int, slice, Index]): @@ -118,7 +118,7 @@ def __iter__(self): yield self[i] def numpy(self): - """Compute the contents of this tensor in numpy format. + """Computes the contents of a tensor in numpy format. Returns: A numpy array containing the data represented by this tensor. From db24b05e9071cb5cbfc12cd7697a94c127adcf24 Mon Sep 17 00:00:00 2001 From: benchislett Date: Fri, 4 Jun 2021 12:26:25 -0400 Subject: [PATCH 5/8] Support iterables in Tensor.extend --- hub/api/tensor.py | 17 +++++++++++------ hub/api/tests/test_api.py | 3 +++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/hub/api/tensor.py b/hub/api/tensor.py index 74f1348fbe..b27d550e91 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -1,4 +1,4 @@ -from typing import Union +from typing import Union, Iterable import warnings import numpy as np @@ -59,8 +59,9 @@ def __init__( create_tensor(self.key, self.provider, tensor_meta) - def extend(self, array: np.ndarray): - """Extends a tensor by appending elements from a batched numpy array. + def extend(self, array: Union[np.ndarray, Iterable[np.ndarray]]): + """Extends a tensor by appending multiple elements from an iterable. + Accepts an iterable of numpy arrays or a single batched numpy array. Example: >>> len(image) @@ -70,10 +71,14 @@ def extend(self, array: np.ndarray): 100 Args: - array (np.ndarray): The data to add to the tensor. - The primary axis should be the number of samples to add. + array: The data to add to the tensor. + The length should be equal to the number of samples to add. 
""" - add_samples_to_tensor(array, self.key, storage=self.provider, batched=True) + if isinstance(array, np.ndarray): + add_samples_to_tensor(array, self.key, storage=self.provider, batched=True) + else: + for sample in array: + self.append(sample) def append(self, array: np.ndarray): """Appends a sample to the end of a tensor. diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index fbf4a0af9b..7b20e4f0e9 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -35,6 +35,9 @@ def test_populate_dataset(ds): ds.image.append(np.ones((28, 28))) assert len(ds.image) == 5 + ds.image.extend([np.ones((28, 28)), np.ones((28, 28))]) + assert len(ds.image) == 7 + assert ds.meta == {"tensors": ["image"]} From 146e82fd685afa1d36e6cd31a470b34337ee7e24 Mon Sep 17 00:00:00 2001 From: benchislett Date: Fri, 4 Jun 2021 12:51:53 -0400 Subject: [PATCH 6/8] Update parameters to Dataset.create_tensor --- hub/api/dataset.py | 16 ++++++++--- hub/api/tests/test_api.py | 7 ++--- hub/constants.py | 2 ++ .../tests/test_benchmark_chunk_engine.py | 2 +- hub/core/meta/tensor_meta.py | 27 ++++++++++++++++--- hub/core/tests/common.py | 4 +-- 6 files changed, 44 insertions(+), 14 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 15eaab5402..fbd2c226e5 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -82,15 +82,23 @@ def __getitem__(self, item: Union[str, int, slice, Index]): raise InvalidKeyTypeError(item) def create_tensor( - self, name: str, chunk_size: int = DEFAULT_CHUNK_SIZE, dtype: str = "float64" + self, + name: str, + htype: Optional[str] = None, + chunk_size: Optional[int] = None, + dtype: Optional[str] = None, + extra_meta: Optional[dict] = None, ): """Creates a new tensor in a dataset. Args: name (str): The name of the tensor to be created. - chunk_size (int): The target size for chunks in this tensor. - dtype (str): The dtype to use for this tensor. + htype (str, optional): The type of the data for the tensor. + May also modify the defaults for other parameters. + chunk_size (int, optional): The target size for chunks in this tensor. + dtype (str, optional): The data type to use for this tensor. Will be overwritten when the first sample is added. + extra_meta (dict, optional): Any additional metadata to be added to the tensor. Returns: The new tensor, which can also be accessed by `self[name]`. 
@@ -105,7 +113,7 @@ def create_tensor( ds_meta["tensors"].append(name) self.meta = ds_meta - tensor_meta = default_tensor_meta(chunk_size, dtype) + tensor_meta = default_tensor_meta(htype, chunk_size, dtype, extra_meta) tensor = Tensor(name, self.provider, tensor_meta=tensor_meta) self.tensors[name] = tensor diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 7b20e4f0e9..d10881500f 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -32,11 +32,12 @@ def test_populate_dataset(ds): assert len(ds) == 4 assert len(ds.image) == 4 - ds.image.append(np.ones((28, 28))) - assert len(ds.image) == 5 + for _ in range(10): + ds.image.append(np.ones((28, 28))) + assert len(ds.image) == 14 ds.image.extend([np.ones((28, 28)), np.ones((28, 28))]) - assert len(ds.image) == 7 + assert len(ds.image) == 16 assert ds.meta == {"tensors": ["image"]} diff --git a/hub/constants.py b/hub/constants.py index 64f318d614..902480d327 100644 --- a/hub/constants.py +++ b/hub/constants.py @@ -6,6 +6,8 @@ MB = 1000 * KB GB = 1000 * MB +DEFAULT_DTYPE = "float64" + DEFAULT_CHUNK_SIZE = 16 * MB MIN_FIRST_CACHE_SIZE = 32 * MB MIN_SECOND_CACHE_SIZE = 160 * MB diff --git a/hub/core/chunk_engine/tests/test_benchmark_chunk_engine.py b/hub/core/chunk_engine/tests/test_benchmark_chunk_engine.py index 6f18277d08..8354c34789 100644 --- a/hub/core/chunk_engine/tests/test_benchmark_chunk_engine.py +++ b/hub/core/chunk_engine/tests/test_benchmark_chunk_engine.py @@ -24,7 +24,7 @@ def single_benchmark_write(info, key, arrays, chunk_size, storage, batched): actual_key = "%s_%i" % (key, info["iteration"]) - create_tensor(actual_key, storage, default_tensor_meta(chunk_size)) + create_tensor(actual_key, storage, default_tensor_meta(chunk_size=chunk_size)) for a_in in arrays: add_samples_to_tensor( diff --git a/hub/core/meta/tensor_meta.py b/hub/core/meta/tensor_meta.py index a2868be607..694fff2369 100644 --- a/hub/core/meta/tensor_meta.py +++ b/hub/core/meta/tensor_meta.py @@ -1,10 +1,10 @@ from hub.util.exceptions import TensorMetaInvalidValue, TensorMetaMissingKey import numpy as np import pickle # TODO: NEVER USE PICKLE -from typing import Any, Callable +from typing import Any, Callable, Optional from hub.core.typing import StorageProvider -from hub.constants import DEFAULT_CHUNK_SIZE +from hub.constants import DEFAULT_CHUNK_SIZE, DEFAULT_DTYPE from hub.util.keys import get_tensor_meta_key from hub.util.array import normalize_and_batchify_shape @@ -17,8 +17,27 @@ def read_tensor_meta(key: str, storage: StorageProvider) -> dict: return pickle.loads(storage[get_tensor_meta_key(key)]) -def default_tensor_meta(chunk_size: int = DEFAULT_CHUNK_SIZE, dtype: str = "float64"): - return {"chunk_size": chunk_size, "dtype": dtype, "length": 0} +def default_tensor_meta( + htype: Optional[str] = None, + chunk_size: Optional[int] = None, + dtype: Optional[str] = None, + extra_meta: Optional[dict] = None, +): + if chunk_size is None: + chunk_size = DEFAULT_CHUNK_SIZE + if dtype is None: + dtype = DEFAULT_DTYPE + if extra_meta is None: + extra_meta = {} + + tensor_meta = extra_meta + tensor_meta["chunk_size"] = chunk_size + tensor_meta["dtype"] = dtype + tensor_meta["length"] = 0 + if htype is not None: + tensor_meta["htype"] = htype # TODO: identify presets + + return tensor_meta def update_tensor_meta_with_array( diff --git a/hub/core/tests/common.py b/hub/core/tests/common.py index 7293d99055..039e2ac663 100644 --- a/hub/core/tests/common.py +++ b/hub/core/tests/common.py @@ -139,7 +139,7 @@ def run_engine_test( 
key = TENSOR_KEY sample_count = 0 - create_tensor(key, storage, default_tensor_meta(chunk_size)) + create_tensor(key, storage, default_tensor_meta(chunk_size=chunk_size)) for i, a_in in enumerate(arrays): add_samples_to_tensor( @@ -181,7 +181,7 @@ def benchmark_write( key, arrays, chunk_size, storage, batched, clear_memory_after_write=True ): - create_tensor(key, storage, default_tensor_meta(chunk_size)) + create_tensor(key, storage, default_tensor_meta(chunk_size=chunk_size)) for a_in in arrays: add_samples_to_tensor( From a9dca38111aebc4f99549e422c3a12d804c52a19 Mon Sep 17 00:00:00 2001 From: Benjamin Chislett Date: Fri, 4 Jun 2021 14:29:22 -0400 Subject: [PATCH 7/8] Update Dataset.create_tensor docstring Better explanation of htype --- hub/api/dataset.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 18efa8f2c1..2b85da0c25 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -102,7 +102,10 @@ def create_tensor( Args: name (str): The name of the tensor to be created. - htype (str, optional): The type of the data for the tensor. + htype (str, optional): The class of data for the tensor. + The defaults for other parameters are determined in terms of this value. + For example, `htype="image"` would have `dtype` default to `uint8`. + These defaults can be overridden by explicitly passing any of the other parameters to this function. May also modify the defaults for other parameters. chunk_size (int, optional): The target size for chunks in this tensor. dtype (str, optional): The data type to use for this tensor. From 0e175383ff31505debe705f9fa52139a681bc3af Mon Sep 17 00:00:00 2001 From: benchislett Date: Fri, 4 Jun 2021 14:30:03 -0400 Subject: [PATCH 8/8] Remove artifact of merge conflict from docstring --- hub/api/dataset.py | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 2b85da0c25..4e46ebe004 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -100,24 +100,23 @@ def create_tensor( ): """Creates a new tensor in a dataset. - Args: - name (str): The name of the tensor to be created. - htype (str, optional): The class of data for the tensor. - The defaults for other parameters are determined in terms of this value. - For example, `htype="image"` would have `dtype` default to `uint8`. - These defaults can be overridden by explicitly passing any of the other parameters to this function. - May also modify the defaults for other parameters. - chunk_size (int, optional): The target size for chunks in this tensor. - dtype (str, optional): The data type to use for this tensor. - Will be overwritten when the first sample is added. - extra_meta (dict, optional): Any additional metadata to be added to the tensor. - - <<<<<<< HEAD - Returns: - The new tensor, which can also be accessed by `self[name]`. - - Raises: - TensorAlreadyExistsError: Duplicate tensors are not allowed. + Args: + name (str): The name of the tensor to be created. + htype (str, optional): The class of data for the tensor. + The defaults for other parameters are determined in terms of this value. + For example, `htype="image"` would have `dtype` default to `uint8`. + These defaults can be overridden by explicitly passing any of the other parameters to this function. + May also modify the defaults for other parameters. + chunk_size (int, optional): The target size for chunks in this tensor. 
+ dtype (str, optional): The data type to use for this tensor. + Will be overwritten when the first sample is added. + extra_meta (dict, optional): Any additional metadata to be added to the tensor. + + Returns: + The new tensor, which can also be accessed by `self[name]`. + + Raises: + TensorAlreadyExistsError: Duplicate tensors are not allowed. """ if tensor_exists(name, self.storage): raise TensorAlreadyExistsError(name)
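
The series leaves `create_tensor` with the signature
`create_tensor(name, htype=None, chunk_size=None, dtype=None, extra_meta=None)`.
A hypothetical call exercising all of it; note that htype presets are an
explicit TODO in `default_tensor_meta` ("# TODO: identify presets"), so
the uint8 default the docstring describes is not wired up yet and every
value below is an assumption:

    ds.create_tensor(
        "image",
        htype="image",                  # stored in the meta; presets TODO
        chunk_size=16 * 10**6,          # ~16 MB, same as DEFAULT_CHUNK_SIZE
        dtype="uint8",                  # what htype="image" would imply
        extra_meta={"license": "CC0"},  # extra keys merge into the meta dict
    )
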