From be6ce00bee8c5cbad1e6bc080e76d7ee537366ab Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Wed, 14 Jul 2021 14:45:00 -0700
Subject: [PATCH 01/33] add test for subsequent meta updates (failing)

---
 hub/api/tests/test_meta.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 hub/api/tests/test_meta.py

diff --git a/hub/api/tests/test_meta.py b/hub/api/tests/test_meta.py
new file mode 100644
index 0000000000..2271dc73eb
--- /dev/null
+++ b/hub/api/tests/test_meta.py
@@ -0,0 +1,29 @@
+import numpy as np
+import hub
+
+
+def test_version(local_ds_generator):
+    ds = local_ds_generator()
+    assert ds.meta.version == hub.__version__
+
+    # persistence
+    ds = local_ds_generator()
+    assert ds.meta.version == hub.__version__
+
+
+def test_subsequent_updates(local_ds_generator):
+    with local_ds_generator() as ds:
+        ds.create_tensor("tensor")
+
+    with local_ds_generator() as ds:
+        ds.tensor.extend(np.ones((5, 100, 100)))
+
+    ds = local_ds_generator()
+    assert len(ds) == 5
+
+    with local_ds_generator() as ds:
+        for _ in range(5):
+            ds.tensor.append(np.ones((100, 100)))
+
+    ds = local_ds_generator()
+    assert len(ds) == 10

From d11920fff77cd0983606e32e9ba6a491cd48eb86 Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Thu, 15 Jul 2021 13:17:08 -0700
Subject: [PATCH 02/33] synchronize everything with cache

---
 hub/api/tensor.py          |  2 +-
 hub/api/tests/test_meta.py | 20 +++++++++++++++++++-
 hub/core/chunk_engine.py   | 20 ++++++++++++++++++--
 3 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/hub/api/tensor.py b/hub/api/tensor.py
index 7de61a8372..a49f105096 100644
--- a/hub/api/tensor.py
+++ b/hub/api/tensor.py
@@ -178,7 +178,7 @@ def __len__(self):
             int: The current length of this tensor.
         """
 
-        return self.index.length(self.chunk_engine.num_samples)
+        return self.index.length(self.meta.length)
 
     def __getitem__(
         self,
diff --git a/hub/api/tests/test_meta.py b/hub/api/tests/test_meta.py
index 2271dc73eb..194a9d0a96 100644
--- a/hub/api/tests/test_meta.py
+++ b/hub/api/tests/test_meta.py
@@ -1,3 +1,4 @@
+from hub.api.tests.test_api import MAX_FLOAT_DTYPE
 import numpy as np
 import hub
 
@@ -13,11 +14,16 @@ def test_version(local_ds_generator):
 
 def test_subsequent_updates(local_ds_generator):
     with local_ds_generator() as ds:
-        ds.create_tensor("tensor")
+        ds.create_tensor("tensor")  # dtype is not specified
+
+        assert ds.tensor.dtype == None
 
     with local_ds_generator() as ds:
         ds.tensor.extend(np.ones((5, 100, 100)))
 
+        # dtype is auto-specified
+        assert ds.tensor.dtype == MAX_FLOAT_DTYPE
+
     ds = local_ds_generator()
     assert len(ds) == 5
 
@@ -27,3 +33,15 @@ def test_subsequent_updates(local_ds_generator):
 
     ds = local_ds_generator()
     assert len(ds) == 10
+    assert ds.tensor.shape == (10, 100, 100)
+
+    ds = local_ds_generator()
+
+    with local_ds_generator() as ds:
+        for _ in range(5):
+            ds.tensor.append(np.ones((100, 200)))
+
+    assert ds.tensor.shape == (15, 100, None)
+    si = ds.tensor.shape_interval
+    assert si.lower == (15, 100, 100)
+    assert si.upper == (15, 100, 200)
diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py
index 11fe63e259..03146c293c 100644
--- a/hub/core/chunk_engine.py
+++ b/hub/core/chunk_engine.py
@@ -202,10 +202,26 @@ def _append_bytes(self, buffer: memoryview, shape: Tuple[int], dtype: np.dtype):
             self._append_to_new_chunk(buffer, shape)
 
         self.chunk_id_encoder.register_samples_to_last_chunk_id(num_samples)
 
+        self._synchronize_cache()
+
+    def _synchronize_cache(self):
+        # TODO: docstring
         # TODO implement tests for cache size compute
-        if self.last_chunk is not None:
-            self.cache[self.last_chunk_key] = self.last_chunk
+
+        # TODO: optimize this by storing all of these keys in the chunk engine's state
+
+        # synchronize last chunk
+        last_chunk_key = self.last_chunk_key
+        last_chunk = self.last_chunk
+        self.cache.update_used_cache_for_path(last_chunk_key, len(last_chunk))  # type: ignore
+
+        # synchronize tensor meta
+        tensor_meta_key = get_tensor_meta_key(self.key)
+        self.cache[tensor_meta_key] = self.tensor_meta
+
+        # synchronize chunk ID encoder
+        chunk_id_key = get_chunk_id_encoder_key(self.key)
+        self.cache[chunk_id_key] = self.chunk_id_encoder
 
     def _try_appending_to_last_chunk(
         self, buffer: memoryview, shape: Tuple[int]

From bfc97f338812f5cfcb2ef4483acc57e3aa9b1eea Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Thu, 15 Jul 2021 13:18:40 -0700
Subject: [PATCH 03/33] add docstring

---
 hub/core/chunk_engine.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py
index 03146c293c..89908335b8 100644
--- a/hub/core/chunk_engine.py
+++ b/hub/core/chunk_engine.py
@@ -205,10 +205,10 @@ def _append_bytes(self, buffer: memoryview, shape: Tuple[int], dtype: np.dtype):
         self._synchronize_cache()
 
     def _synchronize_cache(self):
-        # TODO: docstring
-        # TODO implement tests for cache size compute
+        """Synchronizes cachables with the cache. Includes: the last chunk, tensor meta, and chunk ID encoder."""
 
-        # TODO: optimize this by storing all of these keys in the chunk engine's state
+        # TODO implement tests for cache size compute
+        # TODO: optimize this by storing all of these keys in the chunk engine's state (posixpath.joins are pretty slow)
 
         # synchronize last chunk
         last_chunk_key = self.last_chunk_key
         last_chunk = self.last_chunk
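A note on the two chunk-engine patches above: `_append_bytes` now ends every append by pushing the tensor meta, the chunk ID encoder, and the last chunk through the LRU cache. A minimal sketch of the behavior `test_subsequent_updates` locks in — the local path is illustrative, and the tests' `local_ds_generator` fixture plays the same role:

    import numpy as np
    import hub

    ds = hub.Dataset("./sync_demo")  # any local path
    ds.create_tensor("tensor")
    ds.tensor.append(np.ones((100, 100)))  # _append_bytes -> _synchronize_cache
    ds.flush()  # dirty cachables are written through to storage

    ds = hub.Dataset("./sync_demo")  # reload from storage
    assert len(ds.tensor) == 1  # __len__ now reads the tensor meta length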
From 40b902dd8f81e4c7b4357ea9f2b87a2e05335206 Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Thu, 15 Jul 2021 13:51:41 -0700
Subject: [PATCH 04/33] check that tensor meta length & chunk id encoder
 num_samples are the same

---
 hub/api/tensor.py        | 10 ++++++++--
 hub/core/chunk_engine.py | 20 ++++++++++++++++++++
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/hub/api/tensor.py b/hub/api/tensor.py
index a49f105096..feaa3daea0 100644
--- a/hub/api/tensor.py
+++ b/hub/api/tensor.py
@@ -1,4 +1,4 @@
-from hub.util.keys import tensor_exists
+from hub.util.keys import get_chunk_id_encoder_key, get_tensor_meta_key, tensor_exists
 from hub.core.sample import Sample  # type: ignore
 from typing import List, Sequence, Union, Optional, Tuple, Dict
 from hub.util.shape import ShapeInterval
@@ -7,7 +7,10 @@
 from hub.core.chunk_engine import ChunkEngine, SampleValue
 from hub.core.storage import LRUCache
 
-from hub.util.exceptions import TensorDoesNotExistError, InvalidKeyTypeError
+from hub.util.exceptions import (
+    TensorDoesNotExistError,
+    InvalidKeyTypeError,
+)
 from hub.core.index import Index
 
@@ -178,6 +181,9 @@ def __len__(self):
             int: The current length of this tensor.
         """
 
+        # catch corrupted datasets / user tampering ASAP
+        self.chunk_engine.validate_num_samples_is_synchronized()
+
         return self.index.length(self.meta.length)
 
     def __getitem__(
diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py
index 89908335b8..10cacfefe9 100644
--- a/hub/core/chunk_engine.py
+++ b/hub/core/chunk_engine.py
@@ -438,6 +438,26 @@ def get_chunk_names(
             sample_index += 1
         return chunk_names
 
+    def validate_num_samples_is_synchronized(self):
+        """Check if tensor meta length and chunk ID encoder represent the same number of samples.
+        Helpful for determining if a user has tampered with the tensor meta or the chunk ID encoder, or if
+        the tensor was corrupted.
+
+        Raises:
+            CorruptedMetaError: tensor_meta and chunk_id_encoder must have the same num samples.
+        """
+
+        tensor_meta_length = self.tensor_meta.length
+
+        # compare chunk ID encoder and tensor meta
+        chunk_id_num_samples = self.chunk_id_encoder.num_samples
+        if tensor_meta_length != chunk_id_num_samples:
+            tkey = get_tensor_meta_key(self.key)
+            ikey = get_chunk_id_encoder_key(self.key)
+            raise CorruptedMetaError(
+                f"'{tkey}' and '{ikey}' have a record of different numbers of samples. Got {tensor_meta_length} and {chunk_id_num_samples} respectively."
+            )
+
 
 def _format_samples(
     samples: Sequence[np.array], index: Index, aslist: bool

From 161d9e25ae091556f237509e51ab06b3dd399fda Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Thu, 15 Jul 2021 15:48:34 -0700
Subject: [PATCH 05/33] write tests for user-defined `info`

---
 hub/api/tests/test_info.py | 78 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100644 hub/api/tests/test_info.py

diff --git a/hub/api/tests/test_info.py b/hub/api/tests/test_info.py
new file mode 100644
index 0000000000..105d021e5e
--- /dev/null
+++ b/hub/api/tests/test_info.py
@@ -0,0 +1,78 @@
+def test_dataset(local_ds_generator):
+    ds = local_ds_generator()
+
+    assert len(ds.info) == 0
+
+    ds.info.update(my_key=0)
+    ds.info.update(my_key=1)
+
+    ds.info.update(another_key="hi")
+    ds.info.update({"another_key": "hello"})
+
+    ds.info.update({"something": "aaaaa"}, something="bbbb")
+
+    ds.info.update(test=[1, 2, "5"])
+
+    with ds:
+        ds.info.update({"test2": (1, 5, (1, "2"), [5, 6, (7, 8)])})
+        ds.info.update(xyz="abc")
+
+        ds.info.update({"1_-+", 5})
+
+    ds = local_ds_generator()
+
+    assert len(ds.info) == 7
+
+    assert ds.info.another_key == "hello"
+    assert ds.info.something == "bbbb"
+    assert ds.info.test == (1, 2, "5"), "needs to convert to tuple"
+    assert ds.info.test2 == (1, 5, (1, "2"), (5, 6, (7, 8)))
+    assert ds.info.xyz == "abc"
+    assert ds.info["1_-+"] == 5  # key can't be accessed with `.` syntax
+
+    ds.info.update(test=[99])
+
+    ds = local_ds_generator()
+
+    assert len(ds.info) == 8
+    assert ds.info.test == (99,)
+
+
+def test_tensor(local_ds_generator):
+    ds = local_ds_generator()
+
+    t1 = ds.create_tensor("tensor1")
+    t2 = ds.create_tensor("tensor2")
+
+    assert len(t1.info) == 0
+    assert len(t2.info) == 0
+
+    t1.info.update(key=0)
+    t2.info.update(key=1, key1=0)
+
+    ds = local_ds_generator()
+
+    t1 = ds.tensor1
+    t2 = ds.tensor2
+
+    assert len(t1.info) == 1
+    assert len(t2.info) == 2
+
+    assert t1.info.key == 0
+    assert t2.info.key == 1
+    assert t2.info.key1 == 0
+
+    with ds:
+        t1.info.update(key=99)
+
+    ds = local_ds_generator()
+
+    t1 = ds.tensor1
+    t2 = ds.tensor2
+
+    assert len(t1.info) == 2
+    assert len(t2.info) == 2
+
+    assert t1.info.key == 99
+
+

From 60f917143fbc36be6f723caa71dadff8b49be5e9 Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Thu, 15 Jul 2021 15:56:55 -0700
Subject: [PATCH 06/33] add info class + update docstrings

---
 hub/core/meta/info.py | 13 +++++++++++++
 hub/core/meta/meta.py |  4 ++++
 2 files changed, 17 insertions(+)
 create mode 100644 hub/core/meta/info.py

diff --git a/hub/core/meta/info.py b/hub/core/meta/info.py
new file mode 100644
index 0000000000..a005e404bf
--- /dev/null
+++ b/hub/core/meta/info.py
@@ -0,0 +1,13 @@
+from hub.core.storage.cachable import Cachable
+
+
+class Info(Cachable):
+    def __init__(self):
+        """Contains **optional** key/values that datasets/tensors use for human-readability.
+        See the `Meta` class for required key/values for datasets/tensors.
+        """
+
+        pass
+
+    def as_dict(self) -> dict:
+        raise NotImplementedError
diff --git a/hub/core/meta/meta.py b/hub/core/meta/meta.py
index 7aa553b955..bc9bfa55ff 100644
--- a/hub/core/meta/meta.py
+++ b/hub/core/meta/meta.py
@@ -4,6 +4,10 @@
 
 class Meta(Cachable):
     def __init__(self):
+        """Contains **required** key/values that datasets/tensors use to function.
+        See the `Info` class for optional key/values for datasets/tensors.
+        """
+
         self.version = hub.__version__
 
     def as_dict(self) -> dict:

From 43bc04f04191cf2501159c74e7c8e4e83cf85c6a Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Thu, 15 Jul 2021 17:04:20 -0700
Subject: [PATCH 07/33] implement `TensorMeta.as_dict`

---
 hub/core/meta/dataset_meta.py |  3 ---
 hub/core/meta/tensor_meta.py  | 12 ++++++++++--
 hub/core/storage/cachable.py  |  2 +-
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/hub/core/meta/dataset_meta.py b/hub/core/meta/dataset_meta.py
index 7c7ebc7172..94438e6d7e 100644
--- a/hub/core/meta/dataset_meta.py
+++ b/hub/core/meta/dataset_meta.py
@@ -1,7 +1,4 @@
-from typing import Dict, List
-from hub.core.storage.provider import StorageProvider
 from hub.core.meta.meta import Meta
-from hub.util.keys import get_dataset_meta_key
 
 
 class DatasetMeta(Meta):
diff --git a/hub/core/meta/tensor_meta.py b/hub/core/meta/tensor_meta.py
index 80563bf5fb..a60f13b3da 100644
--- a/hub/core/meta/tensor_meta.py
+++ b/hub/core/meta/tensor_meta.py
@@ -51,7 +51,11 @@ def __init__(
 
             required_meta = _required_meta_from_htype(htype)
             required_meta.update(kwargs)
+
+            self._required_meta_keys = tuple(required_meta.keys())
             self.__dict__.update(required_meta)
+        else:
+            self._required_meta_keys = tuple()
 
         super().__init__()
 
@@ -168,8 +172,12 @@ def _update_shape_interval(self, shape: Tuple[int, ...]):
                 self.max_shape[i] = max(dim, self.max_shape[i])
 
     def as_dict(self):
-        # TODO: tensor meta as_dict
-        raise NotImplementedError
+        d = super().as_dict()
+
+        for key in self._required_meta_keys:
+            d[key] = getattr(self, key)
+
+        return d
 
 
 def _required_meta_from_htype(htype: str) -> dict:
diff --git a/hub/core/storage/cachable.py b/hub/core/storage/cachable.py
index 7a3cb120c3..6d7d139b36 100644
--- a/hub/core/storage/cachable.py
+++ b/hub/core/storage/cachable.py
@@ -17,7 +17,7 @@ def __len__(self):
         return len(self.tobytes())
 
     def tobytes(self) -> bytes:
-        return bytes(json.dumps(self.__dict__), "utf-8")
+        return bytes(json.dumps(self.as_dict()), "utf-8")
 
     @classmethod
     def frombuffer(cls, buffer: bytes):

From bfb8876c8ef03eabf068ac8c7e33872fbd3f7eca Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Thu, 15 Jul 2021 17:28:54 -0700
Subject: [PATCH 08/33] replace `len(Cachable)` with `Cachable.num_bytes`

---
 hub/core/chunk.py                |  4 +++-
 hub/core/chunk_engine.py         |  2 +-
 hub/core/meta/dataset_meta.py    |  5 +++++
 hub/core/meta/encode/chunk_id.py |  5 +++++
 hub/core/meta/tensor_meta.py     |  4 ++++
 hub/core/storage/cachable.py     |  6 ++++--
 hub/core/storage/lru_cache.py    | 17 ++++++++++++-----
 7 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/hub/core/chunk.py b/hub/core/chunk.py
index 7aa81db641..3c5238a361 100644
--- a/hub/core/chunk.py
+++ b/hub/core/chunk.py
@@ -108,8 +108,10 @@ def update_headers(self, incoming_num_bytes: int, sample_shape: Tuple[int]):
         self.shapes_encoder.add_shape(sample_shape, 1)
         self.byte_positions_encoder.add_byte_position(num_bytes_per_sample, 1)
 
-    def __len__(self):
+    @property
+    def nbytes(self):
         """Calculates the number of bytes `tobytes` will return without having to call `tobytes`. Used by `LRUCache` to determine if this chunk can be cached."""
+
         return infer_chunk_num_bytes(
             hub.__version__,
             self.shapes_encoder.array,
diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py
index 10cacfefe9..4ef6fad5fd 100644
--- a/hub/core/chunk_engine.py
+++ b/hub/core/chunk_engine.py
@@ -213,7 +213,7 @@ def _synchronize_cache(self):
         # synchronize last chunk
         last_chunk_key = self.last_chunk_key
         last_chunk = self.last_chunk
-        self.cache.update_used_cache_for_path(last_chunk_key, len(last_chunk))  # type: ignore
+        self.cache.update_used_cache_for_path(last_chunk_key, last_chunk.nbytes)  # type: ignore
 
         # synchronize tensor meta
         tensor_meta_key = get_tensor_meta_key(self.key)
diff --git a/hub/core/meta/dataset_meta.py b/hub/core/meta/dataset_meta.py
index 94438e6d7e..f824d896c5 100644
--- a/hub/core/meta/dataset_meta.py
+++ b/hub/core/meta/dataset_meta.py
@@ -7,6 +7,11 @@ def __init__(self):
 
         super().__init__()
 
+    @property
+    def nbytes(self):
+        # TODO: can optimize this
+        return len(self.tobytes())
+
     def as_dict(self) -> dict:
         d = super().as_dict()
         d["tensors"] = self.tensors
diff --git a/hub/core/meta/encode/chunk_id.py b/hub/core/meta/encode/chunk_id.py
index fd6b0ecc41..7b3518979e 100644
--- a/hub/core/meta/encode/chunk_id.py
+++ b/hub/core/meta/encode/chunk_id.py
@@ -72,6 +72,11 @@ def __init__(self):
 
         self._encoded_ids = None
 
+    @property
+    def nbytes(self):
+        # TODO: optimize this
+        return len(self.tobytes())
+
     def tobytes(self) -> memoryview:
         if self._encoded_ids is None:
             return serialize_chunkids(
diff --git a/hub/core/meta/tensor_meta.py b/hub/core/meta/tensor_meta.py
index a60f13b3da..c7016cb014 100644
--- a/hub/core/meta/tensor_meta.py
+++ b/hub/core/meta/tensor_meta.py
@@ -179,6 +179,10 @@ def as_dict(self):
 
         return d
 
+    @property
+    def nbytes(self):
+        return len(self.tobytes())
+
 
 def _required_meta_from_htype(htype: str) -> dict:
     """Gets a dictionary with all required meta information to define a tensor."""
diff --git a/hub/core/storage/cachable.py b/hub/core/storage/cachable.py
index 6d7d139b36..db67e5cf92 100644
--- a/hub/core/storage/cachable.py
+++ b/hub/core/storage/cachable.py
@@ -13,8 +13,10 @@ def __init__(self, buffer: bytes = None):
         if buffer:
             self.frombuffer(buffer)
 
-    def __len__(self):
-        return len(self.tobytes())
+    @property
+    def nbytes(self):
+        # do not implement, each class should do this because it could be very slow if `tobytes` is called
+        raise NotImplementedError
 
     def tobytes(self) -> bytes:
         return bytes(json.dumps(self.as_dict()), "utf-8")
diff --git a/hub/core/storage/lru_cache.py b/hub/core/storage/lru_cache.py
index 36c9a9ba63..7f315115b1 100644
--- a/hub/core/storage/lru_cache.py
+++ b/hub/core/storage/lru_cache.py
@@ -5,6 +5,12 @@
 from hub.core.storage.provider import StorageProvider
 
 
+def _get_nbytes(obj: Union[bytes, memoryview, Cachable]):
+    if isinstance(obj, Cachable):
+        return obj.nbytes
+    return len(obj)
+
+
 # TODO use lock for multiprocessing
 class LRUCache(StorageProvider):
     """LRU Cache that uses StorageProvider for caching"""
@@ -82,7 +88,7 @@ def get_cachable(self, path: str, expected_class):
 
         if isinstance(item, (bytes, memoryview)):
             obj = expected_class.frombuffer(item)
-            if len(obj) <= self.cache_size:
+            if obj.nbytes <= self.cache_size:
                 self._insert_in_cache(path, obj)
             return obj
 
@@ -106,7 +112,8 @@ def __getitem__(self, path: str):
             return self.cache_storage[path]
         else:
             result = self.next_storage[path]  # fetch from storage, may throw KeyError
-            if len(result) <= self.cache_size:  # insert in cache if it fits
+
+            if _get_nbytes(result) <= self.cache_size:  # insert in cache if it fits
                 self._insert_in_cache(path, result)
             return result
 
@@ -125,7 +132,7 @@ def __setitem__(self, path: str, value: Union[bytes, Cachable]):
             size = self.lru_sizes.pop(path)
             self.cache_used -= size
 
-        if len(value) <= self.cache_size:
+        if _get_nbytes(value) <= self.cache_size:
             self._insert_in_cache(path, value)
             self.dirty_keys.add(path)
         else:  # larger than cache, directly send to next layer
@@ -255,10 +262,10 @@ def _insert_in_cache(self, path: str, value: Union[bytes, Cachable]):
             ReadOnlyError: If the provider is in read-only mode.
         """
 
-        self._free_up_space(len(value))
+        self._free_up_space(_get_nbytes(value))
         self.cache_storage[path] = value  # type: ignore
 
-        self.update_used_cache_for_path(path, len(value))
+        self.update_used_cache_for_path(path, _get_nbytes(value))
 
     def _list_keys(self):
         """Helper function that lists all the objects present in the cache and the underlying storage.
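The `nbytes` refactor above lets `LRUCache` budget a `Cachable` without serializing it: `_get_nbytes` prefers `obj.nbytes` for cachables and falls back to `len(obj)` for raw bytes. A rough sketch of the contract, using a hypothetical subclass that is not part of the diff:

    from hub.core.storage.cachable import Cachable

    class ToyMeta(Cachable):  # hypothetical, for illustration only
        def __init__(self):
            self.names = []

        def as_dict(self) -> dict:
            return {"names": self.names}

        @property
        def nbytes(self):
            # cheap estimate instead of len(self.tobytes()), which would JSON-encode
            return 32 + sum(len(n) for n in self.names)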
From 338b94922f8b99551434a83332c97bee45c3cd9c Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Thu, 15 Jul 2021 18:20:26 -0700
Subject: [PATCH 09/33] added callback cachable class hierarchy &
 initialization

---
 hub/api/dataset.py            | 20 ++++++++++-
 hub/api/tests/test_info.py    |  2 --
 hub/constants.py              |  7 ++++
 hub/core/meta/info.py         | 27 +++++++++++++--
 hub/core/meta/tensor_meta.py  |  1 +
 hub/core/storage/lru_cache.py |  5 +++
 hub/util/exceptions.py        |  4 +++
 hub/util/keys.py              | 15 ++++++++-
 hub/util/storage_callback.py  | 63 +++++++++++++++++++++++++++++++++++
 9 files changed, 137 insertions(+), 7 deletions(-)
 create mode 100644 hub/util/storage_callback.py

diff --git a/hub/api/dataset.py b/hub/api/dataset.py
index 0d12b80f3f..268c048a44 100644
--- a/hub/api/dataset.py
+++ b/hub/api/dataset.py
@@ -1,3 +1,4 @@
+from hub.core.meta.info import Info
 from hub.core.storage.provider import StorageProvider
 from hub.core.tensor import create_tensor
 from typing import Callable, Dict, Optional, Union, Tuple, List, Sequence
@@ -11,7 +12,12 @@
 from hub.core.index import Index
 from hub.integrations import dataset_to_tensorflow
-from hub.util.keys import dataset_exists, get_dataset_meta_key, tensor_exists
+from hub.util.keys import (
+    dataset_exists,
+    get_dataset_info_key,
+    get_dataset_meta_key,
+    tensor_exists,
+)
 from hub.util.bugout_reporter import hub_reporter
 from hub.util.cache_chain import generate_chain
 from hub.util.exceptions import (
@@ -102,6 +108,7 @@ def __init__(
         self.public = public
 
         self._load_meta()
+        self._load_info()
 
         hub_reporter.feature_report(
             feature_name="Dataset", parameters={"Path": str(self.path)}
@@ -233,6 +240,17 @@ def _load_meta(self):
                     self.org_id, self.ds_name, self.meta.as_dict(), public=self.public
                 )
 
+    def _load_info(self):
+        info_key = get_dataset_info_key()
+
+        if info_key in self.storage:
+            self.info = self.storage.get_cachable(info_key, Info)
+        else:
+            self.info = Info()
+            self.info.initialize_callback_location(info_key, self.storage)
+
+        return self.info
+
     @property
     def read_only(self):
         return self._read_only
diff --git a/hub/api/tests/test_info.py b/hub/api/tests/test_info.py
index 105d021e5e..ad929a0d6f 100644
--- a/hub/api/tests/test_info.py
+++ b/hub/api/tests/test_info.py
@@ -74,5 +74,3 @@ def test_tensor(local_ds_generator):
     assert len(t2.info) == 2
 
     assert t1.info.key == 99
-
-
diff --git a/hub/constants.py b/hub/constants.py
index 60f76469cb..e4c6e7bc97 100644
--- a/hub/constants.py
+++ b/hub/constants.py
@@ -24,6 +24,7 @@
 
 SUPPORTED_MODES = ["r", "a"]
 
+# min chunk size is always half of `DEFAULT_MAX_CHUNK_SIZE`
 DEFAULT_MAX_CHUNK_SIZE = 32 * MB
 
 MIN_FIRST_CACHE_SIZE = 32 * MB
@@ -34,8 +35,14 @@
 
 DEFAULT_LOCAL_CACHE_SIZE = 0
 
+# meta is 100% required hub-defined meta
 DATASET_META_FILENAME = "dataset_meta.json"
 TENSOR_META_FILENAME = "tensor_meta.json"
+
+# info is 100% optional user-defined information
+DATASET_INFO_FILENAME = "dataset_info.json"
+TENSOR_INFO_FILENAME = "tensor_info.json"
+
 META_ENCODING = "utf8"
 
 CHUNKS_FOLDER = "chunks"
diff --git a/hub/core/meta/info.py b/hub/core/meta/info.py
index a005e404bf..99e9111ae5 100644
--- a/hub/core/meta/info.py
+++ b/hub/core/meta/info.py
@@ -1,13 +1,34 @@
-from hub.core.storage.cachable import Cachable
+from hub.util.storage_callback import CachableCallback, callback
 
 
-class Info(Cachable):
+class Info(CachableCallback):
     def __init__(self):
         """Contains **optional** key/values that datasets/tensors use for human-readability.
         See the `Meta` class for required key/values for datasets/tensors.
+
+        Note:
+            Since `Info` is rarely written to and mostly by the user, every modifier will call `storage[key] = self`.
+            This is so the user doesn't have to call `flush`.
+            Must call `initialize_callback_location` before using any methods.
         """
 
-        pass
+        self._dict = {}
+        super().__init__()
+
+    @property
+    @callback(check_only=True)
+    def nbytes(self):
+        # TODO: optimize this
+        return len(self.tobytes())
+
+    @callback(check_only=True)
+    def __len__(self):
+        return len(self._dict)
 
+    @callback(check_only=True)
     def as_dict(self) -> dict:
         raise NotImplementedError
+
+    @callback()
+    def update(self, *args, **kwargs):
+        raise NotImplementedError
diff --git a/hub/core/meta/tensor_meta.py b/hub/core/meta/tensor_meta.py
index c7016cb014..8ffabea460 100644
--- a/hub/core/meta/tensor_meta.py
+++ b/hub/core/meta/tensor_meta.py
@@ -181,6 +181,7 @@ def as_dict(self):
 
     @property
     def nbytes(self):
+        # TODO: optimize this
         return len(self.tobytes())
 
 
diff --git a/hub/core/storage/lru_cache.py b/hub/core/storage/lru_cache.py
index 7f315115b1..3fa3b5957e 100644
--- a/hub/core/storage/lru_cache.py
+++ b/hub/core/storage/lru_cache.py
@@ -1,4 +1,5 @@
 from collections import OrderedDict
+from hub.util.storage_callback import CachableCallback
 from hub.core.storage.cachable import Cachable
 from typing import Callable, Set, Union
 
@@ -89,6 +90,10 @@ def get_cachable(self, path: str, expected_class):
             obj = expected_class.frombuffer(item)
             if obj.nbytes <= self.cache_size:
                 self._insert_in_cache(path, obj)
+
+            if isinstance(obj, CachableCallback):
+                obj.initialize_callback_location(path, self)
+
             return obj
 
diff --git a/hub/util/exceptions.py b/hub/util/exceptions.py
index d226f46733..5320b2f31a 100644
--- a/hub/util/exceptions.py
+++ b/hub/util/exceptions.py
@@ -453,3 +453,7 @@ def __init__(self):
         super().__init__(
             f"Python Shared memory with multiprocessing doesn't work properly on Windows."
         )
+
+
+class CallbackInitializationError(Exception):
+    pass
\ No newline at end of file
diff --git a/hub/util/keys.py b/hub/util/keys.py
index f10fb6649e..d120777ee7 100644
--- a/hub/util/keys.py
+++ b/hub/util/keys.py
@@ -1,3 +1,4 @@
+from hub.util.exceptions import CorruptedMetaError
 from hub.core.storage.provider import StorageProvider
 
 import posixpath
@@ -13,10 +14,19 @@ def get_dataset_meta_key() -> str:
     return constants.DATASET_META_FILENAME
 
 
+def get_dataset_info_key() -> str:
+    # dataset info is always relative to the `StorageProvider`'s root
+    return constants.DATASET_INFO_FILENAME
+
+
 def get_tensor_meta_key(key: str) -> str:
     return posixpath.join(key, constants.TENSOR_META_FILENAME)
 
 
+def get_tensor_info_key(key: str) -> str:
+    return posixpath.join(key, constants.TENSOR_INFO_FILENAME)
+
+
 def get_chunk_id_encoder_key(key: str) -> str:
     return posixpath.join(
         key,
@@ -26,7 +36,10 @@ def get_chunk_id_encoder_key(key: str) -> str:
 
 
 def dataset_exists(storage: StorageProvider) -> bool:
-    return get_dataset_meta_key() in storage
+    """A dataset exists if the provided `storage` contains a `dataset_meta.json`."""
+
+    meta_exists = get_dataset_meta_key() in storage
+    return meta_exists
 
 
 def tensor_exists(key: str, storage: StorageProvider) -> bool:
diff --git a/hub/util/storage_callback.py b/hub/util/storage_callback.py
new file mode 100644
index 0000000000..52292cbd5a
--- /dev/null
+++ b/hub/util/storage_callback.py
@@ -0,0 +1,63 @@
+from hub.core.storage.provider import StorageProvider
+from hub.util.exceptions import CallbackInitializationError
+from hub.core.storage.cachable import Cachable
+
+
+def callback(check_only: bool = False):
+    """Decorator for methods that should require `initialize_callback_location` to be called first."""
+
+    # TODO: update docstring
+
+    def outer(func):
+        def inner(obj: "CachableCallback", *args, **kwargs):
+            if not obj._is_callback_initialized():
+                raise CallbackInitializationError(
+                    "Must first call `initialize_callback_location` before any other methods may be called."
+                )
+
+            y = func(obj, *args, **kwargs)
+
+            if not check_only:
+                obj.callback()
+
+            return y
+
+        return inner
+
+    return outer
+
+
+class CachableCallback(Cachable):
+    def __init__(self):
+        # TODO: docstring (warn that this may be very slow and shouldn't be used often or should be optimized)
+
+        self._key = None
+        self._storage = None
+
+    def _is_callback_initialized(self) -> bool:
+        key_ex = self._key is not None
+        storage_ex = self._storage is not None
+        return key_ex and storage_ex
+
+    def initialize_callback_location(self, key: str, storage: StorageProvider):
+        """Must be called once before any other method calls.
+
+        Args:
+            key (str): The key for where in `storage` bytes are serialized with each callback call.
+            storage (LRUCache): The storage for where bytes are serialized with each callback call.
+
+        Raises:
+            CallbackInitializationError: Cannot re-initialize.
+        """
+
+        if self._is_callback_initialized():
+            raise CallbackInitializationError(
+                f"`initialize_callback_location` was already called. key={self._key}"
+            )
+
+        self._key = key
+        self._storage = storage
+
+    def callback(self):
+        # TODO
+        raise NotImplementedError
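The `callback` decorator added above is the mechanism `Info` builds on: any wrapped method first verifies `initialize_callback_location` was called, and then (unless `check_only=True`) persists the object. A minimal sketch of a subclass using it — the `Notes` class and `storage` object are illustrative, and `CachableCallback.callback` only gains its `storage[key] = self` body in the next patch:

    from hub.util.storage_callback import CachableCallback, callback

    class Notes(CachableCallback):  # hypothetical subclass
        def __init__(self):
            super().__init__()
            self._items = []

        @callback()
        def add(self, item: str):
            self._items.append(item)  # afterwards, `self.callback()` persists `self`

    notes = Notes()
    # calling notes.add(...) here would raise CallbackInitializationError
    notes.initialize_callback_location("notes.json", storage)  # storage: any provider
    notes.add("hi")  # runs, then callback() serializes `self` under "notes.json"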
From 5e51ed2bbad00f5544493bd5b67d3ea1ce7a8f37 Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Thu, 15 Jul 2021 18:34:42 -0700
Subject: [PATCH 10/33] write failure test cases & start functionality

---
 hub/api/tests/test_info.py    | 20 +++++++++++++++++++-
 hub/core/meta/info.py         | 17 +++++++++++++++--
 hub/core/storage/lru_cache.py |  7 ++++---
 hub/util/storage_callback.py  |  3 +--
 4 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/hub/api/tests/test_info.py b/hub/api/tests/test_info.py
index ad929a0d6f..aa50989c68 100644
--- a/hub/api/tests/test_info.py
+++ b/hub/api/tests/test_info.py
@@ -1,3 +1,21 @@
+import pytest
+
+
+def test_failures(local_ds):
+    ds = local_ds
+
+    with pytest.raises(ValueError):
+        ds.info.update(1, 2)
+
+    with pytest.raises(ValueError):
+        ds.info.update({"test": 0}, {"test": 2})
+
+    with pytest.raises(ValueError):
+        ds.info.update(1)
+
+    # TODO: raise error when a user tries to add a numpy array
+
+
 def test_dataset(local_ds_generator):
     ds = local_ds_generator()
 
@@ -16,7 +34,7 @@ def test_dataset(local_ds_generator):
     with ds:
         ds.info.update({"test2": (1, 5, (1, "2"), [5, 6, (7, 8)])})
         ds.info.update(xyz="abc")
 
-        ds.info.update({"1_-+", 5})
+        ds.info.update({"1_-+": 5})
 
     ds = local_ds_generator()
diff --git a/hub/core/meta/info.py b/hub/core/meta/info.py
index 99e9111ae5..27f6e8e487 100644
--- a/hub/core/meta/info.py
+++ b/hub/core/meta/info.py
@@ -1,6 +1,9 @@
 from hub.util.storage_callback import CachableCallback, callback
 
 
+_VALUE_ERROR_MSG = '`info.update` should be called with a single dictionary or **kwargs values. Example: `info.update({"key1": 1}, key2=2, key3=3)`'
+
+
 class Info(CachableCallback):
     def __init__(self):
         """Contains **optional** key/values that datasets/tensors use for human-readability.
@@ -27,8 +30,18 @@ def __len__(self):
 
     @callback(check_only=True)
     def as_dict(self) -> dict:
-        raise NotImplementedError
+        # TODO: optimize this
+        return self._dict.copy()
 
     @callback()
     def update(self, *args, **kwargs):
-        raise NotImplementedError
+        # TODO: convert everything to immutable structures
+
+        if len(args) > 1:
+            raise ValueError(_VALUE_ERROR_MSG)  # TODO: exceptions.py
+
+        if len(args) == 1:
+            if not isinstance(args[0], dict):
+                raise ValueError(_VALUE_ERROR_MSG)
+
+        self._dict.update(*args, **kwargs)
diff --git a/hub/core/storage/lru_cache.py b/hub/core/storage/lru_cache.py
index 3fa3b5957e..f08d4e70c7 100644
--- a/hub/core/storage/lru_cache.py
+++ b/hub/core/storage/lru_cache.py
@@ -89,12 +89,13 @@ def get_cachable(self, path: str, expected_class):
 
         if isinstance(item, (bytes, memoryview)):
             obj = expected_class.frombuffer(item)
-            if obj.nbytes <= self.cache_size:
-                self._insert_in_cache(path, obj)
 
             if isinstance(obj, CachableCallback):
                 obj.initialize_callback_location(path, self)
+
+            if obj.nbytes <= self.cache_size:
+                self._insert_in_cache(path, obj)
+
             return obj
 
         raise ValueError(f"Item at '{path}' got an invalid type: '{type(item)}'.")
diff --git a/hub/util/storage_callback.py b/hub/util/storage_callback.py
index 52292cbd5a..3ce52afd1a 100644
--- a/hub/util/storage_callback.py
+++ b/hub/util/storage_callback.py
@@ -59,5 +59,4 @@ def initialize_callback_location(self, key: str, storage: StorageProvider):
         self._storage = storage
 
     def callback(self):
-        # TODO
-        raise NotImplementedError
+        self._storage[self._key] = self

From 52a65f1a0e8ba3bee3fc87b4b2f49c717ca98c6d Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Thu, 15 Jul 2021 18:38:00 -0700
Subject: [PATCH 11/33] fix serialization and allow dotdict-like access

---
 hub/core/meta/info.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/hub/core/meta/info.py b/hub/core/meta/info.py
index 27f6e8e487..1a306c4570 100644
--- a/hub/core/meta/info.py
+++ b/hub/core/meta/info.py
@@ -1,3 +1,4 @@
+from typing import Any
 from hub.util.storage_callback import CachableCallback, callback
 
 
@@ -31,7 +32,17 @@ def __len__(self):
     @callback(check_only=True)
     def as_dict(self) -> dict:
         # TODO: optimize this
-        return self._dict.copy()
+
+        return {"_dict": self._dict.copy()}
+
+    def __getattribute__(self, name: str) -> Any:
+        """Allows access to info values using the `.` syntax. Example: `info.description`."""
+
+        if name == "_dict":
+            return super().__getattribute__(name)
+        if name in self._dict:
+            return self._dict[name]
+        return super().__getattribute__(name)
 
     @callback()
     def update(self, *args, **kwargs):

From ac33b0d02605362606834b5369e1c04b59c3a546 Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Thu, 15 Jul 2021 18:56:31 -0700
Subject: [PATCH 12/33] add tests for non-immutable values

---
 hub/api/tests/test_info.py | 10 +++++++++
 hub/core/meta/info.py      | 46 +++++++++++++++++++++++++++-----------
 hub/util/immutability.py   | 21 +++++++++++++++++
 3 files changed, 64 insertions(+), 13 deletions(-)
 create mode 100644 hub/util/immutability.py

diff --git a/hub/api/tests/test_info.py b/hub/api/tests/test_info.py
index aa50989c68..414d4703db 100644
--- a/hub/api/tests/test_info.py
+++ b/hub/api/tests/test_info.py
@@ -13,6 +13,16 @@ def test_failures(local_ds):
     with pytest.raises(ValueError):
         ds.info.update(1)
 
+    with pytest.raises(ValueError):
+        ds.info.update({"something": {"nested": "dict"}})
+
+    with pytest.raises(ValueError):
+        ds.info.update(bad_key={"normal": "dict"})
+
+    with pytest.raises(ValueError):
+        # this is bad because **kwargs values cannot be dictionaries
+        ds.info.update({"good_key": 1}, good_key=1, bad_key={"normal": "dict"})
+
     # TODO: raise error when a user tries to add a numpy array
diff --git a/hub/core/meta/info.py b/hub/core/meta/info.py
index 1a306c4570..267a63c613 100644
--- a/hub/core/meta/info.py
+++ b/hub/core/meta/info.py
@@ -1,3 +1,7 @@
+from hub.util.immutability import (
+    recursively_parse_as_immutable,
+    validate_can_be_parsed_as_immutable,
+)
 from typing import Any
 from hub.util.storage_callback import CachableCallback, callback
 
@@ -16,7 +20,7 @@ def __init__(self):
             Must call `initialize_callback_location` before using any methods.
         """
 
-        self._dict = {}
+        self._info = {}
         super().__init__()
 
     @property
@@ -27,22 +31,15 @@ def nbytes(self):
 
     @callback(check_only=True)
     def __len__(self):
-        return len(self._dict)
+        return len(self._info)
 
     @callback(check_only=True)
     def as_dict(self) -> dict:
+        # TODO: docstring (INTERNAL USE ONLY!)
+
         # TODO: optimize this
 
-        return {"_dict": self._dict.copy()}
-
-    def __getattribute__(self, name: str) -> Any:
-        """Allows access to info values using the `.` syntax. Example: `info.description`."""
-
-        if name == "_dict":
-            return super().__getattribute__(name)
-        if name in self._dict:
-            return self._dict[name]
-        return super().__getattribute__(name)
+        return {"_info": self._info.copy()}
 
     @callback()
     def update(self, *args, **kwargs):
@@ -55,4 +52,27 @@ def update(self, *args, **kwargs):
             if not isinstance(args[0], dict):
                 raise ValueError(_VALUE_ERROR_MSG)
 
-        self._dict.update(*args, **kwargs)
+        for v in args[0].values():
+            validate_can_be_parsed_as_immutable(v, recursive=True)
+
+        for v in kwargs.values():
+            validate_can_be_parsed_as_immutable(v, recursive=True)
+
+        self._info.update(*args, **kwargs)
+
+    def __getattribute__(self, name: str) -> Any:
+        """Allows access to info values using the `.` syntax. Example: `info.description`."""
+
+        if name == "_info":
+            return super().__getattribute__(name)
+        if name in self._info:
+            return self.__getitem__(name)
+        return super().__getattribute__(name)
+
+    def __getitem__(self, key: str):
+        # TODO: docstring (immutability)
+
+        value = self._info[key]
+
+        # TODO: return immutable (tuples and stuff)
+        return recursively_parse_as_immutable(value)
diff --git a/hub/util/immutability.py b/hub/util/immutability.py
new file mode 100644
index 0000000000..cdbd42ebae
--- /dev/null
+++ b/hub/util/immutability.py
@@ -0,0 +1,21 @@
+from typing import Any, Sequence
+
+
+def validate_can_be_parsed_as_immutable(item: Any, recursive: bool = True):
+    # TODO: docstring
+
+    if not isinstance(item, (int, float, str, list, tuple)):
+        raise ValueError()  # TODO
+
+    if recursive:
+        if isinstance(item, (list, tuple)):
+            for v in item:
+                validate_can_be_parsed_as_immutable(v, recursive=True)
+
+
+def recursively_parse_as_immutable(item: Any):
+    # TODO: docstring
+
+    validate_can_be_parsed_as_immutable(item, recursive=False)
+
+    return item

From e4915477bb2b4d5a5ba46b8a0353b4ebe8cfcf73 Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Fri, 16 Jul 2021 12:17:48 -0700
Subject: [PATCH 13/33] validate is jsonable

---
 hub/core/meta/info.py    | 13 +++++++------
 hub/util/immutability.py | 18 ++++++++++--------
 hub/util/json.py         | 14 ++++++++++++++
 3 files changed, 31 insertions(+), 14 deletions(-)
 create mode 100644 hub/util/json.py

diff --git a/hub/core/meta/info.py b/hub/core/meta/info.py
index 267a63c613..96313db19d 100644
--- a/hub/core/meta/info.py
+++ b/hub/core/meta/info.py
@@ -1,3 +1,4 @@
+from hub.util.json import validate_is_jsonable
 from hub.util.immutability import (
     recursively_parse_as_immutable,
     validate_can_be_parsed_as_immutable,
@@ -52,11 +53,13 @@ def update(self, *args, **kwargs):
             if not isinstance(args[0], dict):
                 raise ValueError(_VALUE_ERROR_MSG)
 
-        for v in args[0].values():
-            validate_can_be_parsed_as_immutable(v, recursive=True)
+        for k, v in args[0].items():
+            validate_can_be_parsed_as_immutable(v, recursive=True, key=k)
+            validate_is_jsonable(k, v)
 
-        for v in kwargs.values():
-            validate_can_be_parsed_as_immutable(v, recursive=True)
+        for k, v in kwargs.items():
+            validate_can_be_parsed_as_immutable(v, recursive=True, key=k)
+            validate_is_jsonable(k, v)
 
         self._info.update(*args, **kwargs)
diff --git a/hub/util/immutability.py b/hub/util/immutability.py
index cdbd42ebae..16e4d3f72f 100644
--- a/hub/util/immutability.py
+++ b/hub/util/immutability.py
@@ -1,21 +1,23 @@
-from typing import Any, Sequence
+from typing import Any
 
 
-def validate_can_be_parsed_as_immutable(item: Any, recursive: bool = True):
+PARSABLE_TYPES = (int, float, str, list, tuple)
+PARSABLE_SEQUENCE_TYPES = (list, tuple)
+
+
+def validate_can_be_parsed_as_immutable(item: Any, recursive: bool = True, key=None):
     # TODO: docstring
 
-    if not isinstance(item, (int, float, str, list, tuple)):
-        raise ValueError()  # TODO
+    if not isinstance(item, PARSABLE_TYPES):
+        raise ValueError()  # TODO (mention `key`)
 
     if recursive:
-        if isinstance(item, (list, tuple)):
+        if isinstance(item, PARSABLE_SEQUENCE_TYPES):
             for v in item:
-                validate_can_be_parsed_as_immutable(v, recursive=True)
+                validate_can_be_parsed_as_immutable(v, recursive=True, key=key)
 
 
 def recursively_parse_as_immutable(item: Any):
     # TODO: docstring
-
     validate_can_be_parsed_as_immutable(item, recursive=False)
-
     return item
diff --git a/hub/util/json.py b/hub/util/json.py
new file mode 100644
index 0000000000..57956faa25
--- /dev/null
+++ b/hub/util/json.py
@@ -0,0 +1,14 @@
+from hub.util.immutability import PARSABLE_TYPES
+import json
+from typing import Any
+
+
+def validate_is_jsonable(key: str, item: Any):
+    # TODO: docstring
+
+    try:
+        json.dumps(item)
+    except Exception:
+        raise ValueError(
+            f"Item for key='{key}' is not JSON serializable. Allowed types: {str(PARSABLE_TYPES)} Got: type={type(item)}, item={item}"
+        )
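`validate_is_jsonable` is what `Info.update` now leans on: any value that `json.dumps` rejects surfaces as a `ValueError` naming the offending key. A small usage sketch — the numpy value is illustrative:

    import numpy as np
    from hub.util.json import validate_is_jsonable

    validate_is_jsonable("labels", ["cat", "dog", 3])  # passes: JSON serializable
    validate_is_jsonable("arr", np.zeros(3))  # raises ValueError naming key 'arr'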
From 0542988d7ad99a1832874ecc8b54f7c4c52ec518 Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Fri, 16 Jul 2021 12:46:12 -0700
Subject: [PATCH 14/33] remove immutability parsing + allow json

---
 hub/api/tests/test_info.py | 35 ++++++++++++++++++---------------
 hub/core/meta/info.py      | 18 +++++++-----------
 hub/util/immutability.py   | 23 ----------------------
 hub/util/json.py           |  3 +--
 4 files changed, 28 insertions(+), 51 deletions(-)
 delete mode 100644 hub/util/immutability.py

diff --git a/hub/api/tests/test_info.py b/hub/api/tests/test_info.py
index 414d4703db..79d5351f6d 100644
--- a/hub/api/tests/test_info.py
+++ b/hub/api/tests/test_info.py
@@ -6,22 +6,20 @@ def test_failures(local_ds):
 
     with pytest.raises(ValueError):
         ds.info.update(1, 2)
-
-    with pytest.raises(ValueError):
-        ds.info.update({"test": 0}, {"test": 2})
-
     with pytest.raises(ValueError):
         ds.info.update(1)
 
+    # bad because 2 positional args
     with pytest.raises(ValueError):
-        ds.info.update({"something": {"nested": "dict"}})
-
-    with pytest.raises(ValueError):
-        ds.info.update(bad_key={"normal": "dict"})
+        ds.info.update({"test": 0}, {"test": 2})
 
-    with pytest.raises(ValueError):
-        # this is bad because **kwargs values cannot be dictionaries
-        ds.info.update({"good_key": 1}, good_key=1, bad_key={"normal": "dict"})
+    # bad because **kwargs values cannot be dictionaries (TODO remove this?)
+    # with pytest.raises(ValueError):
+    #     ds.info.update(bad_key={"normal": "dict"})
+    # with pytest.raises(ValueError):
+    #     ds.info.update({"good_key": 1}, good_key=1, bad_key={"normal": "dict"})
+    # with pytest.raises(ValueError):
+    #     ds.info.update({"something": {"nested": "dict"}})
 
     # TODO: raise error when a user tries to add a numpy array
@@ -51,14 +49,19 @@ def test_dataset(local_ds_generator):
 
     assert ds.info.another_key == "hello"
     assert ds.info.something == "bbbb"
-    assert ds.info.test == (1, 2, "5"), "needs to convert to tuple"
-    assert ds.info.test2 == (1, 5, (1, "2"), (5, 6, (7, 8)))
+
+    # need to convert to tuples (TODO remove this?)
+    # assert ds.info.test == (1, 2, "5")
+    # assert ds.info.test2 == (1, 5, (1, "2"), (5, 6, (7, 8)))
+
+    # TODO: remove this?
+    assert ds.info.test == [1, 2, "5"]
+    assert ds.info.test2 == [1, 5, [1, "2"], [5, 6, [7, 8]]]
+
     assert ds.info.xyz == "abc"
     assert ds.info["1_-+"] == 5  # key can't be accessed with `.` syntax
 
     ds.info.update(test=[99])
 
     ds = local_ds_generator()
 
-    assert len(ds.info) == 8
-    assert ds.info.test == (99,)
+    assert len(ds.info) == 7
+    assert ds.info.test == [99]
diff --git a/hub/core/meta/info.py b/hub/core/meta/info.py
index 96313db19d..567c610d7d 100644
--- a/hub/core/meta/info.py
+++ b/hub/core/meta/info.py
@@ -1,8 +1,4 @@
 from hub.util.json import validate_is_jsonable
-from hub.util.immutability import (
-    recursively_parse_as_immutable,
-    validate_can_be_parsed_as_immutable,
-)
 from typing import Any
 from hub.util.storage_callback import CachableCallback, callback
 
@@ -39,12 +35,11 @@ def as_dict(self) -> dict:
         # TODO: docstring (INTERNAL USE ONLY!)
 
         # TODO: optimize this
-
         return {"_info": self._info.copy()}
 
     @callback()
     def update(self, *args, **kwargs):
-        # TODO: convert everything to immutable structures
+        # TODO: docstring (mention jsonable)
 
         if len(args) > 1:
             raise ValueError(_VALUE_ERROR_MSG)  # TODO: exceptions.py
@@ -54,11 +49,9 @@ def update(self, *args, **kwargs):
                 raise ValueError(_VALUE_ERROR_MSG)
 
         for k, v in args[0].items():
-            validate_can_be_parsed_as_immutable(v, recursive=True, key=k)
             validate_is_jsonable(k, v)
 
         for k, v in kwargs.items():
-            validate_can_be_parsed_as_immutable(v, recursive=True, key=k)
             validate_is_jsonable(k, v)
 
         self._info.update(*args, **kwargs)
@@ -73,7 +66,10 @@ def __getattribute__(self, name: str) -> Any:
         return super().__getattribute__(name)
 
     def __getitem__(self, key: str):
-        # TODO: docstring (immutability)
+        return self._info[key]
+
+    def __str__(self):
+        return self._info.__str__()
 
-        value = self._info[key]
-        return recursively_parse_as_immutable(value)
+    def __repr__(self):
+        return self._info.__repr__()
diff --git a/hub/util/immutability.py b/hub/util/immutability.py
deleted file mode 100644
index 16e4d3f72f..0000000000
--- a/hub/util/immutability.py
+++ /dev/null
@@ -1,23 +0,0 @@
-from typing import Any
-
-
-PARSABLE_TYPES = (int, float, str, list, tuple)
-PARSABLE_SEQUENCE_TYPES = (list, tuple)
-
-
-def validate_can_be_parsed_as_immutable(item: Any, recursive: bool = True, key=None):
-    # TODO: docstring
-
-    if not isinstance(item, PARSABLE_TYPES):
-        raise ValueError()  # TODO (mention `key`)
-
-    if recursive:
-        if isinstance(item, PARSABLE_SEQUENCE_TYPES):
-            for v in item:
-                validate_can_be_parsed_as_immutable(v, recursive=True, key=key)
-
-
-def recursively_parse_as_immutable(item: Any):
-    # TODO: docstring
-    validate_can_be_parsed_as_immutable(item, recursive=False)
-    return item
diff --git a/hub/util/json.py b/hub/util/json.py
index 57956faa25..d466eed1fe 100644
--- a/hub/util/json.py
+++ b/hub/util/json.py
@@ -1,4 +1,3 @@
-from hub.util.immutability import PARSABLE_TYPES
 import json
 from typing import Any
 
@@ -9,5 +8,5 @@ def validate_is_jsonable(key: str, item: Any):
         json.dumps(item)
     except Exception:
         raise ValueError(
-            f"Item for key='{key}' is not JSON serializable. Allowed types: {str(PARSABLE_TYPES)} Got: type={type(item)}, item={item}"
+            f"Item for key='{key}' is not JSON serializable. Got: type={type(item)}, item={item}"
         )

From 39f64d7afbad9aae40b0a0799103fb6329fb6eb5 Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Fri, 16 Jul 2021 12:52:02 -0700
Subject: [PATCH 15/33] move CachableCallback -> hub.api

---
 benchmark_upload.py            | 49 +++++++++++++++++++++++++++
 hub/api/dataset.py             |  2 +-
 hub/{core/meta => api}/info.py | 10 +++---
 hub/core/storage/cachable.py   | 61 +++++++++++++++++++++++++++++++++
 hub/core/storage/lru_cache.py  |  5 ++-
 hub/util/storage_callback.py   | 62 ----------------------------------
 6 files changed, 118 insertions(+), 71 deletions(-)
 create mode 100644 benchmark_upload.py
 rename hub/{core/meta => api}/info.py (91%)
 delete mode 100644 hub/util/storage_callback.py

diff --git a/benchmark_upload.py b/benchmark_upload.py
new file mode 100644
index 0000000000..721f9d7269
--- /dev/null
+++ b/benchmark_upload.py
@@ -0,0 +1,49 @@
+from time import time
+from shutil import rmtree
+import numpy as np
+import cProfile
+import hub
+from tqdm import tqdm
+
+
+def upload():
+    path = f"./mock_mnist"
+    rmtree(path, ignore_errors=True)
+    ds = hub.Dataset(path)
+
+    # ds = hub.Dataset("s3://internal-datasets/dummy-mnist-NEW-META")
+
+    # ds = hub.Dataset(f"hub://dyllan/dummy-mnist-NEW-META")
+
+    ds.create_tensor("image", htype="image", sample_compression=None)
+    ds.create_tensor("label")
+
+    # N = 60000
+    N = 5000
+    with ds:
+        times = []
+        for _ in tqdm(range(N), desc="uploading"):
+            start = time()
+            # ds.image.append(np.ones((512, 512), dtype="uint8"))
+            ds.image.append(np.ones((1024, 1024), dtype="uint8"))
+            # ds.image.append(np.ones((28, 28), dtype="uint8"))
+            ds.label.append(np.ones((1,), dtype="uint8"))
+            times.append((time() - start) * 1000)
+            # ds.label.append(y.astype("int32"))
+        print("first append times (ms):", times[:10])
+        print("last append times (ms):", times[-10:])
+
+    # tensor = ds.image
+
+    # for i in tqdm(range(N), desc="uploading", total=N):
+    #     shape = (28, 28, 3)
+    #     tensor.append(np.arange(np.prod(shape), dtype=np.uint8).reshape(*shape))
+
+
+def main():
+    cProfile.run("upload()", sort="cumulative")
+    # upload()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/hub/api/dataset.py b/hub/api/dataset.py
index 268c048a44..3c556ec2e6 100644
--- a/hub/api/dataset.py
+++ b/hub/api/dataset.py
@@ -1,4 +1,4 @@
-from hub.core.meta.info import Info
+from hub.api.info import Info
 from hub.core.storage.provider import StorageProvider
 from hub.core.tensor import create_tensor
 from typing import Callable, Dict, Optional, Union, Tuple, List, Sequence
diff --git a/hub/core/meta/info.py b/hub/api/info.py
similarity index 91%
rename from hub/core/meta/info.py
rename to hub/api/info.py
index 567c610d7d..720df18bef 100644
--- a/hub/core/meta/info.py
+++ b/hub/api/info.py
@@ -1,6 +1,6 @@
 from hub.util.json import validate_is_jsonable
 from typing import Any
-from hub.util.storage_callback import CachableCallback, callback
+from hub.core.storage.cachable import CachableCallback, use_callback
 
 
 _VALUE_ERROR_MSG = '`info.update` should be called with a single dictionary or **kwargs values. Example: `info.update({"key1": 1}, key2=2, key3=3)`'
@@ -21,23 +21,23 @@ def __init__(self):
         super().__init__()
 
     @property
-    @callback(check_only=True)
+    @use_callback(check_only=True)
     def nbytes(self):
         # TODO: optimize this
         return len(self.tobytes())
 
-    @callback(check_only=True)
+    @use_callback(check_only=True)
     def __len__(self):
         return len(self._info)
 
-    @callback(check_only=True)
+    @use_callback(check_only=True)
     def as_dict(self) -> dict:
         # TODO: docstring (INTERNAL USE ONLY!)
 
         # TODO: optimize this
         return {"_info": self._info.copy()}
 
-    @callback()
+    @use_callback()
     def update(self, *args, **kwargs):
         # TODO: docstring (mention jsonable)
 
diff --git a/hub/core/storage/cachable.py b/hub/core/storage/cachable.py
index db67e5cf92..558f3b58e5 100644
--- a/hub/core/storage/cachable.py
+++ b/hub/core/storage/cachable.py
@@ -1,5 +1,6 @@
 from abc import ABC
 import json
+from hub.util.exceptions import CallbackInitializationError
 
 
 class Cachable(ABC):
@@ -26,3 +27,63 @@ def frombuffer(cls, buffer: bytes):
         instance = cls()
         instance.__dict__.update(json.loads(buffer))
         return instance
+
+
+class CachableCallback(Cachable):
+    def __init__(self):
+        # TODO: docstring (warn that this may be very slow and shouldn't be used often or should be optimized)
+        # TODO: mention in docstring "use_callback"
+
+        self._key = None
+        self._storage = None
+
+    def _is_callback_initialized(self) -> bool:
+        key_ex = self._key is not None
+        storage_ex = self._storage is not None
+        return key_ex and storage_ex
+
+    def initialize_callback_location(self, key, storage):
+        """Must be called once before any other method calls.
+
+        Args:
+            key: The key for where in `storage` bytes are serialized with each callback call.
+            storage: The storage for where bytes are serialized with each callback call.
+
+        Raises:
+            CallbackInitializationError: Cannot re-initialize.
+        """
+
+        if self._is_callback_initialized():
+            raise CallbackInitializationError(
+                f"`initialize_callback_location` was already called. key={self._key}"
+            )
+
+        self._key = key
+        self._storage = storage
+
+    def callback(self):
+        self._storage[self._key] = self
+
+
+def use_callback(check_only: bool = False):
+    """Decorator for methods that should require `initialize_callback_location` to be called first."""
+
+    # TODO: update docstring
+
+    def outer(func):
+        def inner(obj: "CachableCallback", *args, **kwargs):
+            if not obj._is_callback_initialized():
+                raise CallbackInitializationError(
+                    "Must first call `initialize_callback_location` before any other methods may be called."
+                )
+
+            y = func(obj, *args, **kwargs)
+
+            if not check_only:
+                obj.callback()
+
+            return y
+
+        return inner
+
+    return outer
diff --git a/hub/core/storage/lru_cache.py b/hub/core/storage/lru_cache.py
index f08d4e70c7..a2453d294e 100644
--- a/hub/core/storage/lru_cache.py
+++ b/hub/core/storage/lru_cache.py
@@ -1,7 +1,6 @@
 from collections import OrderedDict
-from hub.util.storage_callback import CachableCallback
-from hub.core.storage.cachable import Cachable
-from typing import Callable, Set, Union
+from hub.core.storage.cachable import Cachable, CachableCallback
+from typing import Set, Union
 
 from hub.core.storage.provider import StorageProvider
diff --git a/hub/util/storage_callback.py b/hub/util/storage_callback.py
deleted file mode 100644
index 3ce52afd1a..0000000000
--- a/hub/util/storage_callback.py
+++ /dev/null
@@ -1,62 +0,0 @@
-from hub.core.storage.provider import StorageProvider
-from hub.util.exceptions import CallbackInitializationError
-from hub.core.storage.cachable import Cachable
-
-
-def callback(check_only: bool = False):
-    """Decorator for methods that should require `initialize_callback_location` to be called first."""
-
-    # TODO: update docstring
-
-    def outer(func):
-        def inner(obj: "CachableCallback", *args, **kwargs):
-            if not obj._is_callback_initialized():
-                raise CallbackInitializationError(
-                    "Must first call `initialize_callback_location` before any other methods may be called."
-                )
-
-            y = func(obj, *args, **kwargs)
-
-            if not check_only:
-                obj.callback()
-
-            return y
-
-        return inner
-
-    return outer
-
-
-class CachableCallback(Cachable):
-    def __init__(self):
-        # TODO: docstring (warn that this may be very slow and shouldn't be used often or should be optimized)
-
-        self._key = None
-        self._storage = None
-
-    def _is_callback_initialized(self) -> bool:
-        key_ex = self._key is not None
-        storage_ex = self._storage is not None
-        return key_ex and storage_ex
-
-    def initialize_callback_location(self, key: str, storage: StorageProvider):
-        """Must be called once before any other method calls.
-
-        Args:
-            key (str): The key for where in `storage` bytes are serialized with each callback call.
-            storage (LRUCache): The storage for where bytes are serialized with each callback call.
-
-        Raises:
-            CallbackInitializationError: Cannot re-initialize.
-        """
-
-        if self._is_callback_initialized():
-            raise CallbackInitializationError(
-                f"`initialize_callback_location` was already called. key={self._key}"
-            )
-
-        self._key = key
-        self._storage = storage
-
-    def callback(self):
-        self._storage[self._key] = self

From 55f99f5a5fc656d706d62926d3a59ee67bebf8f2 Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Fri, 16 Jul 2021 12:52:34 -0700
Subject: [PATCH 16/33] remove benchmark file

---
 benchmark_upload.py | 49 ---------------------------------------------
 1 file changed, 49 deletions(-)
 delete mode 100644 benchmark_upload.py

diff --git a/benchmark_upload.py b/benchmark_upload.py
deleted file mode 100644
index 721f9d7269..0000000000
--- a/benchmark_upload.py
+++ /dev/null
@@ -1,49 +0,0 @@
-from time import time
-from shutil import rmtree
-import numpy as np
-import cProfile
-import hub
-from tqdm import tqdm
-
-
-def upload():
-    path = f"./mock_mnist"
-    rmtree(path, ignore_errors=True)
-    ds = hub.Dataset(path)
-
-    # ds = hub.Dataset("s3://internal-datasets/dummy-mnist-NEW-META")
-
-    # ds = hub.Dataset(f"hub://dyllan/dummy-mnist-NEW-META")
-
-    ds.create_tensor("image", htype="image", sample_compression=None)
-    ds.create_tensor("label")
-
-    # N = 60000
-    N = 5000
-    with ds:
-        times = []
-        for _ in tqdm(range(N), desc="uploading"):
-            start = time()
-            # ds.image.append(np.ones((512, 512), dtype="uint8"))
-            ds.image.append(np.ones((1024, 1024), dtype="uint8"))
-            # ds.image.append(np.ones((28, 28), dtype="uint8"))
-            ds.label.append(np.ones((1,), dtype="uint8"))
-            times.append((time() - start) * 1000)
-            # ds.label.append(y.astype("int32"))
-        print("first append times (ms):", times[:10])
-        print("last append times (ms):", times[-10:])
-
-    # tensor = ds.image
-
-    # for i in tqdm(range(N), desc="uploading", total=N):
-    #     shape = (28, 28, 3)
-    #     tensor.append(np.arange(np.prod(shape), dtype=np.uint8).reshape(*shape))
-
-
-def main():
-    cProfile.run("upload()", sort="cumulative")
-    # upload()
-
-
-if __name__ == "__main__":
-    main()
S3Provider(path).clear() yield path @@ -130,6 +132,7 @@ def hub_cloud_path(request, hub_cloud_dev_token): return path = _get_storage_path(request, HUB_CLOUD) + storage_provider_from_hub_path(path, token=hub_cloud_dev_token).clear() yield path From 46faf5a10329ed4ff0fa9f9704efa57754c5dc2f Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Fri, 16 Jul 2021 14:57:16 -0700 Subject: [PATCH 18/33] setstate/getstate for metas and pass tests --- hub/api/dataset.py | 22 +++++++--------------- hub/api/info.py | 20 ++++++++++++++++---- hub/api/tensor.py | 8 +++++++- hub/api/tests/test_info.py | 2 +- hub/core/meta/dataset_meta.py | 5 +++-- hub/core/meta/meta.py | 3 ++- hub/core/meta/tensor_meta.py | 10 +++++++--- hub/core/storage/cachable.py | 11 +++++++++-- 8 files changed, 52 insertions(+), 29 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 3c556ec2e6..48d2621f44 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -1,4 +1,4 @@ -from hub.api.info import Info +from hub.api.info import load_info from hub.core.storage.provider import StorageProvider from hub.core.tensor import create_tensor from typing import Callable, Dict, Optional, Union, Tuple, List, Sequence @@ -107,8 +107,8 @@ def __init__( self.client = HubBackendClient(token=token) self.public = public - self._load_meta() - self._load_info() + self._load_meta() # TODO: use the same load scheme as info + self.info = load_info(get_dataset_info_key(), self.storage) hub_reporter.feature_report( feature_name="Dataset", parameters={"Path": str(self.path)} @@ -237,20 +237,12 @@ def _load_meta(self): self.flush() if self.path.startswith("hub://"): self.client.create_dataset_entry( - self.org_id, self.ds_name, self.meta.as_dict(), public=self.public + self.org_id, + self.ds_name, + self.meta.__getstate__(), + public=self.public, ) - def _load_info(self): - info_key = get_dataset_info_key() - - if info_key in self.storage: - self.info = self.storage.get_cachable(info_key, Info) - else: - self.info = Info() - self.info.initialize_callback_location(info_key, self.storage) - - return self.info - @property def read_only(self): return self._read_only diff --git a/hub/api/info.py b/hub/api/info.py index 720df18bef..bbbf0c4958 100644 --- a/hub/api/info.py +++ b/hub/api/info.py @@ -1,5 +1,6 @@ +from hub.core.storage.provider import StorageProvider from hub.util.json import validate_is_jsonable -from typing import Any +from typing import Any, Dict from hub.core.storage.cachable import CachableCallback, use_callback @@ -31,11 +32,12 @@ def __len__(self): return len(self._info) @use_callback(check_only=True) - def as_dict(self) -> dict: + def __getstate__(self) -> Dict[str, Any]: # TODO: docstring (INTERNAL USE ONLY!) 
+        return self._info
 
-        # TODO: optimize this
-        return {"_info": self._info.copy()}
+    def __setstate__(self, state: Dict[str, Any]):
+        self._info = state.copy()
 
     @use_callback()
     def update(self, *args, **kwargs):
@@ -73,3 +75,13 @@ def __str__(self):
 
     def __repr__(self):
         return self._info.__repr__()
+
+
+def load_info(info_key: str, storage: StorageProvider):
+    if info_key in storage:
+        info = storage.get_cachable(info_key, Info)
+    else:
+        info = Info()
+        info.initialize_callback_location(info_key, storage)
+
+    return info
diff --git a/hub/api/tensor.py b/hub/api/tensor.py
index feaa3daea0..1877f32734 100644
--- a/hub/api/tensor.py
+++ b/hub/api/tensor.py
@@ -1,4 +1,8 @@
-from hub.util.keys import get_chunk_id_encoder_key, get_tensor_meta_key, tensor_exists
+from hub.api.info import load_info
+from hub.util.keys import (
+    get_tensor_info_key,
+    tensor_exists,
+)
 from hub.core.sample import Sample  # type: ignore
 from typing import List, Sequence, Union, Optional, Tuple, Dict
 from hub.util.shape import ShapeInterval
@@ -46,6 +50,8 @@ def __init__(
 
         self.chunk_engine = ChunkEngine(self.key, self.storage)
 
+        self.info = load_info(get_tensor_info_key(self.key), self.storage)
+
     def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]):
         """Extends the end of the tensor by appending multiple elements from a sequence. Accepts a sequence, a single batched numpy array,
         or a sequence of `hub.load` outputs, which can be used to load files. See examples down below.
diff --git a/hub/api/tests/test_info.py b/hub/api/tests/test_info.py
index 79d5351f6d..af8e05f456 100644
--- a/hub/api/tests/test_info.py
+++ b/hub/api/tests/test_info.py
@@ -103,7 +103,7 @@ def test_tensor(local_ds_generator):
     t1 = ds.tensor1
     t2 = ds.tensor2
 
-    assert len(t1.info) == 2
+    assert len(t1.info) == 1
     assert len(t2.info) == 2
 
     assert t1.info.key == 99
diff --git a/hub/core/meta/dataset_meta.py b/hub/core/meta/dataset_meta.py
index f824d896c5..5ca988fbf3 100644
--- a/hub/core/meta/dataset_meta.py
+++ b/hub/core/meta/dataset_meta.py
@@ -1,3 +1,4 @@
+from typing import Any, Dict
 from hub.core.meta.meta import Meta
@@ -12,7 +13,7 @@ def nbytes(self):
         # TODO: can optimize this
         return len(self.tobytes())
 
-    def as_dict(self) -> dict:
-        d = super().as_dict()
+    def __getstate__(self) -> Dict[str, Any]:
+        d = super().__getstate__()
         d["tensors"] = self.tensors
         return d
diff --git a/hub/core/meta/meta.py b/hub/core/meta/meta.py
index bc9bfa55ff..64e4838690 100644
--- a/hub/core/meta/meta.py
+++ b/hub/core/meta/meta.py
@@ -1,3 +1,4 @@
+from typing import Any, Dict
 import hub
 from hub.core.storage.cachable import Cachable
@@ -10,5 +11,5 @@ def __init__(self):
 
         self.version = hub.__version__
 
-    def as_dict(self) -> dict:
+    def __getstate__(self) -> Dict[str, Any]:
         return {"version": self.version}
diff --git a/hub/core/meta/tensor_meta.py b/hub/core/meta/tensor_meta.py
index 8ffabea460..9bb1bb59b4 100644
--- a/hub/core/meta/tensor_meta.py
+++ b/hub/core/meta/tensor_meta.py
@@ -1,4 +1,4 @@
-from typing import Any, Callable, List, Tuple
+from typing import Any, Callable, Dict, List, Tuple
 import numpy as np
 from hub.util.exceptions import (
     TensorInvalidSampleShapeError,
@@ -171,14 +171,18 @@ def _update_shape_interval(self, shape: Tuple[int, ...]):
             self.min_shape[i] = min(dim, self.min_shape[i])
             self.max_shape[i] = max(dim, self.max_shape[i])
 
-    def as_dict(self):
-        d = super().as_dict()
+    def __getstate__(self) -> Dict[str, Any]:
+        d = super().__getstate__()
 
         for key in self._required_meta_keys:
             d[key] = getattr(self, key)
 
         return d
 
+    def __setstate__(self, state: Dict[str, Any]):
+        super().__setstate__(state)
+        self._required_meta_keys = tuple(state.keys())
+
     @property
     def nbytes(self):
         # TODO: optimize this
diff --git a/hub/core/storage/cachable.py b/hub/core/storage/cachable.py
index 558f3b58e5..a055201e6d 100644
--- a/hub/core/storage/cachable.py
+++ b/hub/core/storage/cachable.py
@@ -1,5 +1,6 @@
 from abc import ABC
 import json
+from typing import Any, Dict
 
 from hub.util.exceptions import CallbackInitializationError
@@ -19,13 +20,19 @@ def nbytes(self):
         # do not implement, each class should do this because it could be very slow if `tobytes` is called
         raise NotImplementedError
 
+    def __getstate__(self) -> Dict[str, Any]:
+        raise NotImplementedError
+
+    def __setstate__(self, state: Dict[str, Any]):
+        self.__dict__.update(state)
+
     def tobytes(self) -> bytes:
-        return bytes(json.dumps(self.as_dict()), "utf-8")
+        return bytes(json.dumps(self.__getstate__()), "utf-8")
 
     @classmethod
     def frombuffer(cls, buffer: bytes):
         instance = cls()
-        instance.__dict__.update(json.loads(buffer))
+        instance.__setstate__(json.loads(buffer))
         return instance

From e792f99293762a8e723966bb6a64db3231961c9f Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Fri, 16 Jul 2021 14:58:58 -0700
Subject: [PATCH 19/33] mypy

---
 hub/api/dataset.py | 2 +-
 hub/api/info.py    | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/hub/api/dataset.py b/hub/api/dataset.py
index 48d2621f44..0b52d17ace 100644
--- a/hub/api/dataset.py
+++ b/hub/api/dataset.py
@@ -108,7 +108,7 @@ def __init__(
         self.public = public
 
         self._load_meta()  # TODO: use the same load scheme as info
-        self.info = load_info(get_dataset_info_key(), self.storage)
+        self.info = load_info(get_dataset_info_key(), self.storage)  # type: ignore
 
         hub_reporter.feature_report(
             feature_name="Dataset", parameters={"Path": str(self.path)}
diff --git a/hub/api/info.py b/hub/api/info.py
index bbbf0c4958..7ffd101e38 100644
--- a/hub/api/info.py
+++ b/hub/api/info.py
@@ -1,4 +1,4 @@
-from hub.core.storage.provider import StorageProvider
+from hub.core.storage.lru_cache import LRUCache
 from hub.util.json import validate_is_jsonable
 from typing import Any, Dict
 from hub.core.storage.cachable import CachableCallback, use_callback
@@ -22,7 +22,6 @@ def __init__(self):
         super().__init__()
 
     @property
-    @use_callback(check_only=True)
     def nbytes(self):
         # TODO: optimize this
         return len(self.tobytes())
@@ -77,7 +76,7 @@ def __repr__(self):
         return self._info.__repr__()
 
 
-def load_info(info_key: str, storage: StorageProvider):
+def load_info(info_key: str, storage: LRUCache):
     if info_key in storage:
         info = storage.get_cachable(info_key, Info)
     else:

From d7745e785fd40880a2e45e0aefd115a805e3967d Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Fri, 16 Jul 2021 15:23:49 -0700
Subject: [PATCH 20/33] readonly tests & others

---
 hub/api/info.py                | 32 +++++++----------
 hub/api/tests/test_info.py     | 47 +++++++++++-------------
 hub/api/tests/test_readonly.py |  3 ++
 hub/core/storage/cachable.py   | 66 ++++++++++++++++++----------------
 4 files changed, 71 insertions(+), 77 deletions(-)

diff --git a/hub/api/info.py b/hub/api/info.py
index 7ffd101e38..f6798c8d66 100644
--- a/hub/api/info.py
+++ b/hub/api/info.py
@@ -13,7 +13,7 @@ def __init__(self):
         See the `Meta` class for required key/values for datasets/tensors.
 
         Note:
-            Since `Info` is rarely written to and mostly by the user, every modifier will call `storage[key] = self`.
+            Since `Info` is rarely written to and mostly by the user, every modifier will call `cache[key] = self`.
            This is so the user doesn't have to call `flush`.
 
         Must call `initialize_callback_location` before using any methods.
         """
@@ -32,29 +32,21 @@ def __len__(self):
         return len(self._info)
 
     @use_callback(check_only=True)
     def __getstate__(self) -> Dict[str, Any]:
-        # TODO: docstring (INTERNAL USE ONLY!)
         return self._info
 
     def __setstate__(self, state: Dict[str, Any]):
-        self._info = state.copy()
+        self._info = state
 
     @use_callback()
     def update(self, *args, **kwargs):
-        # TODO: docstring (mention jsonable)
+        """Updates info and synchronizes with cache. Inputs must be supported by JSON.
+        A full list of supported value types can be found here: https://docs.python.org/3/library/json.html#json.JSONEncoder.
 
-        if len(args) > 1:
-            raise ValueError(_VALUE_ERROR_MSG)  # TODO: exceptions.py
-
-        if len(args) == 1:
-            if not isinstance(args[0], dict):
-                raise ValueError(_VALUE_ERROR_MSG)
-
-            for k, v in args[0].items():
-                validate_is_jsonable(k, v)
-
-        for k, v in kwargs.items():
-            validate_is_jsonable(k, v)
+        Note:
+            This method has the same functionality as `dict().update(...)`. Reference: https://www.geeksforgeeks.org/python-dictionary-update-method/.
+        """
 
+        self._cache.check_readonly()
         self._info.update(*args, **kwargs)
 
     def __getattribute__(self, name: str) -> Any:
@@ -76,7 +68,7 @@ def __repr__(self):
         return self._info.__repr__()
 
 
-def load_info(info_key: str, storage: LRUCache):
-    if info_key in storage:
-        info = storage.get_cachable(info_key, Info)
+def load_info(info_key: str, cache: LRUCache):
+    if info_key in cache:
+        info = cache.get_cachable(info_key, Info)
     else:
         info = Info()
-        info.initialize_callback_location(info_key, storage)
+        info.initialize_callback_location(info_key, cache)
 
     return info
diff --git a/hub/api/tests/test_info.py b/hub/api/tests/test_info.py
index af8e05f456..59dcb1edf8 100644
--- a/hub/api/tests/test_info.py
+++ b/hub/api/tests/test_info.py
@@ -1,29 +1,3 @@
-import pytest
-
-
-def test_failures(local_ds):
-    ds = local_ds
-
-    with pytest.raises(ValueError):
-        ds.info.update(1, 2)
-    with pytest.raises(ValueError):
-        ds.info.update(1)
-
-    # bad because 2 positional args
-    with pytest.raises(ValueError):
-        ds.info.update({"test": 0}, {"test": 2})
-
-    # bad because **kwargs values cannot be dictionaries (TODO remove this?)
-    # with pytest.raises(ValueError):
-    #     ds.info.update(bad_key={"normal": "dict"})
-    # with pytest.raises(ValueError):
-    #     ds.info.update({"good_key": 1}, good_key=1, bad_key={"normal": "dict"})
-    # with pytest.raises(ValueError):
-    #     ds.info.update({"something": {"nested": "dict"}})
-
-    # TODO: raise error when a user tries to add a numpy array
-
-
 def test_dataset(local_ds_generator):
     ds = local_ds_generator()
@@ -107,3 +81,24 @@ def test_tensor(local_ds_generator):
     assert len(t2.info) == 2
 
     assert t1.info.key == 99
+
+
+def test_update_reference_manually(local_ds_generator):
+    """Right now synchronization can only happen when you call `info.update`."""
+
+    ds = local_ds_generator()
+
+    ds.info.update(key=[1, 2, 3])
+
+    ds = local_ds_generator()
+
+    l = ds.info.key
+    assert l == [1, 2, 3]
+
+    # un-registered update
+    l.append(5)
+
+    ds = local_ds_generator()
+
+    l = ds.info.key
+    assert l == [1, 2, 3]
diff --git a/hub/api/tests/test_readonly.py b/hub/api/tests/test_readonly.py
index 6bd0c361ff..9e2f78cabf 100644
--- a/hub/api/tests/test_readonly.py
+++ b/hub/api/tests/test_readonly.py
@@ -29,6 +29,9 @@ def test_readonly(local_ds_generator):
     ds.read_only = True
     _assert_readonly_ops(ds, 1, (100, 100))
 
+    with pytest.raises(ReadOnlyModeError):
+        ds.info.update(key=0)
+
 
 @pytest.mark.xfail(raises=CouldNotCreateNewDatasetException, strict=True)
 def test_readonly_doesnt_exist(local_path):
diff --git a/hub/core/storage/cachable.py b/hub/core/storage/cachable.py
index a055201e6d..dabc61c4c7 100644
--- a/hub/core/storage/cachable.py
+++ b/hub/core/storage/cachable.py
@@ -36,25 +36,49 @@ def frombuffer(cls, buffer: bytes):
         return instance
 
 
+def use_callback(check_only: bool = False):
+    """Decorator for methods that should require `initialize_callback_location` to be called first."""
+
+    # TODO: update docstring
+
+    def outer(func):
+        def inner(obj: "CachableCallback", *args, **kwargs):
+            if not obj._is_callback_initialized():
+                raise CallbackInitializationError(
+                    "Must first call `initialize_callback_location` before any other methods may be called."
+                )
+
+            y = func(obj, *args, **kwargs)
+
+            if not check_only:
+                obj.callback()
+
+            return y
+
+        return inner
+
+    return outer
+
+
 class CachableCallback(Cachable):
     def __init__(self):
         # TODO: docstring (warn that this may be very slow and shouldn't be used often or should be optimized)
+        # TODO: mention in docstring "use_callback"
 
         self._key = None
-        self._storage = None
+        self._cache = None
 
     def _is_callback_initialized(self) -> bool:
         key_ex = self._key is not None
-        storage_ex = self._storage is not None
-        return key_ex and storage_ex
+        cache_ex = self._cache is not None
+        return key_ex and cache_ex
 
-    def initialize_callback_location(self, key, storage):
+    def initialize_callback_location(self, key, cache):
         """Must be called once before any other method calls.
 
         Args:
-            key: The key for where in `storage` bytes are serialized with each callback call.
-            storage: The storage for where bytes are serialized with each callback call.
+            key: The key for where in `cache` bytes are serialized with each callback call.
+            cache: The cache for where bytes are serialized with each callback call.
 
         Raises:
             CallbackInitializationError: Cannot re-initialize.
@@ -66,31 +90,11 @@ def initialize_callback_location(self, key, storage):
             )
 
         self._key = key
-        self._storage = storage
+        self._cache = cache
 
     def callback(self):
-        self._storage[self._key] = self
+        self._cache[self._key] = self
 
-
-def use_callback(check_only: bool = False):
-    """Decorator for methods that should require `initialize_callback_location` to be called first."""
-
-    # TODO: update docstring
-
-    def outer(func):
-        def inner(obj: "CachableCallback", *args, **kwargs):
-            if not obj._is_callback_initialized():
-                raise CallbackInitializationError(
-                    "Must first call `initialize_callback_location` before any other methods may be called."
-                )
-
-            y = func(obj, *args, **kwargs)
-
-            if not check_only:
-                obj.callback()
-
-            return y
-
-        return inner
-
-    return outer
+    @use_callback(check_only=True)
+    def flush(self):
+        self._cache.flush()

From 723a61a8e1d520679c16aea5d99b5a00cb15921a Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Mon, 19 Jul 2021 12:02:05 -0700
Subject: [PATCH 21/33] `update` docstring/example

---
 hub/api/info.py            | 35 ++++++++++++++++++++++++++++++-----
 hub/api/tests/test_info.py |  9 +++++++++
 2 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/hub/api/info.py b/hub/api/info.py
index f6798c8d66..c67017b77d 100644
--- a/hub/api/info.py
+++ b/hub/api/info.py
@@ -4,9 +4,6 @@
 from hub.core.storage.cachable import CachableCallback, use_callback
 
 
-_VALUE_ERROR_MSG = '`info.update` should be called with a single dictionary or **kwargs values. Example: `info.update({"key1": 1}, key2=2, key3=3)`'
-
-
 class Info(CachableCallback):
     def __init__(self):
         """Contains **optional** key/values that datasets/tensors use for human-readability.
@@ -39,11 +36,39 @@ def __setstate__(self, state: Dict[str, Any]):
 
     @use_callback()
     def update(self, *args, **kwargs):
-        """Updates info and synchronizes with cache. Inputs must be supported by JSON.
-        A full list of supported value types can be found here: https://docs.python.org/3/library/json.html#json.JSONEncoder.
+        """Store optional dataset/tensor information. Will be accessible after loading your data from a new script!
+        Inputs must be supported by JSON.
 
         Note:
             This method has the same functionality as `dict().update(...)`. Reference: https://www.geeksforgeeks.org/python-dictionary-update-method/.
+            A full list of supported value types can be found here: https://docs.python.org/3/library/json.html#json.JSONEncoder.
+
+        Examples:
+            Normal update usage:
+                >>> ds.info
+                {}
+                >>> ds.info.update(key=0)
+                >>> ds.info
+                {"key": 0}
+                >>> ds.info.update({"key1": 5, "key2": [1, 2, "test"]})
+                >>> ds.info
+                {"key": 0, "key1": 5, "key2": [1, 2, "test"]}
+
+            Alternate update usage:
+                >>> ds.info
+                {}
+                >>> ds.info.update(list=[1, 2, "apple"])
+                >>> ds.info
+                {"list": [1, 2, "apple"]}
+                >>> l = ds.info.list
+                >>> l
+                [1, 2, "apple"]
+                >>> l.append(5)
+                >>> l
+                [1, 2, "apple", 5]
+                >>> ds.info.update()  # required to be persistent!
+ """ self._cache.check_readonly() diff --git a/hub/api/tests/test_info.py b/hub/api/tests/test_info.py index 59dcb1edf8..19de4c8a0a 100644 --- a/hub/api/tests/test_info.py +++ b/hub/api/tests/test_info.py @@ -97,8 +97,17 @@ def test_update_reference_manually(local_ds_generator): # un-registered update l.append(5) + assert ds.info.key == [1, 2, 3, 5] ds = local_ds_generator() l = ds.info.key assert l == [1, 2, 3] + + # registered update + l.append(99) + ds.info.update() + + ds = local_ds_generator() + + assert l == [1, 2, 3, 99] From 173da0329d32497cfc2bb58507191b16bc328dfd Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Mon, 19 Jul 2021 12:15:22 -0700 Subject: [PATCH 22/33] test with --- hub/api/tests/test_info.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hub/api/tests/test_info.py b/hub/api/tests/test_info.py index 19de4c8a0a..6d3fdfd023 100644 --- a/hub/api/tests/test_info.py +++ b/hub/api/tests/test_info.py @@ -13,9 +13,11 @@ def test_dataset(local_ds_generator): ds.info.update(test=[1, 2, "5"]) + test_list = ds.info.test with ds: ds.info.update({"test2": (1, 5, (1, "2"), [5, 6, (7, 8)])}) ds.info.update(xyz="abc") + test_list.extend(["user made change without `update`"]) ds.info.update({"1_-+": 5}) @@ -31,7 +33,7 @@ def test_dataset(local_ds_generator): # assert ds.info.test2 == (1, 5, (1, "2"), (5, 6, (7, 8))) # TODO: remove this? - assert ds.info.test == [1, 2, "5"] + assert ds.info.test == [1, 2, "5", "user made change without `update`"] assert ds.info.test2 == [1, 5, [1, "2"], [5, 6, [7, 8]]] assert ds.info.xyz == "abc" From fbed70935b7b716992b69456580e39d9610e612b Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Mon, 19 Jul 2021 12:16:48 -0700 Subject: [PATCH 23/33] remove flush statement --- hub/api/info.py | 1 - 1 file changed, 1 deletion(-) diff --git a/hub/api/info.py b/hub/api/info.py index c67017b77d..e1ba9b2586 100644 --- a/hub/api/info.py +++ b/hub/api/info.py @@ -11,7 +11,6 @@ def __init__(self): Note: Since `Info` is rarely written to and mostly by the user, every modifier will call `cache[key] = self`. - This is so the user doesn't have to call `flush`. Must call `initialize_callback_location` before using any methods. """ From 930a0ed7f3d451b743b0a1a5b349b6c3957f6239 Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Mon, 19 Jul 2021 12:24:22 -0700 Subject: [PATCH 24/33] remove TODOs and add tensor info readonly test --- hub/api/tests/test_info.py | 5 ----- hub/api/tests/test_readonly.py | 3 +++ 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/hub/api/tests/test_info.py b/hub/api/tests/test_info.py index 6d3fdfd023..c2cd25be32 100644 --- a/hub/api/tests/test_info.py +++ b/hub/api/tests/test_info.py @@ -28,11 +28,6 @@ def test_dataset(local_ds_generator): assert ds.info.another_key == "hello" assert ds.info.something == "bbbb" - # need to convert to tuples (TODO remove this?) - # assert ds.info.test == (1, 2, "5") - # assert ds.info.test2 == (1, 5, (1, "2"), (5, 6, (7, 8))) - - # TODO: remove this? 
     assert ds.info.test == [1, 2, "5", "user made change without `update`"]
     assert ds.info.test2 == [1, 5, [1, "2"], [5, 6, [7, 8]]]
 
     assert ds.info.xyz == "abc"
diff --git a/hub/api/tests/test_readonly.py b/hub/api/tests/test_readonly.py
index 9e2f78cabf..901423b5f2 100644
--- a/hub/api/tests/test_readonly.py
+++ b/hub/api/tests/test_readonly.py
@@ -32,6 +32,9 @@ def test_readonly(local_ds_generator):
     with pytest.raises(ReadOnlyModeError):
         ds.info.update(key=0)
 
+    with pytest.raises(ReadOnlyModeError):
+        ds.tensor.info.update(key=0)
+
 
 @pytest.mark.xfail(raises=CouldNotCreateNewDatasetException, strict=True)
 def test_readonly_doesnt_exist(local_path):

From a212d375ad660adb08a6b55a9f61066f77b966a5 Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Mon, 19 Jul 2021 12:32:35 -0700
Subject: [PATCH 25/33] CachableCallback decorator

---
 hub/core/storage/cachable.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/hub/core/storage/cachable.py b/hub/core/storage/cachable.py
index dabc61c4c7..c412a416d1 100644
--- a/hub/core/storage/cachable.py
+++ b/hub/core/storage/cachable.py
@@ -62,8 +62,14 @@
 
 class CachableCallback(Cachable):
     def __init__(self):
-        # TODO: docstring (warn that this may be very slow and shouldn't be used often or should be optimized)
-        # TODO: mention in docstring "use_callback"
+        """CachableCallback objects can be stored in a memory cache, and when modifier methods are called, the object is synchronized
+        with the cache. This means the user doesn't have to do `ds.cache[cache_key] = ds.info`.
+
+        Note:
+            This class should be used as infrequently as possible, as it may lead to slowdowns.
+            When extending this class, methods that should have a callback called should be decorated with
+            `@use_callback()`.
+        """
 
         self._key = None
         self._cache = None

From d1f6faf249f1b9d832a777ae190a27304a2afc38 Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Mon, 19 Jul 2021 12:34:36 -0700
Subject: [PATCH 26/33] formatting

---
 hub/util/exceptions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hub/util/exceptions.py b/hub/util/exceptions.py
index 5320b2f31a..299f57f648 100644
--- a/hub/util/exceptions.py
+++ b/hub/util/exceptions.py
@@ -456,4 +456,4 @@ def __init__(self):
 
 class CallbackInitializationError(Exception):
-    pass
\ No newline at end of file
+    pass

From 5959dfb5f7fcdab150127f3a98082dd8db3bd984 Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Mon, 19 Jul 2021 12:36:54 -0700
Subject: [PATCH 27/33] validate json docstring

---
 hub/util/json.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/hub/util/json.py b/hub/util/json.py
index d466eed1fe..9638cb1e2b 100644
--- a/hub/util/json.py
+++ b/hub/util/json.py
@@ -3,7 +3,15 @@
 
 def validate_is_jsonable(key: str, item: Any):
-    # TODO: docstring
+    """Validates if `item` can be parsed with the `json` package.
+
+    Args:
+        key (str): Key for the item. This is printed in the exception.
+        item (Any): `item` that should be parsable with the `json` package.
+
+    Raises:
+        ValueError: If `item` is not `json` parsable.
+ """ try: json.dumps(item) From 4cfc707099aa089621b6454c9e6d638be31d82ad Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Tue, 20 Jul 2021 10:30:03 -0700 Subject: [PATCH 28/33] read_only mode xfail pass --- hub/api/dataset.py | 10 ++++------ hub/core/storage/tests/test_readonly.py | 3 +-- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index c3f0af19e9..ee297abc0a 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -252,14 +252,12 @@ def _load_meta(self): raise PathNotEmptyException else: - self.meta = DatasetMeta() - - try: - self.storage[meta_key] = self.meta - except ReadOnlyModeError: - # if this is thrown, that means the dataset doesn't exist and the user has no write access. + if self.read_only: + # cannot create a new dataset when in read_only mode. raise CouldNotCreateNewDatasetException(self.path) + self.meta = DatasetMeta() + self.storage[meta_key] = self.meta self.flush() if self.path.startswith("hub://"): self.client.create_dataset_entry( diff --git a/hub/core/storage/tests/test_readonly.py b/hub/core/storage/tests/test_readonly.py index c5005f4da8..d61d704f52 100644 --- a/hub/core/storage/tests/test_readonly.py +++ b/hub/core/storage/tests/test_readonly.py @@ -1,6 +1,5 @@ import pytest -from hub.util.exceptions import CouldNotCreateNewDatasetException, ReadOnlyModeError -from hub import Dataset +from hub.util.exceptions import ReadOnlyModeError from hub.tests.storage_fixtures import enabled_storages From 59a99dd0f9089b6d2f1fe9194463993ce1157f9e Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Tue, 20 Jul 2021 11:19:52 -0700 Subject: [PATCH 29/33] move meta / info loading into set derived attribtues --- hub/api/dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index ee297abc0a..3db662cd60 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -95,13 +95,11 @@ def __init__( self.tensors: Dict[str, Tensor] = {} self._token = token self.public = public - self._load_meta() # TODO: use the same load scheme as info - self.info = load_info(get_dataset_info_key(), self.storage) # type: ignore - self.index.validate(self.num_samples) hub_reporter.feature_report( feature_name="Dataset", parameters={"Path": str(self.path)} ) + self._set_derived_attributes() def __enter__(self): @@ -334,13 +332,15 @@ def _get_total_meta(self): def _set_derived_attributes(self): """Sets derived attributes during init and unpickling.""" + self.storage.autoflush = True if self.path.startswith("hub://"): split_path = self.path.split("/") self.org_id, self.ds_name = split_path[2], split_path[3] self.client = HubBackendClient(token=self._token) - self._load_meta() + self._load_meta() # TODO: use the same scheme as `load_info` + self.info = load_info(get_dataset_info_key(), self.storage) # type: ignore self.index.validate(self.num_samples) hub_reporter.feature_report( From cd06615fbb0963ed5ce932dfba11249dd1574c3f Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Tue, 20 Jul 2021 11:21:07 -0700 Subject: [PATCH 30/33] remove extra feature rport --- hub/api/dataset.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 3db662cd60..97e6c9dcfe 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -96,10 +96,6 @@ def __init__( self._token = token self.public = public - hub_reporter.feature_report( - feature_name="Dataset", parameters={"Path": str(self.path)} - ) - self._set_derived_attributes() def __enter__(self): From 
From ba8095eeaa583ab5f953dcb1921aa0d3b4fa6a92 Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Tue, 20 Jul 2021 11:26:15 -0700
Subject: [PATCH 31/33] update docstring

---
 hub/core/storage/cachable.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/hub/core/storage/cachable.py b/hub/core/storage/cachable.py
index c412a416d1..7ba8bf509c 100644
--- a/hub/core/storage/cachable.py
+++ b/hub/core/storage/cachable.py
@@ -37,9 +37,15 @@
 
 def use_callback(check_only: bool = False):
-    """Decorator for methods that should require `initialize_callback_location` to be called first."""
+    """Decorator function for `CachableCallback` and its subclasses.
+    Use this decorator on a field method that should use the `CachableCallback.callback` method.
 
-    # TODO: update docstring
+    All methods that are decorated will require that `CachableCallback.initialize_callback_location`
+    is called first. Also, after the function executes, `CachableCallback.callback` is called.
+
+    Args:
+        check_only (bool, optional): If True, the callback is not actually called. Only the requirement check is executed. Defaults to False.
+    """
 
     def outer(func):
         def inner(obj: "CachableCallback", *args, **kwargs):

From f447bda8aef7f3d597bfb2a99b5c32386c9009fb Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Tue, 20 Jul 2021 12:00:54 -0700
Subject: [PATCH 32/33] fix darglint

---
 hub/core/storage/cachable.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/hub/core/storage/cachable.py b/hub/core/storage/cachable.py
index 7ba8bf509c..24bda46b5d 100644
--- a/hub/core/storage/cachable.py
+++ b/hub/core/storage/cachable.py
@@ -38,13 +38,18 @@
 
 def use_callback(check_only: bool = False):
     """Decorator function for `CachableCallback` and its subclasses.
-    Use this decorator on a field method that should use the `CachableCallback.callback` method.
 
-    All methods that are decorated will require that `CachableCallback.initialize_callback_location`
-    is called first. Also, after the function executes, `CachableCallback.callback` is called.
+    Note:
+        Must call `@use_callback()` not `@use_callback`.
+        Use this decorator on a field method that should use the `CachableCallback.callback` method.
+        All methods that are decorated will require that `CachableCallback.initialize_callback_location`
+        is called first. Also, after the function executes, `CachableCallback.callback` is called.
 
     Args:
-        check_only (bool, optional): If True, the callback is not actually called. Only the requirement check is executed. Defaults to False.
+        check_only (bool): If True, the callback is not actually called. Only the requirement check is executed. Defaults to False.
+
+    Returns:
+        Decorator function.
""" def outer(func): From 19160c50288a4c1d5a87546407dfdc745f188fb8 Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Wed, 21 Jul 2021 10:45:22 -0700 Subject: [PATCH 33/33] minor comments --- hub/api/info.py | 1 - hub/constants.py | 4 ++-- hub/core/storage/cachable.py | 6 +++--- hub/util/json.py | 21 --------------------- 4 files changed, 5 insertions(+), 27 deletions(-) delete mode 100644 hub/util/json.py diff --git a/hub/api/info.py b/hub/api/info.py index e1ba9b2586..383edb7716 100644 --- a/hub/api/info.py +++ b/hub/api/info.py @@ -1,5 +1,4 @@ from hub.core.storage.lru_cache import LRUCache -from hub.util.json import validate_is_jsonable from typing import Any, Dict from hub.core.storage.cachable import CachableCallback, use_callback diff --git a/hub/constants.py b/hub/constants.py index e4c6e7bc97..52787be452 100644 --- a/hub/constants.py +++ b/hub/constants.py @@ -35,11 +35,11 @@ DEFAULT_LOCAL_CACHE_SIZE = 0 -# meta is 100% required hub-defined meta +# meta is hub-defined information, necessary for hub Datasets/Tensors to function DATASET_META_FILENAME = "dataset_meta.json" TENSOR_META_FILENAME = "tensor_meta.json" -# info is 100% optional user-defined information +# info is user-defined information, entirely optional. may be used by the visualizer DATASET_INFO_FILENAME = "dataset_info.json" TENSOR_INFO_FILENAME = "tensor_info.json" diff --git a/hub/core/storage/cachable.py b/hub/core/storage/cachable.py index 24bda46b5d..4a40f0c6c2 100644 --- a/hub/core/storage/cachable.py +++ b/hub/core/storage/cachable.py @@ -86,9 +86,9 @@ def __init__(self): self._cache = None def _is_callback_initialized(self) -> bool: - key_ex = self._key is not None - cache_ex = self._cache is not None - return key_ex and cache_ex + key_exists = self._key is not None + cache_exists = self._cache is not None + return key_exists and cache_exists def initialize_callback_location(self, key, cache): """Must be called once before any other method calls. diff --git a/hub/util/json.py b/hub/util/json.py deleted file mode 100644 index 9638cb1e2b..0000000000 --- a/hub/util/json.py +++ /dev/null @@ -1,21 +0,0 @@ -import json -from typing import Any - - -def validate_is_jsonable(key: str, item: Any): - """Validates if `item` can be parsed with the `json` package. - - Args: - key (str): Key for the item. This is printed in the exception. - item (Any): `item` that should be parsable with the `json` package. - - Raises: - ValueError: If `item` is not `json` parsable. - """ - - try: - json.dumps(item) - except Exception: - raise ValueError( - f"Item for key='{key}' is not JSON serializable. Got: type={type(item)}, item={item}" - )