Merge pull request #1066 from activeloopai/task/udpate-meta-api
dataset/tensor `info` alongside `meta`
verbose-void committed Jul 21, 2021
2 parents 2047f2a + 19160c5 commit 35224df
Showing 18 changed files with 429 additions and 35 deletions.
28 changes: 19 additions & 9 deletions hub/api/dataset.py
@@ -1,3 +1,4 @@
from hub.api.info import load_info
from hub.core.storage.provider import StorageProvider
from hub.core.tensor import create_tensor
from typing import Any, Callable, Dict, Optional, Union, Tuple, List, Sequence
@@ -11,7 +12,12 @@

from hub.core.index import Index
from hub.integrations import dataset_to_tensorflow
-from hub.util.keys import dataset_exists, get_dataset_meta_key, tensor_exists
+from hub.util.keys import (
+    dataset_exists,
+    get_dataset_info_key,
+    get_dataset_meta_key,
+    tensor_exists,
+)
from hub.util.bugout_reporter import hub_reporter
from hub.util.cache_chain import generate_chain
from hub.util.exceptions import (
@@ -89,6 +95,7 @@ def __init__(
self.tensors: Dict[str, Tensor] = {}
self._token = token
self.public = public

self._set_derived_attributes()

def __enter__(self):
@@ -239,18 +246,19 @@ def _load_meta(self):
raise PathNotEmptyException

else:
-            self.meta = DatasetMeta()
-
-            try:
-                self.storage[meta_key] = self.meta
-            except ReadOnlyModeError:
-                # if this is thrown, that means the dataset doesn't exist and the user has no write access.
+            if self.read_only:
+                # cannot create a new dataset when in read_only mode.
                 raise CouldNotCreateNewDatasetException(self.path)

+            self.meta = DatasetMeta()
+            self.storage[meta_key] = self.meta
             self.flush()
if self.path.startswith("hub://"):
self.client.create_dataset_entry(
-                    self.org_id, self.ds_name, self.meta.as_dict(), public=self.public
+                    self.org_id,
+                    self.ds_name,
+                    self.meta.__getstate__(),
+                    public=self.public,
)

@property
@@ -320,13 +328,15 @@ def _get_total_meta(self):

def _set_derived_attributes(self):
"""Sets derived attributes during init and unpickling."""

self.storage.autoflush = True
if self.path.startswith("hub://"):
split_path = self.path.split("/")
self.org_id, self.ds_name = split_path[2], split_path[3]
self.client = HubBackendClient(token=self._token)

-        self._load_meta()
+        self._load_meta()  # TODO: use the same scheme as `load_info`
+        self.info = load_info(get_dataset_info_key(), self.storage)  # type: ignore
self.index.validate(self.num_samples)

hub_reporter.feature_report(
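Taken together, these hunks give every `Dataset` a persisted, user-editable `info` mapping alongside its hub-managed `meta`. A minimal usage sketch — the `hub.dataset` entry point and the local path are assumptions, not part of this diff:

```python
import hub  # assumption: hub 2.x top-level API

ds = hub.dataset("./lenders")  # hypothetical local dataset path

# `load_info` lazily creates the Info object if no info file exists yet.
ds.info.update(description="loan applications", year=2021)

# Reloading in a new session sees the same values, because every
# `update` writes the Info object back through the storage cache.
ds = hub.dataset("./lenders")
assert ds.info.description == "loan applications"
```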
101 changes: 101 additions & 0 deletions hub/api/info.py
@@ -0,0 +1,101 @@
from hub.core.storage.lru_cache import LRUCache
from typing import Any, Dict
from hub.core.storage.cachable import CachableCallback, use_callback


class Info(CachableCallback):
def __init__(self):
"""Contains **optional** key/values that datasets/tensors use for human-readability.
See the `Meta` class for required key/values for datasets/tensors.
Note:
Since `Info` is rarely written to and mostly by the user, every modifier will call `cache[key] = self`.
Must call `initialize_callback_location` before using any methods.
"""

self._info = {}
super().__init__()

@property
def nbytes(self):
# TODO: optimize this
return len(self.tobytes())

@use_callback(check_only=True)
def __len__(self):
return len(self._info)

@use_callback(check_only=True)
def __getstate__(self) -> Dict[str, Any]:
return self._info

def __setstate__(self, state: Dict[str, Any]):
self._info = state

@use_callback()
def update(self, *args, **kwargs):
"""Store optional dataset/tensor information. Will be accessible after loading your data from a new script!
Inputs must be supported by JSON.
Note:
This method has the same functionality as `dict().update(...)`. Reference: https://www.geeksforgeeks.org/python-dictionary-update-method/.
A full list of supported value types can be found here: https://docs.python.org/3/library/json.html#json.JSONEncoder.
Examples:
Normal update usage:
>>> ds.info
{}
>>> ds.info.update(key=0)
>>> ds.info
{"key": 0}
>>> ds.info.update({"key1": 5, "key2": [1, 2, "test"]})
>>> ds.info
{"key": 0, "key1": 5, "key2": [1, 2, "test"]}
Alternate update usage:
>>> ds.info
{}
>>> ds.info.update(list=[1, 2, "apple"])
>>> ds.info
{"list": [1, 2, "apple"]}
>>> l = ds.info.list
>>> l
[1, 2, "apple"]
>>> l.append(5)
>>> l
[1, 2, "apple", 5]
>>> ds.info.update()  # required for the in-place change to persist!
"""

self._cache.check_readonly()
self._info.update(*args, **kwargs)

def __getattribute__(self, name: str) -> Any:
"""Allows access to info values using the `.` syntax. Example: `info.description`."""

if name == "_info":
return super().__getattribute__(name)
if name in self._info:
return self.__getitem__(name)
return super().__getattribute__(name)

def __getitem__(self, key: str):
return self._info[key]

def __str__(self):
return self._info.__str__()

def __repr__(self):
return self._info.__repr__()


def load_info(info_key: str, cache: LRUCache):
if info_key in cache:
info = cache.get_cachable(info_key, Info)
else:
info = Info()
info.initialize_callback_location(info_key, cache)

return info
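`CachableCallback` and `use_callback` come from `hub.core.storage.cachable`, one of the changed files not shown here. A rough sketch of the contract `Info` relies on, inferred from the calls above — treat the names and bodies as assumptions, not the actual implementation:

```python
# Hypothetical sketch of the cachable-callback contract; the real code
# lives in hub/core/storage/cachable.py (not shown in this diff).
from functools import wraps


def use_callback(check_only: bool = False):
    """Require that `initialize_callback_location` was called; unless
    `check_only`, write `self` back into the cache after the method runs."""

    def decorator(method):
        @wraps(method)
        def wrapper(self, *args, **kwargs):
            if self._key is None or self._cache is None:
                raise Exception("Call `initialize_callback_location` first.")
            out = method(self, *args, **kwargs)
            if not check_only:
                self._cache[self._key] = self  # register the mutation
            return out

        return wrapper

    return decorator


class CachableCallback:
    def __init__(self):
        self._key = None
        self._cache = None

    def initialize_callback_location(self, key, cache):
        """Bind this object to the cache slot it writes itself into."""
        self._key = key
        self._cache = cache
```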
8 changes: 7 additions & 1 deletion hub/api/tensor.py
@@ -1,4 +1,8 @@
-from hub.util.keys import get_chunk_id_encoder_key, get_tensor_meta_key, tensor_exists
+from hub.api.info import load_info
+from hub.util.keys import (
+    get_tensor_info_key,
+    tensor_exists,
+)
from hub.core.sample import Sample # type: ignore
from typing import List, Sequence, Union, Optional, Tuple, Dict
from hub.util.shape import ShapeInterval
@@ -47,6 +51,8 @@ def __init__(
self.chunk_engine = ChunkEngine(self.key, self.storage)
self.index.validate(self.num_samples)

self.info = load_info(get_tensor_info_key(self.key), self.storage)

def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]):
"""Extends the end of the tensor by appending multiple elements from a sequence. Accepts a sequence, a single batched numpy array,
or a sequence of `hub.read` outputs, which can be used to load files. See examples down below.
110 changes: 110 additions & 0 deletions hub/api/tests/test_info.py
@@ -0,0 +1,110 @@
def test_dataset(local_ds_generator):
ds = local_ds_generator()

assert len(ds.info) == 0

ds.info.update(my_key=0)
ds.info.update(my_key=1)

ds.info.update(another_key="hi")
ds.info.update({"another_key": "hello"})

ds.info.update({"something": "aaaaa"}, something="bbbb")

ds.info.update(test=[1, 2, "5"])

test_list = ds.info.test
with ds:
ds.info.update({"test2": (1, 5, (1, "2"), [5, 6, (7, 8)])})
ds.info.update(xyz="abc")
test_list.extend(["user made change without `update`"])

ds.info.update({"1_-+": 5})

ds = local_ds_generator()

assert len(ds.info) == 7

assert ds.info.another_key == "hello"
assert ds.info.something == "bbbb"

assert ds.info.test == [1, 2, "5", "user made change without `update`"]
assert ds.info.test2 == [1, 5, [1, "2"], [5, 6, [7, 8]]]

assert ds.info.xyz == "abc"
assert ds.info["1_-+"] == 5 # key can't be accessed with `.` syntax

ds.info.update(test=[99])

ds = local_ds_generator()

assert len(ds.info) == 7
assert ds.info.test == [99]


def test_tensor(local_ds_generator):
ds = local_ds_generator()

t1 = ds.create_tensor("tensor1")
t2 = ds.create_tensor("tensor2")

assert len(t1.info) == 0
assert len(t2.info) == 0

t1.info.update(key=0)
t2.info.update(key=1, key1=0)

ds = local_ds_generator()

t1 = ds.tensor1
t2 = ds.tensor2

assert len(t1.info) == 1
assert len(t2.info) == 2

assert t1.info.key == 0
assert t2.info.key == 1
assert t2.info.key1 == 0

with ds:
t1.info.update(key=99)

ds = local_ds_generator()

t1 = ds.tensor1
t2 = ds.tensor2

assert len(t1.info) == 1
assert len(t2.info) == 2

assert t1.info.key == 99


def test_update_reference_manually(local_ds_generator):
"""Right now synchronization can only happen when you call `info.update`."""

ds = local_ds_generator()

ds.info.update(key=[1, 2, 3])

ds = local_ds_generator()

l = ds.info.key
assert l == [1, 2, 3]

# un-registered update
l.append(5)
assert ds.info.key == [1, 2, 3, 5]

ds = local_ds_generator()

l = ds.info.key
assert l == [1, 2, 3]

# registered update
l.append(99)
ds.info.update()

ds = local_ds_generator()

assert l == [1, 2, 3, 99]
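The last test pins down the synchronization rule: a value fetched from `info` is the live object, so in-place mutations are visible immediately in memory but are only persisted once a later `info.update()` registers them. The safe pattern, as a short sketch reusing the `ds` handle from the tests above (`tags` is a hypothetical key):

```python
tags = ds.info.tags      # the live list object stored under "tags"
tags.append("reviewed")  # in-memory only at this point
ds.info.update()         # no-arg update persists the in-place change
```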
6 changes: 6 additions & 0 deletions hub/api/tests/test_readonly.py
@@ -29,6 +29,12 @@ def test_readonly(local_ds_generator):
ds.read_only = True
_assert_readonly_ops(ds, 1, (100, 100))

with pytest.raises(ReadOnlyModeError):
ds.info.update(key=0)

with pytest.raises(ReadOnlyModeError):
ds.tensor.info.update(key=0)


@pytest.mark.xfail(raises=CouldNotCreateNewDatasetException, strict=True)
def test_readonly_doesnt_exist(local_path):
7 changes: 7 additions & 0 deletions hub/constants.py
@@ -24,6 +24,7 @@

SUPPORTED_MODES = ["r", "a"]

# min chunk size is always half of `DEFAULT_MAX_CHUNK_SIZE`
DEFAULT_MAX_CHUNK_SIZE = 32 * MB

MIN_FIRST_CACHE_SIZE = 32 * MB
@@ -34,8 +35,14 @@
DEFAULT_LOCAL_CACHE_SIZE = 0


# meta is hub-defined information, necessary for hub Datasets/Tensors to function
DATASET_META_FILENAME = "dataset_meta.json"
TENSOR_META_FILENAME = "tensor_meta.json"

# info is user-defined information, entirely optional. May be used by the visualizer
DATASET_INFO_FILENAME = "dataset_info.json"
TENSOR_INFO_FILENAME = "tensor_info.json"

META_ENCODING = "utf8"

CHUNKS_FOLDER = "chunks"
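The new `get_dataset_info_key` and `get_tensor_info_key` helpers in `hub/util/keys.py` are among the changed files not shown above. A plausible reconstruction from these constants and the call sites in `dataset.py` and `tensor.py` — the exact join logic is an assumption:

```python
# Hypothetical sketch of the hub/util/keys.py additions (file not shown here).
import posixpath

from hub.constants import DATASET_INFO_FILENAME, TENSOR_INFO_FILENAME


def get_dataset_info_key() -> str:
    # dataset-level info sits at the root of the dataset's storage
    return DATASET_INFO_FILENAME


def get_tensor_info_key(key: str) -> str:
    # tensor-level info sits under that tensor's key prefix
    return posixpath.join(key, TENSOR_INFO_FILENAME)
```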
4 changes: 3 additions & 1 deletion hub/core/chunk.py
@@ -108,8 +108,10 @@ def update_headers(self, incoming_num_bytes: int, sample_shape: Tuple[int]):
self.shapes_encoder.add_shape(sample_shape, 1)
self.byte_positions_encoder.add_byte_position(num_bytes_per_sample, 1)

-    def __len__(self):
+    @property
+    def nbytes(self):
"""Calculates the number of bytes `tobytes` will be without having to call `tobytes`. Used by `LRUCache` to determine if this chunk can be cached."""

return infer_chunk_num_bytes(
hub.__version__,
self.shapes_encoder.array,
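Replacing `__len__` with an explicit `nbytes` property makes byte-size queries unambiguous (for a chunk, `len` could just as plausibly mean a sample count) and gives the cache one uniform sizing hook across `Chunk`, `DatasetMeta`, `ChunkIdEncoder`, and `Info`. A toy illustration of that hook — not hub's `LRUCache`:

```python
# Toy size-aware cache relying only on the `nbytes` protocol (illustrative).
class TinySizedCache:
    def __init__(self, max_bytes: int):
        self.max_bytes = max_bytes
        self.used_bytes = 0
        self.objects = {}

    def put(self, key: str, obj) -> bool:
        """Cache `obj` if its reported size fits; return whether it was cached."""
        if self.used_bytes + obj.nbytes > self.max_bytes:
            return False  # caller must write through to the next storage layer
        self.objects[key] = obj
        self.used_bytes += obj.nbytes
        return True
```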
2 changes: 1 addition & 1 deletion hub/core/chunk_engine.py
@@ -213,7 +213,7 @@ def _synchronize_cache(self):
# synchronize last chunk
last_chunk_key = self.last_chunk_key
last_chunk = self.last_chunk
-        self.cache.update_used_cache_for_path(last_chunk_key, len(last_chunk))  # type: ignore
+        self.cache.update_used_cache_for_path(last_chunk_key, last_chunk.nbytes)  # type: ignore

# synchronize tensor meta
tensor_meta_key = get_tensor_meta_key(self.key)
13 changes: 8 additions & 5 deletions hub/core/meta/dataset_meta.py
@@ -1,7 +1,5 @@
-from typing import Dict, List
-from hub.core.storage.provider import StorageProvider
+from typing import Any, Dict
 from hub.core.meta.meta import Meta
-from hub.util.keys import get_dataset_meta_key


class DatasetMeta(Meta):
@@ -10,7 +8,12 @@ def __init__(self):

super().__init__()

-    def as_dict(self) -> dict:
-        d = super().as_dict()
+    @property
+    def nbytes(self):
+        # TODO: can optimize this
+        return len(self.tobytes())
+
+    def __getstate__(self) -> Dict[str, Any]:
+        d = super().__getstate__()
d["tensors"] = self.tensors
return d
5 changes: 5 additions & 0 deletions hub/core/meta/encode/chunk_id.py
@@ -72,6 +72,11 @@ def __init__(self):

self._encoded_ids = None

@property
def nbytes(self):
# TODO: optimize this
return len(self.tobytes())

def tobytes(self) -> memoryview:
if self._encoded_ids is None:
return serialize_chunkids(
