Merge pull request #1066 from activeloopai/task/udpate-meta-api
dataset/tensor `info` alongside `meta`
verbose-void committed Jul 21, 2021
2 parents 2047f2a + 19160c5 commit 35224df
Showing 18 changed files with 429 additions and 35 deletions.
28 changes: 19 additions & 9 deletions hub/api/dataset.py
@@ -1,3 +1,4 @@
from hub.api.info import load_info
from hub.core.storage.provider import StorageProvider
from hub.core.tensor import create_tensor
from typing import Any, Callable, Dict, Optional, Union, Tuple, List, Sequence
@@ -11,7 +12,12 @@

from hub.core.index import Index
from hub.integrations import dataset_to_tensorflow
-from hub.util.keys import dataset_exists, get_dataset_meta_key, tensor_exists
+from hub.util.keys import (
+    dataset_exists,
+    get_dataset_info_key,
+    get_dataset_meta_key,
+    tensor_exists,
+)
from hub.util.bugout_reporter import hub_reporter
from hub.util.cache_chain import generate_chain
from hub.util.exceptions import (
@@ -89,6 +95,7 @@ def __init__(
self.tensors: Dict[str, Tensor] = {}
self._token = token
self.public = public

self._set_derived_attributes()

def __enter__(self):
@@ -239,18 +246,19 @@ def _load_meta(self):
raise PathNotEmptyException

else:
-            self.meta = DatasetMeta()
-
-            try:
-                self.storage[meta_key] = self.meta
-            except ReadOnlyModeError:
-                # if this is thrown, that means the dataset doesn't exist and the user has no write access.
+            if self.read_only:
+                # cannot create a new dataset when in read_only mode.
                 raise CouldNotCreateNewDatasetException(self.path)

+            self.meta = DatasetMeta()
+            self.storage[meta_key] = self.meta
             self.flush()
if self.path.startswith("hub://"):
self.client.create_dataset_entry(
-                    self.org_id, self.ds_name, self.meta.as_dict(), public=self.public
+                    self.org_id,
+                    self.ds_name,
+                    self.meta.__getstate__(),
+                    public=self.public,
)

@property
@@ -320,13 +328,15 @@ def _get_total_meta(self):

def _set_derived_attributes(self):
"""Sets derived attributes during init and unpickling."""

self.storage.autoflush = True
if self.path.startswith("hub://"):
split_path = self.path.split("/")
self.org_id, self.ds_name = split_path[2], split_path[3]
self.client = HubBackendClient(token=self._token)

-        self._load_meta()
+        self._load_meta()  # TODO: use the same scheme as `load_info`
+        self.info = load_info(get_dataset_info_key(), self.storage)  # type: ignore
self.index.validate(self.num_samples)

hub_reporter.feature_report(
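Taken together, these hunks give every `Dataset` a persisted, user-editable `info` mapping alongside its hub-managed `meta`. A minimal usage sketch — the `hub.dataset` entry point and the local path are assumptions, not part of this diff:

```python
import hub  # assumption: hub 2.x top-level API

ds = hub.dataset("./lenders")  # hypothetical local dataset path

# `load_info` lazily creates the Info object if no info file exists yet.
ds.info.update(description="loan applications", year=2021)

# Reloading in a new session sees the same values, because every
# `update` writes the Info object back through the storage cache.
ds = hub.dataset("./lenders")
assert ds.info.description == "loan applications"
```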
101 changes: 101 additions & 0 deletions hub/api/info.py
@@ -0,0 +1,101 @@
from hub.core.storage.lru_cache import LRUCache
from typing import Any, Dict
from hub.core.storage.cachable import CachableCallback, use_callback


class Info(CachableCallback):
def __init__(self):
"""Contains **optional** key/values that datasets/tensors use for human-readability.
See the `Meta` class for required key/values for datasets/tensors.
Note:
Since `Info` is rarely written to and mostly by the user, every modifier will call `cache[key] = self`.
Must call `initialize_callback_location` before using any methods.
"""

self._info = {}
super().__init__()

@property
def nbytes(self):
# TODO: optimize this
return len(self.tobytes())

@use_callback(check_only=True)
def __len__(self):
return len(self._info)

@use_callback(check_only=True)
def __getstate__(self) -> Dict[str, Any]:
return self._info

def __setstate__(self, state: Dict[str, Any]):
self._info = state

@use_callback()
def update(self, *args, **kwargs):
"""Store optional dataset/tensor information. Will be accessible after loading your data from a new script!
Inputs must be supported by JSON.
Note:
This method has the same functionality as `dict().update(...)`. Reference: https://www.geeksforgeeks.org/python-dictionary-update-method/.
A full list of supported value types can be found here: https://docs.python.org/3/library/json.html#json.JSONEncoder.
Examples:
Normal update usage:
>>> ds.info
{}
>>> ds.info.update(key=0)
>>> ds.info
{"key": 0}
>>> ds.info.update({"key1": 5, "key2": [1, 2, "test"]})
>>> ds.info
{"key": 0, "key1": 5, "key2": [1, 2, "test"]}
Alternate update usage:
>>> ds.info
{}
>>> ds.info.update(list=[1, 2, "apple"])
>>> ds.info
{"list": [1, 2, "apple"]}
>>> l = ds.info.list
>>> l
[1, 2, "apple"]
>>> l.append(5)
>>> l
[1, 2, "apple", 5]
>>> ds.info.update()  # required for the in-place change to persist!
"""

self._cache.check_readonly()
self._info.update(*args, **kwargs)

def __getattribute__(self, name: str) -> Any:
"""Allows access to info values using the `.` syntax. Example: `info.description`."""

if name == "_info":
return super().__getattribute__(name)
if name in self._info:
return self.__getitem__(name)
return super().__getattribute__(name)

def __getitem__(self, key: str):
return self._info[key]

def __str__(self):
return self._info.__str__()

def __repr__(self):
return self._info.__repr__()


def load_info(info_key: str, cache: LRUCache):
if info_key in cache:
info = cache.get_cachable(info_key, Info)
else:
info = Info()
info.initialize_callback_location(info_key, cache)

return info
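`CachableCallback` and `use_callback` come from `hub.core.storage.cachable`, one of the changed files not shown here. A rough sketch of the contract `Info` relies on, inferred from the calls above — treat the names and bodies as assumptions, not the actual implementation:

```python
# Hypothetical sketch of the cachable-callback contract; the real code
# lives in hub/core/storage/cachable.py (not shown in this diff).
from functools import wraps


def use_callback(check_only: bool = False):
    """Require that `initialize_callback_location` was called; unless
    `check_only`, write `self` back into the cache after the method runs."""

    def decorator(method):
        @wraps(method)
        def wrapper(self, *args, **kwargs):
            if self._key is None or self._cache is None:
                raise Exception("Call `initialize_callback_location` first.")
            out = method(self, *args, **kwargs)
            if not check_only:
                self._cache[self._key] = self  # register the mutation
            return out

        return wrapper

    return decorator


class CachableCallback:
    def __init__(self):
        self._key = None
        self._cache = None

    def initialize_callback_location(self, key, cache):
        """Bind this object to the cache slot it writes itself into."""
        self._key = key
        self._cache = cache
```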
8 changes: 7 additions & 1 deletion hub/api/tensor.py
@@ -1,4 +1,8 @@
-from hub.util.keys import get_chunk_id_encoder_key, get_tensor_meta_key, tensor_exists
+from hub.api.info import load_info
+from hub.util.keys import (
+    get_tensor_info_key,
+    tensor_exists,
+)
from hub.core.sample import Sample # type: ignore
from typing import List, Sequence, Union, Optional, Tuple, Dict
from hub.util.shape import ShapeInterval
@@ -47,6 +51,8 @@ def __init__(
self.chunk_engine = ChunkEngine(self.key, self.storage)
self.index.validate(self.num_samples)

self.info = load_info(get_tensor_info_key(self.key), self.storage)

def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]):
"""Extends the end of the tensor by appending multiple elements from a sequence. Accepts a sequence, a single batched numpy array,
or a sequence of `hub.read` outputs, which can be used to load files. See examples down below.
110 changes: 110 additions & 0 deletions hub/api/tests/test_info.py
@@ -0,0 +1,110 @@
def test_dataset(local_ds_generator):
ds = local_ds_generator()

assert len(ds.info) == 0

ds.info.update(my_key=0)
ds.info.update(my_key=1)

ds.info.update(another_key="hi")
ds.info.update({"another_key": "hello"})

ds.info.update({"something": "aaaaa"}, something="bbbb")

ds.info.update(test=[1, 2, "5"])

test_list = ds.info.test
with ds:
ds.info.update({"test2": (1, 5, (1, "2"), [5, 6, (7, 8)])})
ds.info.update(xyz="abc")
test_list.extend(["user made change without `update`"])

ds.info.update({"1_-+": 5})

ds = local_ds_generator()

assert len(ds.info) == 7

assert ds.info.another_key == "hello"
assert ds.info.something == "bbbb"

assert ds.info.test == [1, 2, "5", "user made change without `update`"]
assert ds.info.test2 == [1, 5, [1, "2"], [5, 6, [7, 8]]]

assert ds.info.xyz == "abc"
assert ds.info["1_-+"] == 5 # key can't be accessed with `.` syntax

ds.info.update(test=[99])

ds = local_ds_generator()

assert len(ds.info) == 7
assert ds.info.test == [99]


def test_tensor(local_ds_generator):
ds = local_ds_generator()

t1 = ds.create_tensor("tensor1")
t2 = ds.create_tensor("tensor2")

assert len(t1.info) == 0
assert len(t2.info) == 0

t1.info.update(key=0)
t2.info.update(key=1, key1=0)

ds = local_ds_generator()

t1 = ds.tensor1
t2 = ds.tensor2

assert len(t1.info) == 1
assert len(t2.info) == 2

assert t1.info.key == 0
assert t2.info.key == 1
assert t2.info.key1 == 0

with ds:
t1.info.update(key=99)

ds = local_ds_generator()

t1 = ds.tensor1
t2 = ds.tensor2

assert len(t1.info) == 1
assert len(t2.info) == 2

assert t1.info.key == 99


def test_update_reference_manually(local_ds_generator):
"""Right now synchronization can only happen when you call `info.update`."""

ds = local_ds_generator()

ds.info.update(key=[1, 2, 3])

ds = local_ds_generator()

l = ds.info.key
assert l == [1, 2, 3]

# un-registered update
l.append(5)
assert ds.info.key == [1, 2, 3, 5]

ds = local_ds_generator()

l = ds.info.key
assert l == [1, 2, 3]

# registered update
l.append(99)
ds.info.update()

ds = local_ds_generator()

assert l == [1, 2, 3, 99]
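The last test pins down the synchronization rule: a value fetched from `info` is the live object, so in-place mutations are visible immediately in memory but are only persisted once a later `info.update()` registers them. The safe pattern, as a short sketch reusing the `ds` handle from the tests above (`tags` is a hypothetical key):

```python
tags = ds.info.tags      # the live list object stored under "tags"
tags.append("reviewed")  # in-memory only at this point
ds.info.update()         # no-arg update persists the in-place change
```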
6 changes: 6 additions & 0 deletions hub/api/tests/test_readonly.py
@@ -29,6 +29,12 @@ def test_readonly(local_ds_generator):
ds.read_only = True
_assert_readonly_ops(ds, 1, (100, 100))

with pytest.raises(ReadOnlyModeError):
ds.info.update(key=0)

with pytest.raises(ReadOnlyModeError):
ds.tensor.info.update(key=0)


@pytest.mark.xfail(raises=CouldNotCreateNewDatasetException, strict=True)
def test_readonly_doesnt_exist(local_path):
7 changes: 7 additions & 0 deletions hub/constants.py
@@ -24,6 +24,7 @@

SUPPORTED_MODES = ["r", "a"]

# min chunk size is always half of `DEFAULT_MAX_CHUNK_SIZE`
DEFAULT_MAX_CHUNK_SIZE = 32 * MB

MIN_FIRST_CACHE_SIZE = 32 * MB
@@ -34,8 +35,14 @@
DEFAULT_LOCAL_CACHE_SIZE = 0


# meta is hub-defined information, necessary for hub Datasets/Tensors to function
DATASET_META_FILENAME = "dataset_meta.json"
TENSOR_META_FILENAME = "tensor_meta.json"

# info is user-defined information, entirely optional. May be used by the visualizer
DATASET_INFO_FILENAME = "dataset_info.json"
TENSOR_INFO_FILENAME = "tensor_info.json"

META_ENCODING = "utf8"

CHUNKS_FOLDER = "chunks"
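The new `get_dataset_info_key` and `get_tensor_info_key` helpers in `hub/util/keys.py` are among the changed files not shown above. A plausible reconstruction from these constants and the call sites in `dataset.py` and `tensor.py` — the exact join logic is an assumption:

```python
# Hypothetical sketch of the hub/util/keys.py additions (file not shown here).
import posixpath

from hub.constants import DATASET_INFO_FILENAME, TENSOR_INFO_FILENAME


def get_dataset_info_key() -> str:
    # dataset-level info sits at the root of the dataset's storage
    return DATASET_INFO_FILENAME


def get_tensor_info_key(key: str) -> str:
    # tensor-level info sits under that tensor's key prefix
    return posixpath.join(key, TENSOR_INFO_FILENAME)
```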
4 changes: 3 additions & 1 deletion hub/core/chunk.py
@@ -108,8 +108,10 @@ def update_headers(self, incoming_num_bytes: int, sample_shape: Tuple[int]):
self.shapes_encoder.add_shape(sample_shape, 1)
self.byte_positions_encoder.add_byte_position(num_bytes_per_sample, 1)

-    def __len__(self):
+    @property
+    def nbytes(self):
"""Calculates the number of bytes `tobytes` will be without having to call `tobytes`. Used by `LRUCache` to determine if this chunk can be cached."""

return infer_chunk_num_bytes(
hub.__version__,
self.shapes_encoder.array,
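Replacing `__len__` with an explicit `nbytes` property makes byte-size queries unambiguous (for a chunk, `len` could just as plausibly mean a sample count) and gives the cache one uniform sizing hook across `Chunk`, `DatasetMeta`, `ChunkIdEncoder`, and `Info`. A toy illustration of that hook — not hub's `LRUCache`:

```python
# Toy size-aware cache relying only on the `nbytes` protocol (illustrative).
class TinySizedCache:
    def __init__(self, max_bytes: int):
        self.max_bytes = max_bytes
        self.used_bytes = 0
        self.objects = {}

    def put(self, key: str, obj) -> bool:
        """Cache `obj` if its reported size fits; return whether it was cached."""
        if self.used_bytes + obj.nbytes > self.max_bytes:
            return False  # caller must write through to the next storage layer
        self.objects[key] = obj
        self.used_bytes += obj.nbytes
        return True
```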
2 changes: 1 addition & 1 deletion hub/core/chunk_engine.py
@@ -213,7 +213,7 @@ def _synchronize_cache(self):
# synchronize last chunk
last_chunk_key = self.last_chunk_key
last_chunk = self.last_chunk
-        self.cache.update_used_cache_for_path(last_chunk_key, len(last_chunk))  # type: ignore
+        self.cache.update_used_cache_for_path(last_chunk_key, last_chunk.nbytes)  # type: ignore

# synchronize tensor meta
tensor_meta_key = get_tensor_meta_key(self.key)
13 changes: 8 additions & 5 deletions hub/core/meta/dataset_meta.py
@@ -1,7 +1,5 @@
-from typing import Dict, List
-from hub.core.storage.provider import StorageProvider
+from typing import Any, Dict
 from hub.core.meta.meta import Meta
-from hub.util.keys import get_dataset_meta_key


class DatasetMeta(Meta):
@@ -10,7 +8,12 @@ def __init__(self):

super().__init__()

-    def as_dict(self) -> dict:
-        d = super().as_dict()
+    @property
+    def nbytes(self):
+        # TODO: can optimize this
+        return len(self.tobytes())
+
+    def __getstate__(self) -> Dict[str, Any]:
+        d = super().__getstate__()
d["tensors"] = self.tensors
return d
5 changes: 5 additions & 0 deletions hub/core/meta/encode/chunk_id.py
@@ -72,6 +72,11 @@ def __init__(self):

self._encoded_ids = None

@property
def nbytes(self):
# TODO: optimize this
return len(self.tobytes())

def tobytes(self) -> memoryview:
if self._encoded_ids is None:
return serialize_chunkids(
