
dataset/tensor info alongside meta #1066

Merged: 38 commits, merged Jul 21, 2021.

Commits (the diff below shows changes from 29 of the 38 commits)
be6ce00
add test for subsequent meta updates (failing)
verbose-void Jul 14, 2021
ea459f3
Merge branch 'tests/update-fixtures' of github.com:activeloopai/Hub i…
verbose-void Jul 14, 2021
96fb8a5
Merge branch 'tests/update-fixtures' of github.com:activeloopai/Hub i…
verbose-void Jul 15, 2021
d11920f
synchronize everything with cache
verbose-void Jul 15, 2021
bfc97f3
add docstring
verbose-void Jul 15, 2021
0cb390a
Merge branch 'tests/update-fixtures' of github.com:activeloopai/Hub i…
verbose-void Jul 15, 2021
40b902d
check that tensor meta length & chunk id encoder num_samples are the …
verbose-void Jul 15, 2021
161d9e2
write tests for user-defined `info`
verbose-void Jul 15, 2021
60f9171
add info class + update docstrings
verbose-void Jul 15, 2021
43bc04f
implement `TensorMeta.as_dict`
verbose-void Jul 16, 2021
bfb8876
replace `len(Cachable)` with `Cachable.num_bytes`
verbose-void Jul 16, 2021
338b949
added callback cachable class hierarchy & initialization
verbose-void Jul 16, 2021
5e51ed2
write failure test cases & start functionality
verbose-void Jul 16, 2021
52a65f1
fix serialization and allow dotdict-like access
verbose-void Jul 16, 2021
ac33b0d
add tests for non-immutable values
verbose-void Jul 16, 2021
e491547
validate is jsonable
verbose-void Jul 16, 2021
0542988
remove immutability parsing + allow json
verbose-void Jul 16, 2021
39f64d7
move CachableCallback -> hub.api
verbose-void Jul 16, 2021
55f99f5
remove benchmark file
verbose-void Jul 16, 2021
a1da0a9
clear paths before running tests always
verbose-void Jul 16, 2021
46faf5a
setstate/getstate for metas and pass tests
verbose-void Jul 16, 2021
e792f99
mypy
verbose-void Jul 16, 2021
d7745e7
readonly tests & others
verbose-void Jul 16, 2021
723a61a
`update` docstring/example
verbose-void Jul 19, 2021
173da03
test with
verbose-void Jul 19, 2021
fbed709
remove flush statement
verbose-void Jul 19, 2021
2a14a88
Merge branch 'main' of github.com:activeloopai/Hub into task/udpate-m…
verbose-void Jul 19, 2021
930a0ed
remove TODOs and add tensor info readonly test
verbose-void Jul 19, 2021
a212d37
CachableCallback decorator
verbose-void Jul 19, 2021
d1f6faf
formatting
verbose-void Jul 19, 2021
5959dfb
validate json docstring
verbose-void Jul 19, 2021
3b1d992
Merge branch 'main' into task/udpate-meta-api
farizrahman4u Jul 20, 2021
4cfc707
read_only mode xfail pass
verbose-void Jul 20, 2021
59a99dd
move meta / info loading into set derived attributes
verbose-void Jul 20, 2021
cd06615
remove extra feature report
verbose-void Jul 20, 2021
ba8095e
update docstring
verbose-void Jul 20, 2021
f447bda
fix darglint
verbose-void Jul 20, 2021
19160c5
minor comments
verbose-void Jul 21, 2021
16 changes: 13 additions & 3 deletions hub/api/dataset.py
@@ -1,3 +1,4 @@
from hub.api.info import load_info
from hub.core.storage.provider import StorageProvider
from hub.core.tensor import create_tensor
from typing import Callable, Dict, Optional, Union, Tuple, List, Sequence
@@ -11,7 +12,12 @@

from hub.core.index import Index
from hub.integrations import dataset_to_tensorflow
from hub.util.keys import dataset_exists, get_dataset_meta_key, tensor_exists
from hub.util.keys import (
dataset_exists,
get_dataset_info_key,
get_dataset_meta_key,
tensor_exists,
)
from hub.util.bugout_reporter import hub_reporter
from hub.util.cache_chain import generate_chain
from hub.util.exceptions import (
@@ -101,7 +107,8 @@ def __init__(
self.client = HubBackendClient(token=token)

self.public = public
self._load_meta()
self._load_meta() # TODO: use the same load scheme as info
self.info = load_info(get_dataset_info_key(), self.storage) # type: ignore
self.index.validate(self.num_samples)

hub_reporter.feature_report(
@@ -238,7 +245,10 @@ def _load_meta(self):
self.flush()
if self.path.startswith("hub://"):
self.client.create_dataset_entry(
self.org_id, self.ds_name, self.meta.as_dict(), public=self.public
self.org_id,
self.ds_name,
self.meta.__getstate__(),
public=self.public,
)

@property
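The net effect of these changes is a user-facing `info` attribute on every dataset. A minimal usage sketch, assuming the `hub.dataset` entry point and a hypothetical local path:

```python
import hub

# hypothetical local path; assumes the `hub.dataset` entry point
ds = hub.dataset("./my_dataset")
ds.info.update(description="my training set", source="internal")

# `update` writes the Info object back through the storage cache,
# so the values survive a reload
ds = hub.dataset("./my_dataset")
print(ds.info.description)  # "my training set"
```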
102 changes: 102 additions & 0 deletions hub/api/info.py
@@ -0,0 +1,102 @@
from hub.core.storage.lru_cache import LRUCache
from hub.util.json import validate_is_jsonable
from typing import Any, Dict
from hub.core.storage.cachable import CachableCallback, use_callback


class Info(CachableCallback):
def __init__(self):
"""Contains **optional** key/values that datasets/tensors use for human-readability.
See the `Meta` class for required key/values for datasets/tensors.

Note:
Since `Info` is rarely written to and mostly by the user, every modifier will call `cache[key] = self`.
Must call `initialize_callback_location` before using any methods.
"""

self._info = {}
super().__init__()

@property
def nbytes(self):
# TODO: optimize this
return len(self.tobytes())

@use_callback(check_only=True)
def __len__(self):
return len(self._info)

@use_callback(check_only=True)
def __getstate__(self) -> Dict[str, Any]:
return self._info

def __setstate__(self, state: Dict[str, Any]):
self._info = state

@use_callback()
def update(self, *args, **kwargs):
"""Store optional dataset/tensor information. Will be accessible after loading your data from a new script!
Inputs must be supported by JSON.


Note:
This method has the same functionality as `dict().update(...)` Reference: https://www.geeksforgeeks.org/python-dictionary-update-method/.
A full list of supported value types can be found here: https://docs.python.org/3/library/json.html#json.JSONEncoder.

Examples:
Normal update usage:
>>> ds.info
{}
>>> ds.info.update(key=0)
>>> ds.info
{"key": 0}
>>> ds.info.update({"key1": 5, "key2": [1, 2, "test"]})
>>> ds.info
{"key": 0, "key1": 5, "key2": [1, 2, "test"]}

Alternate update usage:
>>> ds.info
{}
>>> ds.info.update(list=[1, 2, "apple"])
>>> ds.info
{"list": [1, 2, "apple"]}
>>> l = ds.info.list
>>> l
[1, 2, "apple"]
>>> l.append(5)
>>> l
[1, 2, "apple", 5]
>>> ds.info.update() # required for the in-place change to persist!

"""

self._cache.check_readonly()
self._info.update(*args, **kwargs)

def __getattribute__(self, name: str) -> Any:
"""Allows access to info values using the `.` syntax. Example: `info.description`."""

if name == "_info":
return super().__getattribute__(name)
if name in self._info:
return self.__getitem__(name)
return super().__getattribute__(name)

def __getitem__(self, key: str):
return self._info[key]

def __str__(self):
return self._info.__str__()

def __repr__(self):
return self._info.__repr__()


def load_info(info_key: str, cache: LRUCache):
if info_key in cache:
info = cache.get_cachable(info_key, Info)
else:
info = Info()
info.initialize_callback_location(info_key, cache)

return info
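`Info` depends on `CachableCallback` and `use_callback`, which are imported at the top of the file but not part of this diff. The following is an inferred sketch of the contract those names imply, pieced together from how `Info` uses them; aside from `_cache` (which appears in `update` above), the attribute names and behavior are guesses, not the actual implementation:

```python
from functools import wraps

def use_callback(check_only: bool = False):
    """Decorator for CachableCallback methods: require that
    `initialize_callback_location` was called, and (unless
    `check_only=True`) write `self` back to the cache afterwards."""
    def decorator(method):
        @wraps(method)
        def wrapper(self, *args, **kwargs):
            if self._key is None:
                raise Exception("Call `initialize_callback_location` first.")
            out = method(self, *args, **kwargs)
            if not check_only:
                self._cache[self._key] = self  # persist the mutation
            return out
        return wrapper
    return decorator

class CachableCallback:
    def __init__(self):
        self._key = None
        self._cache = None

    def initialize_callback_location(self, key, cache):
        """Bind this object to a storage key inside an LRUCache."""
        self._key = key
        self._cache = cache
```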
8 changes: 7 additions & 1 deletion hub/api/tensor.py
@@ -1,4 +1,8 @@
from hub.util.keys import get_chunk_id_encoder_key, get_tensor_meta_key, tensor_exists
from hub.api.info import load_info
from hub.util.keys import (
get_tensor_info_key,
tensor_exists,
)
from hub.core.sample import Sample # type: ignore
from typing import List, Sequence, Union, Optional, Tuple, Dict
from hub.util.shape import ShapeInterval
@@ -47,6 +51,8 @@ def __init__(
self.chunk_engine = ChunkEngine(self.key, self.storage)
self.index.validate(self.num_samples)

self.info = load_info(get_tensor_info_key(self.key), self.storage)

def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]):
"""Extends the end of the tensor by appending multiple elements from a sequence. Accepts a sequence, a single batched numpy array,
or a sequence of `hub.load` outputs, which can be used to load files. See examples down below.
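The same `info` attribute now hangs off every tensor. Continuing the dataset sketch above (method names follow the tests below):

```python
t = ds.create_tensor("images")
t.info.update(notes="raw uint8 frames", source="camera A")

print(t.info.notes)  # "raw uint8 frames"
```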
110 changes: 110 additions & 0 deletions hub/api/tests/test_info.py
@@ -0,0 +1,110 @@
def test_dataset(local_ds_generator):
ds = local_ds_generator()

assert len(ds.info) == 0

ds.info.update(my_key=0)
ds.info.update(my_key=1)

ds.info.update(another_key="hi")
ds.info.update({"another_key": "hello"})

ds.info.update({"something": "aaaaa"}, something="bbbb")

ds.info.update(test=[1, 2, "5"])

test_list = ds.info.test
with ds:
ds.info.update({"test2": (1, 5, (1, "2"), [5, 6, (7, 8)])})
ds.info.update(xyz="abc")
test_list.extend(["user made change without `update`"])

ds.info.update({"1_-+": 5})

ds = local_ds_generator()

assert len(ds.info) == 7

assert ds.info.another_key == "hello"
assert ds.info.something == "bbbb"

assert ds.info.test == [1, 2, "5", "user made change without `update`"]
assert ds.info.test2 == [1, 5, [1, "2"], [5, 6, [7, 8]]]

assert ds.info.xyz == "abc"
assert ds.info["1_-+"] == 5 # key can't be accessed with `.` syntax

ds.info.update(test=[99])

ds = local_ds_generator()

assert len(ds.info) == 7
assert ds.info.test == [99]


def test_tensor(local_ds_generator):
ds = local_ds_generator()

t1 = ds.create_tensor("tensor1")
t2 = ds.create_tensor("tensor2")

assert len(t1.info) == 0
assert len(t2.info) == 0

t1.info.update(key=0)
t2.info.update(key=1, key1=0)

ds = local_ds_generator()

t1 = ds.tensor1
t2 = ds.tensor2

assert len(t1.info) == 1
assert len(t2.info) == 2

assert t1.info.key == 0
assert t2.info.key == 1
assert t2.info.key1 == 0

with ds:
t1.info.update(key=99)

ds = local_ds_generator()

t1 = ds.tensor1
t2 = ds.tensor2

assert len(t1.info) == 1
assert len(t2.info) == 2

assert t1.info.key == 99


def test_update_reference_manually(local_ds_generator):
"""Right now synchronization can only happen when you call `info.update`."""

ds = local_ds_generator()

ds.info.update(key=[1, 2, 3])

ds = local_ds_generator()

l = ds.info.key
assert l == [1, 2, 3]

# un-registered update
l.append(5)
assert ds.info.key == [1, 2, 3, 5]

ds = local_ds_generator()

l = ds.info.key
assert l == [1, 2, 3]

# registered update
l.append(99)
ds.info.update()

ds = local_ds_generator()

assert l == [1, 2, 3, 99]
6 changes: 6 additions & 0 deletions hub/api/tests/test_readonly.py
@@ -29,6 +29,12 @@ def test_readonly(local_ds_generator):
ds.read_only = True
_assert_readonly_ops(ds, 1, (100, 100))

with pytest.raises(ReadOnlyModeError):
ds.info.update(key=0)

with pytest.raises(ReadOnlyModeError):
ds.tensor.info.update(key=0)


@pytest.mark.xfail(raises=CouldNotCreateNewDatasetException, strict=True)
def test_readonly_doesnt_exist(local_path):
7 changes: 7 additions & 0 deletions hub/constants.py
@@ -24,6 +24,7 @@

SUPPORTED_MODES = ["r", "a"]

# min chunk size is always half of `DEFAULT_MAX_CHUNK_SIZE`
DEFAULT_MAX_CHUNK_SIZE = 32 * MB

MIN_FIRST_CACHE_SIZE = 32 * MB
@@ -34,8 +35,14 @@
DEFAULT_LOCAL_CACHE_SIZE = 0


# meta is 100% required, hub-defined metadata
DATASET_META_FILENAME = "dataset_meta.json"
TENSOR_META_FILENAME = "tensor_meta.json"

# info is 100% optional user-defined information
DATASET_INFO_FILENAME = "dataset_info.json"
TENSOR_INFO_FILENAME = "tensor_info.json"
verbose-void (PR author) commented:
2 additional small files -- this means that in total we will have the following small files:

  • dataset meta / dataset info (2)
  • tensor meta / tensor info (2 * num_tensors)
  • chunk_ids (1 * num_tensors)

For a dataset with 2 tensors there will be 8 small files; for a dataset with 4 tensors, 14. The more tensors we add, the more small files the dataset will contain. This isn't a problem now, but we should keep it in mind as we proceed (see the sanity check below).
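In general that works out to 2 + 3 * num_tensors small files; a quick sanity check against the counts above:

```python
def small_file_count(num_tensors: int) -> int:
    # dataset meta + dataset info, plus tensor meta, tensor info,
    # and a chunk_ids encoder per tensor
    return 2 + 3 * num_tensors

assert small_file_count(2) == 8
assert small_file_count(4) == 14
```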


META_ENCODING = "utf8"

CHUNKS_FOLDER = "chunks"
4 changes: 3 additions & 1 deletion hub/core/chunk.py
@@ -108,8 +108,10 @@ def update_headers(self, incoming_num_bytes: int, sample_shape: Tuple[int]):
self.shapes_encoder.add_shape(sample_shape, 1)
self.byte_positions_encoder.add_byte_position(num_bytes_per_sample, 1)

def __len__(self):
@property
def nbytes(self):
verbose-void (PR author) commented:
Replaced `len` with `nbytes` for all cachables so that a cachable class can define `len()` meaningfully without it returning the byte count.

"""Calculates the number of bytes `tobytes` will be without having to call `tobytes`. Used by `LRUCache` to determine if this chunk can be cached."""

return infer_chunk_num_bytes(
hub.__version__,
self.shapes_encoder.array,
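A toy illustration of why the split matters (not Hub code): `__len__` can now report container length while `nbytes` feeds the cache's byte accounting.

```python
import json

class Example:
    """Toy cachable: `len` counts entries, `nbytes` counts serialized bytes."""
    def __init__(self):
        self._info = {"description": "hello"}

    def __len__(self):
        return len(self._info)      # number of entries

    def tobytes(self) -> bytes:
        return json.dumps(self._info).encode("utf8")

    @property
    def nbytes(self) -> int:
        return len(self.tobytes())  # serialized size, used by the LRU cache

e = Example()
assert len(e) == 1
assert e.nbytes == len(b'{"description": "hello"}')
```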
2 changes: 1 addition & 1 deletion hub/core/chunk_engine.py
@@ -213,7 +213,7 @@ def _synchronize_cache(self):
# synchronize last chunk
last_chunk_key = self.last_chunk_key
last_chunk = self.last_chunk
self.cache.update_used_cache_for_path(last_chunk_key, len(last_chunk)) # type: ignore
self.cache.update_used_cache_for_path(last_chunk_key, last_chunk.nbytes) # type: ignore

# synchronize tensor meta
tensor_meta_key = get_tensor_meta_key(self.key)
13 changes: 8 additions & 5 deletions hub/core/meta/dataset_meta.py
@@ -1,7 +1,5 @@
from typing import Dict, List
from hub.core.storage.provider import StorageProvider
from typing import Any, Dict
from hub.core.meta.meta import Meta
from hub.util.keys import get_dataset_meta_key


class DatasetMeta(Meta):
@@ -10,7 +8,12 @@ def __init__(self):

super().__init__()

def as_dict(self) -> dict:
d = super().as_dict()
@property
def nbytes(self):
# TODO: can optimize this
return len(self.tobytes())

def __getstate__(self) -> Dict[str, Any]:
d = super().__getstate__()
d["tensors"] = self.tensors
return d
5 changes: 5 additions & 0 deletions hub/core/meta/encode/chunk_id.py
Expand Up @@ -72,6 +72,11 @@ def __init__(self):

self._encoded_ids = None

@property
def nbytes(self):
# TODO: optimize this
return len(self.tobytes())

def tobytes(self) -> memoryview:
if self._encoded_ids is None:
return serialize_chunkids(
7 changes: 6 additions & 1 deletion hub/core/meta/meta.py
@@ -1,10 +1,15 @@
from typing import Any, Dict
import hub
from hub.core.storage.cachable import Cachable


class Meta(Cachable):
def __init__(self):
"""Contains **required** key/values that datasets/tensors use to function.
See the `Info` class for optional key/values for datasets/tensors.
"""

self.version = hub.__version__

def as_dict(self) -> dict:
def __getstate__(self) -> Dict[str, Any]:
return {"version": self.version}