[AL-1619][AL-1586] Removes cachable, makes Info more dict like #1469

Merged

Merged 59 commits on Feb 15, 2022.

Changes shown are from 36 of the 59 commits.

Commits
8316498
rewrite info, introduce is_dirty
AbhinavTuli Jan 24, 2022
63da480
test fixes, cleanup
AbhinavTuli Jan 25, 2022
7a1c168
cleanup transform code
AbhinavTuli Jan 27, 2022
6eaf915
lint fix
AbhinavTuli Jan 27, 2022
c6ecc9b
fix transform
AbhinavTuli Jan 31, 2022
80aa8b9
adds flush dirty items
AbhinavTuli Jan 31, 2022
2a6ecfb
Merge remote-tracking branch 'origin' into remove/cachable
AbhinavTuli Jan 31, 2022
2ffa0d5
linting
AbhinavTuli Feb 1, 2022
f38d998
fix import
AbhinavTuli Feb 1, 2022
340136c
change how chunk engine handles dirty objects
AbhinavTuli Feb 1, 2022
b9d8a54
lint
AbhinavTuli Feb 1, 2022
5e2183f
Merge remote-tracking branch 'origin' into remove/cachable
AbhinavTuli Feb 1, 2022
919bc25
fix recursion error and imports
AbhinavTuli Feb 1, 2022
6ecd2c8
fix some tests and lint
AbhinavTuli Feb 1, 2022
bc13622
replace cachable with HubMemoryObject
AbhinavTuli Feb 1, 2022
b08ec0b
lint fix
AbhinavTuli Feb 2, 2022
ba48d30
fix tensor delete
AbhinavTuli Feb 2, 2022
326861a
fix vc bug
AbhinavTuli Feb 2, 2022
3cf44db
fix ds.append
AbhinavTuli Feb 2, 2022
c053b28
fix chunk compressed chunk
AbhinavTuli Feb 2, 2022
0095a0f
bug fix
AbhinavTuli Feb 2, 2022
efd3561
fix info
AbhinavTuli Feb 2, 2022
9788aca
reduce flushes
AbhinavTuli Feb 2, 2022
56ee27f
fix io.py
AbhinavTuli Feb 3, 2022
4c98176
fix cache pickling
AbhinavTuli Feb 3, 2022
bf1ab74
fix shape updates
AbhinavTuli Feb 3, 2022
bc75517
transform fix
AbhinavTuli Feb 3, 2022
1a39dec
change info exception
AbhinavTuli Feb 7, 2022
9da2d87
fix autocheckout
AbhinavTuli Feb 7, 2022
befccb0
update info test
AbhinavTuli Feb 7, 2022
9eeec9a
fix diff tests
AbhinavTuli Feb 8, 2022
d057ff4
Merge remote-tracking branch 'origin' into remove/cachable
AbhinavTuli Feb 8, 2022
ff72c3a
remove duplicate method from info
AbhinavTuli Feb 8, 2022
438bdf8
mypy fix
AbhinavTuli Feb 8, 2022
58af3d7
added more info codecov
AbhinavTuli Feb 8, 2022
4da514d
removed unnecessary argument
AbhinavTuli Feb 8, 2022
e8f34be
increase codecov
AbhinavTuli Feb 9, 2022
97bc46f
makes nbytes abstract method
AbhinavTuli Feb 9, 2022
dbe28fb
change decorator order
AbhinavTuli Feb 9, 2022
97ebcb7
use context in info
AbhinavTuli Feb 9, 2022
8fb8067
reflect info modifications in diff
AbhinavTuli Feb 9, 2022
2dbc9ab
add modify info calls
AbhinavTuli Feb 9, 2022
45199b8
remove info updated from dataset meta
AbhinavTuli Feb 9, 2022
1c3c558
more version control tweaks
AbhinavTuli Feb 9, 2022
eca4fd0
minor tweaks to dataset
AbhinavTuli Feb 9, 2022
81ca4b2
Merge remote-tracking branch 'origin' into remove/cachable
AbhinavTuli Feb 9, 2022
725c2a3
don't mark certain encoders as dirty on init
AbhinavTuli Feb 9, 2022
d39b4a9
remove unused method
AbhinavTuli Feb 9, 2022
10003cf
fix circular import
AbhinavTuli Feb 9, 2022
b4b1904
improve codecov
AbhinavTuli Feb 9, 2022
a06e4af
fix after merge
AbhinavTuli Feb 9, 2022
ceba68c
refactor to remove dataset from cache
AbhinavTuli Feb 9, 2022
e1b9857
undo temporary test changes
AbhinavTuli Feb 9, 2022
274a9d8
simplify get_chunk
AbhinavTuli Feb 9, 2022
ddb218f
change remove_hub_object implementation
AbhinavTuli Feb 14, 2022
7b4e83e
Merge remote-tracking branch 'origin' into remove/cachable
AbhinavTuli Feb 14, 2022
72b4a7f
remove unused code
AbhinavTuli Feb 14, 2022
81032ab
fix windows issue
AbhinavTuli Feb 14, 2022
e4f13c0
mypy fix
AbhinavTuli Feb 14, 2022
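
Taken together, the commits above replace the `Cachable` machinery with `HubMemoryObject` plus an `is_dirty` flag, and give `Info` a standard mutable-mapping surface. A usage sketch of the resulting dict-like API, based on the methods added in the diff below (the dataset path is hypothetical):

import hub

ds = hub.dataset("./my_dataset")  # hypothetical local dataset

ds.info["owner"] = "data-team"          # __setitem__
ds.info.update(source="s3", version=2)  # dict-style update
assert "owner" in ds.info               # __contains__
for key, value in ds.info.items():      # keys()/values()/items()
    print(key, value)
ds.info.pop("owner")                    # replaces the removed delete("owner")
ds.info.clear()                         # replaces the removed delete()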
184 changes: 91 additions & 93 deletions hub/api/info.py
@@ -1,78 +1,40 @@
-from hub.util.version_control import auto_checkout
-from hub.core.storage.lru_cache import LRUCache
-from typing import Any, Dict, Optional, Union, Sequence
-from hub.core.storage.cachable import CachableCallback, use_callback
+from hub.util.exceptions import InfoError
+from hub.core.storage.hub_memory_object import HubMemoryObject
+from typing import Any, Dict, Optional


-class Info(CachableCallback):
+class Info(HubMemoryObject):
     def __init__(self):
-        """Contains **optional** key/values that datasets/tensors use for human-readability.
-        See the `Meta` class for required key/values for datasets/tensors.
-
-        Note:
-            Since `Info` is rarely written to and mostly by the user, every modifier will call `cache[key] = self`.
-            Must call `initialize_callback_location` before using any methods.
-        """
         self._info = {}
         self._dataset = None
         super().__init__()

+    def prepare_for_write(self):
+        if self._dataset is not None:
+            storage = self._dataset.storage
+            storage.check_readonly()
+            if not self._dataset.version_state["commit_node"].is_head_node:
+                raise InfoError("Cannot modify info from a non-head commit.")
+        self.is_dirty = True
+
+    def end_write(self):
+        if self._dataset is not None:
+            storage = self._dataset.storage
+            storage.maybe_flush()
+
     @property
     def nbytes(self):
         # TODO: optimize this
         return len(self.tobytes())

-    @use_callback(check_only=True)
     def __len__(self):
         return len(self._info)

-    @use_callback(check_only=True)
     def __getstate__(self) -> Dict[str, Any]:
         return self._info

     def __setstate__(self, state: Dict[str, Any]):
         self._info = state

-    @use_callback()
-    def update(self, *args, **kwargs):
-        """Store optional dataset/tensor information. Will be accessible after loading your data from a new script!
-        Inputs must be supported by JSON.
-
-
-        Note:
-            This method has the same functionality as `dict().update(...)` Reference: https://www.geeksforgeeks.org/python-dictionary-update-method/.
-            A full list of supported value types can be found here: https://docs.python.org/3/library/json.html#json.JSONEncoder.
-
-        Examples:
-            Normal update usage:
-                >>> ds.info
-                {}
-                >>> ds.info.update(key=0)
-                >>> ds.info
-                {"key": 0}
-                >>> ds.info.update({"key1": 5, "key2": [1, 2, "test"]})
-                >>> ds.info
-                {"key": 0, "key1": 5, "key2": [1, 2, "test"]}
-
-            Alternate update usage:
-                >>> ds.info
-                {}
-                >>> ds.info.update(list=[1, 2, "apple"])
-                >>> ds.info
-                {"list": [1, 2, "apple"]}
-                >>> l = ds.info.list
-                >>> l
-                [1, 2, "apple"]
-                >>> l.append(5)
-                >>> l
-                [1, 2, "apple", 5]
-                >>> ds.info.update()  # required to be persistent!
-
-        """
-        self._cache.check_readonly()
-        if self._dataset is not None:
-            auto_checkout(self._dataset)
-        self._info.update(*args, **kwargs)
-
     def __getattribute__(self, name: str) -> Any:
         """Allows access to info values using the `.` syntax. Example: `info.description`."""

@@ -82,48 +44,33 @@ def __getattribute__(self, name: str) -> Any:
             return self.__getitem__(name)
         return super().__getattribute__(name)

+    # implement all the methods of dictionary
     def __getitem__(self, key: str):
         return self._info[key]

-    def get(self, key: str, default: Optional[Any] = None):
-        return self._info.get(key, default)
-
     def __str__(self):
         return self._info.__str__()

     def __repr__(self):
         return self._info.__repr__()

-    @use_callback()
-    def delete(self, key: Optional[Union[Sequence[str], str]] = None):
-        """Deletes a key or list of keys. If no key(s) is passed, all keys are deleted."""
-        self._cache.check_readonly()
-        if self._dataset is not None:
-            auto_checkout(self._dataset)
-        if key is None:
-            self._info.clear()
-        elif isinstance(key, str):
-            del self._info[key]
-        elif isinstance(key, Sequence):
-            for k in key:
-                del self._info[k]
-        else:
-            raise KeyError(key)
-
-    @use_callback()
-    def __setitem__(self, key: str, value):
-        self._cache.check_readonly()
-        if self._dataset is not None:
-            auto_checkout(self._dataset)
+    def __setitem__(self, key, value):
+        self.prepare_for_write()
         self._info[key] = value
+        self.end_write()

-    def __setattr__(self, key: str, value):
-        if key in {"_key", "_cache", "_info", "_dataset"}:
-            object.__setattr__(self, key, value)
-        else:
-            self[key] = value
+    def __delitem__(self, key):
+        self.prepare_for_write()
+        del self._info[key]
+        self.end_write()

-    def __getattr__(self, key: str):
+    def __contains__(self, key):
+        return key in self._info
+
+    def __iter__(self):
+        return iter(self._info)
+
+    def __getattr__(self, key):
         try:
             return object.__getattribute__(self, key)
         except AttributeError:
@@ -132,6 +79,60 @@ def __getattr__(self, key: str):
             return self._info
         return self[key]

+    def __setattr__(self, key: str, value):
+        if key in {"_info", "_dataset", "is_dirty"}:
+            object.__setattr__(self, key, value)
+        else:
+            self.prepare_for_write()
+            self[key] = value
+            self.end_write()
+
+    def get(self, key, default=None):
+        return self._info.get(key, default)
+
+    def setdefault(self, key, default=None):
+        self.prepare_for_write()
+        ret = self._info.setdefault(key, default)
+        self.end_write()
+        return ret
+
+    def clear(self):
+        self.prepare_for_write()
+        self._info.clear()
+        self.end_write()
+
+    def pop(self, key, default=None):
+        self.prepare_for_write()
+        popped = self._info.pop(key, default)
+        self.end_write()
+        return popped
+
+    def popitem(self):
+        self.prepare_for_write()
+        popped = self._info.popitem()
+        self.end_write()
+        return popped
+
+    def update(self, *args, **kwargs):
+        self.prepare_for_write()
+        self._info.update(*args, **kwargs)
+        self.end_write()
+
+    def keys(self):
+        return self._info.keys()
+
+    def values(self):
+        return self._info.values()
+
+    def items(self):
+        return self._info.items()
+
+    def replace_with(self, d):
+        self.prepare_for_write()
+        self._info.clear()
+        self._info.update(d)
+        self.end_write()
+
     # the below methods are used by cloudpickle dumps
     def __origin__(self):
         return None
@@ -155,11 +156,8 @@ def __args__(self):
         return None


-def load_info(info_key: str, cache: LRUCache, dataset):
-    if info_key in cache:
-        info = cache.get_cachable(info_key, Info, callback_arg=dataset)
-    else:
-        info = Info()
-        info.initialize_callback_location(info_key, cache, dataset)
-
+def load_info(key, dataset):
+    storage = dataset.storage
+    info = storage.get_hub_object(key, Info) if key in storage else Info()
+    info._dataset = dataset
     return info
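
The pattern that replaces the `use_callback` decorator is visible throughout this diff: every mutator brackets its change between `prepare_for_write` (read-only and head-commit checks, then mark the object dirty) and `end_write` (give the underlying storage a chance to flush). A condensed, self-contained model of that control flow, with simplified stand-ins for hub's storage and version state (a sketch, not the actual hub classes):

class WriteGuardedMapping:
    """Simplified model of the Info write path in this PR."""

    def __init__(self):
        self._info = {}
        self.is_dirty = False
        self.read_only = False       # stands in for storage.check_readonly()
        self.on_head_commit = True   # stands in for commit_node.is_head_node

    def prepare_for_write(self):
        # Fail fast before mutating anything.
        if self.read_only:
            raise RuntimeError("storage is read-only")
        if not self.on_head_commit:
            raise RuntimeError("cannot modify info from a non-head commit")
        self.is_dirty = True

    def end_write(self):
        # In hub this is storage.maybe_flush(); here we just reset the flag.
        if self.is_dirty:
            self.is_dirty = False

    def __setitem__(self, key, value):
        self.prepare_for_write()
        self._info[key] = value
        self.end_write()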
3 changes: 3 additions & 0 deletions hub/api/tests/test_api.py
@@ -595,6 +595,9 @@ def test_like(local_path):

     src_ds.d.info.update(key=1)

+    assert src_ds.info.key == 0
+    assert src_ds.d.info.key == 1
+
     dest_ds = hub.like(dest_path, src_ds)

     assert tuple(dest_ds.tensors.keys()) == ("a", "b", "c", "d")
56 changes: 50 additions & 6 deletions hub/api/tests/test_info.py
@@ -20,6 +20,7 @@ def test_dataset(local_ds_generator):
     test_list.extend(["user made change without `update`"])

     ds.info.update({"1_-+": 5})
+    assert len(ds.info) == 7

     ds = local_ds_generator()

@@ -41,13 +42,14 @@ def test_dataset(local_ds_generator):
     assert len(ds.info) == 7
     assert ds.info.test == [99]

-    ds.info.delete("test")
+    ds.info.pop("test")

Review thread on the line above:
Contributor: Since this change doesn't look to be backward compatible, hub version needs to be appropriately bumped.
AbhinavTuli (author): This is a slight change in API for info, delete is barely used atm, we should be fine.

     assert len(ds.info) == 6

-    ds.info.delete(["1_-+", "xyz"])
+    ds.info.pop("1_-+")
+    ds.info.pop("xyz")
     assert len(ds.info) == 4

-    ds.info.delete()
+    ds.info.clear()
     assert len(ds.info) == 0
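
The exchange above is the crux of the API break: `Info.delete` is gone. For downstream code, the migration implied by these test changes is mechanical (a sketch; `ds` stands for any open dataset):

ds.info.pop("test")   # was: ds.info.delete("test")
ds.info.pop("1_-+")   # was: ds.info.delete(["1_-+", "xyz"]);
ds.info.pop("xyz")    #      pop takes a single key, so list deletes become one pop per key
ds.info.clear()       # was: ds.info.delete() with no arguments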


@@ -89,13 +91,14 @@ def test_tensor(local_ds_generator):

     assert t1.info.key == 99

-    t2.info.delete("key")
+    t2.info.pop("key")
     assert len(t2.info) == 3

-    t2.info.delete(["key2", "key3"])
+    t2.info.pop("key2")
+    t2.info.pop("key3")
     assert len(t2.info) == 1

-    t2.info.delete()
+    t2.info.clear()
     assert len(t2.info) == 0
@@ -147,3 +150,44 @@ def test_class_label(local_ds_generator):
         ds.labels.info.class_names == ds.labels.info["class_names"] == ["c", "b", "a"]
     )
     assert ds.labels2.info.class_names == ds.labels2.info["class_names"] == []
+
+
+def test_info_new_methods(local_ds_generator):
+    ds = local_ds_generator()
+
+    ds.info[0] = "hello"
+    ds.info[1] = "world"
+    assert len(ds.info) == 2
+    assert set(ds.info.keys()) == {0, 1}
+    assert 0 in ds.info
+    assert 1 in ds.info
+
+    assert ds.info[0] == "hello"
+    assert ds.info[1] == "world"
+
+    del ds.info[0]
+    assert len(ds.info) == 1
+    assert 1 in ds.info
+    assert ds.info[1] == "world"
+
+    for it in ds.info:
+        assert it == 1
+
+    ds.info.setdefault(0, "yo")
+    assert len(ds.info) == 2
+    assert 0 in ds.info
+    assert 1 in ds.info
+    assert ds.info[0] == "yo"
+    assert ds.info[1] == "world"
+
+    ds.info.popitem()

Review comment on the line above:
Contributor: Separate tests for some of these methods (like popitem) might be better -- LIFO popitem not obvious here :)

+    assert len(ds.info) == 1
+    assert 1 in ds.info
+    assert ds.info[1] == "world"
+
+    for k, v in ds.info.items():
+        assert k == 1
+        assert v == "world"
+
+    for v in ds.info.values():
+        assert v == "world"
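
A standalone test along the lines the reviewer suggests might look like this (a sketch, not part of the PR; it assumes the same `local_ds_generator` fixture and relies on `Info.popitem` delegating to `dict.popitem`, which is LIFO on Python 3.7+):

def test_info_popitem_lifo(local_ds_generator):
    ds = local_ds_generator()

    ds.info["first"] = 1
    ds.info["second"] = 2

    # popitem removes and returns the most recently inserted pair.
    key, value = ds.info.popitem()
    assert (key, value) == ("second", 2)
    assert "first" in ds.info
    assert "second" not in ds.info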
11 changes: 7 additions & 4 deletions hub/core/chunk/base_chunk.py
@@ -20,7 +20,7 @@
     serialize_text,
     serialize_tensor,
 )
-from hub.core.storage.cachable import Cachable
+from hub.core.storage.hub_memory_object import HubMemoryObject
 from hub.core.tiling.sample_tiles import SampleTiles
 from hub.util.exceptions import TensorInvalidSampleShapeError

@@ -40,7 +40,7 @@
 SerializedOutput = Tuple[bytes, Tuple]


-class BaseChunk(Cachable):
+class BaseChunk(HubMemoryObject):
     def __init__(
         self,
         min_chunk_size: int,
@@ -51,6 +51,7 @@ def __init__(
         encoded_byte_positions: Optional[np.ndarray] = None,
         data: Optional[memoryview] = None,
     ):
+        super().__init__()
         self._data_bytes: Union[bytearray, bytes, memoryview] = data or bytearray()
         self.version = hub.__version__
         self.min_chunk_size = min_chunk_size
@@ -140,6 +141,7 @@ def frombuffer(cls, buffer: bytes, chunk_args: list, copy=True):  # type: ignore
         version, shapes, byte_positions, data = deserialize_chunk(buffer, copy=copy)
         chunk = cls(*chunk_args, shapes, byte_positions, data=data)  # type: ignore
         chunk.version = version
+        chunk.is_dirty = False
         return chunk

     @abstractmethod
@@ -163,6 +165,7 @@ def _make_data_bytearray(self):
     def prepare_for_write(self):
         ffw_chunk(self)
         self._make_data_bytearray()
+        self.is_dirty = True

     def register_sample_to_headers(
         self, incoming_num_bytes: Optional[int], sample_shape: Tuple[int]
@@ -259,7 +262,7 @@ def register_in_meta_and_headers(self, sample_nbytes: Optional[int], shape):
         """Registers a new sample in meta and headers"""
         self.register_sample_to_headers(sample_nbytes, shape)
         if self._update_tensor_meta_length:
-            self.tensor_meta.length += 1
+            self.tensor_meta.update_length(1)
         self.tensor_meta.update_shape_interval(shape)

     def update_in_meta_and_headers(
@@ -308,7 +311,7 @@ def write_tile(self, sample: SampleTiles):
         if sample.is_first_write:
             self.tensor_meta.update_shape_interval(sample.sample_shape)
         if self._update_tensor_meta_length:
-            self.tensor_meta.length += 1
+            self.tensor_meta.update_length(1)

     def _pop_sample(self):
         self.prepare_for_write()
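
The `is_dirty` handling in `BaseChunk` follows the same protocol as `Info`: an object constructed fresh in memory starts dirty, one deserialized from storage is immediately marked clean (`frombuffer` sets `is_dirty = False`), and every write path flips the flag back on via `prepare_for_write`, so the cache only needs to persist objects that actually changed. A toy illustration of that lifecycle (hypothetical names, not the hub implementation):

class DirtyTracked:
    def __init__(self):
        # Freshly created in memory: must be written out at least once.
        self.is_dirty = True

    @classmethod
    def frombuffer(cls, buffer: bytes) -> "DirtyTracked":
        obj = cls()
        # Loaded from storage: identical to what is on disk, so clean.
        obj.is_dirty = False
        return obj

def flush_dirty(cache: dict) -> list:
    # A cache only persists the objects that changed since load.
    return [key for key, obj in cache.items() if obj.is_dirty]

cache = {"new": DirtyTracked(), "loaded": DirtyTracked.frombuffer(b"")}
assert flush_dirty(cache) == ["new"]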