Skip to content

Commit

Permalink
[AL-1619][AL-1586] Removes cachable, makes Info more dict like (#1469)
Browse files Browse the repository at this point in the history
* rewrite info, introduce is_dirty

* test fixes, cleanup

* cleanup transform code

* lint fix

* fix transform

* adds flush dirty items

* linting

* fix import

* change how chunk engine handles dirty objects

* lint

* fix recursion error and imports

* fix some tests and lint

* replace cachable with HubMemoryObject

* lint fix

* fix tensor delete

* fix vc bug

* fix ds.append

* fix chunk compressed chunk

* bug fix

* fix info

* reduce flushes

* fix io.py

* fix cache pickling

* fix shape updates

* transform fix

* change info exception

* fix autocheckout

* update info test

* fix diff tests

* remove duplicate method from info

* mypy fix

* added more info codecov

* removed unnecessary argument

* increase codecov

* makes nbytes abstract method

* change decorator order

* use context in info

* reflect info modifications in diff

* add modify info calls

* remove info updated from dataset meta

* more version control tweaks

* minor tweaks to dataset

* don't mark certain encoders as dirty on init

* remove unused method

* fix circular import

* improve codecov

* fix after merge

* refactor to remove dataset from cache

* undo temporary test changes

* simplify get_chunk

* change remove_hub_object implementation

* remove unused code

* fix windows issue

* mypy fix
  • Loading branch information
AbhinavTuli committed Feb 15, 2022
1 parent bf3eb46 commit aada0a9
Show file tree
Hide file tree
Showing 34 changed files with 777 additions and 652 deletions.
183 changes: 93 additions & 90 deletions hub/api/info.py
Original file line number Diff line number Diff line change
@@ -1,78 +1,47 @@
from hub.util.version_control import auto_checkout
from hub.core.storage.lru_cache import LRUCache
from typing import Any, Dict, Optional, Union, Sequence
from hub.core.storage.cachable import CachableCallback, use_callback
from hub.util.exceptions import InfoError
from hub.core.storage.hub_memory_object import HubMemoryObject
from typing import Any, Dict


class Info(CachableCallback):
class Info(HubMemoryObject):
def __init__(self):
"""Contains **optional** key/values that datasets/tensors use for human-readability.
See the `Meta` class for required key/values for datasets/tensors.
Note:
Since `Info` is rarely written to and mostly by the user, every modifier will call `cache[key] = self`.
Must call `initialize_callback_location` before using any methods.
"""
self._info = {}
super().__init__()
self._dataset = None

# the key to info in case of Tensor Info, None in case of Dataset Info
self._key = None
self.is_dirty = False

def __enter__(self):
ds = self._dataset
key = self._key
if ds is not None:
ds.storage.check_readonly()
if not ds.version_state["commit_node"].is_head_node:
raise InfoError("Cannot modify info from a non-head commit.")
self.is_dirty = True
if key:
ds[key].chunk_engine.commit_diff.modify_info()
return self

def __exit__(self, exc_type, exc_val, exc_tb):
if self._dataset is not None:
self._dataset.storage.maybe_flush()

@property
def nbytes(self):
# TODO: optimize this
return len(self.tobytes())

@use_callback(check_only=True)
def __len__(self):
return len(self._info)

@use_callback(check_only=True)
def __getstate__(self) -> Dict[str, Any]:
return self._info

def __setstate__(self, state: Dict[str, Any]):
self._info = state

@use_callback()
def update(self, *args, **kwargs):
"""Store optional dataset/tensor information. Will be accessible after loading your data from a new script!
Inputs must be supported by JSON.
Note:
This method has the same functionality as `dict().update(...)` Reference: https://www.geeksforgeeks.org/python-dictionary-update-method/.
A full list of supported value types can be found here: https://docs.python.org/3/library/json.html#json.JSONEncoder.
Examples:
Normal update usage:
>>> ds.info
{}
>>> ds.info.update(key=0)
>>> ds.info
{"key": 0}
>>> ds.info.update({"key1": 5, "key2": [1, 2, "test"]})
>>> ds.info
{"key": 0, "key1": 5, "key2": [1, 2, "test"]}
Alternate update usage:
>>> ds.info
{}
>>> ds.info.update(list=[1, 2, "apple"])
>>> ds.info
{"list": [1, 2, "apple"]}
>>> l = ds.info.list
>>> l
[1, 2, "apple"]
>>> l.append(5)
>>> l
[1, 2, "apple", 5]
>>> ds.info.update() # required to be persistent!
"""
self._cache.check_readonly()
if self._dataset is not None:
auto_checkout(self._dataset)
self._info.update(*args, **kwargs)

def __getattribute__(self, name: str) -> Any:
"""Allows access to info values using the `.` syntax. Example: `info.description`."""

Expand All @@ -82,48 +51,31 @@ def __getattribute__(self, name: str) -> Any:
return self.__getitem__(name)
return super().__getattribute__(name)

# implement all the methods of dictionary
def __getitem__(self, key: str):
return self._info[key]

def get(self, key: str, default: Optional[Any] = None):
return self._info.get(key, default)

def __str__(self):
return self._info.__str__()

def __repr__(self):
return self._info.__repr__()

@use_callback()
def delete(self, key: Optional[Union[Sequence[str], str]] = None):
"""Deletes a key or list of keys. If no key(s) is passed, all keys are deleted."""
self._cache.check_readonly()
if self._dataset is not None:
auto_checkout(self._dataset)
if key is None:
self._info.clear()
elif isinstance(key, str):
def __setitem__(self, key, value):
with self:
self._info[key] = value

def __delitem__(self, key):
with self:
del self._info[key]
elif isinstance(key, Sequence):
for k in key:
del self._info[k]
else:
raise KeyError(key)

@use_callback()
def __setitem__(self, key: str, value):
self._cache.check_readonly()
if self._dataset is not None:
auto_checkout(self._dataset)
self._info[key] = value
def __contains__(self, key):
return key in self._info

def __setattr__(self, key: str, value):
if key in {"_key", "_cache", "_info", "_dataset"}:
object.__setattr__(self, key, value)
else:
self[key] = value
def __iter__(self):
return iter(self._info)

def __getattr__(self, key: str):
def __getattr__(self, key):
try:
return object.__getattribute__(self, key)
except AttributeError:
Expand All @@ -132,6 +84,53 @@ def __getattr__(self, key: str):
return self._info
return self[key]

# Route attribute assignment: the four internal names below are stored as real
# instance attributes; every other name is written into the info mapping.
def __setattr__(self, key: str, value):
# Bookkeeping fields go straight to object.__setattr__, bypassing this hook.
if key in {"_info", "_dataset", "_key", "is_dirty"}:
object.__setattr__(self, key, value)
else:
# `self[key] = value` dispatches to __setitem__, executed inside this
# object's own context manager.
# NOTE(review): __setitem__ as shown in this diff also opens `with self`,
# so the context appears to be entered twice on this path - confirm intended.
with self:
self[key] = value

def get(self, key, default=None):
    """Look up ``key`` in the stored info, falling back to ``default`` when absent."""
    stored = self._info
    return stored.get(key, default)

def setdefault(self, key, default=None):
    """Return the value stored under ``key``, inserting ``default`` first if the key is missing.

    The lookup/insert runs inside this object's own context manager.
    """
    with self:
        stored = self._info
        value = stored.setdefault(key, default)
    return value

def clear(self):
    """Remove every key/value pair from the stored info.

    The removal runs inside this object's own context manager.
    """
    with self:
        stored = self._info
        stored.clear()

def pop(self, key, default=None):
    """Remove ``key`` from the stored info and return its value.

    When ``key`` is not present, ``default`` is returned instead (``None``
    unless the caller supplies one). The removal runs inside this object's
    own context manager.
    """
    with self:
        stored = self._info
        removed = stored.pop(key, default)
    return removed

def popitem(self):
    """Remove and return one ``(key, value)`` pair from the stored info.

    Delegates to ``dict.popitem`` on the underlying mapping, inside this
    object's own context manager.
    """
    with self:
        stored = self._info
        pair = stored.popitem()
    return pair

def update(self, *args, **kwargs):
    """Merge new key/value pairs into the stored info.

    Arguments are forwarded unchanged to ``dict.update`` on the underlying
    mapping; the merge runs inside this object's own context manager.
    """
    with self:
        stored = self._info
        stored.update(*args, **kwargs)

def keys(self):
    """Return the key view of the underlying info mapping."""
    stored = self._info
    return stored.keys()

def values(self):
    """Return the value view of the underlying info mapping."""
    stored = self._info
    return stored.values()

def items(self):
    """Return the ``(key, value)`` view of the underlying info mapping."""
    stored = self._info
    return stored.items()

def replace_with(self, d):
    """Replace the entire stored info with the contents of ``d``.

    Args:
        d: Anything ``dict()`` accepts - a mapping or an iterable of
            key/value pairs.

    Raises:
        TypeError, ValueError: Propagated from ``dict(d)`` when ``d`` cannot
            be converted. The previously stored info is left intact in
            that case.
    """
    with self:
        # Snapshot before clearing: if ``d`` is (or reads from) ``self._info``,
        # clearing first would empty the source too, and if ``d`` is malformed
        # the failure must happen before any existing data is discarded.
        replacement = dict(d)
        # Mutate in place so the underlying dict object keeps its identity.
        self._info.clear()
        self._info.update(replacement)

# the below methods are used by cloudpickle dumps
def __origin__(self):
return None
Expand All @@ -155,11 +154,15 @@ def __args__(self):
return None


def load_info(info_key: str, cache: LRUCache, dataset):
if info_key in cache:
info = cache.get_cachable(info_key, Info, callback_arg=dataset)
else:
def load_info(path, dataset, key=None):
storage: LRUCache = dataset.storage

try:
info = storage.get_hub_object(path, Info)
except KeyError:
info = Info()
info.initialize_callback_location(info_key, cache, dataset)

info._dataset = dataset
info._key = key
storage.register_hub_object(path, info)
return info
7 changes: 5 additions & 2 deletions hub/api/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ def test_persist_keys(local_ds_generator):
"dataset_meta.json",
"image/commit_diff",
"image/tensor_meta.json",
"image/tensor_info.json",
}


Expand Down Expand Up @@ -623,6 +622,9 @@ def test_like(local_path):

src_ds.d.info.update(key=1)

assert src_ds.info.key == 0
assert src_ds.d.info.key == 1

dest_ds = hub.like(dest_path, src_ds)

assert tuple(dest_ds.tensors.keys()) == ("a", "b", "c", "d")
Expand Down Expand Up @@ -806,7 +808,8 @@ def test_groups(local_ds_generator):

def test_tensor_delete(local_ds_generator):
ds = local_ds_generator()
ds.create_tensor("x")
ds.create_tensor("x", max_chunk_size=2 * MB)
ds.x.extend(np.ones((3, 253, 501, 5)))
ds.delete_tensor("x")
assert list(ds.storage.keys()) == ["dataset_meta.json"]
assert ds.tensors == {}
Expand Down
78 changes: 72 additions & 6 deletions hub/api/tests/test_info.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import pytest


def test_dataset(local_ds_generator):
ds = local_ds_generator()

Expand All @@ -20,6 +23,7 @@ def test_dataset(local_ds_generator):
test_list.extend(["user made change without `update`"])

ds.info.update({"1_-+": 5})
assert len(ds.info) == 7

ds = local_ds_generator()

Expand All @@ -41,13 +45,14 @@ def test_dataset(local_ds_generator):
assert len(ds.info) == 7
assert ds.info.test == [99]

ds.info.delete("test")
ds.info.pop("test")
assert len(ds.info) == 6

ds.info.delete(["1_-+", "xyz"])
ds.info.pop("1_-+")
ds.info.pop("xyz")
assert len(ds.info) == 4

ds.info.delete()
ds.info.clear()
assert len(ds.info) == 0


Expand Down Expand Up @@ -89,13 +94,14 @@ def test_tensor(local_ds_generator):

assert t1.info.key == 99

t2.info.delete("key")
t2.info.pop("key")
assert len(t2.info) == 3

t2.info.delete(["key2", "key3"])
t2.info.pop("key2")
t2.info.pop("key3")
assert len(t2.info) == 1

t2.info.delete()
t2.info.clear()
assert len(t2.info) == 0


Expand Down Expand Up @@ -147,3 +153,63 @@ def test_class_label(local_ds_generator):
ds.labels.info.class_names == ds.labels.info["class_names"] == ["c", "b", "a"]
)
assert ds.labels2.info.class_names == ds.labels2.info["class_names"] == []


# Exercises the dict-style Info API on a dataset and on a tensor: item
# assignment/deletion, membership, iteration, setdefault, popitem, the
# items()/values() views, and whole-object assignment to `.info`.
def test_info_new_methods(local_ds_generator):
ds = local_ds_generator()
ds.create_tensor("x")

# Item assignment with integer keys.
ds.info[0] = "hello"
ds.info[1] = "world"
assert len(ds.info) == 2
assert set(ds.info.keys()) == {0, 1}
assert 0 in ds.info
assert 1 in ds.info

assert ds.info[0] == "hello"
assert ds.info[1] == "world"

# `del` removes only the targeted key.
del ds.info[0]
assert len(ds.info) == 1
assert 1 in ds.info
assert ds.info[1] == "world"

# Iterating the info object yields its keys.
for it in ds.info:
assert it == 1

# setdefault inserts the missing key 0 and leaves key 1 untouched.
ds.info.setdefault(0, "yo")
assert len(ds.info) == 2
assert 0 in ds.info
assert 1 in ds.info
assert ds.info[0] == "yo"
assert ds.info[1] == "world"

# popitem is expected to drop the entry added last (key 0), leaving key 1.
ds.info.popitem()
assert len(ds.info) == 1
assert 1 in ds.info
assert ds.info[1] == "world"

for k, v in ds.info.items():
assert k == 1
assert v == "world"

for v in ds.info.values():
assert v == "world"

# Assigning a dict to `.info` replaces the previous contents (dataset level).
ds.info = {"a": "b"}
assert len(ds.info) == 1
assert "a" in ds.info
assert ds.info["a"] == "b"

# Same replacement behaviour at tensor level.
ds.x.info = {"x": "y", "z": "w"}
assert len(ds.x.info) == 2
assert "x" in ds.x.info
assert "z" in ds.x.info
assert ds.x.info["x"] == "y"
assert ds.x.info["z"] == "w"

# Assigning a non-dict (a list here) must be rejected with TypeError.
with pytest.raises(TypeError):
ds.info = ["abc"]

with pytest.raises(TypeError):
ds.x.info = ["abc"]

0 comments on commit aada0a9

Please sign in to comment.