Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We'll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AL-1606] Reduce autocommits before checkout #1485

Merged
merged 69 commits into from
Feb 15, 2022
Merged
Show file tree
Hide file tree
Changes from 67 commits
Commits
Show all changes
69 commits
Select commit Hold shift + click to select a range
8316498
rewrite info, introduce is_dirty
AbhinavTuli Jan 24, 2022
63da480
test fixes, cleanup
AbhinavTuli Jan 25, 2022
7a1c168
cleanup transform code
AbhinavTuli Jan 27, 2022
6eaf915
lint fix
AbhinavTuli Jan 27, 2022
c6ecc9b
fix transform
AbhinavTuli Jan 31, 2022
80aa8b9
adds flush dirty items
AbhinavTuli Jan 31, 2022
2a6ecfb
Merge remote-tracking branch 'origin' into remove/cachable
AbhinavTuli Jan 31, 2022
2ffa0d5
linting
AbhinavTuli Feb 1, 2022
f38d998
fix import
AbhinavTuli Feb 1, 2022
340136c
change how chunk engine handles dirty objects
AbhinavTuli Feb 1, 2022
b9d8a54
lint
AbhinavTuli Feb 1, 2022
5e2183f
Merge remote-tracking branch 'origin' into remove/cachable
AbhinavTuli Feb 1, 2022
919bc25
fix recursion error and imports
AbhinavTuli Feb 1, 2022
6ecd2c8
fix some tests and lint
AbhinavTuli Feb 1, 2022
bc13622
replace cachable with HubMemoryObject
AbhinavTuli Feb 1, 2022
b08ec0b
lint fix
AbhinavTuli Feb 2, 2022
ba48d30
fix tensor delete
AbhinavTuli Feb 2, 2022
326861a
fix vc bug
AbhinavTuli Feb 2, 2022
3cf44db
fix ds.append
AbhinavTuli Feb 2, 2022
c053b28
fix chunk compressed chunk
AbhinavTuli Feb 2, 2022
0095a0f
bug fix
AbhinavTuli Feb 2, 2022
efd3561
fix info
AbhinavTuli Feb 2, 2022
9788aca
reduce flushes
AbhinavTuli Feb 2, 2022
56ee27f
fix io.py
AbhinavTuli Feb 3, 2022
4c98176
fix cache pickling
AbhinavTuli Feb 3, 2022
bf1ab74
fix shape updates
AbhinavTuli Feb 3, 2022
bc75517
transform fix
AbhinavTuli Feb 3, 2022
1a39dec
change info exception
AbhinavTuli Feb 7, 2022
9da2d87
fix autocheckout
AbhinavTuli Feb 7, 2022
befccb0
update info test
AbhinavTuli Feb 7, 2022
9eeec9a
fix diff tests
AbhinavTuli Feb 8, 2022
d057ff4
Merge remote-tracking branch 'origin' into remove/cachable
AbhinavTuli Feb 8, 2022
ff72c3a
remove duplicate method from info
AbhinavTuli Feb 8, 2022
438bdf8
mypy fix
AbhinavTuli Feb 8, 2022
58af3d7
added more info codecov
AbhinavTuli Feb 8, 2022
4da514d
removed unnecessary argument
AbhinavTuli Feb 8, 2022
e8f34be
increase codecov
AbhinavTuli Feb 9, 2022
97bc46f
makes nbytes abstract method
AbhinavTuli Feb 9, 2022
dbe28fb
change decorator order
AbhinavTuli Feb 9, 2022
97ebcb7
use context in info
AbhinavTuli Feb 9, 2022
8fb8067
reflect info modifications in diff
AbhinavTuli Feb 9, 2022
2dbc9ab
add modify info calls
AbhinavTuli Feb 9, 2022
45199b8
remove info updated from dataset meta
AbhinavTuli Feb 9, 2022
1c3c558
more version control tweaks
AbhinavTuli Feb 9, 2022
eca4fd0
minor tweaks to dataset
AbhinavTuli Feb 9, 2022
81ca4b2
Merge remote-tracking branch 'origin' into remove/cachable
AbhinavTuli Feb 9, 2022
725c2a3
don't mark certain encoders as dirty on init
AbhinavTuli Feb 9, 2022
d39b4a9
remove unused method
AbhinavTuli Feb 9, 2022
10003cf
fix circular import
AbhinavTuli Feb 9, 2022
b4b1904
improve codecov
AbhinavTuli Feb 9, 2022
a06e4af
fix after merge
AbhinavTuli Feb 9, 2022
ceba68c
refactor to remove dataset from cache
AbhinavTuli Feb 9, 2022
e1b9857
undo temporary test changes
AbhinavTuli Feb 9, 2022
274a9d8
simplify get_chunk
AbhinavTuli Feb 9, 2022
58ec94b
added dataset diff
AbhinavTuli Feb 10, 2022
069dd25
changes autocommit logic
AbhinavTuli Feb 10, 2022
10badf1
fix dataset diff tobytes
AbhinavTuli Feb 10, 2022
d647a1d
lint fixes
AbhinavTuli Feb 10, 2022
cc9c626
fix vc tests
AbhinavTuli Feb 14, 2022
ddb218f
change remove_hub_object implementation
AbhinavTuli Feb 14, 2022
7b4e83e
Merge remote-tracking branch 'origin' into remove/cachable
AbhinavTuli Feb 14, 2022
aea34a5
Merge branch 'remove/cachable' into reduce/autocommits
AbhinavTuli Feb 14, 2022
72b4a7f
remove unused code
AbhinavTuli Feb 14, 2022
81032ab
fix windows issue
AbhinavTuli Feb 14, 2022
b6b03db
Merge branch 'remove/cachable' into reduce/autocommits
AbhinavTuli Feb 14, 2022
e4f13c0
mypy fix
AbhinavTuli Feb 14, 2022
248b6ac
Merge branch 'remove/cachable' into reduce/autocommits
AbhinavTuli Feb 14, 2022
bc48e49
Merge remote-tracking branch 'origin' into reduce/autocommits
AbhinavTuli Feb 15, 2022
2d5b2d4
changed print statements
AbhinavTuli Feb 15, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
185 changes: 95 additions & 90 deletions hub/api/info.py
Original file line number Diff line number Diff line change
@@ -1,78 +1,49 @@
from hub.util.version_control import auto_checkout
from hub.core.storage.lru_cache import LRUCache
from typing import Any, Dict, Optional, Union, Sequence
from hub.core.storage.cachable import CachableCallback, use_callback
from hub.util.exceptions import InfoError
from hub.core.storage.hub_memory_object import HubMemoryObject
from typing import Any, Dict


class Info(CachableCallback):
class Info(HubMemoryObject):
def __init__(self):
"""Contains **optional** key/values that datasets/tensors use for human-readability.
See the `Meta` class for required key/values for datasets/tensors.

Note:
Since `Info` is rarely written to and mostly by the user, every modifier will call `cache[key] = self`.
Must call `initialize_callback_location` before using any methods.
"""
self._info = {}
super().__init__()
self._dataset = None

# the key to info in case of Tensor Info, None in case of Dataset Info
self._key = None
self.is_dirty = False

def __enter__(self):
    """Open a modification context: validate writability and mark dirty.

    Raises:
        InfoError: If the dataset is checked out to a non-head commit.
    """
    ds = self._dataset
    key = self._key
    if ds is not None:
        # Writes are only allowed on writable storage and a head commit.
        ds.storage.check_readonly()
        if not ds.version_state["commit_node"].is_head_node:
            raise InfoError("Cannot modify info from a non-head commit.")
        self.is_dirty = True
        # _key is set for tensor-level info; None means dataset-level info.
        if key:
            ds[key].chunk_engine.commit_diff.modify_info()
        else:
            ds._dataset_diff.modify_info()
    return self

def __exit__(self, exc_type, exc_val, exc_tb):
    # Persist any pending changes when leaving the modification context.
    if self._dataset is not None:
        self._dataset.storage.maybe_flush()

@property
def nbytes(self):
    """Approximate serialized size in bytes (serializes to count)."""
    # TODO: optimize this
    return len(self.tobytes())

@use_callback(check_only=True)
def __len__(self):
return len(self._info)

@use_callback(check_only=True)
def __getstate__(self) -> Dict[str, Any]:
return self._info

def __setstate__(self, state: Dict[str, Any]):
self._info = state

@use_callback()
def update(self, *args, **kwargs):
"""Store optional dataset/tensor information. Will be accessible after loading your data from a new script!
Inputs must be supported by JSON.


Note:
This method has the same functionality as `dict().update(...)` Reference: https://www.geeksforgeeks.org/python-dictionary-update-method/.
A full list of supported value types can be found here: https://docs.python.org/3/library/json.html#json.JSONEncoder.

Examples:
Normal update usage:
>>> ds.info
{}
>>> ds.info.update(key=0)
>>> ds.info
{"key": 0}
>>> ds.info.update({"key1": 5, "key2": [1, 2, "test"]})
>>> ds.info
{"key": 0, "key1": 5, "key2": [1, 2, "test"]}

Alternate update usage:
>>> ds.info
{}
>>> ds.info.update(list=[1, 2, "apple"])
>>> ds.info
{"list": [1, 2, "apple"]}
>>> l = ds.info.list
>>> l
[1, 2, "apple"]
>>> l.append(5)
>>> l
[1, 2, "apple", 5]
>>> ds.info.update() # required to be persistent!

"""
self._cache.check_readonly()
if self._dataset is not None:
auto_checkout(self._dataset)
self._info.update(*args, **kwargs)

def __getattribute__(self, name: str) -> Any:
"""Allows access to info values using the `.` syntax. Example: `info.description`."""

Expand All @@ -82,48 +53,31 @@ def __getattribute__(self, name: str) -> Any:
return self.__getitem__(name)
return super().__getattribute__(name)

# implement all the methods of dictionary
def __getitem__(self, key: str):
    """Dict-style read access: ``info[key]``."""
    return self._info[key]

def get(self, key: str, default: Optional[Any] = None):
return self._info.get(key, default)

def __str__(self):
    """Human-readable form; delegates to the underlying dict."""
    return str(self._info)

def __repr__(self):
    """Debug form; delegates to the underlying dict."""
    return repr(self._info)

@use_callback()
def delete(self, key: Optional[Union[Sequence[str], str]] = None):
"""Deletes a key or list of keys. If no key(s) is passed, all keys are deleted."""
self._cache.check_readonly()
if self._dataset is not None:
auto_checkout(self._dataset)
if key is None:
self._info.clear()
elif isinstance(key, str):
def __setitem__(self, key, value):
    """Dict-style write: ``info[key] = value``."""
    # `with self` validates writability and marks this object dirty.
    with self:
        self._info[key] = value

def __delitem__(self, key):
    """Dict-style delete: ``del info[key]``."""
    with self:
        del self._info[key]
elif isinstance(key, Sequence):
for k in key:
del self._info[k]
else:
raise KeyError(key)

@use_callback()
def __setitem__(self, key: str, value):
self._cache.check_readonly()
if self._dataset is not None:
auto_checkout(self._dataset)
self._info[key] = value
def __contains__(self, key):
    """Membership test: ``key in info``."""
    return key in self._info

def __setattr__(self, key: str, value):
if key in {"_key", "_cache", "_info", "_dataset"}:
object.__setattr__(self, key, value)
else:
self[key] = value
def __iter__(self):
    """Iterate over stored keys, like a dict."""
    return iter(self._info)

def __getattr__(self, key: str):
def __getattr__(self, key):
try:
return object.__getattribute__(self, key)
except AttributeError:
Expand All @@ -132,6 +86,53 @@ def __getattr__(self, key: str):
return self._info
return self[key]

def __setattr__(self, key: str, value):
    """Attribute-style write routes through ``__setitem__``.

    Internal bookkeeping attributes are assigned directly to avoid
    recursing back into this method.
    """
    if key in {"_info", "_dataset", "_key", "is_dirty"}:
        object.__setattr__(self, key, value)
    else:
        # Everything else is user info; the context guards writability.
        with self:
            self[key] = value

def get(self, key, default=None):
    """Like ``dict.get``: value for `key`, else `default`."""
    return self._info.get(key, default)

def setdefault(self, key, default=None):
    """Like ``dict.setdefault``; writes inside the guarded context."""
    with self:
        ret = self._info.setdefault(key, default)
    return ret

def clear(self):
    """Remove all keys, like ``dict.clear``."""
    with self:
        self._info.clear()

def pop(self, key, default=None):
    """Like ``dict.pop``; writes inside the guarded context."""
    with self:
        popped = self._info.pop(key, default)
    return popped

def popitem(self):
    """Like ``dict.popitem``; writes inside the guarded context."""
    with self:
        popped = self._info.popitem()
    return popped

def update(self, *args, **kwargs):
    """Like ``dict.update``; writes inside the guarded context."""
    with self:
        self._info.update(*args, **kwargs)

def keys(self):
    """View of stored keys, like ``dict.keys``."""
    return self._info.keys()

def values(self):
    """View of stored values, like ``dict.values``."""
    return self._info.values()

def items(self):
    """View of (key, value) pairs, like ``dict.items``."""
    return self._info.items()

def replace_with(self, d):
    """Replace all contents with mapping `d` in one guarded operation."""
    with self:
        self._info.clear()
        self._info.update(d)

# the below methods are used by cloudpickle dumps
def __origin__(self):
    # Stub for cloudpickle/typing introspection; intentionally returns None.
    return None
Expand All @@ -155,11 +156,15 @@ def __args__(self):
return None


def load_info(info_key: str, cache: LRUCache, dataset):
if info_key in cache:
info = cache.get_cachable(info_key, Info, callback_arg=dataset)
else:
def load_info(path, dataset, key=None):
    """Load the `Info` object stored at `path`, creating one if absent.

    Args:
        path: Storage key under which the info object lives.
        dataset: Dataset whose storage (an LRUCache) is read from; also
            attached to the info for writability/commit checks.
        key: Tensor key for tensor-level info; None for dataset-level info.

    Returns:
        The registered `Info` instance.
    """
    storage: LRUCache = dataset.storage

    try:
        info = storage.get_hub_object(path, Info)
    except KeyError:
        # Nothing stored yet: start with an empty Info.
        info = Info()
    # Removed stray legacy call `info.initialize_callback_location(...)`:
    # it referenced the old CachableCallback API and undefined names.

    info._dataset = dataset
    info._key = key
    # Register so future loads hit the cached object and dirty state persists.
    storage.register_hub_object(path, info)
    return info
7 changes: 5 additions & 2 deletions hub/api/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ def test_persist_keys(local_ds_generator):
"dataset_meta.json",
"image/commit_diff",
"image/tensor_meta.json",
"image/tensor_info.json",
}


Expand Down Expand Up @@ -618,6 +617,9 @@ def test_like(local_path):

src_ds.d.info.update(key=1)

assert src_ds.info.key == 0
assert src_ds.d.info.key == 1

dest_ds = hub.like(dest_path, src_ds)

assert tuple(dest_ds.tensors.keys()) == ("a", "b", "c", "d")
Expand Down Expand Up @@ -801,7 +803,8 @@ def test_groups(local_ds_generator):

def test_tensor_delete(local_ds_generator):
ds = local_ds_generator()
ds.create_tensor("x")
ds.create_tensor("x", max_chunk_size=2 * MB)
ds.x.extend(np.ones((3, 253, 501, 5)))
ds.delete_tensor("x")
assert list(ds.storage.keys()) == ["dataset_meta.json"]
assert ds.tensors == {}
Expand Down
78 changes: 72 additions & 6 deletions hub/api/tests/test_info.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import pytest


def test_dataset(local_ds_generator):
ds = local_ds_generator()

Expand All @@ -20,6 +23,7 @@ def test_dataset(local_ds_generator):
test_list.extend(["user made change without `update`"])

ds.info.update({"1_-+": 5})
assert len(ds.info) == 7

ds = local_ds_generator()

Expand All @@ -41,13 +45,14 @@ def test_dataset(local_ds_generator):
assert len(ds.info) == 7
assert ds.info.test == [99]

ds.info.delete("test")
ds.info.pop("test")
assert len(ds.info) == 6

ds.info.delete(["1_-+", "xyz"])
ds.info.pop("1_-+")
ds.info.pop("xyz")
assert len(ds.info) == 4

ds.info.delete()
ds.info.clear()
assert len(ds.info) == 0


Expand Down Expand Up @@ -89,13 +94,14 @@ def test_tensor(local_ds_generator):

assert t1.info.key == 99

t2.info.delete("key")
t2.info.pop("key")
assert len(t2.info) == 3

t2.info.delete(["key2", "key3"])
t2.info.pop("key2")
t2.info.pop("key3")
assert len(t2.info) == 1

t2.info.delete()
t2.info.clear()
assert len(t2.info) == 0


Expand Down Expand Up @@ -147,3 +153,63 @@ def test_class_label(local_ds_generator):
ds.labels.info.class_names == ds.labels.info["class_names"] == ["c", "b", "a"]
)
assert ds.labels2.info.class_names == ds.labels2.info["class_names"] == []


def test_info_new_methods(local_ds_generator):
    """Exercise the dict-like API of Info: item access/deletion, iteration,
    setdefault/popitem, views, and whole-dict assignment on both dataset-
    and tensor-level info."""
    ds = local_ds_generator()
    ds.create_tensor("x")

    # item assignment and membership
    ds.info[0] = "hello"
    ds.info[1] = "world"
    assert len(ds.info) == 2
    assert set(ds.info.keys()) == {0, 1}
    assert 0 in ds.info
    assert 1 in ds.info

    assert ds.info[0] == "hello"
    assert ds.info[1] == "world"

    # deletion via `del`
    del ds.info[0]
    assert len(ds.info) == 1
    assert 1 in ds.info
    assert ds.info[1] == "world"

    # iterating info yields keys, like a dict
    for it in ds.info:
        assert it == 1

    ds.info.setdefault(0, "yo")
    assert len(ds.info) == 2
    assert 0 in ds.info
    assert 1 in ds.info
    assert ds.info[0] == "yo"
    assert ds.info[1] == "world"

    # popitem removes the most recently inserted pair (key 0)
    ds.info.popitem()
    assert len(ds.info) == 1
    assert 1 in ds.info
    assert ds.info[1] == "world"

    for k, v in ds.info.items():
        assert k == 1
        assert v == "world"

    for v in ds.info.values():
        assert v == "world"

    # assigning a dict replaces the dataset info wholesale
    ds.info = {"a": "b"}
    assert len(ds.info) == 1
    assert "a" in ds.info
    assert ds.info["a"] == "b"

    # same for tensor-level info
    ds.x.info = {"x": "y", "z": "w"}
    assert len(ds.x.info) == 2
    assert "x" in ds.x.info
    assert "z" in ds.x.info
    assert ds.x.info["x"] == "y"
    assert ds.x.info["z"] == "w"

    # only mappings may be assigned to info
    with pytest.raises(TypeError):
        ds.info = ["abc"]

    with pytest.raises(TypeError):
        ds.x.info = ["abc"]
1 change: 1 addition & 0 deletions hub/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
TENSOR_INFO_FILENAME = "tensor_info.json"

DATASET_LOCK_FILENAME = "dataset_lock.lock"
DATASET_DIFF_FILENAME = "dataset_diff"
TENSOR_COMMIT_CHUNK_SET_FILENAME = "chunk_set"
TENSOR_COMMIT_DIFF_FILENAME = "commit_diff"

Expand Down
Loading