Merge pull request #2271 from activeloopai/fy_commit_info
[AL-2227] Commit level info
FayazRahman committed Apr 20, 2023
2 parents 89ee314 + 3421b5d commit 68487d2
Showing 9 changed files with 389 additions and 45 deletions.
10 changes: 3 additions & 7 deletions deeplake/api/dataset.py
@@ -16,6 +16,7 @@
from deeplake.util.connect_dataset import connect_dataset_entry
from deeplake.util.version_control import (
load_version_info,
rebuild_version_info,
get_parent_and_reset_commit_ids,
replace_head,
integrity_check,
@@ -275,10 +276,6 @@ def init(
"Exception occured (see Traceback). The dataset maybe corrupted. "
"Try using `reset=True` to reset HEAD changes and load the previous commit."
) from e
if storage.read_only:
raise ReadOnlyModeError(
"Cannot reset HEAD when loading dataset in read-only mode."
)
return dataset._reset_and_load(
cache_chain, access_method, dataset_kwargs, address, e
)
@@ -610,16 +607,15 @@ def load(

@staticmethod
def _reset_and_load(storage, access_method, dataset_kwargs, address, err):
"""Reset and then load the dataset. Only called when loading dataset errored out with `err`."""
"""Reset and then load the dataset. Only called when loading dataset errored out with ``err``."""
if access_method != "stream":
dataset_kwargs["reset"] = True
ds = dataset._load(dataset_kwargs, access_method)
return ds

try:
version_info = load_version_info(storage)
except KeyError:
# no version control info - can't do anything
except Exception:
raise err

address = address or "main"
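
The error string above already points users at `reset=True`; a minimal recovery sketch of that path, assuming a hypothetical local dataset path:

import deeplake
from deeplake.util.exceptions import DatasetCorruptError

try:
    ds = deeplake.load("./my_dataset")  # placeholder path
except DatasetCorruptError:
    # Discard uncommitted HEAD changes and load the previous commit,
    # as the error message suggests.
    ds = deeplake.load("./my_dataset", reset=True)
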
162 changes: 161 additions & 1 deletion deeplake/api/tests/test_reset.py
@@ -1,9 +1,17 @@
from deeplake.util.exceptions import DatasetCorruptError, ReadOnlyModeError
from deeplake.util.exceptions import (
DatasetCorruptError,
ReadOnlyModeError,
CheckoutError,
)
from deeplake.util.version_control import rebuild_version_info
from deeplake.util.testing import compare_version_info
from deeplake.util.keys import get_commit_info_key

import numpy as np

import deeplake
import pytest
import json


def corrupt_ds(ds, tensor, data):
@@ -57,6 +65,38 @@ def test_load_corrupt_dataset(path):
verify_reset_on_checkout(ds, "main", second, save_head, {"abc": [[1], [2]]})


def test_load_corrupt_dataset_no_vc(local_path):
ds = deeplake.empty(local_path)

with ds:
ds.create_tensor("abc")
ds.abc.append(1)
first = ds.commit()

ds.abc.append(2)
second = ds.commit()

ds = deeplake.load(local_path)
corrupt_ds(ds, "abc", 3)
save_head = ds.pending_commit_id

saved = json.loads(ds.storage["version_control_info.json"].decode("utf-8"))
del ds.storage["version_control_info.json"]

with pytest.raises(KeyError):
ds.storage["version_control_info.json"]

with pytest.raises(DatasetCorruptError):
ds = deeplake.load(local_path)

reloaded = json.loads(ds.storage["version_control_info.json"].decode("utf-8"))
compare_version_info(saved, reloaded)

ds = deeplake.load(local_path, reset=True)

verify_reset_on_checkout(ds, "main", second, save_head, {"abc": [[1], [2]]})


def test_load_corrupted_branch(local_path):
ds = deeplake.empty(local_path, overwrite=True)
ds.create_tensor("abc")
@@ -164,3 +204,123 @@ def test_load_corrupt_dataset_with_no_commits(local_path):
ds = deeplake.load(local_path, reset=True)

assert set(ds._tensors()) == set()


def test_rebuild_vc_info(local_ds):
with local_ds as ds:
ds.create_tensor("abc")
ds.abc.append(1)
ds.commit()
ds.checkout("alt1", create=True)
ds.abc.append(2)
ds.commit()
ds.checkout("main")
ds.abc.append(3)
ds.commit()
ds.checkout("alt2", create=True)
ds.abc.append(4)
ds.commit()
ds.abc.append(5)
ds.commit()
ds.checkout("main")
ds.merge("alt2")
ds.merge("alt1")

saved = json.loads(local_ds.storage["version_control_info.json"])
del local_ds.storage["version_control_info.json"]

with pytest.raises(KeyError):
local_ds.storage["version_control_info.json"]

rebuild_version_info(local_ds.storage)

reloaded = json.loads(local_ds.storage["version_control_info.json"])

compare_version_info(saved, reloaded)


def test_fix_vc(local_path):
ds = deeplake.empty(local_path, overwrite=True)

with ds:
ds.create_tensor("abc")
ds.abc.append(1)
ds.commit()
ds.checkout("alt", create=True)
ds.abc.append(2)
ds.commit()
ds.checkout("main")
ds.abc.append(3)
ds.commit()

saved = json.loads(ds.storage["version_control_info.json"].decode("utf-8"))

for commit_id, commit in saved["commits"].items():
commit["children"] = [
c for c in commit["children"] if saved["commits"][c]["branch"] != "alt"
]
alt_id = saved["branches"].pop("alt")
del saved["commits"][alt_id]
saved["commits"] = dict(
filter(
lambda x: x[0] != alt_id and x[1]["branch"] != "alt",
saved["commits"].items(),
)
)

ds.storage["version_control_info.json"] = json.dumps(saved).encode("utf-8")
ds.storage.flush()

ds = deeplake.load(local_path)

with pytest.raises(CheckoutError):
ds.checkout("alt")

ds.fix_vc()

ds.checkout("alt")

np.testing.assert_array_equal(ds.abc.numpy(), [[1], [2]])


def test_missing_commit_infos(local_ds):
with local_ds as ds:
ds.create_tensor("abc")
ds.abc.append(1)
a = ds.commit()
ds.abc.append(2)
b = ds.commit()
ds.checkout("alt", create=True)
ds.abc.append(3)
c = ds.commit()
ds.abc.append(4)
d = ds.commit()
ds.abc.append(5)

del ds.storage["version_control_info.json"]
del ds.storage[get_commit_info_key(d)]
ds.storage.flush()

ds = deeplake.load(local_ds.path)

np.testing.assert_array_equal(ds.abc.numpy(), [[1], [2]])

ds.checkout("alt")

np.testing.assert_array_equal(ds.abc.numpy(), [[1], [2], [3]])

assert ds.commit_id == c


def test_dataset_with_no_commits_unaffected(local_path):
ds = deeplake.empty(local_path, overwrite=True)

ds.create_tensor("abc")
ds.abc.append(1)

del ds.storage["version_control_info.json"]
ds.storage.flush()

ds = deeplake.load(local_path)

np.testing.assert_array_equal(ds.abc.numpy(), [[1]])
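
The tests above read version_control_info.json directly from storage; from the fields they access, a hedged sketch of its top-level shape (ids and values are placeholders, inferred from test_fix_vc and test_rebuild_vc_info rather than from a specification):

# Illustrative shape only; not taken from a real dataset.
version_control_info = {
    "commits": {
        "<commit_id>": {
            "branch": "main",
            "children": ["<child_commit_id>"],
            # remaining per-commit fields (message, time, user, parent) are
            # expected here; see CommitNode.to_json further down
        },
    },
    "branches": {
        "main": "<head_commit_id_of_main>",
        "alt": "<head_commit_id_of_alt>",
    },
}
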
1 change: 1 addition & 0 deletions deeplake/constants.py
@@ -52,6 +52,7 @@
DATASET_INFO_FILENAME = "dataset_info.json"
TENSOR_INFO_FILENAME = "tensor_info.json"

COMMIT_INFO_FILENAME = "commit_info.json"
DATASET_LOCK_FILENAME = "dataset_lock.lock"
DATASET_DIFF_FILENAME = "dataset_diff"
TENSOR_COMMIT_CHUNK_MAP_FILENAME = "chunk_set"
19 changes: 17 additions & 2 deletions deeplake/core/dataset/dataset.py
@@ -20,6 +20,8 @@
from deeplake.util.version_control import (
save_version_info,
integrity_check,
save_commit_info,
rebuild_version_info,
)
from deeplake.util.invalid_view_op import invalid_view_op
from deeplake.util.spinner import spinner
@@ -1224,7 +1226,12 @@ def _load_version_info(self, address=None):

version_state = {}
try:
version_info = load_version_info(self.storage)
try:
version_info = load_version_info(self.storage)
except Exception as e:
version_info = rebuild_version_info(self.storage)
if version_info is None:
raise e
version_state["branch_commit_map"] = version_info["branch_commit_map"]
version_state["commit_node_map"] = version_info["commit_node_map"]

@@ -1415,7 +1422,6 @@ def merge(
self._initial_autoflush.append(self.storage.autoflush)
self.storage.autoflush = False
merge(self, target_id, conflict_resolution, delete_removed_tensors, force)
self.__dict__["_vc_info_updated"] = False
self.storage.autoflush = self._initial_autoflush.pop()
self.storage.maybe_flush()

@@ -2139,6 +2145,9 @@ def flush(self):
def _flush_vc_info(self):
if self._vc_info_updated:
save_version_info(self.version_state, self.storage)
for node in self.version_state["commit_node_map"].values():
if node._info_updated:
save_commit_info(node, self.storage)
self.__dict__["_vc_info_updated"] = False

def clear_cache(self):
@@ -3563,6 +3572,12 @@ def reset(self, force: bool = False):

self.checkout(new_commit_id)

def fix_vc(self):
"""Rebuilds version control info. To be used when the version control info is corrupted."""
version_info = rebuild_version_info(self.storage)
self.version_state["commit_node_map"] = version_info["commit_node_map"]
self.version_state["branch_commit_map"] = version_info["branch_commit_map"]

def connect(
self,
creds_key: str,
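
A hedged usage sketch of the new fix_vc method, mirroring test_fix_vc above (the dataset path is a placeholder):

import deeplake
from deeplake.util.exceptions import CheckoutError

ds = deeplake.load("./my_dataset")  # placeholder path
try:
    ds.checkout("alt")
except CheckoutError:
    # Version control info is corrupted or incomplete: rebuild it from the
    # per-commit info files, then retry the checkout.
    ds.fix_vc()
    ds.checkout("alt")
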
11 changes: 11 additions & 0 deletions deeplake/core/version_control/commit_node.py
@@ -15,6 +15,7 @@ def __init__(self, branch: str, commit_id: str):
self.commit_time: Optional[datetime] = None
self.commit_user_name: Optional[str] = None
self.merge_parent: Optional["CommitNode"] = None
self._info_updated: bool = False

def add_child(self, node: "CommitNode"):
"""Adds a child to the node, used for branching."""
@@ -54,3 +55,13 @@ def is_head_node(self) -> bool:
return self.commit_time is None

__str__ = __repr__

def to_json(self):
return {
"branch": self.branch,
"children": [node.commit_id for node in self.children],
"parent": self.parent.commit_id if self.parent else None,
"commit_message": self.commit_message,
"commit_time": self.commit_time.timestamp() if self.commit_time else None,
"commit_user_name": self.commit_user_name,
}
3 changes: 2 additions & 1 deletion deeplake/core/version_control/test_merge.py
@@ -2,7 +2,8 @@
import pytest
import deeplake

from deeplake.util.testing import assert_array_equal
from deeplake.util.testing import assert_array_equal, compare_version_info
from deeplake.util.version_control import rebuild_version_info
from deeplake.util.exceptions import (
MergeConflictError,
MergeMismatchError,
7 changes: 7 additions & 0 deletions deeplake/util/keys.py
@@ -1,6 +1,7 @@
import posixpath
from deeplake.constants import (
CHUNKS_FOLDER,
COMMIT_INFO_FILENAME,
DATASET_DIFF_FILENAME,
DATASET_INFO_FILENAME,
DATASET_LOCK_FILENAME,
@@ -59,6 +60,12 @@ def get_dataset_diff_key(commit_id: str) -> str:
return "/".join(("versions", commit_id, DATASET_DIFF_FILENAME))


def get_commit_info_key(commit_id: str) -> str:
if commit_id == FIRST_COMMIT_ID:
return COMMIT_INFO_FILENAME
return "/".join(("versions", commit_id, COMMIT_INFO_FILENAME))


def get_dataset_linked_creds_key() -> str:
return LINKED_CREDS_FILENAME

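Combined with COMMIT_INFO_FILENAME above, the resulting storage keys would look roughly as follows (the commit id is a placeholder):

get_commit_info_key(FIRST_COMMIT_ID)  # -> "commit_info.json"
get_commit_info_key("<commit_id>")    # -> "versions/<commit_id>/commit_info.json"
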
15 changes: 15 additions & 0 deletions deeplake/util/testing.py
@@ -8,3 +8,18 @@ def assert_array_equal(x, y, *args, **kwargs):
return np.testing.assert_array_equal(xi, yi, *args, **kwargs)
else:
return np.testing.assert_array_equal(x, y, *args, **kwargs)


def compare_version_info(info1, info2):
for commit_id in info1["commits"]:
commit_info_1 = info1["commits"][commit_id]
commit_info_2 = info2["commits"][commit_id]
for key in commit_info_1:
if key == "children":
if set(commit_info_1[key]) != set(commit_info_2[key]):
print(commit_info_1[key], commit_info_2[key])
assert set(commit_info_1[key]) == set(commit_info_2[key])
else:
assert commit_info_1[key] == commit_info_2[key]

assert info1["branches"] == info2["branches"]