From d3e23a9fe1a20e99221f5fe04e7c735a87e3692e Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Thu, 15 Jul 2021 16:00:41 +0530 Subject: [PATCH 01/13] added pickling to dataset --- hub/api/dataset.py | 31 ++++++++++++++++++++++++++----- hub/api/tests/test_api.py | 34 ++++++++++++++++++++++++++++++++++ hub/core/storage/lru_cache.py | 26 ++++++++++++++++++++++++++ 3 files changed, 86 insertions(+), 5 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 0eb4376469..05cbdc2dc4 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -86,19 +86,19 @@ def __init__( self.storage = generate_chain( base_storage, memory_cache_size_bytes, local_cache_size_bytes, path ) - self.storage.autoflush = True self.index = index or Index() - self.tensors: Dict[str, Tensor] = {} - self._token = token + self.public = public + self._init_helper() + def _init_helper(self): + self.storage.autoflush = True if self.path.startswith("hub://"): split_path = self.path.split("/") self.org_id, self.ds_name = split_path[2], split_path[3] - self.client = HubBackendClient(token=token) + self.client = HubBackendClient(token=self._token) - self.public = public self._load_meta() hub_reporter.feature_report( @@ -118,6 +118,27 @@ def __len__(self): tensor_lengths = [len(tensor[self.index]) for tensor in self.tensors.values()] return min(tensor_lengths, default=0) + def __getstate__(self): + """Returns a dict that can be pickled and used to restore this dataset. + + PS: Pickling a dataset does not copy the dataset, it only saves attributes that can be used to restore the dataset. + If you pickle a local dataset and try to access it on a machine that does not have the data present, the dataset will not work. + """ + return { + "path": self.path, + "_read_only": self.read_only, + "index": self.index, + "public": self.public, + "storage": self.storage, + "_token": self.token, + } + + def __setstate__(self, state): + """Restores dataset from a pickled state.""" + self.__dict__.update(state) + self.tensors = {} + self._init_helper() + def __getitem__( self, item: Union[ diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 4e4339de72..c0b46ec78f 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -13,6 +13,7 @@ from hub.client.client import HubBackendClient from hub.client.utils import has_hub_testing_creds from click.testing import CliRunner +import pickle # need this for 32-bit and 64-bit systems to have correct tests @@ -487,3 +488,36 @@ def test_empty_dataset(): ds.create_tensor("z") ds = Dataset("test") assert list(ds.tensors) == ["x", "y", "z"] + + +@parametrize_all_dataset_storages +def test_dataset_pickling(ds): + ds.create_tensor("image", htype="image", sample_compression="jpeg") + ds.create_tensor("label") + for i in range(10): + ds.image.append(i * np.ones(((i + 1) * 20, (i + 1) * 20, 3), dtype=np.uint8)) + + for i in range(5): + ds.label.append(i) + + pickled_ds = pickle.dumps(ds) + unpickled_ds = pickle.loads(pickled_ds) + assert len(unpickled_ds.image) == len(ds.image) + assert len(unpickled_ds.label) == len(ds.label) + assert unpickled_ds.tensors.keys() == ds.tensors.keys() + assert unpickled_ds.index.values[0].value == ds.index.values[0].value + assert unpickled_ds.meta.version == ds.meta.version + + for i in range(10): + np.testing.assert_array_equal( + ds.image[i].numpy(), + (i * np.ones(((i + 1) * 20, (i + 1) * 20, 3), dtype=np.uint8)), + ) + np.testing.assert_array_equal( + ds.image[i].numpy(), unpickled_ds.image[i].numpy() + ) + for i in range(5): + 
+        np.testing.assert_array_equal(ds.label[i].numpy(), i)
+        np.testing.assert_array_equal(
+            ds.label[i].numpy(), unpickled_ds.label[i].numpy()
+        )
diff --git a/hub/core/storage/lru_cache.py b/hub/core/storage/lru_cache.py
index b957f62911..c75dc3a828 100644
--- a/hub/core/storage/lru_cache.py
+++ b/hub/core/storage/lru_cache.py
@@ -261,3 +261,29 @@ def _list_keys(self):
         for key in self.cache_storage:
             all_keys.add(key)
         return list(all_keys)
+
+    def __getstate__(self):
+        """Returns the state of the cache, for pickling"""
+        # flushes the cache before pickling
+        self.flush()
+        return {
+            "next_storage": self.next_storage,
+            "cache_storage": self.cache_storage,
+            "cache_size": self.cache_size,
+        }
+
+    def __setstate__(self, state):
+        """Recreates a cache with the same configuration as the state.
+
+        PS: While restoring the cache, we reset its contents.
+        In case the cache storage was local/s3 and is still accessible when unpickled (if same machine/s3 creds present respectively), the earlier cache contents are no longer accessible.
+
+        TODO: We might want to change this behaviour in the future by having a separate file that keeps a track of the lru order for restoring the cache.
+        This would also allow the cache to persist across different Dataset objects pointing to the same dataset.
+        """
+        self.next_storage = state["next_storage"]
+        self.cache_storage = state["cache_storage"]
+        self.cache_size = state["cache_size"]
+        self.lru_sizes = OrderedDict()
+        self.dirty_keys = set()
+        self.cache_used = 0

From 5a8aaba990be85dee36c72214b7aa2dbbcf029d3 Mon Sep 17 00:00:00 2001
From: AbhinavTuli
Date: Thu, 15 Jul 2021 21:16:42 +0530
Subject: [PATCH 02/13] fix issue with pickling memory provider

---
 hub/core/storage/lru_cache.py |  4 +++-
 hub/core/storage/memory.py    | 11 +++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/hub/core/storage/lru_cache.py b/hub/core/storage/lru_cache.py
index c75dc3a828..9b496450b6 100644
--- a/hub/core/storage/lru_cache.py
+++ b/hub/core/storage/lru_cache.py
@@ -264,8 +264,10 @@ def _list_keys(self):
 
     def __getstate__(self):
         """Returns the state of the cache, for pickling"""
+
         # flushes the cache before pickling
-        self.flush()
+        if not hasattr(self, "read_only") or not self.read_only:
+            self.flush()
         return {
             "next_storage": self.next_storage,
             "cache_storage": self.cache_storage,
diff --git a/hub/core/storage/memory.py b/hub/core/storage/memory.py
index 4f53d5e10e..32503222f2 100644
--- a/hub/core/storage/memory.py
+++ b/hub/core/storage/memory.py
@@ -96,3 +96,14 @@ def clear(self):
         """Clears the provider."""
         self.check_readonly()
         self.dict = {}
+
+    def __getstate__(self):
+        """Returns the state of the provider."""
+        # d = self.__dict__
+        # return self.__dict__
+        return self.root
+
+    def __setstate__(self, state):
+        """Sets the state of the provider."""
+        self.__init__(root=state)
+    
\ No newline at end of file

From 3e660360721ff26d5a5aabc08640bec5dc14a5a1 Mon Sep 17 00:00:00 2001
From: AbhinavTuli
Date: Thu, 15 Jul 2021 22:13:35 +0530
Subject: [PATCH 03/13] added exception while pickling memory dataset

---
 hub/api/dataset.py         |  3 +++
 hub/api/tests/test_api.py  | 23 ++++++++++++++++-------
 hub/core/storage/memory.py |  5 +----
 hub/util/exceptions.py     |  6 ++++++
 4 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/hub/api/dataset.py b/hub/api/dataset.py
index 41b66ceee0..206c22e7a6 100644
--- a/hub/api/dataset.py
+++ b/hub/api/dataset.py
@@ -16,6 +16,7 @@
 from hub.util.cache_chain import generate_chain
 from hub.util.exceptions import (
InvalidKeyTypeError, + MemoryDatasetCanNotBePickledError, PathNotEmptyException, TensorAlreadyExistsError, TensorDoesNotExistError, @@ -124,6 +125,8 @@ def __getstate__(self): PS: Pickling a dataset does not copy the dataset, it only saves attributes that can be used to restore the dataset. If you pickle a local dataset and try to access it on a machine that does not have the data present, the dataset will not work. """ + if self.path.startswith("mem://"): + raise MemoryDatasetCanNotBePickledError return { "path": self.path, "_read_only": self.read_only, diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 9cdcaabf0f..f4f3bc7cee 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -7,6 +7,7 @@ from hub.core.tests.common import parametrize_all_dataset_storages from hub.tests.common import assert_array_lists_equal from hub.util.exceptions import ( + MemoryDatasetCanNotBePickledError, TensorDtypeMismatchError, TensorInvalidSampleShapeError, ) @@ -507,13 +508,21 @@ def test_empty_dataset(): @parametrize_all_dataset_storages def test_dataset_pickling(ds): - ds.create_tensor("image", htype="image", sample_compression="jpeg") - ds.create_tensor("label") - for i in range(10): - ds.image.append(i * np.ones(((i + 1) * 20, (i + 1) * 20, 3), dtype=np.uint8)) - - for i in range(5): - ds.label.append(i) + if ds.path.startswith("mem://"): + with pytest.raises(MemoryDatasetCanNotBePickledError): + pickle.dumps(ds) + return + + with ds: + ds.create_tensor("image", htype="image", sample_compression="jpeg") + ds.create_tensor("label") + for i in range(10): + ds.image.append( + i * np.ones(((i + 1) * 20, (i + 1) * 20, 3), dtype=np.uint8) + ) + + for i in range(5): + ds.label.append(i) pickled_ds = pickle.dumps(ds) unpickled_ds = pickle.loads(pickled_ds) diff --git a/hub/core/storage/memory.py b/hub/core/storage/memory.py index 32503222f2..f9214e4bcf 100644 --- a/hub/core/storage/memory.py +++ b/hub/core/storage/memory.py @@ -98,12 +98,9 @@ def clear(self): self.dict = {} def __getstate__(self): - """Returns the state of the provider.""" - # d = self.__dict__ - # return self.__dict__ + """Does NOT save the in memory data in state.""" return self.root def __setstate__(self, state): - """Sets the state of the provider.""" self.__init__(root=state) \ No newline at end of file diff --git a/hub/util/exceptions.py b/hub/util/exceptions.py index bab69f6911..9d396732ba 100644 --- a/hub/util/exceptions.py +++ b/hub/util/exceptions.py @@ -431,3 +431,9 @@ def __init__(self): super().__init__( f"Python Shared memory with multiprocessing doesn't work properly on Windows." ) + +class MemoryDatasetCanNotBePickledError(Exception): + def __init__(self): + super().__init__( + "Dataset having MemoryProvider as underlying storage should not be pickled as data won't be saved." 
+ ) \ No newline at end of file From cd9e7f0b174dc045b29c749686764393c738c6e1 Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Thu, 15 Jul 2021 22:29:27 +0530 Subject: [PATCH 04/13] modify old test --- .../storage/tests/test_storage_provider.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/hub/core/storage/tests/test_storage_provider.py b/hub/core/storage/tests/test_storage_provider.py index e4931edf00..47c019efff 100644 --- a/hub/core/storage/tests/test_storage_provider.py +++ b/hub/core/storage/tests/test_storage_provider.py @@ -1,3 +1,4 @@ +from hub.core.storage.memory import MemoryProvider import pytest from click.testing import CliRunner @@ -126,12 +127,13 @@ def test_cache(storage): @parametrize_all_storages def test_pickling(storage): - with CliRunner().isolated_filesystem(): - FILE_1 = f"{KEY}_1" - storage[FILE_1] = b"hello world" - assert storage[FILE_1] == b"hello world" - pickle_file = open("storage_pickle", "wb") - pickle.dump(storage, pickle_file) - pickle_file = open("storage_pickle", "rb") - unpickled_storage = pickle.load(pickle_file) - assert unpickled_storage[FILE_1] == b"hello world" + if isinstance(storage, MemoryProvider): + # skip pickling test for memory provider as the actual data isn't pickled for it + return + + FILE_1 = f"{KEY}_1" + storage[FILE_1] = b"hello world" + assert storage[FILE_1] == b"hello world" + pickled_storage = pickle.dumps(storage) + unpickled_storage = pickle.loads(pickled_storage) + assert unpickled_storage[FILE_1] == b"hello world" From 9e88c5aeb69049b5537164c8ba9d2094391e137c Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Thu, 15 Jul 2021 22:31:52 +0530 Subject: [PATCH 05/13] lint fixes --- hub/core/storage/memory.py | 1 - hub/core/storage/tests/test_storage_provider.py | 2 +- hub/util/exceptions.py | 3 ++- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hub/core/storage/memory.py b/hub/core/storage/memory.py index f9214e4bcf..109d5f61a0 100644 --- a/hub/core/storage/memory.py +++ b/hub/core/storage/memory.py @@ -103,4 +103,3 @@ def __getstate__(self): def __setstate__(self, state): self.__init__(root=state) - \ No newline at end of file diff --git a/hub/core/storage/tests/test_storage_provider.py b/hub/core/storage/tests/test_storage_provider.py index 47c019efff..8ee5cb03f4 100644 --- a/hub/core/storage/tests/test_storage_provider.py +++ b/hub/core/storage/tests/test_storage_provider.py @@ -130,7 +130,7 @@ def test_pickling(storage): if isinstance(storage, MemoryProvider): # skip pickling test for memory provider as the actual data isn't pickled for it return - + FILE_1 = f"{KEY}_1" storage[FILE_1] = b"hello world" assert storage[FILE_1] == b"hello world" diff --git a/hub/util/exceptions.py b/hub/util/exceptions.py index 9d396732ba..276f6125ee 100644 --- a/hub/util/exceptions.py +++ b/hub/util/exceptions.py @@ -432,8 +432,9 @@ def __init__(self): f"Python Shared memory with multiprocessing doesn't work properly on Windows." ) + class MemoryDatasetCanNotBePickledError(Exception): def __init__(self): super().__init__( "Dataset having MemoryProvider as underlying storage should not be pickled as data won't be saved." 
- ) \ No newline at end of file + ) From 4fdf9c7148386436b16624bded835a7fa4bc1d9d Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Fri, 16 Jul 2021 11:46:29 +0530 Subject: [PATCH 06/13] moved pickle test, refactored readonly logic --- hub/api/dataset.py | 6 +---- hub/api/tests/test_api.py | 41 ------------------------------- hub/api/tests/test_pickle.py | 46 +++++++++++++++++++++++++++++++++++ hub/core/storage/lru_cache.py | 32 +++++++++++++++--------- hub/core/storage/provider.py | 5 ++-- hub/core/storage/s3.py | 6 ++--- 6 files changed, 72 insertions(+), 64 deletions(-) create mode 100644 hub/api/tests/test_pickle.py diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 206c22e7a6..a72d8d15b8 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -74,11 +74,7 @@ def __init__( creds = {} base_storage = get_storage_provider(path, storage, read_only, creds, token) - # done instead of directly assigning read_only as backend might return read_only permissions - if hasattr(base_storage, "read_only") and base_storage.read_only: - self._read_only = True - else: - self._read_only = False + self._read_only = base_storage.read_only # uniquely identifies dataset self.path = path or get_path_from_storage(base_storage) diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index f4f3bc7cee..22f806158f 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -504,44 +504,3 @@ def test_empty_dataset(): ds.create_tensor("z") ds = Dataset("test") assert list(ds.tensors) == ["x", "y", "z"] - - -@parametrize_all_dataset_storages -def test_dataset_pickling(ds): - if ds.path.startswith("mem://"): - with pytest.raises(MemoryDatasetCanNotBePickledError): - pickle.dumps(ds) - return - - with ds: - ds.create_tensor("image", htype="image", sample_compression="jpeg") - ds.create_tensor("label") - for i in range(10): - ds.image.append( - i * np.ones(((i + 1) * 20, (i + 1) * 20, 3), dtype=np.uint8) - ) - - for i in range(5): - ds.label.append(i) - - pickled_ds = pickle.dumps(ds) - unpickled_ds = pickle.loads(pickled_ds) - assert len(unpickled_ds.image) == len(ds.image) - assert len(unpickled_ds.label) == len(ds.label) - assert unpickled_ds.tensors.keys() == ds.tensors.keys() - assert unpickled_ds.index.values[0].value == ds.index.values[0].value - assert unpickled_ds.meta.version == ds.meta.version - - for i in range(10): - np.testing.assert_array_equal( - ds.image[i].numpy(), - (i * np.ones(((i + 1) * 20, (i + 1) * 20, 3), dtype=np.uint8)), - ) - np.testing.assert_array_equal( - ds.image[i].numpy(), unpickled_ds.image[i].numpy() - ) - for i in range(5): - np.testing.assert_array_equal(ds.label[i].numpy(), i) - np.testing.assert_array_equal( - ds.label[i].numpy(), unpickled_ds.label[i].numpy() - ) diff --git a/hub/api/tests/test_pickle.py b/hub/api/tests/test_pickle.py new file mode 100644 index 0000000000..11c33b3807 --- /dev/null +++ b/hub/api/tests/test_pickle.py @@ -0,0 +1,46 @@ +import numpy as np +import pytest +from hub.core.tests.common import parametrize_all_dataset_storages +from hub.util.exceptions import MemoryDatasetCanNotBePickledError +import pickle + + +@parametrize_all_dataset_storages +def test_dataset(ds): + if ds.path.startswith("mem://"): + with pytest.raises(MemoryDatasetCanNotBePickledError): + pickle.dumps(ds) + return + + with ds: + ds.create_tensor("image", htype="image", sample_compression="jpeg") + ds.create_tensor("label") + for i in range(10): + ds.image.append( + i * np.ones(((i + 1) * 20, (i + 1) * 20, 3), dtype=np.uint8) + ) + + for i in 
range(5):
+            ds.label.append(i)
+
+    pickled_ds = pickle.dumps(ds)
+    unpickled_ds = pickle.loads(pickled_ds)
+    assert len(unpickled_ds.image) == len(ds.image)
+    assert len(unpickled_ds.label) == len(ds.label)
+    assert unpickled_ds.tensors.keys() == ds.tensors.keys()
+    assert unpickled_ds.index.values[0].value == ds.index.values[0].value
+    assert unpickled_ds.meta.version == ds.meta.version
+
+    for i in range(10):
+        np.testing.assert_array_equal(
+            ds.image[i].numpy(),
+            (i * np.ones(((i + 1) * 20, (i + 1) * 20, 3), dtype=np.uint8)),
+        )
+        np.testing.assert_array_equal(
+            ds.image[i].numpy(), unpickled_ds.image[i].numpy()
+        )
+    for i in range(5):
+        np.testing.assert_array_equal(ds.label[i].numpy(), i)
+        np.testing.assert_array_equal(
+            ds.label[i].numpy(), unpickled_ds.label[i].numpy()
+        )
diff --git a/hub/core/storage/lru_cache.py b/hub/core/storage/lru_cache.py
index 513d119239..b8abd205ba 100644
--- a/hub/core/storage/lru_cache.py
+++ b/hub/core/storage/lru_cache.py
@@ -1,6 +1,6 @@
 from collections import OrderedDict
 from hub.core.storage.cachable import Cachable
-from typing import Callable, Set, Union
+from typing import Any, Dict, Set, Union
 
 from hub.core.storage.provider import StorageProvider
 
@@ -161,11 +161,10 @@ def __delitem__(self, path: str):
         self.maybe_flush()
 
     def clear_cache(self):
-        """Flushes the content of the cache and and then deletes contents of all the layers of it.
+        """Flushes the content of the cache if not in read-only mode and then deletes the contents of all its layers.
         This doesn't delete data from the actual storage.
         """
-        self.check_readonly()
-        self.flush()
+        self._flush_if_not_read_only()
         self.cache_used = 0
         self.lru_sizes.clear()
         self.dirty_keys.clear()
@@ -271,27 +270,36 @@ def _list_keys(self):
             all_keys.add(key)
         return list(all_keys)
 
-    def __getstate__(self):
+    def _flush_if_not_read_only(self):
+        """Flushes the cache if not in read-only mode."""
+        if not self.read_only:
+            self.flush()
+
+    def __getstate__(self) -> Dict[str, Any]:
         """Returns the state of the cache, for pickling"""
 
         # flushes the cache before pickling
-        if not hasattr(self, "read_only") or not self.read_only:
-            self.flush()
+        self._flush_if_not_read_only()
+
         return {
             "next_storage": self.next_storage,
             "cache_storage": self.cache_storage,
             "cache_size": self.cache_size,
         }
 
-    def __setstate__(self, state):
+    def __setstate__(self, state: Dict[str, Any]):
         """Recreates a cache with the same configuration as the state.
 
-        PS: While restoring the cache, we reset its contents.
-        In case the cache storage was local/s3 and is still accessible when unpickled (if same machine/s3 creds present respectively), the earlier cache contents are no longer accessible.
+        Args:
+            state (dict): The state to be used to recreate the cache.
 
-        TODO: We might want to change this behaviour in the future by having a separate file that keeps a track of the lru order for restoring the cache.
-        This would also allow the cache to persist across different Dataset objects pointing to the same dataset.
+        Note:
+            While restoring the cache, we reset its contents.
+            If the cache storage was local/S3 and is still accessible when unpickled (same machine or S3 credentials present, respectively), the earlier cache contents are no longer accessible.
         """
+
+        # TODO: We might want to change this behaviour in the future by having a separate file that keeps track of the LRU order for restoring the cache.
+        # This would also allow the cache to persist across different Dataset objects pointing to the same dataset.
self.next_storage = state["next_storage"] self.cache_storage = state["cache_storage"] self.cache_size = state["cache_size"] diff --git a/hub/core/storage/provider.py b/hub/core/storage/provider.py index 9576be33ad..125985d771 100644 --- a/hub/core/storage/provider.py +++ b/hub/core/storage/provider.py @@ -10,6 +10,7 @@ class StorageProvider(ABC, MutableMapping): autoflush = False + read_only = False """An abstract base class for implementing a storage provider. @@ -137,7 +138,7 @@ def disable_readonly(self): def check_readonly(self): """Raises an exception if the provider is in read-only mode.""" - if hasattr(self, "read_only") and self.read_only: + if self.read_only: raise ReadOnlyModeError() def flush(self): @@ -150,7 +151,7 @@ def maybe_flush(self): """Flush cache if autoflush has been enabled. Called at the end of methods which write data, to ensure consistency as a default. """ - if hasattr(self, "autoflush") and self.autoflush: + if self.autoflush: self.flush() @abstractmethod diff --git a/hub/core/storage/s3.py b/hub/core/storage/s3.py index fbeae6c6f4..41a6d7f9cf 100644 --- a/hub/core/storage/s3.py +++ b/hub/core/storage/s3.py @@ -252,10 +252,8 @@ def _check_update_creds(self): client = HubBackendClient(self.token) org_id, ds_name = self.tag.split("/") - if hasattr(self, "read_only") and self.read_only: - mode = "r" - else: - mode = "a" + mode = "r" if self.read_only else "a" + url, creds, mode, expiration = client.get_dataset_credentials( org_id, ds_name, mode ) From 169323560e57bb026827515e1bb2852d885a1116 Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Fri, 16 Jul 2021 11:50:58 +0530 Subject: [PATCH 07/13] added typing to state methods in dataset --- hub/api/dataset.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index a72d8d15b8..a62b811fbd 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -1,6 +1,6 @@ from hub.core.storage.provider import StorageProvider from hub.core.tensor import create_tensor -from typing import Callable, Dict, Optional, Union, Tuple, List, Sequence +from typing import Any, Callable, Dict, Optional, Union, Tuple, List, Sequence from hub.constants import DEFAULT_HTYPE, UNSPECIFIED import numpy as np @@ -115,7 +115,7 @@ def __len__(self): tensor_lengths = [len(tensor[self.index]) for tensor in self.tensors.values()] return min(tensor_lengths, default=0) - def __getstate__(self): + def __getstate__(self) -> Dict[str, Any]: """Returns a dict that can be pickled and used to restore this dataset. PS: Pickling a dataset does not copy the dataset, it only saves attributes that can be used to restore the dataset. @@ -132,8 +132,12 @@ def __getstate__(self): "_token": self.token, } - def __setstate__(self, state): - """Restores dataset from a pickled state.""" + def __setstate__(self, state: Dict[str, Any]): + """Restores dataset from a pickled state. + + Args: + state (dict): The pickled state used to restore the dataset. 
+ """ self.__dict__.update(state) self.tensors = {} self._init_helper() From 5c60536ede17b0cdb0b9a092a25e5acd9ce9d335 Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Fri, 16 Jul 2021 12:02:17 +0530 Subject: [PATCH 08/13] modified doctring --- hub/api/dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index a62b811fbd..9de4f1ca13 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -118,7 +118,8 @@ def __len__(self): def __getstate__(self) -> Dict[str, Any]: """Returns a dict that can be pickled and used to restore this dataset. - PS: Pickling a dataset does not copy the dataset, it only saves attributes that can be used to restore the dataset. + Note: + Pickling a dataset does not copy the dataset, it only saves attributes that can be used to restore the dataset. If you pickle a local dataset and try to access it on a machine that does not have the data present, the dataset will not work. """ if self.path.startswith("mem://"): From 41e20a6271b655783e99ceebbe929de4223faad7 Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Fri, 16 Jul 2021 12:09:44 +0530 Subject: [PATCH 09/13] update pickle test after merge --- hub/api/dataset.py | 2 +- hub/api/tests/test_pickle.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 6c356cef21..2bee27e8ed 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -120,7 +120,7 @@ def __len__(self): def __getstate__(self) -> Dict[str, Any]: """Returns a dict that can be pickled and used to restore this dataset. - Note: + Note: Pickling a dataset does not copy the dataset, it only saves attributes that can be used to restore the dataset. If you pickle a local dataset and try to access it on a machine that does not have the data present, the dataset will not work. 
""" diff --git a/hub/api/tests/test_pickle.py b/hub/api/tests/test_pickle.py index 11c33b3807..bda1fbc13b 100644 --- a/hub/api/tests/test_pickle.py +++ b/hub/api/tests/test_pickle.py @@ -1,11 +1,11 @@ import numpy as np import pytest -from hub.core.tests.common import parametrize_all_dataset_storages from hub.util.exceptions import MemoryDatasetCanNotBePickledError import pickle +from hub.tests.dataset_fixtures import enabled_datasets -@parametrize_all_dataset_storages +@enabled_datasets def test_dataset(ds): if ds.path.startswith("mem://"): with pytest.raises(MemoryDatasetCanNotBePickledError): From 96a04587e9d7ffa959c0e6a873924e8c25a5001c Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Fri, 16 Jul 2021 12:39:17 +0530 Subject: [PATCH 10/13] added fixture for persistent storages --- hub/core/storage/tests/test_storage_provider.py | 9 ++------- hub/tests/storage_fixtures.py | 6 ++++++ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/hub/core/storage/tests/test_storage_provider.py b/hub/core/storage/tests/test_storage_provider.py index adacd0a8c3..23fe2e943d 100644 --- a/hub/core/storage/tests/test_storage_provider.py +++ b/hub/core/storage/tests/test_storage_provider.py @@ -1,5 +1,4 @@ -from hub.core.storage.memory import MemoryProvider -from hub.tests.storage_fixtures import enabled_storages +from hub.tests.storage_fixtures import enabled_storages, enabled_persistent_storages from hub.tests.cache_fixtures import enabled_cache_chains import pytest from hub.constants import MB @@ -125,12 +124,8 @@ def test_cache(cache_chain): check_cache(cache_chain) -@enabled_storages +@enabled_persistent_storages def test_pickling(storage): - if isinstance(storage, MemoryProvider): - # skip pickling test for memory provider as the actual data isn't pickled for it - return - FILE_1 = f"{KEY}_1" storage[FILE_1] = b"hello world" assert storage[FILE_1] == b"hello world" diff --git a/hub/tests/storage_fixtures.py b/hub/tests/storage_fixtures.py index d685b492b3..6e2c864d2d 100644 --- a/hub/tests/storage_fixtures.py +++ b/hub/tests/storage_fixtures.py @@ -11,6 +11,12 @@ indirect=True, ) +enabled_persistent_storages = pytest.mark.parametrize( + "storage", + ["local_storage", "s3_storage", "hub_cloud_storage"], + indirect=True, +) + @pytest.fixture def memory_storage(memory_path): From 088466ed8ed38db979d15a851d74f9541f431654 Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Fri, 16 Jul 2021 12:54:45 +0530 Subject: [PATCH 11/13] added typing in memoryprovider --- hub/core/storage/memory.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hub/core/storage/memory.py b/hub/core/storage/memory.py index 109d5f61a0..2e02d59bd3 100644 --- a/hub/core/storage/memory.py +++ b/hub/core/storage/memory.py @@ -4,7 +4,7 @@ class MemoryProvider(StorageProvider): """Provider class for using the memory.""" - def __init__(self, root=""): + def __init__(self, root: str = ""): self.dict = {} self.root = root @@ -97,9 +97,9 @@ def clear(self): self.check_readonly() self.dict = {} - def __getstate__(self): + def __getstate__(self) -> str: """Does NOT save the in memory data in state.""" return self.root - def __setstate__(self, state): + def __setstate__(self, state: str): self.__init__(root=state) From 53b58adf46f00c001a5453d039a4e9095b45112d Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Fri, 16 Jul 2021 13:13:03 +0530 Subject: [PATCH 12/13] lint fix --- hub/core/storage/memory.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hub/core/storage/memory.py 
b/hub/core/storage/memory.py index 2e02d59bd3..8ae91fd840 100644 --- a/hub/core/storage/memory.py +++ b/hub/core/storage/memory.py @@ -1,3 +1,4 @@ +from typing import Any, Dict from hub.core.storage.provider import StorageProvider @@ -5,7 +6,7 @@ class MemoryProvider(StorageProvider): """Provider class for using the memory.""" def __init__(self, root: str = ""): - self.dict = {} + self.dict: Dict[str, Any] = {} self.root = root def __getitem__( @@ -102,4 +103,4 @@ def __getstate__(self) -> str: return self.root def __setstate__(self, state: str): - self.__init__(root=state) + self.__init__(root=state) # type: ignore From 0f32d94098775f83b0cdba28a453d49b0584ad79 Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Tue, 20 Jul 2021 11:48:04 +0530 Subject: [PATCH 13/13] renamed helper function --- hub/api/dataset.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 2bee27e8ed..1081d5052f 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -89,20 +89,7 @@ def __init__( self.tensors: Dict[str, Tensor] = {} self._token = token self.public = public - self._init_helper() - - def _init_helper(self): - self.storage.autoflush = True - if self.path.startswith("hub://"): - split_path = self.path.split("/") - self.org_id, self.ds_name = split_path[2], split_path[3] - self.client = HubBackendClient(token=self._token) - - self._load_meta() - - hub_reporter.feature_report( - feature_name="Dataset", parameters={"Path": str(self.path)} - ) + self._set_derived_attributes() def __enter__(self): self.storage.autoflush = False @@ -143,7 +130,7 @@ def __setstate__(self, state: Dict[str, Any]): """ self.__dict__.update(state) self.tensors = {} - self._init_helper() + self._set_derived_attributes() def __getitem__( self, @@ -323,6 +310,20 @@ def _get_total_meta(self): for tensor_key, tensor_value in self.tensors.items() } + def _set_derived_attributes(self): + """Sets derived attributes during init and unpickling.""" + self.storage.autoflush = True + if self.path.startswith("hub://"): + split_path = self.path.split("/") + self.org_id, self.ds_name = split_path[2], split_path[3] + self.client = HubBackendClient(token=self._token) + + self._load_meta() + + hub_reporter.feature_report( + feature_name="Dataset", parameters={"Path": str(self.path)} + ) + def tensorflow(self): """Converts the dataset into a tensorflow compatible format.
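
Taken together, the series makes Dataset, LRUCache, and the storage providers picklable, which is what allows a dataset to be handed to worker processes. A minimal usage sketch of the behaviour added above (not part of the patches themselves; the local path is illustrative, and the top-level "from hub import Dataset" import is assumed):

    import pickle

    import numpy as np
    from hub import Dataset  # assumed public import; the class is defined in hub/api/dataset.py

    ds = Dataset("/tmp/pickle_demo")  # hypothetical local path
    ds.create_tensor("label")
    ds.label.append(1)

    # Dataset.__getstate__ stores only what is needed to reattach (path, index,
    # read_only, public, storage, token) -- not the samples -- and
    # LRUCache.__getstate__ flushes pending writes first, so the clone below
    # reconnects to the same underlying storage and can read the data back.
    clone = pickle.loads(pickle.dumps(ds))
    np.testing.assert_array_equal(clone.label[0].numpy(), ds.label[0].numpy())

    # A "mem://" dataset refuses to pickle, since its data would not survive:
    # pickle.dumps(Dataset("mem://demo")) raises MemoryDatasetCanNotBePickledError.

As the docstrings above note, unpickling on a machine that cannot reach the underlying storage yields a dataset that will not work.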