From 8f484f6d0abf56e50d5ac43fdbd7f3eb85db695a Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sat, 10 Jul 2021 22:11:37 +0530 Subject: [PATCH 1/4] fix tensor create --- hub/api/dataset.py | 8 ++++---- hub/api/tests/test_api.py | 11 +++++++++++ hub/core/storage/lru_cache.py | 8 +++++--- hub/core/storage/provider.py | 4 ++-- hub/core/tensor.py | 1 + 5 files changed, 23 insertions(+), 9 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 4ec6afcaa4..16a13e54a8 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -1,6 +1,6 @@ from hub.core.tensor import create_tensor from hub.constants import DEFAULT_HTYPE -from typing import Callable, Dict, Optional, Union, Tuple, List +from typing import Callable, Dict, Optional, Union, Tuple, List, Sequence import numpy as np from hub.api.tensor import Tensor @@ -196,7 +196,7 @@ def create_tensor( self.tensors[name] = tensor self.meta.tensors.append(name) - + self.flush([get_dataset_meta_key()]) return tensor __getattr__ = __getitem__ @@ -311,13 +311,13 @@ def tensorflow(self): """ return dataset_to_tensorflow(self) - def flush(self): + def flush(self, keys: Optional[Sequence[str]] = None): """Necessary operation after writes if caches are being used. Writes all the dirty data from the cache layers (if any) to the underlying storage. Here dirty data corresponds to data that has been changed/assigned and but hasn't yet been sent to the underlying storage. """ - self.storage.flush() + self.storage.flush(keys) def clear_cache(self): """Flushes (see Dataset.flush documentation) the contents of the cache layers (if any) and then deletes contents diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 5e7e9399e0..6530362969 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -10,6 +10,7 @@ from hub.util.exceptions import TensorDtypeMismatchError, TensorInvalidSampleShapeError from hub.client.client import HubBackendClient from hub.client.utils import has_hub_testing_creds +from click.testing import CliRunner # need this for 32-bit and 64-bit systems to have correct tests @@ -464,3 +465,13 @@ def test_hub_cloud_dataset(): np.testing.assert_array_equal(ds.image[i].numpy(), i * np.ones((100, 100))) ds.delete() + + +def test_empty_dataset(): + with CliRunner().isolated_filesystem(): + ds = Dataset("test") + ds.create_tensor("x") + ds.create_tensor("y") + ds.create_tensor("z") + ds = Dataset("test") + assert list(ds.tensors) == ["x", "y", "z"] diff --git a/hub/core/storage/lru_cache.py b/hub/core/storage/lru_cache.py index b957f62911..95979f24db 100644 --- a/hub/core/storage/lru_cache.py +++ b/hub/core/storage/lru_cache.py @@ -1,6 +1,6 @@ from collections import OrderedDict from hub.core.storage.cachable import Cachable -from typing import Callable, Set, Union +from typing import Callable, Set, Union, Optional, Sequence from hub.core.storage.provider import StorageProvider @@ -37,12 +37,14 @@ def __init__( self.dirty_keys: Set[str] = set() # keys present in cache but not next_storage self.cache_used = 0 - def flush(self): + def flush(self, keys: Optional[Sequence[str]] = None): """Writes data from cache_storage to next_storage. Only the dirty keys are written. This is a cascading function and leads to data being written to the final storage in case of a chained cache. """ + if not keys: + keys = self.dirty_keys.copy() # type: ignore self.check_readonly() - for key in self.dirty_keys.copy(): + for key in keys: # type: ignore self._forward(key) self.next_storage.flush() diff --git a/hub/core/storage/provider.py b/hub/core/storage/provider.py index 9576be33ad..cdbaf9ddba 100644 --- a/hub/core/storage/provider.py +++ b/hub/core/storage/provider.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod from collections.abc import MutableMapping from hub.core.storage.cachable import Cachable -from typing import Optional +from typing import Optional, Sequence from hub.constants import BYTE_PADDING from hub.util.assert_byte_indexes import assert_byte_indexes @@ -140,7 +140,7 @@ def check_readonly(self): if hasattr(self, "read_only") and self.read_only: raise ReadOnlyModeError() - def flush(self): + def flush(self, keys: Optional[Sequence[str]] = None): """Only needs to be implemented for caches. Flushes the data to the next storage provider. Should be a no op for Base Storage Providers like local, s3, azure, gcs, etc. """ diff --git a/hub/core/tensor.py b/hub/core/tensor.py index bc5858c567..75884253d9 100644 --- a/hub/core/tensor.py +++ b/hub/core/tensor.py @@ -43,3 +43,4 @@ def create_tensor( **kwargs, ) storage[meta_key] = meta # type: ignore + storage.flush() From dbfbb58680d89239fc0ac2012b67dbab3caf4955 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sat, 10 Jul 2021 22:13:53 +0530 Subject: [PATCH 2/4] rem mad flush --- hub/core/tensor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/hub/core/tensor.py b/hub/core/tensor.py index 75884253d9..bc5858c567 100644 --- a/hub/core/tensor.py +++ b/hub/core/tensor.py @@ -43,4 +43,3 @@ def create_tensor( **kwargs, ) storage[meta_key] = meta # type: ignore - storage.flush() From 60f30ff7483c31efb44e99b1769641826b78ef3e Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Sat, 10 Jul 2021 22:30:27 +0530 Subject: [PATCH 3/4] smol fix --- hub/core/tensor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hub/core/tensor.py b/hub/core/tensor.py index bc5858c567..e82bd0531a 100644 --- a/hub/core/tensor.py +++ b/hub/core/tensor.py @@ -43,3 +43,4 @@ def create_tensor( **kwargs, ) storage[meta_key] = meta # type: ignore + storage.flush([meta_key]) From ba446b01dd15bd42b1b0bff027e539094684e2bd Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 12 Jul 2021 15:47:02 +0530 Subject: [PATCH 4/4] simplify --- hub/api/dataset.py | 10 +++++----- hub/core/storage/lru_cache.py | 8 +++----- hub/core/storage/provider.py | 4 ++-- hub/core/tensor.py | 1 - 4 files changed, 10 insertions(+), 13 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 16a13e54a8..dabd61e6ee 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -1,6 +1,6 @@ from hub.core.tensor import create_tensor from hub.constants import DEFAULT_HTYPE -from typing import Callable, Dict, Optional, Union, Tuple, List, Sequence +from typing import Callable, Dict, Optional, Union, Tuple, List import numpy as np from hub.api.tensor import Tensor @@ -182,6 +182,7 @@ def create_tensor( if tensor_exists(name, self.storage): raise TensorAlreadyExistsError(name) + self.meta.tensors.append(name) create_tensor( name, self.storage, @@ -195,8 +196,7 @@ def create_tensor( tensor = Tensor(name, self.storage) # type: ignore self.tensors[name] = tensor - self.meta.tensors.append(name) - self.flush([get_dataset_meta_key()]) + return tensor __getattr__ = __getitem__ @@ -311,13 +311,13 @@ def tensorflow(self): """ return dataset_to_tensorflow(self) - def flush(self, keys: Optional[Sequence[str]] = None): + def flush(self): """Necessary operation after writes if caches are being used. Writes all the dirty data from the cache layers (if any) to the underlying storage. Here dirty data corresponds to data that has been changed/assigned and but hasn't yet been sent to the underlying storage. """ - self.storage.flush(keys) + self.storage.flush() def clear_cache(self): """Flushes (see Dataset.flush documentation) the contents of the cache layers (if any) and then deletes contents diff --git a/hub/core/storage/lru_cache.py b/hub/core/storage/lru_cache.py index 95979f24db..b957f62911 100644 --- a/hub/core/storage/lru_cache.py +++ b/hub/core/storage/lru_cache.py @@ -1,6 +1,6 @@ from collections import OrderedDict from hub.core.storage.cachable import Cachable -from typing import Callable, Set, Union, Optional, Sequence +from typing import Callable, Set, Union from hub.core.storage.provider import StorageProvider @@ -37,14 +37,12 @@ def __init__( self.dirty_keys: Set[str] = set() # keys present in cache but not next_storage self.cache_used = 0 - def flush(self, keys: Optional[Sequence[str]] = None): + def flush(self): """Writes data from cache_storage to next_storage. Only the dirty keys are written. This is a cascading function and leads to data being written to the final storage in case of a chained cache. """ - if not keys: - keys = self.dirty_keys.copy() # type: ignore self.check_readonly() - for key in keys: # type: ignore + for key in self.dirty_keys.copy(): self._forward(key) self.next_storage.flush() diff --git a/hub/core/storage/provider.py b/hub/core/storage/provider.py index cdbaf9ddba..9576be33ad 100644 --- a/hub/core/storage/provider.py +++ b/hub/core/storage/provider.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod from collections.abc import MutableMapping from hub.core.storage.cachable import Cachable -from typing import Optional, Sequence +from typing import Optional from hub.constants import BYTE_PADDING from hub.util.assert_byte_indexes import assert_byte_indexes @@ -140,7 +140,7 @@ def check_readonly(self): if hasattr(self, "read_only") and self.read_only: raise ReadOnlyModeError() - def flush(self, keys: Optional[Sequence[str]] = None): + def flush(self): """Only needs to be implemented for caches. Flushes the data to the next storage provider. Should be a no op for Base Storage Providers like local, s3, azure, gcs, etc. """ diff --git a/hub/core/tensor.py b/hub/core/tensor.py index e82bd0531a..bc5858c567 100644 --- a/hub/core/tensor.py +++ b/hub/core/tensor.py @@ -43,4 +43,3 @@ def create_tensor( **kwargs, ) storage[meta_key] = meta # type: ignore - storage.flush([meta_key])