Adding back transforms for parallel dataset uploads #1086

Merged: 47 commits merged on Jul 31, 2021
Changes from 32 commits
Commits (47)
9fb5068
restore old transform code
AbhinavTuli Jul 13, 2021
c17a365
better checks on ds_out
AbhinavTuli Jul 13, 2021
f386183
Merge remote-tracking branch 'origin/main' into update/2.0/transforms
AbhinavTuli Jul 16, 2021
2373cf7
Merge remote-tracking branch 'origin/main' into update/2.0/transforms
AbhinavTuli Jul 21, 2021
d8ae83e
updated transform code
AbhinavTuli Jul 22, 2021
4e58950
Merge remote-tracking branch 'origin/main' into update/2.0/transforms
AbhinavTuli Jul 22, 2021
e670495
threaded transforms work, processed fails
AbhinavTuli Jul 26, 2021
d6b36e4
multiprocessing fixed
AbhinavTuli Jul 26, 2021
d8c0c1f
removed logs on dataset slcing
AbhinavTuli Jul 26, 2021
2819456
moved chunk engine initialization into store_shard
AbhinavTuli Jul 26, 2021
3245098
cache TODO
AbhinavTuli Jul 26, 2021
8fb90d3
added shardwise cache
AbhinavTuli Jul 26, 2021
9108e12
added TODO for complexity
AbhinavTuli Jul 26, 2021
5b28118
new transform api works, sample_out left
AbhinavTuli Jul 26, 2021
92b2e2c
removed commented out code
AbhinavTuli Jul 26, 2021
9d25165
update docstring
AbhinavTuli Jul 26, 2021
69d8f8c
samples_out syntax works
AbhinavTuli Jul 27, 2021
3c3e21e
removes old test
AbhinavTuli Jul 27, 2021
3abfa96
fix pickling
AbhinavTuli Jul 27, 2021
3a68296
removed commented out code, removed memcopies
AbhinavTuli Jul 27, 2021
51f97ba
renamed hub.parallel to hub.compute
AbhinavTuli Jul 27, 2021
11e2805
fixes transform issues with hub.read
AbhinavTuli Jul 28, 2021
0e936aa
change from numpy->numpy_compressed in transforms
AbhinavTuli Jul 28, 2021
22be630
Merge remote-tracking branch 'origin/main' into update/2.0/transforms
AbhinavTuli Jul 29, 2021
f9714b7
fix transforms after merge
AbhinavTuli Jul 29, 2021
f7b85fc
lint fixes
AbhinavTuli Jul 29, 2021
b3f9a53
lint fixes
AbhinavTuli Jul 29, 2021
d0ab673
refactors, workers changed to num_workers
AbhinavTuli Jul 29, 2021
378ca00
refactoring transform code
AbhinavTuli Jul 29, 2021
cb8ec3b
more refactors
AbhinavTuli Jul 29, 2021
00cce12
docstrings updated
AbhinavTuli Jul 29, 2021
a65d813
lint fix
AbhinavTuli Jul 29, 2021
4f8abb2
comments and nitpicks
AbhinavTuli Jul 30, 2021
98b1b38
added serial provider
AbhinavTuli Jul 30, 2021
0032e30
renames log_loading to verbose
AbhinavTuli Jul 30, 2021
36a1c2f
fixed serial provider
AbhinavTuli Jul 30, 2021
5389fda
updating exceptions
AbhinavTuli Jul 30, 2021
25c7dd0
mem_cache->meta_cache in chunk_engine
AbhinavTuli Jul 30, 2021
2db2edc
update hub compose exception
AbhinavTuli Jul 30, 2021
6ee893a
rename TransformDatasetShard->TransformDataset, TransformDatasetTenso…
AbhinavTuli Jul 30, 2021
faefc28
improved exception
AbhinavTuli Jul 30, 2021
42e4de7
paremetrized test
AbhinavTuli Jul 30, 2021
66e4f6c
lint fix
AbhinavTuli Jul 30, 2021
390649b
added test for hub like
AbhinavTuli Jul 30, 2021
ac346c7
shifted util code into encoder.py
AbhinavTuli Jul 30, 2021
b473153
lint fix encoder
AbhinavTuli Jul 30, 2021
0b3e255
simplified chunk_id merge
AbhinavTuli Jul 30, 2021
13 changes: 12 additions & 1 deletion hub/__init__.py
@@ -12,12 +12,23 @@

from .api.dataset import dataset
from .api.read import read
from .core.transform import compute, compose
from .util.bugout_reporter import hub_reporter

load = dataset.load
empty = dataset.empty
like = dataset.like
__all__ = ["dataset", "read", "__version__", "load", "empty", "like"]

__all__ = [
"dataset",
"read",
"__version__",
"load",
"empty",
"compute",
"compose",
"like",
]

__version__ = "2.0.2"
__encoded_version__ = np.array(__version__)
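The hunk above re-exports the new transform entry points at the package root, so the decorator API can be reached as `hub.compute` and `hub.compose`. A minimal sketch of the single-function usage this enables; the tensor name, dataset paths and the exact `eval` arguments are illustrative assumptions, not part of this diff:

import numpy as np
import hub

@hub.compute
def flip(sample_in, samples_out):
    # Append one transformed sample to the output dataset's "image" tensor.
    samples_out.image.append(np.flip(sample_in.image.numpy(), axis=0))

ds_in = hub.dataset("./source_ds")     # assumed to already contain an "image" tensor
ds_out = hub.dataset("./flipped_ds")
ds_out.create_tensor("image")

# Evaluate the transform over ds_in in parallel and write the results to ds_out.
flip().eval(ds_in, ds_out, num_workers=2)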
30 changes: 22 additions & 8 deletions hub/core/chunk_engine.py
@@ -45,7 +45,11 @@ def is_uniform_sequence(samples):

class ChunkEngine:
def __init__(
self, key: str, cache: LRUCache, max_chunk_size: int = DEFAULT_MAX_CHUNK_SIZE
self,
key: str,
cache: LRUCache,
max_chunk_size: int = DEFAULT_MAX_CHUNK_SIZE,
memory_cache: LRUCache = None,
):
"""Handles creating `Chunk`s and filling them with incoming samples.

@@ -98,6 +102,7 @@ def __init__(
key (str): Tensor key.
cache (LRUCache): Cache for which chunks and the metadata are stored.
max_chunk_size (int): Chunks generated by this instance will never exceed this size. Defaults to DEFAULT_MAX_CHUNK_SIZE.
memory_cache (LRUCache): Cache used for storing non chunk data such as tensor meta and chunk id encoder during transforms in memory.

Raises:
ValueError: If invalid max chunk size.
@@ -113,6 +118,7 @@

# only the last chunk may be less than this
self.min_chunk_size = self.max_chunk_size // 2
self.mem_cache = memory_cache

@property
def chunk_id_encoder(self) -> ChunkIdEncoder:
@@ -126,6 +132,7 @@ def chunk_id_encoder(self) -> ChunkIdEncoder:
ChunkIdEncoder: The chunk ID encoder handles the mapping between sample indices
and their corresponding chunks.
"""
cache = self.mem_cache or self.cache

key = get_chunk_id_encoder_key(self.key)
if not self.chunk_id_encoder_exists:
Expand All @@ -137,15 +144,21 @@ def chunk_id_encoder(self) -> ChunkIdEncoder:
)

enc = ChunkIdEncoder()
self.cache[key] = enc
cache[key] = enc
return enc

enc = self.cache.get_cachable(key, ChunkIdEncoder)
enc = cache.get_cachable(key, ChunkIdEncoder)
return enc

@property
def chunk_id_encoder_exists(self) -> bool:
return get_chunk_id_encoder_key(self.key) in self.cache
cache = self.mem_cache or self.cache
try:
key = get_chunk_id_encoder_key(self.key)
cache[key]
return True
except KeyError:
return False

@property
def num_chunks(self) -> int:
@@ -174,8 +187,9 @@ def last_chunk_key(self) -> str:

@property
def tensor_meta(self):
cache = self.mem_cache or self.cache
tensor_meta_key = get_tensor_meta_key(self.key)
return self.cache.get_cachable(tensor_meta_key, TensorMeta)
return cache.get_cachable(tensor_meta_key, TensorMeta)

def _append_bytes(self, buffer: memoryview, shape: Tuple[int], dtype: np.dtype):
"""Treat `buffer` as a single sample and place them into `Chunk`s. This function implements the algorithm for
@@ -209,19 +223,19 @@ def _synchronize_cache(self):

# TODO implement tests for cache size compute
# TODO: optimize this by storing all of these keys in the chunk engine's state (posixpath.joins are pretty slow)

cache = self.mem_cache or self.cache
# synchronize last chunk
last_chunk_key = self.last_chunk_key
last_chunk = self.last_chunk
self.cache.update_used_cache_for_path(last_chunk_key, last_chunk.nbytes) # type: ignore

# synchronize tensor meta
tensor_meta_key = get_tensor_meta_key(self.key)
self.cache[tensor_meta_key] = self.tensor_meta
cache[tensor_meta_key] = self.tensor_meta

# synchronize chunk ID encoder
chunk_id_key = get_chunk_id_encoder_key(self.key)
self.cache[chunk_id_key] = self.chunk_id_encoder
cache[chunk_id_key] = self.chunk_id_encoder

def _try_appending_to_last_chunk(
self, buffer: memoryview, shape: Tuple[int]
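Across the hunks above the engine now resolves its metadata store with `cache = self.mem_cache or self.cache`: when a transform worker supplies an in-memory `memory_cache`, the tensor meta and chunk ID encoder are read and written there instead of the storage-backed cache, and only merged back later. A toy sketch of that fallback pattern; the classes below are illustrative stand-ins, not Hub's real `LRUCache` or `ChunkEngine`:

from typing import Optional

class Cache(dict):
    """Toy stand-in for hub's LRUCache; always truthy, so the `or` fallback
    below depends only on whether a cache object was supplied at all."""
    def __bool__(self) -> bool:
        return True

class MetaStore:
    """Illustrates the `cache = self.mem_cache or self.cache` pattern."""
    def __init__(self, cache: Cache, mem_cache: Optional[Cache] = None):
        self.cache = cache          # storage-backed cache (normal operation)
        self.mem_cache = mem_cache  # in-memory cache passed in by transform workers

    def _meta_cache(self) -> Cache:
        return self.mem_cache or self.cache

    def write_meta(self, key: str, value: bytes) -> None:
        self._meta_cache()[key] = value

    def meta_exists(self, key: str) -> bool:
        try:
            self._meta_cache()[key]
            return True
        except KeyError:
            return False

worker_store = MetaStore(cache=Cache(), mem_cache=Cache())  # metadata stays in memory
normal_store = MetaStore(cache=Cache())                     # metadata goes to the main cache
assert worker_store._meta_cache() is worker_store.mem_cache
assert normal_store._meta_cache() is normal_store.cache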
8 changes: 7 additions & 1 deletion hub/core/dataset.py
@@ -38,6 +38,7 @@ def __init__(
read_only: bool = False,
public: Optional[bool] = True,
token: Optional[str] = None,
log_loading: bool = True,
):
"""Initializes a new or existing dataset.

Expand All @@ -48,6 +49,7 @@ def __init__(
Datasets stored on Hub cloud that your account does not have write access to will automatically open in read mode.
public (bool, optional): Applied only if storage is Hub cloud storage and a new Dataset is being created. Defines if the dataset will have public access.
token (str, optional): Activeloop token, used for fetching credentials for Hub datasets. This is optional, tokens are normally autogenerated.
log_loading (bool): Logs the loading of the dataset. Defaults to True.

Raises:
ValueError: If an existing local path is given, it must be a directory.
@@ -65,6 +67,7 @@
self.tensors: Dict[str, Tensor] = {}
self._token = token
self.public = public
self.log_loading = log_loading

self._set_derived_attributes()

@@ -104,6 +107,7 @@ def __getstate__(self) -> Dict[str, Any]:
"public": self.public,
"storage": self.storage,
"_token": self.token,
"log_loading": self.log_loading,
}

def __setstate__(self, state: Dict[str, Any]):
@@ -133,6 +137,7 @@ def __getitem__(
index=self.index[item],
read_only=self.read_only,
token=self._token,
log_loading=False,
)
else:
raise InvalidKeyTypeError(item)
@@ -252,7 +257,8 @@ def _load_meta(self):
meta_key = get_dataset_meta_key()

if dataset_exists(self.storage):
logger.info(f"{self.path} loaded successfully.")
if self.log_loading:
logger.info(f"{self.path} loaded successfully.")
self.meta = self.storage.get_cachable(meta_key, DatasetMeta)

for tensor_name in self.meta.tensors:
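The `log_loading` flag added above is carried through `__getstate__` and `__getitem__`, so the "loaded successfully" message is printed when a dataset is opened but not again every time the dataset is sliced (a later commit in this PR renames the flag to `verbose`; the snapshot shown here, from the first 32 commits, still uses `log_loading`). A small usage sketch with an illustrative local path:

import hub

ds = hub.dataset("./my_ds")   # logs "./my_ds loaded successfully." once
view = ds[0:100]              # the sliced dataset is built with log_loading=False, so nothing is re-logged
sample = view[5]              # further indexing stays silent as well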
2 changes: 1 addition & 1 deletion hub/core/sample.py
@@ -92,7 +92,7 @@ def compressed_bytes(self, compression: str) -> bytes:
bytes: Bytes for the compressed sample. Contains all metadata required to decompress within these bytes.
"""

compression = compression.lower()
# compression = compression.lower()

if compression is None:
return self.uncompressed_bytes()
2 changes: 1 addition & 1 deletion hub/core/storage/cachable.py
@@ -21,7 +21,7 @@ def nbytes(self):
raise NotImplementedError

def __getstate__(self) -> Dict[str, Any]:
raise NotImplementedError
return self.__dict__

def __setstate__(self, state: Dict[str, Any]):
self.__dict__.update(state)
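Returning `self.__dict__` from `Cachable.__getstate__` (instead of raising `NotImplementedError`) makes cachable metadata objects such as `TensorMeta` picklable, which the multiprocessing-based transforms rely on when state is shipped between workers. A toy round-trip showing why; the class below is an illustrative stand-in, not Hub's `Cachable`:

import pickle
from typing import Any, Dict

class ToyCachable:
    def __init__(self, dtype: str = "int64", length: int = 0):
        self.dtype = dtype
        self.length = length

    def __getstate__(self) -> Dict[str, Any]:
        # Mirrors the change above: expose the instance dict so pickle can serialize it.
        return self.__dict__

    def __setstate__(self, state: Dict[str, Any]):
        self.__dict__.update(state)

meta = ToyCachable("float32", 10)
restored = pickle.loads(pickle.dumps(meta))  # would raise if __getstate__ raised NotImplementedError
assert (restored.dtype, restored.length) == ("float32", 10)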
1 change: 1 addition & 0 deletions hub/core/transform/__init__.py
@@ -0,0 +1 @@
from hub.core.transform.transform import compute, compose
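With `compute` and `compose` exported from this subpackage (and re-exported at the top level), several decorated functions can be chained into one pipeline. A sketch of what that might look like; the function bodies, tensor names, dataset paths and `eval` arguments are assumptions for illustration:

import hub

@hub.compute
def add_offset(sample_in, samples_out, offset):
    samples_out.values.append(sample_in.values.numpy() + offset)

@hub.compute
def square(sample_in, samples_out):
    samples_out.values.append(sample_in.values.numpy() ** 2)

ds_in = hub.dataset("./in_ds")    # assumed to already contain a "values" tensor
ds_out = hub.dataset("./out_ds")
ds_out.create_tensor("values")

# Chain both functions and run them with several workers.
pipeline = hub.compose([add_offset(offset=1), square()])
pipeline.eval(ds_in, ds_out, num_workers=4)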