Added store to DatasetView (#740)
* store added to Dataset and DatasetView

* added tests for store
AbhinavTuli committed Apr 2, 2021
1 parent fd234fd commit f19d566
Showing 4 changed files with 125 additions and 1 deletion.
42 changes: 41 additions & 1 deletion hub/api/dataset.py
@@ -42,14 +42,14 @@
_copy_helper,
_get_compressor,
_get_dynamic_tensor_dtype,
_store_helper,
)

import hub.schema.serialize
import hub.schema.deserialize
from hub.schema.features import flatten
from hub.schema import ClassLabel
from hub import auto

from hub.store.dynamic_tensor import DynamicTensor
from hub.store.store import get_fs_and_path, get_storage_map
from hub.exceptions import (
@@ -628,6 +628,46 @@ def filter(self, fn):
indexes = [index for index in self.indexes if fn(self[index])]
return DatasetView(dataset=self, lazy=self.lazy, indexes=indexes)

def store(
self,
url: str,
token: dict = None,
sample_per_shard: int = None,
public: bool = True,
scheduler="single",
workers=1,
):
"""| Used to save the dataset as a new dataset, very similar to copy but uses transforms instead
Parameters
----------
url: str
path where the data is going to be stored
token: str or dict, optional
If url is referring to a place where authorization is required,
token is the parameter to pass the credentials, it can be filepath or dict
length: int
in case shape is None, user can provide length
sample_per_shard: int
How to split the iterator not to overfill RAM
public: bool, optional
only applicable if using hub storage, ignored otherwise
setting this to False allows only the user who created it to access the dataset and
the dataset won't be visible in the visualizer to the public
scheduler: str
choice between "single", "threaded", "processed"
workers: int
how many threads or processes to use
Returns
----------
ds: hub.Dataset
uploaded dataset
"""

return _store_helper(
self, url, token, sample_per_shard, public, scheduler, workers
)

def copy(self, dst_url: str, token=None, fs=None, public=True):
"""| Creates a copy of the dataset at the specified url and returns the dataset object
Parameters
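For orientation, here is a minimal usage sketch of the new Dataset.store API. The paths and schema are illustrative only, modeled on the test added in this commit:

import numpy as np
from hub import Dataset
from hub.schema import Tensor

# illustrative schema and local path, mirroring the test added below
schema = {"image": Tensor((100, 100), "uint8"), "label": "uint8"}
ds = Dataset("./data/source", schema=schema, shape=(10,))
for i in range(10):
    ds["image", i] = i * np.ones((100, 100))
    ds["label", i] = i

# save the dataset to a new location; returns the stored hub.Dataset
ds2 = ds.store("./data/stored", scheduler="single", workers=1)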
19 changes: 19 additions & 0 deletions hub/api/dataset_utils.py
@@ -205,6 +205,25 @@ def _copy_helper(
return dst_url


def _store_helper(
ds,
url: str,
token: dict = None,
sample_per_shard: int = None,
public: bool = True,
scheduler="single",
workers=1,
):
    from hub import transform

    # route every sample through a no-op transform so that hub's transform
    # pipeline handles the sharding, scheduling, and writing of the new dataset
    @transform(schema=ds.schema, workers=workers, scheduler=scheduler)
    def identity(sample):
        return sample

    ds2 = identity(ds)
    return ds2.store(
        url, token=token, sample_per_shard=sample_per_shard, public=public
    )


def _get_dynamic_tensor_dtype(t_dtype):
if isinstance(t_dtype, Primitive):
return t_dtype.dtype
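A note on the design above: rather than copying chunks directly, _store_helper pushes every sample through a no-op transform, so hub's existing transform machinery takes care of sharding, scheduling, and workers. A hand-written equivalent would look roughly like this (a sketch; "./data/out" is an illustrative path):

from hub import transform

# wrap a dataset in an identity transform, then persist its output;
# this mirrors what _store_helper does internally
@transform(schema=ds.schema, workers=1, scheduler="single")
def identity(sample):
    return sample

ds2 = identity(ds).store("./data/out")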
41 changes: 41 additions & 0 deletions hub/api/datasetview.py
@@ -12,6 +12,7 @@
get_value,
slice_split,
str_to_int,
_store_helper,
)
from hub.exceptions import NoneValueException
from hub.api.objectview import ObjectView
@@ -197,6 +198,46 @@ def filter(self, fn):
indexes = [index for index in self.indexes if fn(self.dataset[index])]
return DatasetView(dataset=self.dataset, lazy=self.lazy, indexes=indexes)

def store(
self,
url: str,
token: dict = None,
sample_per_shard: int = None,
public: bool = True,
scheduler="single",
workers=1,
):
"""| Used to save the datasetview as a new dataset
Parameters
----------
url: str
path where the data is going to be stored
token: str or dict, optional
If url is referring to a place where authorization is required,
token is the parameter to pass the credentials, it can be filepath or dict
length: int
in case shape is None, user can provide length
sample_per_shard: int
How to split the iterator not to overfill RAM
public: bool, optional
only applicable if using hub storage, ignored otherwise
setting this to False allows only the user who created it to access the dataset and
the dataset won't be visible in the visualizer to the public
scheduler: str
choice between "single", "threaded", "processed"
workers: int
how many threads or processes to use
Returns
----------
ds: hub.Dataset
uploaded dataset
"""

return _store_helper(
self, url, token, sample_per_shard, public, scheduler, workers
)

@property
def keys(self):
"""
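The headline feature of this commit: a filtered DatasetView can now be materialized as a standalone dataset via the same store call. A short sketch, modeled on the test below (path illustrative):

# keep every 5th sample of ds, then persist the view as a new dataset
def my_filter(sample):
    return sample["abc"].compute() % 5 == 0

dsv = ds.filter(my_filter)
ds3 = dsv.store("./data/filtered")  # ds3 is a regular hub.Dataset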
24 changes: 24 additions & 0 deletions hub/api/tests/test_dataset.py
@@ -1184,6 +1184,30 @@ def test_minio_endpoint():
assert (ds["abc", i].compute() == i * np.ones((100, 100, 3))).all()


def test_dataset_store():
    my_schema = {"image": Tensor((100, 100), "uint8"), "abc": "uint8"}

    ds = Dataset("./test/ds_store", schema=my_schema, shape=(100,))
    for i in range(100):
        ds["image", i] = i * np.ones((100, 100))
        ds["abc", i] = i

    def my_filter(sample):
        return sample["abc"].compute() % 5 == 0

    dsv = ds.filter(my_filter)

    ds2 = ds.store("./test/ds2_store")
    for i in range(100):
        assert (ds2["image", i].compute() == i * np.ones((100, 100))).all()
        assert ds2["abc", i].compute() == i

    # dsv keeps every 5th of 100 samples, so ds3 has 20 samples
    ds3 = dsv.store("./test/ds3_store")
    for i in range(20):
        assert (ds3["image", i].compute() == 5 * i * np.ones((100, 100))).all()
        assert ds3["abc", i].compute() == 5 * i


if __name__ == "__main__":
test_dataset_dynamic_shaped_slicing()
test_dataset_assign_value()
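To run just the new test (a sketch; assumes pytest is available, which the repo's test layout suggests but the commit does not state):

import pytest

# select the new test by its node id; -q for quiet output
pytest.main(["-q", "hub/api/tests/test_dataset.py::test_dataset_store"])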
