Added store to DatasetView (#740)
* store added to Dataset and DatasetView

* added tests for store
AbhinavTuli committed Apr 2, 2021
1 parent fd234fd commit f19d566
Showing 4 changed files with 125 additions and 1 deletion.
42 changes: 41 additions & 1 deletion hub/api/dataset.py
@@ -42,14 +42,14 @@
_copy_helper,
_get_compressor,
_get_dynamic_tensor_dtype,
_store_helper,
)

import hub.schema.serialize
import hub.schema.deserialize
from hub.schema.features import flatten
from hub.schema import ClassLabel
from hub import auto

from hub.store.dynamic_tensor import DynamicTensor
from hub.store.store import get_fs_and_path, get_storage_map
from hub.exceptions import (
@@ -628,6 +628,46 @@ def filter(self, fn):
indexes = [index for index in self.indexes if fn(self[index])]
return DatasetView(dataset=self, lazy=self.lazy, indexes=indexes)

def store(
self,
url: str,
token: dict = None,
sample_per_shard: int = None,
public: bool = True,
scheduler="single",
workers=1,
):
"""| Used to save the dataset as a new dataset, very similar to copy but uses transforms instead
Parameters
----------
url: str
path where the data is going to be stored
token: str or dict, optional
If url is referring to a place where authorization is required,
token is the parameter to pass the credentials, it can be filepath or dict
length: int
in case shape is None, user can provide length
sample_per_shard: int
How to split the iterator not to overfill RAM
public: bool, optional
only applicable if using hub storage, ignored otherwise
setting this to False allows only the user who created it to access the dataset and
the dataset won't be visible in the visualizer to the public
scheduler: str
choice between "single", "threaded", "processed"
workers: int
how many threads or processes to use
Returns
----------
ds: hub.Dataset
uploaded dataset
"""

return _store_helper(
self, url, token, sample_per_shard, public, scheduler, workers
)

def copy(self, dst_url: str, token=None, fs=None, public=True):
"""| Creates a copy of the dataset at the specified url and returns the dataset object
Parameters
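For orientation, here is a minimal usage sketch of the new Dataset.store API. The paths and schema are illustrative only, modeled on the test added in this commit:

import numpy as np
from hub import Dataset
from hub.schema import Tensor

# illustrative schema and local path, mirroring the test added below
schema = {"image": Tensor((100, 100), "uint8"), "label": "uint8"}
ds = Dataset("./data/source", schema=schema, shape=(10,))
for i in range(10):
    ds["image", i] = i * np.ones((100, 100))
    ds["label", i] = i

# save the dataset to a new location; returns the stored hub.Dataset
ds2 = ds.store("./data/stored", scheduler="single", workers=1)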
19 changes: 19 additions & 0 deletions hub/api/dataset_utils.py
@@ -205,6 +205,25 @@ def _copy_helper(
return dst_url


def _store_helper(
ds,
url: str,
token: dict = None,
sample_per_shard: int = None,
public: bool = True,
scheduler="single",
workers=1,
):
    from hub import transform

    # route every sample through a no-op transform so that hub's transform
    # pipeline handles the sharding, scheduling, and writing of the new dataset
    @transform(schema=ds.schema, workers=workers, scheduler=scheduler)
    def identity(sample):
        return sample

    ds2 = identity(ds)
    return ds2.store(
        url, token=token, sample_per_shard=sample_per_shard, public=public
    )


def _get_dynamic_tensor_dtype(t_dtype):
if isinstance(t_dtype, Primitive):
return t_dtype.dtype
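A note on the design above: rather than copying chunks directly, _store_helper pushes every sample through a no-op transform, so hub's existing transform machinery takes care of sharding, scheduling, and workers. A hand-written equivalent would look roughly like this (a sketch; "./data/out" is an illustrative path):

from hub import transform

# wrap a dataset in an identity transform, then persist its output;
# this mirrors what _store_helper does internally
@transform(schema=ds.schema, workers=1, scheduler="single")
def identity(sample):
    return sample

ds2 = identity(ds).store("./data/out")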
41 changes: 41 additions & 0 deletions hub/api/datasetview.py
@@ -12,6 +12,7 @@
get_value,
slice_split,
str_to_int,
_store_helper,
)
from hub.exceptions import NoneValueException
from hub.api.objectview import ObjectView
@@ -197,6 +198,46 @@ def filter(self, fn):
indexes = [index for index in self.indexes if fn(self.dataset[index])]
return DatasetView(dataset=self.dataset, lazy=self.lazy, indexes=indexes)

def store(
self,
url: str,
token: dict = None,
sample_per_shard: int = None,
public: bool = True,
scheduler="single",
workers=1,
):
"""| Used to save the datasetview as a new dataset
Parameters
----------
url: str
path where the data is going to be stored
token: str or dict, optional
If url is referring to a place where authorization is required,
token is the parameter to pass the credentials, it can be filepath or dict
length: int
in case shape is None, user can provide length
sample_per_shard: int
How to split the iterator not to overfill RAM
public: bool, optional
only applicable if using hub storage, ignored otherwise
setting this to False allows only the user who created it to access the dataset and
the dataset won't be visible in the visualizer to the public
scheduler: str
choice between "single", "threaded", "processed"
workers: int
how many threads or processes to use
Returns
----------
ds: hub.Dataset
uploaded dataset
"""

return _store_helper(
self, url, token, sample_per_shard, public, scheduler, workers
)

@property
def keys(self):
"""
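The headline feature of this commit: a filtered DatasetView can now be materialized as a standalone dataset via the same store call. A short sketch, modeled on the test below (path illustrative):

# keep every 5th sample of ds, then persist the view as a new dataset
def my_filter(sample):
    return sample["abc"].compute() % 5 == 0

dsv = ds.filter(my_filter)
ds3 = dsv.store("./data/filtered")  # ds3 is a regular hub.Dataset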
24 changes: 24 additions & 0 deletions hub/api/tests/test_dataset.py
@@ -1184,6 +1184,30 @@ def test_minio_endpoint():
assert (ds["abc", i].compute() == i * np.ones((100, 100, 3))).all()


def test_dataset_store():
    my_schema = {"image": Tensor((100, 100), "uint8"), "abc": "uint8"}

    ds = Dataset("./test/ds_store", schema=my_schema, shape=(100,))
    for i in range(100):
        ds["image", i] = i * np.ones((100, 100))
        ds["abc", i] = i

    def my_filter(sample):
        return sample["abc"].compute() % 5 == 0

    dsv = ds.filter(my_filter)

    ds2 = ds.store("./test/ds2_store")
    for i in range(100):
        assert (ds2["image", i].compute() == i * np.ones((100, 100))).all()
        assert ds2["abc", i].compute() == i

    # dsv keeps every 5th of 100 samples, so ds3 has 20 samples
    ds3 = dsv.store("./test/ds3_store")
    for i in range(20):
        assert (ds3["image", i].compute() == 5 * i * np.ones((100, 100))).all()
        assert ds3["abc", i].compute() == 5 * i


if __name__ == "__main__":
test_dataset_dynamic_shaped_slicing()
test_dataset_assign_value()
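To run just the new test (a sketch; assumes pytest is available, which the repo's test layout suggests but the commit does not state):

import pytest

# select the new test by its node id; -q for quiet output
pytest.main(["-q", "hub/api/tests/test_dataset.py::test_dataset_store"])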
