[AL-1331] Api reference updates (#1384)
* update .pytorch reference

* add docstring to tensor tobytes

* add audio, json, text and list documentation

* add hub.delete to docs

* added details about eval

* improved formatting

* lint fixes

* lint fixes

* black fix

* api reference updates

* improve compression docs
AbhinavTuli committed Dec 13, 2021
1 parent e7df28b commit 40dcecd
Showing 6 changed files with 114 additions and 19 deletions.
1 change: 1 addition & 0 deletions hub/__init__.py
@@ -61,6 +61,7 @@
     "compressions",
     "htypes",
     "config",
+    "delete",
 ]

 __version__ = "2.1.2"
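Per the commit message, this change exposes `hub.delete` in the public API. A minimal usage sketch, assuming the usual path-plus-`force` signature; the dataset path below is a placeholder, not part of this diff:

```python
import hub

# Delete the dataset stored at the given path (works for local, s3://, or
# hub:// paths). `force=True` is assumed here to confirm deletion of a
# non-empty dataset; check the API reference for the exact signature.
hub.delete("hub://user/demo-dataset", force=True)
```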
8 changes: 8 additions & 0 deletions hub/compression.py
@@ -1,10 +1,17 @@
 """
 Supported compressions (formats):
 Image : bmp, dib, gif, ico, jpeg, jp2, pcx, png, ppm, sgi, tga, tiff, webp, wmf, xbm
+Audio : flac, mp3, wav
+Video : mp4, mkv, avi
+Bytes : lz4
+
+__Note__:
+For video compressions, we only support already-compressed data read using hub.read. We do not actually compress the video data.
+Also, when using hub.read with one of the video compressions, ensure that the compression matches the format of the file; otherwise hub will be unable to compress the data to the specified compression.
 """
 from PIL import Image  # type: ignore
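To illustrate the note above, here is a hedged sketch of storing pre-compressed video read with `hub.read`. The `mem://` path, tensor name, and file path are assumptions for illustration:

```python
import hub

ds = hub.dataset("mem://video-demo")  # in-memory dataset, for illustration
# sample_compression must match the source file's format ("mp4" here),
# because hub stores the already-compressed bytes and never re-encodes video.
ds.create_tensor("clips", htype="video", sample_compression="mp4")
ds.clips.append(hub.read("./videos/sample.mp4"))  # placeholder path
```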

@@ -90,6 +97,7 @@


 def get_compression_type(c):
+    """Returns the compression type for the given compression name."""
     if c is None:
         return None
     ret = _compression_types.get(c)
4 changes: 2 additions & 2 deletions hub/core/dataset/dataset.py
@@ -598,8 +598,8 @@ def pytorch(
         Read torch.utils.data.DataLoader docs for more details.
     pin_memory (bool): If True, the data loader will copy Tensors into CUDA pinned memory before returning them. Default value is False.
         Read torch.utils.data.DataLoader docs for more details.
-    shuffle (bool): If True, the data loader will shuffle the data indices. Default value is False.
-    buffer_size (int): The size of the buffer used to prefetch/shuffle in MB. The buffer uses shared memory under the hood. Default value is 2 GB. Increasing the buffer_size will increase the extent of shuffling.
+    shuffle (bool): If True, the data loader will shuffle the data indices. Default value is False. Details about how hub shuffles data can be found at https://docs.activeloop.ai/how-hub-works/shuffling-in-ds.pytorch.
+    buffer_size (int): The size of the buffer used to shuffle the data, in MB. Defaults to 2048 MB. Increasing the buffer_size will increase the extent of shuffling.
     use_local_cache (bool): If True, the data loader will use a local cache to store data. This is useful when the dataset can fit on the machine and we don't want to fetch the data multiple times for each iteration. Default value is False.
     use_progress_bar (bool): If True, tqdm will be wrapped around the returned dataloader. Default value is True.
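A short sketch of the documented shuffle/buffer_size behavior. `shuffle` and `buffer_size` come from the docstring above; the public dataset path and the `batch_size`/`num_workers` values are illustrative assumptions:

```python
import hub

ds = hub.dataset("hub://activeloop/mnist-train")  # public dataset, for illustration
# buffer_size is in MB; a larger buffer shuffles across a wider window of samples.
dataloader = ds.pytorch(num_workers=2, batch_size=32, shuffle=True, buffer_size=2048)

for batch in dataloader:
    pass  # training step goes here
```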
12 changes: 12 additions & 0 deletions hub/core/tensor.py
@@ -451,6 +451,18 @@ def data(self) -> Any:
     return self.numpy()

 def tobytes(self) -> bytes:
+    """Returns the bytes of the tensor. Only works for a single sample of a tensor.
+
+    If the tensor is uncompressed, this returns the bytes of the numpy array.
+    If the tensor is sample compressed, this returns the compressed bytes of the sample.
+    If the tensor is chunk compressed, this raises an error.
+
+    Returns:
+        bytes: The bytes of the tensor.
+
+    Raises:
+        ValueError: If the tensor has multiple samples.
+    """

     if self.index.values[0].subscriptable():
         raise ValueError("tobytes() can be used only on exactly 1 sample.")
     return self.chunk_engine.read_bytes_for_sample(self.index.values[0].value)  # type: ignore
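A sketch of the new `tobytes` behavior on a sample-compressed tensor, assuming a `mem://` dataset and a placeholder file path:

```python
import hub

ds = hub.dataset("mem://tobytes-demo")
ds.create_tensor("images", htype="image", sample_compression="jpeg")
ds.images.append(hub.read("photo.jpg"))  # placeholder path

raw = ds.images[0].tobytes()  # compressed JPEG bytes of exactly one sample
# ds.images.tobytes() would raise ValueError, because the index is not
# restricted to a single sample.
```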
104 changes: 87 additions & 17 deletions hub/core/transform/transform.py
@@ -53,21 +53,22 @@ def eval(
     """Evaluates the TransformFunction on data_in to produce an output dataset ds_out.

     Args:
-        data_in: Input passed to the transform to generate output dataset. Should support __getitem__ and __len__. Can be a Hub dataset.
-        ds_out (Dataset): The dataset object to which the transform will get written.
-            Should have all keys being generated in output already present as tensors. It's initial state should be either:-
+        data_in: Input passed to the transform to generate output dataset. Should support \__getitem__ and \__len__. Can be a Hub dataset.
+        ds_out (Dataset, optional): The dataset object to which the transform will get written. If this is not provided, data_in will be overwritten if it is a Hub dataset; otherwise an error will be raised.
+            It should have all keys being generated in output already present as tensors. Its initial state should be either:
             - Empty, i.e. all tensors have no samples. In this case all samples are added to the dataset.
             - All tensors are populated and have the same length. In this case new samples are appended to the dataset.
         num_workers (int): The number of workers to use for performing the transform. Defaults to 0. When set to 0, it will always use serial processing, irrespective of the scheduler.
-        scheduler (str): The scheduler to be used to compute the transformation. Supported values include: "serial", 'threaded', 'processed' and 'ray.
+        scheduler (str): The scheduler to be used to compute the transformation. Supported values include: 'serial', 'threaded', 'processed' and 'ray'.
            Defaults to 'threaded'.
        progressbar (bool): Displays a progress bar if True (default).

     Raises:
-        InvalidInputDataError: If data_in passed to transform is invalid. It should support __getitem__ and __len__ operations. Using scheduler other than "threaded" with hub dataset having base storage as memory as data_in will also raise this.
+        InvalidInputDataError: If the data_in passed to the transform is invalid. It should support \__getitem__ and \__len__ operations. Using a scheduler other than "threaded" with a hub dataset having base storage as memory as data_in will also raise this.
         InvalidOutputDatasetError: If all the tensors of ds_out passed to the transform don't have the same length. Using a scheduler other than "threaded" with a hub dataset having base storage as memory as ds_out will also raise this.
         TensorMismatchError: If one or more of the outputs generated during the transform contain different tensors than the ones present in the 'ds_out' provided to the transform.
-        UnsupportedSchedulerError: If the scheduler passed is not recognized. Supported values include: "serial", 'threaded', 'processed' and 'ray'.
+        UnsupportedSchedulerError: If the scheduler passed is not recognized. Supported values include: 'serial', 'threaded', 'processed' and 'ray'.
     """

     pipeline = Pipeline([self])
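A runnable sketch of the eval flow documented above, using a plain list as data_in and an in-memory output dataset; all names here are illustrative:

```python
import hub

@hub.compute
def double(sample_in, samples_out):
    # Append one output sample per input sample.
    samples_out.values.append(sample_in * 2)

ds_out = hub.dataset("mem://eval-demo")
ds_out.create_tensor("values")

# data_in only needs __getitem__ and __len__, so a plain list works.
double().eval(list(range(10)), ds_out, num_workers=0, scheduler="serial")
```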
@@ -93,20 +94,21 @@ def eval(
     """Evaluates the pipeline on data_in to produce an output dataset ds_out.

     Args:
-        data_in: Input passed to the transform to generate output dataset. Should support __getitem__ and __len__. Can be a Hub dataset.
-        ds_out (Dataset): The dataset object to which the transform will get written.
-            Should have all keys being generated in output already present as tensors. It's initial state should be either:-
+        data_in: Input passed to the transform to generate output dataset. Should support \__getitem__ and \__len__. Can be a Hub dataset.
+        ds_out (Dataset, optional): The dataset object to which the transform will get written. If this is not provided, data_in will be overwritten if it is a Hub dataset; otherwise an error will be raised.
+            It should have all keys being generated in output already present as tensors. Its initial state should be either:
             - Empty, i.e. all tensors have no samples. In this case all samples are added to the dataset.
             - All tensors are populated and have the same length. In this case new samples are appended to the dataset.
         num_workers (int): The number of workers to use for performing the transform. Defaults to 0. When set to 0, it will always use serial processing, irrespective of the scheduler.
-        scheduler (str): The scheduler to be used to compute the transformation. Supported values include: "serial", 'threaded', 'processed' and 'ray'.
+        scheduler (str): The scheduler to be used to compute the transformation. Supported values include: 'serial', 'threaded', 'processed' and 'ray'.
            Defaults to 'threaded'.
         progressbar (bool): Displays a progress bar if True (default).

     Raises:
-        InvalidInputDataError: If data_in passed to transform is invalid. It should support __getitem__ and __len__ operations. Using scheduler other than "threaded" with hub dataset having base storage as memory as data_in will also raise this.
+        InvalidInputDataError: If the data_in passed to the transform is invalid. It should support \__getitem__ and \__len__ operations. Using a scheduler other than "threaded" with a hub dataset having base storage as memory as data_in will also raise this.
         InvalidOutputDatasetError: If all the tensors of ds_out passed to the transform don't have the same length. Using a scheduler other than "threaded" with a hub dataset having base storage as memory as ds_out will also raise this.
         TensorMismatchError: If one or more of the outputs generated during the transform contain different tensors than the ones present in the 'ds_out' provided to the transform.
-        UnsupportedSchedulerError: If the scheduler passed is not recognized. Supported values include: "serial", 'threaded', 'processed' and 'ray'.
+        UnsupportedSchedulerError: If the scheduler passed is not recognized. Supported values include: 'serial', 'threaded', 'processed' and 'ray'.
         TransformError: All other exceptions raised if there are problems while running the pipeline.
     """
     if num_workers <= 0:
@@ -266,8 +268,40 @@ def _run(progress_port=None):
     )


-def compose(functions: List[TransformFunction]):
-    """Takes a list of functions decorated using hub.compute and creates a pipeline that can be evaluated using .eval"""
+def compose(functions: List[TransformFunction]):  # noqa: DAR101, DAR102, DAR201, DAR401
+    """Takes a list of functions decorated using hub.compute and creates a pipeline that can be evaluated using .eval.
+
+    Example::
+
+        pipeline = hub.compose([my_fn(a=3), another_function(b=2)])
+        pipeline.eval(data_in, ds_out, scheduler="processed", num_workers=2)
+
+    The __eval__ method evaluates the pipeline/transform function.
+
+    It has the following arguments:
+
+    - data_in: Input passed to the transform to generate the output dataset.
+      It should support \__getitem__ and \__len__. This can be a Hub dataset.
+    - ds_out (Dataset, optional): The dataset object to which the transform will get written.
+      If this is not provided, data_in will be overwritten if it is a Hub dataset; otherwise an error will be raised.
+      It should have all keys being generated in output already present as tensors.
+      Its initial state should be either:
+        - Empty, i.e. all tensors have no samples. In this case all samples are added to the dataset.
+        - All tensors are populated and have the same length. In this case new samples are appended to the dataset.
+    - num_workers (int): The number of workers to use for performing the transform.
+      Defaults to 0. When set to 0, it will always use serial processing, irrespective of the scheduler.
+    - scheduler (str): The scheduler to be used to compute the transformation.
+      Supported values include: 'serial', 'threaded', 'processed' and 'ray'. Defaults to 'threaded'.
+    - progressbar (bool): Displays a progress bar if True (default).
+
+    It raises the following errors:
+
+    - InvalidInputDataError: If the data_in passed to the transform is invalid. It should support \__getitem__ and \__len__ operations. Using a scheduler other than "threaded" with a hub dataset having base storage as memory as data_in will also raise this.
+    - InvalidOutputDatasetError: If all the tensors of ds_out passed to the transform don't have the same length. Using a scheduler other than "threaded" with a hub dataset having base storage as memory as ds_out will also raise this.
+    - TensorMismatchError: If one or more of the outputs generated during the transform contain different tensors than the ones present in the 'ds_out' provided to the transform.
+    - UnsupportedSchedulerError: If the scheduler passed is not recognized. Supported values include: 'serial', 'threaded', 'processed' and 'ray'.
+    - TransformError: All other exceptions raised if there are problems while running the pipeline.
+    """
     if not functions:
         raise HubComposeEmptyListError
     for index, fn in enumerate(functions):
@@ -276,7 +310,7 @@ def compose(functions: List[TransformFunction]):
     return Pipeline(functions)


-def compute(fn):
+def compute(fn):  # noqa: DAR101, DAR102, DAR201, DAR401
     """Compute is a decorator for functions.
     The functions should have at least 2 arguments; the first two correspond to sample_in and samples_out.
     There can be as many other arguments as required.
@@ -286,8 +320,44 @@ def compute(fn):
     Example::

         @hub.compute
-        def your_function(sample_in: Any, samples_out, your_arg0, your_arg1=0):
-            samples_out.your_tensor.append(your_arg0 * your_arg1)
+        def my_fn(sample_in: Any, samples_out, my_arg0, my_arg1=0):
+            samples_out.my_tensor.append(my_arg0 * my_arg1)
+
+        # This transform can be used through the eval method in one of these 2 ways:
+
+        # 1. Directly evaluating the method;
+        # here arg0 and arg1 correspond to the 3rd and 4th arguments of my_fn
+        my_fn(arg0, arg1).eval(data_in, ds_out, scheduler="threaded", num_workers=5)
+
+        # 2. As part of a Transform pipeline containing other functions
+        pipeline = hub.compose([my_fn(a, b), another_function(x=2)])
+        pipeline.eval(data_in, ds_out, scheduler="processed", num_workers=2)
+
+    The __eval__ method evaluates the pipeline/transform function.
+
+    It has the following arguments:
+
+    - data_in: Input passed to the transform to generate the output dataset.
+      It should support \__getitem__ and \__len__. This can be a Hub dataset.
+    - ds_out (Dataset, optional): The dataset object to which the transform will get written.
+      If this is not provided, data_in will be overwritten if it is a Hub dataset; otherwise an error will be raised.
+      It should have all keys being generated in output already present as tensors.
+      Its initial state should be either:
+        - Empty, i.e. all tensors have no samples. In this case all samples are added to the dataset.
+        - All tensors are populated and have the same length. In this case new samples are appended to the dataset.
+    - num_workers (int): The number of workers to use for performing the transform.
+      Defaults to 0. When set to 0, it will always use serial processing, irrespective of the scheduler.
+    - scheduler (str): The scheduler to be used to compute the transformation.
+      Supported values include: 'serial', 'threaded', 'processed' and 'ray'. Defaults to 'threaded'.
+    - progressbar (bool): Displays a progress bar if True (default).
+
+    It raises the following errors:
+
+    - InvalidInputDataError: If the data_in passed to the transform is invalid. It should support \__getitem__ and \__len__ operations. Using a scheduler other than "threaded" with a hub dataset having base storage as memory as data_in will also raise this.
+    - InvalidOutputDatasetError: If all the tensors of ds_out passed to the transform don't have the same length. Using a scheduler other than "threaded" with a hub dataset having base storage as memory as ds_out will also raise this.
+    - TensorMismatchError: If one or more of the outputs generated during the transform contain different tensors than the ones present in the 'ds_out' provided to the transform.
+    - UnsupportedSchedulerError: If the scheduler passed is not recognized. Supported values include: 'serial', 'threaded', 'processed' and 'ray'.
+    - TransformError: All other exceptions raised if there are problems while running the pipeline.
+    """

     def inner(*args, **kwargs):
4 changes: 4 additions & 0 deletions hub/htype.py
@@ -27,6 +27,10 @@
     | binary_mask    | bool    | none |
     | segment_mask   | uint32  | none |
    | keypoints_coco | int32   | none |
+    | audio          | float64 | none |
+    | text           | str     | none |
+    | json           | Any     | none |
+    | list           | List    | none |
 """

