Merge branch 'master' of https://github.com/activeloopai/Hub into feature/supervisely
kristinagrig06 committed May 11, 2021
2 parents eff07d4 + 28dce6d commit 691cfba
Showing 13 changed files with 161 additions and 23 deletions.
7 changes: 6 additions & 1 deletion hub/api/dataset.py
@@ -765,6 +765,7 @@ def to_pytorch(
output_type=dict,
indexes=None,
key_list=None,
shuffle=False,
):
"""| Converts the dataset into a pytorch compatible format.
** Pytorch does not support uint16, uint32, uint64 dtypes. These are implicitly type-cast to int32, int64 and int64 respectively.
@@ -784,10 +785,14 @@ def to_pytorch(
key_list: list, optional
The list of keys that are needed in Pytorch format. For nested schemas such as {"a":{"b":{"c": Tensor()}}}
use ["a/b/c"] as key_list
shuffle: bool, optional
Whether to shuffle the data chunkwise or not. Default is False.
"""
from .integrations import _to_pytorch

-        ds = _to_pytorch(self, transform, inplace, output_type, indexes, key_list)
+        ds = _to_pytorch(
+            self, transform, inplace, output_type, indexes, key_list, shuffle
+        )
return ds

def to_tensorflow(self, indexes=None, include_shapes=False, key_list=None):
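The new shuffle flag threads straight through to the converter. A minimal usage sketch, mirroring the new test_to_pytorch_shuffle test further down (path and schema invented for illustration):

```python
import hub

schema = {"label": hub.schema.Primitive("uint16", chunks=16)}
ds = hub.Dataset("./data/shuffle_demo", schema=schema, shape=(64,), mode="w")
for i in range(len(ds)):
    ds["label", i] = i

# shuffle=True randomizes the order of chunks, but samples within a chunk
# keep their original order.
pds = ds.to_pytorch(shuffle=True)
for sample in pds:
    print(sample["label"].numpy())
```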
24 changes: 15 additions & 9 deletions hub/api/dataset_utils.py
@@ -20,7 +20,7 @@
import numcodecs
import numcodecs.lz4
import numcodecs.zstd
-from hub.schema.features import Primitive, SchemaDict
+from hub.schema.features import Primitive, SchemaDict, Tensor
from hub.numcodecs import PngCodec
from hub.schema import ClassLabel

@@ -46,15 +46,21 @@ def same_schema(schema1, schema2):
if schema1.dict_.keys() != schema2.dict_.keys():
return False
for k, v in schema1.dict_.items():
if isinstance(v, SchemaDict) and not same_schema(v, schema2.dict_[k]):
return False
elif (
v.shape != schema2.dict_[k].shape
or v.max_shape != schema2.dict_[k].max_shape
or v.chunks != schema2.dict_[k].chunks
or v.dtype != schema2.dict_[k].dtype
or v.compressor != schema2.dict_[k].compressor
if isinstance(v, SchemaDict) and isinstance(schema2.dict_[k], SchemaDict):
if not same_schema(v, schema2.dict_[k]):
return False
elif (isinstance(v, Tensor) and isinstance(schema2.dict_[k], Tensor)) or (
isinstance(v, Primitive) and isinstance(schema2.dict_[k], Primitive)
):
if (
v.shape != schema2.dict_[k].shape
or v.max_shape != schema2.dict_[k].max_shape
or v.chunks != schema2.dict_[k].chunks
or v.dtype != schema2.dict_[k].dtype
or v.compressor != schema2.dict_[k].compressor
):
return False
else:
return False
return True

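The rewritten comparison guards each branch with matching isinstance checks on both sides, so schemas whose values differ in kind (say, a nested SchemaDict on one side and a Tensor on the other) now compare unequal instead of recursing into the wrong type. A hedged sketch (assuming SchemaDict accepts a plain dict, as the schema literals elsewhere in this diff suggest):

```python
from hub.api.dataset_utils import same_schema
from hub.schema.features import SchemaDict, Tensor

# The same key holds a nested dict on one side, a Tensor on the other.
s1 = SchemaDict({"field": SchemaDict({"inner": Tensor((10, 10))})})
s2 = SchemaDict({"field": Tensor((10, 10))})

# Previously the SchemaDict branch recursed into the Tensor (which has no
# .dict_ attribute); with the paired isinstance guards the mismatch is
# detected and the schemas compare unequal.
assert not same_schema(s1, s2)
```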
12 changes: 11 additions & 1 deletion hub/api/datasetview.py
@@ -322,7 +322,14 @@ def to_tensorflow(self, include_shapes=False, key_list=None):
indexes=self.indexes, include_shapes=include_shapes, key_list=key_list
)

-    def to_pytorch(self, transform=None, inplace=True, output_type=dict, key_list=None):
+    def to_pytorch(
+        self,
+        transform=None,
+        inplace=True,
+        output_type=dict,
+        key_list=None,
+        shuffle=False,
+    ):
"""| Converts the dataset into a pytorch compatible format.
** Pytorch does not support uint16, uint32, uint64 dtypes. These are implicitly type-cast to int32, int64 and int64 respectively.
Avoid having schema with these dtypes if you want to avoid this implicit conversion.
@@ -336,13 +343,16 @@ def to_pytorch(self, transform=None, inplace=True, output_type=dict, key_list=No
type you need for Transforms). Default is True.
output_type: one of list, tuple, dict, optional
Defines the output type. Default is dict - same as in original Hub Dataset.
shuffle: bool, optional
Whether to shuffle the data chunkwise or not. Default is False.
"""
return self.dataset.to_pytorch(
transform=transform,
indexes=self.indexes,
inplace=inplace,
output_type=output_type,
key_list=key_list,
shuffle=shuffle,
)

def resize_shape(self, size: int) -> None:
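DatasetView.to_pytorch forwards the same flag together with the view's own indexes. Continuing the sketch above (slice bounds arbitrary; this assumes slicing a Dataset yields a DatasetView, as elsewhere in Hub's API):

```python
# The view delegates to the underlying Dataset, restricting indexes to the
# slice before the chunkwise shuffle is applied.
view = ds[128:512]
pds = view.to_pytorch(shuffle=True)
```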
48 changes: 45 additions & 3 deletions hub/api/integrations.py
@@ -18,10 +18,17 @@
import hub.store.pickle_s3_storage
import hub.schema.serialize
import hub.schema.deserialize
import random


def _to_pytorch(
-    dataset, transform=None, inplace=True, output_type=dict, indexes=None, key_list=None
+    dataset,
+    transform=None,
+    inplace=True,
+    output_type=dict,
+    indexes=None,
+    key_list=None,
+    shuffle=False,
):
"""| Converts the dataset into a pytorch compatible format.
@@ -38,6 +45,8 @@ def _to_pytorch(
key_list: list, optional
The list of keys that are needed in Pytorch format. For nested schemas such as {"a":{"b":{"c": Tensor()}}}
use ["a/b/c"] as key_list
shuffle: bool, optional
Whether to shuffle the data chunkwise or not. Default is False.
"""
try:
import torch
@@ -56,6 +65,7 @@ def _to_pytorch(
output_type=output_type,
indexes=indexes,
key_list=key_list,
shuffle=shuffle,
)


@@ -572,14 +582,14 @@ def __init__(
output_type=dict,
indexes=None,
key_list=None,
+        shuffle=False,
):
self._ds = None
self._url = ds.url
self._token = ds.token
self._transform = transform
self.inplace = inplace
self.output_type = output_type
-        self.indexes = indexes
self._inited = False
self.key_list = key_list
self.key_list = self.key_list or list(ds.keys)
@@ -590,6 +600,38 @@ def __init__(
if key not in ds.keys:
raise KeyError(key)

self.max_chunk = self.get_max_chunk(ds)
self.last_index = None
if isinstance(indexes, int):
self.last_index = indexes
elif len(indexes) > 0:
self.last_index = indexes[-1]
self.indexes = self.shuffle_indexes(indexes, shuffle)

def shuffle_indexes(self, indexes, shuffle):
if not shuffle or isinstance(indexes, int):
return indexes
chunk_indexes_map = defaultdict(list)
chunk_size = self.max_chunk
for index in indexes:
chunk = index // chunk_size
chunk_indexes_map[chunk].append(index)
chunk_indexes = list(chunk_indexes_map.values())
random.shuffle(chunk_indexes)
new_indexes = []
for item in chunk_indexes:
new_indexes.extend(item)
return new_indexes

def get_max_chunk(self, ds):
max_chunk = 1
for key, value in ds._tensors.items():
if key in self.key_list:
max_chunk = max(
max_chunk, ((None in value.shape) and 1 or value.chunks[0])
)
return max_chunk

def _do_transform(self, data):
return self._transform(data) if self._transform else data

@@ -619,7 +661,7 @@ def _get_active_item(self, key, index):
active_range_start = index - index % samples_per_chunk
active_range = range(
active_range_start,
-            min(active_range_start + samples_per_chunk, self.indexes[-1] + 1),
+            min(active_range_start + samples_per_chunk, self.last_index + 1),
)
self._active_chunks_range[key] = active_range
self._active_chunks[key] = self._ds._tensors[key][
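shuffle_indexes is the core of the feature: indexes are bucketed by the chunk they fall into (index // chunk_size), the buckets are shuffled as whole units, and the result is flattened. Each chunk is therefore still read once and sequentially, preserving chunk-level I/O locality. A self-contained sketch of the same idea:

```python
import random
from collections import defaultdict

def chunkwise_shuffle(indexes, chunk_size):
    # Bucket indexes by the chunk they land in.
    buckets = defaultdict(list)
    for index in indexes:
        buckets[index // chunk_size].append(index)
    # Shuffle whole chunks; within-chunk order is untouched.
    chunks = list(buckets.values())
    random.shuffle(chunks)
    return [i for chunk in chunks for i in chunk]

# With chunk_size=4 the only possible outputs are
# [0, 1, 2, 3, 4, 5, 6, 7] and [4, 5, 6, 7, 0, 1, 2, 3].
print(chunkwise_shuffle(list(range(8)), chunk_size=4))
```

Relatedly, _get_active_item now bounds its read range with self.last_index instead of self.indexes[-1]: after shuffling, the last element of self.indexes is no longer necessarily the largest index.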
15 changes: 15 additions & 0 deletions hub/api/tests/test_converters.py
@@ -585,6 +585,21 @@ def test_from_supervisely_video():
trans = hub.Dataset.from_supervisely(os.path.join(data_path, project_name))


@pytest.mark.skipif(not pytorch_loaded(), reason="requires pytorch to be loaded")
def test_to_pytorch_shuffle():
schema = {
"image": hub.schema.Image((1000, 1000, 3)),
"cl": hub.schema.Primitive("uint16", chunks=16),
}

ds = hub.Dataset("./data/test_shuffle", schema=schema, shape=(1024), mode="w")
for i in range(len(ds)):
ds["cl", i] = i
pds = ds.to_pytorch(shuffle=True)
for i, item in enumerate(pds):
assert item["cl"].numpy() % 16 == i % 16


if __name__ == "__main__":
with Timer("Test Converters"):
with Timer("from MNIST"):
29 changes: 28 additions & 1 deletion hub/api/tests/test_dataset.py
@@ -21,7 +21,16 @@
SchemaMismatchException,
ReadModeException,
)
-from hub.schema import BBox, ClassLabel, Image, SchemaDict, Sequence, Tensor, Text
+from hub.schema import (
+    BBox,
+    ClassLabel,
+    Image,
+    SchemaDict,
+    Sequence,
+    Tensor,
+    Text,
+    Primitive,
+)
from hub.utils import (
azure_creds_exist,
gcp_creds_exist,
@@ -1308,6 +1317,24 @@ def my_filter(sample):
assert ds3["abc", i].compute() == 5 * i


def test_dataset_schema_bug():
schema = {"abc": Primitive("int32"), "def": "int64"}
ds = Dataset("./data/schema_bug", schema=schema, shape=(100,))
ds.flush()
ds2 = Dataset("./data/schema_bug", schema=schema, shape=(100,))

schema = {
"abc": "uint8",
"def": {
"ghi": Tensor((100, 100)),
"rst": Tensor((100, 100, 100)),
},
}
ds = Dataset("./data/schema_bug_2", schema=schema, shape=(100,))
ds.flush()
ds2 = Dataset("./data/schema_bug_2", schema=schema, shape=(100,))


def test_dataset_google():
ds = Dataset("google/bike")
assert ds["image_channels", 0].compute() == 3
4 changes: 2 additions & 2 deletions hub/auto/tests/test_tabular_data.py
@@ -62,9 +62,9 @@ def assert_conversion(tag):
# Checking if the datatypes of the columns match
for i in keys_csv_parser:
if df[i].dtype == np.dtype("O"):
-            assert ds[i].dtype == np.dtype("uint8")
+            assert type(ds[0, i].compute()) == str
        else:
-            assert ds[i].dtype == df[i].dtype
+            assert str(ds[0, i].compute().dtype) == str(df[i].dtype)

# Checking if all the filenames are parsed correctly
list_names = []
10 changes: 7 additions & 3 deletions hub/schema/deserialize.py
@@ -4,7 +4,7 @@
If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
"""

-from hub.schema.features import Tensor, SchemaDict
+from hub.schema.features import Tensor, SchemaDict, Primitive
from hub.schema.image import Image
from hub.schema.class_label import ClassLabel
from hub.schema.polygon import Polygon
@@ -88,6 +88,12 @@ def deserialize(inp):
chunks=inp["chunks"],
compressor=_get_compressor(inp),
)
elif inp["type"] == "Primitive":
return Primitive(
dtype=deserialize(inp["dtype"]),
chunks=inp["chunks"],
compressor=_get_compressor(inp),
)
elif inp["type"] == "Segmentation":
class_labels = deserialize(inp["class_labels"])
if class_labels._names is not None:
@@ -136,8 +142,6 @@ def deserialize(inp):
return Video(
shape=tuple(inp["shape"]),
dtype=deserialize(inp["dtype"]),
-        # TODO uncomment back when image encoding will be added
-        # encoding_format=inp["encoding_format"],
max_shape=tuple(inp["max_shape"]),
chunks=inp["chunks"],
compressor=_get_compressor(inp),
16 changes: 15 additions & 1 deletion hub/schema/features.py
@@ -45,11 +45,25 @@ def _flatten(self):
yield FlatTensor("", (), self._dtype, (), self.chunks)

def __str__(self):
return "'" + str(self.dtype) + "'"
return f"'{str(self.dtype)}'"

def __repr__(self):
return self.__str__()

def __eq__(self, other):
if not isinstance(other, Primitive):
return False
return (
self.shape == other.shape
and self.max_shape == other.max_shape
and self.chunks == other.chunks
and self.dtype == other.dtype
and self.compressor == other.compressor
)

def __ne__(self, other):
return not self.__eq__(other)


class SchemaDict(HubSchema):
"""Class for dict branching of a datatype.
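Primitive previously fell back to identity comparison, so two otherwise identical primitives compared unequal; value-based equality is plausibly what the reopen scenario in test_dataset_schema_bug (above) needs once schemas round-trip through serialization. A quick sketch of the new semantics:

```python
from hub.schema.features import Primitive

a = Primitive("int32", chunks=8)
b = Primitive("int32", chunks=8)
c = Primitive("int64", chunks=8)

assert a == b        # same dtype, shape, chunks, compressor
assert a != c        # dtype differs
assert a != "int32"  # non-Primitive operands are simply unequal
```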
7 changes: 6 additions & 1 deletion hub/schema/serialize.py
@@ -44,4 +44,9 @@ def serialize_SchemaDict(fdict):

def serialize_primitive(primitive):
"Converts Primitive into a serializable format"
-    return str(primitive._dtype)
+    return {
+        "type": "Primitive",
+        "dtype": str(primitive._dtype),
+        "compressor": primitive.compressor,
+        "chunks": primitive.chunks,
+    }
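Serializing a Primitive as a tagged dict rather than a bare dtype string is what lets deserialize rebuild it with chunks and compressor intact. A round-trip sketch, assuming the top-level serialize dispatches to serialize_primitive (the updated test below exercises this path):

```python
from hub.schema.features import Primitive
from hub.schema.serialize import serialize
from hub.schema.deserialize import deserialize

p = Primitive("uint16", chunks=5, compressor="zstd")
blob = serialize(p)   # {"type": "Primitive", "dtype": "uint16", ...}
q = deserialize(blob)

# Primitive.__eq__ (added above) makes the round trip checkable.
assert p == q
```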
3 changes: 2 additions & 1 deletion hub/schema/tests/test_serialize_deserialize.py
@@ -8,7 +8,7 @@
from hub.schema.text import Text
from hub.schema.sequence import Sequence
import pytest
-from hub.schema.features import Tensor
+from hub.schema.features import Tensor, Primitive
from hub.schema.serialize import serialize
from hub.schema.deserialize import deserialize
from hub.schema.image import Image
@@ -51,6 +51,7 @@ def test_serialize_deserialize():
),
"text": Text((None,), max_shape=(10,)),
"video": Video((100, 100, 3, 10)),
"prim": Primitive("uint16", chunks=5, compressor="zstd"),
},
)
original_result = tuple(t._flatten())
6 changes: 6 additions & 0 deletions hub/store/shape_detector.py
@@ -39,6 +39,12 @@ def __init__(
self._chunks = chunks = self._get_chunks(
shape, max_shape, chunks, dtype, chunksize
)
self._chunks = chunks = self.closest_power_of_2(chunks)

def closest_power_of_2(self, chunks):
head_chunk = chunks[0]
head_chunk = 2 ** math.floor(math.log2(head_chunk))
return (head_chunk,) + chunks[1:]

def _get_chunksize(self, chunksize, compressor):
if isinstance(compressor, PngCodec):
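Despite its name, closest_power_of_2 floors the leading chunk dimension to the largest power of two not exceeding it, rather than picking the nearest. Plausibly this keeps chunk boundaries of differently-chunked tensors aligned (smaller powers of two always divide larger ones), which the single max_chunk stride in the chunkwise shuffle relies on. A standalone sketch:

```python
import math

def closest_power_of_2(chunks):
    # Largest power of two <= the leading chunk dimension (a floor,
    # not a nearest-neighbor rounding).
    head_chunk = 2 ** math.floor(math.log2(chunks[0]))
    return (head_chunk,) + chunks[1:]

print(closest_power_of_2((1000, 100, 100)))  # -> (512, 100, 100)
```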
3 changes: 3 additions & 0 deletions setup.py
@@ -49,6 +49,9 @@
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3 :: Only",
"Topic :: Database",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Scientific/Engineering :: Image Processing",
],
python_requires=">=3.6",
install_requires=requirements,
