Merge branch 'master' of https://github.com/activeloopai/Hub into feature/supervisely
kristinagrig06 committed May 11, 2021
2 parents eff07d4 + 28dce6d commit 691cfba
Showing 13 changed files with 161 additions and 23 deletions.
7 changes: 6 additions & 1 deletion hub/api/dataset.py
@@ -765,6 +765,7 @@ def to_pytorch(
output_type=dict,
indexes=None,
key_list=None,
shuffle=False,
):
"""| Converts the dataset into a pytorch compatible format.
** Pytorch does not support uint16, uint32, uint64 dtypes. These are implicitly type-cast to int32, int64 and int64 respectively.
@@ -784,10 +785,14 @@ def to_pytorch(
key_list: list, optional
The list of keys that are needed in Pytorch format. For nested schemas such as {"a":{"b":{"c": Tensor()}}}
use ["a/b/c"] as key_list
shuffle: bool, optional
Whether to shuffle the data chunkwise or not. Default is False.
"""
from .integrations import _to_pytorch

-        ds = _to_pytorch(self, transform, inplace, output_type, indexes, key_list)
+        ds = _to_pytorch(
+            self, transform, inplace, output_type, indexes, key_list, shuffle
+        )
return ds

def to_tensorflow(self, indexes=None, include_shapes=False, key_list=None):
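The new shuffle flag threads straight through to the converter. A minimal usage sketch, mirroring the new test_to_pytorch_shuffle test further down (path and schema invented for illustration):

```python
import hub

schema = {"label": hub.schema.Primitive("uint16", chunks=16)}
ds = hub.Dataset("./data/shuffle_demo", schema=schema, shape=(64,), mode="w")
for i in range(len(ds)):
    ds["label", i] = i

# shuffle=True randomizes the order of chunks, but samples within a chunk
# keep their original order.
pds = ds.to_pytorch(shuffle=True)
for sample in pds:
    print(sample["label"].numpy())
```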
24 changes: 15 additions & 9 deletions hub/api/dataset_utils.py
@@ -20,7 +20,7 @@
import numcodecs
import numcodecs.lz4
import numcodecs.zstd
-from hub.schema.features import Primitive, SchemaDict
+from hub.schema.features import Primitive, SchemaDict, Tensor
from hub.numcodecs import PngCodec
from hub.schema import ClassLabel

@@ -46,15 +46,21 @@ def same_schema(schema1, schema2):
if schema1.dict_.keys() != schema2.dict_.keys():
return False
for k, v in schema1.dict_.items():
if isinstance(v, SchemaDict) and not same_schema(v, schema2.dict_[k]):
return False
elif (
v.shape != schema2.dict_[k].shape
or v.max_shape != schema2.dict_[k].max_shape
or v.chunks != schema2.dict_[k].chunks
or v.dtype != schema2.dict_[k].dtype
or v.compressor != schema2.dict_[k].compressor
if isinstance(v, SchemaDict) and isinstance(schema2.dict_[k], SchemaDict):
if not same_schema(v, schema2.dict_[k]):
return False
elif (isinstance(v, Tensor) and isinstance(schema2.dict_[k], Tensor)) or (
isinstance(v, Primitive) and isinstance(schema2.dict_[k], Primitive)
):
if (
v.shape != schema2.dict_[k].shape
or v.max_shape != schema2.dict_[k].max_shape
or v.chunks != schema2.dict_[k].chunks
or v.dtype != schema2.dict_[k].dtype
or v.compressor != schema2.dict_[k].compressor
):
return False
else:
return False
return True

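The rewritten comparison guards each branch with matching isinstance checks on both sides, so schemas whose values differ in kind (say, a nested SchemaDict on one side and a Tensor on the other) now compare unequal instead of recursing into the wrong type. A hedged sketch (assuming SchemaDict accepts a plain dict, as the schema literals elsewhere in this diff suggest):

```python
from hub.api.dataset_utils import same_schema
from hub.schema.features import SchemaDict, Tensor

# The same key holds a nested dict on one side, a Tensor on the other.
s1 = SchemaDict({"field": SchemaDict({"inner": Tensor((10, 10))})})
s2 = SchemaDict({"field": Tensor((10, 10))})

# Previously the SchemaDict branch recursed into the Tensor (which has no
# .dict_ attribute); with the paired isinstance guards the mismatch is
# detected and the schemas compare unequal.
assert not same_schema(s1, s2)
```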
12 changes: 11 additions & 1 deletion hub/api/datasetview.py
@@ -322,7 +322,14 @@ def to_tensorflow(self, include_shapes=False, key_list=None):
indexes=self.indexes, include_shapes=include_shapes, key_list=key_list
)

-    def to_pytorch(self, transform=None, inplace=True, output_type=dict, key_list=None):
+    def to_pytorch(
+        self,
+        transform=None,
+        inplace=True,
+        output_type=dict,
+        key_list=None,
+        shuffle=False,
+    ):
"""| Converts the dataset into a pytorch compatible format.
** Pytorch does not support uint16, uint32, uint64 dtypes. These are implicitly type-cast to int32, int64 and int64 respectively.
Avoid having schema with these dtypes if you want to avoid this implicit conversion.
@@ -336,13 +343,16 @@ def to_pytorch(self, transform=None, inplace=True, output_type=dict, key_list=No
type you need for Transforms). Default is True.
output_type: one of list, tuple, dict, optional
Defines the output type. Default is dict - same as in original Hub Dataset.
shuffle: bool, optional
Whether to shuffle the data chunkwise or not. Default is False.
"""
return self.dataset.to_pytorch(
transform=transform,
indexes=self.indexes,
inplace=inplace,
output_type=output_type,
key_list=key_list,
shuffle=shuffle,
)

def resize_shape(self, size: int) -> None:
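DatasetView.to_pytorch forwards the same flag together with the view's own indexes. Continuing the sketch above (slice bounds arbitrary; this assumes slicing a Dataset yields a DatasetView, as elsewhere in Hub's API):

```python
# The view delegates to the underlying Dataset, restricting indexes to the
# slice before the chunkwise shuffle is applied.
view = ds[128:512]
pds = view.to_pytorch(shuffle=True)
```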
48 changes: 45 additions & 3 deletions hub/api/integrations.py
@@ -18,10 +18,17 @@
import hub.store.pickle_s3_storage
import hub.schema.serialize
import hub.schema.deserialize
import random


def _to_pytorch(
-    dataset, transform=None, inplace=True, output_type=dict, indexes=None, key_list=None
+    dataset,
+    transform=None,
+    inplace=True,
+    output_type=dict,
+    indexes=None,
+    key_list=None,
+    shuffle=False,
):
"""| Converts the dataset into a pytorch compatible format.
@@ -38,6 +45,8 @@ def _to_pytorch(
key_list: list, optional
The list of keys that are needed in Pytorch format. For nested schemas such as {"a":{"b":{"c": Tensor()}}}
use ["a/b/c"] as key_list
shuffle: bool, optional
Whether to shuffle the data chunkwise or not. Default is False.
"""
try:
import torch
@@ -56,6 +65,7 @@ def _to_pytorch(
output_type=output_type,
indexes=indexes,
key_list=key_list,
shuffle=shuffle,
)


@@ -572,14 +582,14 @@ def __init__(
output_type=dict,
indexes=None,
key_list=None,
+        shuffle=False,
):
self._ds = None
self._url = ds.url
self._token = ds.token
self._transform = transform
self.inplace = inplace
self.output_type = output_type
-        self.indexes = indexes
self._inited = False
self.key_list = key_list
self.key_list = self.key_list or list(ds.keys)
@@ -590,6 +600,38 @@ def __init__(
if key not in ds.keys:
raise KeyError(key)

self.max_chunk = self.get_max_chunk(ds)
self.last_index = None
if isinstance(indexes, int):
self.last_index = indexes
elif len(indexes) > 0:
self.last_index = indexes[-1]
self.indexes = self.shuffle_indexes(indexes, shuffle)

def shuffle_indexes(self, indexes, shuffle):
if not shuffle or isinstance(indexes, int):
return indexes
chunk_indexes_map = defaultdict(list)
chunk_size = self.max_chunk
for index in indexes:
chunk = index // chunk_size
chunk_indexes_map[chunk].append(index)
chunk_indexes = list(chunk_indexes_map.values())
random.shuffle(chunk_indexes)
new_indexes = []
for item in chunk_indexes:
new_indexes.extend(item)
return new_indexes

def get_max_chunk(self, ds):
max_chunk = 1
for key, value in ds._tensors.items():
if key in self.key_list:
max_chunk = max(
max_chunk, ((None in value.shape) and 1 or value.chunks[0])
)
return max_chunk

def _do_transform(self, data):
return self._transform(data) if self._transform else data

@@ -619,7 +661,7 @@ def _get_active_item(self, key, index):
active_range_start = index - index % samples_per_chunk
active_range = range(
active_range_start,
-            min(active_range_start + samples_per_chunk, self.indexes[-1] + 1),
+            min(active_range_start + samples_per_chunk, self.last_index + 1),
)
self._active_chunks_range[key] = active_range
self._active_chunks[key] = self._ds._tensors[key][
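shuffle_indexes is the core of the feature: indexes are bucketed by the chunk they fall into (index // chunk_size), the buckets are shuffled as whole units, and the result is flattened. Each chunk is therefore still read once and sequentially, preserving chunk-level I/O locality. A self-contained sketch of the same idea:

```python
import random
from collections import defaultdict

def chunkwise_shuffle(indexes, chunk_size):
    # Bucket indexes by the chunk they land in.
    buckets = defaultdict(list)
    for index in indexes:
        buckets[index // chunk_size].append(index)
    # Shuffle whole chunks; within-chunk order is untouched.
    chunks = list(buckets.values())
    random.shuffle(chunks)
    return [i for chunk in chunks for i in chunk]

# With chunk_size=4 the only possible outputs are
# [0, 1, 2, 3, 4, 5, 6, 7] and [4, 5, 6, 7, 0, 1, 2, 3].
print(chunkwise_shuffle(list(range(8)), chunk_size=4))
```

Relatedly, _get_active_item now bounds its read range with self.last_index instead of self.indexes[-1]: after shuffling, the last element of self.indexes is no longer necessarily the largest index.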
15 changes: 15 additions & 0 deletions hub/api/tests/test_converters.py
@@ -585,6 +585,21 @@ def test_from_supervisely_video():
trans = hub.Dataset.from_supervisely(os.path.join(data_path, project_name))


@pytest.mark.skipif(not pytorch_loaded(), reason="requires pytorch to be loaded")
def test_to_pytorch_shuffle():
schema = {
"image": hub.schema.Image((1000, 1000, 3)),
"cl": hub.schema.Primitive("uint16", chunks=16),
}

ds = hub.Dataset("./data/test_shuffle", schema=schema, shape=(1024), mode="w")
for i in range(len(ds)):
ds["cl", i] = i
pds = ds.to_pytorch(shuffle=True)
for i, item in enumerate(pds):
assert item["cl"].numpy() % 16 == i % 16


if __name__ == "__main__":
with Timer("Test Converters"):
with Timer("from MNIST"):
29 changes: 28 additions & 1 deletion hub/api/tests/test_dataset.py
@@ -21,7 +21,16 @@
SchemaMismatchException,
ReadModeException,
)
-from hub.schema import BBox, ClassLabel, Image, SchemaDict, Sequence, Tensor, Text
+from hub.schema import (
+    BBox,
+    ClassLabel,
+    Image,
+    SchemaDict,
+    Sequence,
+    Tensor,
+    Text,
+    Primitive,
+)
from hub.utils import (
azure_creds_exist,
gcp_creds_exist,
@@ -1308,6 +1317,24 @@ def my_filter(sample):
assert ds3["abc", i].compute() == 5 * i


def test_dataset_schema_bug():
schema = {"abc": Primitive("int32"), "def": "int64"}
ds = Dataset("./data/schema_bug", schema=schema, shape=(100,))
ds.flush()
ds2 = Dataset("./data/schema_bug", schema=schema, shape=(100,))

schema = {
"abc": "uint8",
"def": {
"ghi": Tensor((100, 100)),
"rst": Tensor((100, 100, 100)),
},
}
ds = Dataset("./data/schema_bug_2", schema=schema, shape=(100,))
ds.flush()
ds2 = Dataset("./data/schema_bug_2", schema=schema, shape=(100,))


def test_dataset_google():
ds = Dataset("google/bike")
assert ds["image_channels", 0].compute() == 3
4 changes: 2 additions & 2 deletions hub/auto/tests/test_tabular_data.py
@@ -62,9 +62,9 @@ def assert_conversion(tag):
# Checking if the datatypes of the columns match
for i in keys_csv_parser:
if df[i].dtype == np.dtype("O"):
-            assert ds[i].dtype == np.dtype("uint8")
+            assert type(ds[0, i].compute()) == str
        else:
-            assert ds[i].dtype == df[i].dtype
+            assert str(ds[0, i].compute().dtype) == str(df[i].dtype)

# Checking if all the filenames are parsed correctly
list_names = []
10 changes: 7 additions & 3 deletions hub/schema/deserialize.py
@@ -4,7 +4,7 @@
If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
"""

-from hub.schema.features import Tensor, SchemaDict
+from hub.schema.features import Tensor, SchemaDict, Primitive
from hub.schema.image import Image
from hub.schema.class_label import ClassLabel
from hub.schema.polygon import Polygon
@@ -88,6 +88,12 @@ def deserialize(inp):
chunks=inp["chunks"],
compressor=_get_compressor(inp),
)
elif inp["type"] == "Primitive":
return Primitive(
dtype=deserialize(inp["dtype"]),
chunks=inp["chunks"],
compressor=_get_compressor(inp),
)
elif inp["type"] == "Segmentation":
class_labels = deserialize(inp["class_labels"])
if class_labels._names is not None:
@@ -136,8 +142,6 @@ def deserialize(inp):
return Video(
shape=tuple(inp["shape"]),
dtype=deserialize(inp["dtype"]),
-        # TODO uncomment back when image encoding will be added
-        # encoding_format=inp["encoding_format"],
max_shape=tuple(inp["max_shape"]),
chunks=inp["chunks"],
compressor=_get_compressor(inp),
16 changes: 15 additions & 1 deletion hub/schema/features.py
@@ -45,11 +45,25 @@ def _flatten(self):
yield FlatTensor("", (), self._dtype, (), self.chunks)

def __str__(self):
return "'" + str(self.dtype) + "'"
return f"'{str(self.dtype)}'"

def __repr__(self):
return self.__str__()

def __eq__(self, other):
if not isinstance(other, Primitive):
return False
return (
self.shape == other.shape
and self.max_shape == other.max_shape
and self.chunks == other.chunks
and self.dtype == other.dtype
and self.compressor == other.compressor
)

def __ne__(self, other):
return not self.__eq__(other)


class SchemaDict(HubSchema):
"""Class for dict branching of a datatype.
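Primitive previously fell back to identity comparison, so two otherwise identical primitives compared unequal; value-based equality is plausibly what the reopen scenario in test_dataset_schema_bug (above) needs once schemas round-trip through serialization. A quick sketch of the new semantics:

```python
from hub.schema.features import Primitive

a = Primitive("int32", chunks=8)
b = Primitive("int32", chunks=8)
c = Primitive("int64", chunks=8)

assert a == b        # same dtype, shape, chunks, compressor
assert a != c        # dtype differs
assert a != "int32"  # non-Primitive operands are simply unequal
```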
7 changes: 6 additions & 1 deletion hub/schema/serialize.py
@@ -44,4 +44,9 @@ def serialize_SchemaDict(fdict):

def serialize_primitive(primitive):
"Converts Primitive into a serializable format"
-    return str(primitive._dtype)
+    return {
+        "type": "Primitive",
+        "dtype": str(primitive._dtype),
+        "compressor": primitive.compressor,
+        "chunks": primitive.chunks,
+    }
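Serializing a Primitive as a tagged dict rather than a bare dtype string is what lets deserialize rebuild it with chunks and compressor intact. A round-trip sketch, assuming the top-level serialize dispatches to serialize_primitive (the updated test below exercises this path):

```python
from hub.schema.features import Primitive
from hub.schema.serialize import serialize
from hub.schema.deserialize import deserialize

p = Primitive("uint16", chunks=5, compressor="zstd")
blob = serialize(p)   # {"type": "Primitive", "dtype": "uint16", ...}
q = deserialize(blob)

# Primitive.__eq__ (added above) makes the round trip checkable.
assert p == q
```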
3 changes: 2 additions & 1 deletion hub/schema/tests/test_serialize_deserialize.py
@@ -8,7 +8,7 @@
from hub.schema.text import Text
from hub.schema.sequence import Sequence
import pytest
-from hub.schema.features import Tensor
+from hub.schema.features import Tensor, Primitive
from hub.schema.serialize import serialize
from hub.schema.deserialize import deserialize
from hub.schema.image import Image
@@ -51,6 +51,7 @@ def test_serialize_deserialize():
),
"text": Text((None,), max_shape=(10,)),
"video": Video((100, 100, 3, 10)),
"prim": Primitive("uint16", chunks=5, compressor="zstd"),
},
)
original_result = tuple(t._flatten())
6 changes: 6 additions & 0 deletions hub/store/shape_detector.py
@@ -39,6 +39,12 @@ def __init__(
self._chunks = chunks = self._get_chunks(
shape, max_shape, chunks, dtype, chunksize
)
self._chunks = chunks = self.closest_power_of_2(chunks)

def closest_power_of_2(self, chunks):
head_chunk = chunks[0]
head_chunk = 2 ** math.floor(math.log2(head_chunk))
return (head_chunk,) + chunks[1:]

def _get_chunksize(self, chunksize, compressor):
if isinstance(compressor, PngCodec):
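Despite its name, closest_power_of_2 floors the leading chunk dimension to the largest power of two not exceeding it, rather than picking the nearest. Plausibly this keeps chunk boundaries of differently-chunked tensors aligned (smaller powers of two always divide larger ones), which the single max_chunk stride in the chunkwise shuffle relies on. A standalone sketch:

```python
import math

def closest_power_of_2(chunks):
    # Largest power of two <= the leading chunk dimension (a floor,
    # not a nearest-neighbor rounding).
    head_chunk = 2 ** math.floor(math.log2(chunks[0]))
    return (head_chunk,) + chunks[1:]

print(closest_power_of_2((1000, 100, 100)))  # -> (512, 100, 100)
```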
3 changes: 3 additions & 0 deletions setup.py
@@ -49,6 +49,9 @@
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3 :: Only",
"Topic :: Database",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Scientific/Engineering :: Image Processing",
],
python_requires=">=3.6",
install_requires=requirements,
