diff --git a/hub/api/dataset.py b/hub/api/dataset.py index f24504510f..ff941772a4 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -23,12 +23,12 @@ from hub.log import logger import hub.store.pickle_s3_storage -from hub.api.datasetview import DatasetView, ObjectView, TensorView - +from hub.api.datasetview import DatasetView +from hub.api.objectview import ObjectView +from hub.api.tensorview import TensorView from hub.api.dataset_utils import ( create_numpy_dict, get_value, - slice_extract_info, slice_split, str_to_int, ) @@ -41,8 +41,10 @@ from hub.store.store import get_fs_and_path, get_storage_map from hub.exceptions import ( HubDatasetNotFoundException, + LargeShapeFilteringException, NotHubDatasetToOverwriteException, NotHubDatasetToAppendException, + OutOfBoundsError, ShapeArgumentNotFoundException, SchemaArgumentNotFoundException, ModuleNotInstalledException, @@ -55,7 +57,7 @@ from hub.schema import Audio, BBox, ClassLabel, Image, Sequence, Text, Video from hub.numcodecs import PngCodec -from hub.utils import norm_cache, norm_shape +from hub.utils import norm_cache, norm_shape, _tuple_product from hub import defaults @@ -195,6 +197,8 @@ def __init__( logger.error("Deleting the dataset " + traceback.format_exc() + str(e)) raise + self.indexes = list(range(self._shape[0])) + if needcreate and ( self._path.startswith("s3://snark-hub-dev/") or self._path.startswith("s3://snark-hub/") @@ -386,12 +390,10 @@ def __getitem__(self, slice_): raise ValueError( "Can't slice a dataset with multiple slices without key" ) - num, ofs = slice_extract_info(slice_list[0], self._shape[0]) + indexes = self.indexes[slice_list[0]] return DatasetView( dataset=self, - num_samples=num, - offset=ofs, - squeeze_dim=isinstance(slice_list[0], int), + indexes=indexes, lazy=self.lazy, ) elif not slice_list: @@ -402,22 +404,18 @@ def __getitem__(self, slice_): slice_=slice(0, self._shape[0]), lazy=self.lazy, ) - if self.lazy: - return tensorview - else: - return tensorview.compute() + return tensorview if self.lazy else tensorview.compute() for key in self.keys: if subpath.startswith(key): objectview = ObjectView( - dataset=self, subpath=subpath, lazy=self.lazy + dataset=self, + subpath=subpath, + lazy=self.lazy, + slice_=[slice(0, self._shape[0])], ) - if self.lazy: - return objectview - else: - return objectview.compute() + return objectview if self.lazy else objectview.compute() return self._get_dictionary(subpath) else: - num, ofs = slice_extract_info(slice_list[0], self.shape[0]) schema_obj = self.schema.dict_[subpath.split("/")[1]] if subpath in self.keys and ( not isinstance(schema_obj, Sequence) or len(slice_list) <= 1 @@ -425,22 +423,16 @@ def __getitem__(self, slice_): tensorview = TensorView( dataset=self, subpath=subpath, slice_=slice_list, lazy=self.lazy ) - if self.lazy: - return tensorview - else: - return tensorview.compute() + return tensorview if self.lazy else tensorview.compute() for key in self.keys: if subpath.startswith(key): objectview = ObjectView( dataset=self, subpath=subpath, - slice_list=slice_list, + slice_=slice_list, lazy=self.lazy, ) - if self.lazy: - return objectview - else: - return objectview.compute() + return objectview if self.lazy else objectview.compute() if len(slice_list) > 1: raise ValueError("You can't slice a dictionary of Tensors") return self._get_dictionary(subpath, slice_list[0]) @@ -463,26 +455,43 @@ def __setitem__(self, slice_, value): subpath, slice_list = slice_split(slice_) if not subpath: - raise ValueError("Can't assign to dataset sliced without 
key") - elif not slice_list: - if subpath in self.keys: - self._tensors[subpath][:] = assign_value # Add path check - else: - ObjectView(dataset=self, subpath=subpath)[:] = assign_value + raise ValueError("Can't assign to dataset sliced without subpath") + elif subpath not in self.keys: + raise KeyError(f"Key {subpath} not found in the dataset") + + if not slice_list: + self._tensors[subpath][:] = assign_value else: - if subpath in self.keys: - self._tensors[subpath][slice_list] = assign_value - else: - ObjectView(dataset=self, subpath=subpath, slice_list=slice_list)[ - : - ] = assign_value + self._tensors[subpath][slice_list] = assign_value + + def filter(self, dic): + """| Applies a filter to get a new datasetview that matches the dictionary provided + + Parameters + ---------- + dic: dictionary + A dictionary of key value pairs, used to filter the dataset. For nested schemas use flattened dictionary representation + i.e instead of {"abc": {"xyz" : 5}} use {"abc/xyz" : 5} + """ + indexes = self.indexes + for k, v in dic.items(): + k = k if k.startswith("/") else "/" + k + if k not in self.keys: + raise KeyError(f"Key {k} not found in the dataset") + tsv = self[k] + max_shape = tsv.dtype.max_shape + prod = _tuple_product(max_shape) + if prod > 100: + raise LargeShapeFilteringException(k) + indexes = [index for index in indexes if tsv[index].compute() == v] + return DatasetView(dataset=self, lazy=self.lazy, indexes=indexes) def resize_shape(self, size: int) -> None: """ Resize the shape of the dataset by resizing each tensor first dimension """ if size == self._shape[0]: return - self._shape = (int(size),) + self.indexes = list(range(self.shape[0])) self.meta = self._store_meta() for t in self._tensors.values(): t.resize_shape(int(size)) @@ -518,8 +527,7 @@ def to_pytorch( transform=None, inplace=True, output_type=dict, - offset=None, - num_samples=None, + indexes=None, ): """| Converts the dataset into a pytorch compatible format. 
@@ -542,18 +550,15 @@ def to_pytorch( raise ModuleNotInstalledException("torch") global torch + indexes = indexes or self.indexes + if "r" not in self.mode: self.flush() # FIXME Without this some tests in test_converters.py fails, not clear why return TorchDataset( - self, - transform, - inplace=inplace, - output_type=output_type, - offset=offset, - num_samples=num_samples, + self, transform, inplace=inplace, output_type=output_type, indexes=indexes ) - def to_tensorflow(self, offset=None, num_samples=None): + def to_tensorflow(self, indexes=None): """| Converts the dataset into a tensorflow compatible format Parameters @@ -570,11 +575,11 @@ def to_tensorflow(self, offset=None, num_samples=None): global tf - offset = 0 if offset is None else offset - num_samples = self._shape[0] if num_samples is None else num_samples + indexes = indexes or self.indexes + indexes = [indexes] if isinstance(indexes, int) else indexes def tf_gen(): - for index in range(offset, offset + num_samples): + for index in indexes: d = {} for key in self.keys: split_key = key.split("/") @@ -1144,13 +1149,7 @@ def my_transform(sample): class TorchDataset: def __init__( - self, - ds, - transform=None, - inplace=True, - output_type=dict, - num_samples=None, - offset=None, + self, ds, transform=None, inplace=True, output_type=dict, indexes=None ): self._ds = None self._url = ds.url @@ -1158,8 +1157,7 @@ def __init__( self._transform = transform self.inplace = inplace self.output_type = output_type - self.num_samples = num_samples - self.offset = offset + self.indexes = indexes self._inited = False def _do_transform(self, data): @@ -1182,7 +1180,7 @@ def _init_ds(self): def __len__(self): self._init_ds() - return self.num_samples if self.num_samples is not None else self._ds.shape[0] + return len(self.indexes) if isinstance(self.indexes, list) else 1 def _get_active_item(self, key, index): active_range = self._active_chunks_range.get(key) @@ -1198,8 +1196,13 @@ def _get_active_item(self, key, index): ] return self._active_chunks[key][index % samples_per_chunk] - def __getitem__(self, index): - index = index + self.offset if self.offset is not None else index + def __getitem__(self, ind): + if isinstance(self.indexes, int): + if ind != 0: + raise OutOfBoundsError(f"Got index {ind} for dataset of length 1") + index = self.indexes + else: + index = self.indexes[ind] self._init_ds() d = {} for key in self._ds._tensors.keys(): diff --git a/hub/api/datasetview.py b/hub/api/datasetview.py index a31815adfa..94f40ee067 100644 --- a/hub/api/datasetview.py +++ b/hub/api/datasetview.py @@ -1,23 +1,23 @@ +from hub.utils import _tuple_product +from hub.api.tensorview import TensorView import collections.abc as abc from hub.api.dataset_utils import ( create_numpy_dict, get_value, - slice_extract_info, slice_split, str_to_int, ) -from hub.exceptions import NoneValueException -from hub.schema import Sequence, Tensor, SchemaDict, Primitive, Text +from hub.exceptions import LargeShapeFilteringException, NoneValueException +from hub.api.objectview import ObjectView +from hub.schema import Sequence class DatasetView: def __init__( self, dataset=None, - num_samples: int = None, - offset: int = None, - squeeze_dim: bool = False, lazy: bool = True, + indexes=None, # list or integer ): """Creates a DatasetView object for a subset of the Dataset. 
@@ -25,27 +25,24 @@ def __init__( ---------- dataset: hub.api.dataset.Dataset object The dataset whose DatasetView is being created - num_samples: int - The number of samples in this DatasetView - offset: int - The offset from which the DatasetView starts - squeeze_dim: bool, optional - For slicing with integers we would love to remove the first dimension to make it nicer lazy: bool, optional Setting this to False will stop lazy computation and will allow items to be accessed without .compute() + indexes: optional + It can be either a list or an integer depending upon the slicing. Represents the indexes that the datasetview is representing. """ if dataset is None: raise NoneValueException("dataset") - if num_samples is None: - raise NoneValueException("num_samples") - if offset is None: - raise NoneValueException("offset") + if indexes is None: + raise NoneValueException("indexes") self.dataset = dataset - self.num_samples = num_samples - self.offset = offset - self.squeeze_dim = squeeze_dim self.lazy = lazy + self.indexes = indexes + self.is_contiguous = False + if isinstance(self.indexes, list) and self.indexes: + self.is_contiguous = self.indexes[-1] - self.indexes[0] + 1 == len( + self.indexes + ) def __getitem__(self, slice_): """| Gets a slice or slices from DatasetView @@ -56,32 +53,21 @@ def __getitem__(self, slice_): """ if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str): slice_ = [slice_] - slice_ = list(slice_) subpath, slice_list = slice_split(slice_) - - slice_list = [0] + slice_list if self.squeeze_dim else slice_list - + slice_list = [0] + slice_list if isinstance(self.indexes, int) else slice_list if not subpath: if len(slice_list) > 1: - raise ValueError( - "Can't slice a dataset with multiple slices without subpath" - ) - num, ofs = slice_extract_info(slice_list[0], self.num_samples) - return DatasetView( - dataset=self.dataset, - num_samples=num, - offset=ofs + self.offset, - squeeze_dim=isinstance(slice_list[0], int), - lazy=self.lazy, - ) + raise ValueError("Can't slice dataset with multiple slices without key") + indexes = self.indexes[slice_list[0]] + return DatasetView(dataset=self.dataset, lazy=self.lazy, indexes=indexes) elif not slice_list: slice_ = ( - slice(self.offset, self.offset + self.num_samples) - if not self.squeeze_dim - else self.offset + [slice(self.indexes[0], self.indexes[-1] + 1)] + if self.is_contiguous + else [self.indexes] ) - if subpath in self.dataset._tensors.keys(): + if subpath in self.keys: tensorview = TensorView( dataset=self.dataset, subpath=subpath, @@ -89,26 +75,27 @@ def __getitem__(self, slice_): lazy=self.lazy, ) return tensorview if self.lazy else tensorview.compute() - for key in self.dataset._tensors.keys(): + for key in self.keys: if subpath.startswith(key): objectview = ObjectView( dataset=self.dataset, subpath=subpath, - slice_list=[slice_], + slice_=slice_, lazy=self.lazy, ) return objectview if self.lazy else objectview.compute() return self._get_dictionary(self.dataset, subpath, slice=slice_) else: - num, ofs = slice_extract_info(slice_list[0], self.num_samples) - slice_list[0] = ( - ofs + self.offset - if isinstance(slice_list[0], int) - else slice(ofs + self.offset, ofs + self.offset + num) - ) + if isinstance(self.indexes, list): + indexes = self.indexes[slice_list[0]] + if self.is_contiguous and isinstance(indexes, list) and indexes: + indexes = slice(indexes[0], indexes[-1] + 1) + else: + indexes = self.indexes + slice_list[0] = indexes schema_obj = self.dataset.schema.dict_[subpath.split("/")[1]] - if 
subpath in self.dataset._tensors.keys() and ( + if subpath in self.keys and ( not isinstance(schema_obj, Sequence) or len(slice_list) <= 1 ): tensorview = TensorView( @@ -118,12 +105,12 @@ def __getitem__(self, slice_): lazy=self.lazy, ) return tensorview if self.lazy else tensorview.compute() - for key in self.dataset._tensors.keys(): + for key in self.keys: if subpath.startswith(key): objectview = ObjectView( dataset=self.dataset, subpath=subpath, - slice_list=slice_list, + slice_=slice_list, lazy=self.lazy, ) return objectview if self.lazy else objectview.compute() @@ -139,53 +126,73 @@ def __setitem__(self, slice_, value): >>> ds_view["image", 3, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8") # sets the 8th image """ assign_value = get_value(value) - # handling strings and bytes - assign_value = str_to_int(assign_value, self.dataset.tokenizer) + assign_value = str_to_int( + assign_value, self.dataset.tokenizer + ) # handling strings and bytes if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str): slice_ = [slice_] slice_ = list(slice_) subpath, slice_list = slice_split(slice_) - slice_list = [0] + slice_list if self.squeeze_dim else slice_list + slice_list = [0] + slice_list if isinstance(self.indexes, int) else slice_list + if not subpath: - raise ValueError("Can't assign to dataset sliced without subpath") - elif not slice_list: + raise ValueError("Can't assign to dataset sliced without key") + elif subpath not in self.keys: + raise KeyError(f"Key {subpath} not found in dataset") + + if not slice_list: slice_ = ( - self.offset - # if self.num_samples == 1 - if self.squeeze_dim - else slice(self.offset, self.offset + self.num_samples) + slice(self.indexes[0], self.indexes[-1] + 1) + if self.is_contiguous + else self.indexes ) - if subpath in self.dataset._tensors.keys(): - self.dataset._tensors[subpath][slice_] = assign_value # Add path check - for key in self.dataset._tensors.keys(): - if subpath.startswith(key): - ObjectView( - dataset=self.dataset, subpath=subpath, slice_list=[slice_] - )[:] = assign_value - # raise error + if not isinstance(slice_, list): + self.dataset._tensors[subpath][slice_] = assign_value + else: + for i, index in enumerate(slice_): + self.dataset._tensors[subpath][index] = assign_value[i] else: - num, ofs = ( - slice_extract_info(slice_list[0], self.num_samples) - if isinstance(slice_list[0], slice) - else (1, slice_list[0]) - ) - slice_list[0] = ( - slice(ofs + self.offset, ofs + self.offset + num) - if isinstance(slice_list[0], slice) - else ofs + self.offset - ) - # self.dataset._tensors[subpath][slice_list] = assign_value - if subpath in self.dataset._tensors.keys(): - self.dataset._tensors[subpath][ - slice_list - ] = assign_value # Add path check - return - for key in self.dataset._tensors.keys(): - if subpath.startswith(key): - ObjectView( - dataset=self.dataset, subpath=subpath, slice_list=slice_list - )[:] = assign_value + if isinstance(self.indexes, list): + indexes = self.indexes[slice_list[0]] + if self.is_contiguous and isinstance(indexes, list) and indexes: + slice_list[0] = slice(indexes[0], indexes[-1] + 1) + else: + slice_list[0] = indexes + else: + slice_list[0] = self.indexes + + if not isinstance(slice_list[0], list): + self.dataset._tensors[subpath][slice_list] = assign_value + else: + for i, index in enumerate(slice_list[0]): + current_slice = [index] + slice_list[1:] + self.dataset._tensors[subpath][current_slice] = assign_value[i] + + def filter(self, dic): + """| Applies a filter to get a new datasetview that 
matches the dictionary provided + + Parameters + ---------- + dic: dictionary + A dictionary of key value pairs, used to filter the dataset. For nested schemas use flattened dictionary representation + i.e instead of {"abc": {"xyz" : 5}} use {"abc/xyz" : 5} + """ + indexes = self.indexes + for k, v in dic.items(): + k = k if k.startswith("/") else "/" + k + if k not in self.keys: + raise KeyError(f"Key {k} not found in the dataset") + tsv = self.dataset[k] + max_shape = tsv.dtype.max_shape + prod = _tuple_product(max_shape) + if prod > 100: + raise LargeShapeFilteringException(k) + if isinstance(indexes, list): + indexes = [index for index in indexes if tsv[index].compute() == v] + else: + indexes = indexes if tsv[indexes].compute() == v else [] + return DatasetView(dataset=self.dataset, lazy=self.lazy, indexes=indexes) @property def keys(self): @@ -198,7 +205,7 @@ def _get_dictionary(self, subpath, slice_): """Gets dictionary from dataset given incomplete subpath""" tensor_dict = {} subpath = subpath if subpath.endswith("/") else subpath + "/" - for key in self.dataset._tensors.keys(): + for key in self.keys: if key.startswith(subpath): suffix_key = key[len(subpath) :] split_key = suffix_key.split("/") @@ -220,35 +227,25 @@ def _get_dictionary(self, subpath, slice_): def __iter__(self): """ Returns Iterable over samples """ - if self.squeeze_dim: - assert len(self) == 1 + if isinstance(self.indexes, int): yield self return - for i in range(len(self)): + for i in range(len(self.indexes)): yield self[i] def __len__(self): - return self.num_samples + return len(self.indexes) if isinstance(self.indexes, list) else 1 def __str__(self): - out = "DatasetView(" + str(self.dataset) + ", slice=" - out = ( - out + str(self.offset) - if self.squeeze_dim - else out + str(slice(self.offset, self.offset + self.num_samples)) - ) - out += ")" - return out + return "DatasetView(" + str(self.dataset) + ")" def __repr__(self): return self.__str__() def to_tensorflow(self): """Converts the dataset into a tensorflow compatible format""" - return self.dataset.to_tensorflow( - num_samples=self.num_samples, offset=self.offset - ) + return self.dataset.to_tensorflow(indexes=self.indexes) def to_pytorch( self, @@ -259,8 +256,7 @@ def to_pytorch( """Converts the dataset into a pytorch compatible format""" return self.dataset.to_pytorch( transform=transform, - num_samples=self.num_samples, - offset=self.offset, + indexes=self.indexes, inplace=inplace, output_type=output_type, ) @@ -274,13 +270,10 @@ def commit(self) -> None: self.dataset.commit() def numpy(self): - if self.num_samples == 1 and self.squeeze_dim: - return create_numpy_dict(self.dataset, self.offset) + if isinstance(self.indexes, int): + return create_numpy_dict(self.dataset, self.indexes) else: - return [ - create_numpy_dict(self.dataset, self.offset + i) - for i in range(self.num_samples) - ] + return [create_numpy_dict(self.dataset, index) for index in self.indexes] def disable_lazy(self): self.lazy = False @@ -290,592 +283,3 @@ def enable_lazy(self): def compute(self): return self.numpy() - - -class TensorView: - def __init__( - self, - dataset=None, - subpath=None, - slice_=None, - lazy: bool = True, - ): - """Creates a TensorView object for a particular tensor in the dataset - - Parameters - ---------- - dataset: hub.api.dataset.Dataset object - The dataset whose TensorView is being created - subpath: str - The full path to the particular Tensor in the Dataset - slice_: optional - The `slice_` of this Tensor that needs to be accessed - lazy: bool, 
optional - Setting this to False will stop lazy computation and will allow items to be accessed without .compute() - """ - - if dataset is None: - raise NoneValueException("dataset") - if subpath is None: - raise NoneValueException("subpath") - - self.dataset = dataset - self.subpath = subpath - self.lazy = lazy - - if isinstance(slice_, (int, slice)): - self.slice_ = [slice_] - elif isinstance(slice_, (tuple, list)): - self.slice_ = list(slice_) - self.nums = [] - self.offsets = [] - - self.squeeze_dims = [] - for it in self.slice_: - if isinstance(it, int): - self.nums.append(1) - self.offsets.append(it) - self.squeeze_dims.append(True) - elif isinstance(it, slice): - ofs = it.start or 0 - num = it.stop - ofs if it.stop else None - self.nums.append(num) - self.offsets.append(ofs) - self.squeeze_dims.append(False) - self.nums[0] = ( - self.dataset.shape[0] - self.offsets[0] - if self.nums[0] is None - else self.nums[0] - ) - self.dtype = self.dtype_from_path(subpath) - self.shape = self.dataset._tensors[self.subpath].get_shape(self.slice_) - - def numpy(self): - """Gets the value from tensorview""" - if isinstance(self.dtype, Text): - value = self.dataset._tensors[self.subpath][self.slice_] - if self.dataset.tokenizer is not None: - from transformers import AutoTokenizer - - tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") - if value.ndim == 1: - return tokenizer.decode(value.tolist()) - elif value.ndim == 1: - return "".join(chr(it) for it in value.tolist()) - raise ValueError("Can only access Text with integer index") - return self.dataset._tensors[self.subpath][self.slice_] - - def compute(self): - """Gets the value from tensorview""" - return self.numpy() - - def __getitem__(self, slice_): - """| Gets a slice or slices from tensorview - | Usage: - - >>> images_tensorview = ds["image"] - >>> return images_tensorview[7, 0:1920, 0:1080, 0:3].compute() # returns numpy array of 7th image - """ - if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str): - slice_ = [slice_] - slice_ = list(slice_) - slice_ = self.slice_fill(slice_) - subpath, slice_list = slice_split(slice_) - - new_nums = self.nums.copy() - new_offsets = self.offsets.copy() - if len(new_nums) < len(slice_list): - new_nums.extend([None] * (len(slice_list) - len(new_nums))) - new_offsets.extend([0] * (len(slice_list) - len(new_offsets))) - for i in range(len(slice_list)): - slice_list[i] = self._combine(slice_list[i], new_nums[i], new_offsets[i]) - for i in range(len(slice_list), len(new_nums)): - cur_slice = ( - slice(new_offsets[i], new_offsets[i] + new_nums[i]) - if new_nums[i] > 1 - else new_offsets[i] - ) - slice_list.append(cur_slice) - if subpath or ( - len(slice_list) > len(self.nums) and isinstance(self.dtype, Sequence) - ): - objectview = ObjectView( - dataset=self.dataset, - subpath=self.subpath + subpath, - slice_list=slice_list, - lazy=self.lazy, - ) - return objectview if self.lazy else objectview.compute() - else: - tensorview = TensorView( - dataset=self.dataset, - subpath=self.subpath, - slice_=slice_list, - lazy=self.lazy, - ) - return tensorview if self.lazy else tensorview.compute() - - def __setitem__(self, slice_, value): - """| Sets a slice or slices with a value - | Usage: - - >>> images_tensorview = ds["image"] - >>> images_tensorview[7, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8") # sets 7th image - """ - assign_value = get_value(value) - # handling strings and bytes - assign_value = str_to_int(assign_value, self.dataset.tokenizer) - - if not isinstance(slice_, 
abc.Iterable) or isinstance(slice_, str): - slice_ = [slice_] - slice_ = list(slice_) - slice_ = self.slice_fill(slice_) - subpath, slice_list = slice_split(slice_) - new_nums = self.nums.copy() - new_offsets = self.offsets.copy() - if len(new_nums) < len(slice_list): - new_nums.extend([None] * (len(slice_list) - len(new_nums))) - new_offsets.extend([0] * (len(slice_list) - len(new_offsets))) - for i in range(len(slice_list)): - slice_list[i] = self._combine(slice_list[i], new_nums[i], new_offsets[i]) - for i in range(len(slice_list), len(new_nums)): - cur_slice = ( - slice(new_offsets[i], new_offsets[i] + new_nums[i]) - if new_nums[i] > 1 - else new_offsets[i] - ) - slice_list.append(cur_slice) - if subpath or ( - len(slice_list) > len(self.nums) and isinstance(self.dtype, Sequence) - ): - ObjectView( - dataset=self.dataset, - subpath=self.subpath + subpath, - slice_list=slice_list, - )[:] = assign_value - else: - self.dataset._tensors[self.subpath][slice_list] = assign_value - - def _combine(self, slice_, num=None, ofs=0): - "Combines a `slice_` with the current num and offset present in tensorview" - if isinstance(slice_, int): - self.check_slice_bounds(num=num, start=slice_) - return ofs + slice_ - elif isinstance(slice_, slice): - self.check_slice_bounds( - num=num, start=slice_.start, stop=slice_.stop, step=slice_.step - ) - if slice_.start is None and slice_.stop is None: - return slice(ofs, None) if num is None else slice(ofs, ofs + num) - elif slice_.stop is None: - return ( - slice(ofs + slice_.start, None) - if num is None - else slice(ofs + slice_.start, ofs + num) - ) - elif slice_.start is None: - return slice(ofs, ofs + slice_.stop) - else: - return slice(ofs + slice_.start, ofs + slice_.stop) - else: - raise TypeError( - "type {} isn't supported in dataset slicing".format(type(slice_)) - ) - - def check_slice_bounds(self, num=None, start=None, stop=None, step=None): - "Checks whether the bounds of slice are in limits" - if step and step < 0: # negative step not supported - raise ValueError("Negative step not supported in dataset slicing") - if num and ((start and start >= num) or (stop and stop > num)): - raise IndexError( - "index out of bounds for dimension with length {}".format(num) - ) - if start and stop and start > stop: - raise IndexError("start index is greater than stop index") - - def dtype_from_path(self, path): - "Gets the dtype of the Tensorview by traversing the schema" - path = path.split("/") - cur_type = self.dataset.schema.dict_ - for subpath in path[1:-1]: - cur_type = cur_type[subpath] - cur_type = cur_type.dict_ - return cur_type[path[-1]] - - def slice_fill(self, slice_): - "Fills the slice with zeroes for the dimensions that have single elements and squeeze_dims true" - new_slice_ = [] - offset = 0 - for i, num in enumerate(self.nums): - if num == 1 and self.squeeze_dims[i]: - new_slice_.append(0) - elif offset < len(slice_): - new_slice_.append(slice_[offset]) - offset += 1 - new_slice_ += slice_[offset:] - return new_slice_ - - def __repr__(self): - return self.__str__() - - def __str__(self): - return ( - "TensorView(" - + str(self.dtype) - + ", subpath=" - + "'" - + self.subpath - + "', slice=" - + str(self.slice_) - + ")" - ) - - def set_shape(self): - if self.is_dynamic: - self.shape = [ - self.dataset._tensors[self.subpath].get_shape([i] + self.slice_[1:]) - for i in range(self.offsets[0], self.offsets[0] + self.nums[0]) - ] - if len(self.shape) == 1: - self.shape = self.shape[0] - self.shape = ( - (1,) + self.shape - if 
isinstance(self.slice_[0], slice) - else self.shape - ) - else: - self.shape = self.dataset._tensors[self.subpath].get_shape(self.slice_) - - @property - def chunksize(self): - return self.dataset._tensors[self.subpath].chunksize - - @property - def is_dynamic(self): - return self.dataset._tensors[self.subpath].is_dynamic - - def disable_lazy(self): - self.lazy = False - - def enable_lazy(self): - self.lazy = True - - -class ObjectView: - def __init__( - self, - dataset, - subpath=None, - slice_list=None, - nums=[], - offsets=[], - squeeze_dims=[], - inner_schema_obj=None, - lazy=True, - new=True, - ): - """Creates an ObjectView object for dataset from a Dataset, DatasetView or TensorView - object, or creates a different ObjectView from an existing one - - Parameters - ---------- - These parameters are used to create a new ObjectView. - dataset: hub.api.dataset.Dataset object - The dataset whose ObjectView is being created, or its DatasetView - subpath: str (optional) - A potentially incomplete path to any element in the Dataset - slice_list: optional - The `slice_` of this Tensor that needs to be accessed - lazy: bool, optional - Setting this to False will stop lazy computation and will allow items to be accessed without .compute() - - These parameters are also needed to create an ObjectView from an existing one. - nums: List[int] - Number of elements in each dimension of the ObjectView to be created - offsets: List[int] - Starting element in each dimension of the ObjectView to be created - squeeze_dims: List[bool] - Whether each dimension can be squeezed or not - inner_schema_obj: Child of hub.schema.Tensor or hub.schema.SchemaDict - The deepest element in the schema upto which the previous ObjectView had been processed - - new: bool - Whether to create a new ObjectView object from a Dataset, DatasetView or TensorView - or create a different ObjectView from an existing one - """ - self.dataset = dataset - self.schema = ( - dataset.schema.dict_ - if not isinstance(dataset, DatasetView) - else dataset.dataset.schema.dict_ - ) - self.subpath = subpath - - self.nums = nums - self.offsets = offsets - self.squeeze_dims = squeeze_dims - - self.inner_schema_obj = inner_schema_obj - self.lazy = lazy - - if new: - # Creating new obj - if self.subpath: - ( - self.inner_schema_obj, - self.nums, - self.offsets, - self.squeeze_dims, - ) = self.process_path( - self.subpath, - self.inner_schema_obj, - self.nums.copy(), - self.offsets.copy(), - self.squeeze_dims.copy(), - ) - # Check if dataset view needs to be made - if slice_list and len(slice_list) >= 1: - num, ofs = slice_extract_info(slice_list[0], dataset.shape[0]) - self.dataset = DatasetView( - dataset, num, ofs, isinstance(slice_list[0], int) - ) - - if slice_list and len(slice_list) > 1: - slice_list = slice_list[1:] - if len(slice_list) > len(self.nums): - raise IndexError("Too many indices") - for i, it in enumerate(slice_list): - num, ofs = slice_extract_info(it, self.nums[i]) - self.nums[i] = num - self.offsets[i] += ofs - self.squeeze_dims[i] = num == 1 - - def num_process(self, schema_obj, nums, offsets, squeeze_dims): - """Determines the maximum number of elements in each discovered dimension""" - if isinstance(schema_obj, SchemaDict): - return - elif isinstance(schema_obj, Sequence): - nums.append(0) - offsets.append(0) - squeeze_dims.append(False) - if isinstance(schema_obj.dtype, Tensor): - self.num_process(schema_obj.dtype, nums, offsets, squeeze_dims) - else: - for dim in schema_obj.max_shape: - nums.append(dim) - offsets.append(0) 
- squeeze_dims.append(False) - if not isinstance(schema_obj.dtype, Primitive) and not isinstance( - schema_obj, Sequence - ): - raise ValueError("Only sequences can be nested") - - def process_path(self, subpath, inner_schema_obj, nums, offsets, squeeze_dims): - """Checks if a subpath is valid or not. Does not repeat computation done in a - previous ObjectView object""" - paths = subpath.split("/")[1:] - try: - # If key is invalid raises KeyError - # If schema object is not subscriptable raises AttributeError - if inner_schema_obj: - if isinstance(inner_schema_obj, Sequence): - schema_obj = inner_schema_obj.dtype.dict_[paths[0]] - elif isinstance(inner_schema_obj, SchemaDict): - schema_obj = inner_schema_obj.dict_[paths[0]] - else: - raise KeyError() - else: - schema_obj = self.schema[paths[0]] - except (KeyError, AttributeError): - raise KeyError(f"{paths[0]} is an invalid key") - self.num_process(schema_obj, nums, offsets, squeeze_dims) - for path in paths[1:]: - try: - if isinstance(schema_obj, Sequence): - schema_obj = schema_obj.dtype.dict_[path] - elif isinstance(schema_obj, SchemaDict): - schema_obj = schema_obj.dict_[path] - else: - raise KeyError() - self.num_process(schema_obj, nums, offsets, squeeze_dims) - except (KeyError, AttributeError): - raise KeyError(f"{path} is an invalid key") - return schema_obj, nums, offsets, squeeze_dims - - def __getitem__(self, slice_): - """| Gets a slice from an objectview""" - if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str): - slice_ = [slice_] - slice_ = list(slice_) - subpath, slice_list = slice_split(slice_) - - dataset = self.dataset - nums, offsets, squeeze_dims, inner_schema_obj = ( - self.nums.copy(), - self.offsets.copy(), - self.squeeze_dims.copy(), - self.inner_schema_obj, - ) - - if subpath: - inner_schema_obj, nums, offsets, squeeze_dims = self.process_path( - subpath, inner_schema_obj, nums, offsets, squeeze_dims - ) - subpath = self.subpath + subpath - if len(slice_list) >= 1: - # Slice first dim - if isinstance(self.dataset, DatasetView) and not self.dataset.squeeze_dim: - dataset = self.dataset[slice_list[0]] - slice_list = slice_list[1:] - elif not isinstance(self.dataset, DatasetView): - num, ofs = slice_extract_info(slice_list[0], self.dataset.shape[0]) - dataset = DatasetView( - self.dataset, num, ofs, isinstance(slice_list[0], int) - ) - slice_list = slice_list[1:] - - # Expand slice list for rest of dims - if len(slice_list) >= 1: - exp_slice_list = [] - for squeeze in squeeze_dims: - if squeeze: - exp_slice_list += [None] - else: - if len(slice_list) > 0: - exp_slice_list += [slice_list.pop(0)] - else: - # slice list smaller than max - exp_slice_list += [None] - if len(slice_list) > 0: - # slice list longer than max - raise IndexError("Too many indices") - for i, it in enumerate(exp_slice_list): - if it is not None: - num, ofs = slice_extract_info(it, nums[i]) - nums[i] = num - offsets[i] += ofs - squeeze_dims[i] = num == 1 - objectview = ObjectView( - dataset=dataset, - subpath=subpath, - slice_list=None, - nums=nums, - offsets=offsets, - squeeze_dims=squeeze_dims, - inner_schema_obj=inner_schema_obj, - lazy=self.lazy, - new=False, - ) - return objectview if self.lazy else objectview.compute() - - def numpy(self): - """Gets the value from the objectview""" - if not isinstance(self.dataset, DatasetView): - # subpath present but no slice done - if len(self.subpath.split("/")[1:]) > 1: - raise IndexError("Can only go deeper on single datapoint") - if not self.dataset.squeeze_dim: - # return a combined 
tensor for multiple datapoints - # only possible if the field has a fixed size - paths = self.subpath.split("/")[1:] - if len(paths) > 1: - raise IndexError("Can only go deeper on single datapoint") - else: - # single datapoint - paths = self.subpath.split("/")[1:] - schema = self.schema[paths[0]] - slice_ = [ - ofs if sq else slice(ofs, ofs + num) if num else slice(None, None) - for ofs, num, sq in zip(self.offsets, self.nums, self.squeeze_dims) - ] - if isinstance(schema, Sequence): - if isinstance(schema.dtype, SchemaDict): - # if sequence of dict, have to fetch everything - value = self.dataset[paths[0]].compute() - for path in paths[1:]: - value = value[path] - try: - return value[tuple(slice_)] - except TypeError: - # raise error - return value - except KeyError: - raise KeyError("Invalid slice") - else: - # sequence of tensors - return self.dataset[paths[0]].compute()[tuple(slice_)] - - def compute(self): - return self.numpy() - - def __setitem__(self, slice_, value): - """| Sets a slice of the objectview with a value""" - if isinstance(slice_, slice) and (slice_.start is None and slice_.stop is None): - objview = self - else: - objview = self.__getitem__(slice_) - assign_value = get_value(value) - - if not isinstance(objview.dataset, DatasetView): - # subpath present but no slice done - assign_value = str_to_int(assign_value, objview.dataset.tokenizer) - if len(objview.subpath.split("/")[1:]) > 1: - raise IndexError("Can only go deeper on single datapoint") - if not objview.dataset.squeeze_dim: - # assign a combined tensor for multiple datapoints - # only possible if the field has a fixed size - assign_value = str_to_int(assign_value, objview.dataset.dataset.tokenizer) - paths = objview.subpath.split("/")[1:] - if len(paths) > 1: - raise IndexError("Can only go deeper on single datapoint") - else: - # single datapoint - def assign(paths, value): - # helper function for recursive assign - if len(paths) > 0: - path = paths.pop(0) - value[path] = assign(paths, value[path]) - return value - try: - value[tuple(slice_)] = assign_value - except TypeError: - value = assign_value - return value - - assign_value = str_to_int(assign_value, objview.dataset.dataset.tokenizer) - paths = objview.subpath.split("/")[1:] - schema = objview.schema[paths[0]] - slice_ = [ - of if sq else slice(of, of + num) if num else slice(None, None) - for num, of, sq in zip( - objview.nums, objview.offsets, objview.squeeze_dims - ) - ] - if isinstance(schema, Sequence): - if isinstance(schema.dtype, SchemaDict): - # if sequence of dict, have to fetch everything - value = objview.dataset[paths[0]].compute() - value = assign(paths[1:], value) - objview.dataset[paths[0]] = value - else: - # sequence of tensors - value = objview.dataset[paths[0]].compute() - value[tuple(slice_)] = assign_value - objview.dataset[paths[0]] = value - - def __str__(self): - if isinstance(self.dataset, DatasetView): - slice_ = [ - self.dataset.offset - if self.dataset.squeeze_dim - else slice( - self.dataset.offset, self.dataset.offset + self.dataset.num_samples - ) - ] - else: - slice_ = [slice(None, None)] - slice_ += [ - ofs if sq else slice(ofs, ofs + num) if num else slice(None, None) - for ofs, num, sq in zip(self.offsets, self.nums, self.squeeze_dims) - ] - return f"ObjectView(subpath='{self.subpath}', slice={str(slice_)})" diff --git a/hub/api/objectview.py b/hub/api/objectview.py new file mode 100644 index 0000000000..234ecace62 --- /dev/null +++ b/hub/api/objectview.py @@ -0,0 +1,270 @@ +from hub.schema import Sequence, Tensor, 
SchemaDict, Primitive +from hub.api.dataset_utils import slice_extract_info, slice_split + +import collections.abc as abc + + +class ObjectView: + def __init__( + self, + dataset, + subpath=None, + slice_=None, + indexes=None, + nums=[], + offsets=[], + squeeze_dims=[], + inner_schema_obj=None, + lazy=True, + check_bounds=True, + ): + """Creates an ObjectView object for dataset from a Dataset, DatasetView or TensorView + object, or creates a different ObjectView from an existing one + + Parameters + ---------- + These parameters are used to create a new ObjectView. + dataset: hub.api.dataset.Dataset object + The dataset whose ObjectView is being created, or its DatasetView + subpath: str (optional) + A potentially incomplete path to any element in the Dataset + slice_list: optional + The `slice_` of this Tensor that needs to be accessed + lazy: bool, optional + Setting this to False will stop lazy computation and will allow items to be accessed without .compute() + + These parameters are also needed to create an ObjectView from an existing one. + nums: List[int] + Number of elements in each dimension of the ObjectView to be created + offsets: List[int] + Starting element in each dimension of the ObjectView to be created + squeeze_dims: List[bool] + Whether each dimension can be squeezed or not + inner_schema_obj: Child of hub.schema.Tensor or hub.schema.SchemaDict + The deepest element in the schema upto which the previous ObjectView had been processed + check_bounds: bool + Whether to create a new ObjectView object from a Dataset, DatasetView or TensorView + or create a different ObjectView from an existing one + """ + self.dataset = dataset + self.schema = dataset.schema.dict_ + self.subpath = subpath + + self.nums = nums + self.offsets = offsets + self.squeeze_dims = squeeze_dims + + self.inner_schema_obj = inner_schema_obj + self.lazy = lazy + + if check_bounds: + if self.subpath: + ( + self.inner_schema_obj, + self.nums, + self.offsets, + self.squeeze_dims, + ) = self.process_path( + self.subpath, + self.inner_schema_obj, + self.nums.copy(), + self.offsets.copy(), + self.squeeze_dims.copy(), + ) + if slice_ and len(slice_) >= 1: + self.indexes = slice_[0] + self.is_contiguous = False + if isinstance(self.indexes, list) and self.indexes: + self.is_contiguous = self.indexes[-1] - self.indexes[0] + 1 == len( + self.indexes + ) + slice_ = slice_[1:] + if len(slice_) > len(self.nums): + raise IndexError("Too many indices") + for i, it in enumerate(slice_): + num, ofs = slice_extract_info(it, self.nums[i]) + self.nums[i] = num + self.offsets[i] += ofs + self.squeeze_dims[i] = num == 1 + else: + self.indexes = indexes + self.is_contiguous = False + if isinstance(self.indexes, list) and self.indexes: + self.is_contiguous = self.indexes[-1] - self.indexes[0] + 1 == len( + self.indexes + ) + + def num_process(self, schema_obj, nums, offsets, squeeze_dims): + """Determines the maximum number of elements in each discovered dimension""" + if isinstance(schema_obj, SchemaDict): + return + elif isinstance(schema_obj, Sequence): + nums.append(0) + offsets.append(0) + squeeze_dims.append(False) + if isinstance(schema_obj.dtype, Tensor): + self.num_process(schema_obj.dtype, nums, offsets, squeeze_dims) + else: + for dim in schema_obj.max_shape: + nums.append(dim) + offsets.append(0) + squeeze_dims.append(False) + if not isinstance(schema_obj.dtype, Primitive) and not isinstance( + schema_obj, Sequence + ): + raise ValueError("Only sequences can be nested") + + def process_path(self, subpath, 
inner_schema_obj, nums, offsets, squeeze_dims): + """Checks if a subpath is valid or not. Does not repeat computation done in a previous ObjectView object""" + paths = subpath.split("/")[1:] + try: + # If key is invalid raises KeyError + # If schema object is not subscriptable raises AttributeError + if inner_schema_obj: + if isinstance(inner_schema_obj, Sequence): + schema_obj = inner_schema_obj.dtype.dict_[paths[0]] + elif isinstance(inner_schema_obj, SchemaDict): + schema_obj = inner_schema_obj.dict_[paths[0]] + else: + raise KeyError() + else: + schema_obj = self.schema[paths[0]] + except (KeyError, AttributeError): + raise KeyError(f"{paths[0]} is an invalid key") + self.num_process(schema_obj, nums, offsets, squeeze_dims) + for path in paths[1:]: + try: + if isinstance(schema_obj, Sequence): + schema_obj = schema_obj.dtype.dict_[path] + elif isinstance(schema_obj, SchemaDict): + schema_obj = schema_obj.dict_[path] + else: + raise KeyError() + self.num_process(schema_obj, nums, offsets, squeeze_dims) + except (KeyError, AttributeError): + raise KeyError(f"{path} is an invalid key") + return schema_obj, nums, offsets, squeeze_dims + + def __getitem__(self, slice_): + """| Gets a slice from an objectview""" + if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str): + slice_ = [slice_] + slice_ = list(slice_) + subpath, slice_list = slice_split(slice_) + + nums, offsets, squeeze_dims, inner_schema_obj = ( + self.nums.copy(), + self.offsets.copy(), + self.squeeze_dims.copy(), + self.inner_schema_obj, + ) + + if subpath: + inner_schema_obj, nums, offsets, squeeze_dims = self.process_path( + subpath, inner_schema_obj, nums, offsets, squeeze_dims + ) + subpath = self.subpath + subpath + + new_indexes = self.indexes + if len(slice_list) >= 1: + if isinstance(self.indexes, list): + new_indexes = self.indexes[slice_list[0]] + if self.is_contiguous and new_indexes: + new_indexes = slice(new_indexes[0], new_indexes[-1] + 1) + slice_list = slice_list[1:] + elif isinstance(self.indexes, slice): + ofs = self.indexes.start or 0 + num = self.indexes.stop - ofs if self.indexes.stop else None + num, ofs_temp = slice_extract_info(slice_list[0], num) + new_indexes = ( + ofs + ofs_temp + if isinstance(slice_list[0], int) + else slice(ofs + ofs_temp, ofs + ofs_temp + num) + ) + slice_list = slice_list[1:] + + if len(slice_list) >= 1: + # Expand slice list + exp_slice_list = [] + for squeeze in squeeze_dims: + if squeeze: + exp_slice_list += [None] + else: + if len(slice_list) > 0: + exp_slice_list += [slice_list.pop(0)] + else: + # slice list smaller than max + exp_slice_list += [None] + if len(slice_list) > 0: + # slice list longer than max + raise IndexError("Too many indices") + for i, it in enumerate(exp_slice_list): + if it is not None: + num, ofs = slice_extract_info(it, nums[i]) + nums[i] = num + offsets[i] += ofs + squeeze_dims[i] = isinstance(it, int) + + objectview = ObjectView( + dataset=self.dataset, + subpath=subpath, + slice_=None, + indexes=new_indexes, + nums=nums, + offsets=offsets, + squeeze_dims=squeeze_dims, + inner_schema_obj=inner_schema_obj, + lazy=self.lazy, + check_bounds=False, + ) + return objectview if self.lazy else objectview.compute() + + def numpy(self): + """Gets the value from the objectview""" + if isinstance(self.indexes, list): + if len(self.indexes) > 1: + raise IndexError("Can only go deeper on single datapoint") + else: + slice_0 = self.indexes[0] + elif isinstance(self.indexes, slice): + if self.indexes.stop - self.indexes.start > 1: + raise 
IndexError("Can only go deeper on single datapoint") + slice_0 = self.indexes.start + else: + slice_0 = self.indexes + # single datapoint + paths = self.subpath.split("/")[1:] + schema = self.schema[paths[0]] + slice_ = [ + ofs if sq else slice(ofs, ofs + num) if num else slice(None, None) + for ofs, num, sq in zip(self.offsets, self.nums, self.squeeze_dims) + ] + if isinstance(schema, Sequence): + if isinstance(schema.dtype, SchemaDict): + # if sequence of dict, have to fetch everything + lazy = self.dataset.lazy + self.dataset.lazy = False + value = self.dataset[[paths[0], slice_0]] + self.dataset.lazy = lazy + for path in paths[1:]: + value = value[path] + try: + return value[tuple(slice_)] + except TypeError: + # raise error + return value + except KeyError: + raise KeyError("Invalid slice") + else: + # sequence of tensors + return self.dataset[[paths[0], slice_0]].compute()[tuple(slice_)] + + def compute(self): + return self.numpy() + + def __str__(self): + slice_ = [ + ofs if sq else slice(ofs, ofs + num) if num else slice(None, None) + for ofs, num, sq in zip(self.offsets, self.nums, self.squeeze_dims) + ] + return f"ObjectView(subpath='{self.subpath}', indexes={str(self.indexes)}, slice={str(slice_)})" diff --git a/hub/api/tensorview.py b/hub/api/tensorview.py new file mode 100644 index 0000000000..c904cf6e65 --- /dev/null +++ b/hub/api/tensorview.py @@ -0,0 +1,309 @@ +import numpy as np +import hub +import collections.abc as abc +from hub.api.dataset_utils import get_value, slice_split, str_to_int +from hub.exceptions import NoneValueException +import hub.api.objectview as objv + + +class TensorView: + def __init__( + self, + dataset=None, + subpath=None, + slice_=None, + lazy: bool = True, + ): + """Creates a TensorView object for a particular tensor in the dataset + + Parameters + ---------- + dataset: hub.api.dataset.Dataset object + The dataset whose TensorView is being created + subpath: str + The full path to the particular Tensor in the Dataset + slice_: optional + The `slice_` of this Tensor that needs to be accessed + lazy: bool, optional + Setting this to False will stop lazy computation and will allow items to be accessed without .compute() + """ + + if dataset is None: + raise NoneValueException("dataset") + if subpath is None: + raise NoneValueException("subpath") + + self.dataset = dataset + self.subpath = subpath + self.lazy = lazy + + if isinstance(slice_, (int, slice)): + self.slice_ = [slice_] + elif isinstance(slice_, (tuple, list)): + self.slice_ = list(slice_) + self.nums = [None] + self.offsets = [None] + self.squeeze_dims = [False] + self.indexes = self.slice_[0] # int, slice or list + self.is_contiguous = False + if isinstance(self.indexes, list) and self.indexes: + self.is_contiguous = self.indexes[-1] - self.indexes[0] + 1 == len( + self.indexes + ) + for it in self.slice_[1:]: + if isinstance(it, int): + self.nums.append(1) + self.offsets.append(it) + self.squeeze_dims.append(True) + elif isinstance(it, slice): + ofs = it.start or 0 + num = it.stop - ofs if it.stop else None + self.nums.append(num) + self.offsets.append(ofs) + self.squeeze_dims.append(False) + self.dtype = self.dtype_from_path(subpath) + self.shape = self.dataset._tensors[self.subpath].get_shape(self.slice_) + + def numpy(self): + """Gets the value from tensorview""" + if isinstance(self.indexes, list): + if ( + len(self.indexes) > 1 + and self.dataset._tensors[self.subpath]._dynamic_tensor + ): + raise ValueError( + "Getting item across multiitem slices is not supported for tensors with 
dynamic shapes, access them item by item" + ) + value = np.array( + [ + self.dataset._tensors[self.subpath][[index] + self.slice_[1:]] + for index in self.indexes + ] + ) + else: + value = self.dataset._tensors[self.subpath][self.slice_] + + if isinstance(self.dtype, hub.schema.text.Text): + if self.dataset.tokenizer is not None: + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + if value.ndim == 1: + return tokenizer.decode(value.tolist()) + elif value.ndim == 2: + return [tokenizer.decode(val.tolist()) for val in value] + elif value.ndim == 1: + return "".join(chr(it) for it in value.tolist()) + elif value.ndim == 2: + return ["".join(chr(it) for it in val.tolist()) for val in value] + raise ValueError(f"Unexpected value with shape for text {value.shape}") + return value + + def compute(self): + """Gets the value from tensorview""" + return self.numpy() + + def __getitem__(self, slice_): + """| Gets a slice or slices from tensorview + | Usage: + + >>> images_tensorview = ds["image"] + >>> return images_tensorview[7, 0:1920, 0:1080, 0:3].compute() # returns numpy array of 7th image + """ + if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str): + slice_ = [slice_] + slice_ = list(slice_) + slice_ = self.slice_fill(slice_) + subpath, slice_list = slice_split(slice_) + new_nums = self.nums.copy() + new_offsets = self.offsets.copy() + if isinstance(self.indexes, list): + new_indexes = self.indexes[slice_list[0]] + if self.is_contiguous and new_indexes: + new_indexes = slice(new_indexes[0], new_indexes[-1] + 1) + elif isinstance(self.indexes, int): + new_indexes = self.indexes + else: + ofs = self.indexes.start or 0 + num = self.indexes.stop - ofs if self.indexes.stop else None + new_indexes = self._combine(slice_list[0], num, ofs) + slice_list[0] = new_indexes + # new_indexes = self.indexes[] + if len(new_nums) < len(slice_list): + new_nums.extend([None] * (len(slice_list) - len(new_nums))) + new_offsets.extend([0] * (len(slice_list) - len(new_offsets))) + for i in range(1, len(slice_list)): + slice_list[i] = self._combine(slice_list[i], new_nums[i], new_offsets[i]) + for i in range(len(slice_list), len(new_nums)): + cur_slice = ( + slice(new_offsets[i], new_offsets[i] + new_nums[i]) + if not self.squeeze_dims[i] + else new_offsets[i] + ) + slice_list.append(cur_slice) + if subpath or ( + len(slice_list) > len(self.nums) and isinstance(self.dtype, objv.Sequence) + ): + objectview = objv.ObjectView( + dataset=self.dataset, + subpath=self.subpath + subpath, + slice_=slice_list, + lazy=self.lazy, + ) + return objectview if self.lazy else objectview.compute() + else: + tensorview = TensorView( + dataset=self.dataset, + subpath=self.subpath, + slice_=slice_list, + lazy=self.lazy, + ) + return tensorview if self.lazy else tensorview.compute() + + def __setitem__(self, slice_, value): + """| Sets a slice or slices with a value + | Usage: + + >>> images_tensorview = ds["image"] + >>> images_tensorview[7, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8") # sets 7th image + """ + assign_value = get_value(value) + # handling strings and bytes + assign_value = str_to_int(assign_value, self.dataset.tokenizer) + + if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str): + slice_ = [slice_] + slice_ = list(slice_) + slice_ = self.slice_fill(slice_) + subpath, slice_list = slice_split(slice_) + if subpath: + raise ValueError("Can't setitem of TensorView with subpath") + new_nums = self.nums.copy() + new_offsets = 
self.offsets.copy() + if isinstance(self.indexes, list): + new_indexes = self.indexes[slice_list[0]] + if self.is_contiguous and new_indexes: + new_indexes = slice(new_indexes[0], new_indexes[-1] + 1) + elif isinstance(self.indexes, int): + new_indexes = self.indexes + else: + ofs = self.indexes.start or 0 + num = self.indexes.stop - ofs if self.indexes.stop else None + new_indexes = self._combine(slice_list[0], num, ofs) + slice_list[0] = new_indexes + if len(new_nums) < len(slice_list): + new_nums.extend([None] * (len(slice_list) - len(new_nums))) + new_offsets.extend([0] * (len(slice_list) - len(new_offsets))) + for i in range(1, len(slice_list)): + slice_list[i] = self._combine(slice_list[i], new_nums[i], new_offsets[i]) + for i in range(len(slice_list), len(new_nums)): + cur_slice = ( + slice(new_offsets[i], new_offsets[i] + new_nums[i]) + if not self.squeeze_dims[i] + else new_offsets[i] + ) + slice_list.append(cur_slice) + + if isinstance(slice_list[0], (int, slice)): + self.dataset._tensors[self.subpath][slice_list] = assign_value + else: + for i, index in enumerate(slice_list[0]): + current_slice = [index] + slice_list[1:] + self.dataset._tensors[subpath][current_slice] = assign_value[i] + + def _combine(self, slice_, num=None, ofs=0): + "Combines a `slice_` with the current num and offset present in tensorview" + if isinstance(slice_, int): + self.check_slice_bounds(num=num, start=slice_) + return ofs + slice_ + elif isinstance(slice_, slice): + self.check_slice_bounds( + num=num, start=slice_.start, stop=slice_.stop, step=slice_.step + ) + if slice_.start is None and slice_.stop is None: + return slice(ofs, None) if num is None else slice(ofs, ofs + num) + elif slice_.stop is None: + return ( + slice(ofs + slice_.start, None) + if num is None + else slice(ofs + slice_.start, ofs + num) + ) + elif slice_.start is None: + return slice(ofs, ofs + slice_.stop) + else: + return slice(ofs + slice_.start, ofs + slice_.stop) + else: + raise TypeError( + "type {} isn't supported in dataset slicing".format(type(slice_)) + ) + + def check_slice_bounds(self, num=None, start=None, stop=None, step=None): + "Checks whether the bounds of slice are in limits" + if step and step < 0: # negative step not supported + raise ValueError("Negative step not supported in dataset slicing") + if num and ((start and start >= num) or (stop and stop > num)): + raise IndexError( + "index out of bounds for dimension with length {}".format(num) + ) + if start and stop and start > stop: + raise IndexError("start index is greater than stop index") + + def dtype_from_path(self, path): + "Gets the dtype of the Tensorview by traversing the schema" + path = path.split("/") + cur_type = self.dataset.schema.dict_ + for subpath in path[1:-1]: + cur_type = cur_type[subpath] + cur_type = cur_type.dict_ + return cur_type[path[-1]] + + def slice_fill(self, slice_): + "Fills the slice with zeroes for the dimensions that have single elements and squeeze_dims true" + new_slice_ = [slice_[0]] + offset = 1 + for i in range(1, len(self.nums)): + if self.squeeze_dims[i]: + new_slice_.append(0) + elif offset < len(slice_): + new_slice_.append(slice_[offset]) + offset += 1 + new_slice_ += slice_[offset:] + return new_slice_ + + def __repr__(self): + return self.__str__() + + def __str__(self): + return ( + "TensorView(" + + str(self.dtype) + + ", subpath=" + + "'" + + self.subpath + + "', slice=" + + str(self.slice_) + + ")" + ) + + def __iter__(self): + """ Returns Iterable over samples """ + if isinstance(self.indexes, int): + 
yield self + return + + for i in range(len(self.indexes)): + yield self[i] + + @property + def chunksize(self): + return self.dataset._tensors[self.subpath].chunksize + + @property + def is_dynamic(self): + return self.dataset._tensors[self.subpath].is_dynamic + + def disable_lazy(self): + self.lazy = False + + def enable_lazy(self): + self.lazy = True diff --git a/hub/api/tests/test_dataset.py b/hub/api/tests/test_dataset.py index 679fd356f7..a49b2f40db 100644 --- a/hub/api/tests/test_dataset.py +++ b/hub/api/tests/test_dataset.py @@ -1,14 +1,16 @@ +from hub.api.dataset_utils import slice_extract_info, slice_split +from hub.schema.class_label import ClassLabel import os import shutil import cloudpickle import pickle from hub.cli.auth import login_fn -from hub.exceptions import HubException +from hub.exceptions import HubException, LargeShapeFilteringException import numpy as np import pytest from hub import transform import hub.api.dataset as dataset -from hub.schema import Tensor, Text, Image +from hub.schema import Tensor, Text, Image, Sequence, BBox, SchemaDict from hub.utils import ( gcp_creds_exist, hub_creds_exist, @@ -30,7 +32,7 @@ } -def test_dataset2(): +def test_dataset_2(): dt = {"first": "float", "second": "float"} ds = Dataset(schema=dt, shape=(2,), url="./data/test/test_dataset2", mode="w") ds.meta_information["description"] = "This is my description" @@ -385,20 +387,36 @@ def test_datasetview_get_dictionary(): ds["label", 5, "a"] = 5 * np.ones((100, 200)) ds["label", 5, "d", "e"] = 3 * np.ones((5, 3)) dsv = ds[2:10] + dsv.disable_lazy() dic = dsv[3, "label"] - assert (dic["a"].compute() == 5 * np.ones((100, 200))).all() - assert (dic["d"]["e"].compute() == 3 * np.ones((5, 3))).all() + assert (dic["a"] == 5 * np.ones((100, 200))).all() + assert (dic["d"]["e"] == 3 * np.ones((5, 3))).all() + dsv.enable_lazy() def test_tensorview_slicing(): dt = {"first": Tensor(shape=(None, None), max_shape=(250, 300))} ds = Dataset(schema=dt, shape=(20,), url="./data/test/tensorivew_slicing", mode="w") tv = ds["first", 5:6, 7:10, 9:10] - assert tv.numpy().shape == tuple(tv.shape) == (1, 3, 1) + tv.disable_lazy() + tv.enable_lazy() + assert tv.compute().shape == tuple(tv.shape) == (1, 3, 1) tv2 = ds["first", 5:6, 7:10, 9] assert tv2.numpy().shape == tuple(tv2.shape) == (1, 3) +def test_tensorview_iter(): + schema = {"abc": "int32"} + ds = Dataset( + schema=schema, shape=(20,), url="./data/test/tensorivew_slicing", mode="w" + ) + for i in range(20): + ds["abc", i] = i + tv = ds["abc", 3] + for item in tv: + assert item.compute() == 3 + + def test_text_dataset(): schema = { "names": Text(shape=(None,), max_shape=(1000,), dtype="int64"), @@ -415,6 +433,13 @@ def test_text_dataset(): dsv["names"][1] = text + "8" assert dsv["names"][1].numpy() == text + "8" + schema2 = { + "id": Text(shape=(4,), dtype="int64"), + } + ds2 = Dataset("./data/test/testing_text_2", mode="w", schema=schema2, shape=(10,)) + ds2[0:5, "id"] = ["abcd", "efgh", "ijkl", "mnop", "qrst"] + assert ds2[2:4, "id"].compute() == ["ijkl", "mnop"] + @pytest.mark.skipif( not transformers_loaded(), reason="requires transformers to be loaded" @@ -437,6 +462,19 @@ def test_text_dataset_tokenizer(): dsv["names"][1] = text + " 8" assert dsv["names"][1].numpy() == text + " 8" + schema2 = { + "id": Text(shape=(4,), dtype="int64"), + } + ds2 = Dataset( + "./data/test/testing_text_2", + mode="w", + schema=schema2, + shape=(10,), + tokenizer=True, + ) + ds2[0:5, "id"] = ["abcd", "abcd", "abcd", "abcd", "abcd"] + assert ds2[2:4, "id"].compute() 
== ["abcd", "abcd"] + def test_append_dataset(): dt = {"first": Tensor(shape=(250, 300)), "second": "float"} @@ -584,10 +622,47 @@ def test_datasetview_repr(): url = "./data/test/dsv_repr" ds = Dataset(schema=dt, shape=(9,), url=url, mode="w", lazy=False) dsv = ds[2:] - print_text = "DatasetView(Dataset(schema=SchemaDict({'first': Tensor(shape=(2,), dtype='float64'), 'second': 'float64', 'text': Text(shape=(None,), dtype='int64', max_shape=(12,))})url='./data/test/dsv_repr', shape=(9,), mode='w'), slice=slice(2, 9, None))" + print_text = "DatasetView(Dataset(schema=SchemaDict({'first': Tensor(shape=(2,), dtype='float64'), 'second': 'float64', 'text': Text(shape=(None,), dtype='int64', max_shape=(12,))})url='./data/test/dsv_repr', shape=(9,), mode='w'))" assert dsv.__repr__() == print_text +def test_datasetview_2(): + dt = { + "first": Tensor(shape=(2,)), + "second": "float", + "text": Text(shape=(None,), max_shape=(12,)), + } + ds = Dataset("./data/test/dsv_2/", schema=dt, shape=(9,), mode="w") + dsv = ds[2:] + with pytest.raises(ValueError): + dsv[3] = np.ones((3, 5)) + + with pytest.raises(KeyError): + dsv["abc"] = np.ones((3, 5)) + dsv["second"] = np.array([0, 1, 2, 3, 4, 5, 6]) + for i in range(7): + assert dsv[i, "second"].compute() == i + + +def test_dataset_3(): + dt = { + "first": Tensor(shape=(2,)), + "second": "float", + "text": Text(shape=(None,), max_shape=(12,)), + } + ds = Dataset("./data/test/ds_3/", schema=dt, shape=(9,), mode="w") + with pytest.raises(ValueError): + ds[3, 8] = np.ones((3, 5)) + + with pytest.raises(KeyError): + ds["abc"] = np.ones((3, 5)) + ds["second"] = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8]) + for i in range(9): + assert ds[i, "second"].compute() == i + with pytest.raises(ValueError): + ds[3, 8].compute() + + def test_dataset_casting(): my_schema = { "a": Tensor(shape=(1,), dtype="float64"), @@ -658,6 +733,127 @@ def test_dataset_assign_value(): assert ds["text", 6].compute() == "YGFJN75NF" +def test_dataset_filtering(): + my_schema = { + "fname": Text((None,), max_shape=(10,)), + "lname": Text((None,), max_shape=(10,)), + } + ds = Dataset("./test/filtering", shape=(100,), schema=my_schema, mode="w") + for i in range(100): + ds["fname", i] = "John" + ds["lname", i] = "Doe" + + for i in [1, 3, 6, 15, 63, 96, 75]: + ds["fname", i] = "Active" + + for i in [15, 31, 25, 75, 3, 6]: + ds["lname", i] = "loop" + + dsv_combined = ds.filter({"fname": "Active", "lname": "loop"}) + tsv_combined_fname = dsv_combined["fname"] + tsv_combined_lname = dsv_combined["lname"] + for item in dsv_combined: + assert item.compute() == {"fname": "Active", "lname": "loop"} + for item in tsv_combined_fname: + assert item.compute() == "Active" + for item in tsv_combined_lname: + assert item.compute() == "loop" + dsv_1 = ds.filter({"fname": "Active"}) + dsv_2 = dsv_1.filter({"lname": "loop"}) + for item in dsv_1: + assert item.compute()["fname"] == "Active" + tsv_1 = dsv_1["fname"] + tsv_2 = dsv_2["lname"] + for item in tsv_1: + assert item.compute() == "Active" + for item in tsv_2: + assert item.compute() == "loop" + for item in dsv_2: + assert item.compute() == {"fname": "Active", "lname": "loop"} + assert dsv_combined.indexes == [3, 6, 15, 75] + assert dsv_1.indexes == [1, 3, 6, 15, 63, 75, 96] + assert dsv_2.indexes == [3, 6, 15, 75] + + dsv_3 = ds.filter({"lname": "loop"}) + dsv_4 = dsv_3.filter({"fname": "Active"}) + for item in dsv_3: + assert item.compute()["lname"] == "loop" + for item in dsv_4: + assert item.compute() == {"fname": "Active", "lname": "loop"} + assert 
dsv_3.indexes == [3, 6, 15, 25, 31, 75] + assert dsv_4.indexes == [3, 6, 15, 75] + + my_schema2 = { + "fname": Text((None,), max_shape=(10,)), + "lname": Text((None,), max_shape=(10,)), + "image": Image((1920, 1080, 3)), + } + ds = Dataset("./test/filtering2", shape=(100,), schema=my_schema2, mode="w") + with pytest.raises(LargeShapeFilteringException): + ds.filter({"image": np.ones((1920, 1080, 3))}) + with pytest.raises(KeyError): + ds.filter({"random": np.ones((1920, 1080, 3))}) + + for i in [1, 3, 6, 15, 63, 96, 75]: + ds["fname", i] = "Active" + dsv = ds.filter({"fname": "Active"}) + with pytest.raises(LargeShapeFilteringException): + dsv.filter({"image": np.ones((1920, 1080, 3))}) + with pytest.raises(KeyError): + dsv.filter({"random": np.ones((1920, 1080, 3))}) + + +def test_dataset_filtering_2(): + schema = { + "img": Image((None, None, 3), max_shape=(100, 100, 3)), + "cl": ClassLabel(names=["cat", "dog", "horse"]), + } + ds = Dataset("./test/filtering_3", shape=(100,), schema=schema, mode="w") + for i in range(100): + ds["cl", i] = 0 if i % 5 == 0 else 1 + ds["img", i] = i * np.ones((5, 6, 3)) + ds["cl", 4] = 2 + ds_filtered = ds.filter({"cl": 0}) + assert ds_filtered.indexes == [5 * i for i in range(20)] + with pytest.raises(ValueError): + ds_filtered["img"].compute() + ds_filtered_2 = ds.filter({"cl": 2}) + assert (ds_filtered_2["img"].compute() == 4 * np.ones((1, 5, 6, 3))).all() + for item in ds_filtered_2: + assert (item["img"].compute() == 4 * np.ones((5, 6, 3))).all() + assert item["cl"].compute() == 2 + + +def test_dataset_filtering_3(): + schema = { + "img": Image((None, None, 3), max_shape=(100, 100, 3)), + "cl": ClassLabel(names=["cat", "dog", "horse"]), + } + ds = Dataset("./test/filtering_3", shape=(100,), schema=schema, mode="w") + for i in range(100): + ds["cl", i] = 0 if i < 10 else 1 + ds["img", i] = i * np.ones((5, 6, 3)) + ds_filtered = ds.filter({"cl": 0}) + assert (ds_filtered[3:8, "cl"].compute() == np.zeros((5,))).all() + + +def test_dataset_utils(): + with pytest.raises(TypeError): + slice_split([5.3]) + with pytest.raises(IndexError): + slice_extract_info(5, 3) + with pytest.raises(ValueError): + slice_extract_info(slice(2, 10, -2), 3) + with pytest.raises(IndexError): + slice_extract_info(slice(20, 100), 3) + with pytest.raises(IndexError): + slice_extract_info(slice(1, 20), 3) + with pytest.raises(IndexError): + slice_extract_info(slice(4, 1), 10) + slice_extract_info(slice(None, 10), 20) + slice_extract_info(slice(20, None), 50) + + def test_dataset_name(): schema = {"temp": "uint8"} ds = Dataset( @@ -673,6 +869,30 @@ def test_dataset_name(): if __name__ == "__main__": - # test_pickleability() + test_dataset_assign_value() + test_dataset_setting_shape() + test_datasetview_repr() + test_datasetview_get_dictionary() + test_tensorview_slicing() + test_datasetview_slicing() + test_dataset() + test_dataset_batch_write_2() + test_append_dataset() + test_dataset_2() + test_text_dataset() + test_text_dataset_tokenizer() + test_dataset_compute() + test_dataset_view_compute() + test_dataset_lazy() + test_dataset_view_lazy() + test_dataset_hub() + test_meta_information() + test_dataset_filtering() + test_dataset_filtering_2() test_pickleability() - # test_dataset_append_and_read() + test_dataset_append_and_read() + test_tensorview_iter() + test_dataset_filtering_3() + test_datasetview_2() + test_dataset_3() + test_dataset_utils() diff --git a/hub/api/tests/test_objectview.py b/hub/api/tests/test_objectview.py index 53197f3267..852679a45c 100644 --- 
a/hub/api/tests/test_objectview.py +++ b/hub/api/tests/test_objectview.py @@ -27,14 +27,10 @@ def test_objectview(): # dataset view unsqueezed with pytest.raises(IndexError): dv["c", "d"].compute() - with pytest.raises(IndexError): - dv["c", "d"] = np.ones((2, 3, 3, 5, 5)) # dataset unsqueezed with pytest.raises(IndexError): ds["c", "d"].compute() - with pytest.raises(IndexError): - ds["c", "d"] = np.ones((5, 3, 3, 5, 5)) # tensorview to object view # sequence of tensor @@ -47,19 +43,17 @@ def test_objectview(): assert (ds[3, "c", "d"].compute() == 5 * np.ones((2, 2, 5, 5))).all() # Sequence of schemadicts - ds[0, "e"] = {"f": {"g": np.ones((3, 5)), "h": np.ones(3)}} - ds[0, "e", 0, "f", "h"] = 42 - # The first slice is unstable but the complete slice is valid - ds[0, "e", 1]["f", "h"] = 25 + ds[0, "e"] = {"f": {"g": np.ones((3, 5)), "h": np.array([42, 25, 15])}} with pytest.raises(KeyError): ds[0, "e", 1].compute() - ds[0, "e"][2]["f"]["h"] = 15 assert (ds[0, "e", "f", "h"].compute() == np.array([42, 25, 15])).all() + # With dataset view - dv[0, "e"] = {"f": {"g": np.ones((3, 5)), "h": np.ones(3)}} - dv[0, "e", 1]["f", "h"] = 25 + dv[0, "e"] = {"f": {"g": np.ones((3, 5)), "h": np.array([1, 25, 1])}} + # dv[0, "e", 1]["f", "h"] = 25 assert (dv[0, "e", "f", "h"].compute() == np.array([1, 25, 1])).all() - # if not lazy mode all slices should be stable + + # If not lazy mode all slices should be stable ds.lazy = False assert ds[0, "e", 0, "f", "h"] == 42 with pytest.raises(KeyError): @@ -71,9 +65,9 @@ def test_objectview(): with pytest.raises(IndexError): ov.compute() assert (ov[3].compute() == 5 * np.ones((2, 2, 5, 5))).all() - ov[3, 1] = 2 * np.ones((2, 5, 5)) + # ov[3, 1] = 2 * np.ones((2, 5, 5)) assert (ov[3][0, 0].compute() == 5 * np.ones((5, 5))).all() - assert (ov[3][1].compute() == 2 * np.ones((2, 5, 5))).all() + assert (ov[3][1].compute() == 5 * np.ones((2, 5, 5))).all() def test_errors(): @@ -102,7 +96,7 @@ def test_errors(): with pytest.raises(IndexError): ds["c", :2, "d"][0, 1, 1, 0, 0, 0] ob = ds["c", :2, "d"][0, 2:5, 1, 0, 0] - assert str(ob[1]) == "ObjectView(subpath='/c/d', slice=[0, 3, 1, 0, 0])" + assert str(ob[1]) == "ObjectView(subpath='/c/d', indexes=0, slice=[3, 1, 0, 0])" with pytest.raises(IndexError): ob[1, 0] diff --git a/hub/api/tests/test_tensorview.py b/hub/api/tests/test_tensorview.py index 83b49a3a73..f92a0c66bc 100644 --- a/hub/api/tests/test_tensorview.py +++ b/hub/api/tests/test_tensorview.py @@ -30,7 +30,7 @@ def test_tensorview_getitem(): def test_tensorview_setitem(): images_tensorview = ds["image"] - with pytest.raises(IndexError): + with pytest.raises(ValueError): images_tensorview["7", 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8") diff --git a/hub/compute/ray.py b/hub/compute/ray.py index 1d5cfb23f3..c8e09e0b11 100644 --- a/hub/compute/ray.py +++ b/hub/compute/ray.py @@ -50,8 +50,8 @@ def _func_argd(_func, index, _ds, schema, kwargs): Remote wrapper for user defined function """ - if isinstance(_ds, Dataset) or isinstance(_ds, DatasetView): - _ds.squeeze_dim = False + if isinstance(_ds, (Dataset, DatasetView)) and isinstance(_ds.indexes, int): + _ds.indexes = [_ds.indexes] item = _ds[index] if isinstance(item, DatasetView) or isinstance(item, Dataset): diff --git a/hub/compute/transform.py b/hub/compute/transform.py index b0aedf15e8..b3fbc31ebe 100644 --- a/hub/compute/transform.py +++ b/hub/compute/transform.py @@ -6,11 +6,10 @@ from tqdm import tqdm from collections.abc import MutableMapping from hub.utils import batchify -from 
hub.api.dataset_utils import get_value, slice_extract_info, slice_split, str_to_int +from hub.api.dataset_utils import get_value, slice_split, str_to_int, slice_extract_info import collections.abc as abc from hub.api.datasetview import DatasetView from pathos.pools import ProcessPool, ThreadPool -from hub.schema import Primitive from hub.schema.sequence import Sequence from hub.schema.features import featurify import posixpath @@ -116,13 +115,7 @@ def __getitem__(self, slice_): slice_list = slice_list or [slice(None, None, None)] num, ofs = slice_extract_info(slice_list[0], self.shape[0]) - - ds_view = DatasetView( - dataset=self._ds, - num_samples=num, - offset=ofs, - squeeze_dim=isinstance(slice_list[0], int), - ) + ds_view = self._ds[slice_list[0]] path = posixpath.expanduser("~/.activeloop/tmparray") new_ds = self.store(path, length=num, ds=ds_view, progressbar=False) @@ -278,12 +271,15 @@ def upload_chunk(i_batch): # Disable dynamic arrays ds.dataset._tensors[f"/{key}"].disable_dynamicness() list(self.map(upload_chunk, index_batched_values)) + offset = ds.indexes[ + 0 + ] # here ds.indexes will always be a contiguous list as obtained after slicing # Enable and rewrite shapes if ds.dataset._tensors[f"/{key}"].is_dynamic: ds.dataset._tensors[f"/{key}"].enable_dynamicness() ds.dataset._tensors[f"/{key}"].set_shape( - [slice(ds.offset, ds.offset + len(value))], value + [slice(offset, offset + len(value))], value ) ds.commit() diff --git a/hub/exceptions.py b/hub/exceptions.py index dcef6e6280..b448c3be86 100644 --- a/hub/exceptions.py +++ b/hub/exceptions.py @@ -164,6 +164,12 @@ def __init__(self): super(HubException, self).__init__(message=message) +class LargeShapeFilteringException(HubException): + def __init__(self, key): + message = f"The shape of {key} is large (product > 100), use smaller keys for filtering" + super(HubException, self).__init__(message=message) + + class ValueShapeError(HubException): def __init__(self, correct_shape, wrong_shape): message = f"parameter 'value': expected array with shape {correct_shape}, got {wrong_shape}" diff --git a/hub/schema/sequence.py b/hub/schema/sequence.py index ad3f526942..2664b6403c 100644 --- a/hub/schema/sequence.py +++ b/hub/schema/sequence.py @@ -17,7 +17,7 @@ class Sequence(Tensor): def __init__( self, shape=(), - max_shape=(), + max_shape=None, dtype=None, chunks=None, compressor="lz4", diff --git a/hub/schema/text.py b/hub/schema/text.py index d67d50f124..dc047ca8df 100644 --- a/hub/schema/text.py +++ b/hub/schema/text.py @@ -12,7 +12,7 @@ def __init__( self, shape: Tuple[int, ...] = (None,), dtype="int64", - max_shape: Tuple[int, ...] = (None,), + max_shape: Tuple[int, ...] 
= None, chunks=None, compressor="lz4", ): diff --git a/hub/store/dynamic_tensor.py b/hub/store/dynamic_tensor.py index d407b06d4a..f26aad77a3 100644 --- a/hub/store/dynamic_tensor.py +++ b/hub/store/dynamic_tensor.py @@ -282,6 +282,12 @@ def get_shape_samples(self, samples): if self.shape[i] is not None: shapes = np.insert(shapes, i - 1, self.shape[i], axis=1) return shapes + elif isinstance(samples, list): + shapes = np.array([self._dynamic_tensor[index] for index in samples]) + for i in range(1, len(self.shape)): + if self.shape[i] is not None: + shapes = np.insert(shapes, i - 1, self.shape[i], axis=1) + return shapes def combine_shape(self, shape, slice_): """Combines given shape with slice to get final shape""" @@ -322,14 +328,14 @@ def combine_shape(self, shape, slice_): def get_shape(self, slice_): """Gets the shape of the slice from tensor""" - if isinstance(slice_, int) or isinstance(slice_, slice): + if isinstance(slice_, (int, slice)): slice_ = [slice_] if self._dynamic_tensor is None: # returns 1D np array return self.combine_shape(np.array(self.shape), slice_) elif isinstance(slice_[0], int): # returns 1D np array sample_shape = self.get_shape_samples(slice_[0]) return self.combine_shape(sample_shape, slice_[1:]) - elif isinstance(slice_[0], slice): + elif isinstance(slice_[0], (slice, list)): sample_shapes = self.get_shape_samples(slice_[0]) final_shapes = self.combine_shape(sample_shapes, slice_[1:]) if len(final_shapes) == 1: diff --git a/hub/store/shape_detector.py b/hub/store/shape_detector.py index e0de291e6d..e661eb6226 100644 --- a/hub/store/shape_detector.py +++ b/hub/store/shape_detector.py @@ -1,3 +1,4 @@ +from hub.utils import _tuple_product from hub.numcodecs import PngCodec import math @@ -69,7 +70,7 @@ def _get_max_shape(self, shape, max_shape): def _get_chunks(self, shape, max_shape, chunks, dtype, chunksize): if chunks is None: - prod = self._tuple_product(max_shape[1:]) + prod = _tuple_product(max_shape[1:]) if dtype == "object": return (self._object_chunking,) + max_shape[1:] if prod <= 2 * chunksize: @@ -100,12 +101,6 @@ def _get_chunks(self, shape, max_shape, chunks, dtype, chunksize): assert chunks[0] == 1 return chunks - def _tuple_product(self, tuple_): - res = 1 - for t in tuple_: - res *= t - return res - def _determine_chunksizes(self, max_shape, dtype, chunksize): """ Autochunking of tensors diff --git a/hub/utils.py b/hub/utils.py index 4a93488344..86841eb167 100644 --- a/hub/utils.py +++ b/hub/utils.py @@ -158,6 +158,13 @@ def batchify(iterable, n=1): return batches +def _tuple_product(tuple_): + res = 1 + for t in tuple_: + res *= t + return res + + class Timer: def __init__(self, text): self._text = text
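
The changes above move the views from num/offset bookkeeping to explicit index lists and add dictionary-based filtering, which the new tests (test_dataset_filtering, test_dataset_filtering_2/3, test_tensorview_iter) exercise end to end. The following minimal sketch is not part of the patch; it only pulls those tested behaviours together for orientation. The dataset path, schema, and field values are made up for illustration, and every assertion mirrors what the tests above already check.

import numpy as np
from hub.api.dataset import Dataset
from hub.schema import Image, Text
from hub.exceptions import LargeShapeFilteringException

# Hypothetical schema and local path, chosen only for this sketch.
my_schema = {
    "fname": Text((None,), max_shape=(10,)),
    "image": Image((1920, 1080, 3)),
}
ds = Dataset("./data/example/filter_sketch", shape=(10,), schema=my_schema, mode="w")
for i in range(10):
    ds["fname", i] = "Active" if i % 2 == 0 else "John"

# filter() returns a DatasetView backed by a plain list of matching indexes.
dsv = ds.filter({"fname": "Active"})
assert dsv.indexes == [0, 2, 4, 6, 8]

# A view filters again over its own index list, so filters can be chained.
assert dsv.filter({"fname": "Active"}).indexes == dsv.indexes

# TensorViews taken from a filtered view are iterable sample by sample.
for item in dsv["fname"]:
    assert item.compute() == "Active"

# Keys whose max_shape product exceeds 100 are rejected before any comparison,
# and unknown keys raise KeyError on datasets and views alike.
try:
    ds.filter({"image": np.ones((1920, 1080, 3))})
except LargeShapeFilteringException:
    pass
try:
    ds.filter({"missing": "x"})
except KeyError:
    pass

Keeping the matched positions as an explicit indexes list appears to be what lets chained filters, the iterable TensorView, and the Ray/transform consumers (which now normalise an int index back into a one-element list) share the same slicing path.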