From 175ac3ba05316e89bb76750afc6c1c95c87310fa Mon Sep 17 00:00:00 2001
From: benchislett
Date: Thu, 3 Jun 2021 22:45:09 -0400
Subject: [PATCH 1/8] Various API changes

- Tensor metadata can now be initialized without data:
- - tensor_meta_from_array is split into default_tensor_meta and
    update_tensor_meta_with_array
- Remove Dataset.__setattr__ and the DATASET_RESERVED_ATTRIBUTES
  bookkeeping, since __setitem__ will no longer be used
- Add a meta @property to Dataset, plus setters for both Dataset.meta
  and Tensor.meta
- Add Dataset.create_tensor, which does not depend on any data being added
- Split Tensor.append into append and extend
- Remove Tensor.__setitem__, preferring append/extend until update is supported
- Resolve minor mypy typing issues
---
 hub/api/dataset.py                            | 65 +++++++------------
 hub/api/tensor.py                             | 59 ++++++++++-------
 hub/api/tests/test_api.py                     | 31 ++++++---
 .../tests/test_benchmark_chunk_engine.py      |  6 +-
 hub/core/compression/webp.py                  |  6 +-
 hub/core/meta/tensor_meta.py                  | 23 +++----
 hub/core/tensor.py                            | 23 +++----
 hub/core/tests/common.py                      |  7 +-
 hub/core/tests/test_tensor_failures.py        | 10 +--
 9 files changed, 122 insertions(+), 108 deletions(-)

diff --git a/hub/api/dataset.py b/hub/api/dataset.py
index 7a073e1327..5b2c039039 100644
--- a/hub/api/dataset.py
+++ b/hub/api/dataset.py
@@ -8,10 +8,12 @@
 from hub.core.tensor import tensor_exists
 from hub.core.dataset import dataset_exists
 from hub.core.meta.dataset_meta import read_dataset_meta, write_dataset_meta
-from hub.core.meta.tensor_meta import tensor_meta_from_array
+from hub.core.meta.tensor_meta import default_tensor_meta
 from hub.core.typing import StorageProvider
 from hub.util.index import Index
+
+from hub.constants import DEFAULT_CHUNK_SIZE
 from hub.util.path import provider_from_path
 from hub.util.exceptions import (
     InvalidKeyTypeError,
@@ -21,9 +23,6 @@
 )
 from hub.util.path import provider_from_path
 
-# Used to distinguish between attributes and items (tensors)
-DATASET_RESERVED_ATTRIBUTES = ["path", "mode", "index", "provider", "tensors"]
-
 
 class Dataset:
     def __init__(
@@ -35,12 +34,6 @@ def __init__(
     ):
         """Initialize a new or existing dataset.
 
-        Note:
-            Entries of `DATASET_RESERVED_ATTRIBUTES` cannot be used as tensor names.
-            This is to distinguish between attributes (like `ds.mode`) and tensors.
-
-            Be sure to keep `DATASET_RESERVED_ATTRIBUTES` up-to-date when changing this class.
-
         Args:
            path (str): The location of the dataset. Used to initialize the storage provider.
            mode (str): Mode in which the dataset is opened.
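
The user-visible effect of this patch: implicit tensor creation via
attribute assignment (`ds.image = array`) is replaced by an explicit
create-then-write flow. A minimal sketch of the new usage, assuming a
local path (the import path mirrors the module edited here; the tensor
name and path are illustrative):

    import numpy as np
    from hub.api.dataset import Dataset

    ds = Dataset("/tmp/example_ds")         # storage provider inferred from path
    ds.create_tensor("image")               # replaces `ds.image = array`
    ds.image.extend(np.ones((4, 28, 28)))   # batched write; axis 0 = samples
    ds.image.append(np.ones((28, 28)))      # single-sample write
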
@@ -67,11 +60,10 @@ def __init__( self.tensors: Dict[str, Tensor] = {} if dataset_exists(self.provider): - ds_meta = read_dataset_meta(self.provider) - for tensor_name in ds_meta["tensors"]: + for tensor_name in self.meta["tensors"]: self.tensors[tensor_name] = Tensor(tensor_name, self.provider) else: - write_dataset_meta(self.provider, {"tensors": []}) + self.meta = {"tensors": []} def __len__(self): """Return the greatest length of tensors""" @@ -89,43 +81,36 @@ def __getitem__(self, item: Union[str, int, slice, Index]): else: raise InvalidKeyTypeError(item) - def __setitem__(self, item: Union[slice, str], value): - if isinstance(item, str): - tensor_key = item - - if tensor_exists(tensor_key, self.provider): - raise TensorAlreadyExistsError(tensor_key) - - if isinstance(value, np.ndarray): - tensor_meta = tensor_meta_from_array(value, batched=True) + def create_tensor( + self, name, chunk_size: int = DEFAULT_CHUNK_SIZE, dtype: str = "float64" + ): + if tensor_exists(name, self.provider): + raise TensorAlreadyExistsError(name) - ds_meta = read_dataset_meta(self.provider) - ds_meta["tensors"].append(tensor_key) - write_dataset_meta(self.provider, ds_meta) + ds_meta = self.meta + ds_meta["tensors"].append(name) + self.meta = ds_meta - tensor = Tensor(tensor_key, self.provider, tensor_meta=tensor_meta) - self.tensors[tensor_key] = tensor - tensor.append(value, batched=True) + tensor_meta = default_tensor_meta(chunk_size, dtype) + tensor = Tensor(name, self.provider, tensor_meta=tensor_meta) + self.tensors[name] = tensor - return tensor - else: - raise UnsupportedTensorTypeError(item) - else: - raise InvalidKeyTypeError(item) + return tensor __getattr__ = __getitem__ - def __setattr__(self, name: str, value): - """Set the named attribute on the dataset""" - if name in DATASET_RESERVED_ATTRIBUTES: - return super().__setattr__(name, value) - else: - return self.__setitem__(name, value) - def __iter__(self): for i in range(len(self)): yield self[i] + @property + def meta(self): + return read_dataset_meta(self.provider) + + @meta.setter + def meta(self, new_meta: dict): + write_dataset_meta(self.provider, new_meta) + @staticmethod def from_path(path: str): """Create a local hub dataset from unstructured data. diff --git a/hub/api/tensor.py b/hub/api/tensor.py index 46d105e109..3cd8309091 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -8,6 +8,7 @@ add_samples_to_tensor, read_samples_from_tensor, read_tensor_meta, + write_tensor_meta, tensor_exists, ) from hub.core.typing import StorageProvider @@ -58,19 +59,45 @@ def __init__( create_tensor(self.key, self.provider, tensor_meta) - def append(self, array: np.ndarray, batched: bool): - # TODO: split into `append`/`extend` - add_samples_to_tensor( - array, - self.key, - storage=self.provider, - batched=batched, - ) + def extend(self, array: np.ndarray): + """Extend tensor by appending elements from a batched numpy array. + + Example: + >>> len(image) + 0 + >>> image.extend(np.zeros((100, 28, 28, 1))) + >>> len(image) + 100 + + Args: + array (np.ndarray): The data to add to the tensor. + The primary axis should be the number of samples to add. + """ + add_samples_to_tensor(array, self.key, storage=self.provider, batched=True) + + def append(self, array: np.ndarray): + """Append a sample to the end of the tensor. + + Example: + >>> len(image) + 0 + >>> image.append(np.zeros((28, 28, 1))) + >>> len(image) + 1 + + Args: + array (np.ndarray): The data to add to the tensor. 
+ """ + add_samples_to_tensor(array, self.key, storage=self.provider, batched=False) @property def meta(self): return read_tensor_meta(self.key, self.provider) + @meta.setter + def meta(self, new_meta: dict): + write_tensor_meta(self.key, self.provider, new_meta) + @property def shape(self): # TODO: when dynamic arrays are supported, handle `min_shape != max_shape` (right now they're always equal) @@ -84,21 +111,7 @@ def __getitem__(self, item: Union[int, slice, Index]): return Tensor(self.key, self.provider, index=self.index[item]) def __setitem__(self, item: Union[int, slice], value: np.ndarray): - sliced_self = self[item] - if sliced_self.index.item != slice(None): - raise NotImplementedError( - "Assignment to Tensor subsections not currently supported!" - ) - else: - if tensor_exists(self.key, self.provider): - raise TensorAlreadyExistsError(self.key) - - add_samples_to_tensor( - array=value, - key=self.key, - storage=self.provider, - batched=True, - ) + raise NotImplementedError("Tensor update not currently supported!") def __iter__(self): for i in range(len(self)): diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 39776057da..fbf4a0af9b 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -12,7 +12,8 @@ def test_persist_local(local_storage): pytest.skip() ds = Dataset(local_storage.root) - ds.image = np.ones((4, 4096, 4096)) + ds.create_tensor("image") + ds.image.extend(np.ones((4, 4096, 4096))) ds_new = Dataset(local_storage.root) assert len(ds_new) == 4 @@ -22,21 +23,32 @@ def test_persist_local(local_storage): @parametrize_all_dataset_storages def test_populate_dataset(ds): - assert read_dataset_meta(ds.provider) == {"tensors": []} - ds.image = np.ones((4, 28, 28)) - assert read_dataset_meta(ds.provider) == {"tensors": ["image"]} + assert ds.meta == {"tensors": []} + ds.create_tensor("image") + assert len(ds) == 0 + assert len(ds.image) == 0 + + ds.image.extend(np.ones((4, 28, 28))) assert len(ds) == 4 + assert len(ds.image) == 4 + + ds.image.append(np.ones((28, 28))) + assert len(ds.image) == 5 + + assert ds.meta == {"tensors": ["image"]} @parametrize_all_dataset_storages def test_compute_tensor(ds): - ds.image = np.ones((32, 28, 28)) + ds.create_tensor("image") + ds.image.extend(np.ones((32, 28, 28))) np.testing.assert_array_equal(ds.image.numpy(), np.ones((32, 28, 28))) @parametrize_all_dataset_storages def test_compute_tensor_slice(ds): - ds.image = np.vstack((np.arange(16),) * 8) + ds.create_tensor("image") + ds.image.extend(np.vstack((np.arange(16),) * 8)) sliced_data = ds.image[2:5].numpy() expected_data = np.vstack((np.arange(16),) * 3) @@ -46,8 +58,11 @@ def test_compute_tensor_slice(ds): @parametrize_all_dataset_storages def test_iterate_dataset(ds): labels = [1, 9, 7, 4] - ds.image = np.ones((4, 28, 28)) - ds.label = np.asarray(labels).reshape((4, 1)) + ds.create_tensor("image") + ds.create_tensor("label") + + ds.image.extend(np.ones((4, 28, 28))) + ds.label.extend(np.asarray(labels).reshape((4, 1))) for idx, sub_ds in enumerate(ds): img = sub_ds.image.numpy() diff --git a/hub/core/chunk_engine/tests/test_benchmark_chunk_engine.py b/hub/core/chunk_engine/tests/test_benchmark_chunk_engine.py index 4ea4fbcffc..6f18277d08 100644 --- a/hub/core/chunk_engine/tests/test_benchmark_chunk_engine.py +++ b/hub/core/chunk_engine/tests/test_benchmark_chunk_engine.py @@ -10,7 +10,7 @@ add_samples_to_tensor, create_tensor, ) -from hub.core.meta.tensor_meta import tensor_meta_from_array +from hub.core.meta.tensor_meta import 
default_tensor_meta
 from hub.core.tests.common import TENSOR_KEY
 from hub.core.typing import StorageProvider
 from hub.tests.common_benchmark import (
@@ -24,9 +24,7 @@ def single_benchmark_write(info, key, arrays, chunk_size, storage, batched):
 
     actual_key = "%s_%i" % (key, info["iteration"])
 
-    create_tensor(
-        actual_key, storage, tensor_meta_from_array(arrays[0], batched, chunk_size)
-    )
+    create_tensor(actual_key, storage, default_tensor_meta(chunk_size))
 
     for a_in in arrays:
         add_samples_to_tensor(
diff --git a/hub/core/compression/webp.py b/hub/core/compression/webp.py
index f7c50d786b..77f335ecb7 100644
--- a/hub/core/compression/webp.py
+++ b/hub/core/compression/webp.py
@@ -46,9 +46,9 @@ def encode_single_image(self, image: np.ndarray) -> bytes:
             Encoded data.
         """
         with BytesIO() as buffer:
-            image = Image.fromarray(image)
-            image = image.convert("RGB")
-            image.save(buffer, format=self.codec_id, quality=self.quality)
+            img = Image.fromarray(image)
+            img = img.convert("RGB")
+            img.save(buffer, format=self.codec_id, quality=self.quality)
         return buffer.getvalue()
 
     def decode_single_image(self, buf: bytes, image_shape: tuple) -> np.ndarray:
diff --git a/hub/core/meta/tensor_meta.py b/hub/core/meta/tensor_meta.py
index 0c7262bd10..a2868be607 100644
--- a/hub/core/meta/tensor_meta.py
+++ b/hub/core/meta/tensor_meta.py
@@ -17,18 +17,19 @@ def read_tensor_meta(key: str, storage: StorageProvider) -> dict:
     return pickle.loads(storage[get_tensor_meta_key(key)])
 
 
-def tensor_meta_from_array(
-    array: np.ndarray, batched: bool, chunk_size: int = DEFAULT_CHUNK_SIZE
+def default_tensor_meta(chunk_size: int = DEFAULT_CHUNK_SIZE, dtype: str = "float64"):
+    return {"chunk_size": chunk_size, "dtype": dtype, "length": 0}
+
+
+def update_tensor_meta_with_array(
+    tensor_meta: dict, array: np.ndarray, batched=False
 ) -> dict:
-    array = normalize_and_batchify_shape(array, batched=batched)
-
-    tensor_meta = {
-        "chunk_size": chunk_size,
-        "dtype": array.dtype.name,
-        "min_shape": tuple(array.shape[1:]),
-        "max_shape": tuple(array.shape[1:]),
-        # TODO: add entry in meta for which tobytes function is used and handle mismatch versions for this
-    }
+    shape = array.shape
+    if batched:
+        shape = shape[1:]
+    tensor_meta["dtype"] = str(array.dtype)
+    tensor_meta["min_shape"] = shape
+    tensor_meta["max_shape"] = shape
 
     return tensor_meta
diff --git a/hub/core/tensor.py b/hub/core/tensor.py
index 216f38747f..8c99b2a79a 100644
--- a/hub/core/tensor.py
+++ b/hub/core/tensor.py
@@ -6,6 +6,7 @@
 from hub.core.meta.tensor_meta import (
     read_tensor_meta,
     write_tensor_meta,
+    update_tensor_meta_with_array,
     validate_tensor_meta,
 )
 from hub.core.meta.index_map import read_index_map, write_index_map
@@ -37,9 +38,7 @@ def create_tensor(key: str, storage: StorageProvider, meta: dict):
     Args:
         key (str): Key for where the chunks, index_map, and meta will be located in `storage` relative to its root.
         storage (StorageProvider): StorageProvider that all tensor data is written to.
-        meta (dict): Meta for the tensor. Required properties:
-            chunk_size (int): Desired length of chunks.
-            dtype (str): Datatype for each sample.
+        meta (dict): Meta for the tensor. For required properties, see `default_tensor_meta`.
 
     Raises:
         TensorAlreadyExistsError: If a tensor defined with `key` already exists.
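
The split above means a tensor's meta now starts shape- and
dtype-agnostic and is completed on the first write. A sketch of that
lifecycle using the two helpers from this diff (printed values assume
the defaults in hub.constants):

    import numpy as np
    from hub.core.meta.tensor_meta import (
        default_tensor_meta,
        update_tensor_meta_with_array,
    )

    meta = default_tensor_meta()
    # {"chunk_size": DEFAULT_CHUNK_SIZE, "dtype": "float64", "length": 0}

    batch = np.zeros((10, 28, 28))
    meta = update_tensor_meta_with_array(meta, batch, batched=True)
    # dtype is re-derived from the array; min_shape == max_shape == (28, 28)
    # until dynamic shapes are supported (see the TODO in hub/api/tensor.py)
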
@@ -48,8 +47,6 @@ def create_tensor(key: str, storage: StorageProvider, meta: dict): if tensor_exists(key, storage): raise TensorAlreadyExistsError(key) - meta.update({"length": 0}) - validate_tensor_meta(meta) write_tensor_meta(key, storage, meta) @@ -77,15 +74,17 @@ def add_samples_to_tensor( TensorDoesNotExistError: If a tensor at `key` does not exist. A tensor must be created first using `create_tensor(...)`. """ - # TODO: split into `append` and `extend` - - array = normalize_and_batchify_shape(array, batched=batched) - if not tensor_exists(key, storage): raise TensorDoesNotExistError(key) index_map = read_index_map(key, storage) tensor_meta = read_tensor_meta(key, storage) + + array = normalize_and_batchify_shape(array, batched=batched) + + if "min_shape" not in tensor_meta: + tensor_meta = update_tensor_meta_with_array(tensor_meta, array, batched=True) + _check_array_and_tensor_are_compatible(tensor_meta, array) # TODO: get the tobytes function from meta @@ -137,10 +136,12 @@ def read_samples_from_tensor( array = sample_from_index_entry(key, storage, index_entry, meta["dtype"]) samples.append(array) + array = np.array(samples) + if isinstance(index.item, int): - samples = samples[0] + array = array.squeeze(axis=0) - return np.array(samples) + return array def _check_array_and_tensor_are_compatible(tensor_meta: dict, array: np.ndarray): diff --git a/hub/core/tests/common.py b/hub/core/tests/common.py index 169f1bb7ad..7293d99055 100644 --- a/hub/core/tests/common.py +++ b/hub/core/tests/common.py @@ -11,7 +11,7 @@ tensor_exists, read_samples_from_tensor, ) -from hub.core.meta.tensor_meta import read_tensor_meta, tensor_meta_from_array +from hub.core.meta.tensor_meta import read_tensor_meta, default_tensor_meta from hub.core.meta.index_map import read_index_map from hub.core.typing import StorageProvider @@ -139,7 +139,7 @@ def run_engine_test( key = TENSOR_KEY sample_count = 0 - create_tensor(key, storage, tensor_meta_from_array(arrays[0], batched, chunk_size)) + create_tensor(key, storage, default_tensor_meta(chunk_size)) for i, a_in in enumerate(arrays): add_samples_to_tensor( @@ -180,7 +180,8 @@ def run_engine_test( def benchmark_write( key, arrays, chunk_size, storage, batched, clear_memory_after_write=True ): - create_tensor(key, storage, tensor_meta_from_array(arrays[0], batched, chunk_size)) + + create_tensor(key, storage, default_tensor_meta(chunk_size)) for a_in in arrays: add_samples_to_tensor( diff --git a/hub/core/tests/test_tensor_failures.py b/hub/core/tests/test_tensor_failures.py index ca8e000682..4dc229b925 100644 --- a/hub/core/tests/test_tensor_failures.py +++ b/hub/core/tests/test_tensor_failures.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from hub.core.meta.tensor_meta import tensor_meta_from_array +from hub.core.meta.tensor_meta import default_tensor_meta from hub.core.tensor import add_samples_to_tensor, create_tensor from hub.tests.common import TENSOR_KEY @@ -17,7 +17,7 @@ def test_dtype_mismatch(memory_storage): a1 = np.array([1, 2, 3, 5.3], dtype=float) a2 = np.array([0, 1, 1, 0], dtype=bool) - create_tensor(TENSOR_KEY, memory_storage, tensor_meta_from_array(a1, batched=False)) + create_tensor(TENSOR_KEY, memory_storage, default_tensor_meta()) add_samples_to_tensor(a1, TENSOR_KEY, memory_storage, batched=False) add_samples_to_tensor(a2, TENSOR_KEY, memory_storage, batched=False) @@ -26,7 +26,7 @@ def test_dtype_mismatch(memory_storage): def test_shape_length_mismatch(memory_storage): a1 = np.arange(100).reshape(5, 20) a2 = np.arange(200).reshape(5, 
20, 2) - create_tensor(TENSOR_KEY, memory_storage, tensor_meta_from_array(a1, batched=False)) + create_tensor(TENSOR_KEY, memory_storage, default_tensor_meta()) add_samples_to_tensor(a1, TENSOR_KEY, memory_storage, batched=False) add_samples_to_tensor(a2, TENSOR_KEY, memory_storage, batched=False) @@ -40,8 +40,8 @@ def test_tensor_does_not_exist(memory_storage): @pytest.mark.xfail(raises=TensorAlreadyExistsError, strict=True) def test_tensor_already_exists(memory_storage): a1 = np.arange(10) - create_tensor(TENSOR_KEY, memory_storage, tensor_meta_from_array(a1, batched=False)) - create_tensor(TENSOR_KEY, memory_storage, tensor_meta_from_array(a1, batched=False)) + create_tensor(TENSOR_KEY, memory_storage, default_tensor_meta()) + create_tensor(TENSOR_KEY, memory_storage, default_tensor_meta()) @pytest.mark.xfail(raises=TensorMetaInvalidValue, strict=True) From c2a615c34ad26f386c66a8e4943e3d3aa4f021fd Mon Sep 17 00:00:00 2001 From: benchislett Date: Thu, 3 Jun 2021 23:08:55 -0400 Subject: [PATCH 2/8] Add docstring for Dataset.create_tensor --- hub/api/dataset.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 5b2c039039..665b2e1a74 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -82,8 +82,22 @@ def __getitem__(self, item: Union[str, int, slice, Index]): raise InvalidKeyTypeError(item) def create_tensor( - self, name, chunk_size: int = DEFAULT_CHUNK_SIZE, dtype: str = "float64" + self, name: str, chunk_size: int = DEFAULT_CHUNK_SIZE, dtype: str = "float64" ): + """Create a new tensor in this dataset. + + Args: + name (str): The name of the tensor to be created. + chunk_size (int): The target size for chunks in this tensor. + dtype (str): The dtype to use for this tensor. + Will be overwritten when the first sample is added. + + Returns: + The new tensor, which can also be accessed by `self[name]`. + + Raises: + TensorAlreadyExistsError: Duplicate tensors are not allowed. + """ if tensor_exists(name, self.provider): raise TensorAlreadyExistsError(name) From 5377ffce6bc75fc9b8b6de6436caa43049ad83d5 Mon Sep 17 00:00:00 2001 From: benchislett Date: Fri, 4 Jun 2021 11:17:17 -0400 Subject: [PATCH 3/8] Update docstring for from_path --- hub/api/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 665b2e1a74..fa408dcb91 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -127,10 +127,10 @@ def meta(self, new_meta: dict): @staticmethod def from_path(path: str): - """Create a local hub dataset from unstructured data. + """Create a hub dataset from unstructured data. Note: - This copies the data locally in hub format. + This copies the data into hub format. Be careful when using this with large datasets. Args: From 852baa9e7b92de73efdf885fc705a5da96c65be6 Mon Sep 17 00:00:00 2001 From: benchislett Date: Fri, 4 Jun 2021 11:29:59 -0400 Subject: [PATCH 4/8] Use descriptive-style docstrings --- hub/api/dataset.py | 6 +++--- hub/api/tensor.py | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index fa408dcb91..15eaab5402 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -32,7 +32,7 @@ def __init__( provider: Optional[StorageProvider] = None, index: Union[int, slice, Index] = None, ): - """Initialize a new or existing dataset. + """Initializes a new or existing dataset. Args: path (str): The location of the dataset. Used to initialize the storage provider. 
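
As documented in patch 2, `create_tensor` at this point takes
`chunk_size` and `dtype` directly. A hypothetical call overriding both
(the tensor name and values are illustrative only):

    from hub.constants import DEFAULT_CHUNK_SIZE

    ds.create_tensor("label", chunk_size=DEFAULT_CHUNK_SIZE, dtype="int64")
    # dtype is provisional here: per the docstring it is overwritten as
    # soon as the first sample is added

One design note: `Dataset.meta` is not cached. Every get re-reads the
meta from the storage provider and every set writes it back, which is
why `create_tensor` copies the dict, appends the new tensor name, and
reassigns it rather than mutating `self.meta` in place.
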
@@ -84,7 +84,7 @@ def __getitem__(self, item: Union[str, int, slice, Index]): def create_tensor( self, name: str, chunk_size: int = DEFAULT_CHUNK_SIZE, dtype: str = "float64" ): - """Create a new tensor in this dataset. + """Creates a new tensor in a dataset. Args: name (str): The name of the tensor to be created. @@ -127,7 +127,7 @@ def meta(self, new_meta: dict): @staticmethod def from_path(path: str): - """Create a hub dataset from unstructured data. + """Creates a hub dataset from unstructured data. Note: This copies the data into hub format. diff --git a/hub/api/tensor.py b/hub/api/tensor.py index 3cd8309091..74f1348fbe 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -25,7 +25,7 @@ def __init__( tensor_meta: dict = None, index: Union[int, slice, Index] = None, ): - """Initialize a new tensor. + """Initializes a new tensor. Note: This operation does not create a new tensor in the storage provider, @@ -60,7 +60,7 @@ def __init__( create_tensor(self.key, self.provider, tensor_meta) def extend(self, array: np.ndarray): - """Extend tensor by appending elements from a batched numpy array. + """Extends a tensor by appending elements from a batched numpy array. Example: >>> len(image) @@ -76,7 +76,7 @@ def extend(self, array: np.ndarray): add_samples_to_tensor(array, self.key, storage=self.provider, batched=True) def append(self, array: np.ndarray): - """Append a sample to the end of the tensor. + """Appends a sample to the end of a tensor. Example: >>> len(image) @@ -104,7 +104,7 @@ def shape(self): return self.meta["max_shape"] def __len__(self): - """Return the length of the primary axis.""" + """Returns the length of the primary axis of a tensor.""" return self.meta["length"] def __getitem__(self, item: Union[int, slice, Index]): @@ -118,7 +118,7 @@ def __iter__(self): yield self[i] def numpy(self): - """Compute the contents of this tensor in numpy format. + """Computes the contents of a tensor in numpy format. Returns: A numpy array containing the data represented by this tensor. From db24b05e9071cb5cbfc12cd7697a94c127adcf24 Mon Sep 17 00:00:00 2001 From: benchislett Date: Fri, 4 Jun 2021 12:26:25 -0400 Subject: [PATCH 5/8] Support iterables in Tensor.extend --- hub/api/tensor.py | 17 +++++++++++------ hub/api/tests/test_api.py | 3 +++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/hub/api/tensor.py b/hub/api/tensor.py index 74f1348fbe..b27d550e91 100644 --- a/hub/api/tensor.py +++ b/hub/api/tensor.py @@ -1,4 +1,4 @@ -from typing import Union +from typing import Union, Iterable import warnings import numpy as np @@ -59,8 +59,9 @@ def __init__( create_tensor(self.key, self.provider, tensor_meta) - def extend(self, array: np.ndarray): - """Extends a tensor by appending elements from a batched numpy array. + def extend(self, array: Union[np.ndarray, Iterable[np.ndarray]]): + """Extends a tensor by appending multiple elements from an iterable. + Accepts an iterable of numpy arrays or a single batched numpy array. Example: >>> len(image) @@ -70,10 +71,14 @@ def extend(self, array: np.ndarray): 100 Args: - array (np.ndarray): The data to add to the tensor. - The primary axis should be the number of samples to add. + array: The data to add to the tensor. + The length should be equal to the number of samples to add. 
""" - add_samples_to_tensor(array, self.key, storage=self.provider, batched=True) + if isinstance(array, np.ndarray): + add_samples_to_tensor(array, self.key, storage=self.provider, batched=True) + else: + for sample in array: + self.append(sample) def append(self, array: np.ndarray): """Appends a sample to the end of a tensor. diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index fbf4a0af9b..7b20e4f0e9 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -35,6 +35,9 @@ def test_populate_dataset(ds): ds.image.append(np.ones((28, 28))) assert len(ds.image) == 5 + ds.image.extend([np.ones((28, 28)), np.ones((28, 28))]) + assert len(ds.image) == 7 + assert ds.meta == {"tensors": ["image"]} From 146e82fd685afa1d36e6cd31a470b34337ee7e24 Mon Sep 17 00:00:00 2001 From: benchislett Date: Fri, 4 Jun 2021 12:51:53 -0400 Subject: [PATCH 6/8] Update parameters to Dataset.create_tensor --- hub/api/dataset.py | 16 ++++++++--- hub/api/tests/test_api.py | 7 ++--- hub/constants.py | 2 ++ .../tests/test_benchmark_chunk_engine.py | 2 +- hub/core/meta/tensor_meta.py | 27 ++++++++++++++++--- hub/core/tests/common.py | 4 +-- 6 files changed, 44 insertions(+), 14 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 15eaab5402..fbd2c226e5 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -82,15 +82,23 @@ def __getitem__(self, item: Union[str, int, slice, Index]): raise InvalidKeyTypeError(item) def create_tensor( - self, name: str, chunk_size: int = DEFAULT_CHUNK_SIZE, dtype: str = "float64" + self, + name: str, + htype: Optional[str] = None, + chunk_size: Optional[int] = None, + dtype: Optional[str] = None, + extra_meta: Optional[dict] = None, ): """Creates a new tensor in a dataset. Args: name (str): The name of the tensor to be created. - chunk_size (int): The target size for chunks in this tensor. - dtype (str): The dtype to use for this tensor. + htype (str, optional): The type of the data for the tensor. + May also modify the defaults for other parameters. + chunk_size (int, optional): The target size for chunks in this tensor. + dtype (str, optional): The data type to use for this tensor. Will be overwritten when the first sample is added. + extra_meta (dict, optional): Any additional metadata to be added to the tensor. Returns: The new tensor, which can also be accessed by `self[name]`. 
@@ -105,7 +113,7 @@ def create_tensor( ds_meta["tensors"].append(name) self.meta = ds_meta - tensor_meta = default_tensor_meta(chunk_size, dtype) + tensor_meta = default_tensor_meta(htype, chunk_size, dtype, extra_meta) tensor = Tensor(name, self.provider, tensor_meta=tensor_meta) self.tensors[name] = tensor diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 7b20e4f0e9..d10881500f 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -32,11 +32,12 @@ def test_populate_dataset(ds): assert len(ds) == 4 assert len(ds.image) == 4 - ds.image.append(np.ones((28, 28))) - assert len(ds.image) == 5 + for _ in range(10): + ds.image.append(np.ones((28, 28))) + assert len(ds.image) == 14 ds.image.extend([np.ones((28, 28)), np.ones((28, 28))]) - assert len(ds.image) == 7 + assert len(ds.image) == 16 assert ds.meta == {"tensors": ["image"]} diff --git a/hub/constants.py b/hub/constants.py index 64f318d614..902480d327 100644 --- a/hub/constants.py +++ b/hub/constants.py @@ -6,6 +6,8 @@ MB = 1000 * KB GB = 1000 * MB +DEFAULT_DTYPE = "float64" + DEFAULT_CHUNK_SIZE = 16 * MB MIN_FIRST_CACHE_SIZE = 32 * MB MIN_SECOND_CACHE_SIZE = 160 * MB diff --git a/hub/core/chunk_engine/tests/test_benchmark_chunk_engine.py b/hub/core/chunk_engine/tests/test_benchmark_chunk_engine.py index 6f18277d08..8354c34789 100644 --- a/hub/core/chunk_engine/tests/test_benchmark_chunk_engine.py +++ b/hub/core/chunk_engine/tests/test_benchmark_chunk_engine.py @@ -24,7 +24,7 @@ def single_benchmark_write(info, key, arrays, chunk_size, storage, batched): actual_key = "%s_%i" % (key, info["iteration"]) - create_tensor(actual_key, storage, default_tensor_meta(chunk_size)) + create_tensor(actual_key, storage, default_tensor_meta(chunk_size=chunk_size)) for a_in in arrays: add_samples_to_tensor( diff --git a/hub/core/meta/tensor_meta.py b/hub/core/meta/tensor_meta.py index a2868be607..694fff2369 100644 --- a/hub/core/meta/tensor_meta.py +++ b/hub/core/meta/tensor_meta.py @@ -1,10 +1,10 @@ from hub.util.exceptions import TensorMetaInvalidValue, TensorMetaMissingKey import numpy as np import pickle # TODO: NEVER USE PICKLE -from typing import Any, Callable +from typing import Any, Callable, Optional from hub.core.typing import StorageProvider -from hub.constants import DEFAULT_CHUNK_SIZE +from hub.constants import DEFAULT_CHUNK_SIZE, DEFAULT_DTYPE from hub.util.keys import get_tensor_meta_key from hub.util.array import normalize_and_batchify_shape @@ -17,8 +17,27 @@ def read_tensor_meta(key: str, storage: StorageProvider) -> dict: return pickle.loads(storage[get_tensor_meta_key(key)]) -def default_tensor_meta(chunk_size: int = DEFAULT_CHUNK_SIZE, dtype: str = "float64"): - return {"chunk_size": chunk_size, "dtype": dtype, "length": 0} +def default_tensor_meta( + htype: Optional[str] = None, + chunk_size: Optional[int] = None, + dtype: Optional[str] = None, + extra_meta: Optional[dict] = None, +): + if chunk_size is None: + chunk_size = DEFAULT_CHUNK_SIZE + if dtype is None: + dtype = DEFAULT_DTYPE + if extra_meta is None: + extra_meta = {} + + tensor_meta = extra_meta + tensor_meta["chunk_size"] = chunk_size + tensor_meta["dtype"] = dtype + tensor_meta["length"] = 0 + if htype is not None: + tensor_meta["htype"] = htype # TODO: identify presets + + return tensor_meta def update_tensor_meta_with_array( diff --git a/hub/core/tests/common.py b/hub/core/tests/common.py index 7293d99055..039e2ac663 100644 --- a/hub/core/tests/common.py +++ b/hub/core/tests/common.py @@ -139,7 +139,7 @@ def run_engine_test( 
key = TENSOR_KEY sample_count = 0 - create_tensor(key, storage, default_tensor_meta(chunk_size)) + create_tensor(key, storage, default_tensor_meta(chunk_size=chunk_size)) for i, a_in in enumerate(arrays): add_samples_to_tensor( @@ -181,7 +181,7 @@ def benchmark_write( key, arrays, chunk_size, storage, batched, clear_memory_after_write=True ): - create_tensor(key, storage, default_tensor_meta(chunk_size)) + create_tensor(key, storage, default_tensor_meta(chunk_size=chunk_size)) for a_in in arrays: add_samples_to_tensor( From a9dca38111aebc4f99549e422c3a12d804c52a19 Mon Sep 17 00:00:00 2001 From: Benjamin Chislett Date: Fri, 4 Jun 2021 14:29:22 -0400 Subject: [PATCH 7/8] Update Dataset.create_tensor docstring Better explanation of htype --- hub/api/dataset.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 18efa8f2c1..2b85da0c25 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -102,7 +102,10 @@ def create_tensor( Args: name (str): The name of the tensor to be created. - htype (str, optional): The type of the data for the tensor. + htype (str, optional): The class of data for the tensor. + The defaults for other parameters are determined in terms of this value. + For example, `htype="image"` would have `dtype` default to `uint8`. + These defaults can be overridden by explicitly passing any of the other parameters to this function. May also modify the defaults for other parameters. chunk_size (int, optional): The target size for chunks in this tensor. dtype (str, optional): The data type to use for this tensor. From 0e175383ff31505debe705f9fa52139a681bc3af Mon Sep 17 00:00:00 2001 From: benchislett Date: Fri, 4 Jun 2021 14:30:03 -0400 Subject: [PATCH 8/8] Remove artifact of merge conflict from docstring --- hub/api/dataset.py | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 2b85da0c25..4e46ebe004 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -100,24 +100,23 @@ def create_tensor( ): """Creates a new tensor in a dataset. - Args: - name (str): The name of the tensor to be created. - htype (str, optional): The class of data for the tensor. - The defaults for other parameters are determined in terms of this value. - For example, `htype="image"` would have `dtype` default to `uint8`. - These defaults can be overridden by explicitly passing any of the other parameters to this function. - May also modify the defaults for other parameters. - chunk_size (int, optional): The target size for chunks in this tensor. - dtype (str, optional): The data type to use for this tensor. - Will be overwritten when the first sample is added. - extra_meta (dict, optional): Any additional metadata to be added to the tensor. - - <<<<<<< HEAD - Returns: - The new tensor, which can also be accessed by `self[name]`. - - Raises: - TensorAlreadyExistsError: Duplicate tensors are not allowed. + Args: + name (str): The name of the tensor to be created. + htype (str, optional): The class of data for the tensor. + The defaults for other parameters are determined in terms of this value. + For example, `htype="image"` would have `dtype` default to `uint8`. + These defaults can be overridden by explicitly passing any of the other parameters to this function. + May also modify the defaults for other parameters. + chunk_size (int, optional): The target size for chunks in this tensor. 
+ dtype (str, optional): The data type to use for this tensor. + Will be overwritten when the first sample is added. + extra_meta (dict, optional): Any additional metadata to be added to the tensor. + + Returns: + The new tensor, which can also be accessed by `self[name]`. + + Raises: + TensorAlreadyExistsError: Duplicate tensors are not allowed. """ if tensor_exists(name, self.storage): raise TensorAlreadyExistsError(name)
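
The series leaves `create_tensor` with the signature
`create_tensor(name, htype=None, chunk_size=None, dtype=None, extra_meta=None)`.
A hypothetical call exercising all of it; note that htype presets are an
explicit TODO in `default_tensor_meta` ("# TODO: identify presets"), so
the uint8 default the docstring describes is not wired up yet and every
value below is an assumption:

    ds.create_tensor(
        "image",
        htype="image",                  # stored in the meta; presets TODO
        chunk_size=16 * 10**6,          # ~16 MB, same as DEFAULT_CHUNK_SIZE
        dtype="uint8",                  # what htype="image" would imply
        extra_meta={"license": "CC0"},  # extra keys merge into the meta dict
    )
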