Merge branch 'master' into feature/filtering

AbhinavTuli committed Jan 19, 2021
2 parents 5098469 + 4f537c5 commit 9bd63d3
Showing 9 changed files with 157 additions and 13 deletions.
1 change: 1 addition & 0 deletions .circleci/config.yml
@@ -124,6 +124,7 @@ jobs:
environment:
IMAGE_NAME: snarkai/hub
steps:
- setup_remote_docker
- checkout
- run:
name: "Init .pypirc"
15 changes: 15 additions & 0 deletions CONTRIBUTORS.rst
@@ -0,0 +1,15 @@
# Credit
For everyone who has contributed in ways big and small, **thank you!**

## Contributors
This is a list of active contributors. Feel free to @-mention them in issues and converse with them directly.

- [Darkborderman](https://github.com/Darkborderman)
- [sparkingdark](https://github.com/sparkingdark)
- [sanggusti](https://github.com/sanggusti)
- [haiyangdeperci](https://github.com/haiyangdeperci)
- [sohamsshah](https://github.com/sohamsshah)
- [YuvalOfer](https://github.com/YuvalOfer)

## Data Maintainers
Help us make Hub the largest repository of unstructured datasets!
3 changes: 3 additions & 0 deletions README.md
@@ -117,6 +117,9 @@ Also, if you need a publicly available dataset that you cannot find in the Hub,
```sh
hub register
hub login

# Alternatively, on platforms like Kaggle, use this form, replacing username and password with your actual credentials
hub login -u username -p password
```

2. Then create a dataset, specify its name, and upload it to your account. For instance:
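The concrete example is collapsed in this diff, so here is a minimal sketch of such an upload using the `Dataset` API touched later in this commit; the path, schema, and shape are illustrative placeholders, not the original README example:

```python
import numpy as np

from hub import Dataset
from hub.schema import Tensor

# "username/dataset_name" is a placeholder Hub path, not from the source.
ds = Dataset(
    "username/dataset_name",
    shape=(4,),
    schema={"data": Tensor((None, None), "int32", max_shape=(200, 200))},
    mode="w",
)
ds["data", 0] = np.ones((2, 2), dtype="int32")  # write one sample
ds.flush()  # persist schema, meta, and chunks to the backing store
```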
20 changes: 11 additions & 9 deletions docs/source/concepts/features.md
@@ -29,8 +29,10 @@ Np-array like structure that contains any type of elements (Primitive and non-Primitive)
```python
from hub.schema import Tensor

schema = {"tensor_1": Tensor((None, None), max_shape=(200, 200), "int32"),
"tensor_2": Tensor((100, 400), "int64", chunks=(6, 50, 200)) }
schema = {
"tensor_1": Tensor((None, None), "int32", max_shape=(200, 200)),
"tensor_2": Tensor((100, 400), "int64", chunks=(6, 50, 200))
}
```

### Image
@@ -44,8 +46,7 @@ from hub.schema import Image

schema = {"image": Image(shape=(None, None),
dtype="int32",
max_shape=(100, 100)
) }
max_shape=(100, 100))}
```

### ClassLabel
@@ -55,10 +56,11 @@ Integer representation of feature labels. Can be constructed from number of labels
```python
from hub.schema import ClassLabel

schema = {"class_label_1": ClassLabel(num_classes=10),
"class_label_2": ClassLabel(names=['class1', 'class2', 'class3', ...]),
"class_label_3": ClassLabel(names_file='/path/to/file/with/names')
) }
schema = {
"class_label_1": ClassLabel(num_classes=10),
"class_label_2": ClassLabel(names=['class1', 'class2', 'class3', ...]),
"class_label_3": ClassLabel(names_file='/path/to/file/with/names')
}
```
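
ClassLabel stores each label as an integer index into the class list. A quick sketch of converting between names and indices; `str2int`/`int2str` are assumed helpers here, based on the class's TFDS lineage, and are not shown in this diff:

```python
from hub.schema import ClassLabel

label = ClassLabel(names=["class1", "class2", "class3"])
# Assumed TFDS-style helpers (not part of this diff):
assert label.str2int("class2") == 1
assert label.int2str(1) == "class2"
```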

### Mask
@@ -103,7 +105,7 @@ will attempt to infer it from the file extension. Also, `sample_rate` parameter
```python
from hub.schema import Audio

schema = {'audio': Audio(shape=(300,)}
schema = {'audio': Audio(shape=(300,))}
```
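
Since the surrounding text mentions the `sample_rate` parameter, a slightly fuller construction may help; the values are illustrative, and the `max_shape` usage is assumed to follow the Tensor pattern shown earlier:

```python
from hub.schema import Audio

# shape=(None,) marks variable-length audio bounded by max_shape;
# sample_rate records the sampling frequency of the stored samples.
schema = {"audio": Audio(shape=(None,), max_shape=(48000,), sample_rate=44100)}
```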

### Video
43 changes: 39 additions & 4 deletions hub/api/dataset.py
@@ -7,6 +7,7 @@
from collections import defaultdict

import fsspec
from fsspec.spec import AbstractFileSystem
import numcodecs
import numcodecs.lz4
import numcodecs.zstd
@@ -68,7 +69,7 @@ class Dataset:
def __init__(
self,
url: str,
mode: str = "a",
mode: str = None,
shape=None,
schema=None,
token=None,
@@ -81,6 +82,7 @@ def __init__(
tokenizer=None,
lazy: bool = True,
public: bool = True,
name: str = None,
):
"""| Open a new or existing dataset for read/write
@@ -115,21 +117,23 @@ def __init__(
only applicable if using hub storage, ignored otherwise
setting this to False allows only the user who created it to access the dataset and
the dataset won't be visible in the visualizer to the public
name: str, optional
only applicable when using hub storage, this is the name that shows up on the visualizer
"""

shape = norm_shape(shape)
if len(shape) != 1:
raise ShapeLengthException()
mode = mode or "a"

storage_cache = norm_cache(storage_cache) if cache else 0
cache = norm_cache(cache)
schema: SchemaDict = featurify(schema) if schema else None

self._url = url
self._token = token
self._mode = mode
self.tokenizer = tokenizer
self.lazy = lazy
self._name = name

self._fs, self._path = (
(fs, url) if fs else get_fs_and_path(self._url, token=token, public=public)
@@ -138,7 +142,8 @@ def __init__(
self._storage_cache = storage_cache
self.lock_cache = lock_cache
self.version = "1.x"

mode = self._get_mode(mode, self._fs)
self._mode = mode
needcreate = self._check_and_prepare_dir()
fs_map = fs_map or get_storage_map(
self._fs, self._path, cache, lock=lock_cache, storage_cache=storage_cache
@@ -149,6 +154,7 @@ def __init__(
self.dataset_name = None
if not needcreate:
self.meta = json.loads(fs_map["meta.json"].decode("utf-8"))
self._name = self.meta.get("name") or None
self._shape = tuple(self.meta["shape"])
self._schema = hub.schema.deserialize.deserialize(self.meta["schema"])
self._meta_information = self.meta.get("meta_info") or dict()
@@ -219,6 +225,10 @@ def url(self):
def shape(self):
return self._shape

@property
def name(self):
return self._name

@property
def token(self):
return self._token
@@ -245,6 +255,7 @@ def _store_meta(self) -> dict:
"schema": hub.schema.serialize.serialize(self._schema),
"version": 1,
"meta_info": self._meta_information or dict(),
"name": self._name,
}

self._fs_map["meta.json"] = bytes(json.dumps(meta), "utf-8")
@@ -492,6 +503,12 @@ def append_shape(self, size: int):
size += self._shape[0]
self.resize_shape(size)

def rename(self, name: str) -> None:
""" Renames the dataset """
self._name = name
self.meta = self._store_meta()
self.flush()

def delete(self):
""" Deletes the dataset """
fs, path = self._fs, self._path
@@ -727,6 +744,24 @@ def keys(self):
"""
return self._tensors.keys()

def _get_mode(self, mode: str, fs: AbstractFileSystem):
if mode:
if mode not in ["r", "r+", "a", "a+", "w", "w+"]:
raise Exception(f"Invalid mode {mode}")
return mode
else:
try:
meta_path = posixpath.join(self._path, "meta.json")
if not fs.exists(self._path) or not fs.exists(meta_path):
return "a"
bytes_ = bytes("Hello", "utf-8")
path = posixpath.join(self._path, "mode_test")
fs.pipe(path, bytes_)
fs.rm(path)
except Exception:
return "r"
return "a"

@staticmethod
def from_tensorflow(ds, scheduler: str = "single", workers: int = 1):
"""Converts a tensorflow dataset into hub format.
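In plain terms, the new `_get_mode` above resolves the effective mode when the caller does not pass one: a dataset that does not exist yet opens in append mode, while an existing dataset is probed with a throwaway `mode_test` write and falls back to read-only if the probe fails. A sketch of the resulting behavior; the paths are illustrative:

```python
from hub import Dataset

# No mode given, path does not exist yet: resolves to "a" (create/append).
ds_new = Dataset("./data/mode_demo", shape=(10,), schema={"t": "uint8"})
ds_new.flush()

# No mode given, dataset exists: a throwaway write probe picks "a" if the
# storage is writable, otherwise "r".
ds_auto = Dataset("./data/mode_demo")

# Explicit modes are validated against r/r+/a/a+/w/w+.
ds_ro = Dataset("./data/mode_demo", mode="r")
```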
14 changes: 14 additions & 0 deletions hub/api/tests/test_dataset.py
@@ -854,6 +854,20 @@ def test_dataset_utils():
slice_extract_info(slice(20, None), 50)


def test_dataset_name():
schema = {"temp": "uint8"}
ds = Dataset(
"./data/test_ds_name", shape=(10,), schema=schema, name="my_dataset", mode="w"
)
ds.flush()
assert ds.name == "my_dataset"
ds2 = Dataset("./data/test_ds_name")
ds2.rename("my_dataset_2")
assert ds2.name == "my_dataset_2"
ds3 = Dataset("./data/test_ds_name")
assert ds3.name == "my_dataset_2"


if __name__ == "__main__":
test_dataset_assign_value()
test_dataset_setting_shape()
1 change: 1 addition & 0 deletions hub/defaults.py
@@ -3,3 +3,4 @@
DEFAULT_COMPRESSOR = "default"
DEFAULT_MEMORY_CACHE_SIZE = 2 ** 26
DEFAULT_STORAGE_CACHE_SIZE = 2 ** 28
AZURE_HOST_SUFFIX = "blob.core.windows.net"
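
`AZURE_HOST_SUFFIX` is imported by the new `hub/url.py` below but not yet consumed there; presumably it is intended for building Azure Blob endpoints, which take the form `https://<account>.blob.core.windows.net`. A sketch under that assumption, with a hypothetical helper that is not part of this commit:

```python
from hub.defaults import AZURE_HOST_SUFFIX

def azure_endpoint(account: str) -> str:
    # Hypothetical helper: builds the standard Azure Blob Storage
    # endpoint URL for a given storage account name.
    return f"https://{account}.{AZURE_HOST_SUFFIX}"

assert azure_endpoint("myaccount") == "https://myaccount.blob.core.windows.net"
```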
15 changes: 15 additions & 0 deletions hub/tests/test_url.py
@@ -0,0 +1,15 @@
# JUST TO GET COVERAGE
from hub.url import UrlProtocol, UrlType, Url


def test_url():
Url.parse("Some url")
Url(
UrlType.LOCAL,
UrlProtocol.FILESYSTEM,
"some path",
"some bucket",
"some user",
"some dataset",
"some endpoint",
).url
58 changes: 58 additions & 0 deletions hub/url.py
@@ -0,0 +1,58 @@
# NOT IMPLEMENTED COMPLETELY YET!

from typing import Union
from enum import Enum

from hub.defaults import AZURE_HOST_SUFFIX


class UrlProtocol(Enum):
UNKNOWN = "unknown"
S3 = "s3"
GCS = "gcs"
AZURE = "azure"
FILESYSTEM = "filesystem"


class UrlType(Enum):
HUB = "hub"
LOCAL = "local"
CLOUD = "cloud"


class Url:
@classmethod
def parse(cls, url: str) -> "Url":
assert isinstance(url, str)

pass

def __init__(
self,
url_type: UrlType,
protocol: UrlProtocol,
path: str, # for get_mapper(path)
bucket: Union[str, None] = None,
user: Union[str, None] = None,
dataset: Union[str, None] = None,
endpoint_url: Union[str, None] = None,
):
assert isinstance(url_type, UrlType)
assert isinstance(protocol, UrlProtocol)
assert isinstance(path, str)
assert isinstance(bucket, str) or bucket is None
assert isinstance(user, str) or user is None
assert isinstance(dataset, str) or dataset is None
assert isinstance(endpoint_url, str) or endpoint_url is None

self.url_type = url_type
self.protocol = protocol
self.path = path
self.bucket = bucket
self.user = user
self.dataset = dataset
self.endpoint_url = endpoint_url

@property
def url(self) -> str:
pass
