Merge branch 'master' into feature/filtering

AbhinavTuli committed Jan 19, 2021
2 parents 5098469 + 4f537c5 commit 9bd63d3
Showing 9 changed files with 157 additions and 13 deletions.
1 change: 1 addition & 0 deletions .circleci/config.yml
@@ -124,6 +124,7 @@ jobs:
environment:
IMAGE_NAME: snarkai/hub
steps:
- setup_remote_docker
- checkout
- run:
name: "Init .pypirc"
15 changes: 15 additions & 0 deletions CONTRIBUTORS.rst
@@ -0,0 +1,15 @@
# Credit
For everyone who has contributed in ways big and small, **thank you!**

## Contributors
This is a list of active contributors. Feel free to @-mention them in issues and converse with them directly.

- [Darkborderman](https://github.com/Darkborderman)
- [sparkingdark](https://github.com/sparkingdark)
- [sanggusti](https://github.com/sanggusti)
- [haiyangdeperci](https://github.com/haiyangdeperci)
- [sohamsshah](https://github.com/sohamsshah)
- [YuvalOfer](https://github.com/YuvalOfer)

## Data Maintainers
Help us make Hub the largest repository of unstructured datasets!
3 changes: 3 additions & 0 deletions README.md
@@ -117,6 +117,9 @@ Also, if you need a publicly available dataset that you cannot find in the Hub,
```sh
hub register
hub login

# Alternatively, on platforms like Kaggle, use this form, replacing username and password with your actual credentials
hub login -u username -p password
```

2. Then create a dataset, specify its name, and upload it to your account. For instance:
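The concrete example is collapsed in this diff, so here is a minimal sketch of such an upload using the `Dataset` API touched later in this commit; the path, schema, and shape are illustrative placeholders, not the original README example:

```python
import numpy as np

from hub import Dataset
from hub.schema import Tensor

# "username/dataset_name" is a placeholder Hub path, not from the source.
ds = Dataset(
    "username/dataset_name",
    shape=(4,),
    schema={"data": Tensor((None, None), "int32", max_shape=(200, 200))},
    mode="w",
)
ds["data", 0] = np.ones((2, 2), dtype="int32")  # write one sample
ds.flush()  # persist schema, meta, and chunks to the backing store
```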
20 changes: 11 additions & 9 deletions docs/source/concepts/features.md
@@ -29,8 +29,10 @@ Np-array like structure that contains any type of elements (Primitive and non-Primitive)
```python
from hub.schema import Tensor

schema = {"tensor_1": Tensor((None, None), max_shape=(200, 200), "int32"),
"tensor_2": Tensor((100, 400), "int64", chunks=(6, 50, 200)) }
schema = {
"tensor_1": Tensor((None, None), "int32", max_shape=(200, 200)),
"tensor_2": Tensor((100, 400), "int64", chunks=(6, 50, 200))
}
```

### Image
@@ -44,8 +46,7 @@ from hub.schema import Image

schema = {"image": Image(shape=(None, None),
dtype="int32",
max_shape=(100, 100)
) }
max_shape=(100, 100))}
```

### ClassLabel
@@ -55,10 +56,11 @@ Integer representation of feature labels. Can be constructed from number of labels
```python
from hub.schema import ClassLabel

schema = {"class_label_1": ClassLabel(num_classes=10),
"class_label_2": ClassLabel(names=['class1', 'class2', 'class3', ...]),
"class_label_3": ClassLabel(names_file='/path/to/file/with/names')
) }
schema = {
"class_label_1": ClassLabel(num_classes=10),
"class_label_2": ClassLabel(names=['class1', 'class2', 'class3', ...]),
"class_label_3": ClassLabel(names_file='/path/to/file/with/names')
}
```
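
ClassLabel stores each label as an integer index into the class list. A quick sketch of converting between names and indices; `str2int`/`int2str` are assumed helpers here, based on the class's TFDS lineage, and are not shown in this diff:

```python
from hub.schema import ClassLabel

label = ClassLabel(names=["class1", "class2", "class3"])
# Assumed TFDS-style helpers (not part of this diff):
assert label.str2int("class2") == 1
assert label.int2str(1) == "class2"
```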

### Mask
@@ -103,7 +105,7 @@ will attempt to infer it from the file extension. Also, `sample_rate` parameter
```python
from hub.schema import Audio

schema = {'audio': Audio(shape=(300,)}
schema = {'audio': Audio(shape=(300,))}
```
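
Since the surrounding text mentions the `sample_rate` parameter, a slightly fuller construction may help; the values are illustrative, and the `max_shape` usage is assumed to follow the Tensor pattern shown earlier:

```python
from hub.schema import Audio

# shape=(None,) marks variable-length audio bounded by max_shape;
# sample_rate records the sampling frequency of the stored samples.
schema = {"audio": Audio(shape=(None,), max_shape=(48000,), sample_rate=44100)}
```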

### Video
43 changes: 39 additions & 4 deletions hub/api/dataset.py
@@ -7,6 +7,7 @@
from collections import defaultdict

import fsspec
from fsspec.spec import AbstractFileSystem
import numcodecs
import numcodecs.lz4
import numcodecs.zstd
@@ -68,7 +69,7 @@ class Dataset:
def __init__(
self,
url: str,
mode: str = "a",
mode: str = None,
shape=None,
schema=None,
token=None,
@@ -81,6 +82,7 @@ def __init__(
tokenizer=None,
lazy: bool = True,
public: bool = True,
name: str = None,
):
"""| Open a new or existing dataset for read/write
@@ -115,21 +117,23 @@ def __init__(
only applicable if using hub storage, ignored otherwise
setting this to False allows only the user who created it to access the dataset and
the dataset won't be visible in the visualizer to the public
name: str, optional
only applicable when using hub storage, this is the name that shows up on the visualizer
"""

shape = norm_shape(shape)
if len(shape) != 1:
raise ShapeLengthException()
mode = mode or "a"

storage_cache = norm_cache(storage_cache) if cache else 0
cache = norm_cache(cache)
schema: SchemaDict = featurify(schema) if schema else None

self._url = url
self._token = token
self._mode = mode
self.tokenizer = tokenizer
self.lazy = lazy
self._name = name

self._fs, self._path = (
(fs, url) if fs else get_fs_and_path(self._url, token=token, public=public)
@@ -138,7 +142,8 @@ def __init__(
self._storage_cache = storage_cache
self.lock_cache = lock_cache
self.version = "1.x"

mode = self._get_mode(mode, self._fs)
self._mode = mode
needcreate = self._check_and_prepare_dir()
fs_map = fs_map or get_storage_map(
self._fs, self._path, cache, lock=lock_cache, storage_cache=storage_cache
@@ -149,6 +154,7 @@ def __init__(
self.dataset_name = None
if not needcreate:
self.meta = json.loads(fs_map["meta.json"].decode("utf-8"))
self._name = self.meta.get("name") or None
self._shape = tuple(self.meta["shape"])
self._schema = hub.schema.deserialize.deserialize(self.meta["schema"])
self._meta_information = self.meta.get("meta_info") or dict()
@@ -219,6 +225,10 @@ def url(self):
def shape(self):
return self._shape

@property
def name(self):
return self._name

@property
def token(self):
return self._token
@@ -245,6 +255,7 @@ def _store_meta(self) -> dict:
"schema": hub.schema.serialize.serialize(self._schema),
"version": 1,
"meta_info": self._meta_information or dict(),
"name": self._name,
}

self._fs_map["meta.json"] = bytes(json.dumps(meta), "utf-8")
@@ -492,6 +503,12 @@ def append_shape(self, size: int):
size += self._shape[0]
self.resize_shape(size)

def rename(self, name: str) -> None:
""" Renames the dataset """
self._name = name
self.meta = self._store_meta()
self.flush()

def delete(self):
""" Deletes the dataset """
fs, path = self._fs, self._path
@@ -727,6 +744,24 @@ def keys(self):
"""
return self._tensors.keys()

def _get_mode(self, mode: str, fs: AbstractFileSystem):
if mode:
if mode not in ["r", "r+", "a", "a+", "w", "w+"]:
raise Exception(f"Invalid mode {mode}")
return mode
else:
try:
meta_path = posixpath.join(self._path, "meta.json")
if not fs.exists(self._path) or not fs.exists(meta_path):
return "a"
bytes_ = bytes("Hello", "utf-8")
path = posixpath.join(self._path, "mode_test")
fs.pipe(path, bytes_)
fs.rm(path)
except Exception:
return "r"
return "a"

@staticmethod
def from_tensorflow(ds, scheduler: str = "single", workers: int = 1):
"""Converts a tensorflow dataset into hub format.
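In plain terms, the new `_get_mode` above resolves the effective mode when the caller does not pass one: a dataset that does not exist yet opens in append mode, while an existing dataset is probed with a throwaway `mode_test` write and falls back to read-only if the probe fails. A sketch of the resulting behavior; the paths are illustrative:

```python
from hub import Dataset

# No mode given, path does not exist yet: resolves to "a" (create/append).
ds_new = Dataset("./data/mode_demo", shape=(10,), schema={"t": "uint8"})
ds_new.flush()

# No mode given, dataset exists: a throwaway write probe picks "a" if the
# storage is writable, otherwise "r".
ds_auto = Dataset("./data/mode_demo")

# Explicit modes are validated against r/r+/a/a+/w/w+.
ds_ro = Dataset("./data/mode_demo", mode="r")
```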
14 changes: 14 additions & 0 deletions hub/api/tests/test_dataset.py
@@ -854,6 +854,20 @@ def test_dataset_utils():
slice_extract_info(slice(20, None), 50)


def test_dataset_name():
schema = {"temp": "uint8"}
ds = Dataset(
"./data/test_ds_name", shape=(10,), schema=schema, name="my_dataset", mode="w"
)
ds.flush()
assert ds.name == "my_dataset"
ds2 = Dataset("./data/test_ds_name")
ds2.rename("my_dataset_2")
assert ds2.name == "my_dataset_2"
ds3 = Dataset("./data/test_ds_name")
assert ds3.name == "my_dataset_2"


if __name__ == "__main__":
test_dataset_assign_value()
test_dataset_setting_shape()
1 change: 1 addition & 0 deletions hub/defaults.py
@@ -3,3 +3,4 @@
DEFAULT_COMPRESSOR = "default"
DEFAULT_MEMORY_CACHE_SIZE = 2 ** 26
DEFAULT_STORAGE_CACHE_SIZE = 2 ** 28
AZURE_HOST_SUFFIX = "blob.core.windows.net"
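
`AZURE_HOST_SUFFIX` is imported by the new `hub/url.py` below but not yet consumed there; presumably it is intended for building Azure Blob endpoints, which take the form `https://<account>.blob.core.windows.net`. A sketch under that assumption, with a hypothetical helper that is not part of this commit:

```python
from hub.defaults import AZURE_HOST_SUFFIX

def azure_endpoint(account: str) -> str:
    # Hypothetical helper: builds the standard Azure Blob Storage
    # endpoint URL for a given storage account name.
    return f"https://{account}.{AZURE_HOST_SUFFIX}"

assert azure_endpoint("myaccount") == "https://myaccount.blob.core.windows.net"
```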
15 changes: 15 additions & 0 deletions hub/tests/test_url.py
@@ -0,0 +1,15 @@
# JUST TO GET COVERAGE
from hub.url import UrlProtocol, UrlType, Url


def test_url():
Url.parse("Some url")
Url(
UrlType.LOCAL,
UrlProtocol.FILESYSTEM,
"some path",
"some bucket",
"some user",
"some dataset",
"some endpoint",
).url
58 changes: 58 additions & 0 deletions hub/url.py
@@ -0,0 +1,58 @@
# NOT IMPLEMENTED COMPLETELY YET!

from typing import Union
from enum import Enum

from hub.defaults import AZURE_HOST_SUFFIX


class UrlProtocol(Enum):
UNKNOWN = "unknown"
S3 = "s3"
GCS = "gcs"
AZURE = "azure"
FILESYSTEM = "filesystem"


class UrlType(Enum):
HUB = "hub"
LOCAL = "local"
CLOUD = "cloud"


class Url:
@classmethod
def parse(cls, url: str) -> "Url":
assert isinstance(url, str)

pass

def __init__(
self,
url_type: UrlType,
protocol: UrlProtocol,
path: str, # for get_mapper(path)
bucket: Union[str, None] = None,
user: Union[str, None] = None,
dataset: Union[str, None] = None,
endpoint_url: Union[str, None] = None,
):
assert isinstance(url_type, UrlType)
assert isinstance(protocol, UrlProtocol)
assert isinstance(path, str)
assert isinstance(bucket, str) or bucket is None
assert isinstance(user, str) or user is None
assert isinstance(dataset, str) or dataset is None
assert isinstance(endpoint_url, str) or endpoint_url is None

self.url_type = url_type
self.protocol = protocol
self.path = path
self.bucket = bucket
self.user = user
self.dataset = dataset
self.endpoint_url = endpoint_url

@property
def url(self) -> str:
pass
