
Commit

Merge branch 'main' of http://github.com/activeloopai/Hub into feature/polish_main_directory
kristinagrig06 committed Aug 6, 2021
2 parents 395d794 + 7aa63c5 commit ae7ccea
Showing 47 changed files with 1,302 additions and 290 deletions.
5 changes: 5 additions & 0 deletions hub/__init__.py
@@ -15,9 +15,12 @@
from .core.transform import compute, compose
from .util.bugout_reporter import hub_reporter

list = dataset.list
load = dataset.load
empty = dataset.empty
like = dataset.like
ingest = dataset.ingest
ingest_kaggle = dataset.ingest_kaggle

__all__ = [
"dataset",
@@ -28,6 +31,8 @@
"compute",
"compose",
"like",
"ingest",
"ingest_kaggle",
]

__version__ = "2.0.4"
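With these aliases in place, ingestion can be driven straight from the top-level package. A minimal sketch under stated assumptions (the local paths are hypothetical; the signature follows the hub/api/dataset.py changes below):

import hub

# Ingest a local image-classification folder into a new structured dataset.
# "./animals" and "./animals_hub" are hypothetical paths.
ds = hub.ingest(
    src="./animals",       # unstructured source directory
    dest="./animals_hub",  # where the structured dataset is written
    dest_creds={},         # no credentials needed for a local destination
    compression="jpeg",
)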
198 changes: 187 additions & 11 deletions hub/api/dataset.py
@@ -1,10 +1,23 @@
from hub.util.exceptions import DatasetHandlerError
from hub.util.exceptions import (
DatasetHandlerError,
InvalidPathException,
SamePathException,
)
from hub.util.storage import get_storage_and_cache_chain
import hub
import os
from typing import Optional, Union
from hub.constants import DEFAULT_LOCAL_CACHE_SIZE, DEFAULT_MEMORY_CACHE_SIZE, MB
from hub.core.dataset import Dataset

from hub.constants import DEFAULT_MEMORY_CACHE_SIZE, DEFAULT_LOCAL_CACHE_SIZE, MB
from hub.client.log import logger
from hub.util.keys import dataset_exists
from hub.util.bugout_reporter import hub_reporter
from hub.auto.unstructured.image_classification import ImageClassification
from hub.auto.unstructured.kaggle import download_kaggle_dataset
from hub.client.client import HubBackendClient
from hub.util.exceptions import DatasetHandlerError
from hub.util.storage import get_storage_and_cache_chain, storage_provider_from_path
from hub.core.dataset import Dataset


class dataset:
@@ -189,8 +202,27 @@ def load(
@staticmethod
@hub_reporter.record_call
def delete(path: str, force: bool = False, large_ok: bool = False) -> None:
"""Deletes a dataset"""
raise NotImplementedError
"""Deletes a dataset at a given path.
This is an IRREVERSIBLE operation. Data once deleted cannot be recovered.
Args:
path (str): The path to the dataset to be deleted.
force (bool): Delete data regardless of whether
it looks like a hub dataset. All data at the path will be removed.
large_ok (bool): Delete datasets larger than 1GB. Disabled by default.
"""

try:
ds = hub.load(path)
ds.delete(large_ok=large_ok)
except:
if force:
base_storage = storage_provider_from_path(
path, creds={}, read_only=False, token=None
)
base_storage.clear()
else:
raise

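For context, a hedged sketch of how the new delete behaves (the path is hypothetical): without force, anything that fails to load as a hub dataset re-raises; with force=True, the path is cleared regardless.

import hub

# large_ok must be set for datasets over 1GB; force clears the path even
# if it does not look like a hub dataset. "./stale_dataset" is hypothetical.
hub.dataset.delete("./stale_dataset", force=True, large_ok=True)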
@staticmethod
@hub_reporter.record_call
@@ -227,12 +259,156 @@ def like(
@staticmethod
@hub_reporter.record_call
def ingest(
path: str, src: str, src_creds: dict, overwrite: bool = False
src: str,
dest: str,
dest_creds: dict,
compression: str,
overwrite: bool = False,
**dataset_kwargs,
) -> Dataset:
"""Ingests a dataset from a source and stores it as a structured dataset to destination
Note:
- Currently only local source paths and image classification datasets are supported for automatic ingestion.
- Supported filetypes: png/jpeg/jpg.
- All files and sub-directories with unsupported filetypes are ignored.
- Valid source directory structures look like:

    src:
        directory -
            img.jpg
            ...

    src:
        directory -
            class0 -
                img.jpg
                ...
            class1 -
                img.jpg
                ...
            ...

    src:
        directory -
            train -
                class0 -
                    img.jpg
                    ...
                ...
            test -
                class1 -
                    img.jpg
                    ...
                ...
            ...
- Classes defined as sub-directories can be accessed at `ds["test/labels"].info.class_names`.
- Train and test sub-directories are supported and mapped to ds["train/images"], ds["train/labels"] and ds["test/images"], ds["test/labels"] respectively.
- Mapping filenames to classes from an external file is currently not supported.
Args:
src (str): Local path to where the unstructured dataset is stored.
dest (str): Destination path where the structured dataset will be stored. Can be:
- a Hub cloud path of the form hub://username/datasetname. To write to Hub cloud datasets, ensure that you are logged in to Hub (use 'activeloop login' from the command line).
- an s3 path of the form s3://bucketname/path/to/dataset. Credentials are required either in the environment or passed to the dest_creds argument.
- a local file system path of the form ./path/to/dataset or ~/path/to/dataset or path/to/dataset.
- a memory path of the form mem://path/to/dataset which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
dest_creds (dict): A dictionary containing credentials used to access the destination path of the dataset.
compression (str): Compression type of the dataset (e.g. "jpeg" or "png").
overwrite (bool): WARNING: If set to True this overwrites the dataset if it already exists. This can NOT be undone! Defaults to False.
**dataset_kwargs: Any arguments passed here will be forwarded to the dataset creator function.
Returns:
Dataset: New dataset object containing the structured data.
Raises:
InvalidPathException: If the source directory does not exist.
SamePathException: If the source and destination paths are the same.
"""
if not os.path.isdir(src):
raise InvalidPathException(src)

if os.path.isdir(dest):
if os.path.samefile(src, dest):
raise SamePathException(src)

ds = hub.dataset(dest, creds=dest_creds, **dataset_kwargs)

# TODO: support more than just image classification (and update docstring)
unstructured = ImageClassification(source=src)

# TODO: auto detect compression
unstructured.structure(
ds, image_tensor_args={"sample_compression": compression} # type: ignore
)

return ds # type: ignore

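A short usage sketch for the class-subfolder layout described in the Note above; the folder and tensor names here are illustrative assumptions, not confirmed by this commit:

import hub

# ./flowers/daisy/*.jpg, ./flowers/rose/*.jpg, ... (hypothetical layout)
ds = hub.ingest(
    src="./flowers",
    dest="hub://myuser/flowers",  # hypothetical Hub cloud path
    dest_creds={},
    compression="jpeg",
)
# For the flat class layout the label classes are assumed to land on
# ds["labels"]; the train/test layout uses ds["train/labels"] etc.
print(ds["labels"].info.class_names)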
@staticmethod
@hub_reporter.record_call
def ingest_kaggle(
tag: str,
src: str,
dest: str,
dest_creds: dict,
compression: str,
overwrite: bool = False,
**dataset_kwargs,
) -> Dataset:
"""Ingests a dataset from a source"""
raise NotImplementedError
"""Download and ingest a kaggle dataset and store it as a structured dataset to destination
Note:
Currently only local source paths and image classification datasets are supported for automatic ingestion.
Args:
tag (str): Kaggle dataset tag. Example: `"coloradokb/dandelionimages"` points to https://www.kaggle.com/coloradokb/dandelionimages
src (str): Local path where the unstructured Kaggle dataset will be downloaded and read from.
dest (str): Destination path where the structured dataset will be stored. Can be:
- a Hub cloud path of the form hub://username/datasetname. To write to Hub cloud datasets, ensure that you are logged in to Hub (use 'activeloop login' from the command line).
- an s3 path of the form s3://bucketname/path/to/dataset. Credentials are required either in the environment or passed to the dest_creds argument.
- a local file system path of the form ./path/to/dataset or ~/path/to/dataset or path/to/dataset.
- a memory path of the form mem://path/to/dataset which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
dest_creds (dict): A dictionary containing credentials used to access the destination path of the dataset.
compression (str): Compression type of the dataset (e.g. "jpeg" or "png").
overwrite (bool): WARNING: If set to True this overwrites the dataset if it already exists. This can NOT be undone! Defaults to False.
**dataset_kwargs: Any arguments passed here will be forwarded to the dataset creator function.
Returns:
Dataset: New dataset object containing the structured data.
Raises:
SamePathException: If the source and destination paths are the same.
"""
if os.path.isdir(src) and os.path.isdir(dest):
if os.path.samefile(src, dest):
raise SamePathException(src)

download_kaggle_dataset(tag, local_path=src)

ds = hub.ingest(
src=src,
dest=dest,
dest_creds=dest_creds,
compression=compression,
overwrite=overwrite,
**dataset_kwargs,
)

return ds

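An end-to-end sketch using the Kaggle tag from the docstring; the local paths are hypothetical, and configured Kaggle API credentials are assumed for the download step:

import hub

ds = hub.ingest_kaggle(
    tag="coloradokb/dandelionimages",  # example tag from the docstring above
    src="./kaggle_dandelions",         # where the Kaggle archive is downloaded
    dest="./dandelions_hub",           # destination for the structured dataset
    dest_creds={},
    compression="jpeg",
)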
@staticmethod
def list(workspace: str) -> None:
"""List all datasets"""
raise NotImplementedError
def list(workspace: str = "") -> list:
"""List all available hub cloud datasets.
Args:
workspace (str): The user or organization name. If not given,
all datasets that can be accessed are listed, regardless of which workspace they are in.
Otherwise, all datasets in the given workspace are listed.
Returns:
List of dataset names.
"""
client = HubBackendClient()
datasets = client.get_datasets(workspace=workspace)
return datasets
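A quick sketch of the new list call; the workspace name is illustrative, and an authenticated Hub client is assumed:

import hub

print(hub.list())              # every dataset the current user can access
print(hub.list("activeloop"))  # only datasets in the 'activeloop' workspace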
