Skip to content

Commit

Permalink
Merge pull request #2256 from activeloopai/hf_updates
Browse files Browse the repository at this point in the history
Updates to ingest_huggingface
  • Loading branch information
istranic authored Mar 28, 2023
2 parents b2a8df2 + 424e173 commit aedeb46
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 6 deletions.
36 changes: 33 additions & 3 deletions deeplake/integrations/huggingface/huggingface.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import pathlib
from typing import Union, Set
from typing import Dict, Set
from deeplake.core.dataset import Dataset
import posixpath
import deeplake
from typing import Optional
from tqdm import tqdm # type: ignore
from deeplake.util.bugout_reporter import feature_report_path, deeplake_reporter


def _is_seq_convertible(seq):
Expand Down Expand Up @@ -68,6 +70,9 @@ def ingest_huggingface(
src,
dest,
use_progressbar=True,
token: Optional[str] = None,
connect_kwargs: Optional[Dict] = None,
**dataset_kwargs,
) -> Dataset:
"""Converts Hugging Face datasets to Deep Lake format.
Expand All @@ -76,10 +81,16 @@ def ingest_huggingface(
DatasetDict will be stored under respective tensor groups.
dest (Dataset, str, pathlib.Path): Destination dataset or path to it.
use_progressbar (bool): Defines if progress bar should be used to show conversion progress.
token (Optional[str]): The token to use for accessing the dataset and/or connecting it to Deep Lake.
connect_kwargs (Optional[Dict]): If specified, the dataset will be connected to Deep Lake, and connect_kwargs will be passed to :meth:`Dataset.connect <deeplake.core.dataset.Dataset.connect>`.
**dataset_kwargs: Any arguments passed here will be forwarded to the dataset creator function. See :func:`deeplake.empty`.
Returns:
Dataset: The destination Deep Lake dataset.
Raises:
ValueError: If ``dest`` is not a path or a Deep Lake :class:`Dataset <deeplake.core.dataset.Dataset>`.
Note:
- if DatasetDict looks like:
Expand All @@ -100,12 +111,31 @@ def ingest_huggingface(
Features of the type ``Sequence(feature=Value(dtype='string'))`` are not supported. Columns of such type are skipped.
"""

feature_report_path(
dest,
"ingest_huggingface",
parameters={},
token=token,
)

from datasets import DatasetDict

if isinstance(dest, (str, pathlib.Path)):
ds = deeplake.dataset(dest)
ds = deeplake.empty(dest, token=token, **dataset_kwargs)
elif isinstance(dest, Dataset):
if dataset_kwargs.get("overwrite"):
ds = deeplake.empty(dest.path, token=token, **dataset_kwargs)
else:
ds = dest # type: ignore
else:
ds = dest # type: ignore
raise ValueError(
f"Expected `dest` to be a path or deeplake Dataset object, got {type(dest)}."
)

if connect_kwargs is not None:
connect_kwargs["token"] = token or connect_kwargs.get("token")
ds.connect(**connect_kwargs)

if isinstance(src, DatasetDict):
for split, src_ds in src.items():
Expand Down
15 changes: 13 additions & 2 deletions deeplake/integrations/tests/test_huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from deeplake.api.tests.test_api import convert_string_to_pathlib_if_needed
from deeplake.integrations.huggingface import ingest_huggingface
from deeplake.integrations.huggingface.huggingface import _is_seq_convertible
from deeplake.util.exceptions import TensorAlreadyExistsError
from numpy.testing import assert_array_equal

import pytest
Expand Down Expand Up @@ -39,9 +40,9 @@ def test_split():
assert_array_equal(dl_ds[column].numpy().reshape(-1), ds[column])


def test_seq_with_dict():
def test_seq_with_dict(local_path):
ds = load_dataset("squad", split="train[:5%]")
dl_ds = deeplake.ingest_huggingface(ds, "mem://xyz")
dl_ds = deeplake.ingest_huggingface(ds, local_path)

keys = set(ds.column_names) - {"answers"} | {"answers/text", "answers/answer_start"}

Expand All @@ -60,6 +61,16 @@ def test_seq_with_dict():
dl_ds["answers/answer_start"].numpy().reshape(-1), answers["answer_start"]
)

# test overwrite
with pytest.raises(TensorAlreadyExistsError):
dl_ds = deeplake.ingest_huggingface(ds, dl_ds)

dl_ds = deeplake.ingest_huggingface(ds, dl_ds, overwrite=True)
assert_array_equal(dl_ds["answers/text"].numpy().reshape(-1), answers["text"])
assert_array_equal(
dl_ds["answers/answer_start"].numpy().reshape(-1), answers["answer_start"]
)


def test_seq():
arr1 = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]
Expand Down
2 changes: 1 addition & 1 deletion deeplake/util/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def __init__(self, tensor_name: str):
class TensorAlreadyExistsError(Exception):
def __init__(self, key: str):
super().__init__(
f"Tensor '{key}' already exists. If applicable, you can use the `overwrite=True` parameter!"
f"Tensor '{key}' already exists. You can use the `exist_ok=True` parameter to ignore this error message."
)


Expand Down

0 comments on commit aedeb46

Please sign in to comment.