Allow virtual tensors .data to use indra_ts.numpy as fallback. (#2873)

GitHub Actions / JUnit Test Report failed Jun 13, 2024 in 0s

22067 tests run, 11585 passed, 10473 skipped, 9 failed.

Annotations

Check failure on line 17 in deeplake/util/tests/test_read.py

test_read.test[3.9.10]

deeplake.util.exceptions.DatasetHandlerError: A Deep Lake dataset does not exist at the given path (./datasets/3_9_10). Check the path provided or in case you want to create a new dataset, use deeplake.empty().
Raw output
version = '3.9.10', request = <FixtureRequest for <Function test[3.9.10]>>

    @versions
    def test(version, request):
        assert_version(version)
>       ds = load_dataset(version)

buH/buh/tests/test_read.py:17: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
buH/buh/tests/common.py:86: in load_dataset
    return _bc_load_dataset(dataset_path)
buH/buh/tests/common.py:79: in _bc_load_dataset
    return loader(path)
buH/buh/tests/common.py:58: in _load1
    return hub.load(path)
deeplake/util/spinner.py:151: in inner
    return func(*args, **kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

path = './datasets/3_9_10', read_only = None, memory_cache_size = 2000
local_cache_size = 0, creds = {}, token = None, org_id = None, verbose = True
access_method = 'stream', unlink = False, reset = False, indra = False
check_integrity = None, lock_timeout = 0, lock_enabled = True
index_params = None

    @staticmethod
    @spinner
    def load(
        path: Union[str, pathlib.Path],
        read_only: Optional[bool] = None,
        memory_cache_size: int = DEFAULT_MEMORY_CACHE_SIZE,
        local_cache_size: int = DEFAULT_LOCAL_CACHE_SIZE,
        creds: Optional[Union[dict, str]] = None,
        token: Optional[str] = None,
        org_id: Optional[str] = None,
        verbose: bool = True,
        access_method: str = "stream",
        unlink: bool = False,
        reset: bool = False,
        indra: bool = USE_INDRA,
        check_integrity: Optional[bool] = None,
        lock_timeout: Optional[int] = 0,
        lock_enabled: Optional[bool] = True,
        index_params: Optional[Dict[str, Union[int, str]]] = None,
    ) -> Dataset:
        """Loads an existing Deep Lake dataset.
    
        Examples:
    
            >>> ds = deeplake.load("hub://org_id/dataset") # Load dataset managed by Deep Lake.
            >>> ds = deeplake.load("s3://mybucket/my_dataset", creds = {"aws_access_key_id": ..., ...}) # Load dataset stored in your cloud using your own credentials.
            >>> ds = deeplake.load("s3://mybucket/my_dataset", creds = {"creds_key": "managed_creds_key"}, org_id = "my_org_id") # Load dataset stored in your cloud using Deep Lake managed credentials.
    
            Loading to a specific version:
    
            >>> ds = deeplake.load("hub://org_id/dataset@new_branch")
            >>> ds = deeplake.load("hub://org_id/dataset@3e49cded62b6b335c74ff07e97f8451a37aca7b2)
    
            >>> my_commit_id = "3e49cded62b6b335c74ff07e97f8451a37aca7b2"
            >>> ds = deeplake.load(f"hub://org_id/dataset@{my_commit_id}")
    
        Args:
            path (str, pathlib.Path): - The full path to the dataset. Can be:
                - a Deep Lake cloud path of the form ``hub://org_id/datasetname``. To write to Deep Lake cloud datasets, ensure that you are authenticated to Deep Lake (pass in a token using the 'token' parameter).
                - an s3 path of the form ``s3://bucketname/path/to/dataset``. Credentials are required in either the environment or passed to the creds argument.
                - a local file system path of the form ``./path/to/dataset`` or ``~/path/to/dataset`` or ``path/to/dataset``.
                - a memory path of the form ``mem://path/to/dataset`` which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
                - Loading to a specific version:
    
                        - You can also specify a ``commit_id`` or ``branch`` to load the dataset to that version directly by using the ``@`` symbol.
                        - The path will then be of the form ``hub://org_id/dataset@{branch}`` or ``hub://org_id/dataset@{commit_id}``.
                        - See examples above.
            read_only (bool, optional): Opens dataset in read only mode if this is passed as ``True``. Defaults to ``False``.
                Datasets stored on Deep Lake cloud that your account does not have write access to will automatically open in read mode.
            memory_cache_size (int): The size of the memory cache to be used in MB.
            local_cache_size (int): The size of the local filesystem cache to be used in MB.
            creds (dict, str, optional): The string ``ENV`` or a dictionary containing credentials used to access the dataset at the path.
                - If 'aws_access_key_id', 'aws_secret_access_key', 'aws_session_token' are present, these take precedence over credentials present in the environment or in credentials file. Currently only works with s3 paths.
                - It supports 'aws_access_key_id', 'aws_secret_access_key', 'aws_session_token', 'endpoint_url', 'aws_region', 'profile_name' as keys.
                - To use credentials managed in your Activeloop organization, use the key 'creds_key': 'managed_key_name'. This requires the ``org_id`` argument to be set.
                - If 'ENV' is passed, credentials are fetched from the environment variables. This is also the case when creds is not passed for cloud datasets. For datasets connected to hub cloud, specifying 'ENV' will override the credentials fetched from Activeloop and use local ones.
            token (str, optional): Activeloop token, used for fetching credentials to the dataset at path if it is a Deep Lake dataset. This is optional, tokens are normally autogenerated.
            org_id (str, Optional): Organization id to be used for enabling high-performance features. Only applicable for local datasets.
            verbose (bool): If ``True``, logs will be printed. Defaults to ``True``.
            access_method (str): The access method to use for the dataset. Can be:
    
                    - 'stream'
    
                        - Streams the data from the dataset i.e. only fetches data when required. This is the default value.
    
                    - 'download'
    
                        - Downloads the data to the local filesystem to the path specified in environment variable ``DEEPLAKE_DOWNLOAD_PATH``.
                          This will overwrite ``DEEPLAKE_DOWNLOAD_PATH``.
                        - Raises an exception if ``DEEPLAKE_DOWNLOAD_PATH`` environment variable is not set or if the dataset does not exist.
                        - The 'download' access method can be modified to specify num_workers and/or scheduler.
                          For example: 'download:2:processed' will use 2 workers and use processed scheduler, while 'download:3' will use 3 workers and
                          default scheduler (threaded), and 'download:processed' will use a single worker and use processed scheduler.
    
                    - 'local'
    
                        - Downloads the dataset if it doesn't already exist, otherwise loads from local storage.
                        - Raises an exception if ``DEEPLAKE_DOWNLOAD_PATH`` environment variable is not set.
                        - The 'local' access method can be modified to specify num_workers and/or scheduler to be used in case dataset needs to be downloaded.
                          If dataset needs to be downloaded, 'local:2:processed' will use 2 workers and use processed scheduler, while 'local:3' will use 3 workers
                          and default scheduler (threaded), and 'local:processed' will use a single worker and use processed scheduler.
            unlink (bool): Downloads linked samples if set to ``True``. Only applicable if ``access_method`` is ``download`` or ``local``. Defaults to ``False``.
            reset (bool): If the specified dataset cannot be loaded due to a corrupted HEAD state of the branch being loaded,
                          setting ``reset=True`` will reset HEAD changes and load the previous version.
            check_integrity (bool, Optional): Performs an integrity check by default (None) if the dataset has 20 or fewer tensors.
                                              Set to ``True`` to force integrity check, ``False`` to skip integrity check.
            indra (bool): Flag indicating whether indra api should be used to create the dataset. Defaults to ``False``.
    
        ..
            # noqa: DAR101
    
        Returns:
            Dataset: Dataset loaded using the arguments provided.
    
        Raises:
            DatasetHandlerError: If a Dataset does not exist at the given path.
            AgreementError: When agreement is rejected
            UserNotLoggedInException: When user is not authenticated
            InvalidTokenException: If the specified token is invalid
            TokenPermissionError: When there are permission or other errors related to token
            CheckoutError: If version address specified in the path cannot be found
            DatasetCorruptError: If loading the dataset failed due to corruption and ``reset`` is not ``True``
            ReadOnlyModeError: If reset is attempted in read-only mode
            LockedException: When attempting to open a dataset for writing when it is locked by another machine
            ValueError: If ``org_id`` is specified for a non-local dataset
            Exception: Re-raises caught exception if reset cannot fix the issue
            ValueError: If the org id is provided but the dataset is not local
    
        Warning:
            Setting ``access_method`` to download will overwrite the local copy of the dataset if it was previously downloaded.
    
        Note:
            Any changes made to the dataset in download / local mode will only be made to the local copy and will not be reflected in the original dataset.
        """
        _check_indra_and_read_only_flags(indra, read_only)
        access_method, num_workers, scheduler = parse_access_method(access_method)
        check_access_method(access_method, overwrite=False, unlink=unlink)
    
        path, address = process_dataset_path(path)
    
        if creds is None:
            creds = {}
    
        dataset_creds_key = _fetch_creds_from_key(creds, org_id, token)
    
        try:
            storage, cache_chain = get_storage_and_cache_chain(
                path=path,
                read_only=read_only,
                creds=creds,
                token=token,
                memory_cache_size=memory_cache_size,
                local_cache_size=local_cache_size,
                indra=indra,
            )
            feature_report_path(
                path,
                "load",
                {
                    "lock_enabled": lock_enabled,
                    "lock_timeout": lock_timeout,
                    "index_params": index_params,
                },
                token=token,
            )
        except Exception as e:
            if isinstance(e, UserNotLoggedInException):
                raise UserNotLoggedInException from None
            raise
        if not dataset_exists(cache_chain):
>           raise DatasetHandlerError(
                f"A Deep Lake dataset does not exist at the given path ({path}). Check the path provided or in case you want to create a new dataset, use deeplake.empty()."
            )
E           deeplake.util.exceptions.DatasetHandlerError: A Deep Lake dataset does not exist at the given path (./datasets/3_9_10). Check the path provided or in case you want to create a new dataset, use deeplake.empty().

deeplake/api/dataset.py:711: DatasetHandlerError
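
The failure above means the versioned fixture dataset was never materialized on the runner, so deeplake.load correctly refuses to open ./datasets/3_9_10. A minimal sketch of a guard the compatibility suite could use, assuming deeplake.exists() is available in the installed version; the helper name load_dataset_or_skip is hypothetical:

    import os

    import deeplake
    import pytest


    def load_dataset_or_skip(version: str, root: str = "./datasets"):
        # Map "3.9.10" to the on-disk fixture layout, e.g. ./datasets/3_9_10.
        path = os.path.join(root, version.replace(".", "_"))
        if not deeplake.exists(path):  # assumes deeplake.exists() is available
            # Skip rather than fail when the fixture was not generated on this runner.
            pytest.skip(f"fixture dataset missing at {path}")
        return deeplake.load(path)

Making fixture generation a hard prerequisite of the job would turn this class of failure into a clearer setup error instead of several downstream test failures.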

Check failure on line 25 in test_write

test_write.test_new_samples[3.9.10]

FileNotFoundError: [Errno 2] No such file or directory: './datasets/3_9_10'
Raw output
version = '3.9.10'
request = <FixtureRequest for <Function test_new_samples[3.9.10]>>

    @versions
    def test_new_samples(version, request):
        assert_version(version)
>       ds = load_dataset_copy(version, overwrite=True)

buH/buh/tests/test_write.py:25: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
buH/buh/tests/common.py:96: in load_dataset_copy
    new_path = shutil.copytree(dataset_path, new_dataset_path)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

src = './datasets/3_9_10', dst = './datasets/3_9_10_ffw3_9_10', symlinks = False
ignore = None, copy_function = <function copy2 at 0x7fd0db023eb0>
ignore_dangling_symlinks = False, dirs_exist_ok = False

    def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
                 ignore_dangling_symlinks=False, dirs_exist_ok=False):
        """Recursively copy a directory tree and return the destination directory.
    
        If exception(s) occur, an Error is raised with a list of reasons.
    
        If the optional symlinks flag is true, symbolic links in the
        source tree result in symbolic links in the destination tree; if
        it is false, the contents of the files pointed to by symbolic
        links are copied. If the file pointed by the symlink doesn't
        exist, an exception will be added in the list of errors raised in
        an Error exception at the end of the copy process.
    
        You can set the optional ignore_dangling_symlinks flag to true if you
        want to silence this exception. Notice that this has no effect on
        platforms that don't support os.symlink.
    
        The optional ignore argument is a callable. If given, it
        is called with the `src` parameter, which is the directory
        being visited by copytree(), and `names` which is the list of
        `src` contents, as returned by os.listdir():
    
            callable(src, names) -> ignored_names
    
        Since copytree() is called recursively, the callable will be
        called once for each directory that is copied. It returns a
        list of names relative to the `src` directory that should
        not be copied.
    
        The optional copy_function argument is a callable that will be used
        to copy each file. It will be called with the source path and the
        destination path as arguments. By default, copy2() is used, but any
        function that supports the same signature (like copy()) can be used.
    
        If dirs_exist_ok is false (the default) and `dst` already exists, a
        `FileExistsError` is raised. If `dirs_exist_ok` is true, the copying
        operation will continue if it encounters existing directories, and files
        within the `dst` tree will be overwritten by corresponding files from the
        `src` tree.
        """
        sys.audit("shutil.copytree", src, dst)
>       with os.scandir(src) as itr:
E       FileNotFoundError: [Errno 2] No such file or directory: './datasets/3_9_10'

/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/shutil.py:557: FileNotFoundError
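
This and the two test_write failures that follow are downstream of the same missing fixture: load_dataset_copy calls shutil.copytree on a source directory that does not exist. A hedged sketch of a defensive copy step; copy_fixture_dataset is a hypothetical name, not the helper in buH/buh/tests/common.py:

    import os
    import shutil

    import pytest


    def copy_fixture_dataset(dataset_path: str, new_dataset_path: str) -> str:
        # Fail fast (or skip) when the source fixture is absent instead of letting
        # shutil.copytree raise FileNotFoundError from deep inside the helper.
        if not os.path.isdir(dataset_path):
            pytest.skip(f"source fixture {dataset_path} is missing")
        if os.path.isdir(new_dataset_path):
            shutil.rmtree(new_dataset_path)  # drop a stale copy from a previous run
        return shutil.copytree(dataset_path, new_dataset_path)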

Check failure on line 53 in test_write

test_write.test_new_tensor[3.9.10]

FileNotFoundError: [Errno 2] No such file or directory: './datasets/3_9_10'
Raw output
version = '3.9.10'
request = <FixtureRequest for <Function test_new_tensor[3.9.10]>>

    @versions
    def test_new_tensor(version, request):
        assert_version(version)
>       ds = load_dataset_copy(version, overwrite=True)

buH/buh/tests/test_write.py:53: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
buH/buh/tests/common.py:96: in load_dataset_copy
    new_path = shutil.copytree(dataset_path, new_dataset_path)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

src = './datasets/3_9_10', dst = './datasets/3_9_10_ffw3_9_10', symlinks = False
ignore = None, copy_function = <function copy2 at 0x7fd0db023eb0>
ignore_dangling_symlinks = False, dirs_exist_ok = False

    def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
                 ignore_dangling_symlinks=False, dirs_exist_ok=False):
        """Recursively copy a directory tree and return the destination directory.
    
        If exception(s) occur, an Error is raised with a list of reasons.
    
        If the optional symlinks flag is true, symbolic links in the
        source tree result in symbolic links in the destination tree; if
        it is false, the contents of the files pointed to by symbolic
        links are copied. If the file pointed by the symlink doesn't
        exist, an exception will be added in the list of errors raised in
        an Error exception at the end of the copy process.
    
        You can set the optional ignore_dangling_symlinks flag to true if you
        want to silence this exception. Notice that this has no effect on
        platforms that don't support os.symlink.
    
        The optional ignore argument is a callable. If given, it
        is called with the `src` parameter, which is the directory
        being visited by copytree(), and `names` which is the list of
        `src` contents, as returned by os.listdir():
    
            callable(src, names) -> ignored_names
    
        Since copytree() is called recursively, the callable will be
        called once for each directory that is copied. It returns a
        list of names relative to the `src` directory that should
        not be copied.
    
        The optional copy_function argument is a callable that will be used
        to copy each file. It will be called with the source path and the
        destination path as arguments. By default, copy2() is used, but any
        function that supports the same signature (like copy()) can be used.
    
        If dirs_exist_ok is false (the default) and `dst` already exists, a
        `FileExistsError` is raised. If `dirs_exist_ok` is true, the copying
        operation will continue if it encounters existing directories, and files
        within the `dst` tree will be overwritten by corresponding files from the
        `src` tree.
        """
        sys.audit("shutil.copytree", src, dst)
>       with os.scandir(src) as itr:
E       FileNotFoundError: [Errno 2] No such file or directory: './datasets/3_9_10'

/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/shutil.py:557: FileNotFoundError

Check failure on line 64 in test_write

test_write.test_update_samples[3.9.10]

FileNotFoundError: [Errno 2] No such file or directory: './datasets/3_9_10'
Raw output
version = '3.9.10'
request = <FixtureRequest for <Function test_update_samples[3.9.10]>>

    @versions
    def test_update_samples(version, request):
        assert_version(version)
>       ds = load_dataset_copy(version, overwrite=True)

buH/buh/tests/test_write.py:64: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
buH/buh/tests/common.py:96: in load_dataset_copy
    new_path = shutil.copytree(dataset_path, new_dataset_path)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

src = './datasets/3_9_10', dst = './datasets/3_9_10_ffw3_9_10', symlinks = False
ignore = None, copy_function = <function copy2 at 0x7fd0db023eb0>
ignore_dangling_symlinks = False, dirs_exist_ok = False

    def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
                 ignore_dangling_symlinks=False, dirs_exist_ok=False):
        """Recursively copy a directory tree and return the destination directory.
    
        If exception(s) occur, an Error is raised with a list of reasons.
    
        If the optional symlinks flag is true, symbolic links in the
        source tree result in symbolic links in the destination tree; if
        it is false, the contents of the files pointed to by symbolic
        links are copied. If the file pointed by the symlink doesn't
        exist, an exception will be added in the list of errors raised in
        an Error exception at the end of the copy process.
    
        You can set the optional ignore_dangling_symlinks flag to true if you
        want to silence this exception. Notice that this has no effect on
        platforms that don't support os.symlink.
    
        The optional ignore argument is a callable. If given, it
        is called with the `src` parameter, which is the directory
        being visited by copytree(), and `names` which is the list of
        `src` contents, as returned by os.listdir():
    
            callable(src, names) -> ignored_names
    
        Since copytree() is called recursively, the callable will be
        called once for each directory that is copied. It returns a
        list of names relative to the `src` directory that should
        not be copied.
    
        The optional copy_function argument is a callable that will be used
        to copy each file. It will be called with the source path and the
        destination path as arguments. By default, copy2() is used, but any
        function that supports the same signature (like copy()) can be used.
    
        If dirs_exist_ok is false (the default) and `dst` already exists, a
        `FileExistsError` is raised. If `dirs_exist_ok` is true, the copying
        operation will continue if it encounters existing directories, and files
        within the `dst` tree will be overwritten by corresponding files from the
        `src` tree.
        """
        sys.audit("shutil.copytree", src, dst)
>       with os.scandir(src) as itr:
E       FileNotFoundError: [Errno 2] No such file or directory: './datasets/3_9_10'

/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/shutil.py:557: FileNotFoundError

Check failure on line 45 in deeplake/api/tests/test_agreement.py

test_agreement.test_agreement_logged_in

deeplake.util.exceptions.BadRequestException: Invalid Request. One or more request parameters is incorrect.
'_InstrumentedFastAPI' object has no attribute 'mongodb_datastore'
Raw output
hub_cloud_dev_token = 'eyJhbGciOiJub25lIiwidHlwIjoiSldUIn0.eyJpZCI6InRlc3RpbmdhY2MyIiwiYXBpX2tleSI6IjU4Y0tLb1p6UE1BbThPU2RpbTRiZ2tBekhWekt1VUE3MFJpNTNyZUpKRTJuaiJ9.'

    @pytest.mark.slow
    @pytest.mark.flaky(reruns=3)
    def test_agreement_logged_in(hub_cloud_dev_token):
        path = "hub://activeloop/imagenet-test"
        agree(path, hub_cloud_dev_token)
>       reject(path, hub_cloud_dev_token)

deeplake/api/tests/test_agreement.py:59: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
deeplake/api/tests/test_agreement.py:45: in reject
    client.reject_agreements(org_id, ds_name)
deeplake/client/client.py:328: in reject_agreements
    self.request(
deeplake/client/client.py:137: in request
    check_response_status(response)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

response = <Response [400]>

    def check_response_status(response: requests.Response):
        """Check response status and throw corresponding exception on failure."""
        code = response.status_code
        if code >= 200 and code < 300:
            return
    
        try:
            message = response.json()["description"]
        except Exception:
            message = " "
    
        if code == 400:
>           raise BadRequestException(message)
E           deeplake.util.exceptions.BadRequestException: Invalid Request. One or more request parameters is incorrect.
E           '_InstrumentedFastAPI' object has no attribute 'mongodb_datastore'

deeplake/client/utils.py:56: BadRequestException

Check failure on line 24 in deeplake/api/tests/test_agreement.py

test_agreement.test_not_agreement_logged_in

Failed: DID NOT RAISE <class 'deeplake.util.exceptions.AgreementNotAcceptedError'>
Raw output
hub_cloud_dev_token = 'eyJhbGciOiJub25lIiwidHlwIjoiSldUIn0.eyJpZCI6InRlc3RpbmdhY2MyIiwiYXBpX2tleSI6IjU4Y0tLb1p6UE1BbThPU2RpbTRiZ2tBekhWekt1VUE3MFJpNTNyZUpKRTJuaiJ9.'

    @pytest.mark.flaky(reruns=3)
    @pytest.mark.slow
    def test_not_agreement_logged_in(hub_cloud_dev_token):
        path = "hub://activeloop/imagenet-test"
>       dont_agree(path, hub_cloud_dev_token)

deeplake/api/tests/test_agreement.py:66: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

path = 'hub://activeloop/imagenet-test'
token = 'eyJhbGciOiJub25lIiwidHlwIjoiSldUIn0.eyJpZCI6InRlc3RpbmdhY2MyIiwiYXBpX2tleSI6IjU4Y0tLb1p6UE1BbThPU2RpbTRiZ2tBekhWekt1VUE3MFJpNTNyZUpKRTJuaiJ9.'

    def dont_agree(path, token: str):
        """Load the Deep Lake cloud dataset at path and simulate disagreeing to the terms of access."""
    
>       with pytest.raises(AgreementNotAcceptedError):
E       Failed: DID NOT RAISE <class 'deeplake.util.exceptions.AgreementNotAcceptedError'>

deeplake/api/tests/test_agreement.py:24: Failed
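
Here pytest.raises reports DID NOT RAISE: loading with the terms declined did not surface AgreementNotAcceptedError, most likely because the agreement state on the backend was left inconsistent by the 400 error in the previous failure. A rough sketch of the pattern the helper asserts; how the decline is simulated (patching the interactive prompt) is an assumption, since the helper's body is not shown in the captured output:

    import builtins
    from unittest.mock import patch

    import deeplake
    import pytest
    from deeplake.util.exceptions import AgreementNotAcceptedError


    def dont_agree_sketch(path: str, token: str) -> None:
        # Declining the dataset's terms of access is expected to surface
        # AgreementNotAcceptedError from deeplake.load.
        with pytest.raises(AgreementNotAcceptedError):
            with patch.object(builtins, "input", return_value="no"):  # assumed mechanism
                deeplake.load(path, token=token)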

Check failure on line 57 in deeplake/core/test_serialize.py

test_serialize.test_get_large_header

deeplake.util.exceptions.DatasetCorruptError: Exception occurred (see Traceback). The dataset may be corrupted. Try using `reset=True` to reset HEAD changes and load the previous commit. This will delete all uncommitted changes on the branch you are trying to load.
Raw output
path = 'hub://activeloop/hmdb51-train', read_only = None
memory_cache_size = 2000, local_cache_size = 0, creds = {}, token = None
org_id = None, verbose = True, access_method = 'stream', unlink = False
reset = False, indra = False, check_integrity = None, lock_timeout = 0
lock_enabled = True, index_params = None

    @staticmethod
    @spinner
    def load(
        path: Union[str, pathlib.Path],
        read_only: Optional[bool] = None,
        memory_cache_size: int = DEFAULT_MEMORY_CACHE_SIZE,
        local_cache_size: int = DEFAULT_LOCAL_CACHE_SIZE,
        creds: Optional[Union[dict, str]] = None,
        token: Optional[str] = None,
        org_id: Optional[str] = None,
        verbose: bool = True,
        access_method: str = "stream",
        unlink: bool = False,
        reset: bool = False,
        indra: bool = USE_INDRA,
        check_integrity: Optional[bool] = None,
        lock_timeout: Optional[int] = 0,
        lock_enabled: Optional[bool] = True,
        index_params: Optional[Dict[str, Union[int, str]]] = None,
    ) -> Dataset:
        """Loads an existing Deep Lake dataset.
    
        Examples:
    
            >>> ds = deeplake.load("hub://org_id/dataset") # Load dataset managed by Deep Lake.
            >>> ds = deeplake.load("s3://mybucket/my_dataset", creds = {"aws_access_key_id": ..., ...}) # Load dataset stored in your cloud using your own credentials.
            >>> ds = deeplake.load("s3://mybucket/my_dataset", creds = {"creds_key": "managed_creds_key"}, org_id = "my_org_id") # Load dataset stored in your cloud using Deep Lake managed credentials.
    
            Loading to a specific version:
    
            >>> ds = deeplake.load("hub://org_id/dataset@new_branch")
            >>> ds = deeplake.load("hub://org_id/dataset@3e49cded62b6b335c74ff07e97f8451a37aca7b2)
    
            >>> my_commit_id = "3e49cded62b6b335c74ff07e97f8451a37aca7b2"
            >>> ds = deeplake.load(f"hub://org_id/dataset@{my_commit_id}")
    
        Args:
            path (str, pathlib.Path): - The full path to the dataset. Can be:
                - a Deep Lake cloud path of the form ``hub://org_id/datasetname``. To write to Deep Lake cloud datasets, ensure that you are authenticated to Deep Lake (pass in a token using the 'token' parameter).
                - an s3 path of the form ``s3://bucketname/path/to/dataset``. Credentials are required in either the environment or passed to the creds argument.
                - a local file system path of the form ``./path/to/dataset`` or ``~/path/to/dataset`` or ``path/to/dataset``.
                - a memory path of the form ``mem://path/to/dataset`` which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
                - Loading to a specific version:
    
                        - You can also specify a ``commit_id`` or ``branch`` to load the dataset to that version directly by using the ``@`` symbol.
                        - The path will then be of the form ``hub://org_id/dataset@{branch}`` or ``hub://org_id/dataset@{commit_id}``.
                        - See examples above.
            read_only (bool, optional): Opens dataset in read only mode if this is passed as ``True``. Defaults to ``False``.
                Datasets stored on Deep Lake cloud that your account does not have write access to will automatically open in read mode.
            memory_cache_size (int): The size of the memory cache to be used in MB.
            local_cache_size (int): The size of the local filesystem cache to be used in MB.
            creds (dict, str, optional): The string ``ENV`` or a dictionary containing credentials used to access the dataset at the path.
                - If 'aws_access_key_id', 'aws_secret_access_key', 'aws_session_token' are present, these take precedence over credentials present in the environment or in credentials file. Currently only works with s3 paths.
                - It supports 'aws_access_key_id', 'aws_secret_access_key', 'aws_session_token', 'endpoint_url', 'aws_region', 'profile_name' as keys.
                - To use credentials managed in your Activeloop organization, use the key 'creds_key': 'managed_key_name'. This requires the ``org_id`` argument to be set.
                - If 'ENV' is passed, credentials are fetched from the environment variables. This is also the case when creds is not passed for cloud datasets. For datasets connected to hub cloud, specifying 'ENV' will override the credentials fetched from Activeloop and use local ones.
            token (str, optional): Activeloop token, used for fetching credentials to the dataset at path if it is a Deep Lake dataset. This is optional, tokens are normally autogenerated.
            org_id (str, Optional): Organization id to be used for enabling high-performance features. Only applicable for local datasets.
            verbose (bool): If ``True``, logs will be printed. Defaults to ``True``.
            access_method (str): The access method to use for the dataset. Can be:
    
                    - 'stream'
    
                        - Streams the data from the dataset i.e. only fetches data when required. This is the default value.
    
                    - 'download'
    
                        - Downloads the data to the local filesystem to the path specified in environment variable ``DEEPLAKE_DOWNLOAD_PATH``.
                          This will overwrite ``DEEPLAKE_DOWNLOAD_PATH``.
                        - Raises an exception if ``DEEPLAKE_DOWNLOAD_PATH`` environment variable is not set or if the dataset does not exist.
                        - The 'download' access method can be modified to specify num_workers and/or scheduler.
                          For example: 'download:2:processed' will use 2 workers and use processed scheduler, while 'download:3' will use 3 workers and
                          default scheduler (threaded), and 'download:processed' will use a single worker and use processed scheduler.
    
                    - 'local'
    
                        - Downloads the dataset if it doesn't already exist, otherwise loads from local storage.
                        - Raises an exception if ``DEEPLAKE_DOWNLOAD_PATH`` environment variable is not set.
                        - The 'local' access method can be modified to specify num_workers and/or scheduler to be used in case dataset needs to be downloaded.
                          If dataset needs to be downloaded, 'local:2:processed' will use 2 workers and use processed scheduler, while 'local:3' will use 3 workers
                          and default scheduler (threaded), and 'local:processed' will use a single worker and use processed scheduler.
            unlink (bool): Downloads linked samples if set to ``True``. Only applicable if ``access_method`` is ``download`` or ``local``. Defaults to ``False``.
            reset (bool): If the specified dataset cannot be loaded due to a corrupted HEAD state of the branch being loaded,
                          setting ``reset=True`` will reset HEAD changes and load the previous version.
            check_integrity (bool, Optional): Performs an integrity check by default (None) if the dataset has 20 or fewer tensors.
                                              Set to ``True`` to force integrity check, ``False`` to skip integrity check.
            indra (bool): Flag indicating whether indra api should be used to create the dataset. Defaults to ``False``.
    
        ..
            # noqa: DAR101
    
        Returns:
            Dataset: Dataset loaded using the arguments provided.
    
        Raises:
            DatasetHandlerError: If a Dataset does not exist at the given path.
            AgreementError: When agreement is rejected
            UserNotLoggedInException: When user is not authenticated
            InvalidTokenException: If the specified token is invalid
            TokenPermissionError: When there are permission or other errors related to token
            CheckoutError: If version address specified in the path cannot be found
            DatasetCorruptError: If loading the dataset failed due to corruption and ``reset`` is not ``True``
            ReadOnlyModeError: If reset is attempted in read-only mode
            LockedException: When attempting to open a dataset for writing when it is locked by another machine
            ValueError: If ``org_id`` is specified for a non-local dataset
            Exception: Re-raises caught exception if reset cannot fix the issue
            ValueError: If the org id is provided but the dataset is not local
    
        Warning:
            Setting ``access_method`` to download will overwrite the local copy of the dataset if it was previously downloaded.
    
        Note:
            Any changes made to the dataset in download / local mode will only be made to the local copy and will not be reflected in the original dataset.
        """
        _check_indra_and_read_only_flags(indra, read_only)
        access_method, num_workers, scheduler = parse_access_method(access_method)
        check_access_method(access_method, overwrite=False, unlink=unlink)
    
        path, address = process_dataset_path(path)
    
        if creds is None:
            creds = {}
    
        dataset_creds_key = _fetch_creds_from_key(creds, org_id, token)
    
        try:
            storage, cache_chain = get_storage_and_cache_chain(
                path=path,
                read_only=read_only,
                creds=creds,
                token=token,
                memory_cache_size=memory_cache_size,
                local_cache_size=local_cache_size,
                indra=indra,
            )
            feature_report_path(
                path,
                "load",
                {
                    "lock_enabled": lock_enabled,
                    "lock_timeout": lock_timeout,
                    "index_params": index_params,
                },
                token=token,
            )
        except Exception as e:
            if isinstance(e, UserNotLoggedInException):
                raise UserNotLoggedInException from None
            raise
        if not dataset_exists(cache_chain):
            raise DatasetHandlerError(
                f"A Deep Lake dataset does not exist at the given path ({path}). Check the path provided or in case you want to create a new dataset, use deeplake.empty()."
            )
    
        if indra and read_only:
            from indra import api  # type: ignore
    
            ids = api.load_from_storage(storage.core)
            return IndraDatasetView(indra_ds=ids)
    
        dataset_kwargs: Dict[str, Union[None, str, bool, int, Dict]] = {
            "path": path,
            "read_only": read_only,
            "token": token,
            "org_id": org_id,
            "verbose": verbose,
            "lock_timeout": lock_timeout,
            "lock_enabled": lock_enabled,
            "index_params": index_params,
        }
    
        if dataset_creds_key:
            dataset_kwargs["dataset_creds_key"] = dataset_creds_key
            dataset_kwargs["dataset_creds_key_org_id"] = org_id
            dataset_kwargs["dataset_creds_key_token"] = token
    
        if access_method == "stream":
            dataset_kwargs.update(
                {
                    "address": address,
                    "storage": cache_chain,
                }
            )
        else:
            dataset_kwargs.update(
                {
                    "access_method": access_method,
                    "memory_cache_size": memory_cache_size,
                    "local_cache_size": local_cache_size,
                    "creds": creds,
                    "ds_exists": True,
                    "num_workers": num_workers,
                    "scheduler": scheduler,
                    "reset": reset,
                    "unlink": unlink,
                }
            )
    
        try:
>           return dataset._load(
                dataset_kwargs, access_method, check_integrity=check_integrity
            )

deeplake/api/dataset.py:760: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
deeplake/api/dataset.py:831: in _load
    ret = dataset_factory(**dataset_kwargs)
deeplake/core/dataset/__init__.py:23: in dataset_factory
    ds = clz(path=path, *args, **kwargs)
deeplake/core/dataset/dataset.py:318: in __init__
    _load_tensor_metas(self)
deeplake/core/dataset/dataset.py:172: in _load_tensor_metas
    for _ in dataset.storage.get_items(meta_keys):
deeplake/core/storage/lru_cache.py:230: in get_items
    if _get_nbytes(result) <= self.cache_size:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

obj = S3GetError(ClientError('An error occurred (InternalError) when calling the GetObject operation (reached max retries: 4): We encountered an internal error.  Please retry the operation again later.'))

    def _get_nbytes(obj: Union[bytes, memoryview, DeepLakeMemoryObject]):
        if isinstance(obj, DeepLakeMemoryObject):
            return obj.nbytes
>       return len(obj)
E       TypeError: object of type 'S3GetError' has no len()

deeplake/core/storage/lru_cache.py:15: TypeError

The above exception was the direct cause of the following exception:

    @pytest.mark.slow
    @pytest.mark.flaky
    def test_get_large_header():
        # headers for videos in this dataset are larger than the 100 bytes originally fetched
        # ideally this test would just be calling `serialize.get_header_from_url` directly, but that requires all the URL building logic that lives in the chunk engine.
        # So calling a larger codepath that includes `get_header_from_url`
>       ds = deeplake.load("hub://activeloop/hmdb51-train")

deeplake/core/tests/test_serialize.py:57: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
deeplake/util/spinner.py:151: in inner
    return func(*args, **kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

path = 'hub://activeloop/hmdb51-train', read_only = None
memory_cache_size = 2000, local_cache_size = 0, creds = {}, token = None
org_id = None, verbose = True, access_method = 'stream', unlink = False
reset = False, indra = False, check_integrity = None, lock_timeout = 0
lock_enabled = True, index_params = None

    @staticmethod
    @spinner
    def load(
        path: Union[str, pathlib.Path],
        read_only: Optional[bool] = None,
        memory_cache_size: int = DEFAULT_MEMORY_CACHE_SIZE,
        local_cache_size: int = DEFAULT_LOCAL_CACHE_SIZE,
        creds: Optional[Union[dict, str]] = None,
        token: Optional[str] = None,
        org_id: Optional[str] = None,
        verbose: bool = True,
        access_method: str = "stream",
        unlink: bool = False,
        reset: bool = False,
        indra: bool = USE_INDRA,
        check_integrity: Optional[bool] = None,
        lock_timeout: Optional[int] = 0,
        lock_enabled: Optional[bool] = True,
        index_params: Optional[Dict[str, Union[int, str]]] = None,
    ) -> Dataset:
        """Loads an existing Deep Lake dataset.
    
        Examples:
    
            >>> ds = deeplake.load("hub://org_id/dataset") # Load dataset managed by Deep Lake.
            >>> ds = deeplake.load("s3://mybucket/my_dataset", creds = {"aws_access_key_id": ..., ...}) # Load dataset stored in your cloud using your own credentials.
            >>> ds = deeplake.load("s3://mybucket/my_dataset", creds = {"creds_key": "managed_creds_key"}, org_id = "my_org_id") # Load dataset stored in your cloud using Deep Lake managed credentials.
    
            Loading to a specific version:
    
            >>> ds = deeplake.load("hub://org_id/dataset@new_branch")
            >>> ds = deeplake.load("hub://org_id/dataset@3e49cded62b6b335c74ff07e97f8451a37aca7b2)
    
            >>> my_commit_id = "3e49cded62b6b335c74ff07e97f8451a37aca7b2"
            >>> ds = deeplake.load(f"hub://org_id/dataset@{my_commit_id}")
    
        Args:
            path (str, pathlib.Path): - The full path to the dataset. Can be:
                - a Deep Lake cloud path of the form ``hub://org_id/datasetname``. To write to Deep Lake cloud datasets, ensure that you are authenticated to Deep Lake (pass in a token using the 'token' parameter).
                - an s3 path of the form ``s3://bucketname/path/to/dataset``. Credentials are required in either the environment or passed to the creds argument.
                - a local file system path of the form ``./path/to/dataset`` or ``~/path/to/dataset`` or ``path/to/dataset``.
                - a memory path of the form ``mem://path/to/dataset`` which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
                - Loading to a specific version:
    
                        - You can also specify a ``commit_id`` or ``branch`` to load the dataset to that version directly by using the ``@`` symbol.
                        - The path will then be of the form ``hub://org_id/dataset@{branch}`` or ``hub://org_id/dataset@{commit_id}``.
                        - See examples above.
            read_only (bool, optional): Opens dataset in read only mode if this is passed as ``True``. Defaults to ``False``.
                Datasets stored on Deep Lake cloud that your account does not have write access to will automatically open in read mode.
            memory_cache_size (int): The size of the memory cache to be used in MB.
            local_cache_size (int): The size of the local filesystem cache to be used in MB.
            creds (dict, str, optional): The string ``ENV`` or a dictionary containing credentials used to access the dataset at the path.
                - If 'aws_access_key_id', 'aws_secret_access_key', 'aws_session_token' are present, these take precedence over credentials present in the environment or in credentials file. Currently only works with s3 paths.
                - It supports 'aws_access_key_id', 'aws_secret_access_key', 'aws_session_token', 'endpoint_url', 'aws_region', 'profile_name' as keys.
                - To use credentials managed in your Activeloop organization, use the key 'creds_key': 'managed_key_name'. This requires the ``org_id`` argument to be set.
                - If 'ENV' is passed, credentials are fetched from the environment variables. This is also the case when creds is not passed for cloud datasets. For datasets connected to hub cloud, specifying 'ENV' will override the credentials fetched from Activeloop and use local ones.
            token (str, optional): Activeloop token, used for fetching credentials to the dataset at path if it is a Deep Lake dataset. This is optional, tokens are normally autogenerated.
            org_id (str, Optional): Organization id to be used for enabling high-performance features. Only applicable for local datasets.
            verbose (bool): If ``True``, logs will be printed. Defaults to ``True``.
            access_method (str): The access method to use for the dataset. Can be:
    
                    - 'stream'
    
                        - Streams the data from the dataset i.e. only fetches data when required. This is the default value.
    
                    - 'download'
    
                        - Downloads the data to the local filesystem to the path specified in environment variable ``DEEPLAKE_DOWNLOAD_PATH``.
                          This will overwrite ``DEEPLAKE_DOWNLOAD_PATH``.
                        - Raises an exception if ``DEEPLAKE_DOWNLOAD_PATH`` environment variable is not set or if the dataset does not exist.
                        - The 'download' access method can be modified to specify num_workers and/or scheduler.
                          For example: 'download:2:processed' will use 2 workers and use processed scheduler, while 'download:3' will use 3 workers and
                          default scheduler (threaded), and 'download:processed' will use a single worker and use processed scheduler.
    
                    - 'local'
    
                        - Downloads the dataset if it doesn't already exist, otherwise loads from local storage.
                        - Raises an exception if ``DEEPLAKE_DOWNLOAD_PATH`` environment variable is not set.
                        - The 'local' access method can be modified to specify num_workers and/or scheduler to be used in case dataset needs to be downloaded.
                          If dataset needs to be downloaded, 'local:2:processed' will use 2 workers and use processed scheduler, while 'local:3' will use 3 workers
                          and default scheduler (threaded), and 'local:processed' will use a single worker and use processed scheduler.
            unlink (bool): Downloads linked samples if set to ``True``. Only applicable if ``access_method`` is ``download`` or ``local``. Defaults to ``False``.
            reset (bool): If the specified dataset cannot be loaded due to a corrupted HEAD state of the branch being loaded,
                          setting ``reset=True`` will reset HEAD changes and load the previous version.
            check_integrity (bool, Optional): Performs an integrity check by default (None) if the dataset has 20 or fewer tensors.
                                              Set to ``True`` to force integrity check, ``False`` to skip integrity check.
            indra (bool): Flag indicating whether indra api should be used to create the dataset. Defaults to ``False``.
    
        ..
            # noqa: DAR101
    
        Returns:
            Dataset: Dataset loaded using the arguments provided.
    
        Raises:
            DatasetHandlerError: If a Dataset does not exist at the given path.
            AgreementError: When agreement is rejected
            UserNotLoggedInException: When user is not authenticated
            InvalidTokenException: If the specified token is invalid
            TokenPermissionError: When there are permission or other errors related to token
            CheckoutError: If version address specified in the path cannot be found
            DatasetCorruptError: If loading the dataset failed due to corruption and ``reset`` is not ``True``
            ReadOnlyModeError: If reset is attempted in read-only mode
            LockedException: When attempting to open a dataset for writing when it is locked by another machine
            ValueError: If ``org_id`` is specified for a non-local dataset
            Exception: Re-raises caught exception if reset cannot fix the issue
            ValueError: If the org id is provided but the dataset is not local
    
        Warning:
            Setting ``access_method`` to download will overwrite the local copy of the dataset if it was previously downloaded.
    
        Note:
            Any changes made to the dataset in download / local mode will only be made to the local copy and will not be reflected in the original dataset.
        """
        _check_indra_and_read_only_flags(indra, read_only)
        access_method, num_workers, scheduler = parse_access_method(access_method)
        check_access_method(access_method, overwrite=False, unlink=unlink)
    
        path, address = process_dataset_path(path)
    
        if creds is None:
            creds = {}
    
        dataset_creds_key = _fetch_creds_from_key(creds, org_id, token)
    
        try:
            storage, cache_chain = get_storage_and_cache_chain(
                path=path,
                read_only=read_only,
                creds=creds,
                token=token,
                memory_cache_size=memory_cache_size,
                local_cache_size=local_cache_size,
                indra=indra,
            )
            feature_report_path(
                path,
                "load",
                {
                    "lock_enabled": lock_enabled,
                    "lock_timeout": lock_timeout,
                    "index_params": index_params,
                },
                token=token,
            )
        except Exception as e:
            if isinstance(e, UserNotLoggedInException):
                raise UserNotLoggedInException from None
            raise
        if not dataset_exists(cache_chain):
            raise DatasetHandlerError(
                f"A Deep Lake dataset does not exist at the given path ({path}). Check the path provided or in case you want to create a new dataset, use deeplake.empty()."
            )
    
        if indra and read_only:
            from indra import api  # type: ignore
    
            ids = api.load_from_storage(storage.core)
            return IndraDatasetView(indra_ds=ids)
    
        dataset_kwargs: Dict[str, Union[None, str, bool, int, Dict]] = {
            "path": path,
            "read_only": read_only,
            "token": token,
            "org_id": org_id,
            "verbose": verbose,
            "lock_timeout": lock_timeout,
            "lock_enabled": lock_enabled,
            "index_params": index_params,
        }
    
        if dataset_creds_key:
            dataset_kwargs["dataset_creds_key"] = dataset_creds_key
            dataset_kwargs["dataset_creds_key_org_id"] = org_id
            dataset_kwargs["dataset_creds_key_token"] = token
    
        if access_method == "stream":
            dataset_kwargs.update(
                {
                    "address": address,
                    "storage": cache_chain,
                }
            )
        else:
            dataset_kwargs.update(
                {
                    "access_method": access_method,
                    "memory_cache_size": memory_cache_size,
                    "local_cache_size": local_cache_size,
                    "creds": creds,
                    "ds_exists": True,
                    "num_workers": num_workers,
                    "scheduler": scheduler,
                    "reset": reset,
                    "unlink": unlink,
                }
            )
    
        try:
            return dataset._load(
                dataset_kwargs, access_method, check_integrity=check_integrity
            )
        except (AgreementError, CheckoutError, LockedException) as e:
            raise e from None
        except Exception as e:
            if access_method == "stream":
                if not reset:
                    if isinstance(e, DatasetCorruptError):
                        raise DatasetCorruptError(
                            message=e.message,
                            action="Try using `reset=True` to reset HEAD changes and load the previous commit.",
                            cause=e.__cause__,
                        )
>                   raise DatasetCorruptError(
                        "Exception occurred (see Traceback). The dataset maybe corrupted. "
                        "Try using `reset=True` to reset HEAD changes and load the previous commit. "
                        "This will delete all uncommitted changes on the branch you are trying to load."
                    ) from e
E                   deeplake.util.exceptions.DatasetCorruptError: Exception occurred (see Traceback). The dataset may be corrupted. Try using `reset=True` to reset HEAD changes and load the previous commit. This will delete all uncommitted changes on the branch you are trying to load.

deeplake/api/dataset.py:774: DatasetCorruptError
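
The root cause in test_get_large_header is not corruption: storage.get_items() yielded an S3GetError instance (a transient S3 InternalError), _get_nbytes tried to take len() of it, and the resulting TypeError was then reported as DatasetCorruptError. A hedged sketch of a defensive check, shown only to illustrate the failure mode, not the project's actual fix:

    def _get_nbytes_defensive(obj) -> int:
        # If the storage layer handed back an exception object (here a transient
        # S3GetError), re-raise it instead of treating it as cached payload bytes.
        if isinstance(obj, Exception):
            raise obj
        nbytes = getattr(obj, "nbytes", None)  # DeepLakeMemoryObject and memoryview
        if nbytes is not None:
            return nbytes
        return len(obj)  # plain bytes

With a guard like this the failure would surface as a retryable S3 error rather than a misleading corruption report.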

Check failure on line 1097 in deeplake/core/transform/test_transform.py

test_transform.test_tensor_dataset_memory_leak

assert 6 == 0
Raw output
local_ds = Dataset(path='./hub_pytest/test_transform/test_tensor_dataset_memory_leak', tensors=['image'])

    @pytest.mark.slow
    def test_tensor_dataset_memory_leak(local_ds):
        local_ds.create_tensor("image", htype="image", sample_compression="png")
        add_images().eval(list(range(100)), local_ds, scheduler="threaded")
    
        n = retrieve_objects_from_memory()
>       assert n == 0
E       assert 6 == 0

deeplake\core\transform\test_transform.py:1097: AssertionError
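
retrieve_objects_from_memory found 6 objects still alive after the transform finished; its implementation is not shown in this report. A gc-based probe in the same spirit might look like the sketch below; count_lingering and the choice to track TransformDataset (the class name is taken from the next traceback) are assumptions:

    import gc

    from deeplake.core.transform.transform_dataset import TransformDataset


    def count_lingering(cls=TransformDataset) -> int:
        # Force a full collection, then count survivors of the tracked type.
        gc.collect()
        return sum(1 for obj in gc.get_objects() if isinstance(obj, cls))

A nonzero count such as the 6 seen here usually points to reference cycles or module-level caches that keep transform objects alive past eval().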

Check failure on line 480 in deeplake/core/query/test/test_query.py

test_query.test_link_materialize[1]

deeplake.util.exceptions.TransformError: Transform failed at index 9 of the input data. See traceback for more details. If you wish to skip the samples that cause errors, please specify `ignore_errors=True`.
Raw output
self = Sample(is_lazy=True, path=https://picsum.photos/10/10)

    def _read_from_path(self) -> bytes:  # type: ignore
        if self._buffer is None:
            path_type = get_path_type(self.path)
            try:
                if path_type == "local":
                    self._buffer = self._read_from_local()
                elif path_type == "gcs":
                    self._buffer = self._read_from_gcs()
                elif path_type == "s3":
                    self._buffer = self._read_from_s3()
                elif path_type == "azure":
                    self._buffer = self._read_from_azure()
                elif path_type == "gdrive":
                    self._buffer = self._read_from_gdrive()
                elif path_type == "http":
>                   self._buffer = self._read_from_http(timeout=self._timeout)

deeplake\core\sample.py:466: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = Sample(is_lazy=True, path=https://picsum.photos/10/10), timeout = None

    def _read_from_http(self, timeout=None) -> bytes:
        assert self.path is not None
        if "Authorization" in self._creds:
            headers = {"Authorization": self._creds["Authorization"]}
        else:
            headers = {}
        result = requests.get(self.path, headers=headers, timeout=timeout)
        if result.status_code != 200:
>           raise UnableToReadFromUrlError(self.path, result.status_code)
E           deeplake.util.exceptions.UnableToReadFromUrlError: Unable to read from url https://picsum.photos/10/10. Status code: 520

deeplake\core\sample.py:532: UnableToReadFromUrlError

The above exception was the direct cause of the following exception:

self = <deeplake.core.transform.transform_tensor.TransformTensor object at 0x000001FA93C44EB0>
item = Sample(is_lazy=True, path=https://picsum.photos/10/10)

    def append(self, item):
        """Adds an item to the tensor."""
        if self.is_group:
            raise TensorDoesNotExistError(self.name)
        try:
            # optimization applicable only if extending
            self.non_numpy_only()
    
>           self._verify_item(item)

deeplake\core\transform\transform_tensor.py:122: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
deeplake\core\transform\transform_tensor.py:112: in _verify_item
    shape = getattr(item, "shape", None)  # verify sample
deeplake\core\sample.py:169: in shape
    self._read_meta()
deeplake\core\sample.py:204: in _read_meta
    f = self._read_from_path()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = Sample(is_lazy=True, path=https://picsum.photos/10/10)

    def _read_from_path(self) -> bytes:  # type: ignore
        if self._buffer is None:
            path_type = get_path_type(self.path)
            try:
                if path_type == "local":
                    self._buffer = self._read_from_local()
                elif path_type == "gcs":
                    self._buffer = self._read_from_gcs()
                elif path_type == "s3":
                    self._buffer = self._read_from_s3()
                elif path_type == "azure":
                    self._buffer = self._read_from_azure()
                elif path_type == "gdrive":
                    self._buffer = self._read_from_gdrive()
                elif path_type == "http":
                    self._buffer = self._read_from_http(timeout=self._timeout)
            except Exception as e:
>               raise SampleReadError(self.path) from e  # type: ignore
E               deeplake.util.exceptions.SampleReadError: Unable to read sample from https://picsum.photos/10/10

deeplake\core\sample.py:468: SampleReadError

The above exception was the direct cause of the following exception:

data_slice = Dataset(path='./hub_pytest/test_query/test_link_materialize-1-', index=Index([slice(0, 20, 2)]), tensors=['abc'])
offset = 0
transform_dataset = <deeplake.core.transform.transform_dataset.TransformDataset object at 0x000001FA93C4A790>
pipeline = <deeplake.core.transform.transform.Pipeline object at 0x000001FA93C4C190>
tensors = ['abc'], skip_ok = True
pg_callback = <function ComputeProvider.map_with_progress_bar.<locals>.sub_func.<locals>.pg_callback at 0x000001FA93C3B820>
ignore_errors = False

    def _transform_and_append_data_slice(
        data_slice,
        offset,
        transform_dataset,
        pipeline,
        tensors,
        skip_ok,
        pg_callback,
        ignore_errors,
    ):
        """Appends a data slice. Returns ``True`` if any samples were appended and ``False`` otherwise."""
        try:
            import pandas as pd  # type: ignore
        except ImportError:
            pd = None
    
        n = len(data_slice)
        skipped_samples = 0
        skipped_samples_in_current_batch = 0
    
        pipeline_checked = False
    
        last_pg_update_time = time.time()
        progress = 0
    
        for i, sample in enumerate(
            (data_slice[i : i + 1] for i in range(n))
            if pd and isinstance(data_slice, pd.DataFrame)
            else data_slice
        ):
            try:
                transform_dataset.set_start_input_idx(i)
    
                try:
>                   out = transform_sample(sample, pipeline, tensors)

deeplake\util\transform.py:227: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
deeplake\util\transform.py:76: in transform_sample
    fn(out, result, *args, **kwargs)
deeplake\core\dataset\dataset.py:4252: in _copy_tensor_append
    sample_out[tensor_name].append(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <deeplake.core.transform.transform_tensor.TransformTensor object at 0x000001FA93C44EB0>
item = Sample(is_lazy=True, path=https://picsum.photos/10/10)

    def append(self, item):
        """Adds an item to the tensor."""
        if self.is_group:
            raise TensorDoesNotExistError(self.name)
        try:
            # optimization applicable only if extending
            self.non_numpy_only()
    
            self._verify_item(item)
            self.items.append(item)
            self._item_added(item)
        except Exception as e:
            self.items.clear()
>           raise SampleAppendError(self.name, item) from e
E           deeplake.util.exceptions.SampleAppendError: Failed to append the sample at path 'https://picsum.photos/10/10' to the tensor 'abc'. See more details in the traceback.

deeplake\core\transform\transform_tensor.py:127: SampleAppendError

The above exception was the direct cause of the following exception:

local_ds = Dataset(path='./hub_pytest/test_query/test_link_materialize-1-', tensors=['abc'])
num_workers = 1

    @pytest.mark.slow
    @pytest.mark.parametrize("num_workers", [1, 2])
    def test_link_materialize(local_ds, num_workers):
        with local_ds as ds:
            ds.create_tensor("abc", htype="link[image]", sample_compression="jpg")
            ds.abc.extend(
                [
                    (
                        deeplake.link("https://picsum.photos/20/20")
                        if i % 2
                        else deeplake.link("https://picsum.photos/10/10")
                    )
                    for i in range(20)
                ]
            )
            ds.commit()
    
        view = ds[::2]
>       view.save_view(id="view_1", optimize=True, num_workers=num_workers)

deeplake\core\query\test\test_query.py:480: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
deeplake\core\dataset\dataset.py:3741: in save_view
    return self._save_view(
deeplake\core\dataset\dataset.py:3839: in _save_view
    vds = self._save_view_in_subdir(
deeplake\core\dataset\dataset.py:3639: in _save_view_in_subdir
    self._write_vds(vds, info, copy, tensors, num_workers, scheduler, ignore_errors)
deeplake\core\dataset\dataset.py:3570: in _write_vds
    self._copy(
deeplake\core\dataset\dataset.py:4268: in _copy
    deeplake.compute(
deeplake\core\transform\transform.py:125: in eval
    pipeline.eval(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <deeplake.core.transform.transform.Pipeline object at 0x000001FA93C4C190>
data_in = Dataset(path='./hub_pytest/test_query/test_link_materialize-1-', index=Index([slice(None, None, 2)]), tensors=['abc'])
ds_out = Dataset(path='./hub_pytest/test_query/test_link_materialize-1-\.queries/view_1', tensors=['abc'])
num_workers = 1, scheduler = 'threaded', progressbar = True, skip_ok = True
check_lengths = False, pad_data_in = False, read_only_ok = False
cache_size = 16, checkpoint_interval = 0, ignore_errors = False, verbose = True
kwargs = {'disable_label_sync': True, 'extend_only': False}, overwrite = False
original_data_in = Dataset(path='./hub_pytest/test_query/test_link_materialize-1-', index=Index([slice(None, None, 2)]), tensors=['abc'])
initial_padding_state = None
target_ds = Dataset(path='./hub_pytest/test_query/test_link_materialize-1-\.queries/view_1', tensors=['abc'])
compute_provider = <deeplake.core.compute.thread.ThreadProvider object at 0x000001FA93C4C670>
compute_id = '9326b0be9aba49c68e1841e8d446ad0a', initial_autoflush = False

    def eval(
        self,
        data_in,
        ds_out: Optional[deeplake.Dataset] = None,
        num_workers: int = 0,
        scheduler: str = "threaded",
        progressbar: bool = True,
        skip_ok: bool = False,
        check_lengths: bool = True,
        pad_data_in: bool = False,
        read_only_ok: bool = False,
        cache_size: int = DEFAULT_TRANSFORM_SAMPLE_CACHE_SIZE,
        checkpoint_interval: int = 0,
        ignore_errors: bool = False,
        verbose: bool = True,
        **kwargs,
    ):
        """
        Evaluates the Pipeline of ComputeFunctions on ``data_in`` to produce an output dataset ``ds_out``. The purpose of compute functions is to process the input data in parallel,
        which is useful when rapidly ingesting data to a Deep Lake dataset. Pipelines can also be executed in-place, modifying the input dataset (see the ``ds_out`` parameter below) instead of writing to a new dataset.
    
        Args:
            data_in: Input passed to the transform to generate output dataset. Should support ``__getitem__`` and ``__len__`` operations. Can be a Deep Lake dataset.
            ds_out (Dataset, optional): The dataset object to which the transform output will be written. If this is not provided, the ComputeFunction will operate in-place, which means that data will be written to tensors in ``data_in``.
                All tensors modified in the ComputeFunction should already be defined in ``ds_out``. Its initial state should be either:
                - Empty, i.e. all tensors have no samples. In this case all samples are added to the dataset.
                - All tensors are populated and have the same length. In this case new samples are appended to the dataset.
            num_workers (int): The number of workers to use for performing the transform. Defaults to 0. When set to 0, it will always use serial processing, irrespective of the scheduler.
            scheduler (str): The scheduler to be used to compute the transformation. Supported values include: ``serial``, ``threaded``, and ``processed``.
                Defaults to 'threaded'.
            progressbar (bool): Displays a progress bar if ``True`` (default).
            skip_ok (bool): If ``True``, skips the check for output tensors generated. This allows the user to skip certain tensors in the function definition.
                This is especially useful for inplace transformations in which certain tensors are not modified. Defaults to ``False``.
            check_lengths (bool): If ``True``, checks whether ``ds_out`` initially has tensors of the same length.
            pad_data_in (bool): If ``True``, pads tensors of ``data_in`` to match the length of the largest tensor in ``data_in``.
                Defaults to ``False``.
            read_only_ok (bool): If ``True`` and the output dataset is the same as the input dataset, the read-only check is skipped.
                Defaults to ``False``.
            cache_size (int): Cache size to be used by transform per worker.
            checkpoint_interval (int): If > 0, the ComputeFunction will be checkpointed with a commit every ``checkpoint_interval`` input samples to avoid restarting the full transform due to intermittent failures. If the transform is interrupted, the intermediate data is deleted and the dataset is reset to the last commit.
                If <= 0, no checkpointing is done. Checkpoint interval should be a multiple of num_workers if ``num_workers`` > 0. Defaults to 0.
            ignore_errors (bool): If ``True``, input samples that cause the transform to fail will be skipped and the errors will be ignored **if possible**.
            verbose (bool): If ``True``, prints additional information about the transform.
            **kwargs: Additional arguments.
    
        Raises:
            InvalidInputDataError: If ``data_in`` passed to transform is invalid. It should support ``__getitem__`` and ``__len__`` operations. Using a scheduler other than ``threaded`` when ``data_in`` is a Deep Lake dataset whose base storage is memory will also raise this.
            InvalidOutputDatasetError: If all the tensors of ``ds_out`` passed to transform don't have the same length. Using a scheduler other than ``threaded`` when ``ds_out`` is a Deep Lake dataset whose base storage is memory will also raise this.
            TensorMismatchError: If one or more of the outputs generated during transform contain different tensors than the ones present in 'ds_out' provided to transform.
            UnsupportedSchedulerError: If the scheduler passed is not recognized. Supported values include: ``serial``, ``threaded``, and ``processed``.
            TransformError: All other exceptions raised if there are problems while running the pipeline.
            ValueError: If ``num_workers`` > 0 and ``checkpoint_interval`` is not a multiple of ``num_workers`` or if ``checkpoint_interval`` > 0 and ds_out is None.
            AllSamplesSkippedError: If all samples are skipped during execution of the Pipeline.
            ModuleNotInstalledException: If the module ``ray`` is not installed and the scheduler is set to ``ray``.
    
        # noqa: DAR401
    
        Example:
    
            # Suppose we have a series of operations that we want to perform in parallel on images using reusable pipelines.
            # We use the pipeline to ingest the transformed data from one dataset to another.
    
            # First, we define the ComputeFunctions that will be used in the pipeline
            @deeplake.compute
            def flip_vertical(sample_in, sample_out):
                sample_out.append({'labels': sample_in.labels.numpy(),
                                    'images': np.flip(sample_in.images.numpy(), axis = 0)})
    
            @deeplake.compute
            def resize(sample_in, sample_out, new_size):
                sample_out.append({"labels": sample_in.labels.numpy(),
                                    "images": np.array(Image.fromarray(sample_in.images.numpy()).resize(new_size))})
    
            # Alternatively, inside a ComputeFunction, append the label and image to each output tensor individually:
            sample_out.labels.append(sample_in.labels.numpy())
            sample_out.images.append(np.array(Image.fromarray(sample_in.images.numpy()).resize(new_size)))
    
            # We can define the pipeline using:
            pipeline = deeplake.compose([flip_vertical(), resize(new_size = (64,64))])
    
            # Finally, we can evaluate the pipeline using:
            pipeline.eval(ds_in, ds_out, num_workers = 4)
    
        Note:
            ``pad_data_in`` is only applicable if ``data_in`` is a Deep Lake dataset.
    
        """
        num_workers, scheduler = sanitize_workers_scheduler(num_workers, scheduler)
        overwrite = ds_out is None
        deeplake_reporter.feature_report(
            feature_name="eval",
            parameters={"Num_Workers": str(num_workers), "Scheduler": scheduler},
        )
        check_transform_data_in(data_in, scheduler)
    
        data_in, original_data_in, initial_padding_state = prepare_data_in(
            data_in, pad_data_in, overwrite
        )
        target_ds = data_in if overwrite else ds_out
    
        check_transform_ds_out(
            target_ds, scheduler, check_lengths, read_only_ok and overwrite
        )
    
        # if overwrite, then we've already flushed and auto-checked out data_in, which is now target_ds
        if not overwrite:
            target_ds.flush()
            auto_checkout(target_ds)
    
        compute_provider = get_compute_provider(scheduler, num_workers)
        compute_id = str(uuid4().hex)
        target_ds._send_compute_progress(compute_id=compute_id, start=True, progress=0)
    
        initial_autoflush = target_ds.storage.autoflush
        target_ds.storage.autoflush = False
    
        if not check_lengths or read_only_ok:
            skip_ok = True
    
        checkpointing_enabled = checkpoint_interval > 0
        total_samples = len_data_in(data_in)
        if checkpointing_enabled:
            check_checkpoint_interval(
                data_in,
                checkpoint_interval,
                num_workers,
                overwrite,
                verbose,
            )
            datas_in = [
                data_in[i : i + checkpoint_interval]
                for i in range(0, len_data_in(data_in), checkpoint_interval)
            ]
    
        else:
            datas_in = [data_in]
    
        samples_processed = 0
        desc = get_pbar_description(self.functions)
        if progressbar:
            pbar = get_progress_bar(len_data_in(data_in), desc)
            pqueue = compute_provider.create_queue()
        else:
            pbar, pqueue = None, None
        try:
            desc = desc.split()[1]
            completed = False
            progress = 0.0
            for data_in in datas_in:
                if checkpointing_enabled:
                    target_ds._commit(
                        f"Auto-commit during deeplake.compute of {desc} after {progress}% progress",
                        None,
                        False,
                        is_checkpoint=True,
                        total_samples_processed=samples_processed,
                    )
                progress = round(
                    (samples_processed + len_data_in(data_in)) / total_samples * 100, 2
                )
                end = progress == 100
                progress_args = {
                    "compute_id": compute_id,
                    "progress": progress,
                    "end": end,
                }
    
                try:
                    self.run(
                        data_in,
                        target_ds,
                        compute_provider,
                        num_workers,
                        scheduler,
                        progressbar,
                        overwrite,
                        skip_ok,
                        read_only_ok and overwrite,
                        cache_size,
                        pbar,
                        pqueue,
                        ignore_errors,
                        **kwargs,
                    )
                    target_ds._send_compute_progress(**progress_args, status="success")
                    samples_processed += len_data_in(data_in)
                    completed = end
                except Exception as e:
                    if checkpointing_enabled:
                        print(
                            "Transform failed. Resetting back to last committed checkpoint."
                        )
                        target_ds.reset(force=True)
                    target_ds._send_compute_progress(**progress_args, status="failed")
                    index, sample, suggest = None, None, False
                    if isinstance(e, TransformError):
                        index, sample, suggest = e.index, e.sample, e.suggest
                        if checkpointing_enabled and isinstance(index, int):
                            index = samples_processed + index
                        e = e.__cause__  # type: ignore
                    if isinstance(e, AllSamplesSkippedError):
                        raise e
>                   raise TransformError(
                        index=index,
                        sample=sample,
                        samples_processed=samples_processed,
                        suggest=suggest,
                    ) from e
E                   deeplake.util.exceptions.TransformError: Transform failed at index 9 of the input data. See traceback for more details. If you wish to skip the samples that cause errors, please specify `ignore_errors=True`.

deeplake\core\transform\transform.py:355: TransformError
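
The root cause of this failure is a transient HTTP 520 from picsum.photos while materializing linked samples, which surfaces as a TransformError at index 9. As the message notes, a pipeline run can be told to skip such samples with `ignore_errors=True`. A minimal sketch against the `eval` signature shown above (the datasets and ComputeFunction here are illustrative, not the ones from the test):

    import numpy as np
    import deeplake

    @deeplake.compute
    def double(sample_in, sample_out):
        # Any exception raised here marks this input sample as failed.
        sample_out.x.append(sample_in.x.numpy() * 2)

    ds_in = deeplake.dataset("mem://in")
    ds_in.create_tensor("x")
    ds_in.x.extend([np.ones((2, 2)) for _ in range(4)])

    ds_out = deeplake.dataset("mem://out")
    ds_out.create_tensor("x")

    # ignore_errors=True skips input samples whose transform raises,
    # instead of failing the whole run with TransformError.
    double().eval(ds_in, ds_out, num_workers=0, scheduler="threaded", ignore_errors=True)

The traceback also shows an `ignore_errors` argument being threaded through `_write_vds` during `save_view`, so the same escape hatch appears to be available when optimizing views, though that is not verified here.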