Merge branch 'main' into VectorStoreSmallUpdates
adolkhan committed Jun 5, 2023
2 parents ad6e8ab + 2ff28ba commit ff4c3b5
Showing 12 changed files with 133 additions and 50 deletions.
24 changes: 14 additions & 10 deletions README.md
@@ -2,7 +2,7 @@
<img src="https://i.postimg.cc/rsjcWc3S/deeplake-logo.png" width="400"/>
</h1>
</br>
<h1 align="center">Deep Lake: Vector Database for any AI data
<h1 align="center">Deep Lake: Database for AI
</h1>
<p align="center">
<a href="https://github.com/activeloopai/Hub/actions/workflows/test-pr-on-label.yml"><img src="https://github.com/activeloopai/Hub/actions/workflows/test-push.yml/badge.svg" alt="PyPI version" height="18"></a>
@@ -30,7 +30,7 @@

## About Deep Lake

Deep Lake is a Vector Database powered by a unique storage format optimized for deep-learning and Large Language Model (LLM) based applications. It simplifies the deployment of enterprise-grade LLM-based products by offering storage for all data types (embeddings, audio, text, videos, images, pdfs, annotations, etc.), querying and vector search, data streaming while training models at scale, data versioning and lineage for all workloads, and integrations with popular tools such as LangChain, LlamaIndex, Weights and Biases, and many more. Deep Lake works with data of any size, it is serverless, and it enables you to store all of your data in once place. Deep Lake is used by Google, Intel, Airbus, Matterport, Red Cross, Yale, & Oxford.
Deep Lake is a Database for AI powered by a unique storage format optimized for deep-learning and Large Language Model (LLM) based applications. It simplifies the deployment of enterprise-grade LLM-based products by offering storage for all data types (embeddings, audio, text, videos, images, pdfs, annotations, etc.), querying and vector search, data streaming while training models at scale, data versioning and lineage for all workloads, and integrations with popular tools such as LangChain, LlamaIndex, Weights & Biases, and many more. Deep Lake works with data of any size, is serverless, and enables you to store all of your data in one place. Deep Lake is used by Intel, Airbus, Matterport, ZERO Systems, Red Cross, Yale, & Oxford.
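
For readers skimming this diff, here is a minimal sketch of the workflow the paragraph above describes (the local path and tensor names are illustrative and not part of this commit):

```python
import deeplake
import numpy as np

# Create a local dataset, define typed tensors (columns), and append samples.
ds = deeplake.empty("./quickstart_ds", overwrite=True)
ds.create_tensor("images", htype="image", sample_compression="jpeg")
ds.create_tensor("labels", htype="class_label")

with ds:
    ds.images.append(np.zeros((32, 32, 3), dtype=np.uint8))
    ds.labels.append(1)

# The same dataset can later be streamed into a training loop, queried, or versioned.
print(ds.labels[0].numpy())
```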

Deep Lake includes the following features:

@@ -270,14 +270,7 @@ Deep Lake offers integrations with other tools in order to streamline your deep
Getting started guides, examples, tutorials, API reference, and other useful information can be found on our [documentation page](http://docs.activeloop.ai/?utm_source=github&utm_medium=repo&utm_campaign=readme).

## 🎓 For Students and Educators
Deep Lake users can access and visualize a variety of popular datasets through a free integration with Activeloop's Platform. Users can also create and store their own datasets and make them available to the public. Free storage of up to 300 GB is available for students and educators:

| <!-- --> | <!-- --> |
| ---------------------------------------------------- | ------------- |
| Storage for public datasets hosted by Activeloop | 200GB Free |
| Storage for private datasets hosted by Activeloop | 100GB Free |


Deep Lake users can access and visualize a variety of popular datasets through a free integration with Activeloop's Platform. Universities can get up to 1 TB of data storage and 100,000 monthly queries on the Tensor Database for free. Chat with us on [our website](https://activeloop.ai) to claim access!

## 👩‍💻 Comparisons to Familiar Tools

@@ -312,6 +305,17 @@ Deep Lake and DVC offer dataset version control similar to git for data, but the
</details>


<details>
<summary><b>Deep Lake vs MosaicML MDS format </b></summary>

- **Data Storage Format:** Deep Lake operates on a columnar storage format, whereas MDS utilizes a row-wise storage approach. This fundamentally impacts how data is read, written, and organized in each system.
- **Compression:** Deep Lake offers a more flexible compression scheme, allowing control over both chunk-level and sample-level compression for each column or tensor (see the sketch after this comparison). This eliminates the need for an additional compression pass such as zstd, which would otherwise demand extra CPU cycles to decompress on top of formats like jpeg.
- **Shuffling:** MDS currently offers more advanced shuffling strategies.
- **Version Control & Visualization Support:** Deep Lake offers native version control and in-browser data visualization, neither of which is available for the MosaicML data format. This can provide significant advantages in managing, understanding, and tracking different versions of the data.

</details>
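
As a rough illustration of the per-tensor compression control mentioned above, here is a small sketch (the dataset path and tensor names are made up; it assumes the standard `create_tensor` API with `sample_compression` / `chunk_compression`):

```python
import deeplake
import numpy as np

ds = deeplake.empty("./compression_demo", overwrite=True)

# Sample-level compression: each image is stored as an encoded jpeg.
ds.create_tensor("images", htype="image", sample_compression="jpeg")

# Chunk-level compression: label data is compressed chunk by chunk with lz4.
ds.create_tensor("labels", htype="class_label", chunk_compression="lz4")

with ds:
    ds.images.append(np.random.randint(0, 255, (64, 64, 3), dtype=np.uint8))
    ds.labels.append(0)
```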


<details>
<summary><b>Deep Lake vs TensorFlow Datasets (TFDS)</b></summary>

20 changes: 19 additions & 1 deletion deeplake/api/dataset.py
@@ -26,6 +26,7 @@
convert_pathlib_to_string_if_needed,
verify_dataset_name,
process_dataset_path,
get_path_type,
)
from deeplake.util.tensor_db import parse_runtime_parameters
from deeplake.hooks import (
@@ -173,7 +174,7 @@ def init(
TokenPermissionError: When there are permission or other errors related to token
CheckoutError: If version address specified in the path cannot be found
DatasetCorruptError: If loading the dataset failed due to corruption and ``reset`` is not ``True``
ValueError: If version is specified in the path when creating a dataset
ValueError: If version is specified in the path when creating a dataset, or if the org id is provided but the dataset is not local
ReadOnlyModeError: If reset is attempted in read-only mode
LockedException: When attempting to open a dataset for writing when it is locked by another machine
Exception: Re-raises caught exception if reset cannot fix the issue
@@ -193,6 +194,9 @@ path, address = process_dataset_path(path)
path, address = process_dataset_path(path)
verify_dataset_name(path)

if org_id is not None and get_path_type(path) != "local":
raise ValueError("org_id parameter can only be used with local datasets")

if creds is None:
creds = {}

@@ -380,6 +384,9 @@ def empty(
path, address = process_dataset_path(path)
db_engine = parse_runtime_parameters(path, runtime)["tensor_db"]

if org_id is not None and get_path_type(path) != "local":
raise ValueError("org_id parameter can only be used with local datasets")

if address:
raise ValueError(
"deeplake.empty does not accept version address in the dataset path."
@@ -524,6 +531,7 @@ def load(
ReadOnlyModeError: If reset is attempted in read-only mode
LockedException: When attempting to open a dataset for writing when it is locked by another machine
Exception: Re-raises caught exception if reset cannot fix the issue
ValueError: If the org id is provided but the dataset is not local
Warning:
Setting ``access_method`` to download will overwrite the local copy of the dataset if it was previously downloaded.
@@ -539,6 +547,9 @@ if creds is None:
if creds is None:
creds = {}

if org_id is not None and get_path_type(path) != "local":
raise ValueError("org_id parameter can only be used with local datasets")

try:
storage, cache_chain = get_storage_and_cache_chain(
path=path,
@@ -835,11 +846,18 @@ def like(
Returns:
Dataset: New dataset object.
Raises:
ValueError: If the org id is provided but the dataset is not local
"""
if isinstance(dest, Dataset):
path = dest.path
else:
path = dest

if org_id is not None and get_path_type(path) != "local":
raise ValueError("org_id parameter can only be used with local datasets")

feature_report_path(
path,
"like",
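
Taken together, the checks added to `init`, `empty`, `load`, and `like` mean `org_id` is only accepted for local datasets. A small sketch of the resulting behavior (the org and dataset names are placeholders, mirroring the new test below):

```python
import deeplake

# Allowed: org_id together with a local path.
ds = deeplake.empty("./local_dataset", org_id="my_org", overwrite=True)

# Rejected: org_id together with a non-local (hub://) path.
try:
    deeplake.dataset("hub://my_org/my_dataset", org_id="my_org")
except ValueError as e:
    print(e)  # org_id parameter can only be used with local datasets
```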
14 changes: 14 additions & 0 deletions deeplake/api/tests/test_api.py
@@ -2602,3 +2602,17 @@ def test_shape_squeeze(memory_ds):
ds.abc.extend(np.ones((5, 10, 12, 20)))

assert ds.abc[5:, :, 9].shape == (5, 10, 20)


def test_non_local_org_id():
with pytest.raises(ValueError):
ds = deeplake.dataset("hub://test/test_dataset", org_id="test")

with pytest.raises(ValueError):
ds = deeplake.empty("hub://test/test_dataset", org_id="test")

with pytest.raises(ValueError):
ds = deeplake.load("hub://test/test_dataset", org_id="test")

with pytest.raises(ValueError):
ds = deeplake.like("hub://test/test_dataset", "test/test_ds", org_id="test")
2 changes: 1 addition & 1 deletion deeplake/core/dataset/deeplake_cloud_dataset.py
@@ -59,7 +59,7 @@ def token(self):

def _set_org_and_name(self):
if self.is_actually_cloud:
if self.org_id is not None:
if self.org_id is not None and self.ds_name is not None:
return
_, org_id, ds_name, subdir = process_hub_path(self.path)
if subdir:
13 changes: 12 additions & 1 deletion deeplake/core/link_creds.py
@@ -12,6 +12,14 @@
)
from deeplake.util.token import expires_in_to_expires_at, is_expired_token
from deeplake.client.log import logger
from datetime import datetime, timezone


def _is_expired_creds(creds: dict) -> bool:
if "expiration" not in creds:
return False

return creds["expiration"] < datetime.now(timezone.utc).timestamp()


class LinkCreds(DeepLakeMemoryObject):
@@ -44,7 +52,10 @@ def get_creds(self, key: Optional[str]):
if (
self.client is not None
and key in self.managed_creds_keys
and is_expired_token(self.creds_dict[key])
and (
is_expired_token(self.creds_dict[key])
or _is_expired_creds(self.creds_dict[key])
)
):
self.refresh_managed_creds(key) # type: ignore
return self.creds_dict[key]
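
The new `_is_expired_creds` helper simply compares the stored `expiration` timestamp against the current UTC time, so managed credentials are refreshed when either the token or the credentials themselves have expired. A standalone sketch of that check (the creds dicts are made-up examples):

```python
from datetime import datetime, timezone, timedelta


def _is_expired_creds(creds: dict) -> bool:
    # No expiration recorded -> nothing to check.
    if "expiration" not in creds:
        return False
    return creds["expiration"] < datetime.now(timezone.utc).timestamp()


fresh = {"expiration": (datetime.now(timezone.utc) + timedelta(hours=1)).timestamp()}
stale = {"expiration": (datetime.now(timezone.utc) - timedelta(hours=1)).timestamp()}

print(_is_expired_creds(fresh))  # False -> cached creds are reused
print(_is_expired_creds(stale))  # True  -> get_creds triggers refresh_managed_creds
print(_is_expired_creds({}))     # False -> creds without an expiration are left alone
```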
2 changes: 2 additions & 0 deletions deeplake/core/storage/s3.py
@@ -87,6 +87,7 @@ def __init__(
aws_region: Optional[str] = None,
profile_name: Optional[str] = None,
token: Optional[str] = None,
**kwargs,
):
"""Initializes the S3Provider
@@ -108,6 +109,7 @@ profile_name (str, optional): Specifies the AWS profile name to use.
profile_name (str, optional): Specifies the AWS profile name to use.
token (str, optional): Activeloop token, used for fetching credentials for Deep Lake datasets (if this is underlying storage for Deep Lake dataset).
This is optional, tokens are normally autogenerated.
**kwargs: Additional arguments to pass to the S3 client. Includes: ``expiration``.
"""
self.root = root
self.aws_access_key_id = aws_access_key_id
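
The widened `__init__` signature lets callers forward extra settings such as `expiration` through `**kwargs`. A hedged sketch of what such a call could look like (the bucket name and timestamp are placeholders; how `expiration` is consumed downstream is not shown in this diff):

```python
from deeplake.core.storage.s3 import S3Provider

# expiration is swallowed by **kwargs rather than rejected as an unexpected argument.
storage = S3Provider(
    "s3://my-bucket/my-dataset",
    expiration=1_700_000_000.0,  # assumed to be a UTC timestamp, as in link_creds above
)
```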
20 changes: 19 additions & 1 deletion deeplake/core/tests/test_deeplake_indra_dataset.py
@@ -1,7 +1,12 @@
import deeplake
import numpy as np
from deeplake.tests.common import requires_libdeeplake
from deeplake.util.exceptions import DynamicTensorNumpyError
from deeplake.util.exceptions import (
DynamicTensorNumpyError,
EmptyTokenException,
)


from deeplake.core.dataset.deeplake_query_dataset import DeepLakeQueryDataset
import random
import pytest
@@ -73,6 +78,19 @@ def test_save_view(local_ds_generator):
)


@requires_libdeeplake
def test_empty_token_exception(local_ds):
from deeplake.enterprise.convert_to_libdeeplake import dataset_to_libdeeplake

with local_ds:
local_ds.create_tensor("label", htype="generic", dtype=np.int32)

loaded = deeplake.load(local_ds.path, token="")

with pytest.raises(EmptyTokenException):
dss = dataset_to_libdeeplake(loaded)


@requires_libdeeplake
def test_load_view(local_ds_generator):
from deeplake.enterprise.convert_to_libdeeplake import dataset_to_libdeeplake
5 changes: 5 additions & 0 deletions deeplake/core/tests/test_locking.py
@@ -10,6 +10,7 @@
enabled_persistent_non_gdrive_dataset_generators,
)
from concurrent.futures import ThreadPoolExecutor
from deeplake.tests.common import requires_non_python11
import deeplake

_counter = 0
@@ -36,6 +37,7 @@ def __exit__(self, *args, **kwargs):
deeplake.core.lock._LOCKS.update(self._locks)


@requires_non_python11
@enabled_persistent_non_gdrive_dataset_generators
def test_dataset_locking(ds_generator):
deeplake.constants.LOCK_LOCAL_DATASETS = True
@@ -65,6 +67,7 @@ def test_dataset_locking(ds_generator):
deeplake.constants.LOCK_LOCAL_DATASETS = False


@requires_non_python11
@enabled_persistent_non_gdrive_dataset_generators
def test_vc_locking(ds_generator):
deeplake.constants.LOCK_LOCAL_DATASETS = True
@@ -84,6 +87,7 @@ def test_vc_locking(ds_generator):
deeplake.constants.LOCK_LOCAL_DATASETS = False


@requires_non_python11
def test_lock_thread_leaking(s3_ds_generator):
locks = deeplake.core.lock._LOCKS
refs = deeplake.core.lock._REFS
@@ -123,6 +127,7 @@ def nlocks():
assert nlocks() == 0 # 0 because dataset and all views deleted


@requires_non_python11
def test_concurrent_locking(memory_ds):
storage = memory_ds.base_storage

3 changes: 3 additions & 0 deletions deeplake/enterprise/convert_to_libdeeplake.py
@@ -1,6 +1,7 @@
from deeplake.core.storage.gcs import GCSProvider
from deeplake.enterprise.util import raise_indra_installation_error # type: ignore
from deeplake.core.storage import S3Provider
from deeplake.util.exceptions import EmptyTokenException

from deeplake.util.dataset import try_flushing # type: ignore
import importlib
@@ -142,6 +143,8 @@ def dataset_to_libdeeplake(hub2_dataset):
org_id = hub2_dataset.org_id
if token is None:
libdeeplake_dataset = api.dataset(path)
elif token == "":
raise EmptyTokenException
else:
org_id = (
org_id
74 changes: 39 additions & 35 deletions deeplake/integrations/tests/test_mmdet.py
@@ -41,6 +41,16 @@ def load_pickle_file(pickle_file):
return pickle.load(f)


def process_cfg(cfg, model_name, dataset_path):
if dataset_path == _BALLOON_PATH:
if model_name in _INSTANCE_SEGMENTATION:
cfg.model.roi_head.bbox_head.num_classes = 1
cfg.model.roi_head.mask_head.num_classes = 1
elif model_name in _OBJECT_DETECTION:
cfg.model.bbox_head.num_classes = 1
return cfg


@pytest.mark.skipif(
sys.platform != "linux" or sys.version_info < (3, 7),
reason="MMDet is installed on CI only for linux and python version >= 3.7.",
@@ -432,31 +442,35 @@ def test_mmdet(
ds_train_with_none = dp.empty("ds_train", overwrite=True)
ds_val_with_none = dp.empty("ds_val", overwrite=True)

ds_train_with_none.create_tensor_like("images", ds_train.images)
ds_train_with_none.create_tensor_like("bounding_boxes", ds_train.bounding_boxes)
ds_train_with_none.create_tensor_like(
"segmentation_polygons", ds_train.segmentation_polygons
)
ds_train_with_none.create_tensor_like("labels", ds_train.labels)

ds_val_with_none.create_tensor_like("images", ds_val.images)
ds_val_with_none.create_tensor_like("bounding_boxes", ds_val.bounding_boxes)
ds_val_with_none.create_tensor_like(
"segmentation_polygons", ds_val.segmentation_polygons
)
ds_val_with_none.create_tensor_like("labels", ds_val.labels)

ds_train_with_none.append(ds_train[0])
ds_train_with_none.images.append(ds_train.images[1])
ds_train_with_none.bounding_boxes.append(None)
ds_train_with_none.segmentation_polygons.append(None)
ds_train_with_none.labels.append(None)

ds_val_with_none.append(ds_val[0])
ds_val_with_none.images.append(ds_val.images[1])
ds_val_with_none.bounding_boxes.append(None)
ds_val_with_none.segmentation_polygons.append(None)
ds_val_with_none.labels.append(None)
with ds_train_with_none:
ds_train_with_none.create_tensor_like("images", ds_train.images)
ds_train_with_none.create_tensor_like(
"bounding_boxes", ds_train.bounding_boxes
)
ds_train_with_none.create_tensor_like(
"segmentation_polygons", ds_train.segmentation_polygons
)
ds_train_with_none.create_tensor_like("labels", ds_train.labels)

ds_train_with_none.append(ds_train[0])
ds_train_with_none.images.append(ds_train.images[1])
ds_train_with_none.bounding_boxes.append(None)
ds_train_with_none.segmentation_polygons.append(None)
ds_train_with_none.labels.append(None)

with ds_val_with_none:
ds_val_with_none.create_tensor_like("images", ds_val.images)
ds_val_with_none.create_tensor_like("bounding_boxes", ds_val.bounding_boxes)
ds_val_with_none.create_tensor_like(
"segmentation_polygons", ds_val.segmentation_polygons
)
ds_val_with_none.create_tensor_like("labels", ds_val.labels)

ds_val_with_none.append(ds_val[0])
ds_val_with_none.images.append(ds_val.images[1])
ds_val_with_none.bounding_boxes.append(None)
ds_val_with_none.segmentation_polygons.append(None)
ds_val_with_none.labels.append(None)

ds_train = ds_train_with_none
ds_val = ds_val_with_none
@@ -471,13 +485,3 @@ def test_mmdet(
ds_val=ds_val,
ds_val_tensors=deeplake_tensors,
)


def process_cfg(cfg, model_name, dataset_path):
if dataset_path == _BALLOON_PATH:
if model_name in _INSTANCE_SEGMENTATION:
cfg.model.roi_head.bbox_head.num_classes = 1
cfg.model.roi_head.mask_head.num_classes = 1
elif model_name in _OBJECT_DETECTION:
cfg.model.bbox_head.num_classes = 1
return cfg
5 changes: 5 additions & 0 deletions deeplake/tests/common.py
@@ -128,6 +128,11 @@ def assert_images_close(img1: np.ndarray, img2: np.ndarray, eps=0.5):
reason="These tests require libdeeplake to be installed",
)

requires_non_python11 = pytest.mark.skipif(
sys.version_info[0] == 3 and sys.version_info[1] > 10,
    reason="These tests only run on Python versions lower than 3.11",
)


class LinkTransformTestContext:
def __init__(self, func: Callable, name: str):
1 change: 0 additions & 1 deletion deeplake/util/spinner.py
@@ -104,7 +104,6 @@ def show(self):

def stop(self):
self._stop_event.set()
self.join()
self._clear_line()
self._show_cursor()

