Correctly load chunks with larger headers (#2574)
* Increased the initial header byte request from 100 bytes to 1 KB
* Dynamically fetch more data when reading headers that turn out to be larger than the initial request
nvoxland committed Sep 6, 2023
1 parent a811cd5 commit 727116d
Showing 5 changed files with 32 additions and 10 deletions.
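
The fix combines two things: a larger initial ranged read (1 KB instead of 100 bytes), and a follow-up ranged read whenever the already-fetched buffer turns out not to cover the section being parsed. A minimal sketch of that pattern, using a hypothetical fetch_at_least helper rather than the commit's exact code:

    from urllib.request import Request, urlopen

    def fetch_at_least(url: str, byts: bytes, end_bytes: int) -> bytes:
        # Return the buffer unchanged if it already covers [0, end_bytes);
        # otherwise re-issue a wider HTTP range request, as the diff below does.
        if len(byts) >= end_bytes:
            return byts
        request = Request(url, None, {"Range": f"bytes=0-{end_bytes}"})
        return urlopen(request).read()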
4 changes: 1 addition & 3 deletions deeplake/api/tests/test_reset.py
@@ -57,9 +57,7 @@ def test_load_corrupt_dataset(path):
         deeplake.load(path, access_method=access_method)

     with pytest.raises(ReadOnlyModeError):
-        deeplake.load(
-            path, read_only=True, access_method=access_method, reset=True
-        )
+        deeplake.load(path, read_only=True, access_method=access_method, reset=True)

     ds = deeplake.load(
         path,
23 changes: 18 additions & 5 deletions deeplake/core/serialize.py
@@ -148,7 +148,7 @@ def get_header_from_url(url: str):
     enc_dtype = np.dtype(deeplake.constants.ENCODING_DTYPE)
     itemsize = enc_dtype.itemsize

-    headers = {"Range": "bytes=0-100"}
+    headers = {"Range": "bytes=0-1000"}

     request = Request(url, None, headers)
     byts = urlopen(request).read()
@@ -164,8 +164,16 @@ def get_header_from_url(url: str):
     if shape_info_nbytes == 0:
         shape_info = np.array([], dtype=enc_dtype)
     else:
+        end_bytes = offset + shape_info_nbytes
+        if len(byts) < end_bytes:
+            # need to fetch more data than the initial 1000-byte request; double the range to hopefully leave enough for reading the byte positions too
+            headers = {"Range": f"bytes=0-{end_bytes * 2}"}
+
+            request = Request(url, None, headers)
+            byts = urlopen(request).read()
+
         shape_info = (
-            np.frombuffer(byts[offset : offset + shape_info_nbytes], dtype=enc_dtype)
+            np.frombuffer(byts[offset:end_bytes], dtype=enc_dtype)
             .reshape(shape_info_nrows, shape_info_ncols)
             .copy()
         )
@@ -178,10 +186,15 @@ def get_header_from_url(url: str):
     if byte_positions_nbytes == 0:
         byte_positions = np.array([], dtype=enc_dtype)
     else:
+        end_bytes = offset + byte_positions_nbytes
+        if len(byts) < end_bytes:
+            headers = {"Range": f"bytes=0-{end_bytes}"}
+
+            request = Request(url, None, headers)
+            byts = urlopen(request).read()
+
         byte_positions = (
-            np.frombuffer(
-                byts[offset : offset + byte_positions_nbytes], dtype=enc_dtype
-            )
+            np.frombuffer(byts[offset:end_bytes], dtype=enc_dtype)
             .reshape(byte_positions_rows, 3)
             .copy()
         )
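
For context, the slices being guarded here are reinterpreted as fixed-width encoder tables. A rough sketch of that parsing step (shape_info_nrows, shape_info_ncols, and enc_dtype appear in the diff; treating ENCODING_DTYPE as uint32 is an assumption):

    import numpy as np

    enc_dtype = np.dtype(np.uint32)  # assumed stand-in for deeplake.constants.ENCODING_DTYPE

    def parse_table(byts: bytes, offset: int, nrows: int, ncols: int) -> np.ndarray:
        end_bytes = offset + nrows * ncols * enc_dtype.itemsize
        # .copy() detaches the result from the temporary download buffer
        return (
            np.frombuffer(byts[offset:end_bytes], dtype=enc_dtype)
            .reshape(nrows, ncols)
            .copy()
        )

This is also why a too-short buffer only fails at parse time: a short byts[offset:end_bytes] slice yields fewer elements than nrows * ncols, so the .reshape raises, which is what the new len(byts) < end_bytes checks prevent.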
11 changes: 11 additions & 0 deletions deeplake/core/tests/test_serialize.py
@@ -1,3 +1,5 @@
+import pytest
+
 from deeplake.constants import ENCODING_DTYPE
 from deeplake.core.serialize import (
     serialize_chunk,
@@ -44,3 +46,12 @@ def test_chunkids_serialize():
     version2, ids, dtype = decoded
     assert version2 == version
     np.testing.assert_array_equal(arr, ids)
+
+
+@pytest.mark.slow
+def test_get_large_header():
+    # headers for videos in this dataset are larger than the 100 bytes originally fetched
+    # ideally this test would just call `serialize.get_header_from_url` directly, but that requires
+    # all the URL-building logic that lives in the chunk engine, so we exercise a larger codepath instead
+    ds = deeplake.load("hub://activeloop/hmdb51-train")
+    assert ds.videos[0].shape == (75, 240, 560, 3)
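
Because this test downloads from a real remote dataset, the @pytest.mark.slow marker lets a quick local run exclude it with pytest -m "not slow" (assuming the project registers the slow marker in its pytest configuration).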
2 changes: 1 addition & 1 deletion deeplake/enterprise/convert_to_libdeeplake.py
@@ -211,7 +211,7 @@ def dataset_to_libdeeplake(hub2_dataset):
         commit_id = hub2_dataset.pending_commit_id
         libdeeplake_dataset.checkout(commit_id)
     slice_ = hub2_dataset.index.values[0].value
-    if slice_ != slice(None)and isinstance(slice_, tuple):
+    if slice_ != slice(None) and isinstance(slice_, tuple):
         slice_ = list(slice_)
         libdeeplake_dataset = libdeeplake_dataset[slice_]
     return libdeeplake_dataset
2 changes: 1 addition & 1 deletion deeplake/enterprise/libdeeplake_query.py
@@ -40,7 +40,7 @@ def query(dataset, query_string: str):
         ds = dataset.libdeeplake_dataset
         slice_ = dataset.index.values[0].value
         if slice_ != slice(None) and isinstance(slice_, tuple):
-             slice_ = list(slice_)
+            slice_ = list(slice_)
         ds = ds[slice_]
     else:
         ds = dataset_to_libdeeplake(dataset)