activeloopai · nvoxland · Sep 6, 2023 · Sep 1, 2023 · Sep 1, 2023 · Sep 2, 2023
diff --git a/deeplake/core/serialize.py b/deeplake/core/serialize.py
@@ -164,8 +164,16 @@
     if shape_info_nbytes == 0:
         shape_info = np.array([], dtype=enc_dtype)
     else:
+        end_bytes = offset + shape_info_nbytes
+        if len(byts) < end_bytes:
+            # The initial guess of 100 bytes was too small, so fetch again. The range is doubled in the hope that it also covers the byte positions read afterwards.
+            headers = {"Range": f"bytes=0-{end_bytes * 2}"}
+
+            request = Request(url, None, headers)
+            byts = urlopen(request).read()
+
         shape_info = (
-            np.frombuffer(byts[offset : offset + shape_info_nbytes], dtype=enc_dtype)
+            np.frombuffer(byts[offset: end_bytes], dtype=enc_dtype)
             .reshape(shape_info_nrows, shape_info_ncols)
             .copy()
         )
@@ -178,9 +186,16 @@
     if byte_positions_nbytes == 0:
         byte_positions = np.array([], dtype=enc_dtype)
     else:
+        end_bytes = offset + byte_positions_nbytes
+        if len(byts) < end_bytes:
+            headers = {"Range": f"bytes=0-{end_bytes}"}
+
+            request = Request(url, None, headers)
+            byts = urlopen(request).read()
+
         byte_positions = (
             np.frombuffer(
-                byts[offset : offset + byte_positions_nbytes], dtype=enc_dtype
+                byts[offset : end_bytes], dtype=enc_dtype
             )
             .reshape(byte_positions_rows, 3)
             .copy()

diff --git a/deeplake/core/tests/test_serialize.py b/deeplake/core/tests/test_serialize.py
@@ -1,3 +1,5 @@
+import pytest
+
 from deeplake.constants import ENCODING_DTYPE
 from deeplake.core.serialize import (
     serialize_chunk,
@@ -44,3 +46,11 @@ def test_chunkids_serialize():
     version2, ids, dtype = decoded
     assert version2 == version
     np.testing.assert_array_equal(arr, ids)
+
+@pytest.mark.slow
+def test_get_large_header():
+    # headers for videos in this dataset are larger than the 100 bytes originally fetched
+    # ideally this test would just be calling `serialize.get_header_from_url` directly, but that requires all the URL-building logic that lives in the chunk engine.
+    # So calling a larger codepath that includes `get_header_from_url`
+    ds = deeplake.load('hub://activeloop/hmdb51-train')
+    assert ds.videos[0].shape == (75, 240, 560, 3)