diff --git a/deeplake/api/tests/test_reset.py b/deeplake/api/tests/test_reset.py
index cf6cdaeca3..a02aeb696f 100644
--- a/deeplake/api/tests/test_reset.py
+++ b/deeplake/api/tests/test_reset.py
@@ -57,9 +57,7 @@ def test_load_corrupt_dataset(path):
         deeplake.load(path, access_method=access_method)
 
     with pytest.raises(ReadOnlyModeError):
-        deeplake.load(
-            path, read_only=True, access_method=access_method, reset=True
-        )
+        deeplake.load(path, read_only=True, access_method=access_method, reset=True)
 
     ds = deeplake.load(
         path,
diff --git a/deeplake/core/serialize.py b/deeplake/core/serialize.py
index b8f16af787..42513dadd1 100644
--- a/deeplake/core/serialize.py
+++ b/deeplake/core/serialize.py
@@ -148,7 +148,7 @@ def get_header_from_url(url: str):
     enc_dtype = np.dtype(deeplake.constants.ENCODING_DTYPE)
     itemsize = enc_dtype.itemsize
 
-    headers = {"Range": "bytes=0-100"}
+    headers = {"Range": "bytes=0-1000"}
     request = Request(url, None, headers)
     byts = urlopen(request).read()
 
@@ -164,8 +164,16 @@
     if shape_info_nbytes == 0:
         shape_info = np.array([], dtype=enc_dtype)
     else:
+        end_bytes = offset + shape_info_nbytes
+        if len(byts) < end_bytes:
+            # the initial 1000-byte fetch was too small; double the range so it hopefully also covers the byte positions section below
+            headers = {"Range": f"bytes=0-{end_bytes * 2}"}
+
+            request = Request(url, None, headers)
+            byts = urlopen(request).read()
+
         shape_info = (
-            np.frombuffer(byts[offset : offset + shape_info_nbytes], dtype=enc_dtype)
+            np.frombuffer(byts[offset:end_bytes], dtype=enc_dtype)
             .reshape(shape_info_nrows, shape_info_ncols)
             .copy()
         )
@@ -178,10 +186,15 @@
     if byte_positions_nbytes == 0:
         byte_positions = np.array([], dtype=enc_dtype)
     else:
+        end_bytes = offset + byte_positions_nbytes
+        if len(byts) < end_bytes:
+            headers = {"Range": f"bytes=0-{end_bytes}"}
+
+            request = Request(url, None, headers)
+            byts = urlopen(request).read()
+
         byte_positions = (
-            np.frombuffer(
-                byts[offset : offset + byte_positions_nbytes], dtype=enc_dtype
-            )
+            np.frombuffer(byts[offset:end_bytes], dtype=enc_dtype)
             .reshape(byte_positions_rows, 3)
             .copy()
         )
diff --git a/deeplake/core/tests/test_serialize.py b/deeplake/core/tests/test_serialize.py
index 5f61cd6521..a613fea790 100644
--- a/deeplake/core/tests/test_serialize.py
+++ b/deeplake/core/tests/test_serialize.py
@@ -1,3 +1,6 @@
+import pytest
+
+import deeplake
 from deeplake.constants import ENCODING_DTYPE
 from deeplake.core.serialize import (
     serialize_chunk,
@@ -44,3 +46,12 @@ def test_chunkids_serialize():
     version2, ids, dtype = decoded
     assert version2 == version
     np.testing.assert_array_equal(arr, ids)
+
+
+@pytest.mark.slow
+def test_get_large_header():
+    # headers for the videos in this dataset are larger than the 100 bytes originally fetched
+    # ideally this test would call `serialize.get_header_from_url` directly, but that requires
+    # all the URL-building logic in the chunk engine, so exercise a larger codepath that includes it
+    ds = deeplake.load("hub://activeloop/hmdb51-train")
+    assert ds.videos[0].shape == (75, 240, 560, 3)
diff --git a/deeplake/enterprise/convert_to_libdeeplake.py b/deeplake/enterprise/convert_to_libdeeplake.py
index c354a27dfa..6fd8845455 100644
--- a/deeplake/enterprise/convert_to_libdeeplake.py
+++ b/deeplake/enterprise/convert_to_libdeeplake.py
@@ -211,7 +211,7 @@ def dataset_to_libdeeplake(hub2_dataset):
         commit_id = hub2_dataset.pending_commit_id
         libdeeplake_dataset.checkout(commit_id)
     slice_ = hub2_dataset.index.values[0].value
-    if slice_ != slice(None)and isinstance(slice_, tuple):
+    if slice_ != slice(None) and isinstance(slice_, tuple):
         slice_ = list(slice_)
         libdeeplake_dataset = libdeeplake_dataset[slice_]
     return libdeeplake_dataset
diff --git a/deeplake/enterprise/libdeeplake_query.py b/deeplake/enterprise/libdeeplake_query.py
index 76be94205c..29a4669a4d 100644
--- a/deeplake/enterprise/libdeeplake_query.py
+++ b/deeplake/enterprise/libdeeplake_query.py
@@ -40,7 +40,7 @@ def query(dataset, query_string: str):
         ds = dataset.libdeeplake_dataset
         slice_ = dataset.index.values[0].value
         if slice_ != slice(None) and isinstance(slice_, tuple):
-            slice_ = list(slice_) 
+            slice_ = list(slice_)
             ds = ds[slice_]
     else:
         ds = dataset_to_libdeeplake(dataset)
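Note on the serialize.py change: the fix boils down to "fetch a fixed-size prefix of the chunk, then re-fetch with a larger Range header whenever the section being decoded extends past what was received". Below is a minimal standalone sketch of that pattern, assuming a URL whose server honors HTTP Range requests; INITIAL_RANGE, _fetch, and read_section are illustrative names, not deeplake APIs.

# Sketch only: a simplified version of the re-fetch logic in get_header_from_url.
import numpy as np
from urllib.request import Request, urlopen

INITIAL_RANGE = 1000  # first guess at how many bytes the header occupies


def _fetch(url, end_byte):
    # "Range: bytes=0-N" is inclusive, so this asks for bytes 0..end_byte
    request = Request(url, None, {"Range": f"bytes=0-{end_byte}"})
    return urlopen(request).read()


def read_section(url, offset, nbytes, dtype=np.uint32):
    byts = _fetch(url, INITIAL_RANGE)
    end_bytes = offset + nbytes
    if len(byts) < end_bytes:
        # the first request was too short for this section; re-fetch with a
        # doubled range so the following section is hopefully covered as well
        byts = _fetch(url, end_bytes * 2)
    return np.frombuffer(byts[offset:end_bytes], dtype=dtype).copy()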