Correctly load chunks with larger headers (#2574)
* Increased the initial header byte request from 100 bytes to 1 KB
* Dynamically fetch more data when reading headers that turn out to be larger than the initial request
nvoxland committed Sep 6, 2023
1 parent a811cd5 commit 727116d
Showing 5 changed files with 32 additions and 10 deletions.
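
The fix combines two things: a larger initial ranged read (1 KB instead of 100 bytes), and a follow-up ranged read whenever the already-fetched buffer turns out not to cover the section being parsed. A minimal sketch of that pattern, using a hypothetical fetch_at_least helper rather than the commit's exact code:

    from urllib.request import Request, urlopen

    def fetch_at_least(url: str, byts: bytes, end_bytes: int) -> bytes:
        # Return the buffer unchanged if it already covers [0, end_bytes);
        # otherwise re-issue a wider HTTP range request, as the diff below does.
        if len(byts) >= end_bytes:
            return byts
        request = Request(url, None, {"Range": f"bytes=0-{end_bytes}"})
        return urlopen(request).read()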
4 changes: 1 addition & 3 deletions deeplake/api/tests/test_reset.py
@@ -57,9 +57,7 @@ def test_load_corrupt_dataset(path):
         deeplake.load(path, access_method=access_method)

     with pytest.raises(ReadOnlyModeError):
-        deeplake.load(
-            path, read_only=True, access_method=access_method, reset=True
-        )
+        deeplake.load(path, read_only=True, access_method=access_method, reset=True)

     ds = deeplake.load(
         path,
23 changes: 18 additions & 5 deletions deeplake/core/serialize.py
@@ -148,7 +148,7 @@ def get_header_from_url(url: str):
     enc_dtype = np.dtype(deeplake.constants.ENCODING_DTYPE)
     itemsize = enc_dtype.itemsize

-    headers = {"Range": "bytes=0-100"}
+    headers = {"Range": "bytes=0-1000"}

     request = Request(url, None, headers)
     byts = urlopen(request).read()
@@ -164,8 +164,16 @@ def get_header_from_url(url: str):
     if shape_info_nbytes == 0:
         shape_info = np.array([], dtype=enc_dtype)
     else:
+        end_bytes = offset + shape_info_nbytes
+        if len(byts) < end_bytes:
+            # need to fetch more data than the initial 1000-byte request; double the range to hopefully leave enough for reading the byte positions too
+            headers = {"Range": f"bytes=0-{end_bytes * 2}"}
+
+            request = Request(url, None, headers)
+            byts = urlopen(request).read()
+
         shape_info = (
-            np.frombuffer(byts[offset : offset + shape_info_nbytes], dtype=enc_dtype)
+            np.frombuffer(byts[offset:end_bytes], dtype=enc_dtype)
             .reshape(shape_info_nrows, shape_info_ncols)
             .copy()
         )
@@ -178,10 +186,15 @@ def get_header_from_url(url: str):
     if byte_positions_nbytes == 0:
         byte_positions = np.array([], dtype=enc_dtype)
     else:
+        end_bytes = offset + byte_positions_nbytes
+        if len(byts) < end_bytes:
+            headers = {"Range": f"bytes=0-{end_bytes}"}
+
+            request = Request(url, None, headers)
+            byts = urlopen(request).read()
+
         byte_positions = (
-            np.frombuffer(
-                byts[offset : offset + byte_positions_nbytes], dtype=enc_dtype
-            )
+            np.frombuffer(byts[offset:end_bytes], dtype=enc_dtype)
             .reshape(byte_positions_rows, 3)
             .copy()
         )
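
For context, the slices being guarded here are reinterpreted as fixed-width encoder tables. A rough sketch of that parsing step (shape_info_nrows, shape_info_ncols, and enc_dtype appear in the diff; treating ENCODING_DTYPE as uint32 is an assumption):

    import numpy as np

    enc_dtype = np.dtype(np.uint32)  # assumed stand-in for deeplake.constants.ENCODING_DTYPE

    def parse_table(byts: bytes, offset: int, nrows: int, ncols: int) -> np.ndarray:
        end_bytes = offset + nrows * ncols * enc_dtype.itemsize
        # .copy() detaches the result from the temporary download buffer
        return (
            np.frombuffer(byts[offset:end_bytes], dtype=enc_dtype)
            .reshape(nrows, ncols)
            .copy()
        )

This is also why a too-short buffer only fails at parse time: a short byts[offset:end_bytes] slice yields fewer elements than nrows * ncols, so the .reshape raises, which is what the new len(byts) < end_bytes checks prevent.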
11 changes: 11 additions & 0 deletions deeplake/core/tests/test_serialize.py
@@ -1,3 +1,5 @@
+import pytest
+
 from deeplake.constants import ENCODING_DTYPE
 from deeplake.core.serialize import (
     serialize_chunk,
@@ -44,3 +46,12 @@ def test_chunkids_serialize():
     version2, ids, dtype = decoded
     assert version2 == version
     np.testing.assert_array_equal(arr, ids)
+
+
+@pytest.mark.slow
+def test_get_large_header():
+    # headers for videos in this dataset are larger than the 100 bytes originally fetched
+    # ideally this test would just call `serialize.get_header_from_url` directly, but that requires
+    # all the URL-building logic that lives in the chunk engine, so we exercise a larger codepath instead
+    ds = deeplake.load("hub://activeloop/hmdb51-train")
+    assert ds.videos[0].shape == (75, 240, 560, 3)
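
Because this test downloads from a real remote dataset, the @pytest.mark.slow marker lets a quick local run exclude it with pytest -m "not slow" (assuming the project registers the slow marker in its pytest configuration).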
2 changes: 1 addition & 1 deletion deeplake/enterprise/convert_to_libdeeplake.py
@@ -211,7 +211,7 @@ def dataset_to_libdeeplake(hub2_dataset):
         commit_id = hub2_dataset.pending_commit_id
         libdeeplake_dataset.checkout(commit_id)
     slice_ = hub2_dataset.index.values[0].value
-    if slice_ != slice(None)and isinstance(slice_, tuple):
+    if slice_ != slice(None) and isinstance(slice_, tuple):
         slice_ = list(slice_)
         libdeeplake_dataset = libdeeplake_dataset[slice_]
     return libdeeplake_dataset
2 changes: 1 addition & 1 deletion deeplake/enterprise/libdeeplake_query.py
@@ -40,7 +40,7 @@ def query(dataset, query_string: str):
         ds = dataset.libdeeplake_dataset
         slice_ = dataset.index.values[0].value
         if slice_ != slice(None) and isinstance(slice_, tuple):
-             slice_ = list(slice_)
+            slice_ = list(slice_)
         ds = ds[slice_]
     else:
         ds = dataset_to_libdeeplake(dataset)