Skip to content

Commit

Permalink
Merge pull request #2503 from activeloopai/fy_skip_summary
Browse files Browse the repository at this point in the history
[AL-2351] Print a transform summary if ignore_errors=True
  • Loading branch information
FayazRahman authored Jul 25, 2023
2 parents e3cd2b3 + a1f4eb7 commit 0c6b1b7
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 5 deletions.
53 changes: 53 additions & 0 deletions deeplake/core/transform/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -1682,3 +1682,56 @@ def upload(sample_in, ds, class_names):
)
assert e.index == 20
assert e.sample == 10


def test_transform_summary(local_ds, capsys):
    """Summary lines are printed iff ``ignore_errors=True`` is passed to eval."""

    @deeplake.compute
    def upload(sample_in, sample_out):
        sample_out.images.append(sample_in)

    def fresh_images(count=8):
        # New random uint8 images for each eval run.
        return [
            np.random.randint(0, 255, (10, 10), dtype=np.uint8) for _ in range(count)
        ]

    def run(data, **extra):
        # Run the transform and return whatever was printed to stdout.
        upload().eval(
            data,
            ds,
            num_workers=TRANSFORM_TEST_NUM_WORKERS,
            progressbar=False,
            **extra,
        )
        return capsys.readouterr().out

    with local_ds as ds:
        ds.create_tensor("images", htype="image", sample_compression="jpg")

        # 16 good samples and 4 bad ones -> 80% processed / 20% skipped.
        mixed = (["bad_sample"] + fresh_images() + ["bad_sample"]) * 2
        assert run(mixed, ignore_errors=True) == (
            "No. of samples successfully processed: 16 (80.0%)\n"
            "No. of samples skipped: 4 (20.0%)\n"
        )

        # All samples valid -> 100% processed, 0% skipped.
        assert run(fresh_images(), ignore_errors=True) == (
            "No. of samples successfully processed: 8 (100.0%)\n"
            "No. of samples skipped: 0 (0.0%)\n"
        )

        # no summary if ignore_errors=False
        assert run(fresh_images()) == ""
4 changes: 4 additions & 0 deletions deeplake/core/transform/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
store_data_slice_with_pbar,
check_checkpoint_interval,
len_data_in,
transform_summary,
)
from deeplake.util.encoder import merge_all_meta_info
from deeplake.util.exceptions import (
Expand Down Expand Up @@ -464,6 +465,9 @@ def run(
verbose=progressbar,
)

if ignore_errors:
transform_summary(data_in, result)

for res in result["error"]:
if res is not None:
raise res
Expand Down
26 changes: 21 additions & 5 deletions deeplake/util/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,9 +261,10 @@ def _transform_and_append_data_slice(
)
continue

if skipped_samples == n:
return False
return True
return {
"samples_skipped": skipped_samples,
"all_samples_skipped": skipped_samples == n,
}


def _retrieve_memory_objects(all_chunk_engines):
Expand Down Expand Up @@ -335,7 +336,10 @@ def store_data_slice_with_pbar(pg_callback, transform_input: Tuple) -> Dict:
cache_size=cache_size,
)

ret = True
ret = {
"all_samples_skipped": False,
"samples_skipped": 0,
}
err = None
try:
if extend_only:
Expand Down Expand Up @@ -366,7 +370,7 @@ def store_data_slice_with_pbar(pg_callback, transform_input: Tuple) -> Dict:
finally:
# retrieve relevant objects from memory
meta = _retrieve_memory_objects(all_chunk_engines)
meta["all_samples_skipped"] = not ret
meta.update(ret)
meta["error"] = err
return meta

Expand Down Expand Up @@ -541,6 +545,18 @@ def len_data_in(data_in):
return len(data_in)


def transform_summary(data_in, result):
    """Print how many input samples were processed successfully vs skipped.

    Args:
        data_in: The transform's input data; its size is obtained via
            ``len_data_in``.
        result: Mapping holding a ``"samples_skipped"`` iterable of per-worker
            skip counts (as assembled by ``store_data_slice_with_pbar``).
    """
    # Hoist the length lookup — len_data_in may not be a trivial len() call.
    total = len_data_in(data_in)
    samples_skipped = sum(result["samples_skipped"])
    successful = total - samples_skipped

    if total:
        successful_percent = round((successful / total) * 100, 2)
    else:
        # Empty input: avoid ZeroDivisionError; nothing was skipped, so
        # report 100% processed / 0% skipped.
        successful_percent = 100.0
    skipped_percent = round(100 - successful_percent, 2)

    print(
        "No. of samples successfully processed:", successful, f"({successful_percent}%)"
    )
    print("No. of samples skipped:", samples_skipped, f"({skipped_percent}%)")


def create_slices(data_in, num_workers):
size = math.ceil(len_data_in(data_in) / num_workers)

Expand Down

0 comments on commit 0c6b1b7

Please sign in to comment.