diff --git a/.gitignore b/.gitignore index 3421eb801b..974eec3c5b 100644 --- a/.gitignore +++ b/.gitignore @@ -195,3 +195,7 @@ cov.xml hub/api/cov.xml hub/api/nested_seq nested_seq + +# Benchmark local test data (auto-downloaded) +benchmarks/hub_data +benchmarks/torch_data diff --git a/benchmarks/benchmark_access_hub_full.py b/benchmarks/benchmark_access_hub_full.py new file mode 100644 index 0000000000..0d1780e798 --- /dev/null +++ b/benchmarks/benchmark_access_hub_full.py @@ -0,0 +1,16 @@ +from hub import Dataset + + +def benchmark_access_hub_full_setup(dataset_name, field=None): + dset = Dataset(dataset_name, cache=False, storage_cache=False, mode="r") + + keys = dset.keys + if field is not None: + keys = (field,) + return (dset, keys) + + +def benchmark_access_hub_full_run(params): + dset, keys = params + for k in keys: + dset[k].compute() diff --git a/benchmarks/benchmark_access_hub_slice.py b/benchmarks/benchmark_access_hub_slice.py new file mode 100644 index 0000000000..9d9b3f8eb7 --- /dev/null +++ b/benchmarks/benchmark_access_hub_slice.py @@ -0,0 +1,16 @@ +from hub import Dataset + + +def benchmark_access_hub_slice_setup(dataset_name, slice_bounds, field=None): + dset = Dataset(dataset_name, cache=False, storage_cache=False, mode="r") + + keys = dset.keys + if field is not None: + keys = (field,) + return (dset, slice_bounds, keys) + + +def benchmark_access_hub_slice_run(params): + dset, slice_bounds, keys = params + for k in keys: + dset[k][slice_bounds[0] : slice_bounds[1]].compute() diff --git a/benchmarks/benchmark_compress_hub.py b/benchmarks/benchmark_compress_hub.py new file mode 100644 index 0000000000..648b659a6c --- /dev/null +++ b/benchmarks/benchmark_compress_hub.py @@ -0,0 +1,28 @@ +import numpy as np +from PIL import Image + +import hub + + +def benchmark_compress_hub_setup( + times, image_path="./images/compression_benchmark_image.png" +): + img = Image.open(image_path) + arr = np.array(img) + ds = hub.Dataset( + "./data/bench_png_compression", + mode="w", + shape=times, + schema={"image": hub.schema.Image(arr.shape, compressor="png")}, + ) + + batch = np.zeros((times,) + arr.shape, dtype="uint8") + for i in range(times): + batch[i] = arr + + return (ds, times, batch) + + +def benchmark_compress_hub_run(params): + ds, times, batch = params + ds["image", :times] = batch diff --git a/benchmarks/benchmark_compress_pillow.py b/benchmarks/benchmark_compress_pillow.py new file mode 100644 index 0000000000..1d37dfbaf5 --- /dev/null +++ b/benchmarks/benchmark_compress_pillow.py @@ -0,0 +1,16 @@ +from PIL import Image +from io import BytesIO + + +def benchmark_compress_pillow_setup( + times, image_path="./images/compression_benchmark_image.png" +): + img = Image.open(image_path) + return (img, times) + + +def benchmark_compress_pillow_run(params): + img, times = params + for _ in range(times): + b = BytesIO() + img.save(b, format="png") diff --git a/benchmarks/benchmark_compress_time.py b/benchmarks/benchmark_compress_time.py deleted file mode 100644 index 7ffca63493..0000000000 --- a/benchmarks/benchmark_compress_time.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -License: -This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. -If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/. -""" - -import numpy as np -from PIL import Image -from io import BytesIO - -import hub -import hub.schema -from hub.utils import Timer - -IMAGE_PATH = "./images/compression_benchmark_image.png" -IMG = Image.open(IMAGE_PATH) - -REPEAT_TIMES = 100 - - -def bench_pil_compression(times=REPEAT_TIMES): - with Timer("PIL compression"): - for i in range(times): - b = BytesIO() - IMG.save(b, format="png") - - -def bench_hub_compression(times=REPEAT_TIMES): - arr = np.array(IMG) - ds = hub.Dataset( - "./data/bench_png_compression", - mode="w", - shape=times, - schema={"image": hub.schema.Image(arr.shape, compressor="png")}, - ) - - batch = np.zeros((times,) + arr.shape, dtype="uint8") - for i in range(times): - batch[i] = arr - - with Timer("Hub compression"): - ds["image", :times] = batch - - -if __name__ == "__main__": - bench_pil_compression() - bench_hub_compression() diff --git a/benchmarks/benchmark_dataset_comparison.py b/benchmarks/benchmark_dataset_comparison.py deleted file mode 100644 index da332ff051..0000000000 --- a/benchmarks/benchmark_dataset_comparison.py +++ /dev/null @@ -1,270 +0,0 @@ -import torch -import torchvision -from torchvision import transforms -import tensorflow as tf -import tensorflow_datasets as tfds - -from hub import Dataset -from hub.utils import Timer -import os - -# import math - -BATCH_SIZE = 16 -PREFETCH_SIZE = 4 -NUM_WORKERS = 1 -# CPUS = os.cpu_count() -# NUM_WORKERS = [ -# min(2 ** n, CPUS) for n in range(math.ceil(math.log2(CPUS)) + 1)] - -ROOT = "." -S3_PATH = "s3://snark-benchmarks/datasets/Hub/" - -DATASET_INFO = [ - { - "name": "mnist", - "pytorch_name": "MNIST", - "hub_name": "activeloop/mnist", - "s3_name": "mnist", - "split": "train+test", - }, - { - "name": "places365_small", - "pytorch_name": "Places365", - "hub_name": "hydp/places365_small_train", - "s3_name": "places365_small_train", - "split": "train", - "kwargs": {"small": True}, - }, -] - - -class HubAdapter(torch.utils.data.Dataset): - def __init__(self, ds): - self.ds = ds - - def __len__(self): - return len(self.ds) - - @property - def shape(self): - return (len(self), None, None, None) - - def __iter__(self): - for i in range(len(self)): - yield self[i] - - def __getitem__(self, index): - img, label = self.ds.__getitem__(index) - return {"image": img, "label": label} - - -class Timer(Timer): - def __init__(self, text): - super().__init__(text) - self._text = f"BENCHMARK - {self._text}" - - -def prepare_torch_dataset(dataset_info): - split = dataset_info["split"].split("+") - trans = transforms.Compose([transforms.ToTensor()]) - data_path = "torch_data" - dset_type = getattr(torchvision.datasets, dataset_info["pytorch_name"]) - kwargs = dataset_info.get("kwargs", {}) - if "train" in split: - dset = dset_type( - os.path.join(ROOT, data_path), transform=trans, download=True, **kwargs - ) - else: - dset = None - if "test" in split: - test_dset = dset_type( - os.path.join(ROOT, data_path), - train=False, - transform=trans, - download=True, - **kwargs, - ) - else: - test_dset = None - if len(split) > 1: - dset = torch.utils.data.ConcatDataset([dset, test_dset]) - return dset if dset else test_dset - - -def time_iter_hub_local_pytorch( - dataset_info, - batch_size=BATCH_SIZE, - prefetch_factor=PREFETCH_SIZE, - num_workers=NUM_WORKERS, - process=None, -): - mnist = prepare_torch_dataset(dataset_info) - path = os.path.join(ROOT, "Hub_data", "torch") - Dataset.from_pytorch(HubAdapter(mnist)).store(path) - dset = Dataset(path, cache=False, storage_cache=False, mode="r") - - loader = torch.utils.data.DataLoader( - dset.to_pytorch(), - batch_size=batch_size, - prefetch_factor=prefetch_factor, - num_workers=num_workers, - ) - - with Timer("Hub (local) `.to_pytorch()`"): - for image, label in loader: - if process is not None: - process(image, label) - - -def time_iter_hub_wasabi_pytorch( - dataset_info, - batch_size=BATCH_SIZE, - prefetch_factor=PREFETCH_SIZE, - num_workers=NUM_WORKERS, - process=None, -): - dset = Dataset(dataset_info["hub_name"], cache=False, storage_cache=False, mode="r") - loader = torch.utils.data.DataLoader( - dset.to_pytorch(), - batch_size=batch_size, - prefetch_factor=prefetch_factor, - num_workers=num_workers, - ) - - with Timer("Hub (remote - Wasabi) `.to_pytorch()`"): - for image, label in loader: - if process is not None: - process(image, label) - - -def time_iter_hub_s3_pytorch( - dataset_info, - batch_size=BATCH_SIZE, - prefetch_factor=PREFETCH_SIZE, - num_workers=NUM_WORKERS, - process=None, -): - dset = Dataset( - f"{S3_PATH}{dataset_info['s3_name']}", - cache=False, - storage_cache=False, - mode="r", - ) - loader = torch.utils.data.DataLoader( - dset.to_pytorch(), - batch_size=batch_size, - prefetch_factor=prefetch_factor, - num_workers=num_workers, - ) - - with Timer("Hub (remote - S3) `.to_pytorch()`"): - for image, label in loader: - if process is not None: - process(image, label) - - -def time_iter_pytorch( - dataset_info, - batch_size=BATCH_SIZE, - prefetch_factor=PREFETCH_SIZE, - num_workers=NUM_WORKERS, - process=None, -): - dset = prepare_torch_dataset(dataset_info) - - loader = torch.utils.data.DataLoader( - dset, - batch_size=batch_size, - prefetch_factor=prefetch_factor, - num_workers=num_workers, - ) - - with Timer("PyTorch (local, native)"): - for image, label in loader: - if process is not None: - process(image, label) - - -def time_iter_hub_local_tensorflow( - dataset_info, batch_size=BATCH_SIZE, prefetch_factor=PREFETCH_SIZE, process=None -): - dset = Dataset.from_tfds(dataset_info["name"], split=dataset_info["split"]) - path = os.path.join(ROOT, "Hub_data", "tfds") - dset.store(path) - dset = Dataset(path, cache=False, storage_cache=False, mode="r") - loader = dset.to_tensorflow().batch(batch_size).prefetch(prefetch_factor) - - with Timer("Hub (local) `.to_tensorflow()`"): - for batch in loader: - image = batch["image"] - label = batch["label"] - if process is not None: - process(image, label) - - -def time_iter_hub_wasabi_tensorflow( - dataset_info, batch_size=BATCH_SIZE, prefetch_factor=PREFETCH_SIZE, process=None -): - dset = Dataset(dataset_info["hub_name"], cache=False, storage_cache=False, mode="r") - loader = dset.to_tensorflow().batch(batch_size).prefetch(prefetch_factor) - - with Timer("Hub (remote - Wasabi) `.to_tensorflow()`"): - for batch in loader: - image = batch["image"] - label = batch["label"] - if process is not None: - process(image, label) - - -def time_iter_hub_s3_tensorflow( - dataset_info, batch_size=BATCH_SIZE, prefetch_factor=PREFETCH_SIZE, process=None -): - dset = Dataset( - f"{S3_PATH}{dataset_info['s3_name']}", - cache=False, - storage_cache=False, - mode="r", - ) - loader = dset.to_tensorflow().batch(batch_size).prefetch(prefetch_factor) - - with Timer("Hub (remote - S3) `.to_tensorflow()`"): - for batch in loader: - image = batch["image"] - label = batch["label"] - if process is not None: - process(image, label) - - -def time_iter_tensorflow( - dataset_info, batch_size=BATCH_SIZE, prefetch_factor=PREFETCH_SIZE, process=None -): - # turn off auto-splitting and disable multiprocessing - options = tf.data.Options() - blockAS = tf.data.experimental.AutoShardPolicy.OFF - options.experimental_distribute.auto_shard_policy = blockAS - options.experimental_optimization.autotune_cpu_budget = 1 - - loader = tfds.load(dataset_info["name"], split=dataset_info["split"]).with_options( - options - ) - - with Timer("Tensorflow (local, native - TFDS)"): - for batch in loader: - image = batch["image"] - label = batch["label"] - if process is not None: - process(image, label) - - -if __name__ == "__main__": - for i, info in enumerate(DATASET_INFO): - print(f'BENCHMARK DATASET #{i}: {info["name"]}') - time_iter_hub_wasabi_pytorch(info) - time_iter_hub_local_pytorch(info) - time_iter_hub_s3_pytorch(info) - time_iter_pytorch(info) - time_iter_hub_wasabi_tensorflow(info) - time_iter_hub_local_tensorflow(info) - time_iter_hub_s3_tensorflow(info) - time_iter_tensorflow(info) diff --git a/benchmarks/benchmark_dataset_iter.py b/benchmarks/benchmark_dataset_iter.py deleted file mode 100644 index 7f282b318e..0000000000 --- a/benchmarks/benchmark_dataset_iter.py +++ /dev/null @@ -1,63 +0,0 @@ -""" -License: -This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. -If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/. -""" - -import torch - -from hub import Dataset -from hub.utils import Timer - -DATASET_NAMES = ["activeloop/mnist", "activeloop/cifar10_train"] - -BATCH_SIZES = [1, 16, 128] - -PREFETCH_SIZES = [1, 4, 16, 128] - - -def time_iter_pytorch( - dataset_name="activeloop/mnist", batch_size=1, prefetch_factor=0, process=None -): - - dset = Dataset(dataset_name, cache=False, storage_cache=False, mode="r") - - loader = torch.utils.data.DataLoader( - dset.to_pytorch(), - batch_size=batch_size, - prefetch_factor=prefetch_factor, - num_workers=1, - ) - - with Timer( - f"{dataset_name} PyTorch prefetch {prefetch_factor:03} in batches of {batch_size:03}" - ): - for idx, (image, label) in enumerate(loader): - if process is not None: - process(idx, image, label) - - -def time_iter_tensorflow( - dataset_name="activeloop/mnist", batch_size=1, prefetch_factor=0, process=None -): - - dset = Dataset(dataset_name, cache=False, storage_cache=False, mode="r") - - loader = dset.to_tensorflow().batch(batch_size).prefetch(prefetch_factor) - - with Timer( - f"{dataset_name} TF prefetch {prefetch_factor:03} in batches of {batch_size:03}" - ): - for idx, batch in enumerate(loader): - image = batch["image"] - label = batch["label"] - if process is not None: - process(idx, image, label) - - -if __name__ == "__main__": - for name in DATASET_NAMES: - for size in BATCH_SIZES: - for prefetch in PREFETCH_SIZES: - time_iter_pytorch(name, size, prefetch, None) - time_iter_tensorflow(name, size, prefetch, None) diff --git a/benchmarks/benchmark_iterate_hub_local_pytorch.py b/benchmarks/benchmark_iterate_hub_local_pytorch.py new file mode 100644 index 0000000000..5915a0d079 --- /dev/null +++ b/benchmarks/benchmark_iterate_hub_local_pytorch.py @@ -0,0 +1,59 @@ +import torchvision +from torchvision import transforms +import torch +import os + +from hub import Dataset + + +class HubAdapter(torch.utils.data.Dataset): + def __init__(self, ds): + self.ds = ds + + def __len__(self): + return len(self.ds) + + @property + def shape(self): + return (len(self), None, None, None) + + def __iter__(self): + for i in range(len(self)): + yield self[i] + + def __getitem__(self, index): + img, label = self.ds.__getitem__(index) + return {"image": img, "label": label} + + +def benchmark_iterate_hub_local_pytorch_setup( + dataset_name, dataset_split, batch_size, prefetch_factor, num_workers=1 +): + trans = transforms.Compose([transforms.ToTensor()]) + data_path = os.path.join(".", "torch_data") + dset_type = getattr(torchvision.datasets, dataset_name) + path = os.path.join(".", "hub_data", "tfds") + dset = dset_type( + data_path, + transform=trans, + train=(False if "test" in dataset_split else None), + download=True, + ) + + Dataset.from_pytorch(HubAdapter(dset)).store(path) + dset = Dataset(path, cache=False, storage_cache=False, mode="r") + + loader = torch.utils.data.DataLoader( + dset.to_pytorch(), + batch_size=batch_size, + prefetch_factor=prefetch_factor, + num_workers=num_workers, + ) + + return (loader,) + + +def benchmark_iterate_hub_local_pytorch_run(params): + (loader,) = params + for _ in loader: + pass diff --git a/benchmarks/benchmark_iterate_hub_local_tensorflow.py b/benchmarks/benchmark_iterate_hub_local_tensorflow.py new file mode 100644 index 0000000000..3c03f2ee5a --- /dev/null +++ b/benchmarks/benchmark_iterate_hub_local_tensorflow.py @@ -0,0 +1,21 @@ +from hub import Dataset +import os + + +def benchmark_iterate_hub_local_tensorflow_setup( + dataset_name, dataset_split, batch_size, prefetch_factor +): + dset = Dataset.from_tfds(dataset_name, split=dataset_split) + path = os.path.join(".", "hub_data", "tfds") + dset.store(path) + dset = Dataset(path, cache=False, storage_cache=False, mode="r") + + loader = dset.to_tensorflow().batch(batch_size).prefetch(prefetch_factor) + + return (loader,) + + +def benchmark_iterate_hub_local_tensorflow_run(params): + (loader,) = params + for _ in loader: + pass diff --git a/benchmarks/benchmark_iterate_hub_pytorch.py b/benchmarks/benchmark_iterate_hub_pytorch.py new file mode 100644 index 0000000000..5e4499a39e --- /dev/null +++ b/benchmarks/benchmark_iterate_hub_pytorch.py @@ -0,0 +1,24 @@ +import torch + +from hub import Dataset + + +def benchmark_iterate_hub_pytorch_setup( + dataset_name, batch_size, prefetch_factor, num_workers=1 +): + dset = Dataset(dataset_name, cache=False, storage_cache=False, mode="r") + + loader = torch.utils.data.DataLoader( + dset.to_pytorch(), + batch_size=batch_size, + prefetch_factor=prefetch_factor, + num_workers=num_workers, + ) + + return (loader,) + + +def benchmark_iterate_hub_pytorch_run(params): + (loader,) = params + for _ in loader: + pass diff --git a/benchmarks/benchmark_iterate_hub_tensorflow.py b/benchmarks/benchmark_iterate_hub_tensorflow.py new file mode 100644 index 0000000000..21e4c71a81 --- /dev/null +++ b/benchmarks/benchmark_iterate_hub_tensorflow.py @@ -0,0 +1,14 @@ +from hub import Dataset + + +def benchmark_iterate_hub_tensorflow_setup(dataset_name, batch_size, prefetch_factor): + dset = Dataset(dataset_name, cache=False, storage_cache=False, mode="r") + + loader = dset.to_tensorflow().batch(batch_size).prefetch(prefetch_factor) + return (loader,) + + +def benchmark_iterate_hub_tensorflow_run(params): + (loader,) = params + for _ in loader: + pass diff --git a/benchmarks/benchmark_random_access.py b/benchmarks/benchmark_random_access.py deleted file mode 100644 index f0033dc158..0000000000 --- a/benchmarks/benchmark_random_access.py +++ /dev/null @@ -1,36 +0,0 @@ -""" -License: -This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. -If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/. -""" - -from random import randint -from itertools import chain - -from hub import Dataset -from hub.utils import Timer - -DATASET_NAMES = [ - "activeloop/mnist", - "activeloop/omniglot_test", - "activeloop/cifar10_train", - "activeloop/cifar100_train", -] - -SPAN_POWER_MAX = 10 - - -def time_random_access( - dataset_name="activeloop/mnist", offset=1000, span=1000, field="image" -): - dset = Dataset(dataset_name, cache=False, storage_cache=False) - with Timer(f"{dataset_name} read at offset {offset:03} of length {span:03}"): - dset[field][offset : offset + span].compute() - - -if __name__ == "__main__": - for name in DATASET_NAMES: - for span in range(SPAN_POWER_MAX): - offset = randint(0, 999) - time_random_access(name, offset, 2 ** span) - print() diff --git a/benchmarks/benchmark_run.sh b/benchmarks/benchmark_run.sh deleted file mode 100644 index 154a24855a..0000000000 --- a/benchmarks/benchmark_run.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -for filename in ./benchmark_*.py; do - [ -e "$filename" ] || continue - python3 ./$filename >> results.log -done \ No newline at end of file diff --git a/benchmarks/benchmark_sequential_access.py b/benchmarks/legacy_benchmark_sequential_access.py similarity index 100% rename from benchmarks/benchmark_sequential_access.py rename to benchmarks/legacy_benchmark_sequential_access.py diff --git a/benchmarks/benchmark_sequential_write.py b/benchmarks/legacy_benchmark_sequential_write.py similarity index 100% rename from benchmarks/benchmark_sequential_write.py rename to benchmarks/legacy_benchmark_sequential_write.py diff --git a/benchmarks/runner.ipynb b/benchmarks/runner.ipynb new file mode 100644 index 0000000000..2d20e4e93b --- /dev/null +++ b/benchmarks/runner.ipynb @@ -0,0 +1,328 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from hub.utils import Timer\n", + "from hub import Dataset\n", + "from memory_profiler import memory_usage\n", + "import asyncio\n", + "import psutil\n", + "import time\n", + "import os" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Network Monitoring Helpers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "async def network_monitor(check_finish, sample_frequency=1):\n", + " samples = []\n", + " prev_bytes = None\n", + " while True:\n", + " stats = psutil.net_io_counters()\n", + " if prev_bytes is not None:\n", + " samples.append((time.time(), stats.bytes_recv - prev_bytes))\n", + " prev_bytes = stats.bytes_recv\n", + " await asyncio.sleep(sample_frequency)\n", + " if check_finish():\n", + " return samples\n", + "\n", + "async def network_monitor_call(f):\n", + " is_finished = False\n", + " check_finish = lambda: is_finished\n", + " task = asyncio.create_task(network_monitor(check_finish))\n", + " await asyncio.to_thread(f)\n", + " is_finished = True\n", + " await task\n", + " return task.result()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Benchmark Runners" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def time_runner(params):\n", + " run_function, setup_function, setup_params = params\n", + " params = setup_function(*setup_params)\n", + " begin = time.time()\n", + " run_function(params)\n", + " end = time.time()\n", + " return end - begin\n", + "\n", + "def memory_runner(params):\n", + " run_function, setup_function, setup_params = params\n", + " params = setup_function(*setup_params)\n", + " baseline = memory_usage()\n", + " usage = memory_usage((run_function, (params,)))\n", + " return (max(baseline), max(usage))\n", + "\n", + "async def network_runner(params):\n", + " run_function, setup_function, setup_params = params\n", + " params = setup_function(*setup_params)\n", + " return await network_monitor_call(lambda: run_function(params))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Benchmarks" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Hub Full Dataset Access" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from benchmark_access_hub_full import benchmark_access_hub_full_setup, benchmark_access_hub_full_run\n", + "access_full_suite = [(benchmark_access_hub_full_run, benchmark_access_hub_full_setup, (dset,)) for dset in ['activeloop/mnist']]\n", + "\n", + "hub_full_times = list(map(time_runner, access_full_suite))\n", + "hub_full_mem_usages = list(map(memory_runner, access_full_suite))\n", + "hub_full_net_usages = [await network_runner(params) for params in access_full_suite]\n", + "\n", + "print(hub_full_times)\n", + "print(hub_full_mem_usages)\n", + "print(hub_full_net_usages)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Hub Random Slice Access" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from benchmark_access_hub_slice import benchmark_access_hub_slice_setup, benchmark_access_hub_slice_run\n", + "access_slice_suite = [(benchmark_access_hub_slice_run, benchmark_access_hub_slice_setup, t) for t in [('activeloop/mnist', (0, 256)), ('activeloop/mnist', (2048, 2048+256))]]\n", + "\n", + "hub_slice_times = list(map(time_runner, access_slice_suite))\n", + "hub_slice_mem_usages = list(map(memory_runner, access_slice_suite))\n", + "hub_slice_net_usages = [await network_runner(params) for params in access_slice_suite]\n", + "\n", + "print(hub_slice_times)\n", + "print(hub_slice_mem_usages)\n", + "print(hub_slice_net_usages)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Hub Compression" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from benchmark_compress_hub import benchmark_compress_hub_setup, benchmark_compress_hub_run\n", + "hub_compress_suite = [(benchmark_compress_hub_run, benchmark_compress_hub_setup, t) for t in [(32,)]]\n", + "\n", + "hub_compress_times = list(map(time_runner, hub_compress_suite))\n", + "hub_compress_mem_usages = list(map(memory_runner, hub_compress_suite))\n", + "\n", + "print(hub_compress_times)\n", + "print(hub_compress_mem_usages)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Pillow Compression" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from benchmark_compress_pillow import benchmark_compress_pillow_setup, benchmark_compress_pillow_run\n", + "pillow_compress_suite = [(benchmark_compress_pillow_run, benchmark_compress_pillow_setup, t) for t in [(32,)]]\n", + "\n", + "pillow_compress_times = list(map(time_runner, pillow_compress_suite))\n", + "pillow_compress_mem_usages = list(map(memory_runner, pillow_compress_suite))\n", + "\n", + "print(pillow_compress_times)\n", + "print(pillow_compress_mem_usages)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Hub Local Dataset Iteration - PyTorch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from benchmark_iterate_hub_local_pytorch import benchmark_iterate_hub_local_pytorch_setup, benchmark_iterate_hub_local_pytorch_run\n", + "iterate_local_pytorch_suite = [(benchmark_iterate_hub_local_pytorch_run, benchmark_iterate_hub_local_pytorch_setup, t) for t in [('MNIST', 'train', 128, 128)]]\n", + "\n", + "hub_iterate_local_pytorch_times = list(map(time_runner, iterate_local_pytorch_suite))\n", + "hub_iterate_local_pytorch_mem_usages = list(map(memory_runner, iterate_local_pytorch_suite))\n", + "hub_iterate_local_pytorch_net_usages = [await network_runner(params) for params in iterate_local_pytorch_suite]\n", + "\n", + "print(hub_iterate_local_pytorch_times)\n", + "print(hub_iterate_local_pytorch_mem_usages)\n", + "print(hub_iterate_local_pytorch_net_usages)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Hub Local Dataset Iteration - Tensorflow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from benchmark_iterate_hub_local_tensorflow import benchmark_iterate_hub_local_tensorflow_setup, benchmark_iterate_hub_local_tensorflow_run\n", + "iterate_local_tensorflow_suite = [(benchmark_iterate_hub_local_tensorflow_run, benchmark_iterate_hub_local_tensorflow_setup, t) for t in [('mnist', 'train', 128, 128)]]\n", + "\n", + "hub_iterate_local_tensorflow_times = list(map(time_runner, iterate_local_tensorflow_suite))\n", + "hub_iterate_local_tensorflow_mem_usages = list(map(memory_runner, iterate_local_tensorflow_suite))\n", + "hub_iterate_local_tensorflow_net_usages = [await network_runner(params) for params in iterate_local_tensorflow_suite]\n", + "\n", + "print(hub_iterate_local_tensorflow_times)\n", + "print(hub_iterate_local_tensorflow_mem_usages)\n", + "print(hub_iterate_local_tensorflow_net_usages)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Hub Dataset Iteration - PyTorch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from benchmark_iterate_hub_pytorch import benchmark_iterate_hub_pytorch_setup, benchmark_iterate_hub_pytorch_run\n", + "iterate_pytorch_suite = [(benchmark_iterate_hub_pytorch_run, benchmark_iterate_hub_pytorch_setup, t) for t in [('activeloop/mnist', 128, 128)]]\n", + "\n", + "hub_iterate_pytorch_times = list(map(time_runner, iterate_pytorch_suite))\n", + "hub_iterate_pytorch_mem_usages = list(map(memory_runner, iterate_pytorch_suite))\n", + "hub_iterate_pytorch_net_usages = [await network_runner(params) for params in iterate_pytorch_suite]\n", + "\n", + "print(hub_iterate_pytorch_times)\n", + "print(hub_iterate_pytorch_mem_usages)\n", + "print(hub_iterate_pytorch_net_usages)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Hub Dataset Iteration - Tensorflow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from benchmark_iterate_hub_tensorflow import benchmark_iterate_hub_tensorflow_setup, benchmark_iterate_hub_tensorflow_run\n", + "iterate_tensorflow_suite = [(benchmark_iterate_hub_tensorflow_run, benchmark_iterate_hub_tensorflow_setup, t) for t in [('activeloop/mnist', 128, 128)]]\n", + "\n", + "hub_iterate_tensorflow_times = list(map(time_runner, iterate_tensorflow_suite))\n", + "hub_iterate_tensorflow_mem_usages = list(map(memory_runner, iterate_tensorflow_suite))\n", + "hub_iterate_tensorflow_net_usages = [await network_runner(params) for params in iterate_tensorflow_suite]\n", + "\n", + "print(hub_iterate_tensorflow_times)\n", + "print(hub_iterate_tensorflow_mem_usages)\n", + "print(hub_iterate_tensorflow_net_usages)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Output Benchmark Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Write to your favourite file format here" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/requirements-benchmarks.txt b/requirements-benchmarks.txt index 578a345cea..f119fd4c83 100644 --- a/requirements-benchmarks.txt +++ b/requirements-benchmarks.txt @@ -2,3 +2,4 @@ tiledb==0.8.6 torchvision==0.9.0 altair==4.1.0 altair_saver==0.5.0 +psutil>=5.8