In [1]:
import io
import time
from itertools import islice

import numpy as np
import torch
from image_checker.checker import batch
from image_checker.DaliChecker import DaliChecker
from image_checker.iterators import folder_iterator
from more_itertools import chunked
from PIL import Image
from torchvision import transforms as t
from torchvision.io import decode_image, read_image

In [2]:
# Number of images every decoder below is benchmarked on.
img_count = 256

# folder_iterator is iterated as (image, path) pairs; only the image payload is kept.
files = folder_iterator(
    "/mnt/data/dali_test/fine/",
    extensions=["jpg", "jpeg"],
    recursive=False,
)

# Stop after img_count items instead of reading the whole directory and then
# slicing: the result is the same first img_count payloads, without pulling
# every remaining file through the iterator.
images_bytes = [im for im, _path in islice(files, img_count)]

# An empty input would make every timing below meaningless, so fail loudly.
assert images_bytes, "no jpg/jpeg images found in /mnt/data/dali_test/fine/"

# PIL-SIMD

In [3]:
# Wrap every payload in a file-like object up front, in its own cell, so the
# wrapping cost does not affect the timed PIL loop.
images_bytes_io = list(map(io.BytesIO, images_bytes))

In [5]:
%%timeit -n 1 -r 100
# PIL_SIMD
# Benchmark: decode every image with PIL and convert it to a torch tensor.
# "-n 1 -r 100" = 100 runs of one loop each; one loop covers all of images_bytes_io.
# NOTE(review): the BytesIO objects are built once in an earlier cell and re-read
# on every run; this presumably relies on Image.open rewinding the stream - confirm.
for image_bytes in images_bytes_io:
    # Image.open is documented as lazy (it only identifies the file) ...
    z=Image.open(image_bytes)
    # ... so the pixel decode is expected to happen during this conversion.
    z=t.functional.pil_to_tensor(z)
# GPU transfer is disabled, so no host-to-device copy is part of this measurement.
#     z.to("cuda:0")

705 ms ± 20.2 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


# Torchvision

In [6]:
%%timeit -n 1 -r 100
# Benchmark: decode every image with torchvision's decode_image.
# "-n 1 -r 100" = 100 runs of one loop each; one loop covers all of images_bytes.
for image_bytes in images_bytes:
    # assumes each item is a NumPy array holding the encoded file bytes - TODO confirm.
    # Unlike the BytesIO wrapping for the PIL benchmark (done in a separate,
    # untimed cell), this tensor wrapping is inside the timed region.
    z=torch.from_numpy(image_bytes)
    z=decode_image(z)
# GPU transfer is disabled, so no host-to-device copy is part of this measurement.
#     z.to("cuda:0")

516 ms ± 14.4 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


# Nvidia DALI (nvJPEG on GPU)

In [7]:
# Number of images handed to the DALI pipeline per batch.
batch_size = 256

dali_decoder = DaliChecker(
    batch_size,
    prefetch=1,
    device="mixed",  # "mixed" means GPU+CPU
)

In [8]:
# Group the payloads into consecutive batches of at most batch_size items,
# materialised up front so batching is not part of the timed DALI loop.
batches_of_image_bytes = [*chunked(images_bytes, batch_size)]

In [9]:
%%timeit -n 1 -r 100
# Benchmark: decode the images batch-by-batch through the DALI pipeline.
# "-n 1 -r 100" = 100 runs of one loop each; one loop covers every batch.
# Here image_bytes is a whole batch (a chunk of up to batch_size payloads),
# not a single image as in the PIL and torchvision cells.
for image_bytes in batches_of_image_bytes:
    dali_decoder.feed(image_bytes)
    # NOTE(review): confirm pipe.run() returns only after the GPU work has
    # finished; otherwise this timing under-reports the decode cost.
    dali_tensor=dali_decoder.pipe.run()
    # dali_tensor is on the GPU!
    # NOTE(review): the PIL and torchvision cells have their GPU transfer
    # commented out, so the three timings are not strictly like-for-like.

200 ms ± 14.5 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)
