In [1]:
import io
import time

import numpy as np
import torch
from image_checker.checker import batch
from image_checker.DaliChecker import DaliChecker
from image_checker.iterators import folder_iterator
from PIL import Image
from torchvision import transforms as t
from torchvision.io import decode_image,decode_jpeg, read_image
from torchvision import __version__ as torchvision_version
from more_itertools import chunked

In [None]:
# !python3 -c "import fastai.utils; fastai.utils.check_perf()"

# Benchmarking Bytes->Tensor

In [2]:
from itertools import islice

img_count = 256  # number of encoded images used by every benchmark below
folder = "/home1/ceyda/data/dali_test/fine/"  # a folder full of jpegs
files = folder_iterator(folder, extensions=["jpg", "jpeg"], recursive=False)
# Stop iterating after `img_count` entries rather than pulling every file in
# the folder and discarding the surplus afterwards. The resulting list is the
# same: the first `img_count` items yielded by `files`.
images_bytes = [im for im, _path in islice(files, img_count)]

In [3]:
# Wrap every encoded image in a file-like object ahead of time so that the
# wrapping cost does not affect the timings below.
images_bytes_io = list(map(io.BytesIO, images_bytes))

# PIL-SIMD

In [4]:
from PIL.features import check_feature

# Report whether the installed Pillow build has the "libjpeg_turbo" feature.
check_feature(feature="libjpeg_turbo")

True

In [5]:
%%timeit -n 1 -r 100
# PIL_SIMD -> Tensor output
for image_bytes in images_bytes_io:
    z=Image.open(image_bytes)
    z=t.functional.pil_to_tensor(z)
#     z.to("cuda:0")
# Tensor output

  img = torch.as_tensor(np.asarray(pic))
714 ms ± 18.6 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


# Torchvision (CPU)

In [6]:
# !pip3 install --upgrade torch torchvision torchaudio

In [None]:
# !pip3 install --upgrade torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html
# Currently getting the following error with the CUDA 11 (cu111) build:
# ~/.local/lib/python3.6/site-packages/torchvision/io/image.py in decode_jpeg(input, mode, device)
#     174     device = torch.device(device)
#     175     if device.type == 'cuda':
# --> 176         output = torch.ops.image.decode_jpeg_cuda(input, mode.value, device)
#     177     else:
#     178         output = torch.ops.image.decode_jpeg(input, mode.value)

# RuntimeError: nvjpegDecode failed: 5

In [7]:
# The benchmarks below expect a torchvision version string containing '0.10'.
assert '0.10' in torchvision_version
print('torchvision_version', torchvision_version)
# CUDA availability first, then the CUDA version reported by torch, one per line.
print(torch.cuda.is_available(), torch.version.cuda, sep="\n")
# CUDA version >= 10.1
print(torch._C._cuda_getCompiledVersion(), 'cuda compiled version')

torchvision_version 0.10.0+cu102
True
10.2
10020 cuda compiled version


In [8]:
%%timeit -n 1 -r 100
for img_bytes in images_bytes:
    z=torch.from_numpy(img_bytes)
#     z=z.to("cuda:0")
    z=decode_jpeg(z)
    # z=z.to("cuda:0")
# Tensor output

538 ms ± 24.3 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


# Torchvision (GPU)

In [9]:
%%timeit -n 1 -r 100
for img_bytes in images_bytes:
    z=torch.from_numpy(img_bytes)
#     z=z.to("cuda:0")
    z=decode_jpeg(z, device='cuda')
#     z.to("cuda:0")
# Tensor output

The slowest run took 6.68 times longer than the fastest. This could mean that an intermediate result is being cached.
985 ms ± 414 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


# Nvidia DALI (nvJPEG on GPU)

In [10]:
batch_size = 256  # throughput can change depending on the batch size
# 'mixed' means GPU+CPU
dali_decoder = DaliChecker(batch_size, device='mixed', prefetch=1)

In [11]:
# Split the encoded images into lists of at most `batch_size` items each.
batches_of_image_bytes = [*chunked(images_bytes, batch_size)]

In [12]:
%%timeit -n 1 -r 100
for image_bytes in batches_of_image_bytes:
    dali_decoder.feed(image_bytes)
    dali_tensor=dali_decoder.pipe.run()
    # dali_tensor is on the GPU!
    # Tensor output even moved to GPU

194 ms ± 9.67 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


# Nvidia DALI (on CPU)

In [13]:
batch_size = 256  # same batch size as the mixed-device run
# Rebuild the decoder so that it runs on the CPU only.
dali_decoder = DaliChecker(batch_size, device='cpu', prefetch=1)

In [14]:
# Split the encoded images into lists of at most `batch_size` items each.
batches_of_image_bytes = [*chunked(images_bytes, batch_size)]

In [15]:
%%timeit -n 1 -r 100
for image_bytes in batches_of_image_bytes:
    dali_decoder.feed(image_bytes)
    dali_tensor=dali_decoder.pipe.run()
    # dali_tensor is on the CPU!

497 ms ± 41.4 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


# OpenCV

In [16]:
import cv2 as cv

In [18]:
import re

# Extract the JPEG backend from OpenCV's build report (the text that follows
# the "JPEG:" label).
jpeg_lib_match = re.search(
    r".*JPEG:\W+(?P<jpeg_lib>.*)", cv.getBuildInformation()
)
# re.search returns None when the report has no JPEG entry; fall back to a
# placeholder instead of failing with AttributeError on None.
cv2_jpeg_lib = jpeg_lib_match.group("jpeg_lib") if jpeg_lib_match else "unknown"
print(cv2_jpeg_lib)

libjpeg-turbo (ver 2.0.6-62)


In [17]:
%%timeit -n 1 -r 100
for img_bytes in images_bytes:
    z = cv.imdecode(img_bytes, cv.IMREAD_COLOR)
    # z is decoded image
    z=torch.as_tensor(z)
#     print(z)
    # z is Tensor on cpu

880 ms ± 18.7 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


# SimpleJpeg

In [19]:
import simplejpeg

In [20]:
# !pip3 install simplejpeg

In [21]:
%%timeit -n 1 -r 100
# simplejpeg
for img_bytes in images_bytes:
    z=simplejpeg.decode_jpeg(img_bytes, fastdct=True, fastupsample=True)
    # z is decoded image
    z=torch.as_tensor(z)
    # z is Tensor on cpu

444 ms ± 12.3 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


# PyTurboJPEG

In [22]:
# !pip3 install PyTurboJPEG

In [23]:
from turbojpeg import TurboJPEG

# No path argument: use the default library installation.
# To specify the library path explicitly instead:
#   jpeg = TurboJPEG('/usr/lib64/libturbojpeg.so')
jpeg = TurboJPEG()

In [24]:
%%timeit -n 1 -r 100
for img_bytes in images_bytes:
    z = jpeg.decode(img_bytes)
    # z is decoded image
    z = torch.as_tensor(z)
    # z is Tensor on cpu

517 ms ± 16 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


In [None]:
# The benchmarks above were run on a TITAN RTX.