# Jpeg Decoding Benchmark
The purpose of this notebook isn't to reach a conclusion about which library is fastest: small differences in setup and library versions can change the results, and notebooks themselves are not a contained environment. In other words, **your mileage may vary**.

The purpose is to give you a simple notebook you can run on your own setup to see how the libraries compare and whether you can make some improvements. For example: updating turbojpeg to version 2.1 if you are doing CPU decoding, or trying out GPU decoding (in my experience, Nvidia DALI GPU decoding is always faster).

Let me know if you spot mistakes, and/or share your results in a gist.

In [1]:
import io
import time

import numpy as np
import torch
from image_checker.checker import batch
from image_checker.DaliChecker import DaliChecker
from image_checker.iterators import folder_iterator
from PIL import Image
from torchvision import transforms as t
from torchvision.io import decode_image,decode_jpeg, read_image
from torchvision import __version__ as torchvision_version
from more_itertools import chunked

In [2]:
# !python3 -c "import fastai.utils; fastai.utils.check_perf()"

# Benchmarking Bytes->Tensor

In [3]:
# Number of JPEGs to benchmark; every decoder below is timed on this same list.
img_count=256
folder="/home1/ceyda/data/dali_test/fine/" # a folder full of jpegs -- point this at a folder on your machine
files=folder_iterator(folder,extensions=["jpg","jpeg"],recursive=False)
# folder_iterator is consumed as (encoded_image, path) pairs. Pairing it with
# range(img_count) stops after img_count items instead of reading every file in the
# folder and slicing afterwards; range comes first so no extra item is pulled from files.
images_bytes=[im for _,(im,path) in zip(range(img_count),files)]
# Fail loudly here: with no images, every benchmark below would just time an empty loop.
assert images_bytes, f"no .jpg/.jpeg files found in {folder}"

In [4]:
# Wrap every encoded image in an in-memory file object ahead of time, so that
# building the BytesIO objects does not affect the timing of the PIL benchmark.
images_bytes_io=list(map(io.BytesIO,images_bytes))

# PIL-SIMD

In [5]:
from PIL.features import check_feature
# Ask the installed Pillow build about its "libjpeg_turbo" feature; the result is
# displayed as the cell output.
check_feature("libjpeg_turbo")

True

In [6]:
%%timeit -n 1 -r 100
# PIL_SIMD -> Tensor output
for image_bytes in images_bytes_io:
    z=Image.open(image_bytes)
    z=t.functional.pil_to_tensor(z)
#     z.to("cuda:0")
# Tensor output

  img = torch.as_tensor(np.asarray(pic))
692 ms ± 15.1 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


# Torchvision (CPU)

In [7]:
# !pip3 install --upgrade torch torchvision torchaudio

In [8]:
# !pip3 install --upgrade torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html

In [9]:
# decode_jpeg(..., device='cuda') is used in the GPU benchmark, so require torchvision 0.10 or newer.
# Compare the numeric (major, minor) pair rather than searching for the substring '0.10',
# which rejects every newer release and can match unrelated parts of a version string.
_tv_major_minor = tuple(int(part) for part in torchvision_version.split(".")[:2])
assert _tv_major_minor >= (0, 10), f"torchvision >= 0.10 required, found {torchvision_version}"
print('torchvision_version',torchvision_version)
print(torch.cuda.is_available())
print(torch.version.cuda)
print(torch._C._cuda_getCompiledVersion(), 'cuda compiled version')
# CUDA version >= 10.1

torchvision_version 0.10.0+cu102
True
10.2
10020 cuda compiled version


In [10]:
%%timeit -n 1 -r 100
for img_bytes in images_bytes:
    z=torch.from_numpy(img_bytes)
#     z=z.to("cuda:0")
    z=decode_jpeg(z)
# z=z.to("cuda:0")
# Tensor output

514 ms ± 20.9 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


# Torchvision (GPU) (BETA)

In [16]:
%%timeit -n 1 -r 100
for img_bytes in images_bytes:
    z=torch.from_numpy(img_bytes)
#     z=z.to("cuda:0")
    z=decode_jpeg(z, device='cuda')
    # torch.cuda.synchronize()
#     z.to("cuda:0")
# Tensor output

913 ms ± 81.2 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


In [12]:
# !wget https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py
# !python3 collect_env.py

# Nvidia DALI (nvJPEG on GPU)

In [17]:
batch_size=256 #speed can change depending on batch size
# Build the DALI decoder that the timing cell feeds batches into.
dali_decoder = DaliChecker(batch_size, prefetch=1, device='mixed') # mixed means GPU+CPU

In [18]:
# Split the encoded images into consecutive groups of up to batch_size items, one per DALI batch.
batches_of_image_bytes=[*chunked(images_bytes,batch_size)]

In [19]:
%%timeit -n 1 -r 100
for image_bytes in batches_of_image_bytes:
    dali_decoder.feed(image_bytes)
    dali_tensor=dali_decoder.pipe.run()
    # dali_tensor is on the GPU!
    # Tensor output even moved to GPU

195 ms ± 13.2 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


# Nvidia DALI (on CPU)

In [20]:
# Rebuild the DALI decoder with device='cpu'; this rebinds dali_decoder, replacing the 'mixed' one.
batch_size=256
dali_decoder = DaliChecker(batch_size, prefetch=1, device='cpu')

In [21]:
# Split the encoded images into consecutive groups of up to batch_size items, one per DALI batch.
batches_of_image_bytes=[*chunked(images_bytes,batch_size)]

In [22]:
%%timeit -n 1 -r 100
for image_bytes in batches_of_image_bytes:
    dali_decoder.feed(image_bytes)
    dali_tensor=dali_decoder.pipe.run()
    # dali_tensor is on the CPU!

487 ms ± 27.6 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


# OPENCV

In [23]:
import cv2 as cv

In [24]:
import re
# Pull the value of the "JPEG:" entry out of OpenCV's build report to see which
# JPEG library this OpenCV build uses.
_jpeg_match = re.search(
    r".*JPEG:\W+(?P<jpeg_lib>.*)", cv.getBuildInformation()
)
# re.search returns None when the report has no "JPEG:" entry; report that instead
# of failing with an AttributeError on None.
cv2_jpeg_lib = (
    _jpeg_match.group("jpeg_lib")
    if _jpeg_match
    else "unknown (no JPEG entry in OpenCV build information)"
)
print(cv2_jpeg_lib)

libjpeg-turbo (ver 2.0.6-62)


In [25]:
%%timeit -n 1 -r 100
for img_bytes in images_bytes:
    z = cv.imdecode(img_bytes, cv.IMREAD_COLOR)
    # z is decoded image
    z=torch.as_tensor(z)
#     print(z)
    # z is Tensor on cpu

880 ms ± 20.6 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


# SimpleJpeg

In [26]:
import simplejpeg

In [27]:
# !pip3 install simplejpeg

In [28]:
%%timeit -n 1 -r 100
# simplejpeg
for img_bytes in images_bytes:
    z=simplejpeg.decode_jpeg(img_bytes, fastdct=True, fastupsample=True)
    # z is decoded image
    z=torch.as_tensor(z)
    # z is Tensor on cpu

442 ms ± 16.9 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


# PyTurboJPEG

In [29]:
# !pip3 install PyTurboJPEG

In [30]:
from turbojpeg import TurboJPEG,TJFLAG_FASTUPSAMPLE,TJFLAG_FASTDCT
# To use a specific libturbojpeg build, pass its path explicitly, e.g.:
# jpeg = TurboJPEG('/usr/lib64/libturbojpeg.so')
# Here no path is given, so the default library installation is used.
jpeg = TurboJPEG()

In [31]:
%%timeit -n 1 -r 100
for img_bytes in images_bytes:
    # z = jpeg.decode(img_bytes)
    z = jpeg.decode(img_bytes,flags=TJFLAG_FASTUPSAMPLE|TJFLAG_FASTDCT) # with flags around -50ms faster
    # z is decoded image
    z = torch.as_tensor(z)
    # z is Tensor on cpu

497 ms ± 16.7 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


In [None]:
# The benchmarks above were run on a TITAN RTX GPU.