In [1]:
import io
import sys
import time
from itertools import islice

import numpy as np
import torch
from image_checker.checker import batch
from image_checker.DaliChecker import DaliChecker
from image_checker.iterators import folder_iterator
from PIL import Image
from torchvision import transforms as t
from torchvision.io import decode_image, read_image
from more_itertools import chunked
import cv2 as cv
import simplejpeg

In [2]:
!python3 -c "import fastai.utils; fastai.utils.check_perf()"

Running performance checks.

*** libjpeg-turbo status
✔ libjpeg-turbo is on

*** Pillow-SIMD status
✔ Running Pillow-SIMD 7.0.0.post3

*** CUDA status
✔ Running the latest CUDA 10.2 with NVIDIA driver 440.33.01

Refer to https://docs.fast.ai/performance.html to make sense out of these checks and suggestions.


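# Benchmarking Bytes->Tensor

Each benchmark below decodes the same in-memory JPEG buffers (up to 256 images) into a tensor. Every reported time is for one full pass over all images, repeated 100 times.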

In [3]:
# Benchmark input: the encoded bytes of up to `img_count` JPEG files taken from
# a single folder (no recursion). Every decoder below is timed on this list.
img_count = 256
files = folder_iterator(
    "/mnt/data/dali_test/fine/", extensions=["jpg", "jpeg"], recursive=False
)
# folder_iterator yields (image_bytes, path) pairs. islice stops pulling items
# after `img_count` of them, so (assuming the iterator is lazy) the rest of the
# folder is never read just to be thrown away by a trailing slice.
images_bytes = [im for im, _path in islice(files, img_count)]
# Fail loudly instead of silently timing empty loops in the cells below.
assert images_bytes, "no .jpg/.jpeg images found in the benchmark folder"

In [4]:
# Wrap each encoded image in a file-like object up front, so that building the
# BytesIO wrappers does not affect the timing of the PIL benchmark.
images_bytes_io = list(map(io.BytesIO, images_bytes))

# Pillow-SIMD

In [5]:
from PIL.features import check_feature

# Ask Pillow itself whether it reports the "libjpeg_turbo" feature.
pil_has_libjpeg_turbo = check_feature("libjpeg_turbo")
pil_has_libjpeg_turbo

True

In [6]:
%%timeit -n 1 -r 100
# PIL_SIMD -> Tensor output
for image_bytes in images_bytes_io:
    z=Image.open(image_bytes)
    z=t.functional.pil_to_tensor(z)
#     z.to("cuda:0")
# Tensor output

684 ms ± 18.5 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


# Torchvision

In [7]:
%%timeit -n 1 -r 100
for img_bytes in images_bytes:
    z=torch.from_numpy(img_bytes)
#     z=z.to("cuda:0")
    z=decode_image(z)
#     z.to("cuda:0")
# Tensor output

510 ms ± 10.5 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


# Nvidia DALI (nvJPEG on GPU)

In [8]:
# DALI decoder for the GPU benchmark. 'mixed' means the work is split between
# GPU and CPU. batch_size equals the number of benchmark images requested.
batch_size = 256
dali_decoder = DaliChecker(
    batch_size,
    prefetch=1,
    device="mixed",
)

In [9]:
# Group the encoded images into lists of at most `batch_size` items each.
batches_of_image_bytes = [*chunked(images_bytes, batch_size)]

In [10]:
%%timeit -n 1 -r 100
for image_bytes in batches_of_image_bytes:
    dali_decoder.feed(image_bytes)
    dali_tensor=dali_decoder.pipe.run()
    # dali_tensor is on the GPU!
    # Tensor output even moved to GPU

193 ms ± 11.3 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


# Nvidia DALI (on CPU)

In [11]:
# DALI decoder for the CPU-only benchmark. This rebinds `dali_decoder`, so the
# timing cell that follows measures the 'cpu' device.
batch_size = 256
dali_decoder = DaliChecker(
    batch_size,
    prefetch=1,
    device="cpu",
)

In [12]:
# Group the encoded images into lists of at most `batch_size` items each.
batches_of_image_bytes = [*chunked(images_bytes, batch_size)]

In [13]:
%%timeit -n 1 -r 100
for image_bytes in batches_of_image_bytes:
    dali_decoder.feed(image_bytes)
    dali_tensor=dali_decoder.pipe.run()
    # dali_tensor is on the CPU!

497 ms ± 37.1 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


# OpenCV

In [15]:
%%timeit -n 1 -r 100
for img_bytes in images_bytes:
    z = cv.imdecode(img_bytes, cv.IMREAD_COLOR)
    # z is decoded image
    z=torch.as_tensor(z)
    # z is Tensor on cpu

844 ms ± 24.7 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)


In [16]:
import re

# Report which JPEG library this OpenCV build uses, by pulling the "JPEG:"
# entry out of the text returned by cv.getBuildInformation().
jpeg_entry = re.search(
    r".*JPEG:\W+(?P<jpeg_lib>.*)", cv.getBuildInformation()
)
# re.search returns None when the build information has no "JPEG:" entry.
# Fall back to None instead of failing with an AttributeError on the match.
cv2_jpeg_lib = jpeg_entry.group("jpeg_lib") if jpeg_entry else None
if cv2_jpeg_lib is None:
    print("JPEG library not listed in OpenCV build information")
else:
    print(cv2_jpeg_lib)

libjpeg-turbo (ver 2.0.2-62)


# simplejpeg

In [17]:
%%timeit -n 1 -r 100
# simplejpeg
for img_bytes in images_bytes:
    z=simplejpeg.decode_jpeg(img_bytes, fastdct=True, fastupsample=True)
    # z is decoded image
    z=torch.as_tensor(z)
    # z is Tensor on cpu

437 ms ± 5.59 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)
