### Dependencies
```Bash
$conda install numba pillow scipy joblib numpy imageio
$pip install contexttimer
```

In [73]:
from numba import jit
import imageio
import contexttimer
import time
from joblib import Parallel
import numpy as np
from scipy import ndimage
TESTSIZE=100

Define the image processing function.

In [74]:
def process_img(filename_in, filename_out):
    '''
    Replace with your function loading image, processing it, writing it.
    Note that using @jit(nogil=True) will not work, but Python IO does not use the GIL, so you are ok.
    '''
    img = imageio.imread(filename_in)
    rimg = ndimage.rotate(img, 45)
    img = imageio.imwrite(filename_out, rimg)


Generate test images.

In [75]:
# Create some random data
np.random.seed(42)
# Build list of filenames
inputfiles, outputfiles = zip(*[ ('in_{}.png'.format(i), 'out_{}.png'.format(i)) for i in range(TESTSIZE)])

for infile in inputfiles:
    imageio.imwrite(infile, np.random.randint(0, 255, (512,512,3), dtype=np.uint8))
    

Paralellize

In [76]:
def parallel_image_processing(inputfilenames, outputfilenames, thread_count):
    '''
    Execute without timing.
    '''
    # joblist
    # [ (function_name, argumentlist, dict of keyword args), ...]
    calc_jobs=[(process_img, [fin, fout],{}) for (fin, fout) in zip(inputfilenames, outputfilenames)]
    with Parallel(n_jobs=thread_count,backend='threading') as parallel:
        r=parallel(calc_jobs)

def benchmark_image_processing(inputfilenames, outputfilenames, thread_count):
    '''
    Execute and time
    '''
    calc_jobs=[(process_img, [fin, fout],{}) for (fin, fout) in zip(inputfilenames, outputfilenames)]    
    w, c, s = 0, 0, 0
    with  contexttimer.Timer(time.perf_counter) as wall:
        with contexttimer.Timer(time.process_time) as cpu:
            with Parallel(n_jobs=thread_count,backend='threading') as parallel:
                r=parallel(calc_jobs)
                w = wall.elapsed
                c = cpu.elapsed
                s = c/w
    return w,c,s

### No threads

In [77]:
threads = 1
print('Threading speedup with {} jobs spread over {} threads'.format(len(inputfiles), threads))
wall, cpu, speedup = benchmark_image_processing(inputfiles, outputfiles, thread_count=threads)
print('Total Wall Time {}'.format(wall))
print('Total CPU Time {}'.format(wall))

Threading speedup with 100 jobs spread over 1 threads
Speedup ratio (1 = Linear) 0.9905359970339439
Total Wall Time 29.461417859001813
Total CPU Time 29.461417859001813


### 4 Threads

In [79]:
threads = 4
print('Threading speedup with {} jobs spread over {} threads'.format(len(inputfiles), threads))
wall, cpu, speedup = benchmark_image_processing(inputfiles, outputfiles, thread_count=threads)
print('Speedup ratio (1 = Linear) {}'.format(speedup/threads))
print('Total Wall Time {}'.format(wall))
print('Total CPU Time {}'.format(wall))
print('Speedup factor {}'.format(speedup))

Threading speedup with 100 jobs spread over 4 threads
Speedup ratio (1 = Linear) 0.9309372795824026
Total Wall Time 11.846061301999725
Total CPU Time 11.846061301999725
Speedup factor3.7237491183296103
