In [1]:
# Copyright © Advanced Micro Devices, Inc., or its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Multi-thread and Multi-process Tests

In this notebook, we compare hipCIM with OpenSlide in a multi-thread/multi-process environment.

`input/image.tif` file (whose size is 19920x26420 and tile size is 256x256) is used.

Since hipCIM does not implement an internal cache yet, its performance depends on the value of the `start_location` variable in the experiment code.

![](static_images/Multi-thread_and_Multi-process_Tests_Alignment.png)

For the first case (`start_location = 0`), when we read the whole image starting from (0,0) with a 256x256 patch size, both OpenSlide and hipCIM read each tile only once.
However, in the second case (`start_location = 1`), where patches are read starting from (1,1), hipCIM is at a disadvantage -- for the second patch (second red box), hipCIM needs to read four tiles whereas OpenSlide needs only two (the two tiles in the middle were already cached when OpenSlide read the first patch).

**Note:** hipCIM will support a cache mechanism in the near future.

The following system is used to conduct the experiment:
- OS: Ubuntu 22.04 LTS
- CPU: AMD EPYC 9654


Before running this notebook, download the test data by following these steps:
```
cd hipCIM
./run_amd download_testdata
```

In [2]:
from contextlib import ContextDecorator
from time import perf_counter

class Timer(ContextDecorator):
    """Context manager / decorator that prints the elapsed time of a block.

    On exit it prints ``"<message> : <seconds>"``, where the duration is
    measured with ``time.perf_counter``. Calling :meth:`elapsed_time` inside
    the block freezes the end time, so the value printed on exit equals the
    value that call returned.

    Args:
        message: Label printed in front of the elapsed time.
    """

    def __init__(self, message):
        self.message = message
        self.end = None

    def elapsed_time(self):
        """Record "now" as the end time and return the seconds since entry."""
        self.end = perf_counter()
        return self.end - self.start

    def __enter__(self):
        self.start = perf_counter()
        # Forget the end time of any previous use. ContextDecorator re-enters
        # the same instance on every decorated call, and a stale end time would
        # make __exit__ print a negative duration.
        self.end = None
        return self

    def __exit__(self, exc_type, exc, exc_tb):
        # Only take the end time here if elapsed_time() was not called in the block.
        if self.end is None:
            self.elapsed_time()
        print("{} : {}".format(self.message, self.end - self.start))

## Multithreading

In [3]:
import numpy as np
from openslide import OpenSlide
import concurrent.futures
from cucim import CuImage

import os

# Benchmark configuration for the multithreading cell.
# The sweep runs with 1..num_threads worker threads. To go up to one thread per
# logical CPU, enable the os.cpu_count() assignment and disable the fixed one.
# num_threads = os.cpu_count()
num_threads = 5

# Slide path, x/y offset of the first patch, and edge length of the square patches.
input_file, start_location, patch_size = "input/image.tif", 0, 256


def load_tile_openslide(slide, start_loc, patch_size):
    """Read one square patch through an OpenSlide-style handle and discard it.

    Args:
        slide: Object exposing ``read_region(location, level, size)``.
        start_loc: Patch location, forwarded unchanged as the first argument.
        patch_size: Edge length; the requested size is ``[patch_size, patch_size]``.

    Returns:
        None. The pixel data is dropped; only the read itself matters here.
    """
    level = 0
    patch_dims = [patch_size, patch_size]
    slide.read_region(start_loc, level, patch_dims)

def load_tile_cucim(slide, start_loc, patch_size):
    """Read one square patch through a CuImage-style handle and discard it.

    Args:
        slide: Object exposing ``read_region(location, size, level)``.
        start_loc: Patch location, forwarded unchanged as the first argument.
        patch_size: Edge length; the requested size is ``[patch_size, patch_size]``.

    Returns:
        None. The pixel data is dropped; only the read itself matters here.
    """
    level = 0
    patch_dims = [patch_size, patch_size]
    slide.read_region(start_loc, patch_dims, level)

# Thread-pool benchmark: for every pool size from 1 to num_threads, read all patches
# of the slide once with OpenSlide and once with hipCIM, then print the per-size and
# overall time ratios.
openslide_tot_time = 0
cucim_tot_time = 0
for num_workers in range(1, num_threads + 1):

    print("# of thread : {}".format(num_workers))
    openslide_time = 0

    with OpenSlide(input_file) as slide:
        width, height = slide.dimensions

        # Lazily yield the (x, y) start of every patch, row by row.
        start_loc_iter = ((sx, sy)
                          for sy in range(start_location, height, patch_size)
                              for sx in range(start_location, width, patch_size))
        with Timer("  Thread elapsed time (OpenSlide)") as timer:
            with concurrent.futures.ThreadPoolExecutor(
                max_workers=num_workers
            ) as executor:
                # executor.map() re-raises a worker's exception only when its result
                # is retrieved. Drain the iterator so a failed read aborts the run
                # instead of silently producing a meaningless timing.
                list(executor.map(
                    lambda start_loc: load_tile_openslide(slide, start_loc, patch_size),
                    start_loc_iter,
                ))
            # Freeze the end time here so the value printed by Timer on exit matches.
            openslide_time = timer.elapsed_time()
            openslide_tot_time += openslide_time

    cucim_time = 0
    # A single CuImage handle is shared by all worker threads of this pool.
    slide = CuImage(input_file)
    # The generator above is exhausted, so build a fresh one over the same grid.
    start_loc_iter = ((sx, sy)
                      for sy in range(start_location, height, patch_size)
                          for sx in range(start_location, width, patch_size))
    with Timer("  Thread elapsed time (hipCIM)") as timer:
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=num_workers
        ) as executor:
            # Drain the results for the same reason as in the OpenSlide run.
            list(executor.map(
                lambda start_loc: load_tile_cucim(slide, start_loc, patch_size),
                start_loc_iter,
            ))
        cucim_time = timer.elapsed_time()
        cucim_tot_time += cucim_time
    print("  Performance gain (OpenSlide/hipCIM): {}".format(openslide_time / cucim_time))

print("Total time (OpenSlide):", openslide_tot_time)
print("Total time (hipCIM):", cucim_tot_time)
print("Average performance gain (OpenSlide/hipCIM): {}".format(openslide_tot_time / cucim_tot_time))


# of thread : 1
  Thread elapsed time (OpenSlide) : 12.460168448276818
  Thread elapsed time (hipCIM) : 2.401326422113925
  Performance gain (OpenSlide/hipCIM): 5.18886909065363
# of thread : 2
  Thread elapsed time (OpenSlide) : 6.158470947761089
  Thread elapsed time (hipCIM) : 1.462195221800357
  Performance gain (OpenSlide/hipCIM): 4.211798025285808
# of thread : 3
  Thread elapsed time (OpenSlide) : 4.196710088755935
  Thread elapsed time (hipCIM) : 0.9856603043153882
  Performance gain (OpenSlide/hipCIM): 4.257765145235155
# of thread : 4
  Thread elapsed time (OpenSlide) : 3.2360566169954836
  Thread elapsed time (hipCIM) : 0.8101789290085435
  Performance gain (OpenSlide/hipCIM): 3.994249296208688
# of thread : 5
  Thread elapsed time (OpenSlide) : 2.6178723028860986
  Thread elapsed time (hipCIM) : 0.7375791491940618
  Performance gain (OpenSlide/hipCIM): 3.5492764481569146
Total time (OpenSlide): 28.669278404675424
Total time (hipCIM): 6.396940026432276
Average performance ga

## Multiprocessing (method1: Slow)

The image file is opened again for every patch.

In [4]:
import concurrent.futures
from itertools import repeat

import numpy as np
from openslide import OpenSlide
from cucim import CuImage

import os

# Benchmark configuration for this multiprocessing cell.
# The sweep runs with 1..num_processes worker processes. To go up to one process per
# logical CPU, enable the os.cpu_count() assignment and disable the fixed one.
# num_processes = os.cpu_count()
num_processes = 5

# Slide path, x/y offset of the first patch, and edge length of the square patches.
input_file, start_location, patch_size = "input/image.tif", 0, 256


def load_tile_openslide_mp(inp_file, start_loc, patch_size):
    """Open the slide with OpenSlide, read a single square patch, and close it.

    The file is opened on every call, i.e. once per patch.

    Args:
        inp_file: Path handed to ``OpenSlide``.
        start_loc: Patch location, forwarded unchanged to ``read_region``.
        patch_size: Edge length; the requested size is ``[patch_size, patch_size]``.

    Returns:
        None. The pixel data is dropped; only the read itself matters here.
    """
    patch_dims = [patch_size, patch_size]
    with OpenSlide(inp_file) as handle:
        handle.read_region(start_loc, 0, patch_dims)

def load_tile_cucim_mp(inp_file, start_loc, patch_size):
    """Open the slide with CuImage and read a single square patch from it.

    The file is opened on every call, i.e. once per patch.

    Args:
        inp_file: Path handed to ``CuImage``.
        start_loc: Patch location, forwarded unchanged to ``read_region``.
        patch_size: Edge length; the requested size is ``[patch_size, patch_size]``.

    Returns:
        None. The pixel data is dropped; only the read itself matters here.
    """
    patch_dims = [patch_size, patch_size]
    handle = CuImage(inp_file)
    handle.read_region(start_loc, patch_dims, 0)

# Process-pool benchmark, method 1: one job per patch, and every job re-opens the
# slide file. Runs with 1..num_processes workers for OpenSlide and for hipCIM.
openslide_tot_time = 0
cucim_tot_time = 0
for num_workers in range(1, num_processes + 1):

    print("# of processes : {}".format(num_workers))
    openslide_time = 0

    # The parent opens the slide only to learn its dimensions; workers open their own.
    with OpenSlide(input_file) as slide:
        width, height = slide.dimensions

        # Patch origins must be (x, y): sx walks the width, sy walks the height.
        # Emitting (sy, sx) would transpose the grid and, on a non-square slide,
        # request regions that lie outside the image.
        start_loc_iter = ((sx, sy)
                          for sy in range(start_location, height, patch_size)
                              for sx in range(start_location, width, patch_size))

        with Timer("  Process elapsed time (OpenSlide)") as timer:
            with concurrent.futures.ProcessPoolExecutor(
                max_workers=num_workers
            ) as executor:
                # executor.map() re-raises a worker's exception only when its result
                # is retrieved. Drain the iterator so a failed read aborts the run
                # instead of silently producing a meaningless timing.
                list(executor.map(
                    load_tile_openslide_mp,
                    repeat(input_file),
                    start_loc_iter,
                    repeat(patch_size)
                ))
            # Freeze the end time here so the value printed by Timer on exit matches.
            openslide_time = timer.elapsed_time()
            openslide_tot_time += openslide_time

    cucim_time = 0
    # No CuImage is opened in the parent: each job opens the file itself.
    # The generator above is exhausted, so build a fresh one over the same grid.
    start_loc_iter = ((sx, sy)
                      for sy in range(start_location, height, patch_size)
                          for sx in range(start_location, width, patch_size))
    with Timer("  Process elapsed time (hipCIM)") as timer:
        with concurrent.futures.ProcessPoolExecutor(
            max_workers=num_workers
        ) as executor:
            # Drain the results for the same reason as in the OpenSlide run.
            list(executor.map(
                load_tile_cucim_mp,
                repeat(input_file),
                start_loc_iter,
                repeat(patch_size)
            ))
        cucim_time = timer.elapsed_time()
        cucim_tot_time += cucim_time
    print("  Performance gain (OpenSlide/hipCIM): {}".format(openslide_time / cucim_time))

print("Total time (OpenSlide):", openslide_tot_time)
print("Total time (hipCIM):", cucim_tot_time)
print("Average performance gain (OpenSlide/hipCIM): {}".format(openslide_tot_time / cucim_tot_time))


# of processes : 1
  Process elapsed time (OpenSlide) : 179.53352003870532
  Process elapsed time (hipCIM) : 5.773879339918494
  Performance gain (OpenSlide/hipCIM): 31.09408934084855
# of processes : 2
  Process elapsed time (OpenSlide) : 110.39797480497509
  Process elapsed time (hipCIM) : 3.122366772033274
  Performance gain (OpenSlide/hipCIM): 35.35714503299185
# of processes : 3
  Process elapsed time (OpenSlide) : 313.79820303898305
  Process elapsed time (hipCIM) : 2.2553180432878435
  Performance gain (OpenSlide/hipCIM): 139.1370072938903
# of processes : 4
  Process elapsed time (OpenSlide) : 327.9693857920356
  Process elapsed time (hipCIM) : 1.7310898159630597
  Performance gain (OpenSlide/hipCIM): 189.45833010378837
# of processes : 5
  Process elapsed time (OpenSlide) : 327.44956391723827
  Process elapsed time (hipCIM) : 1.454124998766929
  Performance gain (OpenSlide/hipCIM): 225.18666840533615
Total time (OpenSlide): 1259.1486475919373
Total time (hipCIM): 14.3367789699

## Multiprocessing (method2: Faster)

Each process opens the file once and reuses it, but a separate job is still submitted for every patch request.

In [5]:
import concurrent.futures
from itertools import repeat
from functools import partial

import numpy as np
from openslide import OpenSlide
from cucim import CuImage

import os

# Benchmark configuration for this multiprocessing cell.
# The sweep runs with 1..num_processes worker processes. To go up to one process per
# logical CPU, enable the os.cpu_count() assignment and disable the fixed one.
# num_processes = os.cpu_count()
num_processes = 5

# Slide path, x/y offset of the first patch, and edge length of the square patches.
input_file, start_location, patch_size = "input/image.tif", 0, 256

# Module-level state used by the proc_init_* helpers: the slide handles they open
# lazily, and the flag they set once a handle has been opened.
openslide_obj = cucim_obj = None
is_process_initialized = False


def load_tile_openslide_mp(slide, start_loc, patch_size):
    """Read one square patch through an already opened OpenSlide-style handle.

    Args:
        slide: Object exposing ``read_region(location, level, size)``.
        start_loc: Patch location, forwarded unchanged as the first argument.
        patch_size: Edge length; the requested size is ``[patch_size, patch_size]``.

    Returns:
        None. The pixel data is dropped; only the read itself matters here.
    """
    level = 0
    patch_dims = [patch_size, patch_size]
    slide.read_region(start_loc, level, patch_dims)

def proc_init_openslide(inp_file, f, *iters):
    """Call ``f`` with this process's lazily opened OpenSlide handle.

    The handle is opened on the first call in a process and stored in the
    module-level ``openslide_obj`` for later calls.

    Args:
        inp_file: Path handed to ``OpenSlide`` when the handle is first opened.
        f: Callable invoked as ``f(openslide_obj, *iters)``.
        *iters: Extra positional arguments forwarded to ``f``.

    Returns:
        Whatever ``f`` returns.
    """
    global is_process_initialized, openslide_obj
    # Decide on the handle itself rather than on is_process_initialized. That flag is
    # shared with proc_init_cucim, so it can already be True while openslide_obj is
    # still None, and f would then be called with None.
    if openslide_obj is None:
        openslide_obj = OpenSlide(inp_file)
        # Still set the shared flag, as the original initializer did.
        is_process_initialized = True
    return f(openslide_obj, *iters)

def load_tile_cucim_mp(slide, start_loc, patch_size):
    """Read one square patch through an already opened CuImage-style handle.

    Args:
        slide: Object exposing ``read_region(location, size, level)``.
        start_loc: Patch location, forwarded unchanged as the first argument.
        patch_size: Edge length; the requested size is ``[patch_size, patch_size]``.

    Returns:
        None. The pixel data is dropped; only the read itself matters here.
    """
    level = 0
    patch_dims = [patch_size, patch_size]
    slide.read_region(start_loc, patch_dims, level)

def proc_init_cucim(inp_file, f, *iters):
    """Call ``f`` with this process's lazily opened CuImage handle.

    The handle is opened on the first call in a process and stored in the
    module-level ``cucim_obj`` for later calls.

    Args:
        inp_file: Path handed to ``CuImage`` when the handle is first opened.
        f: Callable invoked as ``f(cucim_obj, *iters)``.
        *iters: Extra positional arguments forwarded to ``f``.

    Returns:
        Whatever ``f`` returns.
    """
    global is_process_initialized, cucim_obj
    # Decide on the handle itself rather than on is_process_initialized. That flag is
    # shared with proc_init_openslide, so it can already be True while cucim_obj is
    # still None, and f would then be called with None.
    if cucim_obj is None:
        cucim_obj = CuImage(inp_file)
        # Still set the shared flag, as the original initializer did.
        is_process_initialized = True
    return f(cucim_obj, *iters)

# Process-pool benchmark, method 2: one job per patch, but each worker process opens
# the slide once (through proc_init_*) and reuses the handle for all of its jobs.
openslide_tot_time = 0
cucim_tot_time = 0
for num_workers in range(1, num_processes + 1):

    print("# of processes : {}".format(num_workers))
    openslide_time = 0

    # The parent opens the slide only to learn its dimensions; workers open their own.
    with OpenSlide(input_file) as slide:
        width, height = slide.dimensions

        # Lazily yield the (x, y) start of every patch, row by row.
        start_loc_iter = ((sx, sy)
                          for sy in range(start_location, height, patch_size)
                              for sx in range(start_location, width, patch_size))

        with Timer("  Process elapsed time (OpenSlide)") as timer:
            with concurrent.futures.ProcessPoolExecutor(
                max_workers=num_workers
            ) as executor:
                # executor.map() re-raises a worker's exception only when its result
                # is retrieved. Drain the iterator so a failed read aborts the run
                # instead of silently producing a meaningless timing.
                list(executor.map(
                    partial(proc_init_openslide, input_file, load_tile_openslide_mp),
                    start_loc_iter,
                    repeat(patch_size)
                ))
            # Freeze the end time here so the value printed by Timer on exit matches.
            openslide_time = timer.elapsed_time()
            openslide_tot_time += openslide_time

    cucim_time = 0
    # No CuImage is opened in the parent: proc_init_cucim opens one per worker.
    # The generator above is exhausted, so build a fresh one over the same grid.
    start_loc_iter = ((sx, sy)
                      for sy in range(start_location, height, patch_size)
                          for sx in range(start_location, width, patch_size))
    with Timer("  Process elapsed time (hipCIM)") as timer:
        with concurrent.futures.ProcessPoolExecutor(
            max_workers=num_workers
        ) as executor:
            # Drain the results for the same reason as in the OpenSlide run.
            list(executor.map(
                partial(proc_init_cucim, input_file, load_tile_cucim_mp),
                start_loc_iter,
                repeat(patch_size)
            ))
        cucim_time = timer.elapsed_time()
        cucim_tot_time += cucim_time
    print("  Performance gain (OpenSlide/hipCIM): {}".format(openslide_time / cucim_time))

print("Total time (OpenSlide):", openslide_tot_time)
print("Total time (hipCIM):", cucim_tot_time)
print("Average performance gain (OpenSlide/hipCIM): {}".format(openslide_tot_time / cucim_tot_time))


# of processes : 1
  Process elapsed time (OpenSlide) : 14.51807108707726
  Process elapsed time (hipCIM) : 2.574316095095128
  Performance gain (OpenSlide/hipCIM): 5.639583699429412
# of processes : 2
  Process elapsed time (OpenSlide) : 20.287414019927382
  Process elapsed time (hipCIM) : 1.631494911853224
  Performance gain (OpenSlide/hipCIM): 12.434861961587607
# of processes : 3
  Process elapsed time (OpenSlide) : 24.966769919730723
  Process elapsed time (hipCIM) : 1.2246323260478675
  Performance gain (OpenSlide/hipCIM): 20.387155710892806
# of processes : 4
  Process elapsed time (OpenSlide) : 30.499506479129195
  Process elapsed time (hipCIM) : 0.9621431808918715
  Performance gain (OpenSlide/hipCIM): 31.699550633261538
# of processes : 5
  Process elapsed time (OpenSlide) : 32.289236682932824
  Process elapsed time (hipCIM) : 0.8509540921077132
  Performance gain (OpenSlide/hipCIM): 37.94474576525766
Total time (OpenSlide): 122.56099818879738
Total time (hipCIM): 7.243540605

## Multiprocessing (method3: Fastest)

Patch requests are split into chunks, one per process, so each process receives a single job containing its list of patch requests.

In [6]:
import concurrent.futures
from itertools import repeat

import numpy as np
from openslide import OpenSlide
from cucim import CuImage

import os

# Benchmark configuration for this multiprocessing cell.
# The sweep runs with 1..num_processes worker processes. To go up to one process per
# logical CPU, enable the os.cpu_count() assignment and disable the fixed one.
# num_processes = os.cpu_count()
num_processes = 5

# Slide path, x/y offset of the first patch, and edge length of the square patches.
input_file, start_location, patch_size = "input/image.tif", 0, 256


def load_tile_openslide_chunk_mp(inp_file, start_loc_list, patch_size):
    """Open the slide once with OpenSlide and read every patch of a chunk.

    Args:
        inp_file: Path handed to ``OpenSlide``.
        start_loc_list: Iterable of patch locations, read in iteration order.
        patch_size: Edge length; each requested size is ``[patch_size, patch_size]``.

    Returns:
        None. The pixel data is dropped; only the reads themselves matter here.
    """
    with OpenSlide(inp_file) as handle:
        for corner in start_loc_list:
            # A fresh size list is built for every call, as before.
            handle.read_region(corner, 0, [patch_size] * 2)

def load_tile_cucim_chunk_mp(inp_file, start_loc_list, patch_size):
    """Open the slide once with CuImage and read every patch of a chunk.

    Args:
        inp_file: Path handed to ``CuImage``.
        start_loc_list: Iterable of patch locations, read in iteration order.
        patch_size: Edge length; each requested size is ``[patch_size, patch_size]``.

    Returns:
        None. The pixel data is dropped; only the reads themselves matter here.
    """
    handle = CuImage(inp_file)
    for corner in start_loc_list:
        # A fresh size list is built for every call, as before.
        handle.read_region(corner, [patch_size] * 2, 0)

# Process-pool benchmark, method 3: the patch list is split into at most one chunk
# per worker, and each chunk is submitted as a single job that opens the slide once.
openslide_tot_time = 0
cucim_tot_time = 0
print("Total # of processes : {}".format(num_processes))
for num_workers in range(1, num_processes + 1):

    print("# of processes : {}".format(num_workers))
    openslide_time = 0

    # The parent opens the slide only to learn its dimensions; workers open their own.
    with OpenSlide(input_file) as slide:
        width, height = slide.dimensions

        # (x, y) start of every patch, row by row.
        start_loc_data = [(sx, sy)
                          for sy in range(start_location, height, patch_size)
                              for sx in range(start_location, width, patch_size)]

        # Ceiling division gives at most num_workers chunks. Floor division leaves a
        # small remainder chunk whenever the patch count is not divisible, i.e. one
        # more job than there are workers. max(1, ...) keeps the slice step valid
        # when there are fewer patches than workers, or none at all.
        chunk_size = max(1, (len(start_loc_data) + num_workers - 1) // num_workers)

        start_loc_list_iter = [start_loc_data[i:i + chunk_size]
                               for i in range(0, len(start_loc_data), chunk_size)]

        with Timer("  Process elapsed time (OpenSlide)") as timer:
            with concurrent.futures.ProcessPoolExecutor(
                max_workers=num_workers
            ) as executor:
                # executor.map() re-raises a worker's exception only when its result
                # is retrieved. Drain the iterator so a failed read aborts the run
                # instead of silently producing a meaningless timing.
                list(executor.map(
                    load_tile_openslide_chunk_mp,
                    repeat(input_file),
                    start_loc_list_iter,
                    repeat(patch_size)
                ))
            # Freeze the end time here so the value printed by Timer on exit matches.
            openslide_time = timer.elapsed_time()
            openslide_tot_time += openslide_time

    cucim_time = 0
    # No CuImage is opened in the parent: each chunk job opens the file itself.
    # start_loc_list_iter is a list, so the same chunks are reused for hipCIM.
    with Timer("  Process elapsed time (hipCIM)") as timer:
        with concurrent.futures.ProcessPoolExecutor(
            max_workers=num_workers
        ) as executor:
            # Drain the results for the same reason as in the OpenSlide run.
            list(executor.map(
                load_tile_cucim_chunk_mp,
                repeat(input_file),
                start_loc_list_iter,
                repeat(patch_size)
            ))
        cucim_time = timer.elapsed_time()
        cucim_tot_time += cucim_time
    print("  Performance gain (OpenSlide/hipCIM): {}".format(openslide_time / cucim_time))

print("Total time (OpenSlide):", openslide_tot_time)
print("Total time (hipCIM):", cucim_tot_time)
print("Average performance gain (OpenSlide/hipCIM): {}".format(openslide_tot_time / cucim_tot_time))


Total # of processes : 5
# of processes : 1
  Process elapsed time (OpenSlide) : 13.891080098226666
  Process elapsed time (hipCIM) : 2.251591194886714
  Performance gain (OpenSlide/hipCIM): 6.169450355718582
# of processes : 2
  Process elapsed time (OpenSlide) : 19.395698260050267
  Process elapsed time (hipCIM) : 1.2421654020436108
  Performance gain (OpenSlide/hipCIM): 15.614424800546255
# of processes : 3
  Process elapsed time (OpenSlide) : 24.01676206709817
  Process elapsed time (hipCIM) : 0.8727106000296772
  Performance gain (OpenSlide/hipCIM): 27.51973227583286
# of processes : 4
  Process elapsed time (OpenSlide) : 31.90676002111286
  Process elapsed time (hipCIM) : 0.7442150446586311
  Performance gain (OpenSlide/hipCIM): 42.87303817641631
# of processes : 5
  Process elapsed time (OpenSlide) : 36.040590851102024
  Process elapsed time (hipCIM) : 0.5686894967220724
  Performance gain (OpenSlide/hipCIM): 63.37481360011056
Total time (OpenSlide): 125.25089129758999
Total tim