<a href="https://colab.research.google.com/github/applejxd/colaboratory/blob/master/numba.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## numba での高速化

[numpy での重い計算の例](https://qiita.com/gyu-don/items/9d223b007ca620e95abc)

In [1]:
import sys
# Raise the interpreter's recursion limit: the recursive ack() defined below
# is expected to nest far deeper than the default limit allows for ack(3, 10).
sys.setrecursionlimit(100000)

def ack(m, n):
    """Return the Ackermann function A(m, n), computed by plain recursion.

    Recurrence implemented here:
        A(0, n) = n + 1
        A(m, 0) = A(m - 1, 1)
        A(m, n) = A(m - 1, A(m, n - 1))

    The function is intentionally left unoptimized: it serves as a
    CPU-heavy, deeply recursive workload for timing.
    """
    if m != 0:
        # Second argument of the outer call: the constant 1 when n is zero,
        # otherwise the result of the inner recursive call.
        second = 1 if n == 0 else ack(m, n - 1)
        return ack(m - 1, second)
    return n + 1

通常時の計算時間を測定

In [2]:
import time
from contextlib import contextmanager

@contextmanager
def timer():
    """Context manager that prints the wall time spent inside its block.

    The elapsed time (in seconds, measured with ``time.perf_counter``) is
    printed as ``Elapsed: <seconds>`` when the block exits. The report is
    emitted from a ``finally`` clause so it is still printed when the timed
    code raises; the exception then propagates unchanged.

    Yields:
        None
    """
    start = time.perf_counter()
    try:
        yield None
    finally:
        # Report even on failure so a crashing benchmark still shows its cost.
        print('Elapsed:', time.perf_counter() - start)

# Baseline measurement: time one call of the pure-Python ack(3, 10).
# timer() prints the elapsed seconds when the with-block exits.
with timer():
    ack(3, 10)

Elapsed: 15.715511877999994


## njit: numba の nopython モード

In [3]:
from numba import njit

@njit
def ack(m, n):
    """Ackermann function A(m, n), compiled by numba in nopython mode.

    Same recurrence as the pure-Python version it replaces:
        A(0, n) = n + 1
        A(m, 0) = A(m - 1, 1)
        A(m, n) = A(m - 1, A(m, n - 1))
    """
    if m == 0:
        # Non-recursive base case.
        return n + 1
    elif n == 0:
        return ack(m - 1, 1)
    else:
        return ack(m - 1, ack(m, n - 1))

# First call: the measured time includes compilation
with timer():
    ack(3, 10)

# Second call: compilation time is not included
with timer():
    ack(3, 10)

Elapsed: 1.094479265000004
Elapsed: 0.2952723059999869


### 並列化 & fastmath


更に高速化するには[追加オプション](https://numba.readthedocs.io/en/stable/user/performance-tips.html)を使用する

In [4]:
import numpy as np
from numba import prange


@njit
def sum_of_squares(arr):
    """Return the sum of ``arr[i] ** 2`` over the first axis of ``arr``.

    Serial nopython-mode version, used as the reference for the parallel
    and fastmath variants.
    """
    total = 0
    length = arr.shape[0]
    for idx in range(length):
        total += arr[idx] ** 2
    return total

@njit(parallel=True)
def sum_of_squares_parallel(arr):
    """Return the sum of ``arr[i] ** 2`` over the first axis of ``arr``.

    Same computation as the serial version, but compiled with
    ``parallel=True`` and iterating with ``prange`` so the loop iterations
    may be distributed across threads; the ``+=`` accumulation is the
    loop's reduction variable.
    """
    total = 0
    length = arr.shape[0]
    for idx in prange(length):
        total += arr[idx] ** 2
    return total

@njit(parallel=True, fastmath=True)
def sum_of_squares_fast(arr):
    """Return the sum of ``arr[i] ** 2`` over the first axis of ``arr``.

    Parallel ``prange`` version additionally compiled with
    ``fastmath=True``, which lets the compiler relax strict floating-point
    semantics (see the numba performance tips linked in this notebook), so
    the result may differ in the last bits from the other variants.
    """
    total = 0
    length = arr.shape[0]
    for idx in prange(length):
        total += arr[idx] ** 2
    return total

# Benchmark input: one million samples drawn by np.random.randn.
arr = np.random.randn(1000000)

# Each variant is called once before being timed so that the first-call
# compilation cost is kept out of the measurement.
sum_of_squares(arr)
with timer():
    sum_of_squares(arr)
    
# Parallel (prange) variant: warm-up call, then timed call.
sum_of_squares_parallel(arr)
with timer():
    sum_of_squares_parallel(arr)

# Parallel + fastmath variant: warm-up call, then timed call.
sum_of_squares_fast(arr)
with timer():
    sum_of_squares_fast(arr)

Elapsed: 0.0013868989999963333
Elapsed: 0.0007452540000087993
Elapsed: 0.0026067459999978837


### numba.cuda

[GPUを意識したプログラミング](https://co-crea.jp/wp-content/uploads/2016/07/File_2.pdf)が必要：
- [グリッド・ブロック中のスレッド位置の取得方法](https://numba.pydata.org/numba-doc/latest/cuda/kernels.html#absolute-positions)
- [CPU・GPU間のデータ転送方法](https://numba.pydata.org/numba-doc/latest/cuda/memory.html)

In [6]:
from numba import cuda
import numpy as np
import sys
# NOTE(review): nothing visible in this cell recurses; this limit bump looks
# carried over from the Ackermann cell - confirm whether it is still needed.
sys.setrecursionlimit(100000)


# Kernel function
@cuda.jit
def add_kernel(a, b, c):
    """Element-wise addition kernel: ``c[i] = a[i] + b[i]``.

    Each thread handles the single element at its absolute position in the
    1-D grid, as returned by ``cuda.grid(1)``.

    Args:
        a, b: input device arrays, indexed along one dimension.
        c: output device array that receives the sums.
    """
    i = cuda.grid(1)
    # The launcher rounds the block count up, so the grid can contain more
    # threads than elements; surplus threads must not touch memory.
    if i < c.size:
        c[i] = a[i] + b[i]

# Launcher function (runs on the host)
def add_arrays(a, b, threads_per_block=256):
    """Add two host arrays element-wise on the GPU and return the result.

    Args:
        a: first input array; its size determines the launch configuration
            and its shape/dtype determine the result buffer.
        b: second input array, added element-wise to ``a``.
        threads_per_block: number of threads per block (typically 128-512).

    Returns:
        A host array holding ``a + b``, copied back from the device.
    """
    # Number of blocks needed to cover every element (ceiling division).
    blocks = (a.size + threads_per_block - 1) // threads_per_block

    # Allocate the result buffer directly on the device. The kernel writes
    # every element, so there is no need to build zeros on the host and
    # transfer them.
    result = cuda.device_array_like(a)
    add_kernel[blocks, threads_per_block](
        cuda.to_device(a), cuda.to_device(b), result)
    return result.copy_to_host()

# Benchmark inputs: two float32 arrays of ones with array_size elements each.
array_size = 100000000
a = np.ones(array_size, dtype=np.float32)
b = np.ones(array_size, dtype=np.float32)

# CPU reference: NumPy element-wise addition (result is discarded).
with timer():
    a + b

# GPU version: one untimed warm-up call, then the timed call. The timed call
# covers everything add_arrays does, including the host<->device copies.
add_arrays(a, b)
with timer():
    add_arrays(a, b)

Elapsed: 0.15795063500000595
Elapsed: 0.3968715780000025


## concurrent.futures での高速化

CPU 数を確認

In [2]:
import os

# Number of logical CPUs reported by the OS.
print(os.cpu_count())
# Number of CPUs this process is allowed to run on. os.sched_getaffinity is
# only provided on some Unix platforms, so fall back to the plain CPU count
# where it is missing instead of raising AttributeError.
if hasattr(os, "sched_getaffinity"):
    print(len(os.sched_getaffinity(0)))
else:
    print(os.cpu_count())

import multiprocessing
# Left as the cell's final expression so the notebook displays its value.
multiprocessing.cpu_count()

2
2


2

max_workers の最適値を見つけるサンプル

In [9]:
import numpy as np
import time
import concurrent.futures

def do_something(size):
    """Return the matrix product of two ``size`` x ``size`` all-ones matrices.

    Used purely as a CPU-bound workload for the worker-count benchmark.
    """
    shape = (size, size)
    lhs = np.ones(shape)
    rhs = np.ones(shape)
    return np.dot(lhs, rhs)

# Pool sizes to compare.
worker_values = [1, 2, 4, 8, 16, 32, 64]
# Workload: ten identical tasks, each a 1000 x 1000 matrix product.
tasks = [1000] * 10

for max_workers in worker_values:
    # perf_counter is monotonic and high-resolution, unlike time.time(),
    # which can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    # The timed region includes starting and shutting down the process pool.
    with concurrent.futures.ProcessPoolExecutor(max_workers) as executor:
        results = [executor.submit(do_something, task) for task in tasks]
        for f in concurrent.futures.as_completed(results):
            # result() re-raises any exception that occurred in the worker.
            f.result()
    total_time = time.perf_counter() - start_time
    print(f'max_workers={max_workers} finished in {total_time:.2f} seconds.')

max_workers=1 finished in 0.81 seconds.
max_workers=2 finished in 0.93 seconds.
max_workers=4 finished in 1.04 seconds.
max_workers=8 finished in 1.05 seconds.
max_workers=16 finished in 1.10 seconds.
max_workers=32 finished in 1.35 seconds.
max_workers=64 finished in 1.83 seconds.
