# numba 的基本用法

## 使用 jit 加速 Python 低效的 for 语句

In [1]:
import numba as nb
import numpy as np

def add1(x, c):
    """Add ``c`` to every element of ``x`` with an explicit Python loop.

    This is the pure-Python baseline for the timings in this cell: the
    output list is preallocated and then filled one slot at a time.

    Args:
        x: Sized iterable of numbers.
        c: Value added to each element.

    Returns:
        A new list holding ``element + c`` for each element of ``x``.
    """
    out = [0.] * len(x)
    pos = 0
    for value in x:
        out[pos] = value + c
        pos += 1
    return out

def add2(x, c):
    """Add ``c`` to every element of ``x`` using a list comprehension.

    Args:
        x: Iterable of numbers.
        c: Value added to each element.

    Returns:
        A new list holding ``element + c`` for each element of ``x``.
    """
    shifted = [value + c for value in x]
    return shifted

@nb.jit(nopython=True)
def add_with_jit(x, c):
    """Add ``c`` to every element of ``x``; same loop as ``add1``, compiled by numba.

    The output list is seeded with the float literal ``0.`` so that it is
    created as a list of floats.

    Args:
        x: Sized iterable of numbers (the notebook passes a list of floats).
        c: Value added to each element.

    Returns:
        A list holding ``element + c`` for each element of ``x``.
    """
    n = len(x)
    out = [0.] * n
    for pos, value in enumerate(x):
        out[pos] = value + c
    return out

@nb.jit(nopython=True)
def wrong_add(x, c):
    """Deliberately flawed twin of ``add_with_jit`` that shows a typing pitfall.

    The only difference from ``add_with_jit`` is that the output list is
    seeded with the integer literal ``0`` instead of ``0.``. The notebook's
    check ``np.allclose(wrong_add(x, 1), 1)`` prints ``True`` for inputs
    drawn from ``np.random.random``, i.e. every result comes back as ``1``
    and the fractional part of ``xx + c`` is lost.

    Args:
        x: Sized iterable of numbers.
        c: Value added to each element.

    Returns:
        The list ``rs`` after every slot has been assigned ``xx + c``.
    """
    # Integer seed on purpose: this is the mistake being demonstrated.
    rs = [0] * len(x)
    for i, xx in enumerate(x):
        rs[i] = xx + c
    return rs

y = np.random.random(10**5).astype(np.float32)
x = y.tolist()

assert np.allclose(add1(x, 1), add2(x, 1), add_with_jit(x, 1))
%timeit add1(x, 1)
%timeit add2(x, 1)
%timeit add_with_jit(x, 1)
print(np.allclose(wrong_add(x, 1), 1))

10 ms ± 127 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5.77 ms ± 229 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
3.66 ms ± 122 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
True


+ 注意：
    + `numba`不支持 list comprehension，详情可参见[这里](https://github.com/numba/numba/issues/504)
    + `jit`会在某种程度上“预编译”你的代码，这意味着它会在某种程度上固定住各个变量的数据类型；所以在`jit`下定义数组时，如果想要的是`float`数组，就不能像上述`wrong_add`里那样用`[0] * len(x)`定义，而应该在`0`后面加一个小数点：`[0.] * len(x)`。否则结果会被截断成整数——上面`wrong_add(x, 1)`的输出全部等于`1`（所以最后一行打印了`True`），就是这个原因
    + `jit`能够加速的不限于`for`，但一般而言加速`for`会比较常见、效果也比较显著。我在我实现的`numpy`版本的卷积神经网络（`CNN`）中用了`jit`后、可以把代码加速 **20** 倍左右（具体代码可以参见[这里](https://github.com/carefree0910/MachineLearning/blob/master/NN/Basic/Layers.py#L9)）

## 使用 vectorize 实现 numpy 的 Ufunc 功能

### vectorize 的基本应用

In [2]:
# Sanity check: the jitted list version must agree with numpy's array add.
assert np.allclose(y + 1, add_with_jit(x, 1))
# Compare the jitted loop over the Python list `x` with numpy's `+` on the
# float32 array `y`.
%timeit add_with_jit(x, 1)
%timeit y + 1

3.33 ms ± 233 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
21.6 µs ± 209 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [3]:
@nb.vectorize("float32(float32, int32)", nopython=True)
def add_with_vec(yy, c):
    """Return ``yy + c``; ``nb.vectorize`` applies this scalar body element-wise.

    The signature string declares a float32 first operand, an int32 second
    operand and a float32 result.
    """
    return yy + c

# Validate the vectorized kernel (integer addend) against numpy, then time both.
assert np.allclose(y + 1, add_with_vec(y, 1))
%timeit add_with_vec(y, 1)
%timeit y + 1

76.6 µs ± 410 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
21.3 µs ± 236 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [4]:
@nb.vectorize("float32(float32, float32)", nopython=True)
def add_with_vec(yy, c):
    """Return ``yy + c``; ``nb.vectorize`` applies this scalar body element-wise.

    The signature string declares two float32 operands and a float32 result.
    This definition replaces the earlier ``add_with_vec`` of the same name.
    """
    return yy + c

# Validate the vectorized kernel (float addend `1.`) against numpy, then time both.
assert np.allclose(y + 1, add_with_vec(y, 1.))
%timeit add_with_vec(y, 1.)
%timeit y + 1

53.1 µs ± 2.72 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
21.3 µs ± 235 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [5]:
@nb.vectorize([
    "float32(float32, int32)",
    "float32(float32, float32)"
], nopython=True)
def add_with_vec(yy, c):
    """Return ``yy + c``; ``nb.vectorize`` applies this scalar body element-wise.

    Two signatures are registered, so the second operand may be int32 or
    float32; the result is declared float32 in both cases.
    """
    return yy + c

assert np.allclose(y + 1, add_with_vec(y, 1), add_with_vec(y, 1.))
%timeit add_with_vec(y, 1)
%timeit add_with_vec(y, 1.)
%timeit y + 1
%timeit y + 1.

76.5 µs ± 339 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
50.8 µs ± 2.57 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
20.8 µs ± 155 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
20.7 µs ± 308 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


### vectorize 的“并行”版本

In [6]:
@nb.vectorize("float32(float32, float32)", target="parallel", nopython=True)
def add_with_vec(y, c):
    """Return ``y + c``; same kernel as before, built with ``target="parallel"``.

    The signature string declares two float32 operands and a float32 result.
    Note that the parameter ``y`` shadows the module-level array ``y`` inside
    this function.
    """
    return y + c

# Validate the target="parallel" kernel against numpy, then time both.
assert np.allclose(y+1, add_with_vec(y,1.))
%timeit add_with_vec(y, 1.)
%timeit y + 1

72.3 µs ± 2.43 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
20.7 µs ± 214 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


+ 注意：并非所有版本的`numba`、`numpy`都能在上述`parallel`下获得如此明显的性能提升；事实上：
    + 上述测试结果基于 Intel Distribution for Python
    + 在默认的`Python3.6.1`环境下测试时，发现使用`parallel`是会更慢的……

In [7]:
@nb.vectorize("float32(float32, float32, float32)", target="parallel", nopython=True)
def clip_with_parallel(y, a, b):
    """Clamp the scalar ``y`` to ``[a, b]``; built with ``target="parallel"``.

    The lower bound is tested first, so ``a`` is returned whenever
    ``y < a``; otherwise ``b`` is returned when ``y > b``; otherwise ``y``
    is returned unchanged.
    """
    if y < a:
        clipped = a
    elif y > b:
        clipped = b
    else:
        clipped = y
    return clipped

@nb.vectorize("float32(float32, float32, float32)", nopython=True)
def clip(y, a, b):
    """Clamp the scalar ``y`` to ``[a, b]``.

    The lower bound is tested first, so ``a`` is returned whenever
    ``y < a``; otherwise ``b`` is returned when ``y > b``; otherwise ``y``
    is returned unchanged.
    """
    if y < a:
        clipped = a
    elif y > b:
        clipped = b
    else:
        clipped = y
    return clipped

assert np.allclose(np.clip(y, 0.1, 0.9), clip(y, 0.1, 0.9), clip_with_parallel(y, 0.1, 0.9))
%timeit clip_with_parallel(y, 0.1, 0.9)
%timeit clip(y, 0.1, 0.9)
%timeit np.clip(y, 0.1, 0.9)

95.8 µs ± 3.45 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
101 µs ± 433 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
352 µs ± 14.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


+ 注意：这个例子中的性能提升就是实打实的了（两个`numba`版本都比`np.clip`快 3 倍以上）

总之，使用`parallel`时不能一概而论，还是要做些实验

### 使用 jit(nogil=True) 实现高效并发（多线程）

In [8]:
import math

def np_func(a, b):
    """Compute ``1 / (a + exp(-b))`` element-wise with numpy.

    This is the pure-numpy reference for the numba kernels in this cell.

    Args:
        a: Array-like or scalar added to ``exp(-b)``.
        b: Array-like or scalar whose negation is exponentiated.

    Returns:
        The element-wise result of ``1 / (a + np.exp(-b))``.
    """
    denominator = a + np.exp(-b)
    return 1 / denominator

@nb.jit('void(float32[:], float32[:], float32[:])', nopython=True, nogil=False)
def kernel1(result, a, b):
    """Write ``1 / (a[i] + exp(-b[i]))`` into ``result[i]`` for every index.

    Compiled with ``nogil=False``. ``result`` is filled in place and nothing
    is returned; ``a`` and ``b`` are indexed over ``range(len(result))``.
    """
    n = len(result)
    for idx in range(n):
        denominator = a[idx] + math.exp(-b[idx])
        result[idx] = 1 / denominator
                
@nb.jit('void(float32[:], float32[:], float32[:])', nopython=True, nogil=True)
def kernel2(result, a, b):
    """Write ``1 / (a[i] + exp(-b[i]))`` into ``result[i]`` for every index.

    Same loop as ``kernel1`` but compiled with ``nogil=True``. ``result`` is
    filled in place and nothing is returned; ``a`` and ``b`` are indexed
    over ``range(len(result))``.
    """
    n = len(result)
    for idx in range(n):
        denominator = a[idx] + math.exp(-b[idx])
        result[idx] = 1 / denominator


from concurrent.futures import ThreadPoolExecutor

def make_single_task(kernel):
    """Wrap an in-place ``kernel`` so that it allocates and returns its output.

    Args:
        kernel: Callable invoked as ``kernel(result, *args)`` that writes
            into ``result``.

    Returns:
        A function ``func(length, *args)`` that creates an uninitialised
        float32 array of ``length`` elements, runs ``kernel`` on it once in
        the calling thread, and returns the array.
    """
    def func(length, *args):
        out = np.empty(length, dtype=np.float32)
        call_args = (out,) + args
        kernel(*call_args)
        return out
    return func

def make_multi_task(kernel, n_thread):
    """Wrap an in-place ``kernel`` so that it runs over ``n_thread`` chunks in a thread pool.

    Args:
        kernel: Callable invoked as ``kernel(result_chunk, *arg_chunks)``
            that writes into ``result_chunk``.
        n_thread: Number of chunks and the pool's ``max_workers``.

    Returns:
        A function ``func(length, *args)`` that creates an uninitialised
        float32 array of ``length`` elements, slices it and every argument
        into ``n_thread`` consecutive pieces of ``ceil(length / n_thread)``
        elements, maps ``kernel`` over the pieces, and returns the array.
    """
    def func(length, *args):
        result = np.empty(length, dtype=np.float32)
        buffers = (result,) + args
        # Ceiling division so the chunks together cover all `length` elements.
        step = (length + n_thread - 1) // n_thread
        spans = [slice(k * step, (k + 1) * step) for k in range(n_thread)]
        pieces = [[buf[span] for span in spans] for buf in buffers]
        with ThreadPoolExecutor(max_workers=n_thread) as pool:
            # Consume the iterator so every chunk's outcome (or exception)
            # is collected before the result is returned.
            list(pool.map(kernel, *pieces))
        return result
    return func

length = 10 ** 6
a = np.random.rand(length).astype(np.float32)
b = np.random.rand(length).astype(np.float32)

nb_func1 = make_single_task(kernel1)
nb_func2 = make_multi_task(kernel1, 4)
nb_func3 = make_single_task(kernel2)
nb_func4 = make_multi_task(kernel2, 4)

rs_np = np_func(a, b)
rs_nb1 = nb_func1(length, a, b)
rs_nb2 = nb_func2(length, a, b)
rs_nb3 = nb_func3(length, a, b)
rs_nb4 = nb_func4(length, a, b)
assert np.allclose(rs_np, rs_nb1, rs_nb2, rs_nb3, rs_nb4)
%timeit np_func(a, b)
%timeit nb_func1(length, a, b)
%timeit nb_func2(length, a, b)
%timeit nb_func3(length, a, b)
%timeit nb_func4(length, a, b)

12.3 ms ± 298 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
7.1 ms ± 43.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
9.58 ms ± 57.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
7.29 ms ± 135 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
3.54 ms ± 118 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


+ 注意：一般来说，数据量越大，并发的效果越明显；反之，数据量小的时候，并发很有可能会降低性能