# Cython and Numba

In [None]:
import Cython
%load_ext Cython

# Note: To use Cython on a Windows machine, you may need to install Visual Studio
# and use it to install the Python extensions package.

In [None]:
from array import array
from random import randint

In [None]:
# Benchmark input: 10,000,000 random integers drawn from [-1000, 1000] (inclusive),
# stored in an array.array with typecode 'i' (signed C int).
arr = array('i', (randint(-1000, 1000) for _ in range(10000000)))

In [None]:
def sum_py(arr):
    """Return the sum of the elements of ``arr`` using a plain Python loop.

    Starts from the integer 0 and adds every element in iteration order.
    """
    total = 0
    for value in arr:
        total += value
    return total

In [None]:
%timeit -r3 -n5 sum(arr)
%timeit -r3 -n5 sum_py(arr)

In [None]:
%%cython --annotate
def sum_cy(arr):
    """Return the sum of the elements of ``arr``.

    The body is untyped Python code; it differs from a pure-Python version
    only in that the cell is compiled by Cython.
    """
    acc = 0
    for item in arr:
        acc += item
    return acc

In [None]:
%timeit -r3 -n5 sum(arr)
%timeit -r3 -n5 sum_cy(arr)

### Static Typing

In [None]:
%%cython
def sum_cy_st(arr):
    """Return the sum of the elements of ``arr`` using a typed accumulator.

    Only the running total is statically typed (C ``int``); ``arr`` and the
    loop variable are left untyped.
    """
    cdef int sum_ = 0
    for elem in arr:
        sum_ += elem
    return sum_

In [None]:
%timeit -r3 -n5 sum(arr)
%timeit -r3 -n5 sum_cy_st(arr)

#### Typed Memoryviews

In [None]:
%%cython
def sum_cy_tm_1(int[:] arr):
    """Return the sum of a one-dimensional ``int`` typed memoryview.

    The memoryview is iterated directly; the loop variable is left untyped
    while the running total is a C ``int``.
    """
    cdef int total = 0
    for value in arr:
        total += value
    return total

In [None]:
%timeit -r3 -n5 sum(arr)
%timeit -r3 -n5 sum_cy_st(arr)
%timeit -r2 -n3 sum_cy_tm_1(arr)

### With C-like looping

In [None]:
%%cython
from cpython cimport array
def sum_cy_tm_2(int[:] arr):
    """Return the sum of a one-dimensional ``int`` typed memoryview.

    Uses an index-based loop in which the index, the length and the running
    total are all declared as C ``int``.
    """
    cdef int idx
    cdef int length = arr.shape[0]
    cdef int total = 0
    for idx in range(length):
        total += arr[idx]
    return total

In [None]:
%timeit -r3 -n5 sum(arr)
%timeit -r3 -n5 sum_cy_st(arr)
%timeit -r3 -n5 sum_cy_tm_2(arr)

### Compile Optimizations

In [None]:
%%cython
cimport cython
from cpython cimport array

@cython.boundscheck(False)  # read/write memory directly, without checking that an index is in range
@cython.wraparound(False)  # negative indices can no longer be used
# see also: https://docs.cython.org/en/latest/src/userguide/source_files_and_compilation.html#compiler-directives
def sum_cy_co(int[::1] arr):
    """Return the sum of a contiguous one-dimensional ``int`` memoryview.

    Same index-based loop as the typed-memoryview version, compiled with
    bounds checking and negative-index wraparound switched off.
    """
    cdef int idx
    cdef int length = arr.shape[0]
    cdef int total = 0
    for idx in range(length):
        total += arr[idx]
    return total

In [None]:
%timeit -r3 -n5 sum(arr)
%timeit -r3 -n5 sum_cy_co(arr)

In [None]:
# With Memory view layout --> int[::1] when passing an array as function argument
# see also: https://cython.readthedocs.io/en/latest/src/userguide/memoryviews.html
%timeit -r3 -n5 sum(arr)
%timeit -r3 -n5 sum_cy_co(arr)

### Together with Multithreading

In [None]:
%%cython
cimport cython
from cpython cimport array

@cython.boundscheck(False)
@cython.wraparound(False)
def _sum_gil(int[::1] arr, int[:] out):
    """Sum ``arr`` and store the total in ``out[0]``.

    The result is written into the caller-supplied buffer instead of being
    returned, so the function can be used as a thread target.
    """
    cdef int idx
    cdef int length = arr.shape[0]
    cdef int total = 0

    # Toggle for the GIL experiment: enable the next line (and indent the loop
    # beneath it) to run the summation with the GIL released.
    #with nogil:
    for idx in range(length):
        total += arr[idx]

    out[0] = total
      
def sum_threads(int[::1] arr):
    """Return the sum of ``arr`` computed by two threads.

    The input is split at its midpoint; each half is summed by ``_sum_gil``
    in its own thread, which writes its partial result into one slot of a
    shared two-element ``int`` buffer. The two partial sums are then added.
    """
    cdef Py_ssize_t length = arr.shape[0]
    cdef Py_ssize_t half = length // 2
    cdef int[::1] lower = arr[:half]
    cdef int[::1] upper = arr[half:length]
    cdef int[:] partial = array.array('i', (0, 0))

    from threading import Thread
    workers = [
        Thread(target=_sum_gil, args=(lower, partial[0:1])),
        Thread(target=_sum_gil, args=(upper, partial[1:2])),
    ]

    # Start both workers before waiting on either of them.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    return partial[0] + partial[1]

In [None]:
# with GIL
%timeit -r3 -n5 sum(arr)
%timeit -r3 -n5 sum_threads(arr)

In [None]:
# with GIL released (use 'with nogil' statement)
%timeit -r3 -n5 sum(arr)
%timeit -r3 -n5 sum_threads(arr)

### Numpy (Vectorization)

In [None]:
import numpy as np

np_arr = np.asarray(arr)

%timeit -r3 -n5 sum(np_arr)
%timeit -r3 -n5 np_arr.sum()
%timeit -r3 -n5 sum_cy_co(np_arr)

### Proof that they are all working

In [None]:
(sum_py(arr), sum_cy(arr), sum(arr), sum_cy_st(arr), sum_cy_tm_1(arr), sum_cy_tm_2(arr), sum_cy_co(arr), sum_threads(arr), np_arr.sum())

## Numba

In [None]:
import os
os.environ['NUMBA_NUM_THREADS'] = '10'
import numba as nb
import numpy as np

In [None]:
arr_nb = np.random.random((4096,4096))

In [None]:
def py_sum2d(arr):
    """Return the sum of all entries of the two-dimensional array ``arr``.

    Walks every ``(row, column)`` index pair with nested Python loops and
    accumulates into a float that starts at 0.0.
    """
    total = 0.0
    for row in range(arr.shape[0]):
        for col in range(arr.shape[1]):
            total += arr[row, col]
    return total

@nb.jit
def nb_sum2d(arr):
    """Return the sum of all entries of the two-dimensional array ``arr``.

    Same nested-loop summation as the pure-Python version, compiled by
    Numba's ``jit`` decorator with its default options.
    """
    total = 0.0
    for row in range(arr.shape[0]):
        for col in range(arr.shape[1]):
            total += arr[row, col]
    return total

In [None]:
%timeit -r1 -n1 py_sum2d(arr_nb)
%timeit -r1 -n1 nb_sum2d(arr_nb)  # --> first call is slow

In [None]:
%timeit -r1 -n1 py_sum2d(arr_nb)
%timeit -r1 -n1 nb_sum2d(arr_nb)  # --> later calls are fast (already compiled)

In [None]:
@nb.jit(nopython=True)
# alternative spelling of the decorator above:
#@nb.njit
def nb_sum2d_nopython(arr):
    """Return the sum of all entries of the two-dimensional array ``arr``.

    Nested-loop summation compiled by Numba with ``nopython=True``.
    """
    total = 0.0
    for row in range(arr.shape[0]):
        for col in range(arr.shape[1]):
            total += arr[row, col]
    return total

In [None]:
%timeit -r5 -n10 nb_sum2d(arr_nb)
%timeit -r5 -n10 nb_sum2d_nopython(arr_nb)

In [None]:
@nb.jit(nopython=True, parallel=True)
def nb_sum2d_parallel(arr):
    """Return the sum of all entries of the two-dimensional array ``arr``.

    Nested-loop summation compiled by Numba with ``nopython=True`` and
    ``parallel=True``; both loops iterate over ``nb.prange``.
    """
    # NOTE(review): Numba's documentation says that only the outermost prange
    # of a nest is run in parallel - confirm via parallel_diagnostics().
    total = 0.0
    for row in nb.prange(arr.shape[0]):
        for col in nb.prange(arr.shape[1]):
            total += arr[row, col]
    return total

In [None]:
%timeit -r5 -n10 nb_sum2d(arr_nb)
%timeit -r5 -n10 nb_sum2d_parallel(arr_nb)  # --> add nb.prange (https://numba.pydata.org/numba-doc/0.11/prange.html)

In [None]:
nb_sum2d_parallel.parallel_diagnostics(level=1)