In [1]:
import numpy as np

from prysm.coordinates import cart_to_polar
from prysm.mathops import (sin, cos)

import dask
from dask.distributed import Client

from numba import vectorize

import ipyparallel as ipp

# dask.distributed client, constructed with no arguments.
# NOTE(review): presumably this starts/attaches to a local cluster and becomes the
# scheduler used by later .compute() calls -- confirm against the dask docs.
c = Client()
# ipyparallel client, also constructed with defaults.
# NOTE(review): presumably requires an ipyparallel cluster to already be running -- confirm.
rc = ipp.Client()
dv = rc[:]  # ipp with all workers

In [4]:
def Z45(rho, phi):
    """Evaluate the Z45 term: a degree-10 radial polynomial times sin(2 * phi).

    Parameters
    ----------
    rho : scalar or array
        radial coordinate
    phi : scalar or array
        azimuthal angle handed to ``sin``

    Returns
    -------
    scalar or array
        the radial polynomial in rho multiplied by sin(2 * phi)

    """
    # even powers of rho only; the coefficients sum to 1, so the radial part is 1 at rho = 1
    radial = (
        210 * rho**10
        - 504 * rho**8
        + 420 * rho**6
        - 140 * rho**4
        + 15 * rho**2
    )
    azimuthal = sin(2 * phi)
    return radial * azimuthal
    
def Z46(rho, phi):
    """Evaluate the Z46 term: a degree-11 radial polynomial times cos(phi).

    Parameters
    ----------
    rho : scalar or array
        radial coordinate
    phi : scalar or array
        azimuthal angle handed to ``cos``

    Returns
    -------
    scalar or array
        the radial polynomial in rho multiplied by cos(phi)

    """
    # odd powers of rho only; the coefficients sum to 1, so the radial part is 1 at rho = 1
    radial = (
        462 * rho**11
        - 1260 * rho**9
        + 1260 * rho**7
        - 560 * rho**5
        + 105 * rho**3
        - 6 * rho
    )
    azimuthal = cos(phi)
    return radial * azimuthal

def Z47(rho, phi):
    """Evaluate the Z47 term: a degree-11 radial polynomial times sin(phi).

    Parameters
    ----------
    rho : scalar or array
        radial coordinate
    phi : scalar or array
        azimuthal angle handed to ``sin``

    Returns
    -------
    scalar or array
        the radial polynomial in rho multiplied by sin(phi)

    """
    # same radial polynomial as Z46; only the azimuthal factor differs
    radial = (
        462 * rho**11
        - 1260 * rho**9
        + 1260 * rho**7
        - 560 * rho**5
        + 105 * rho**3
        - 6 * rho
    )
    azimuthal = sin(phi)
    return radial * azimuthal

def Z48(rho, phi):
    """Evaluate the Z48 term: a degree-12 polynomial in rho with no angular dependence.

    Parameters
    ----------
    rho : scalar or array
        radial coordinate
    phi : scalar or array
        azimuthal angle; unused, accepted so the signature matches Z45..Z47

    Returns
    -------
    scalar or array
        the polynomial evaluated at rho

    """
    # even powers of rho plus a constant; equals 1 at both rho = 0 and rho = 1
    radial = (
        924 * rho**12 - 2772 * rho**10 + 3150 * rho**8
        - 1680 * rho**6 + 420 * rho**4 - 42 * rho**2
        + 1
    )
    return radial


# numba: wrap each kernel with vectorize (no explicit signature is supplied)
v_Z45 = vectorize(Z45)
v_Z46 = vectorize(Z46)
v_Z47 = vectorize(Z47)
v_Z48 = vectorize(Z48)

# dask: wrap each kernel with delayed, so results are obtained later via .compute()
d_Z45 = dask.delayed(Z45)
d_Z46 = dask.delayed(Z46)
d_Z47 = dask.delayed(Z47)
d_Z48 = dask.delayed(Z48)

# send the math functions and the four kernels, by name, to every engine in the view
dv.push({
    "sin": sin,
    "cos": cos,
    "Z45": Z45,
    "Z46": Z46,
    "Z47": Z47,
    "Z48": Z48,
})
# apply ipyparallel to Z45..Z48
@dv.parallel(block=True)
def p_Z45(rho, phi):
    """Evaluate Z45 through the ipyparallel view ``dv``, blocking until the result is back."""
    # Z45 is resolved by name wherever this body executes
    remote_value = Z45(rho, phi)
    return remote_value

@dv.parallel(block=True)
def p_Z46(rho, phi):
    """Evaluate Z46 through the ipyparallel view ``dv``, blocking until the result is back."""
    # Z46 is resolved by name wherever this body executes
    remote_value = Z46(rho, phi)
    return remote_value

@dv.parallel(block=True)
def p_Z47(rho, phi):
    """Evaluate Z47 through the ipyparallel view ``dv``, blocking until the result is back."""
    # Z47 is resolved by name wherever this body executes
    remote_value = Z47(rho, phi)
    return remote_value

@dv.parallel(block=True)
def p_Z48(rho, phi):
    """Evaluate Z48 through the ipyparallel view ``dv``, blocking until the result is back."""
    # Z48 is resolved by name wherever this body executes
    remote_value = Z48(rho, phi)
    return remote_value


# number of points generated along each axis
SAMPLES = 128
x = np.linspace(-1, 1, SAMPLES)
y = np.linspace(-1, 1, SAMPLES)
# NOTE(review): x and y are 1-D vectors here, not a meshgrid. If cart_to_polar works
# elementwise, rho and phi hold only SAMPLES points (the diagonal of the square) rather
# than a SAMPLES x SAMPLES grid -- confirm this is the intended benchmark size.
rho, phi = cart_to_polar(x, y)

def compute_normal(rho, phi):
    """Sum Z45 + Z46 + Z47 + Z48 by calling the plain Python kernels directly.

    Parameters
    ----------
    rho : scalar or array
        radial coordinate
    phi : scalar or array
        azimuthal coordinate

    Returns
    -------
    scalar or array
        sum of the four terms

    """
    total = Z45(rho, phi)
    # accumulate the remaining terms onto the first result, in index order
    for term in (Z46, Z47, Z48):
        total += term(rho, phi)
    return total

def compute_numba(rho, phi):
    """Sum Z45 + Z46 + Z47 + Z48 using the numba-vectorized kernels (v_Z45..v_Z48).

    Parameters
    ----------
    rho : scalar or array
        radial coordinate
    phi : scalar or array
        azimuthal coordinate

    Returns
    -------
    scalar or array
        sum of the four terms

    """
    total = v_Z45(rho, phi)
    # accumulate the remaining terms onto the first result, in index order
    for term in (v_Z46, v_Z47, v_Z48):
        total += term(rho, phi)
    return total

def compute_dask(rho, phi):
    """Combine Z45 + Z46 + Z47 + Z48 using the dask.delayed kernels (d_Z45..d_Z48).

    Parameters
    ----------
    rho : scalar or array
        radial coordinate
    phi : scalar or array
        azimuthal coordinate

    Returns
    -------
    object
        whatever adding the delayed kernels' outputs produces; the timing cell calls
        ``.compute()`` on it to obtain the values

    """
    total = d_Z45(rho, phi)
    # same accumulation as the other variants, applied to the delayed kernels' outputs
    for term in (d_Z46, d_Z47, d_Z48):
        total += term(rho, phi)
    return total

def compute_ipp(rho, phi):
    """Sum Z45 + Z46 + Z47 + Z48 using the ipyparallel wrappers (p_Z45..p_Z48).

    Parameters
    ----------
    rho : scalar or array
        radial coordinate
    phi : scalar or array
        azimuthal coordinate

    Returns
    -------
    scalar or array
        sum of the four terms as returned by the blocking parallel calls

    """
    total = p_Z45(rho, phi)
    # each wrapper is declared with block=True, so every call returns a finished result
    for term in (p_Z46, p_Z47, p_Z48):
        total += term(rho, phi)
    return total

# warm up numba jit: exercise the vectorized path before it is timed
for warmup_pass in range(1000):
    compute_numba(rho, phi)  # result is discarded immediately

In [5]:
%%timeit
compute_normal(rho, phi)

497 µs ± 183 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [6]:
%%timeit
compute_numba(rho, phi)

41 µs ± 4.93 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [7]:
%%timeit
r = compute_dask(rho, phi)
r.compute()

46 ms ± 7.18 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
%%timeit
compute_ipp(rho, phi)

150 ms ± 6.42 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


Prysm's computations generally fall into a category where the data throughput is very high and the computation time is very low (just a few math kernels).  Numba affords the opportunity to merge these kernels, optimizing performance with @vectorize.  Dask and IPyParallel have to move the data, which incurs overhead larger than the gains of multi-core computing.  Dask appears to be about 3x as efficient as IPyParallel at that.  It is possible they would perform better where the result of the computation were e.g. the mean of the array, since the return trip transportation would be almost entirely removed by exchanging an array for a single float.  It may also be possible to only ship the rho and phi arrays once, saving more time.  Still, numba is about 1000x faster, and it is unlikely that sufficient improvement could be made to the transport to overcome this.