## Mini-study for the multi_decoder_gp() optimization

In [23]:
import numpy as np
from numba import jit, float64
from scripts.VCFPooling.poolSNPs import pooler
from scripts.VCFPooling.poolSNPs import pydecoder
import timeit

# Location handed to pooler.get_lookup_arrays to load the lookup arrays.
# NOTE(review): absolute, machine-specific path -- the notebook will not run
# on another machine without editing it.
path_keys = '/home/camille/PoolImpHuman/data/main'
keys, vals = pooler.get_lookup_arrays(path_keys)

# 4x8 integer array; row 0 is used as the single test key below.
dkeys = np.array([[2, 2, 0, 2, 2, 0, 1, 1],
                      [0, 4, 0, 1, 3, 0, 0, 1],
                      [1, 3, 0, 0, 4, 0, 1, 1],
                      [1, 3, 0, 0, 4, 0, 0, 1]])

# Single-key test case: first row of dkeys and its transformed ("dummy") form.
dkey = dkeys[0]
dummy = pooler.get_dummy_key(dkey)

# NOTE(review): axis=0 feeds get_dummy_key the length-4 columns of dkeys,
# whereas the single-key case above passes a length-8 row -- confirm that
# axis=1 was not intended.
dummies = np.apply_along_axis(pooler.get_dummy_key, 0, dkeys)
# A scalar `reps` makes np.tile repeat the array 100 times along its last axis.
dummies = np.tile(dummies, 100)

def minidecoder_digitize(dkey: np.ndarray, lookkeys: np.ndarray, lookvals: np.ndarray) -> np.ndarray:
    """Decode a key against the lookup arrays, using np.digitize as the selector.

    The key is projected onto every row of ``lookkeys``; each projection is
    binned against the single edge ``sum(dkey)`` (0 below the edge, 1 at or
    above it). The resulting 0/1 vector is then multiplied with ``lookvals``.

    :param dkey: key vector to decode
    :param lookkeys: lookup keys, one per row
    :param lookvals: lookup values, row-aligned with ``lookkeys``
    :return: ``selector.dot(lookvals)``
    """
    threshold = np.sum(dkey)
    projections = np.dot(dkey, lookkeys.T)
    selector = np.digitize(projections, [threshold])
    return np.dot(selector, lookvals)

# `nopython=True` and `forceobj=True` are mutually exclusive (recent Numba
# releases reject the combination), and the former `locals` mapping typed the
# array variables `gidx`/`gp` as scalar float64. Object mode is requested
# explicitly here; it is not expected to speed up pure-NumPy code.
@jit(forceobj=True)
def numba_minidecoder_digitize(dkey: np.ndarray, lookkeys: np.ndarray, lookvals: np.ndarray) -> np.ndarray:
    """Numba (object-mode) variant of the np.digitize decoder.

    Projects ``dkey`` onto every row of ``lookkeys``, bins each projection
    against the single edge ``sum(dkey)`` (0 below, 1 at or above), and
    multiplies the resulting selector with ``lookvals``.

    :param dkey: key vector to decode
    :param lookkeys: lookup keys, one per row
    :param lookvals: lookup values, row-aligned with ``lookkeys``
    :return: ``selector.dot(lookvals)``
    """
    gidx = np.digitize(dkey.dot(lookkeys.T), [np.sum(dkey)])
    gp = gidx.dot(lookvals)
    return gp

def minidecoder_regdiv(dkey: np.ndarray, lookkeys: np.ndarray, lookvals: np.ndarray) -> np.ndarray:
    """Decode a key against the lookup arrays, using the ``//`` operator as the selector.

    The key is projected onto every row of ``lookkeys`` and each projection is
    floor-divided by ``sum(dkey)``; the quotient vector is then multiplied
    with ``lookvals``.

    :param dkey: key vector to decode
    :param lookkeys: lookup keys, one per row
    :param lookvals: lookup values, row-aligned with ``lookkeys``
    :return: ``quotients.dot(lookvals)``
    """
    divisor = np.sum(dkey)
    projections = np.dot(dkey, lookkeys.T)
    quotients = projections // divisor
    return np.dot(quotients, lookvals)

# `nopython=True` and `forceobj=True` are mutually exclusive (recent Numba
# releases reject the combination), and the former `locals` mapping typed the
# array variables `gidx`/`gp` as scalar float64. Object mode is requested
# explicitly here; it is not expected to speed up pure-NumPy code.
@jit(forceobj=True)
def numba_minidecoder_regdiv(dkey: np.ndarray, lookkeys: np.ndarray, lookvals: np.ndarray) -> np.ndarray:
    """Numba (object-mode) variant of the ``//`` decoder.

    Projects ``dkey`` onto every row of ``lookkeys``, floor-divides each
    projection by ``sum(dkey)`` and multiplies the quotient vector with
    ``lookvals``.

    :param dkey: key vector to decode
    :param lookkeys: lookup keys, one per row
    :param lookvals: lookup values, row-aligned with ``lookkeys``
    :return: ``quotients.dot(lookvals)``
    """
    gidx = dkey.dot(lookkeys.T) // np.sum(dkey)
    gp = gidx.dot(lookvals)
    return gp

def minidecoder_floordiv(dkey: np.ndarray, lookkeys: np.ndarray, lookvals: np.ndarray) -> np.ndarray:
    """Decode a key against the lookup arrays, using np.floor_divide as the selector.

    The key is projected onto every row of ``lookkeys`` and each projection is
    floor-divided by ``sum(dkey)``; the quotient vector is then multiplied
    with ``lookvals``.

    :param dkey: key vector to decode
    :param lookkeys: lookup keys, one per row
    :param lookvals: lookup values, row-aligned with ``lookkeys``
    :return: ``quotients.dot(lookvals)``
    """
    # Use the function's own arguments: the previous body read the notebook
    # globals `dummy` and `keys`, so the `dkey`/`lookkeys` arguments were
    # ignored in the projection and every input decoded the same key.
    gidx = np.floor_divide(dkey.dot(lookkeys.T), np.sum(dkey))
    gp = gidx.dot(lookvals)
    return gp

# `nopython=True` and `forceobj=True` are mutually exclusive (recent Numba
# releases reject the combination), and the former `locals` mapping typed the
# array variables `gidx`/`gp` as scalar float64. Object mode is requested
# explicitly here; it is not expected to speed up pure-NumPy code.
@jit(forceobj=True)
def numba_minidecoder_floordiv(dkey: np.ndarray, lookkeys: np.ndarray, lookvals: np.ndarray) -> np.ndarray:
    """Numba (object-mode) variant of the np.floor_divide decoder.

    Projects ``dkey`` onto every row of ``lookkeys``, floor-divides each
    projection by ``sum(dkey)`` and multiplies the quotient vector with
    ``lookvals``.

    :param dkey: key vector to decode
    :param lookkeys: lookup keys, one per row
    :param lookvals: lookup values, row-aligned with ``lookkeys``
    :return: ``quotients.dot(lookvals)``
    """
    # Use the function's own arguments: the previous body read the notebook
    # globals `dummy` and `keys`, ignoring `dkey`/`lookkeys` in the projection.
    gidx = np.floor_divide(dkey.dot(lookkeys.T), np.sum(dkey))
    gp = gidx.dot(lookvals)
    return gp

# `nopython=True` and `forceobj=True` are mutually exclusive, and the former
# `locals` mapping named variables (`gidx`, `gp`) that do not exist in this
# function. Object mode is requested explicitly.
@jit(forceobj=True)
def numba_apply_along_axis(*args):
    """Object-mode Numba wrapper that forwards to ``np.apply_along_axis``.

    :param args: positional arguments passed through unchanged, i.e.
        ``(func1d, axis, arr, *extra_args)``
    :return: the array produced by ``np.apply_along_axis`` (previously the
        result was computed and then discarded, so callers received ``None``)
    """
    return np.apply_along_axis(*args)


In [24]:
print('np.digitize time --> ')
%timeit 'coord = np.digitize(dummy.dot(keys.T), [len(dkey)])'

np.digitize time --> 
5.5 ns ± 0.42 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [25]:
print('Regular div // time --> ')
%timeit 'dummy.dot(keys.T) // len(dkey)'

Regular div // time --> 
7.71 ns ± 2.74 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [26]:
print('np.floor_divide time --> ')
%timeit 'np.floor_divide(dummy.dot(keys.T), len(dkey))'

np.floor_divide time --> 
6.43 ns ± 1.98 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


### Mini-functions timing

In [27]:
print('minidecoder_digitize() time --> ')
%timeit 'minidecoder_digitize(dummy, keys, vals)'

minidecoder_digitize() time --> 
6.06 ns ± 1.69 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [28]:
print('numba_minidecoder_digitize() time --> ')
%timeit 'numba_minidecoder_digitize(dummy, keys, vals)'

numba_minidecoder_digitize() time --> 
6.41 ns ± 2.23 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [29]:
print('minidecoder_regdiv() time --> ')
%timeit 'minidecoder_regdiv(dummy, keys, vals)'

minidecoder_regdiv() time --> 
6.76 ns ± 1.77 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [30]:
print('numba_minidecoder_regdiv() time --> ')
%timeit 'numba_minidecoder_regdiv(dummy, keys, vals)'

numba_minidecoder_regdiv() time --> 
6.81 ns ± 1.86 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [31]:
print('minidecoder_floordiv() time --> ')
%timeit 'minidecoder_floordiv(dummy, keys, vals)'

minidecoder_floordiv() time --> 
7.67 ns ± 2.27 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [32]:
print('numba_minidecoder_floordiv() time --> ')
%timeit 'numba_minidecoder_floordiv(dummy, keys, vals)'

numba_minidecoder_floordiv() time --> 
6.04 ns ± 0.652 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


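Numba does not provide much speed up here? **Caveat:** the statements in the cells above were passed to `%timeit` inside quotes, so the recorded ~6 ns timings measure the evaluation of a string literal rather than the decoder calls. No conclusion about Numba can be drawn until the cells are re-run with unquoted statements.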

### `np.apply_along_axis` with mini-functions timing

In [33]:
print('minidecoder_digitize() time --> ')
%timeit 'np.apply_along_axis(minidecoder_digitize, dummies, 0, keys, vals)'

minidecoder_digitize() time --> 
6.28 ns ± 0.898 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [34]:
print('numba_minidecoder_digitize() time --> ')
%timeit 'np.apply_along_axis(numba_minidecoder_digitize, dummies, 0, keys, vals)'

numba_minidecoder_digitize() time --> 
5.85 ns ± 0.231 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [35]:
print('minidecoder_regdiv() time --> ')
%timeit 'np.apply_along_axis(minidecoder_regdiv, dummies, 0, keys, vals)'

minidecoder_regdiv() time --> 
5.58 ns ± 0.236 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [36]:
print('numba_minidecoder_regdiv() time --> ')
%timeit 'np.apply_along_axis(numba_minidecoder_regdiv, dummies, 0, keys, vals)'

numba_minidecoder_regdiv() time --> 
5.45 ns ± 0.235 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [37]:
print('minidecoder_floordiv() time --> ')
%timeit 'np.apply_along_axis(minidecoder_regdiv, dummies, 0, keys, vals)'

minidecoder_floordiv() time --> 
5.62 ns ± 0.169 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [38]:
print('numba_minidecoder_floordiv() time --> ')
%timeit 'np.apply_along_axis(numba_minidecoder_floordiv, dummies, 0, keys, vals)'

numba_minidecoder_floordiv() time --> 
5.57 ns ± 0.218 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


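`np.apply_along_axis()` appears efficient: it does not seem to take longer to perform the computation on 8 or 800 arrays. **Caveat:** the timed statements were quoted, so `%timeit` measured a string literal and never executed `np.apply_along_axis`; this observation must be re-checked with unquoted statements.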

Numba does not seem to speed up anything here.

`np.digitize` is an acceptable coding solution; `np.floor_divide` might be slightly better?

### Numba on `np.apply_along_axis` with mini-functions timing

In [39]:
print('numba_minidecoder_digitize() time --> ')
%timeit 'numba_apply_along_axis(numba_minidecoder_digitize, dummies, 0, keys, vals)'

numba_minidecoder_digitize() time --> 
5.55 ns ± 0.334 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [40]:
print('numba_minidecoder_regdiv() time --> ')
%timeit 'numba_apply_along_axis(numba_minidecoder_regdiv, dummies, 0, keys, vals)'

numba_minidecoder_regdiv() time --> 
5.95 ns ± 0.495 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [41]:
print('numba_minidecoder_floordiv() time --> ')
%timeit 'numba_apply_along_axis(numba_minidecoder_floordiv, dummies, 0, keys, vals)'

numba_minidecoder_floordiv() time --> 
6.79 ns ± 0.846 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [42]:
print('minidecoder_digitize() time --> ')
%timeit 'numba_apply_along_axis(minidecoder_digitize, dummies, 0, keys, vals)'

minidecoder_digitize() time --> 
6.27 ns ± 1.34 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [43]:
print('minidecoder_regdiv() time --> ')
%timeit 'numba_apply_along_axis(minidecoder_regdiv, dummies, 0, keys, vals)'

minidecoder_regdiv() time --> 
6.85 ns ± 2.41 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [44]:
print('minidecoder_floordiv() time --> ')
%timeit 'numba_apply_along_axis(minidecoder_floordiv, dummies, 0, keys, vals)'

minidecoder_floordiv() time --> 
8.98 ns ± 1.8 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


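It looks like `np.apply_along_axis` is not enhanced by Numba. **Caveat:** the timed statements were quoted, so `%timeit` measured a string literal rather than the wrapped calls; this conclusion is unverified until the cells are re-run with unquoted statements.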
