In [1]:
import numpy as np
%matplotlib notebook
import matplotlib.pyplot as plt
import os
from numba import jit, njit

In [2]:
os.environ['NUMBA_THREADING_LAYER'] = 'omp' # my TBB version complains
os.environ['NUMBA_NUM_THREADS'] = '3'
os.environ['NUMBA_ENABLE_AVX'] = '1'
os.environ['NUMBA_CPU_NAME'] = 'generic'
os.environ['NUMBA_CPU_FEATURES'] = '+sse,+sse2,+avx,+avx2,+avx512f,+avx512dq'
#os.environ['NUMBA_CPU_FEATURES'] = '+sse,+sse2,+avx,+avx2'

In [3]:
from craft import craco_plan, craco, uvfits
from craco.preprocess import Calibrate, normalise

In [18]:
f = uvfits.open("/data/craco/wan342/tmp/transfer/b04.uvfits")

values = craco_plan.get_parser().parse_args([])
calpath = "/data/craco/gup037/test_runs_of_craco_pipeline/test_mask_filterbank_writer/cal/b04.aver.4pol.smooth.npy"

In [19]:
plan = craco_plan.PipelinePlan(f, values)



In [26]:
blocker = f.fast_time_blocks(nt=256, fetch_uvws=False)

In [27]:
%timeit block0, uvws0 = next(blocker)

198 ms ± 11 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
block0 = block0.squeeze()


In [30]:
block0.shape

(253, 120, 256)

In [31]:
%timeit normalise(block0)

675 ms ± 5.29 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [32]:
%timeit block0.std(axis=-1)

397 ms ± 6.12 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [42]:
%timeit block0.flatten().std()

423 ms ± 6.31 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [38]:
from numba import njit, jit

In [39]:
@jit
def get_rms_numba(block):
    return block.std(axis=-1)

In [40]:
%timeit get_rms_numba(block0)

Compilation is falling back to object mode WITH looplifting enabled because Function "get_rms_numba" failed type inference due to: - Resolution failure for literal arguments:
AssertionError()
- Resolution failure for non-literal arguments:
AssertionError()

During: resolving callee type: BoundFunction(array.std for array(complex64, 3d, A))
During: typing of call at <ipython-input-39-6140fdd5c362> (3)


File "<ipython-input-39-6140fdd5c362>", line 3:
def get_rms_numba(block):
    return block.std(axis=-1)
    ^

  @jit

File "<ipython-input-39-6140fdd5c362>", line 2:
@jit
def get_rms_numba(block):
^

  state.func_ir.loc))
Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information visit https://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit

File "<ipython-input-39-6140fdd5c362>", line 2:
@jit
def get_rms_numba(block):
^


403 ms ± 3.06 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [67]:
x = np.random.normal(0, 1, block0.size).reshape(block0.shape)

%timeit x.std(axis = -1)

38.3 ms ± 367 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [68]:
%timeit block0.std(axis=-1)

288 ms ± 2.66 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [69]:
type(block0)

numpy.ma.core.MaskedArray

In [85]:
block0.fill_value = np.nan

In [86]:
tblock0 = block0.filled()

In [78]:
%timeit tblock0.std(axis=-1)

51.2 ms ± 380 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [79]:
tblock0.dtype, x.dtype

(dtype('complex64'), dtype('float64'))

In [80]:
%timeit tblock0.mean(axis=-1)

10.7 ms ± 339 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [145]:
%timeit np.nanstd(tblock0, axis=-1)

  keepdims=keepdims)


110 ms ± 331 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [146]:
%timeit np.nanmean(tblock0, axis=-1)

  """Entry point for launching an IPython kernel.


51.8 ms ± 349 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [228]:
@njit(parallel=True, fastmath = True)
def get_numba_mean_and_rms_masked(block):
    nbl, nf, nt = block.shape
    Ai = np.ascontiguousarray(np.zeros((nbl, nf), dtype='complex64'))
    Qi = np.ascontiguousarray(np.zeros((nbl, nf), dtype='complex64'))
    N = np.ascontiguousarray(np.ones((nbl, nf), dtype='int16'))
    
    
    for ibl in range(nbl):
        for iff in range(nf):
            for it in range(nt):
                #tref = time.time()
                #non_nans = block[..., it] != np.nan
                isamp = block[ibl, iff, it]
                if isamp != np.nan:
                    #t1 = time.time()
                    Qi[ibl, iff] = Qi[ibl, iff] + (N[ibl, iff]-1) / N[ibl, iff] * (isamp - Ai[ibl, iff])**2
                    #t2 = time.time()
                    Ai[ibl, iff] = Ai[ibl, iff] + (isamp - Ai[ibl, iff]) / N[ibl, iff]
                    #t3 = time.time()
                    N[ibl, iff] = N[ibl, iff] + 1
                    #t4 = time.time()
                    #print((t1 - tref)*1e3, (t2-t1)*1e3, (t3-t2)*1e3, (t4-t3)*1e3, (t4 - tref)*1e3)
    
    return Ai, Qi, N
    #mean = Ai
    #rms = np.sqrt(Qi / N)
    #return mean, rms
        
        
    
    

In [231]:
%timeit x = get_numba_mean_and_rms_masked(np.ascontiguousarray(xblock))

238 ms ± 36.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [149]:
np.ascontiguousarray?

In [142]:
tblock0.T.shape

(256, 120, 253)

In [176]:
import multiprocessing as mp

In [179]:
p1 = mp.Process(target=get_numba_mean_and_rms_masked, args=(tblock0,))

In [180]:
p1.start()

In [181]:
p1.join()

In [182]:
p1.pid

2257

In [183]:
import os
os.fork?

In [184]:
from multiprocessing import shared_memory

ImportError: cannot import name 'shared_memory'

In [187]:
type(block0)

numpy.ma.core.MaskedArray

In [188]:
type(tblock0)

numpy.ndarray

## Time taken to do the sky subtraction

In [194]:
%timeit tblock0 - np.ones((tblock0.shape[0], tblock0.shape[1]))[:,:, None]

61.9 ms ± 1.43 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Time taken to do the calubration

In [195]:
%timeit tblock0 * (np.ones((tblock0.shape[0], tblock0.shape[1])) + 1j* np.ones((tblock0.shape[0], tblock0.shape[1])))[:, :, None]

65.6 ms ± 1.12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [196]:
tblock0.shape

(253, 120, 256)

In [197]:
tblock0.dtype

dtype('complex64')

In [198]:
%timeit tblock0 * tblock0

24.8 ms ± 466 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [199]:
tblock0.size

7772160

In [214]:
xblock = np.zeros((435, 288, 256), dtype='complex64')
xblock_c = xblock.copy()

In [215]:
%timeit xblock * xblock_c

102 ms ± 1.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [206]:
%timeit xblock * xcal

254 ms ± 5.21 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [207]:
xflat = xblock.flatten()

In [213]:
%timeit xflat * xflat

126 ms ± 446 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [210]:
25 / tblock0.size

3.216609025032938e-06

In [211]:
xblock.flatten?

In [217]:
xblock.nbytes / 1e9

0.25657344

In [250]:
cblock = np.zeros((288, 435, 256), dtype='complex64')

In [352]:
import llvmlite.binding as llvm
llvm.set_option('', '--debug-only=loop-vectorize')

In [363]:
@njit(fastmath = True)
def my_abs(arr):
    n1, n2, n3 = arr.shape
    out = np.zeros(arr.shape, dtype='float32')
    for i1 in range(n1):
        for i2 in range(n2):
            for i3 in range(n3):
                isamp = arr[i1, i2, i3]
                out[i1, i2, i3] = isamp.real**2 + isamp.imag**2
                
    return out

@njit
def my_abs2(d,dout):
    for i in range(d.shape[0]):
        x = d[i]
        dout[i] = x.real**2 + x.imag**2
        
    return dout

In [353]:
@njit(fastmath=True)
def get_numba_mean_rms_and_cas(cblock):
    nf, nbl, nt = cblock.shape
    cas = np.zeros((nf, nt), dtype='float32')
    Ai = np.ascontiguousarray(np.zeros((nf, nbl), dtype='complex64'))
    Qi = np.ascontiguousarray(np.zeros((nf, nbl), dtype='complex64'))
    N = np.ascontiguousarray(np.ones((nf, nbl), dtype='int16'))
    cblock_abs = np.zeros(cblock.shape, dtype='float32')
    cblock_abs = np.abs(cblock)
    #cas_Ai = np.zeros(nf, )
    
    for iff in range(nf):
        for it in range(nt):
            for ibl in range(nbl):
                isamp = cblock[iff, ibl, it]
                if isamp != np.nan:
                    Qi[iff, ibl] = Qi[iff, ibl] + (N[iff, ibl]-1) / N[iff, ibl] * (isamp - Ai[iff, ibl])**2
                    Ai[iff, ibl] = Ai[iff, ibl] + (isamp - Ai[iff, ibl]) / N[iff, ibl]
                    N[iff, ibl] = N[iff, ibl] + 1
                    #v = np.abs(isamp)
                    #v = isamp
                    #cas[iff, it] = cas[iff, it] + v
    return Ai, Qi, N, cas
        

In [354]:
_ = get_numba_mean_rms_and_cas(cblock)
%timeit get_numba_mean_rms_and_cas(cblock)

498 ms ± 6.54 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [355]:
xblock.size / tblock0.size

4.126482213438735

In [371]:
cblock_flat = cblock.flatten()

In [375]:
test = np.zeros((435, 288, 256), dtype='complex64')

In [376]:
%timeit cblock_abs = np.abs(test)

59.3 ms ± 1.07 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [364]:
_ = my_abs(cblock)
%timeit mycblock_abs = my_abs(cblock)

110 ms ± 815 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [358]:
nf, nbl, nt = cblock.shape
Ai = np.ascontiguousarray(np.zeros((nf, nbl), dtype='complex64'))
Qi = np.ascontiguousarray(np.zeros((nf, nbl), dtype='complex64'))
N = np.ascontiguousarray(np.ones((nf, nbl), dtype='int16'))

In [359]:
%timeit get_numba_mean_rms_and_cas(cblock, Ai, Qi, N)

TypeError: too many arguments: expected 1, got 4

In [None]:
from iqrm import iqrm_mask

In [341]:
iqrm_mask?

In [345]:
%timeit masks = iqrm_mask(np.random.normal(0, 1, nf), radius = 120)

3.91 ms ± 23.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [344]:
masks

(array([False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
      

In [348]:
def convert_to_16bit(arr):
    dout_real = arr.real.astype('int16')
    dout_imag =

In [351]:
my_abs.inspect_types(pretty=True)



0
label 0
"arr = arg(0, name=arr) :: array(complex64, 3d, C)"
"$0.2 = getattr(value=arr, attr=shape) :: UniTuple(int64 x 3)"
"$0.6 = exhaust_iter(value=$0.2, count=3) :: UniTuple(int64 x 3)"
del $0.2
"$0.3 = static_getitem(value=$0.6, index=0, index_var=None, fn=<built-in function getitem>) :: int64"
"$0.4 = static_getitem(value=$0.6, index=1, index_var=None, fn=<built-in function getitem>) :: int64"
"$0.5 = static_getitem(value=$0.6, index=2, index_var=None, fn=<built-in function getitem>) :: int64"
del $0.6
n1 = $0.3 :: int64

0
label 0
"arr = arg(0, name=arr) :: array(complex64, 3d, C)"
"$0.2 = getattr(value=arr, attr=shape) :: UniTuple(int64 x 3)"
"$0.6 = exhaust_iter(value=$0.2, count=3) :: UniTuple(int64 x 3)"
del $0.2
"$0.3 = static_getitem(value=$0.6, index=0, index_var=None, fn=<built-in function getitem>) :: int64"
"$0.4 = static_getitem(value=$0.6, index=1, index_var=None, fn=<built-in function getitem>) :: int64"
"$0.5 = static_getitem(value=$0.6, index=2, index_var=None, fn=<built-in function getitem>) :: int64"
del $0.6
n1 = $0.3 :: int64

0
$0.7 = global(np: <module 'numpy' from '/home/gup037/miniconda3/envs/craco/lib/python3.6/site-packages/numpy/__init__.py'>) :: Module(<module 'numpy' from '/home/gup037/miniconda3/envs/craco/lib/python3.6/site-packages/numpy/__init__.py'>)
"$0.8 = getattr(value=$0.7, attr=zeros) :: Function(<built-in function zeros>)"
del $0.7
"$0.10 = getattr(value=arr, attr=shape) :: UniTuple(int64 x 3)"
"$const0.11 = const(str, float32) :: Literal[str](float32)"
"out = call $0.8($0.10, func=$0.8, args=[Var($0.10, <ipython-input-335-3e79df9280e4>:4)], kws=[('dtype', Var($const0.11, <ipython-input-335-3e79df9280e4>:4))], vararg=None) :: (UniTuple(int64 x 3), Literal[str](float32)) -> array(float32, 3d, C)"
del $const0.11
del $0.8
del $0.10

0
jump 30
label 30
jump 32
label 32
$32.1 = global(range: <class 'range'>) :: Function(<class 'range'>)
"$32.3 = call $32.1(n1, func=$32.1, args=[Var(n1, <ipython-input-335-3e79df9280e4>:3)], kws=(), vararg=None) :: (int64,) -> range_state_int64"
del n1
del $32.1
$32.4 = getiter(value=$32.3) :: range_iter_int64
del $32.3

0
jump 44
label 44
jump 46
label 46
$46.1 = global(range: <class 'range'>) :: Function(<class 'range'>)
"$46.3 = call $46.1(n2, func=$46.1, args=[Var(n2, <ipython-input-335-3e79df9280e4>:3)], kws=(), vararg=None) :: (int64,) -> range_state_int64"
del $46.1
$46.4 = getiter(value=$46.3) :: range_iter_int64
del $46.3
$phi54.1 = $46.4 :: range_iter_int64

0
jump 58
label 58
jump 60
label 60
$60.1 = global(range: <class 'range'>) :: Function(<class 'range'>)
"$60.3 = call $60.1(n3, func=$60.1, args=[Var(n3, <ipython-input-335-3e79df9280e4>:3)], kws=(), vararg=None) :: (int64,) -> range_state_int64"
del $60.1
$60.4 = getiter(value=$60.3) :: range_iter_int64
del $60.3
$phi68.1 = $60.4 :: range_iter_int64

0
"$70.6 = build_tuple(items=[Var(i1, <ipython-input-335-3e79df9280e4>:5), Var(i2, <ipython-input-335-3e79df9280e4>:6), Var(i3, <ipython-input-335-3e79df9280e4>:7)]) :: UniTuple(int64 x 3)"
"isamp = getitem(value=arr, index=$70.6, fn=<built-in function getitem>) :: complex64"
del $70.6

0
"$70.9 = getattr(value=isamp, attr=real) :: float32"
"$const70.10 = const(int, 2) :: Literal[int](2)"
$70.11 = $70.9 ** $const70.10 :: float32
del $const70.10
del $70.9
"$70.13 = getattr(value=isamp, attr=imag) :: float32"
del isamp
"$const70.14 = const(int, 2) :: Literal[int](2)"
$70.15 = $70.13 ** $const70.14 :: float32
del $const70.14

0
del n3
del n2
del arr
del $phi42.1
del $phi40.1
del $40.4
jump 128
label 128
"$128.2 = cast(value=out) :: array(float32, 3d, C)"
del out


In [None]:
bbbb

In [365]:
%load_ext Cython

In [369]:
%%cython --annotate

cdef int a = 0
for i in range(10):
    a += i
print(a)

45


In [3]:
from math import sqrt

In [4]:
@njit
def my_abs2(d,dout):
    for i in range(d.shape[0]):
        x = d[i]
        dout[i] = x.real**2 + x.imag**2
        
    return dout

In [5]:
d = np.ones(435*288*256, dtype=np.complex64)

In [6]:
dout = np.empty(d.shape, dtype='float32')
%timeit _ = my_abs2(d, dout)

48.3 ms ± 469 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:


my_abs2.inspect_asm()

{(array(complex64, 1d, C),
  array(float32, 1d, C)): '\t.text\n\t.file\t"<string>"\n\t.globl\t_ZN8__main__11my_abs2$241E5ArrayI9complex64Li1E1C7mutable7alignedE5ArrayIfLi1E1C7mutable7alignedE\n\t.p2align\t4, 0x90\n\t.type\t_ZN8__main__11my_abs2$241E5ArrayI9complex64Li1E1C7mutable7alignedE5ArrayIfLi1E1C7mutable7alignedE,@function\n_ZN8__main__11my_abs2$241E5ArrayI9complex64Li1E1C7mutable7alignedE5ArrayIfLi1E1C7mutable7alignedE:\n\tpushq\t%rbp\n\tpushq\t%r15\n\tpushq\t%r14\n\tpushq\t%r13\n\tpushq\t%r12\n\tpushq\t%rbx\n\tpushq\t%rax\n\tmovq\t%rdi, %r12\n\tmovq\t128(%rsp), %r13\n\tmovq\t120(%rsp), %rbx\n\tmovq\t96(%rsp), %rbp\n\tmovq\t88(%rsp), %r14\n\tmovq\t72(%rsp), %r15\n\tmovabsq\t$NRT_incref, %rax\n\tmovq\t%r14, %rdi\n\tcallq\t*%rax\n\ttestq\t%r15, %r15\n\tjle\t.LBB0_3\n\tmovq\t64(%rsp), %rax\n\tsetg\t%cl\n\tmovzbl\t%cl, %ecx\n\tnegq\t%rcx\n\taddq\t%r15, %rcx\n\tincq\t%rcx\n\txorl\t%edx, %edx\n\t.p2align\t4, 0x90\n.LBB0_2:\n\tmovq\t%rdx, %rsi\n\tsarq\t$63, %rsi\n\tmovq\t%rsi, %rdi\n\t

In [7]:
from libc.stdlib import malloc

ModuleNotFoundError: No module named 'libc'

In [9]:
import numba
numba.__version__


'0.53.1'