In [1]:
import numpy as np
import os
os.environ['NUMBA_THREADING_LAYER'] = 'omp' # my TBB version complains
os.environ['NUMBA_NUM_THREADS'] = '3'
os.environ['NUMBA_ENABLE_AVX'] = '1'
os.environ['NUMBA_CPU_NAME'] = 'generic'
os.environ['NUMBA_CPU_FEATURES'] = '+sse,+sse2,+avx,+avx2,+avx512f,+avx512dq'
#os.environ['NUMBA_CPU_FEATURES'] = '+sse,+sse2,+avx,+avx2'
import sys
sys.path.insert(0,'/Users/ban115/bolton/craco-python/src/')


import numba
from numba import njit, prange
from pylab import *
%matplotlib notebook

%load_ext autoreload
%autoreload 2


In [2]:
nant = 30
nbl = nant*(nant+1)//2
nt = 32
nchan = 24
nbeam = 36

vin = np.arange(nbl*nt*nchan*2, dtype=np.int16).reshape((nchan, nbl, nt, 2))
vavg = np.zeros((nchan, nbl, 2), dtype=np.float32)
ics = np.zeros((nt, nchan), dtype=np.float32)
cas = np.zeros((nt, nchan), dtype=np.float32)

vin.size

714240

In [3]:
# For type hint info see https://numba.pydata.org/numba-doc/0.12.2/tutorial_types.html
@njit(fastmath=True, parallel=True, locals={'vabs':numba.float32})
def make_averages(nant, vin, vavg, ics, cas):
    nchan,nbl, nt, _ = vin.shape
    
    for ic in range(nchan):
        ibl = 0
        for ia1 in range(nant):
            for ia2 in range(ia1, nant):
                #print(ia1, ia2, ibl, nbl)
                #assert ibl < nbl#, f'Invalid ibl {ia1} {ia2} {ibl} {nbl}'
                
                for it in range(nt):
                    v = vin[ic, ibl, it, :]
                    #v = np.array([0,1])
                    vabs = v[0]*v[0] + v[1]*v[1]
                    if ia1 == ia2:
                        ics[it, ic] += vabs
                    else:
                        cas[it, ic] += vabs

                    #print('out', ibl, ic, it)
                    vavg[ic, ibl, 0] += v[0]
                    vavg[ic, ibl, 1] += v[1]
                    
                    #print('done', ibl, ic, it)


                ibl += 1

    ics //= nant
    cas //= nbl
    vavg //= nt
            


In [4]:
for i in range(1):
    make_averages(nant, vin, vavg, ics, cas)

The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "<ipython-input-3-34d199a97e29>", line 3:
@njit(fastmath=True, parallel=True, locals={'vabs':numba.float32})
def make_averages(nant, vin, vavg, ics, cas):
^

  state.func_ir.loc))


In [5]:
make_averages.parallel_diagnostics(level=4)


 
 Parallel Accelerator Optimizing:  Function make_averages, <ipython-
input-3-34d199a97e29> (2)  
No source available
--------------------------------- Fusing loops ---------------------------------
Attempting fusion of parallel loops (combines loops with similar properties)...
----------------------------- Before Optimisation ------------------------------
--------------------------------------------------------------------------------
------------------------------ After Optimisation ------------------------------
Parallel structure is already optimal.
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
 
---------------------------Loop invariant code motion---------------------------
Allocation hoisting:
No allocation hoisting found

Instruction hoisting:
No instruction hoisting found
--------------------------------------------------------------------------------


In [6]:
%timeit -n1 make_averages(nant, vin, vavg, ics, cas)

1.9 ms ± 99.9 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
vin.shape

(24, 465, 32, 2)

In [8]:
make_averages.inspect_types()

make_averages (int64, array(int16, 4d, C), array(float32, 3d, C), array(float32, 2d, C), array(float32, 2d, C))
--------------------------------------------------------------------------------
# File: <ipython-input-3-34d199a97e29>
# --- LINE 2 --- 

@njit(fastmath=True, parallel=True, locals={'vabs':numba.float32})

# --- LINE 3 --- 

def make_averages(nant, vin, vavg, ics, cas):

    # --- LINE 4 --- 
    # label 0
    #   nant = arg(0, name=nant)  :: int64
    #   vin = arg(1, name=vin)  :: array(int16, 4d, C)
    #   vavg = arg(2, name=vavg)  :: array(float32, 3d, C)
    #   ics = arg(3, name=ics)  :: array(float32, 2d, C)
    #   cas = arg(4, name=cas)  :: array(float32, 2d, C)
    #   $0.2 = getattr(value=vin, attr=shape)  :: UniTuple(int64 x 4)
    #   $0.7 = exhaust_iter(value=$0.2, count=4)  :: UniTuple(int64 x 4)
    #   del $0.2
    #   $nchan.28 = static_getitem(value=$0.7, index=0, index_var=None, fn=<built-in function getitem>)  :: int64
    #   $nbl.29 = static_getitem(v

In [35]:
from craco.cardcapfile import  get_single_packet_dtype, NCHAN,get_indexes
%aimport craco.card_averager
from craco.card_averager import *
import craco.card_averager as avg

In [10]:
nfpga = 6
nbeam = 36
nc_per_fpga = 4
nt = 16
npkt = nbeam*nc_per_fpga
pktshape = (npkt, nt)
polsum = True
debughdr = True
dtype = get_single_packet_dtype(nbl, debughdr, polsum)
din_list = [np.zeros(pktshape, dtype=dtype) for i in range(nfpga)]
packets = [(0, pkt) for pkt in din_list]
din = np.array(din_list)
tscrunch = 4
print(din.shape)
print(din['data'].shape)

(6, 144, 16)
(6, 144, 16, 2, 465, 1, 2)


In [11]:

dout = average1(din)
print(dout.shape, dout.dtype)

(144, 465, 1, 2) float32


In [12]:
%timeit average1(din)

12.7 ms ± 995 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
%timeit dout.astype(np.int16)

31.3 µs ± 1.43 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [14]:
%timeit average2(din, tscrunch=4)

30.8 ms ± 362 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
%timeit average3(din, tscrunch=4)

31.6 ms ± 1.05 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
%timeit average3(din, tscrunch=2)

31.2 ms ± 609 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [17]:
%timeit average4(din, tscrunch=2)

28.2 ms ± 773 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [18]:
data = din['data']
(nfpga, npkt, nt1, nt2, nbl, _, _) = data.shape
dshape = (npkt, nt1*nt2 // tscrunch, nbl, 2)
dout = np.zeros(dshape, dtype=np.float32)
%timeit average4(din, tscrunch=tscrunch, dout=dout)

26.5 ms ± 419 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [20]:
%timeit average4(din, tscrunch=tscrunch, dout=None)

26.5 ms ± 374 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [29]:
%autoreload average5
average5(data, tscrunch=tscrunch, dout=dout)
%timeit average5(data, tscrunch=tscrunch, dout=dout)

74.1 ms ± 1.97 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [31]:
%autoreload average6

average6(din['data'], tscrunch=tscrunch, dout=dout)
%timeit average6(din['data'], tscrunch=tscrunch, dout=dout)

214 ms ± 2.92 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [23]:
dout.shape

(144, 8, 465, 2)

In [24]:
average6.inspect_types()

average6 (array(int16, 7d, A), int64, array(float32, 4d, C))
--------------------------------------------------------------------------------
# File: /data/seren-01/fast/ban115/build/craco-python/src/craco/card_averager.py
# --- LINE 300 --- 

@njit(fastmath=True,debug=True,parallel=True)

# --- LINE 301 --- 

def average6(data, tscrunch, dout):

    # --- LINE 302 --- 

    #data = din['data']

    # --- LINE 303 --- 
    # label 0
    #   data = arg(0, name=data)  :: array(int16, 7d, A)
    #   tscrunch = arg(1, name=tscrunch)  :: int64
    #   dout = arg(2, name=dout)  :: array(float32, 4d, C)
    #   $0.2 = getattr(value=data, attr=shape)  :: UniTuple(int64 x 7)
    #   $0.10 = exhaust_iter(value=$0.2, count=7)  :: UniTuple(int64 x 7)
    #   del $0.2
    #   $nfpga.303 = static_getitem(value=$0.10, index=0, index_var=None, fn=<built-in function getitem>)  :: int64
    #   $npkt.304 = static_getitem(value=$0.10, index=1, index_var=None, fn=<built-in function getitem>)  :: int64
   

In [40]:
intermediate_dout = np.zeros(din['data'].shape, dtype=np.float32)
%timeit avg.average9(din, tscrunch=tscrunch, dout=intermediate_dout)

110 ms ± 1.92 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [25]:
from craco import card_averager

In [26]:

dummy_packet = packets[0][1].copy()
print(type(dummy_packet))

avger = card_averager.Averager(nbeam,nant,nc=24,nt=32,npol=1,vis_fscrunch=6, vis_tscrunch=4, dummy_packet=dummy_packet, cdtype=np.float32)


<class 'numpy.ndarray'>


  variance = m2 / count #ill produce NAN where count=0
  scale = 1/stdev


In [27]:
# current averager takes 270 milliseconds - which is just a bloody outrage - I mean seriously
%timeit avger.accumulate_packets(packets)

AttributeError: 'tuple' object has no attribute 'shape'

In [None]:
avger = card_averager.Averager(nbeam,nant,24,32,1,6,4, dummy_packet=dummy_packet, cdtype=np.float32)

In [None]:
avger.output.dtype

In [None]:
avger.output.shape

In [None]:
avger.output['vis'].shape

In [None]:
from craco.card_averager import ibc2beamchan


_= average7(din, tscrunch=8, dout=avger.output['vis'])
%timeit average7(din, tscrunch=8, dout=avger.output['vis'])

In [None]:


_= average8(din, tscrunch=8, dout=avger.output['vis'],nant=nant)
%timeit average8(din, tscrunch=8, dout=avger.output['vis'],nant=nant)

In [None]:
avger.output['vis'].shape

In [None]:
_,_,auto_idxs,cross_idxs = get_indexes(nant)

In [None]:
average2(din,tscrunch=8).shape

In [None]:
def average_vis_and_reshape2(din, tscrunch, dout, auto_idxs, cross_idxs):
    '''
    Writes (beam,chan) order and removes autocorrelations
    Fxed fscrunch at 6
    This is horriffically slow in NUMPY - really, really bad
    '''
    data = din['data']
    (nfpga, npkt, nt1, nt2, nblall, _, _) = data.shape
    dout[:] = 0
    # HACK - set nbl to output NBL, which doesn't include autos, for now
    #nbl = dout.shape[1]
    ntout = nt1*nt2//tscrunch
    
    nbl = len(cross_idxs)
    nant = len(auto_idxs)
    dcross = data[:,:,:,:,cross_idxs,:,:]
    d = dcross.reshape(nfpga, npkt, ntout, tscrunch, nbl, 2).mean(axis=(0,3), dtype=np.float32)
    dcopy1 = d[:32*4,...].reshape(32,4,ntout,nbl,2).transpose(0,3,1,2,4)
    dcopy2 = d[32*4:,...].reshape(4,4,ntout,nbl,2).transpose(0,3,1,2,4)


    dout[0:32, :,:,:,:] = dcopy1
    dout[32:, :,:,:,:] = dcopy2

            
    return dout

print(average_vis_and_reshape2(din, tscrunch=4, dout=avger.output['vis'],auto_idxs=auto_idxs,cross_idxs=cross_idxs).shape)
%timeit average_vis_and_reshape2(din, tscrunch=4, dout=avger.output['vis'],auto_idxs=auto_idxs,cross_idxs=cross_idxs)

In [None]:
avger.output['vis'].shape

In [None]:
avger.output['cas'].shape

In [None]:
din['data'].shape

In [None]:
data = din['data'].astype(np.float32)
data.shape

In [None]:
def amp1(data):
    return np.sqrt(data[...,0]**2 + data[...,1]**2)

%timeit amp1(data)

In [None]:
from math import sqrt

@njit
def amp2(data, dout):
    (nfpga, npkt, nt1, nt2, nbl, _, _) = data.shape
    dout[:] = 0
    # HACK - set nbl to output NBL, which doesn't include autos, for now
    nbl = dout.shape[1]
    
    # dout shape
    # avger.output['vis'].shape
    # (36, 435, 4, 8, 2)
    
    for ifpga in range(nfpga):
        for ipkt in range(npkt):
            beam,chan = ibc2beamchan(ipkt)
            #print(beam,chan)
            for t1 in range(nt1):
                for t2 in range(nt2):
                    ttotal = t2 + t1*nt2
                    tout = ttotal // tscrunch
                    for ibl in range(nbl):
                        d0 = data[ifpga,ipkt,t1,t2,ibl,0,0]
                        d1 = data[ifpga,ipkt,t1,t2,ibl,0,1]

                        da = sqrt(d0**2 + d1**2)    
                        dout[ifpga,ipkt,t1,t2,ibl,0] = da
                        
    return dout
                            
    

In [None]:
dout = np.zeros(data.shape[:-1], dtype=np.float32)
amp2(data, dout).shape
%timeit amp2(data, dout)

In [None]:
data.shape

In [None]:
avger.output['ics'].shape

In [None]:
%timeit calc_ics(data, auto_idxs)

In [None]:
dmean = calc_ics(data, auto_idxs)
dmean.shape

In [None]:
icsout = avger.output['ics']

In [None]:
icsout.shape

In [None]:
(nfpga, npkt, nt1, nt2, nbl, npol, _) = data.shape
data.shape

In [None]:
dmean[:,:32*4,:,:].reshape(nfpga,4,32,32).transpose(2,3,1,0).reshape(32,32,24).shape

In [None]:
dmean[:,32*4:,:,:].reshape(nfpga,4,4,32).transpose(2,3,1,0).reshape(4,32,24).shape

In [None]:
valid = np.zeros(6, dtype=bool)
%timeit calc_and_reshape_ics(data, auto_idxs, valid, avger.output['ics'])