In [1]:
import os

# Numba configuration via environment variables; these are set before the
# `import numba` further down so that numba sees them when it is imported.
# https://numba.readthedocs.io/en/stable/reference/envvars.html
os.environ['NUMBA_NUM_THREADS'] = '8'
os.environ['NUMBA_THREADING_LAYER'] = 'omp' # my TBB version complains
os.environ['NUMBA_ENABLE_AVX'] = '1'
# Override the CPU model/feature detection: start from a generic CPU and list
# the SIMD feature sets explicitly (see the conclusions section of this notebook
# for why this was needed to get 512-bit instructions).
os.environ['NUMBA_CPU_NAME'] = 'generic'
os.environ['NUMBA_CPU_FEATURES'] = '+sse,+sse2,+avx,+avx2,+avx512f,+avx512dq'



import numpy as np
from pylab import *
from craco.cardcap import CardcapFile
from astropy.io import fits
from craco.card_averager import Averager
from craco.cardcapmerger import CcapMerger
import glob
import numba

from numba import jit,njit,prange


%matplotlib notebook
%load_ext autoreload
%autoreload 2

# Introduction
See this useful numba reference about how to do SIMD with NUMBA
https://tbetcke.github.io/hpc_lecture_notes/simd.html

In [2]:
# Problem dimensions used to size the benchmark arrays and the Averager below.
nt = 64  # presumably time samples per block — TODO confirm
nbeam = 36  # number of beams
nbl = 465  # number of baselines; equals nant*(nant+1)/2 for nant = 30
nant = 30  # number of antennas
nfpga = 6  # number of FPGAs
nc = 4*nfpga  # presumably channels (4 per FPGA) — TODO confirm
npol = 2  # number of polarisations


In [3]:
def find_instr(func, keyword, sig=0, limit=5):
    """Print assembly lines of a compiled function that contain `keyword`.

    Parameters
    ----------
    func : object exposing `signatures` and `inspect_asm(signature)`
        (e.g. a numba dispatcher that has already been compiled).
    keyword : str
        Substring to look for in each assembly line.
    sig : int
        Index into `func.signatures` selecting which compiled variant to scan.
    limit : int
        Stop after this many matching lines have been printed. The check is
        made after printing, so the first match is always shown.

    Prints 'No instructions found' when no line matches. Returns None.
    """
    asm_lines = func.inspect_asm(func.signatures[sig]).split('\n')
    matching = (line for line in asm_lines if keyword in line)

    shown = 0
    for shown, line in enumerate(matching, start=1):
        print(line)
        if shown >= limit:
            break

    if shown == 0:
        print('No instructions found')
        
def print_instr(func, sig=0):
    """Print the full assembly listing of one compiled signature of `func`.

    `func` must expose `signatures` and `inspect_asm(signature)`; `sig` is the
    index into `func.signatures`. Returns None.
    """
    # Printing the whole listing at once emits the same text as printing it
    # line by line: every line followed by a newline.
    print(func.inspect_asm(func.signatures[sig]))


In [4]:
# All-zero int16 benchmark array of shape (nc, nbl, nt, nt, 2); only its size and
# dtype matter for the timings below.
# NOTE(review): the trailing axis of length 2 is presumably real/imag — confirm.
d = np.zeros((nc, nbl,nt,nt,2),dtype=np.int16)

%timeit d.mean()

60.8 ms ± 707 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Do this to debug loops:

```
import llvmlite.binding as llvm
llvm.set_option('', '--debug-only=loop-vectorize')
```

In [5]:
@njit(fastmath=True, boundscheck=False)
def mysumi16(d):
    """Sum the elements of `d` in a plain loop, seeding the accumulator with np.int16(0).

    `d` is indexed with a single integer from 0 to d.size - 1, so it must be 1-D.

    NOTE(review): the assembly printed later in this notebook for this function
    uses vpmovsxwq + vpaddq (sign-extend 16 -> 64 bit, then 64-bit add), so the
    int16 seed does not appear to keep the running sum at 16 bits.
    """
    s = np.int16(0)
    for i in range(d.size):
        s += d[i]
    return s


@njit(fastmath=True, boundscheck=False)
def mysumi32(d):
    """Sum the elements of `d` in a plain loop, seeding the accumulator with np.int32(0).

    `d` is indexed with a single integer from 0 to d.size - 1, so it must be 1-D.

    NOTE(review): the inspect_types() dump later in this notebook types the
    running sum as int64 for an int16 input, i.e. the int32 seed is widened
    rather than producing a 32-bit accumulator.
    """
    s = np.int32(0)
    for i in range(d.size):
        s += d[i]
    return s

@njit(fastmath=True, boundscheck=False)
def mysumi64(d):
    """Sum the elements of `d` in a plain loop, seeding the accumulator with np.int64(0).

    `d` is indexed with a single integer from 0 to d.size - 1, so it must be 1-D.
    """
    s = np.int64(0)
    for i in range(d.size):
        s += d[i]
    return s

@njit(fastmath=True, boundscheck=False)
def mysumf32(d):
    """Sum the elements of `d` in a plain loop, seeding the accumulator with np.float32(0).

    `d` is indexed with a single integer from 0 to d.size - 1, so it must be 1-D.
    This is the floating-point counterpart of the integer-seeded sum kernels.
    """
    s = np.float32(0)
    for i in range(d.size):
        s += d[i]
    return s

In [6]:
# The sum kernels index their input with a single integer, so time them on a
# flattened (1-D) copy of d.
df = d.flatten()
sumfuncs = (mysumi16, mysumi32,mysumi64,  mysumf32)
for sfuncin in sumfuncs:
    print(sfuncin)
    sfuncin(df) # first call compiles, so %timeit below measures only compiled code
    %timeit sfuncin(df)

CPUDispatcher(<function mysumi16 at 0x7fc617b0fa60>)
21.6 ms ± 222 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
CPUDispatcher(<function mysumi32 at 0x7fc617b0fc80>)
21.2 ms ± 395 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
CPUDispatcher(<function mysumi64 at 0x7fc617b36048>)
20.8 ms ± 175 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
CPUDispatcher(<function mysumf32 at 0x7fc617b362f0>)
21.6 ms ± 388 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [7]:
# Let's see what happens if we force 64-bit wide elements: build an int64 array
# with a quarter as many elements, so it occupies the same number of bytes as df.
d4f = np.zeros((nc//4, nbl,nt,nt,2),dtype=np.int64).flatten()
# Guard that both arrays really span the same number of bytes.
assert d4f.size*d4f.itemsize == df.size*df.itemsize

In [8]:
mysumi64(d4f) # compile
%timeit mysumi64(d4f)

21.5 ms ± 439 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [9]:
%load_ext cython

In [10]:
%%cython -a 
import numpy as np
cimport numpy as np
import cython
@cython.boundscheck(False)
@cython.wraparound(False)
def cysum(short[::1] d):
    """Return the sum of a C-contiguous 1-D int16 buffer as a Python int.

    Bounds checking and negative-index wraparound are disabled for the loop.
    """
    # 64-bit accumulator: a 32-bit `int` can overflow when summing many int16
    # values (tens of millions of elements at up to 32767 each exceed 2**31).
    cdef long long s = 0
    cdef Py_ssize_t sz = d.shape[0]
    # Declare the index that the loop actually uses, so it is a C integer.
    cdef Py_ssize_t i
    for i in range(sz):
        s += d[i]

    return s


In [11]:
%timeit cysum(df)

45.8 ms ± 818 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [12]:
# Billions of elements summed per second, taking 0.02 s (~20 ms) as the runtime
# of the numba sum kernels timed above.
gflops = df.size / 0.02 / 1e9
gflops

4.571136

In [13]:
find_instr(mysumi32,'vpaddq')


	vpaddq	%zmm4, %zmm0, %zmm0
	vpaddq	%zmm4, %zmm1, %zmm1
	vpaddq	%zmm4, %zmm2, %zmm2
	vpaddq	%zmm4, %zmm3, %zmm3
	vpaddq	%zmm0, %zmm1, %zmm0


In [14]:
print_instr(mysumi32)

	.text
	.file	"<string>"
	.globl	_ZN8__main__8mysumi32B2v2B54c8tJTIcFHzwl2ILiXkcBV0KBSsOcbovu9mp1kJR6rSYw_2bIogqjUBE5ArrayIsLi1E1C7mutable7alignedE
	.p2align	4, 0x90
	.type	_ZN8__main__8mysumi32B2v2B54c8tJTIcFHzwl2ILiXkcBV0KBSsOcbovu9mp1kJR6rSYw_2bIogqjUBE5ArrayIsLi1E1C7mutable7alignedE,@function
_ZN8__main__8mysumi32B2v2B54c8tJTIcFHzwl2ILiXkcBV0KBSsOcbovu9mp1kJR6rSYw_2bIogqjUBE5ArrayIsLi1E1C7mutable7alignedE:
	testq	%r8, %r8
	jle	.LBB0_1
	movq	8(%rsp), %rcx
	cmpq	$32, %r8
	jae	.LBB0_4
	xorl	%edx, %edx
	xorl	%eax, %eax
	jmp	.LBB0_7
.LBB0_1:
	xorl	%edx, %edx
	jmp	.LBB0_9
.LBB0_4:
	movq	%r8, %rax
	andq	$-32, %rax
	leaq	48(%rcx), %rdx
	vpxor	%xmm0, %xmm0, %xmm0
	movq	%rax, %rsi
	vpxor	%xmm1, %xmm1, %xmm1
	vpxor	%xmm2, %xmm2, %xmm2
	vpxor	%xmm3, %xmm3, %xmm3
	.p2align	4, 0x90
.LBB0_5:
	vpmovsxwq	-48(%rdx), %zmm4
	vpaddq	%zmm4, %zmm0, %zmm0
	vpmovsxwq	-32(%rdx), %zmm4
	vpaddq	%zmm4, %zmm1, %zmm1
	vpmovsxwq	-16(%rdx), %zmm4
	vpaddq	%zmm4, %zmm2, %zmm2
	vpmovsxwq	(%rdx), %zmm4
	vpaddq	%zmm4, 

In [15]:
mysumi32.inspect_types(pretty=True)



0
label 0
"d = arg(0, name=d) :: array(int16, 1d, C)"
$2load_global.0 = global(np: <module 'numpy' from '/data/seren-01/fast/ban115/build/venv/lib/python3.7/site-packages/numpy/__init__.py'>) :: Module(<module 'numpy' from '/data/seren-01/fast/ban115/build/venv/lib/python3.7/site-packages/numpy/__init__.py'>)
"$4load_method.1 = getattr(value=$2load_global.0, attr=int32) :: class(int32)"
del $2load_global.0
"$const6.2 = const(int, 0) :: Literal[int](0)"
"s = call $4load_method.1($const6.2, func=$4load_method.1, args=[Var($const6.2, 3437041517.py:11)], kws=(), vararg=None, varkwarg=None, target=None) :: (int64,) -> int32"
del $const6.2
del $4load_method.1
s.2 = s :: int64

0
label 0
"d = arg(0, name=d) :: array(int16, 1d, C)"
$2load_global.0 = global(np: <module 'numpy' from '/data/seren-01/fast/ban115/build/venv/lib/python3.7/site-packages/numpy/__init__.py'>) :: Module(<module 'numpy' from '/data/seren-01/fast/ban115/build/venv/lib/python3.7/site-packages/numpy/__init__.py'>)
"$4load_method.1 = getattr(value=$2load_global.0, attr=int32) :: class(int32)"
del $2load_global.0
"$const6.2 = const(int, 0) :: Literal[int](0)"
"s = call $4load_method.1($const6.2, func=$4load_method.1, args=[Var($const6.2, 3437041517.py:11)], kws=(), vararg=None, varkwarg=None, target=None) :: (int64,) -> int32"
del $const6.2
del $4load_method.1
s.2 = s :: int64

0
jump 12
label 12
$14load_global.0 = global(range: <class 'range'>) :: Function(<class 'range'>)
"$18load_attr.2 = getattr(value=d, attr=size) :: int64"
"$20call_function.3 = call $14load_global.0($18load_attr.2, func=$14load_global.0, args=[Var($18load_attr.2, 3437041517.py:12)], kws=(), vararg=None, varkwarg=None, target=None) :: (int64,) -> range_state_int64"
del $18load_attr.2
del $14load_global.0
$22get_iter.4 = getiter(value=$20call_function.3) :: range_iter_int64
del $20call_function.3
$phi24.0 = $22get_iter.4 :: range_iter_int64

0
"$34binary_subscr.5 = getitem(value=d, index=i, fn=<built-in function getitem>) :: int16"
del i
"$36inplace_add.6 = inplace_binop(fn=<built-in function iadd>, immutable_fn=<built-in function add>, lhs=s.2, rhs=$34binary_subscr.5, static_lhs=Undefined, static_rhs=Undefined) :: int64"
del $34binary_subscr.5
s.1 = $36inplace_add.6 :: int64
del $36inplace_add.6
s.2 = s.1 :: int64
del s.1
jump 24
label 42

0
del d
del $phi26.1
del $phi24.0
del $24for_iter.3
jump 44
label 44
$46return_value.1 = cast(value=s.2) :: int64
del s.2
return $46return_value.1


# Conclusions of the sum reduction
- numba is faster than cython by about 2x or more
- Numba runs the problem in 20ms for accumulators of int16, int32 and int64, and about 6x slower for float32, at 120ms (???)
- Numba uses 'vpaddq' with YMM registers, which is equivalent to [_mm256_mask_add_epi64](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=vpaddq&ig_expand=130,128) which has latency of 1 and throughput of 0.33 for lots of architectures.
- _mm256_add_epi64 which doesn't have a mask could have been used, but wasn't. Maybe this is a problem but maybe not. It has the same latency and throughput as the masked version. But maybe having the mask around is a pain?
- It didn't use the 512-bit version _mm512_add_epi64. The CPU has the required flag: AVX512F, according to /proc/cpuinfo
- But, numba feature detection is a bit rubbish. Referencing [this link](https://numba.readthedocs.io/en/stable/reference/envvars.html) and running `llc -march=x86 -mattr=help` on `athena` gave a useful list of features to try.
- `os.environ['NUMBA_CPU_FEATURES'] = '+sse,+sse2,+avx,+avx2,+avx512f,+avx512dq'` improved things. float32 is now reduced to the same time (20ms) as int16, int32 and int64, which stayed the same. Interesting. Numba is now using `vpaddq` with `ZMM` registers, which is the correct AVX512 instruction
- The fact that the int16 numbers didn't improve with avx512 means something else is the bottleneck
- Adding `fastmath=True` made no difference


In [18]:
print_instr(mysumi16, sig=0)

	.text
	.file	"<string>"
	.globl	_ZN8__main__8mysumi16B2v1B54c8tJTIcFHzwl2ILiXkcBV0KBSsOcbovu9mp1kJR6rSYw_2bIogqjUBE5ArrayIsLi1E1C7mutable7alignedE
	.p2align	4, 0x90
	.type	_ZN8__main__8mysumi16B2v1B54c8tJTIcFHzwl2ILiXkcBV0KBSsOcbovu9mp1kJR6rSYw_2bIogqjUBE5ArrayIsLi1E1C7mutable7alignedE,@function
_ZN8__main__8mysumi16B2v1B54c8tJTIcFHzwl2ILiXkcBV0KBSsOcbovu9mp1kJR6rSYw_2bIogqjUBE5ArrayIsLi1E1C7mutable7alignedE:
	testq	%r8, %r8
	jle	.LBB0_1
	movq	8(%rsp), %rcx
	cmpq	$32, %r8
	jae	.LBB0_4
	xorl	%edx, %edx
	xorl	%eax, %eax
	jmp	.LBB0_7
.LBB0_1:
	xorl	%edx, %edx
	jmp	.LBB0_9
.LBB0_4:
	movq	%r8, %rax
	andq	$-32, %rax
	leaq	48(%rcx), %rdx
	vpxor	%xmm0, %xmm0, %xmm0
	movq	%rax, %rsi
	vpxor	%xmm1, %xmm1, %xmm1
	vpxor	%xmm2, %xmm2, %xmm2
	vpxor	%xmm3, %xmm3, %xmm3
	.p2align	4, 0x90
.LBB0_5:
	vpmovsxwq	-48(%rdx), %zmm4
	vpaddq	%zmm4, %zmm0, %zmm0
	vpmovsxwq	-32(%rdx), %zmm4
	vpaddq	%zmm4, %zmm1, %zmm1
	vpmovsxwq	-16(%rdx), %zmm4
	vpaddq	%zmm4, %zmm2, %zmm2
	vpmovsxwq	(%rdx), %zmm4
	vpaddq	%zmm4, 

In [17]:
# Load one block of data from each of the card-capture files.
use_files = True
if use_files:
    # NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
    cardfile_pattern = '/data/craco/ban115/craco-python/notebooks/data/SB43128/run3/1934_b07_c01+f?.fits'
    # glob returns names in arbitrary order; sort so that fileblocks[0] refers to
    # the same file on every run.
    cardfiles = sorted(glob.glob(cardfile_pattern))
    assert len(cardfiles) == 6, \
        f'expected 6 card files matching {cardfile_pattern!r}, found {len(cardfiles)}'

    cfiles = [CardcapFile(f) for f in cardfiles]
    merger = CcapMerger(cardfiles)
    fid, blk = next(merger.block_iter())

    # First packet block from each file; fb0_block is the first nt entries of
    # the first file's block and is what the do_accumulate timing uses.
    fileblocks = [next(f.packet_iter(nt*4*nbeam)) for f in cfiles]
    fb0 = fileblocks[0]
    fb0_block = fb0[:nt]
else:
    # The file-free path was never written (the original cell had a dangling
    # `fb0_` here, which raised NameError); fail with an explicit message instead.
    raise NotImplementedError('Synthetic input is not implemented; set use_files = True')


AssertionError: 

In [None]:
len(fileblocks[0])

In [None]:
#numba.set_num_threads(3)
#numba.get_num_threads()

In [None]:
avg = Averager(nbeam, nant,nc,nt,npol)


In [None]:
from craco.card_averager import do_accumulate, accumulate_all

# Call a few times before timing — presumably so JIT compilation and cache
# warm-up are excluded from the %timeit measurement.
# NOTE(review): the literal arguments (0, 0, ..., 2, 6) are not explained in this
# notebook; see do_accumulate in craco.card_averager for their meaning.
for i in range(10):
    do_accumulate(avg.output, avg.rescale_scales, avg.rescale_stats, avg.count, avg.nant, 0,0,fb0_block, 2, 6)
%timeit do_accumulate(avg.output, avg.rescale_scales, avg.rescale_stats, avg.count, avg.nant, 0,0,fb0_block, 2, 6)


In [None]:
# Rough throughput estimate for one do_accumulate call.
# NOTE(review): 11 looks like an assumed operation count per sample and 300e-6 a
# measured runtime in seconds (300 us); neither is derived in this notebook — confirm.
gflops = nbl*npol*nt*(11)/300e-6/1e9
gflops

In [None]:
do_accumulate.signatures

In [None]:
# A 'vadd' match shows AVX-encoded floating-point adds, but that alone does not
# prove SIMD: packed forms (vaddps/vaddpd) are vectorized, while scalar forms
# (vaddss/vaddsd) also start with 'vadd'. Check the mnemonic suffix and registers.
find_instr(do_accumulate, 'vadd')

In [None]:
fb0.dtype.isalignedstruct

In [None]:
#do_accumulate.parallel_diagnostics(level=4)

In [None]:
do_accumulate.inspect_types()

In [None]:
print(do_accumulate.inspect_asm(do_accumulate.signatures[0]))

In [None]:
numba.set_num_threads(8)
from craco.card_averager import accumulate_all
avg.accumulate_all(fileblocks )

In [None]:
%timeit avg.accumulate_all(fileblocks )

In [None]:
# Thread-scaling of the accumulate step.
# NOTE(review): time_ms values are presumably hand-copied %timeit results for
# each thread count — confirm before relying on them.
nthreads = [1,2,4,8]
time_ms = np.array([246, 134, 82, 45])

# One figure with two panels (no separate figure() call, which would leave an
# empty extra figure behind).
fig,ax = subplots(1,2)
ax[0].plot(nthreads, 1/time_ms)
ax[0].set_xlabel('nthreads')
ax[0].set_ylabel('Throughput (per ms)')
ax[1].plot(nthreads, time_ms)
ax[1].set_xlabel('nthreads')
# This is the y axis: the original called set_xlabel twice, overwriting 'nthreads'.
ax[1].set_ylabel('Execution time (ms)')



In [None]:
#nt = 64
#nbeam = 36
#nbl = 465
#nant = 30
#nfpga = 6
#nc = 4*nfpga
#npol = 2
# gflops per core
# NOTE(review): 10 is presumably an operation count per sample and 74e-3 a
# measured runtime in seconds (74 ms); neither is derived in this notebook — confirm.
gflops=nbeam*npol*nc*nbl*nt*10/74e-3/float(numba.get_num_threads())/1e9
gflops


In [None]:
# Inputs for the element-wise add benchmark: two input arrays of type `indtype`
# and a preallocated output array of type `dtype`. Change the two dtypes to
# reproduce the different rows of the results table further down.
n = 1024*32
indtype=np.int16
dtype=np.int32
#dtype=np.int16
a = np.arange(n, dtype=indtype)
b = np.arange(n, dtype=indtype)
c = np.zeros(n, dtype=dtype)


In [None]:
%timeit np.add(a,b,out=c)

In [None]:
# Billions of element additions per second for a runtime of 4.27e-6 s (4.27 us).
# NOTE(review): 4.27 us matches the int16 -> int32 `myadd` row of the results
# table below; the number is typed in by hand, not taken from a timing variable.
gflops = n/4.27e-6/1e9
gflops

In [None]:
@njit(fastmath=True)
def myadd(a,b,c):
    """Element-wise add written as an explicit loop: c[i] = a[i] + b[i].

    Iterates over len(a) elements and writes into the caller-supplied output
    array `c`; nothing is returned. `b` and `c` must be at least as long as `a`.
    """
    for i in range(len(a)):
        c[i] = a[i]+b[i]
        
myadd(a,b,c)
%timeit myadd(a,b,c)

In [None]:
find_instr(myadd, 'vadd')

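1024 elements gives about 2x lower gflops than 1024x8 elements - I guess there are overheads or prefetching or something. int16 is about 3x faster than float32. Float32 uses 'vadd' with YMM registers = AVX-256 (not 512??). But it looks like 5 Gflops/core is about the limit for float32, and maybe 10 Gflops/core at int16 is about as good as you'll go. Note that "No" in the `vadd?` column only means that no floating-point `vadd*` instruction was found: integer SIMD adds use `vpadd*` mnemonics (e.g. the `vpaddq` seen earlier), so "No" does not by itself mean the integer loops were not vectorized.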


| func | N | intype | outtype | runtime | gflops | vadd? | 
| ------| --| ----| ---------| ------| ---| ---|
| myadd | N=32*1024 | int32 | int32 | 5.16us |  6.35  | No |
| myadd | N=32*1024 | int16  | int16 | 3.09us | 10.6  | No |
| myadd | N = 32*1024 | float32 | float32 |  6.48us | 5.0  | Yes |
| myadd | N = 32*1024 | int16 | float32 |  7.3us | 4.4 | No |
| myadd | N = 32*1024 | int16 | int32 | 4.27us | 7.6  | No |

In [None]:
# int16 overflow probe: the offset is 32763, so adding 0..9 goes past the int16
# maximum (32767) from the sixth element on.
v = np.arange(10, dtype=np.int16) + np.int16((1<<15) - 5)


In [None]:
# Largest int16 value squared (32767**2 = 1073676289), computed in int32, shown
# next to the int32 limits for comparison.
v = np.int32(32767)*np.int32(32767)
np.iinfo(np.int32)

In [None]:
def sim_casics(nant, sigamp):
    """Simulate one noisy sample and sum the cross- and auto-product amplitudes.

    Each of the `nant` antennas gets a voltage ``sigamp + noise`` where the noise
    is complex with unit-variance Gaussian real and imaginary parts, drawn from
    the global ``np.random`` state.

    Parameters
    ----------
    nant : int
        Number of antennas. (Previously this argument was silently overwritten
        with 30 inside the function; it is now honoured.)
    sigamp : float
        Amplitude of the common, real-valued signal added to every antenna.

    Returns
    -------
    (cas, ics) : tuple of float
        ``cas`` is the sum of |s[a1]*s[a2]| over all ordered pairs a1 != a2
        (each unordered pair counted twice); ``ics`` is the sum of |s[a]*s[a]|
        over antennas. NOTE(review): presumably "coherent" and "incoherent"
        sums — confirm the naming.
    """
    noise = np.random.randn(nant) + 1j*np.random.randn(nant)
    sig = np.ones(nant) + 1j*0
    s = sigamp*sig + noise

    # Amplitude of every antenna-pair product. Only magnitudes are used, so it
    # makes no difference that the second factor is not conjugated.
    amp = np.abs(np.outer(s, s))

    ics = np.trace(amp)               # a1 == a2 terms
    cas = 2*np.triu(amp, k=1).sum()   # a1 < a2 terms, counted twice

    return cas,ics

In [None]:
# Monte-Carlo run: niter independent sim_casics draws at each signal level.
# results[isig, i, :] holds the (cas, ics) pair returned by draw i at siglvls[isig].
siglvls = (0, 0.1, 0.3, 1, 3)
niter = 1024*8
results= np.zeros((len(siglvls), niter,2))
nant = 30
for isig, siglvl in enumerate(siglvls):
    results[isig,...] = np.array([sim_casics(nant, siglvl) for i in range(niter)])
        

In [None]:
# results has axes (signal level, iteration, [cas, ics]).
# Noise-only RMS of each statistic: std over all iterations at siglvls[0] == 0.
# (The original indexed the iteration axis instead, taking the std of a single
# (cas, ics) pair, and used the same slice for both cas_scale and ics_scale.)
cas_rms = results[0, :, 0].std()
ics_rms = results[0, :, 1].std()
# Each statistic in units of its own noise-only RMS; shape (len(siglvls), niter).
cas_scale = results[:, :, 0]/cas_rms
ics_scale = results[:, :, 1]/ics_rms
cas_scale.mean(axis=1).shape


In [None]:


# Mean and spread (over iterations) of each scaled sum versus injected signal amplitude.
figure()
errorbar(siglvls, cas_scale.mean(axis=1), yerr=cas_scale.std(axis=1), label='cas')
errorbar(siglvls, ics_scale.mean(axis=1), yerr=ics_scale.std(axis=1), label='ics')
# Label the axes so the figure can be read on its own.
xlabel('Signal amplitude (sigamp)')
ylabel('Sum / noise-only RMS')

legend()