In [18]:
import numpy as np
import numba

def sliced_ellpack_spmv_single_thread(N, slice_ptr, colidx, val, x, slice_height):
    """Sparse matrix-vector product ``y = A @ x`` for a sliced ELLPACK matrix.

    Rows are grouped into slices of ``slice_height`` consecutive rows.  Slice
    ``s`` owns entries ``slice_ptr[s]:slice_ptr[s + 1]`` of ``colidx``/``val``,
    stored column by column: the entries of the row at offset ``k`` inside the
    slice are found at ``slice_ptr[s] + k``, ``slice_ptr[s] + k + slice_height``
    and so on.  Every stored entry is multiplied, so padding entries are
    expected to carry a value of zero.

    Parameters
    ----------
    N : int
        Number of matrix rows; also the length of the returned vector.
    slice_ptr : array of int
        Start offset of every slice, followed by the end offset of the last one.
    colidx : array of int
        Column index of every stored entry.
    val : array of float
        Value of every stored entry.
    x : array of float
        Dense input vector, indexed by ``colidx``.
    slice_height : int
        Number of rows per slice.

    Returns
    -------
    numpy.ndarray
        ``float32`` vector of length ``N``.  Results are rounded to single
        precision even when ``val`` and ``x`` are ``float64``.

    Notes
    -----
    When ``N`` is not a multiple of ``slice_height`` and ``slice_ptr`` contains
    an entry for the final, partially filled slice, the rows of that slice are
    computed as well; rows past ``N`` in that slice are treated as padding.
    """
    y = np.zeros(N, dtype=np.float32)

    # Integer division is exact; int(N / slice_height) goes through a float.
    slice_count = N // slice_height
    # Pick up the trailing partial slice only if slice_ptr actually describes
    # it, so inputs that only list the full slices behave as they always did.
    if N % slice_height != 0 and len(slice_ptr) > slice_count + 1:
        slice_count += 1

    for s in range(slice_count):
        row_start = s * slice_height
        # Never write past the last real row of a padded final slice.
        row_end = min(row_start + slice_height, N)
        slice_begin = slice_ptr[s]
        slice_end = slice_ptr[s + 1]

        for r in range(row_start, row_end):
            # Accumulate in a local scalar and round to float32 once per row.
            acc = 0.0
            # Stride by slice_height: consecutive entries of one row are one
            # slice column apart in the column-major slice storage.
            for i in range(slice_begin + (r - row_start), slice_end, slice_height):
                acc += x[colidx[i]] * val[i]
            y[r] = acc

    return y

# Only the `numba` module is imported in this cell, so the decorator is
# reached through it rather than as a bare `jit`.
@numba.jit(nopython=True, parallel=True, nogil=True, fastmath=True)
def sliced_ellpack_spmv_multi_thread(y, N, slice_count, slice_ptr, colidx, val, x, slice_height):
    """Parallel sparse matrix-vector product ``y = A @ x`` for a sliced ELLPACK matrix.

    Slices are distributed over threads with ``numba.prange``.  Each slice
    writes only its own rows of ``y``, so iterations do not touch the same
    output elements.

    Slice ``s`` owns entries ``slice_ptr[s]:slice_ptr[s + 1]`` of
    ``colidx``/``val``, stored column by column: the entries of the row at
    offset ``r`` inside the slice are found at ``slice_ptr[s] + r``,
    ``slice_ptr[s] + r + slice_height`` and so on.  Every stored entry is
    multiplied, so padding entries are expected to carry a value of zero.

    Parameters
    ----------
    y : 1-D numpy.ndarray
        Output vector, overwritten in place.  Its dtype decides the precision
        of the stored result.
    N : int
        Number of matrix rows.  Kept for interface compatibility; the bound
        actually enforced on row indices is ``y.shape[0]``.
    slice_count : int
        Number of slices to process.
    slice_ptr : array of int
        Start offset of every slice, followed by the end offset of the last one.
    colidx : array of int
        Column index of every stored entry.
    val : array of float
        Value of every stored entry.
    x : array of float
        Dense input vector, indexed by ``colidx``.
    slice_height : int
        Number of rows per slice.

    Returns
    -------
    numpy.ndarray
        The same array object passed in as ``y``.
    """
    row_limit = y.shape[0]

    for s in numba.prange(slice_count):
        row_idx = s * slice_height
        slice_begin = slice_ptr[s]
        slice_end = slice_ptr[s + 1]
        # A padded final slice may describe more rows than y can hold.
        rows_in_slice = min(slice_height, row_limit - row_idx)

        for r in range(rows_in_slice):
            row_data = 0.0
            # Stride by slice_height: consecutive entries of one row are one
            # slice column apart in the column-major slice storage.
            for i in range(slice_begin + r, slice_end, slice_height):
                row_data += x[colidx[i]] * val[i]
            # Store straight into y: no per-slice scratch array and no forced
            # rounding to float32 when y is float64.
            y[row_idx + r] = row_data

    return y

In [19]:
# Worked example: a 4x4 sparse matrix
#
#   [[a, 0, b, 0],
#    [0, c, 0, d],
#    [0, e, 0, 0],
#    [0, 0, 0, f]]
#
# with a = 1.11, b = 3.33, c = 2.22, d = 4.44, e = 5.55, f = 6.66.
#
# With two rows per slice the nonzeros are grouped as
#   slice 1 (rows 0-1): [[a, b], [c, d]]
#   slice 2 (rows 2-3): [[e], [f]]
# and every slice is written out column by column: a, c, b, d, then e, f.

slice_height = 2
N = 4  # number of rows

# Offsets into colidx/val where each slice starts, plus the end of the last slice.
slice_ptr = np.array([0, 4, 6])

# Column index of every stored entry.
colidx = np.array([0, 1, 2, 3, 1, 3])

# Stored entries: nonzero values and padded zeros (this example needs no padding).
val = np.array(
    [1.11, 2.22, 3.33, 4.44, 5.55, 6.66],
    dtype=np.float64,
)

# Dense vector to multiply with.
x = np.array(
    [2.22, 3.33, 4.44, 5.55],
    dtype=np.float64,
)

In [30]:
# Time one call of the single-threaded kernel on the example inputs; the result is kept in `y`.
%time y = sliced_ellpack_spmv_single_thread(N, slice_ptr, colidx, val, x, slice_height)

CPU times: user 43 µs, sys: 1e+03 ns, total: 44 µs
Wall time: 46 µs
