In [3]:
from numba import jit, cuda, float32
import numpy as np

In [10]:
@jit(nopython=True)
def inner_product_for_grad(x, y, b):
    """Return ``sum(x[i] * y[i]) - b``, i.e. a dot product shifted by ``b``.

    Despite the name this is not a bare inner product: the scalar ``b`` is
    subtracted from the accumulated sum before returning.

    Parameters
    ----------
    x, y : indexable sequences
        Multiplied element by element. ``x.size`` sets the number of terms,
        so ``y`` has to provide at least that many entries.
    b : scalar
        Offset removed from the dot product.

    Returns
    -------
    The running sum (started from the float literal ``0.``) minus ``b``.
    """
    n_terms = x.size
    acc = 0.

    # Plain left-to-right accumulation; the summation order is part of the
    # floating-point result, so it is kept sequential.
    for idx in range(n_terms):
        acc = acc + x[idx] * y[idx]

    return acc - b

# Launch geometry shared by the kernel and the cells that call it.
# BPG: number of blocks along each axis of the launch grid.
BPG = 16
# TPB: number of threads along each axis of a block; the `gradient` kernel
# also uses it as the edge length of its shared-memory tiles.
TPB = 24

@cuda.jit
def gradient(A, x, b, out):
    """CUDA kernel computing ``out = A.T @ (A @ x - b)`` tile by tile.

    Launch with blocks of (TPB, TPB) threads. Block ``bx`` along grid-x
    produces the TPB entries ``out[bx*TPB : (bx+1)*TPB]``. The reduction
    walks ``cuda.gridDim.x`` tiles of TPB rows each, and no index is bounds
    checked, so the kernel assumes A is square with side ``gridDim.x * TPB``
    and that x, b and out have that same length.

    Parameters
    ----------
    A : 2-D device array
    x, b : 1-D device arrays
    out : 1-D device array, overwritten with the result

    Notes
    -----
    * Tiles are staged in float32 shared memory, so wider inputs are rounded
      to single precision before being multiplied; the running sum itself
      starts from the float literal ``0.``.
    * Only blocks with ``blockIdx.y == 0`` do any work, and inside a block
      only the ``ty == 0`` thread of each column computes the residual,
      reduces and stores. Every other thread/block would produce exactly the
      same values, so skipping them removes redundant global-memory traffic
      and the duplicate writes to ``sB`` and ``out`` without changing the
      result.
    """
    sA = cuda.shared.array(shape=(TPB,TPB), dtype=float32)
    sB = cuda.shared.array(shape=(TPB), dtype=float32)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x

    # The output index depends only on (tx, bx); blocks further along grid-y
    # would just repeat the same work. The whole block leaves together, so no
    # thread is left waiting at a barrier.
    if cuda.blockIdx.y != 0:
        return

    # Number of TPB-row tiles to reduce over (tied to the launch grid).
    n_tiles = cuda.gridDim.x

    # Threads outside the TPB x TPB tile (possible only with an oversized
    # block) touch no memory but still take part in the barriers below.
    in_tile = tx < TPB and ty < TPB
    # Exactly one thread per output entry owns the residual, the reduction
    # and the final store.
    owner = in_tile and ty == 0

    acc = 0.
    for j in range(n_tiles):
        if in_tile:
            # Stage one TPB x TPB tile of A.T, one element per thread.
            sA[tx, ty] = A.T[tx + bx * TPB, ty + j * TPB]
        if owner:
            # Residual (A @ x - b) for row tx of the current tile.
            sB[tx] = inner_product_for_grad(A[tx + j * TPB, :], x, b[tx + j * TPB])

        # Both tiles must be fully written before anyone reads them.
        cuda.syncthreads()

        if owner:
            for k in range(TPB):
                acc += sA[tx, k] * sB[k]

        # Do not let the next iteration overwrite tiles still being read.
        cuda.syncthreads()

    if owner:
        out[tx + bx * TPB] = acc

In [18]:
# Problem size: one matrix row/column per thread along a grid axis.
n = BPG * TPB

# Random host-side problem data (drawn in the order A, b, x) and a zeroed
# buffer that will receive the result.
A = np.random.randn(n, n)
b = np.random.randn(n)
x = np.random.randn(n)
out = np.zeros(n)

# Device-side copies; the trailing underscore marks the GPU counterpart of
# each host array.
A_, b_, x_, out_ = [cuda.to_device(host_arr) for host_arr in (A, b, x, out)]

In [19]:
# Memory footprint of A in MiB (bytes divided by 1024**2).
A.nbytes / (1024**2)

1.125

In [20]:
# Launch the kernel on a BPG x BPG grid of TPB x TPB thread blocks; the
# result is written into the device buffer out_.
gradient[(BPG,BPG),(TPB,TPB)](A_, x_, b_, out_)

In [21]:
# NumPy reference for the same quantity, A.T @ (A @ x - b); show the first
# ten entries only.
(A.T@(A@x - b))[:10]

array([-228.28756498,  812.84408078,   50.80270226,  889.1304349 ,
         11.4467005 , -729.60431778, -312.57329451, -575.62741395,
       -680.77692563, -586.5735268 ])

In [26]:
# Bring the kernel's output buffer back from the device as a host array.
out_cpu = out_.copy_to_host()

In [23]:
# Norm of the difference between the kernel result and the NumPy reference.
print(np.linalg.norm(out_cpu - (A.T@(A@x - b))))

0.00036926049609955136


In [24]:
%%time 
# CPU baseline: 500 evaluations of the gradient with NumPy on the host arrays.
for i in range(500):
    grad = A.T @ (A @ x - b)

Wall time: 26 ms


In [25]:
%%time
for i in range(500):
    gradient[(BPG,BPG),(TPB,TPB)](A_,x_,b_,out_)

Wall time: 48 ms
