In [1]:
from numba import jit, cuda, float32
import numpy as np

# 흐음..

In [None]:
@jit(nopython=True)
def inner_product_for_grad(x, y, b):
    out = 0.
    
    for i in range(x.size):
        out += x[i] * y[i]
    
    out -= b

    return out

BPG = 24
TPB = 16

@cuda.jit
def gradient(A, x, b, out):
    sA = cuda.shared.array(shape=(TPB,TPB), dtype=float32)
    sB = cuda.shared.array(shape=(TPB), dtype=float32)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    
    BPG = cuda.gridDim.x

    if tx < TPB and ty < TPB:
        tmp = 0.
        for j in range(BPG):
            sA[tx,ty] = A.T[tx + bx * TPB,ty + j * TPB]
            sB[tx] = inner_product_for_grad(A[tx + j * TPB,:], x, b[tx + j * TPB])

            cuda.syncthreads()
            
            for k in range(TPB):
                tmp += sA[tx,k] * sB[k]

            cuda.syncthreads()

        out[tx + bx * TPB] = tmp

In [None]:
n = BPG * TPB
A = np.random.randn(n,n)
b = np.random.randn(n)
x = np.random.randn(n)
out = np.zeros((n))

A_ = cuda.to_device(A)
b_ = cuda.to_device(b)
x_ = cuda.to_device(x)
out_ = cuda.to_device(out)

print(n)

384


In [None]:
A.nbytes / (1024**2)

1.125

In [None]:
gradient[(BPG,BPG),(TPB,TPB)](A_, x_, b_, out_)

In [None]:
(A.T@(A@x - b))[:10]

array([-533.33870601,  242.89795711,  -88.3093338 ,  712.18969062,
        440.16761357,   56.21602087, -521.05745474, -295.88752422,
       1351.90868418,  114.93493377])

In [None]:
out_cpu = out_.copy_to_host()

In [None]:
print(np.linalg.norm(out_cpu - (A.T@(A@x - b))))

0.00033822569930007113


In [None]:
%%time
for i in range(500):
    gradient[(BPG,BPG),(TPB,TPB)](A_,x_,b_,out_)

CPU times: user 307 ms, sys: 33.7 ms, total: 340 ms
Wall time: 411 ms


# 핫!

In [2]:
np.random.randn(16,16).nbytes / 1024

2.0

In [53]:
@jit(nopython=True)
def inner_product_for_grad(x, y, b):
    out = 0.
    
    for i in range(x.size):
        out += x[i] * y[i]
    
    out -= b

    return out

BPG = 32
TPB = 16

@cuda.jit
def gradient(A, x, b, out, lr, iter):
    sA = cuda.shared.array(shape=(TPB,TPB), dtype=float32)
    sB = cuda.shared.array(shape=(TPB), dtype=float32)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    
    BPG = cuda.gridDim.x

    if tx < TPB and ty < TPB:
        tmp = 0.
        for j in range(BPG):
            sA[tx,ty] = A.T[tx + bx * TPB,ty + j * TPB]
            sB[tx] = inner_product_for_grad(A[tx + j * TPB,:], x[:,iter], b[tx + j * TPB])

            cuda.syncthreads()
            
            for k in range(TPB):
                tmp += sA[tx,k] * sB[k]


            cuda.syncthreads()

        out[tx + bx * TPB,iter] = tmp * 2e-4

## vector subtraction
@cuda.jit
def vector_sub(x, y, out):
    
    tx = cuda.threadIdx.x
    bx = cuda.threadIdx.x

    i = tx + bx * TPB
    
    if i < x.size:
        out[i] = x[i] - y[i]

In [54]:
n = BPG * TPB
A = np.random.randn(n,n)
b = np.random.randn(n)
xp = np.random.randn(n)
x = np.zeros((n,100),dtype=np.float32)
lr = np.ones(1) * 2 * 1e-4

A_ = cuda.to_device(A)
b_ = cuda.to_device(b)
x_ = cuda.to_device(x)
out_ = cuda.device_array((x.shape[0],100), dtype=np.float32)
lr_ = cuda.to_device(lr)

print(n) 

512


In [56]:
cuda.synchronize()
%time gradient[(BPG,BPG),(TPB,TPB)](A_, x_, b_, out_,lr_,1)
cuda.synchronize()
grad = A.T @ (A @ x[:,0] - b) * 2e-4
x[:,0] -= grad

%time x_gpu = x_[:,1].copy_to_host() - out_[:,1].copy_to_host()
print(np.linalg.norm(out_[:,1].copy_to_host() - grad))
print(np.linalg.norm(x_gpu - x[:,1]))

CPU times: user 0 ns, sys: 579 µs, total: 579 µs
Wall time: 587 µs
CPU times: user 7.5 ms, sys: 4.14 ms, total: 11.6 ms
Wall time: 6.16 ms
0.021950241157646346
0.09805607


In [58]:
n = BPG * TPB
A = np.random.randn(n,n)
b = np.random.randn(n)
xp = np.random.randn(n)
x = np.zeros((n,100),dtype=np.float32)
lr = np.ones(1) * 2 * 1e-4

A_ = cuda.to_device(A)
b_ = cuda.to_device(b)
x_ = cuda.to_device(x)
out_ = cuda.device_array((x.shape[0],100), dtype=np.float32)
lr_ = cuda.to_device(lr)

print(n)

512


In [60]:
%%time
for iter in range(100):
    gradient[(BPG,BPG),(TPB,TPB)](A_,x_,b_,out_,lr_,iter)
cuda.synchronize()

KeyboardInterrupt: ignored