In [1]:
from numba import cuda, jit, float32
import numpy as np

In [2]:
cuda.detect()

Found 1 CUDA devices
id 0    b'NVIDIA GeForce GTX 1050'                              [SUPPORTED]
                      Compute Capability: 6.1
                           PCI Device ID: 0
                              PCI Bus ID: 1
                                    UUID: GPU-cba7719a-f6c0-d797-af25-5149df6fbfa6
                                Watchdog: Enabled
                            Compute Mode: WDDM
             FP32/FP64 Performance Ratio: 32
Summary:
	1/1 devices are supported


True

In [22]:
A = np.random.rand(80,80)
x = np.random.rand(80)
b = np.random.rand(80)
out = np.zeros((80))

A_ = cuda.to_device(A)
x_ = cuda.to_device(x)
b_ = cuda.to_device(b)
out_ = cuda.to_device(out)

In [23]:
TPB = 10

@jit(nopython=True)
def inner_product_for_grad(x, y, b):
    out = 0.
    
    for i in range(x.size):
        out += x[i] * y[i]
    
    out -= b

    return out

@cuda.jit
def indexing(A, x, b, out):
    sB = cuda.shared.array(shape=(TPB), dtype=float32)

    tx = cuda.threadIdx.x
    bpg = cuda.gridDim.x

    if tx < 10:
        tmp = 0.
        for j in range(bpg):
            sB[tx] = inner_product_for_grad(A[tx + j * TPB,:], x, b[tx + j * TPB])

            out[tx + j * TPB] = sB[tx]

In [24]:
indexing[(8,8),(TPB,TPB)](A_, x_, b_, out_)
print(out_.copy_to_host())
print()
print((A@x - b))

[24.50805664 20.56057549 21.91553497 22.9709034  24.55823517 19.34800529
 23.01828003 21.51793861 23.28281403 22.65135574 20.53459167 25.74786758
 21.17397499 21.43156242 21.51019859 21.69621468 21.70463753 20.47000694
 22.67274666 22.10284042 24.3195591  20.1792202  21.58756447 24.07234192
 21.56954575 19.90351677 20.04043007 19.09234619 23.84860039 22.37267494
 22.72931099 19.66017151 21.95989418 20.17849541 20.28871536 21.76450539
 20.96751785 23.55218887 17.82385445 20.06032944 21.99184418 21.74591827
 21.96803093 25.05153656 18.7810955  25.54102135 28.0754776  20.14127922
 23.95279503 23.72281265 21.80297089 20.590765   18.64358902 20.70512581
 24.29169464 22.96321106 19.10893822 23.09531784 20.38539886 22.45574951
 23.53698158 23.74315453 21.91783142 21.50679779 23.90076256 24.13824272
 22.87413597 20.28474426 22.96225166 26.11632919 20.25970459 20.0115509
 21.97161484 22.74934006 23.90742683 22.02205849 21.74355507 23.61950874
 22.87112045 24.42792892]

[24.50805734 20.56057573 

In [25]:
BPG = 16
TPB = 16

@cuda.jit
def gradient(A, x, b, out):
    sA = cuda.shared.array(shape=(TPB,TPB), dtype=float32)
    sB = cuda.shared.array(shape=(TPB), dtype=float32)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    
    BPG = cuda.gridDim.x

    if tx < TPB and ty < TPB:
        tmp = 0.
        for j in range(BPG):
            sA[tx,ty] = A.T[tx + bx * TPB,ty + j * TPB]
            sB[tx] = inner_product_for_grad(A[tx + j * TPB,:], x, b[tx + j * TPB])

            cuda.syncthreads()
            
            for k in range(TPB):
                tmp += sA[tx,k] * sB[k]

            cuda.syncthreads()

        out[tx + bx * TPB] = tmp

In [26]:
n = BPG * TPB
A = np.random.rand(n,n)
b = np.random.rand(n)
x = np.random.rand(n)
out = np.zeros((n))

A_ = cuda.to_device(A)
b_ = cuda.to_device(b)
x_ = cuda.to_device(x)
out_ = cuda.to_device(out)

In [27]:
A.nbytes / (1024**2)

0.5

In [28]:
gradient[(BPG,BPG),(TPB,TPB)](A_, x_, b_, out_)

In [30]:
(A.T@(A@x - b))[:10]

array([7451.62881183, 7514.33165698, 7677.58085246, 7762.33650327,
       7590.29875586, 7674.12386132, 7794.11419543, 6828.3730692 ,
       7779.08248531, 7633.12741157])

In [31]:
out_cpu = out_.copy_to_host()

In [32]:
print(np.linalg.norm(out_cpu - (A.T@(A@x - b))))

0.0002786993408952738


In [33]:
@cuda.jit
def optimizer(A, x, b, out):
    sA = cuda.shared.array(shape=(TPB,TPB), dtype=float32)
    sB = cuda.shared.array(shape=(TPB), dtype=float32)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    
    BPG = cuda.gridDim.x

    if tx < TPB and ty < TPB:
        tmp = 0.
        for j in range(BPG):
            sA[tx,ty] = A.T[tx + bx * TPB,ty + j * TPB]
            sB[tx] = inner_product_for_grad(A[tx + j * TPB,:], x, b[tx + j * TPB])

            cuda.syncthreads()
            
            for k in range(TPB):
                tmp += sA[tx,k] * sB[k]

            cuda.syncthreads()

        out[tx + bx * TPB] = tmp

In [34]:
lr = 1e-3 / A.shape[1]

In [35]:
grad = A.T @ (A @ x - b)
optimizer[(BPG,BPG),(TPB,TPB)](A_,x_,b_,out_)

In [36]:
out_cpu = out_.copy_to_host()
np.linalg.norm(grad - out_cpu)

0.0002786993408952738

In [37]:
x -= grad * lr
x_ -= out * lr

In [38]:
print(x[:10])
print()
print(x_[:10])
print(np.linalg.norm(x - x_))

[0.79443    0.37814377 0.64059166 0.51961756 0.67257208 0.23287456
 0.55207069 0.74815953 0.90843225 0.03256296]

[0.82353792 0.40749663 0.67058221 0.54993919 0.70222169 0.26285161
 0.58251645 0.77483286 0.93881929 0.06237987]
0.47490811295025914


In [39]:
%%time 
for i in range(500):
    grad = A.T @ (A @ x - b)

Wall time: 13 ms


In [40]:
%%time
for i in range(500):
    gradient[(BPG,BPG),(TPB,TPB)](A_,x_,b_,out_)



Wall time: 10.4 s


In [None]:
## Using one GPU 
class LeastSquare():
    def __init__(self, A, b, epoches=10, TPB=16):
        self.A = A
        self.b = b
        self.lr = 1e-3/A.shape[1]
        self.epoches = epoches
        self.x = cuda.to_device(np.random.rand(A.shape[1]))
        self.x_hat = cuda.device_array((A.shape[1]))
        self.error_list = []
        self.grad = cuda.device_array((A.shape[1]))

        ## About kernel, Configure the blocks
        self.threadsperblock = (TPB,TPB) 
        blockspergrid_x = int(np.ceil(A.shape[0] / self.threadsperblock[1]))
        blockspergrid_y = int(np.ceil(A.shape[1] / self.threadsperblock[0]))
        self.blockspergrid = (blockspergrid_x, blockspergrid_y)
        
    def run(self):
        for i in range(self.epoches):
            A, b = self.initialize()
            self.optimize(A, b, self.x)

        return self.x_hat

    def initialize(self):
        index = np.random.choice(self.A.shape[0], 1000)
        A = cuda.to_device(self.A[index,:])
        b = cuda.to_device(self.b[index])

        return A, b

    def optimize(self, A, b, x, iters_per_epoch=500):
        
        for i in range(iters_per_epoch):
            optimizer[self.blockspergrid, self.threadsperblock](A, x, b, lr)

    def check(self, x):
        b_hat = self.A @ x
        error = np.linalg.norm(self.b - b_hat)
        self.error_list.append(error)

        return error