In [1]:
from numba import cuda, jit, float32
import numpy as np

In [2]:
@jit(nopython=True)
def inner_product_for_grad(x, y, b):
    """Return ``sum(x[i] * y[i]) - b`` using an explicit scalar loop.

    The loop runs over ``x.size`` elements, so ``y`` has to provide at least
    that many entries. The CUDA kernels in this notebook call this helper once
    per matrix row, passing the row as ``x`` and the matching entry of the
    right-hand side as ``b``.
    """
    acc = 0.

    # Left-to-right accumulation; the offset is subtracted once at the end.
    for idx in range(x.size):
        acc = acc + x[idx] * y[idx]

    return acc - b

@cuda.jit
def each_thread(A, x, b, out):
    """Indexing demo: every in-range thread adds its flat global index to ``out``.

    Intended for a 1-D grid of 2-D blocks. The flat index is derived from the
    actual block shape, so the kernel works for any ``(bdx, bdy)`` block and
    not only for the ``(16, 16)`` one used in this notebook.

    Parameters
    ----------
    A : 2-D device array; only ``A.shape[0]`` is read, as the element count.
    x, b : unused here; kept so the signature matches ``each_thread_grad``.
    out : 1-D device array, updated in place with ``out[i] += i``.
    """
    threads_per_block = cuda.blockDim.x * cuda.blockDim.y

    # Row-major position of this thread inside its block ...
    local = cuda.threadIdx.x + cuda.threadIdx.y * cuda.blockDim.x
    # ... shifted by the number of threads in all preceding blocks.
    i = local + cuda.blockIdx.x * threads_per_block

    # The launch may supply more threads than there are rows; surplus threads idle.
    if i < A.shape[0]:
        out[i] += i

In [3]:
# Host-side inputs for the indexing demo: a 100 x 100 matrix, two length-100
# vectors and a zeroed output buffer.
A = np.random.randn(100, 100)
x = np.random.randn(100)
b = np.random.randn(100)
out = np.zeros(100)

# Device copies carry a trailing underscore.
A_, x_, b_, out_ = (cuda.to_device(host) for host in (A, x, b, out))

In [4]:
# 4 blocks of 16 x 16 threads = 1024 threads, enough to cover the 100 rows of A.
# The kernel does `out[i] += i`, so out_ accumulates if this cell is re-run
# without re-creating out_ first.
each_thread[(4,), (16, 16)](A_, x_, b_, out_)
result = out_.copy_to_host()
print(result)

[ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17.
 18. 19. 20. 21. 22. 23. 24. 25. 26. 27. 28. 29. 30. 31. 32. 33. 34. 35.
 36. 37. 38. 39. 40. 41. 42. 43. 44. 45. 46. 47. 48. 49. 50. 51. 52. 53.
 54. 55. 56. 57. 58. 59. 60. 61. 62. 63. 64. 65. 66. 67. 68. 69. 70. 71.
 72. 73. 74. 75. 76. 77. 78. 79. 80. 81. 82. 83. 84. 85. 86. 87. 88. 89.
 90. 91. 92. 93. 94. 95. 96. 97. 98. 99.]


In [5]:
@cuda.jit
def each_thread_grad(A, x, b, out):
    """Compute the residual ``out = A @ x - b`` with one thread per row of ``A``.

    Intended for a 1-D grid of 2-D blocks. The flat row index is derived from
    the actual block shape, so any ``(bdx, bdy)`` block works as long as the
    launch supplies at least ``A.shape[0]`` threads in total.

    Parameters
    ----------
    A : 2-D device array.
    x : 1-D device array multiplied against each row of ``A``.
    b : 1-D device array; ``b[i]`` is subtracted from the i-th row product.
    out : 1-D device array; ``out[i]`` is overwritten with the i-th residual.
    """
    threads_per_block = cuda.blockDim.x * cuda.blockDim.y

    # Row-major position inside the block, shifted by all preceding blocks.
    local = cuda.threadIdx.x + cuda.threadIdx.y * cuda.blockDim.x
    i = local + cuda.blockIdx.x * threads_per_block

    # Threads beyond the last row have nothing to do.
    if i < A.shape[0]:
        out[i] = inner_product_for_grad(A[i, :], x, b[i])

In [6]:
# Larger 1000 x 1000 problem for the residual kernel.
A = np.random.randn(1000, 1000)
x = np.random.randn(1000)
b = np.random.randn(1000)
out = np.zeros(1000)

# Upload each host array; device handles carry a trailing underscore.
A_, x_, b_, out_ = map(cuda.to_device, (A, x, b, out))

In [7]:
# Run the GPU residual kernel (4 * 256 = 1024 threads for 1000 rows), then
# build the NumPy reference and check that the two agree.
each_thread_grad[(4,), (16, 16)](A_, x_, b_, out_)
result_gpu = out_.copy_to_host()
result_cpu = np.matmul(A, x) - b
np.allclose(result_cpu, result_gpu)

True

In [11]:
# First ten residuals copied back from the GPU, for a visual comparison with the CPU values.
result_gpu[:10]

array([ 24.85584052,  -9.40254008,  60.50976213,  65.5607845 ,
       -13.80644096,  48.65847318,  -4.31607246, -15.15128736,
        46.93511241, -20.30497489])

In [12]:
# First ten residuals of the NumPy reference `A @ x - b`.
result_cpu[:10]

array([ 24.85584052,  -9.40254008,  60.50976213,  65.5607845 ,
       -13.80644096,  48.65847318,  -4.31607246, -15.15128736,
        46.93511241, -20.30497489])

In [10]:
%timeit A.T @ (A @ x - b)
%timeit each_thread_grad[4,(16,16)](A_,x_,b_,out_)

1000 loops, best of 5: 460 µs per loop
1000 loops, best of 5: 427 µs per loop


In [13]:
BPG = 32  # blocks per grid along x used when launching `gradient`
TPB = 32  # threads per block along each axis; also the edge of the shared tiles

@cuda.jit
def gradient(A, x, b, out, lr):
    """Compute the scaled gradient ``out = lr[0] * A.T @ (A @ x - b)``.

    Launch requirements (nothing is bounds-checked):

    * the block must be exactly ``(TPB, TPB)`` threads;
    * grid x must equal ``A.shape[0] // TPB``, because it sets both the output
      rows a block owns and the number of tiles in the reduction, so ``A`` is
      expected to be square with a side that is a multiple of ``TPB``;
    * only ``blockIdx.x`` is read, so a grid y extent greater than 1 merely
      repeats the same work.

    Parameters
    ----------
    A : 2-D device array.
    x, b : 1-D device arrays.
    out : 1-D device array; block ``bx`` overwrites entries
        ``bx * TPB`` to ``bx * TPB + TPB - 1``.
    lr : 1-element device array holding the scale factor applied to the result.
    """
    # NOTE: both tiles are staged as float32, so float64 inputs are rounded to
    # single precision before the tile product is accumulated.
    sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
    sB = cuda.shared.array(shape=TPB, dtype=float32)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y

    # Output entry (a row of A.T, i.e. a column of A) handled by this thread.
    row = tx + cuda.blockIdx.x * TPB

    # Number of TPB-wide tiles along the reduction axis.
    n_tiles = cuda.gridDim.x

    tmp = 0.
    for j in range(n_tiles):
        # All TPB x TPB threads cooperate to stage one tile of A.T.
        sA[tx, ty] = A[ty + j * TPB, row]

        # The residual entry depends on tx only, so the ty == 0 threads compute
        # it once instead of every ty repeating the same full-length inner
        # product and writing the same shared slot.
        if ty == 0:
            sB[tx] = inner_product_for_grad(A[tx + j * TPB, :], x, b[tx + j * TPB])

        # Both tiles must be complete before any thread reads them.
        cuda.syncthreads()

        # The partial sum is also independent of ty; one thread per row is enough.
        if ty == 0:
            for k in range(TPB):
                tmp += sA[tx, k] * sB[k]

        # Do not overwrite the tiles while another thread may still be reading.
        cuda.syncthreads()

    if ty == 0:
        out[row] = tmp * lr[0]

In [30]:
# The problem size is taken from the launch constants.
n = TPB * BPG
A = np.random.randn(n, n)
b = np.random.randn(n)
x = np.random.randn(n)
lr = np.ones(1) * (2 * 1e-4)

# Upload the inputs; the output buffer is allocated on the device directly.
A_, b_, x_ = (cuda.to_device(host) for host in (A, b, x))
out_ = cuda.device_array(n)
lr_ = cuda.to_device(lr)

print(n)

1024


In [22]:
# Size of A in MiB.
A.nbytes / 2**20

8.0

In [31]:
%%time
gradient[(BPG,BPG),(TPB,TPB)](A_, x_, b_, out_,lr_)
x_gpu = x_.copy_to_host() - out_.copy_to_host()

CPU times: user 1.4 s, sys: 0 ns, total: 1.4 s
Wall time: 1.4 s


In [32]:
# NumPy reference for the scaled gradient and the resulting update of x.
grad = 2e-4 * (A.T @ (A @ x - b))
np.subtract(x, grad, out=x)  # in-place step: x is modified, so this cell is not idempotent

# Distance between the GPU and CPU results: first the gradient, then the updated x.
gpu_grad = out_.copy_to_host()
print(np.linalg.norm(gpu_grad - grad), np.linalg.norm(x_gpu - x), sep="\n")

2.817674180644351e-07
2.817674181341837e-07
