In [1]:
import numpy as np
import math
import time
from numba import cuda

In [2]:
@cuda.jit
def matmul_numba(A,B,C):
    """Naive CUDA matrix product: each thread fills one entry of C.

    The thread's 2-D grid position (row, col) selects the entry C[row, col],
    which is set to the sum over k of A[row, k] * B[k, col]. Threads whose
    position falls outside C's bounds do nothing.
    """
    row, col = cuda.grid(2)
    # The launch grid may overshoot the matrix; surplus threads exit early.
    if row >= C.shape[0] or col >= C.shape[1]:
        return
    acc = 0.
    inner = A.shape[1]
    for k in range(inner):
        acc += A[row, k] * B[k, col]
    C[row, col] = acc

In [7]:
@cuda.jit
def matmul_numba2(A,B,C):
    i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    j = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
    if i < C.shape[0] and j <C.shape[1]:
        tmp = 0.
        for k in range(A.shape[1]):
            tmp += A[i,k] * B[k,j]
        C[i,j] = tmp

In [8]:
# Fixtures for the small correctness check: two seeded n x n float64 matrices
# and a zero-filled output buffer of the same shape.
n = 50
np.random.seed(2022)
# Draw order matters for reproducibility: A is sampled first, then B.
A = np.random.randn(n, n).astype(np.float64)
B = np.random.randn(n, n).astype(np.float64)
C = np.zeros_like(A, dtype=np.float64)

In [9]:
# Launch geometry: 32x32 threads per block, and enough blocks along each of
# the first two axes to cover all n rows/columns (rounded up).
TPB = (32, 32, 1)
block = math.ceil(n / TPB[0])
BPG = (block, block, 1)

In [13]:
# Launch matmul_numba2 with BPG blocks per grid and TPB threads per block;
# the kernel writes its result into C.
matmul_numba2[BPG,TPB](A,B,C)

In [14]:
# Check the dtype of the result array after the kernel launch.
C.dtype

dtype('float64')

In [15]:
# Inspect the first row of the kernel's output.
C[0]

array([-4.26516861e+00,  4.47178964e+00,  2.85671449e+00,  1.14224092e+00,
       -6.48794556e+00, -4.84272861e+00, -6.42310726e+00,  4.00891832e+00,
       -4.56629541e+00, -3.96130001e+00,  3.15290990e+00, -2.86421472e+00,
        1.52663492e+01,  8.84744005e+00,  1.38223755e+01, -1.13033539e+01,
        1.79882781e+00,  1.30616217e+00, -9.81496667e-01, -1.20863747e+00,
        1.23914394e+01, -3.92721535e+00, -4.40115427e+00,  5.57401826e+00,
       -4.96962847e+00,  5.68972372e+00, -1.05804794e+01, -1.31119699e+01,
       -2.04478926e+00, -3.00779215e+00, -1.16682453e+00, -6.65407033e+00,
        3.67307120e+00, -5.86803479e+00,  6.08246045e+00,  9.80986417e+00,
       -3.25950414e-01, -4.47629772e+00, -3.13733059e-01, -9.22952897e-03,
        1.82561042e+00, -1.71015296e+00,  4.27159228e+00,  1.12619803e+01,
       -6.88113315e+00, -1.16252493e+00, -7.94300161e+00, -7.85877981e+00,
        4.65615654e-01,  1.01233305e+01])

In [7]:
# Host-side reference product computed with NumPy; its first row is displayed
# so it can be compared by eye with the first row of C.
D = A @ B
D[0]

array([-4.26516861e+00,  4.47178964e+00,  2.85671449e+00,  1.14224092e+00,
       -6.48794556e+00, -4.84272861e+00, -6.42310726e+00,  4.00891832e+00,
       -4.56629541e+00, -3.96130001e+00,  3.15290990e+00, -2.86421472e+00,
        1.52663492e+01,  8.84744005e+00,  1.38223755e+01, -1.13033539e+01,
        1.79882781e+00,  1.30616217e+00, -9.81496667e-01, -1.20863747e+00,
        1.23914394e+01, -3.92721535e+00, -4.40115427e+00,  5.57401826e+00,
       -4.96962847e+00,  5.68972372e+00, -1.05804794e+01, -1.31119699e+01,
       -2.04478926e+00, -3.00779215e+00, -1.16682453e+00, -6.65407033e+00,
        3.67307120e+00, -5.86803479e+00,  6.08246045e+00,  9.80986417e+00,
       -3.25950414e-01, -4.47629772e+00, -3.13733059e-01, -9.22952897e-03,
        1.82561042e+00, -1.71015296e+00,  4.27159228e+00,  1.12619803e+01,
       -6.88113315e+00, -1.16252493e+00, -7.94300161e+00, -7.85877981e+00,
        4.65615654e-01,  1.01233305e+01])

In [1]:
import numpy as np
import math
import time
from numba import cuda
from numba import int32, float32, float64

@cuda.jit
def matmul_numba(A,B,C):
    """Compute one entry of the matrix product per CUDA thread.

    Each thread takes its 2-D grid position (r, c) and stores
    sum over k of A[r, k] * B[k, c] into C[r, c]. A thread positioned
    outside C's bounds returns without writing.
    """
    r, c = cuda.grid(2)
    # Bounds guards: the grid can be larger than the matrix in either axis.
    if r >= C.shape[0]:
        return
    if c >= C.shape[1]:
        return
    dot = 0.
    for k in range(A.shape[1]):
        dot += A[r, k] * B[k, c]
    C[r, c] = dot
        
# Benchmark: time `niter` launches of matmul_numba on seeded n x n float64
# matrices and store the per-iteration wall time in comp_time_numba.
np.random.seed(2022)  # fixed seed so every run multiplies the same matrices
n = 2500
A = np.random.randn(n,n).astype(np.float64)
B = np.random.randn(n,n).astype(np.float64)
niter = 11
comp_time_numba = np.zeros(niter)

# The launch geometry is identical for every iteration, so build it once,
# outside the timed region: 32x32 threads per block and enough blocks per
# axis to cover all n rows/columns.
TPB = (32,32,1)
block = math.ceil(n/TPB[0])
BPG = (block,block,1)

for i in range(niter):

    # Fresh zeroed output buffer per iteration; zeros_like already inherits
    # A's float64 dtype, so no extra astype copy is needed.
    C = np.zeros_like(A)
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring intervals (time.time can jump if the system clock is adjusted).
    t1 = time.perf_counter()
    # NOTE(review): A, B and C are host NumPy arrays, so the measured interval
    # presumably includes host<->device transfers as well as the kernel
    # itself -- confirm this end-to-end timing is what is intended.
    matmul_numba[BPG,TPB](A,B,C)
    t2 = time.perf_counter()
    comp_time_numba[i] = t2-t1

    # NOTE(review): "Collapsed" is presumably a typo for "Elapsed"; the string
    # is left unchanged so it still matches the recorded output below.
    print('\n ',i+1,'-th iteration, Collapsed Time: ', comp_time_numba[i])
    



  1 -th iteration, Collapsed Time:  0.8169958591461182

  2 -th iteration, Collapsed Time:  0.47506141662597656

  3 -th iteration, Collapsed Time:  0.4832158088684082

  4 -th iteration, Collapsed Time:  0.4760856628417969

  5 -th iteration, Collapsed Time:  0.4855058193206787

  6 -th iteration, Collapsed Time:  0.4770340919494629

  7 -th iteration, Collapsed Time:  0.48311758041381836

  8 -th iteration, Collapsed Time:  0.4770841598510742

  9 -th iteration, Collapsed Time:  0.4836595058441162

  10 -th iteration, Collapsed Time:  0.4772765636444092

  11 -th iteration, Collapsed Time:  0.4832596778869629


In [2]:
# Mean time over iterations 2..niter, rounded to 4 decimals. The first
# iteration is left out: it is the slowest in the recorded output
# (presumably because it includes JIT compilation).
np.round(comp_time_numba[1:].mean(), 4)

0.4801

In [3]:
# NOTE(review): 0.4801 matches the mean time shown in the previous cell; the
# origin of 0.4689 is not shown in this notebook -- TODO name both values
# instead of hard-coding them.
0.4801 - 0.4689

0.011200000000000043

In [4]:
# NOTE(review): neither 0.1611 nor 0.0834 is produced anywhere in the visible
# notebook -- TODO record which runs these timings came from.
0.1611 - 0.0834

0.07769999999999999