In [1]:
import numpy as np
import math
import time
from numba import cuda

In [2]:
# Ask Numba which CUDA devices it can see (sanity check before launching kernels).
cuda.list_devices()

<numba.cuda.cudadrv.devices._DeviceList at 0x7f949e24b940>

In [3]:
@cuda.jit
def matmul_numba(A,B,C):
    """CUDA kernel: each thread writes one element of the product C = A @ B.

    The thread at 2-D grid position (row, col) accumulates the dot product of
    row `row` of A with column `col` of B and stores it in C[row, col].
    Threads whose grid position falls outside C do nothing.
    """
    row, col = cuda.grid(2)
    # Guard: the launch grid may overshoot C, so surplus threads exit early.
    if row >= C.shape[0] or col >= C.shape[1]:
        return
    acc = 0.
    inner = A.shape[1]  # length of the shared (contracted) dimension
    for k in range(inner):
        acc += A[row, k] * B[k, col]
    C[row, col] = acc

In [4]:
# Build two reproducible 50 x 50 float32 operands and a zeroed output buffer.
n = 50
np.random.seed(2022)  # fixed seed so A and B are identical on every run
A = np.random.standard_normal((n, n)).astype(np.float32)
B = np.random.standard_normal((n, n)).astype(np.float32)
C = np.zeros((n, n), dtype=np.float32)  # receives the kernel's result

In [5]:
# Launch geometry: 32 x 32 threads per block, and enough blocks along each
# axis to cover all n rows/columns (integer ceiling of n / 32).
block = (n + 31) // 32
TPB = (32, 32, 1)
BPG = (block, block, 1)

In [6]:
# Launch the kernel with BPG blocks per grid and TPB threads per block; the product is written into C.
matmul_numba[BPG,TPB](A,B,C)

In [7]:
# Inspect the first row of the GPU result.
C[0]

array([-4.2651687e+00,  4.4717903e+00,  2.8567140e+00,  1.1422408e+00,
       -6.4879456e+00, -4.8427286e+00, -6.4231071e+00,  4.0089183e+00,
       -4.5662956e+00, -3.9613004e+00,  3.1529095e+00, -2.8642147e+00,
        1.5266349e+01,  8.8474398e+00,  1.3822375e+01, -1.1303353e+01,
        1.7988276e+00,  1.3061627e+00, -9.8149681e-01, -1.2086377e+00,
        1.2391439e+01, -3.9272151e+00, -4.4011540e+00,  5.5740175e+00,
       -4.9696283e+00,  5.6897235e+00, -1.0580480e+01, -1.3111970e+01,
       -2.0447898e+00, -3.0077920e+00, -1.1668246e+00, -6.6540704e+00,
        3.6730716e+00, -5.8680344e+00,  6.0824604e+00,  9.8098640e+00,
       -3.2595071e-01, -4.4762974e+00, -3.1373292e-01, -9.2296228e-03,
        1.8256105e+00, -1.7101529e+00,  4.2715921e+00,  1.1261980e+01,
       -6.8811331e+00, -1.1625252e+00, -7.9430013e+00, -7.8587799e+00,
        4.6561503e-01,  1.0123330e+01], dtype=float32)

In [8]:
# Confirm the output array is still float32.
C.dtype

dtype('float32')

In [9]:
# CPU reference product computed by NumPy from the same inputs.
D = np.dot(A,B)
# Check the whole GPU result against the reference instead of eyeballing one row.
# The tolerances leave room for float32 rounding differences between the two paths.
np.testing.assert_allclose(C, D, rtol=1e-4, atol=1e-4)
D[0]

array([-4.2651687e+00,  4.4717889e+00,  2.8567147e+00,  1.1422411e+00,
       -6.4879460e+00, -4.8427286e+00, -6.4231071e+00,  4.0089183e+00,
       -4.5662951e+00, -3.9613001e+00,  3.1529100e+00, -2.8642142e+00,
        1.5266350e+01,  8.8474398e+00,  1.3822375e+01, -1.1303352e+01,
        1.7988281e+00,  1.3061628e+00, -9.8149627e-01, -1.2086369e+00,
        1.2391439e+01, -3.9272153e+00, -4.4011531e+00,  5.5740185e+00,
       -4.9696288e+00,  5.6897259e+00, -1.0580478e+01, -1.3111972e+01,
       -2.0447896e+00, -3.0077918e+00, -1.1668241e+00, -6.6540699e+00,
        3.6730716e+00, -5.8680353e+00,  6.0824609e+00,  9.8098640e+00,
       -3.2595092e-01, -4.4762979e+00, -3.1373355e-01, -9.2294300e-03,
        1.8256099e+00, -1.7101533e+00,  4.2715926e+00,  1.1261982e+01,
       -6.8811331e+00, -1.1625246e+00, -7.9430013e+00, -7.8587794e+00,
        4.6561626e-01,  1.0123331e+01], dtype=float32)

In [10]:
import numpy as np
import math
import time
from numba import cuda

@cuda.jit
def matmul_numba(A,B,C):
    """CUDA kernel: each thread writes one element of the product C = A @ B.

    The thread at 2-D grid position (row, col) accumulates the dot product of
    row `row` of A with column `col` of B and stores it in C[row, col].
    Threads whose grid position falls outside C do nothing.
    """
    row, col = cuda.grid(2)
    # Guard: the launch grid may overshoot C, so surplus threads exit early.
    if row >= C.shape[0] or col >= C.shape[1]:
        return
    acc = 0.
    inner = A.shape[1]  # length of the shared (contracted) dimension
    for k in range(inner):
        acc += A[row, k] * B[k, col]
    C[row, col] = acc
        
# Benchmark: time `niter` launches of the kernel on 2500 x 2500 float32 inputs.
np.random.seed(2022)
n = 2500
A = np.random.randn(n,n).astype(np.float32)
B = np.random.randn(n,n).astype(np.float32)
niter = 11
comp_time_numba = np.zeros(niter)  # per-iteration elapsed seconds

# The launch configuration does not change between runs, so build it once
# instead of inside the timed region.
TPB = (32,32,1)
block = math.ceil(n/32)
BPG = (block,block,1)

for i in range(niter):
    # Fresh output buffer per run; zeros_like already inherits float32 from A.
    C = np.zeros_like(A)
    # perf_counter is monotonic and high-resolution, which suits interval
    # timing better than the wall-clock time.time().
    t1 = time.perf_counter()
    # NOTE: A, B and C are host NumPy arrays, so the measured interval also
    # covers Numba's automatic host<->device copies, not only the kernel.
    matmul_numba[BPG,TPB](A,B,C)
    # Block until the device is idle so the interval spans the whole launch.
    cuda.synchronize()
    t2 = time.perf_counter()
    comp_time_numba[i] = t2-t1

    print('\n ',i+1,'-th iteration, Collapsed Time: ', comp_time_numba[i])
    



  1 -th iteration, Collapsed Time:  0.6456441879272461

  2 -th iteration, Collapsed Time:  0.46421241760253906

  3 -th iteration, Collapsed Time:  0.47124338150024414

  4 -th iteration, Collapsed Time:  0.466397762298584

  5 -th iteration, Collapsed Time:  0.46881842613220215

  6 -th iteration, Collapsed Time:  0.46660542488098145

  7 -th iteration, Collapsed Time:  0.47759008407592773

  8 -th iteration, Collapsed Time:  0.4664289951324463

  9 -th iteration, Collapsed Time:  0.4686310291290283

  10 -th iteration, Collapsed Time:  0.47237634658813477

  11 -th iteration, Collapsed Time:  0.4663882255554199


In [12]:
# Mean run time rounded to 4 decimals. Index 0 is dropped because the first
# iteration is visibly slower than the rest (likely a one-off JIT/warm-up cost).
comp_time_numba[1:].mean().round(4)

0.4689