In [1]:
from ctypes import *
import numpy as np
import time

libc_matmul32 = CDLL("/home/dyu/Dropbox/tf-notebooks/cuda_comp/mat_mul_example/using_dll/matmul_kernel_f.so")
matmul32 = libc_matmul32.MatMul

libc_matmul_cublas = CDLL("/home/dyu/Dropbox/tf-notebooks/cuda_comp/mat_mul_example/using_dll/matmul_cublas_f.so")
matmul_cublas = libc_matmul_cublas.MatMul_cuBLAS

matmul32.restype = None
matmul32.argtypes = (POINTER(c_float), POINTER(c_float),
                     POINTER(c_float), POINTER(c_int))

matmul_cublas.restype = None
matmul_cublas.argtypes = (POINTER(c_float), POINTER(c_float),
                          POINTER(c_float), POINTER(c_int))

np.random.seed(2022)
n = 3000
A = np.random.randn(n,n).astype(np.float32)
B = np.random.randn(n,n).astype(np.float32)
niter = 11
comp_time_mm = np.zeros((niter,6))

for i in np.arange(niter):
    n_int = c_int(n)
    
    # kernel
    A_ra = A.ctypes.data_as(POINTER(c_float))
    B_ra = B.ctypes.data_as(POINTER(c_float))

    C = np.zeros_like(A)
    C_ra = C.ctypes.data_as(POINTER(c_float))

    t1 = time.time()
    matmul32(A_ra, B_ra, C_ra, n_int)
    t2 = time.time()
    comp_time_mm[i,0] = t2-t1
    
    
    # cublas
    A_c = A.ravel(order='F')
    A_ca = A_c.ctypes.data_as(POINTER(c_float))
    B_c = B.ravel(order='F')
    B_ca = B_c.ctypes.data_as(POINTER(c_float))

    C_c = np.zeros_like(A).ravel(order='F')
    C_ca = C_c.ctypes.data_as(POINTER(c_float))

    t1 = time.time()
    matmul_cublas(A_ca, B_ca, C_ca, n_int)
    t2 = time.time()
    comp_time_mm[i,1] = t2-t1
    

    print('\n ',i+1,'-th iteration, Collapsed Time: ', comp_time_mm[i,0], ',  ', comp_time_mm[i,1],
          ', Total: ',comp_time_mm[i,0]+comp_time_mm[i,1])
    




  1 -th iteration, Collapsed Time:  0.2867465019226074 ,   0.2685883045196533 , Total:  0.5553348064422607

  2 -th iteration, Collapsed Time:  0.18027639389038086 ,   0.027552366256713867 , Total:  0.20782876014709473

  3 -th iteration, Collapsed Time:  0.1811988353729248 ,   0.02614307403564453 , Total:  0.20734190940856934

  4 -th iteration, Collapsed Time:  0.17890310287475586 ,   0.02581000328063965 , Total:  0.2047131061553955

  5 -th iteration, Collapsed Time:  0.17859816551208496 ,   0.027016401290893555 , Total:  0.20561456680297852

  6 -th iteration, Collapsed Time:  0.17906951904296875 ,   0.024827003479003906 , Total:  0.20389652252197266

  7 -th iteration, Collapsed Time:  0.17645525932312012 ,   0.025805950164794922 , Total:  0.20226120948791504

  8 -th iteration, Collapsed Time:  0.17718052864074707 ,   0.024502992630004883 , Total:  0.20168352127075195

  9 -th iteration, Collapsed Time:  0.17716550827026367 ,   0.02548050880432129 , Total:  0.20264601707458496



In [2]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import gpuarray, tools, cumath
import numpy as np
import time


with open('/home/dyu/Dropbox/tf-notebooks/cuda_comp/mat_mul_example/pycuda/matmul_pycuda_float.cu') as f:
    matmul_kernel = f.read()

mod = SourceModule(matmul_kernel)
matmul_pycuda = mod.get_function("d_MatMul")

np.random.seed(2022)
A = np.random.randn(n,n).astype(np.float32)
B = np.random.randn(n,n).astype(np.float32)
C = np.zeros_like(A).astype(np.float32)

for i in np.arange(niter):

    t1 = time.time()
    dA = gpuarray.to_gpu(A)
    dB = gpuarray.to_gpu(B)
    dC = gpuarray.to_gpu(C)
    TPB = (32, 32, 1)
    block = (np.ceil(n / 32)).astype(np.int32).item()
    BPG = (block, block, 1)
    matmul_pycuda(dA,dB,dC, np.int32(n), block = TPB, grid = BPG)
    #cuda.Context.synchronize()
    t2 = time.time()
    comp_time_mm[i,2] = t2-t1
    
    print('\n ',i+1,'-th iteration, Collapsed Time: ', comp_time_mm[i,2])
    



  1 -th iteration, Collapsed Time:  0.023924589157104492

  2 -th iteration, Collapsed Time:  0.1829662322998047

  3 -th iteration, Collapsed Time:  0.18361496925354004

  4 -th iteration, Collapsed Time:  0.1824631690979004

  5 -th iteration, Collapsed Time:  0.18271636962890625

  6 -th iteration, Collapsed Time:  0.18291878700256348

  7 -th iteration, Collapsed Time:  0.18310809135437012

  8 -th iteration, Collapsed Time:  0.18327975273132324

  9 -th iteration, Collapsed Time:  0.18394112586975098

  10 -th iteration, Collapsed Time:  0.1812138557434082

  11 -th iteration, Collapsed Time:  0.18193268775939941


In [3]:
import numpy as np
import math
import time
import numba
from numba import cuda

@cuda.jit
def matmul_numba(A,B,C):
    i,j = cuda.grid(2)    
    if i < C.shape[0] and j <C.shape[1]:
        tmp = 0.
        for k in range(A.shape[1]):
            tmp += A[i,k] * B[k,j]
        C[i,j] = tmp
        
np.random.seed(2022)
A = np.random.randn(n,n).astype(np.float32)
B = np.random.randn(n,n).astype(np.float32)

numba.cuda.select_device(0)

for i in np.arange(niter):

    C = np.zeros_like(A).astype(np.float32)
    t1 = time.time()
    TPB = (32,32,1)
    block = math.ceil(n/32)
    BPG = (block,block,1)
    matmul_numba[BPG,TPB](A,B,C)
    t2 = time.time()
    comp_time_mm[i,3] = t2-t1
        
    print('\n ',i+1,'-th iteration, Collapsed Time: ', comp_time_mm[i,3])
    



  1 -th iteration, Collapsed Time:  1.235156536102295

  2 -th iteration, Collapsed Time:  0.8223178386688232

  3 -th iteration, Collapsed Time:  0.8220539093017578

  4 -th iteration, Collapsed Time:  0.8373508453369141

  5 -th iteration, Collapsed Time:  0.8297779560089111

  6 -th iteration, Collapsed Time:  0.8303267955780029

  7 -th iteration, Collapsed Time:  0.8256044387817383

  8 -th iteration, Collapsed Time:  0.8388140201568604

  9 -th iteration, Collapsed Time:  0.8408968448638916

  10 -th iteration, Collapsed Time:  0.8294031620025635

  11 -th iteration, Collapsed Time:  0.8375644683837891


In [4]:
import numpy as np
import torch
import time
import gc

gc.collect()
torch.cuda.empty_cache()
torch.cuda.init()
torch.backends.cuda.matmul.allow_tf32 = False


np.random.seed(2022)
A = np.random.randn(n,n).astype(np.float32)
B = np.random.randn(n,n).astype(np.float32)

for i in np.arange(niter):

    t1 = time.time()
    dA = torch.tensor(A).to('cuda:0')
    dB = torch.tensor(B).to('cuda:0')
    dC = torch.matmul(dA,dB)
    C = dC.to('cpu')
    t2 = time.time()
    comp_time_mm[i,5] = t2-t1
        
    print('\n ',i+1,'-th iteration, Collapsed Time: ', comp_time_mm[i,5])
    


  1 -th iteration, Collapsed Time:  2.0762932300567627

  2 -th iteration, Collapsed Time:  0.06759285926818848

  3 -th iteration, Collapsed Time:  0.06951498985290527

  4 -th iteration, Collapsed Time:  0.06462764739990234

  5 -th iteration, Collapsed Time:  0.06738638877868652

  6 -th iteration, Collapsed Time:  0.06284451484680176

  7 -th iteration, Collapsed Time:  0.06397533416748047

  8 -th iteration, Collapsed Time:  0.06407380104064941

  9 -th iteration, Collapsed Time:  0.06342196464538574

  10 -th iteration, Collapsed Time:  0.0617983341217041

  11 -th iteration, Collapsed Time:  0.06333684921264648


In [5]:
import gc
gc.collect()
torch.cuda.empty_cache()
torch.cuda.init()

In [6]:
import numpy as np
import tensorflow as tf
import time

np.random.seed(2022)
A = np.random.randn(n,n).astype(np.float32)
B = np.random.randn(n,n).astype(np.float32)

for i in np.arange(niter):

    t1 = time.time()
    with tf.device('/GPU:0'):
        dA = tf.constant(A, dtype=tf.float32)
        dB = tf.constant(B, dtype=tf.float32)
        dC = tf.matmul(dA, dB)
    C = dC.numpy()
    t2 = time.time()
    comp_time_mm[i,4] = t2-t1
        
    print('\n ',i+1,'-th iteration, Collapsed Time: ', comp_time_mm[i,4])
    



  1 -th iteration, Collapsed Time:  0.6861505508422852

  2 -th iteration, Collapsed Time:  0.12489724159240723

  3 -th iteration, Collapsed Time:  0.10457253456115723

  4 -th iteration, Collapsed Time:  0.10381937026977539

  5 -th iteration, Collapsed Time:  0.1042482852935791

  6 -th iteration, Collapsed Time:  0.10446500778198242

  7 -th iteration, Collapsed Time:  0.1043386459350586

  8 -th iteration, Collapsed Time:  0.10371112823486328

  9 -th iteration, Collapsed Time:  0.10315060615539551

  10 -th iteration, Collapsed Time:  0.10845470428466797

  11 -th iteration, Collapsed Time:  0.10213136672973633


In [7]:
with open('./mm_comp_time_n{}_float.txt'.format(n), 'w') as f:
    print(('%s\t '*6 % ('DLL-K','DLL-cuBLAS','PyCUDA','Numba','TF','PyTorch')), end='\n', file=f)
    for i in range(niter):
        print(('%.4f\t '*6 % tuple(comp_time_mm[i])), end='\n', file=f)
