In [1]:
from ctypes import *
import numpy as np
import time

libc_matmul32 = CDLL("/home/dyu/Dropbox/tf-notebooks/cuda_comp/mat_mul_example/using_dll/matmul_kernel_d.so")
matmul32 = libc_matmul32.MatMul

libc_matmul_cublas = CDLL("/home/dyu/Dropbox/tf-notebooks/cuda_comp/mat_mul_example/using_dll/matmul_cublas_d.so")
matmul_cublas = libc_matmul_cublas.MatMul_cuBLAS

matmul32.restype = None
matmul32.argtypes = (POINTER(c_double), POINTER(c_double),
                     POINTER(c_double), POINTER(c_int))

matmul_cublas.restype = None
matmul_cublas.argtypes = (POINTER(c_double), POINTER(c_double),
                          POINTER(c_double), POINTER(c_int))

np.random.seed(2022)
n = 3000
A = np.random.randn(n,n).astype(np.float64)
B = np.random.randn(n,n).astype(np.float64)
niter = 11
comp_time_mm = np.zeros((niter,6))

for i in np.arange(niter):
    n_int = c_int(n)
    
    # kernel
    A_ra = A.ctypes.data_as(POINTER(c_double))
    B_ra = B.ctypes.data_as(POINTER(c_double))

    C = np.zeros_like(A).astype(np.float64)
    C_ra = C.ctypes.data_as(POINTER(c_double))

    t1 = time.time()
    matmul32(A_ra, B_ra, C_ra, n_int)
    t2 = time.time()
    comp_time_mm[i,0] = t2-t1
    
    
    # cublas
    A_c = A.ravel(order='F')
    A_ca = A_c.ctypes.data_as(POINTER(c_double))
    B_c = B.ravel(order='F')
    B_ca = B_c.ctypes.data_as(POINTER(c_double))

    C_c = np.zeros_like(A).astype(np.float64).ravel(order='F')
    C_ca = C_c.ctypes.data_as(POINTER(c_double))

    t1 = time.time()
    matmul_cublas(A_ca, B_ca, C_ca, n_int)
    t2 = time.time()
    comp_time_mm[i,1] = t2-t1
    

    print('\n ',i+1,'-th iteration, Collapsed Time: ', comp_time_mm[i,0], ',  ', comp_time_mm[i,1],
          ', Total: ',comp_time_mm[i,0]+comp_time_mm[i,1])
    




  1 -th iteration, Collapsed Time:  0.3694939613342285 ,   0.4431276321411133 , Total:  0.8126215934753418

  2 -th iteration, Collapsed Time:  0.2597782611846924 ,   0.21077823638916016 , Total:  0.47055649757385254

  3 -th iteration, Collapsed Time:  0.26433491706848145 ,   0.1961500644683838 , Total:  0.46048498153686523

  4 -th iteration, Collapsed Time:  0.2570345401763916 ,   0.2015841007232666 , Total:  0.4586186408996582

  5 -th iteration, Collapsed Time:  0.25673675537109375 ,   0.20681238174438477 , Total:  0.4635491371154785

  6 -th iteration, Collapsed Time:  0.25884246826171875 ,   0.19358277320861816 , Total:  0.4524252414703369

  7 -th iteration, Collapsed Time:  0.2537858486175537 ,   0.19461369514465332 , Total:  0.44839954376220703

  8 -th iteration, Collapsed Time:  0.2563362121582031 ,   0.20016789436340332 , Total:  0.45650410652160645

  9 -th iteration, Collapsed Time:  0.2576582431793213 ,   0.211700439453125 , Total:  0.4693586826324463

  10 -th iterati

In [2]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import gpuarray, tools, cumath
import numpy as np
import time


with open('/home/dyu/Dropbox/tf-notebooks/cuda_comp/mat_mul_example/pycuda/matmul_pycuda_float.cu') as f:
    matmul_kernel = f.read()

mod = SourceModule(matmul_kernel)
matmul_pycuda = mod.get_function("d_MatMul")

np.random.seed(2022)
A = np.random.randn(n,n).astype(np.float64)
B = np.random.randn(n,n).astype(np.float64)
C = np.zeros_like(A).astype(np.float64)

for i in np.arange(niter):

    t1 = time.time()
    dA = gpuarray.to_gpu(A)
    dB = gpuarray.to_gpu(B)
    dC = gpuarray.to_gpu(C)
    TPB = (32, 32, 1)
    block = (np.ceil(n / 32)).astype(np.int32).item()
    BPG = (block, block, 1)
    matmul_pycuda(dA,dB,dC, np.int32(n), block = TPB, grid = BPG)
    #cuda.Context.synchronize()
    t2 = time.time()
    comp_time_mm[i,2] = t2-t1
    
    print('\n ',i+1,'-th iteration, Collapsed Time: ', comp_time_mm[i,2])
    



  1 -th iteration, Collapsed Time:  0.04091143608093262

  2 -th iteration, Collapsed Time:  0.20671772956848145

  3 -th iteration, Collapsed Time:  0.20404529571533203

  4 -th iteration, Collapsed Time:  0.21441149711608887

  5 -th iteration, Collapsed Time:  0.21289753913879395

  6 -th iteration, Collapsed Time:  0.21155691146850586

  7 -th iteration, Collapsed Time:  0.20369672775268555

  8 -th iteration, Collapsed Time:  0.20773744583129883

  9 -th iteration, Collapsed Time:  0.203352689743042

  10 -th iteration, Collapsed Time:  0.2170557975769043

  11 -th iteration, Collapsed Time:  0.20048856735229492


In [3]:
import numpy as np
import math
import time
import numba
from numba import cuda

@cuda.jit
def matmul_numba(A,B,C):
    i,j = cuda.grid(2)    
    if i < C.shape[0] and j <C.shape[1]:
        tmp = 0.
        for k in range(A.shape[1]):
            tmp += A[i,k] * B[k,j]
        C[i,j] = tmp
        
np.random.seed(2022)
A = np.random.randn(n,n).astype(np.float64)
B = np.random.randn(n,n).astype(np.float64)

numba.cuda.select_device(0)

for i in np.arange(niter):

    C = np.zeros_like(A).astype(np.float64)
    t1 = time.time()
    TPB = (32,32,1)
    block = math.ceil(n/32)
    BPG = (block,block,1)
    matmul_numba[BPG,TPB](A,B,C)
    t2 = time.time()
    comp_time_mm[i,3] = t2-t1
        
    print('\n ',i+1,'-th iteration, Collapsed Time: ', comp_time_mm[i,3])
    



  1 -th iteration, Collapsed Time:  1.2187793254852295

  2 -th iteration, Collapsed Time:  0.8572866916656494

  3 -th iteration, Collapsed Time:  0.8549926280975342

  4 -th iteration, Collapsed Time:  0.8776733875274658

  5 -th iteration, Collapsed Time:  0.860926628112793

  6 -th iteration, Collapsed Time:  0.8708992004394531

  7 -th iteration, Collapsed Time:  0.8477683067321777

  8 -th iteration, Collapsed Time:  0.8755526542663574

  9 -th iteration, Collapsed Time:  0.86098313331604

  10 -th iteration, Collapsed Time:  0.8609089851379395

  11 -th iteration, Collapsed Time:  0.883453369140625


In [4]:
import numpy as np
import torch
import time

np.random.seed(2022)
A = np.random.randn(n,n).astype(np.float64)
B = np.random.randn(n,n).astype(np.float64)

for i in np.arange(niter):

    t1 = time.time()
    dA = torch.tensor(A).to('cuda:0')
    dB = torch.tensor(B).to('cuda:0')
    dC = torch.matmul(dA,dB)
    C = dC.to('cpu')
    t2 = time.time()
    comp_time_mm[i,5] = t2-t1
        
    print('\n ',i+1,'-th iteration, Collapsed Time: ', comp_time_mm[i,5])
    


  1 -th iteration, Collapsed Time:  2.1495089530944824

  2 -th iteration, Collapsed Time:  0.2644205093383789

  3 -th iteration, Collapsed Time:  0.27065610885620117

  4 -th iteration, Collapsed Time:  0.2647826671600342

  5 -th iteration, Collapsed Time:  0.24127769470214844

  6 -th iteration, Collapsed Time:  0.2515718936920166

  7 -th iteration, Collapsed Time:  0.27802276611328125

  8 -th iteration, Collapsed Time:  0.2543461322784424

  9 -th iteration, Collapsed Time:  0.2567276954650879

  10 -th iteration, Collapsed Time:  0.24373078346252441

  11 -th iteration, Collapsed Time:  0.24636602401733398


In [5]:
import gc
gc.collect()
torch.cuda.empty_cache()
torch.cuda.init()

In [6]:
import numpy as np
import tensorflow as tf
import time

np.random.seed(2022)
A = np.random.randn(n,n).astype(np.float64)
B = np.random.randn(n,n).astype(np.float64)

for i in np.arange(niter):

    t1 = time.time()
    with tf.device('/GPU:0'):
        dA = tf.constant(A, dtype=tf.float64)
        dB = tf.constant(B, dtype=tf.float64)
        dC = tf.matmul(dA, dB)
    C = dC.numpy()
    t2 = time.time()
    comp_time_mm[i,4] = t2-t1
        
    print('\n ',i+1,'-th iteration, Collapsed Time: ', comp_time_mm[i,4])
    



  1 -th iteration, Collapsed Time:  0.8665144443511963

  2 -th iteration, Collapsed Time:  0.3374052047729492

  3 -th iteration, Collapsed Time:  0.331636905670166

  4 -th iteration, Collapsed Time:  0.3383510112762451

  5 -th iteration, Collapsed Time:  0.33542871475219727

  6 -th iteration, Collapsed Time:  0.35890746116638184

  7 -th iteration, Collapsed Time:  0.37705016136169434

  8 -th iteration, Collapsed Time:  0.34337377548217773

  9 -th iteration, Collapsed Time:  0.3369724750518799

  10 -th iteration, Collapsed Time:  0.334916353225708

  11 -th iteration, Collapsed Time:  0.3363957405090332


In [7]:
with open('./mm_comp_time_n{}_double.txt'.format(n), 'w') as f:
    print(('%s\t '*6 % ('DLL-K','DLL-cuBLAS','PyCUDA','Numba','TF','PyTorch')), end='\n', file=f)
    for i in range(niter):
        print(('%.4f\t '*6 % tuple(comp_time_mm[i])), end='\n', file=f)
