In [1]:
from ctypes import *
import numpy as np
from tqdm import tqdm
import time

In [2]:
# Locations of the two shared objects wrapped by this notebook.
_KERNEL_SO = "/home/dyu/Dropbox/tf-notebooks/cuda_comp/mat_mul_example/using_dll/matmul_kernel_f.so"
_CUBLAS_SO = "/home/dyu/Dropbox/tf-notebooks/cuda_comp/mat_mul_example/using_dll/matmul_cublas_f.so"

# Load each library, then look up the entry point it exports.
libc_matmul32 = CDLL(_KERNEL_SO)
matmul32 = getattr(libc_matmul32, "MatMul")

libc_matmul_cublas = CDLL(_CUBLAS_SO)
matmul_cublas = getattr(libc_matmul_cublas, "MatMul_cuBLAS")

In [3]:
# Both entry points share one prototype: three float pointers followed by an
# int pointer, with no return value.
_float_ptr = POINTER(c_float)
_matmul_signature = (_float_ptr, _float_ptr, _float_ptr, POINTER(c_int))

for _native_fn in (matmul32, matmul_cublas):
    _native_fn.restype = None
    _native_fn.argtypes = _matmul_signature

In [4]:
import numpy.random

In [5]:
# Reproducible n x n float32 operands (A is drawn first, then B) and a
# zero-filled float32 result matrix of the same shape.
np.random.seed(2022)
n = 50
A, B = (np.random.randn(n, n).astype(np.float32) for _ in range(2))
C = np.zeros((n, n), dtype=np.float32)


In [6]:
# Flatten A in row-major order and expose the buffer as a float pointer.
A_cr = np.ravel(A, order='C').ctypes.data_as(POINTER(c_float))

In [7]:
C

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [8]:
# Time one call of the custom kernel on the matrices A and B, writing into C.
N = A.shape[0]
N_c = c_int(N)  # passed where POINTER(c_int) is declared; ctypes applies byref()

# Row-major views of the inputs, exposed as float pointers.
A_cr = A.ravel(order='C').ctypes.data_as(POINTER(c_float))
B_cr = B.ravel(order='C').ctypes.data_as(POINTER(c_float))
# Take the output pointer from C itself rather than from C.ravel(): the result
# must land in C's own buffer, which ravel() only guarantees when it returns a view.
C_cr = C.ctypes.data_as(POINTER(c_float))

# perf_counter() is monotonic and high-resolution, unlike wall-clock time.time().
t1 = time.perf_counter()
matmul32(A_cr, B_cr, C_cr, N_c)
t2 = time.perf_counter()

# Elapsed seconds for the call (displayed as the cell output).
t2-t1


0.1393272876739502

In [9]:
C.shape

(50, 50)

In [10]:
C[0]

array([-4.2651687e+00,  4.4717889e+00,  2.8567147e+00,  1.1422411e+00,
       -6.4879460e+00, -4.8427286e+00, -6.4231071e+00,  4.0089183e+00,
       -4.5662951e+00, -3.9613001e+00,  3.1529100e+00, -2.8642142e+00,
        1.5266350e+01,  8.8474398e+00,  1.3822375e+01, -1.1303352e+01,
        1.7988281e+00,  1.3061628e+00, -9.8149627e-01, -1.2086369e+00,
        1.2391439e+01, -3.9272153e+00, -4.4011531e+00,  5.5740185e+00,
       -4.9696288e+00,  5.6897259e+00, -1.0580478e+01, -1.3111972e+01,
       -2.0447896e+00, -3.0077918e+00, -1.1668241e+00, -6.6540699e+00,
        3.6730716e+00, -5.8680353e+00,  6.0824609e+00,  9.8098640e+00,
       -3.2595092e-01, -4.4762979e+00, -3.1373355e-01, -9.2294300e-03,
        1.8256099e+00, -1.7101533e+00,  4.2715926e+00,  1.1261982e+01,
       -6.8811331e+00, -1.1625246e+00, -7.9430013e+00, -7.8587794e+00,
        4.6561626e-01,  1.0123331e+01], dtype=float32)

In [10]:
# Second timed run of the custom kernel, this time into a freshly zeroed C.
N = len(A)
N_c = c_int(N)

# New result buffer with A's shape and dtype.
C = np.zeros(A.shape, dtype=A.dtype)

# Float pointers to the inputs and to the result buffer.
A_cr, B_cr, C_cr = (
    mat.ctypes.data_as(POINTER(c_float)) for mat in (A, B, C)
)

t1 = time.time()
matmul32(A_cr, B_cr, C_cr, N_c)
t2 = time.time()

C = C.reshape((N, N), order="C")
t2 - t1

0.0009253025054931641

In [11]:
C[0]

array([-4.2651687e+00,  4.4717889e+00,  2.8567147e+00,  1.1422411e+00,
       -6.4879460e+00, -4.8427286e+00, -6.4231071e+00,  4.0089183e+00,
       -4.5662951e+00, -3.9613001e+00,  3.1529100e+00, -2.8642142e+00,
        1.5266350e+01,  8.8474398e+00,  1.3822375e+01, -1.1303352e+01,
        1.7988281e+00,  1.3061628e+00, -9.8149627e-01, -1.2086369e+00,
        1.2391439e+01, -3.9272153e+00, -4.4011531e+00,  5.5740185e+00,
       -4.9696288e+00,  5.6897259e+00, -1.0580478e+01, -1.3111972e+01,
       -2.0447896e+00, -3.0077918e+00, -1.1668241e+00, -6.6540699e+00,
        3.6730716e+00, -5.8680353e+00,  6.0824609e+00,  9.8098640e+00,
       -3.2595092e-01, -4.4762979e+00, -3.1373355e-01, -9.2294300e-03,
        1.8256099e+00, -1.7101533e+00,  4.2715926e+00,  1.1261982e+01,
       -6.8811331e+00, -1.1625246e+00, -7.9430013e+00, -7.8587794e+00,
        4.6561626e-01,  1.0123331e+01], dtype=float32)

In [12]:
np.matmul(A,B)

array([[-4.2651687 ,  4.471789  ,  2.8567147 , ..., -7.8587794 ,
         0.46561626, 10.123331  ],
       [ 0.17802599, -7.3373866 , -9.555772  , ..., -2.558515  ,
        -5.6395454 ,  7.8322344 ],
       [-4.883615  ,  6.8135705 , -1.142484  , ...,  1.6628104 ,
         5.6487055 , -4.1760306 ],
       ...,
       [ 3.7956095 ,  2.950998  ,  0.71643484, ..., -8.90769   ,
        -4.7595043 ,  0.31343275],
       [-4.7041545 , -3.3876402 ,  5.749583  , ..., -1.8417509 ,
         3.3775368 , -5.8380384 ],
       [ 0.56850034,  5.466258  ,  0.26090938, ...,  6.9832067 ,
        11.622086  , -2.7714195 ]], dtype=float32)

In [14]:
# Time one call of the cuBLAS wrapper. Inputs are flattened in Fortran
# (column-major) order -- presumably because the wrapper expects column-major
# storage; confirm against the .cu source. N and N_c come from an earlier cell.
Ac = A.ravel(order='F')
A_cc = Ac.ctypes.data_as(POINTER(c_float))
Bc = B.ravel(order='F')
B_cc = Bc.ctypes.data_as(POINTER(c_float))

# Flat, zero-filled output buffer. The pointer is taken from Cc directly: a
# second ravel() is redundant on a 1-D array and would silently lose the
# result if it ever returned a copy instead of a view.
Cc = np.zeros_like(A).ravel(order='F')
C_cc = Cc.ctypes.data_as(POINTER(c_float))

# perf_counter() is monotonic and high-resolution, unlike wall-clock time.time().
t1 = time.perf_counter()
matmul_cublas(A_cc, B_cc, C_cc, N_c)
t2 = time.perf_counter()

# Reinterpret the flat column-major result as an N x N matrix.
Cc = Cc.reshape(N,N, order = "F")
t2-t1


0.01983046531677246

In [70]:
!echo $LD_LIBRARY_PATH

/usr/local/cuda-11.1/lib64/:/home/dyu/TensorRT-6.0.1.5/lib/


In [71]:
#list(zip(A_c,B_c))

In [15]:
Cc[0]

array([-4.2651682e+00,  4.4717894e+00,  2.8567140e+00,  1.1422405e+00,
       -6.4879456e+00, -4.8427286e+00, -6.4231067e+00,  4.0089178e+00,
       -4.5662951e+00, -3.9613004e+00,  3.1529098e+00, -2.8642151e+00,
        1.5266349e+01,  8.8474407e+00,  1.3822375e+01, -1.1303354e+01,
        1.7988284e+00,  1.3061624e+00, -9.8149610e-01, -1.2086377e+00,
        1.2391439e+01, -3.9272158e+00, -4.4011540e+00,  5.5740180e+00,
       -4.9696283e+00,  5.6897240e+00, -1.0580479e+01, -1.3111970e+01,
       -2.0447900e+00, -3.0077927e+00, -1.1668243e+00, -6.6540704e+00,
        3.6730709e+00, -5.8680348e+00,  6.0824614e+00,  9.8098640e+00,
       -3.2595110e-01, -4.4762983e+00, -3.1373346e-01, -9.2297792e-03,
        1.8256099e+00, -1.7101532e+00,  4.2715921e+00,  1.1261980e+01,
       -6.8811326e+00, -1.1625242e+00, -7.9430017e+00, -7.8587799e+00,
        4.6561575e-01,  1.0123330e+01], dtype=float32)

In [14]:
import os
#os.getcwd()
os.listdir()

['MatMul_Example1_float.ipynb',
 'matmul_kernel_f.so',
 'matmul_cuda_float.cu',
 '.ipynb_checkpoints',
 'MatMul_Example1.ipynb',
 'matmul_kernel.so',
 'matmul_cublas.so',
 'matmul_cuda.cu',
 'matmul_cublas_float.cu',
 'old',
 'matmul_cublas.cu',
 'matmul_cublas_f.so']

In [16]:
import numpy as np

# Small 5x5 example. The inputs are cast to float32 because their buffers are
# handed to the native routine as POINTER(c_float); leaving them as float64
# (randn's default) would make the native code reinterpret 8-byte doubles as
# 4-byte floats and produce garbage.
np.random.seed(2022)
A = np.random.randn(5,5).astype(np.float32)
B = np.random.randn(5,5).astype(np.float32)

lib_matmul = CDLL("./matmul_kernel.so")
matmul_ker = lib_matmul.MatMul

matmul_ker.restype = None
matmul_ker.argtypes = (POINTER(c_float), POINTER(c_float),
                       POINTER(c_float), POINTER(c_int))

n = A.shape[0]
nsq = n**2
n_c = c_int(n)

A_cr = A.ctypes.data_as(POINTER(c_float))
B_cr = B.ctypes.data_as(POINTER(c_float))

# zeros_like inherits A's dtype, so the result buffer is float32 as well.
C = np.zeros_like(A)
C_cr = C.ctypes.data_as(POINTER(c_float))

# NOTE(review): matmul_ker is loaded and configured above, but the call goes
# through matmul32, which is defined in an earlier cell -- confirm which
# library is intended here.
matmul32(A_cr, B_cr, C_cr, n_c)

C = C.reshape(n,n, order = "C")


In [17]:
C

array([[ 1.9589207 , -0.18495393, -1.904801  ,  1.2021145 ,  3.5421216 ],
       [ 1.3897907 ,  0.30167198,  1.1877606 , -0.9365433 ,  0.6155618 ],
       [-3.2913256 ,  0.49356195,  5.7312856 ,  2.959039  , -7.895276  ],
       [ 0.9618915 , -1.7524593 , -1.2447041 ,  0.06135205,  3.6112087 ],
       [-3.1212094 , -1.935519  ,  0.15948515,  0.9054616 ,  1.6629438 ]],
      dtype=float32)

In [18]:
C_cr

<__main__.c_float_Array_25 at 0x7fd956f659d8>

In [19]:
C_cr[0:10]

[1.9589207172393799,
 -0.18495392799377441,
 -1.9048010110855103,
 1.202114462852478,
 3.542121648788452,
 1.389790654182434,
 0.30167198181152344,
 1.187760591506958,
 -0.9365432858467102,
 0.6155617833137512]

## Numerical Comparison

In [43]:
x = np.zeros(10)
x

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [11]:
from ctypes import *
import numpy as np
import time

# Self-contained benchmark: load both native matrix-multiply entry points,
# declare their prototypes, then time each of them niter times on the same
# pair of n x n float32 matrices.
libc_matmul32 = CDLL("/home/dyu/Dropbox/tf-notebooks/cuda_comp/mat_mul_example/using_dll/matmul_kernel_f.so")
matmul32 = libc_matmul32.MatMul

libc_matmul_cublas = CDLL("/home/dyu/Dropbox/tf-notebooks/cuda_comp/mat_mul_example/using_dll/matmul_cublas_f.so")
matmul_cublas = libc_matmul_cublas.MatMul_cuBLAS

matmul32.restype = None
matmul32.argtypes = (POINTER(c_float), POINTER(c_float),
                     POINTER(c_float), POINTER(c_int))

matmul_cublas.restype = None
matmul_cublas.argtypes = (POINTER(c_float), POINTER(c_float),
                          POINTER(c_float), POINTER(c_int))

np.random.seed(2022)
n = 2500
A = np.random.randn(n,n).astype(np.float32)
B = np.random.randn(n,n).astype(np.float32)
C = np.zeros_like(A).astype(np.float32)
niter = 11
# Per-iteration elapsed seconds for the custom kernel and for the cuBLAS wrapper.
comp_time_ker = np.zeros(niter)
comp_time_blas = np.zeros(niter)

# Loop-invariant setup, hoisted out of the timing loop: A and B never change,
# so their pointers and their Fortran-order copies (each a full n*n copy) only
# need to be built once. None of this was inside the timed regions before,
# so the measurements themselves are unaffected.

# kernel: pointers straight into the row-major buffers
A_ra = A.ctypes.data_as(POINTER(c_float))
B_ra = B.ctypes.data_as(POINTER(c_float))
C_ra = C.ctypes.data_as(POINTER(c_float))

# cublas: column-major flattened copies of the inputs
A_c = A.ravel(order='F')
A_ca = A_c.ctypes.data_as(POINTER(c_float))
B_c = B.ravel(order='F')
B_ca = B_c.ctypes.data_as(POINTER(c_float))

for i in range(niter):
    # Rebuilt every iteration (cheap) because it is passed by pointer.
    n_int = c_int(n)

    # kernel -- perf_counter() is the monotonic, high-resolution clock
    # intended for measuring short intervals.
    t1 = time.perf_counter()
    matmul32(A_ra, B_ra, C_ra, n_int)
    t2 = time.perf_counter()
    comp_time_ker[i] = t2-t1

    # cublas -- the output buffer is a fresh Fortran-order copy of C, so the
    # cuBLAS result ends up in C_c and C keeps the kernel's result.
    C_c = C.ravel(order='F')
    C_ca = C_c.ctypes.data_as(POINTER(c_float))

    t1 = time.perf_counter()
    matmul_cublas(A_ca, B_ca, C_ca, n_int)
    t2 = time.perf_counter()
    comp_time_blas[i] = t2-t1

    print('\n ',i+1,'-th iteration, Collapsed Time: ', comp_time_ker[i], ',  ', comp_time_blas[i],
          ', Total: ',comp_time_ker[i]+comp_time_blas[i])
    




  1 -th iteration, Collapsed Time:  0.1247706413269043 ,   0.2007429599761963 , Total:  0.3255136013031006

  2 -th iteration, Collapsed Time:  0.10743093490600586 ,   0.016197681427001953 , Total:  0.12362861633300781

  3 -th iteration, Collapsed Time:  0.08952689170837402 ,   0.0233461856842041 , Total:  0.11287307739257812

  4 -th iteration, Collapsed Time:  0.08809328079223633 ,   0.021433591842651367 , Total:  0.1095268726348877

  5 -th iteration, Collapsed Time:  0.08746576309204102 ,   0.016199111938476562 , Total:  0.10366487503051758

  6 -th iteration, Collapsed Time:  0.08689379692077637 ,   0.01614689826965332 , Total:  0.10304069519042969

  7 -th iteration, Collapsed Time:  0.08868932723999023 ,   0.017761945724487305 , Total:  0.10645127296447754

  8 -th iteration, Collapsed Time:  0.08796572685241699 ,   0.033212900161743164 , Total:  0.12117862701416016

  9 -th iteration, Collapsed Time:  0.08748602867126465 ,   0.023749828338623047 , Total:  0.1112358570098877



In [12]:
C.shape

(2500, 2500)

In [13]:
C[:5,:5]

array([[-12.897821 ,  25.157856 ,   6.22882  , -94.01637  , -42.02696  ],
       [ 38.15462  ,  28.627695 ,  32.507763 ,  20.756927 , -14.107093 ],
       [-76.4854   ,  14.265559 , -15.207016 , -45.09947  , -11.1243725],
       [ -3.3712728,  42.972343 ,  -4.7928786, -49.861485 , -66.20471  ],
       [-24.290062 ,  60.846813 ,  53.658283 ,  75.03848  , -64.98372  ]],
      dtype=float32)

In [3]:
np.round(np.mean(comp_time_ker[1:]),4), np.round(np.mean(comp_time_blas[1:]),4)

(0.0908, 0.0177)

In [5]:
0.0938, 0.0167

0.028599999999999987

In [4]:
0.1214/0.0161

7.540372670807453

In [4]:
C[:5,0]

array([-12.897821 ,  38.15462  , -76.4854   ,  -3.3712728, -24.290062 ],
      dtype=float32)

In [5]:
C_c[:5]

array([-12.897821 ,  38.15462  , -76.4854   ,  -3.3712728, -24.290062 ],
      dtype=float32)