In [1]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import gpuarray, tools, cumath
import numpy as np
import time
from tqdm import tqdm


In [2]:
import os
# Display the PATH environment variable as seen by this kernel process.
# NOTE(review): presumably a check that the CUDA toolkit's bin directory is on
# PATH before compiling kernels — confirm intent.
os.environ["PATH"]

'/home/dyu/anaconda3/bin:/usr/texbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/usr/local/cuda-11.1/bin'

In [3]:
# CUDA source for a naive n x n matrix product on double-precision buffers:
# every thread whose (i, j) lies inside the n x n bounds accumulates
# sum_k A[i*n+k] * B[k*n+j] and stores the result in C[i*n+j].
# NOTE(review): this string is not referenced by any later cell shown here; the
# kernel that actually gets compiled is read from ./matmul_pycuda_float.cu.
matmul_kernel_code = """
__global__ void d_MatMul(double *A, double *B, double *C, int n)
{
	
	int i = blockIdx.y * blockDim.y + threadIdx.y;
	int j = blockIdx.x * blockDim.x + threadIdx.x;

	if ((i<n) && (j<n)) {
        double Cvalue = 0.0;
		for (int k = 0; k < n; ++k) {
			Cvalue += A[i*n+k]*B[k*n+j];
		}
		C[i*n+j]=Cvalue;
	}
}
"""

In [4]:
# Read the CUDA kernel source text from the .cu file next to the notebook.
with open('./matmul_pycuda_float.cu') as kernel_file:
    matmul_kernel = kernel_file.read()

In [5]:
# Compile the kernel source read from the .cu file into a module.
mod = SourceModule(matmul_kernel)
# Look up the kernel named "d_MatMul" as a callable for later launches.
matmul_pycuda = mod.get_function("d_MatMul")

In [6]:
# Small problem (n x n with n = 50) used to check the GPU result against NumPy.
n = 50
# Fixed seed so the random inputs are reproducible; A is drawn before B.
np.random.seed(2022)
A, B = np.random.randn(n, n), np.random.randn(n, n)
# Zero-filled output buffer with the same shape and dtype as A.
C = np.zeros((n, n), dtype=A.dtype)

In [7]:
# Cast each host matrix to float32 and copy it to the GPU, in A, B, C order.
d_A, d_B, d_C = (
    gpuarray.to_gpu(host_matrix.astype(np.float32))
    for host_matrix in (A, B, C)
)

In [14]:
# Launch geometry: 32 x 32 threads per block, and enough blocks along each axis
# to cover all n rows/columns.
TPB = (32, 32, 1)
block = -(-n // 32)  # integer ceiling of n / 32
BPG = (block,) * 2 + (1,)

In [15]:
# Launch the kernel on the three device arrays, passing n as a 32-bit integer,
# with TPB threads per block on a BPG grid of blocks.
matmul_pycuda(d_A,d_B,d_C, np.int32(n), block = TPB, grid = BPG)
# NOTE(review): the explicit synchronize is left disabled; presumably the later
# read of d_C waits for the kernel to finish — confirm.
#cuda.Context.synchronize()

In [16]:
# Spot-check rows 1..4 of column 1 of the GPU result; the same slice of the
# NumPy product is displayed further down for comparison.
d_C[1:5,1]

array([-7.3373866,  6.8135705,  1.0331956,  1.6117499], dtype=float32)

In [17]:
# Drop the notebook's references to the device arrays.
# NOTE(review): presumably done so the GPU memory can be released — confirm.
del d_A, d_B, d_C

In [18]:
# CPU reference: the same slice (rows 1..4, column 1) of the float64 product.
(A @ B)[1:5, 1]

array([-7.33738934,  6.81357067,  1.03319585,  1.61174989])

In [1]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import gpuarray, tools, cumath
import numpy as np
import time


# Read the CUDA kernel source text from the .cu file next to the notebook.
with open('./matmul_pycuda_float.cu') as kernel_file:
    matmul_kernel = kernel_file.read()

# Compile the source and fetch the "d_MatMul" kernel as a callable.
mod = SourceModule(matmul_kernel)
matmul_pycuda = mod.get_function("d_MatMul")

# Benchmark inputs: reproducible float32 host matrices of size n x n.
np.random.seed(2022)
n = 2500
A = np.random.randn(n,n).astype(np.float32)
B = np.random.randn(n,n).astype(np.float32)
# zeros_like already inherits float32 from A, so no further cast (and no extra
# n x n copy) is needed.
C = np.zeros_like(A)
# Number of timed repetitions, and one slot per repetition for its wall time.
niter = 11
comp_time_pycuda = np.zeros(niter)

# Benchmark: record the wall time of `niter` GPU matrix products.
# Each sample covers the host-to-device copies of A, B and C, the kernel
# launch, and the wait for the device to finish; nothing is copied back.

# The launch geometry is the same for every iteration, so build it once and
# keep it out of the timed region.
TPB = (32, 32, 1)
block = -(-n // TPB[0])  # integer ceiling of n / 32: blocks needed per axis
BPG = (block, block, 1)

for i in range(niter):

    # perf_counter is the clock meant for measuring short intervals;
    # time.time() follows the system clock and can jump if it is adjusted.
    t1 = time.perf_counter()
    dA = gpuarray.to_gpu(A)
    dB = gpuarray.to_gpu(B)
    dC = gpuarray.to_gpu(C)
    matmul_pycuda(dA, dB, dC, np.int32(n), block=TPB, grid=BPG)
    # Wait for the device before stopping the clock so the sample includes the
    # kernel's execution, not only its launch.
    cuda.Context.synchronize()
    t2 = time.perf_counter()
    comp_time_pycuda[i] = t2 - t1

    print('\n ', i+1, '-th iteration, Elapsed Time: ', comp_time_pycuda[i])
    



  1 -th iteration, Collapsed Time:  0.0871882438659668

  2 -th iteration, Collapsed Time:  0.0982816219329834

  3 -th iteration, Collapsed Time:  0.0891573429107666

  4 -th iteration, Collapsed Time:  0.09219098091125488

  5 -th iteration, Collapsed Time:  0.08936619758605957

  6 -th iteration, Collapsed Time:  0.09166669845581055

  7 -th iteration, Collapsed Time:  0.08961367607116699

  8 -th iteration, Collapsed Time:  0.09017777442932129

  9 -th iteration, Collapsed Time:  0.11772894859313965

  10 -th iteration, Collapsed Time:  0.09645867347717285

  11 -th iteration, Collapsed Time:  0.09285187721252441


In [2]:
# Mean wall time per iteration, rounded to four decimals.
# NOTE(review): the saved output below (0.0834) does not match the eleven
# timings printed above, whose mean is about 0.094 — it looks like it came from
# a different run; restart the kernel and run all cells before trusting it.
comp_time_pycuda.mean().round(4)

0.0834