In [1]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import gpuarray, tools, cumath
import math
import numpy as np
import time
from tqdm import tqdm

In [2]:
import os
os.environ["PATH"]


'/home/dyu/anaconda3/bin:/usr/texbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/usr/local/cuda-11.1/bin'

In [3]:
matmul_kernel_code = """
__global__ void d_MatMul(double *A, double *B, double *C, int n)
{
	
	int i = blockIdx.y * blockDim.y + threadIdx.y;
	int j = blockIdx.x * blockDim.x + threadIdx.x;

	if ((i<n) && (j<n)) {
        double Cvalue = 0.0;
		for (int k = 0; k < n; ++k) {
			Cvalue += A[i*n+k]*B[k*n+j];
		}
		C[i*n+j]=Cvalue;
	}
}
"""

In [4]:
with open('./matmul_pycuda.cu') as f:
    matmul_kernel = f.read()
    #print(matmul_kernel)

In [5]:
mod = SourceModule(matmul_kernel)
matmul_pycuda = mod.get_function("d_MatMul")

In [6]:
np.random.seed(2022)
n = 50
A = np.random.randn(n,n)
B = np.random.randn(n,n)
C = np.zeros_like(A)

In [7]:
d_A = gpuarray.to_gpu(A.astype(np.float64))
d_B = gpuarray.to_gpu(B.astype(np.float64))
d_C = gpuarray.to_gpu(C.astype(np.float64))

In [8]:
TPB = (32, 32, 1)
block = math.ceil(n//32)#(np.ceil(n / 32)).astype(np.int32).item()
BPG = (block, block, 1)

In [9]:
matmul_pycuda(d_A,d_B,d_C, np.int32(n), block = TPB, grid = BPG)

In [16]:
d_C[1:5,1]

array([-7.33738934,  6.81357067,  1.03319585,  1.61174989])

In [17]:
np.dot(A,B)[1:5,1]

array([-7.33738934,  6.81357067,  1.03319585,  1.61174989])

In [1]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import gpuarray
import time
import numpy as np

with open('./matmul_pycuda.cu') as f:
    matmul_kernel = f.read()
    
mod = SourceModule(matmul_kernel)
matmul_pycuda = mod.get_function("d_MatMul")

np.random.seed(2022)
n = 2500
A = np.random.randn(n,n)
B = np.random.randn(n,n)
C = np.zeros_like(A)
niter = 10
comp_time_pycuda = np.zeros(niter)

for i in np.arange(niter):

    t1 = time.time()
    dA = gpuarray.to_gpu(A.astype(np.float64))
    dB = gpuarray.to_gpu(B.astype(np.float64))
    dC = gpuarray.to_gpu(C.astype(np.float64))
    TPB = (32, 32, 1)
    block = (np.ceil(n / 32)).astype(np.int32).item()
    BPG = (block, block, 1)
    matmul_pycuda(dA,dB,dC, np.int32(n), block = TPB, grid = BPG)
    t2 = time.time()
    comp_time_pycuda[i] = t2-t1
        
    print('\n ',i+1,'-th iteration, Collapsed Time: ', comp_time_pycuda[i])
    



  1 -th iteration, Collapsed Time:  0.05661153793334961

  2 -th iteration, Collapsed Time:  0.19960665702819824

  3 -th iteration, Collapsed Time:  0.183197021484375

  4 -th iteration, Collapsed Time:  0.16542315483093262

  5 -th iteration, Collapsed Time:  0.1697826385498047

  6 -th iteration, Collapsed Time:  0.1664140224456787

  7 -th iteration, Collapsed Time:  0.16842961311340332

  8 -th iteration, Collapsed Time:  0.16636300086975098

  9 -th iteration, Collapsed Time:  0.16644835472106934

  10 -th iteration, Collapsed Time:  0.16836333274841309


In [2]:
np.round(np.mean(comp_time_pycuda),4)

0.1611

In [19]:
t1 = time.time()
D = A.dot(B)
t2 = time.time()
t2-t1

0.06563472747802734

In [22]:
A.shape

(2500, 2500)