In [1]:
!pip install pycuda

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycuda
  Downloading pycuda-2022.2.2.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting appdirs>=1.4.0
  Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Collecting pytools>=2011.2
  Downloading pytools-2022.1.14.tar.gz (74 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.6/74.6 KB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 KB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Building wheels fo

In [1]:
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np



In [2]:

# Compile the CUDA C source below. It defines a device helper (rowcol_dot)
# and the kernel matrix_mult_ker, which multiplies two N x N float matrices
# stored as flat arrays indexed as [row*N + col].
ker = SourceModule('''
// Row-column dot product for matrix multiplication:
// returns sum over k of matrix_a[row][k] * matrix_b[k][col].
__device__ float rowcol_dot(const float *matrix_a, const float *matrix_b, int row, int col, int N)
{
    float val = 0.0f;

    for (int k = 0; k < N; k++)
    {
        val += matrix_a[row*N + k] * matrix_b[col + k*N];
    }

    return val;
}

// Matrix multiplication kernel parallelized over (row, col) pairs:
// each thread computes one entry of output_matrix.
__global__ void matrix_mult_ker(const float *matrix_a, const float *matrix_b, float *output_matrix, int N)
{
    int row = blockIdx.x*blockDim.x + threadIdx.x;
    int col = blockIdx.y*blockDim.y + threadIdx.y;

    // Threads whose (row, col) falls outside the N x N matrix must do nothing;
    // without this guard an oversized launch reads and writes out of bounds.
    if (row < N && col < N)
    {
        output_matrix[col + row*N] = rowcol_dot(matrix_a, matrix_b, row, col, N);
    }
}
''')



In [3]:
# Look up the compiled kernel by its CUDA function name so it can be called from Python.
matrix_ker = ker.get_function("matrix_mult_ker")


In [4]:
# 4x4 float32 test inputs: every row of test_a is [1, 2, 3, 4] and
# every row of test_b is [14, 13, 12, 11].
test_a = np.tile(np.arange(1, 5, dtype=np.float32), (4, 1))
test_b = np.tile(np.arange(14, 10, -1, dtype=np.float32), (4, 1))


In [5]:
# Show both input matrices, one after the other.
print(test_a, test_b, sep="\n")

[[1. 2. 3. 4.]
 [1. 2. 3. 4.]
 [1. 2. 3. 4.]
 [1. 2. 3. 4.]]
[[14. 13. 12. 11.]
 [14. 13. 12. 11.]
 [14. 13. 12. 11.]
 [14. 13. 12. 11.]]


In [6]:
# Reference result computed with NumPy, used later to check the GPU output.
output_mat = test_a @ test_b
print(output_mat)

[[140. 130. 120. 110.]
 [140. 130. 120. 110.]
 [140. 130. 120. 110.]
 [140. 130. 120. 110.]]


In [7]:
# Copy both input matrices to the GPU.
test_a_gpu, test_b_gpu = (gpuarray.to_gpu(host_mat) for host_mat in (test_a, test_b))
# Output buffer with the same shape and dtype as the inputs; its contents
# are filled in by the kernel.
output_mat_gpu = gpuarray.empty_like(test_a_gpu)


In [8]:
# Launch one thread per output entry. The matrix size comes from the data
# rather than a repeated literal, and the launch geometry is named.
n = test_a.shape[0]
threads_per_side = 2
if n % threads_per_side != 0:
    # The launch below must cover exactly n x n threads.
    raise ValueError("matrix size must be a multiple of the block side length")
blocks_per_side = n // threads_per_side

matrix_ker(
    test_a_gpu,
    test_b_gpu,
    output_mat_gpu,
    np.int32(n),  # the kernel's N parameter is a C int
    block=(threads_per_side, threads_per_side, 1),
    grid=(blocks_per_side, blocks_per_side, 1),
)

# Compare the GPU result with the NumPy reference. Tolerances match the
# np.allclose defaults; a mismatch reports the differing values.
np.testing.assert_allclose(output_mat_gpu.get(), output_mat, rtol=1e-5, atol=1e-8)



In [9]:
# Fetch the GPU result as a host array and display it.
gpu_result_host = output_mat_gpu.get()
print(gpu_result_host)

[[140. 130. 120. 110.]
 [140. 130. 120. 110.]
 [140. 130. 120. 110.]
 [140. 130. 120. 110.]]
