# Notebook for Parallel Matrix Multiplication Experimentation

In [6]:
import numpy as np
import pycuda.gpuarray as gpuarray

from utils.context import Context


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# Tunable configuration: block size handed to the project Context helper and
# the location of the CUDA source that holds the kernel.
BLOCK_SIZE = 32
matrix_multiplication_kernel_path = './kernels/MatrixMultiplication.cu'

# NOTE(review): Context comes from utils.context; presumably it owns the CUDA
# context and derives launch dimensions from BLOCK_SIZE -- confirm there.
context = Context(BLOCK_SIZE)

# Obtain the module for the kernel source file through the context, then look
# up the function named 'MatMul' in it.
matmul_module = context.getSourceModule(matrix_multiplication_kernel_path)
matrix_multiplication = matmul_module.get_function('MatMul')

### Test Cases

Sequential (CPU reference result computed with NumPy)

In [9]:
# Small random integer operands (values 0..4) for a 2x4 by 4x2 product.
a = np.random.randint(0, 5, size=(2, 4))
b = np.random.randint(0, 5, size=(4, 2))

# CPU reference result computed by NumPy.
c_python = a @ b

# Show both operands followed by their product, one array after another.
print(a, b, c_python, sep='\n')

[[2 3 3 3]
 [1 4 1 0]]
[[0 3]
 [2 1]
 [2 3]
 [0 4]]
[[12 30]
 [10 10]]


Parallel (same product computed on the GPU with the `MatMul` CUDA kernel)

In [13]:
# Run the product on the GPU with the `MatMul` kernel and verify the result
# against NumPy, so the test case fails loudly instead of relying on a visual
# comparison of two printouts.

# The device buffers are float32, so convert the operands before uploading.
# NOTE(review): presumably the kernel takes `float*` arguments -- confirm
# against MatrixMultiplication.cu.
a = a.astype(np.float32)
b = b.astype(np.float32)
# Allocate the result directly as float32 (no intermediate float64 array).
c = np.zeros((a.shape[0], b.shape[1]), dtype=np.float32)

# Copy the operands and the zero-initialised result buffer to the device.
a_d = gpuarray.to_gpu(a)
b_d = gpuarray.to_gpu(b)
c_d = gpuarray.to_gpu(c)

# Launch dimensions come from the project Context helper; the grid is sized
# from the larger of the two output dimensions.
# NOTE(review): the exact shape returned by grid_dims is defined in
# utils.context -- confirm it covers the whole output matrix.
block = context.block_dims
grid = context.grid_dims(max(a.shape[0], b.shape[1]))

# Scalar arguments are passed as int32: the (rows, cols) pairs of a, b and c.
# NOTE(review): argument order must match the kernel signature -- confirm
# against MatrixMultiplication.cu.
matrix_multiplication(a_d, b_d, c_d, np.int32(a.shape[0]),
                      np.int32(a.shape[1]), np.int32(b.shape[0]),
                      np.int32(b.shape[1]), np.int32(c.shape[0]), np.int32(c.shape[1]),
                      block=block, grid=grid)

# Copy the result back to host memory.
c = c_d.get()

# Check the GPU result against a NumPy product of the same float32 operands.
np.testing.assert_allclose(c, np.matmul(a, b), rtol=1e-5)

print(c)


[[12. 30.]
 [10. 10.]]
