# Notebook for Elementary Operation Experimentation

In [1]:
import numpy as np
import pycuda.gpuarray as gpuarray

from utils.context import Context


%load_ext autoreload
%autoreload 2

## Matrix Multiplication

In [2]:
# Block size handed to the project's Context helper.
# NOTE(review): presumably the CUDA threads-per-block edge length — confirm in utils.context.
BLOCK_SIZE = 32
context = Context(BLOCK_SIZE)

# Obtain a module for the kernel source file through the Context helper,
# then look up its 'MatMul' entry point.
matrix_multiplication_kernel_path = './kernels/MatrixMultiplication.cu'
matrix_multiplication_module = context.getSourceModule(matrix_multiplication_kernel_path)
matrix_multiplication = matrix_multiplication_module.get_function('MatMul')


### Test Cases

Sequential (CPU reference result computed with NumPy)

In [None]:
# Random integer operands with values in [0, 5); the inner dimension (49)
# matches so that a (28x49) times b (49x29) is well defined.
a = np.random.randint(0, 5, size=(28, 49))
b = np.random.randint(0, 5, size=(49, 29))

# CPU reference product that the GPU result is compared against later.
c_python = a @ b

# Show both operands followed by the reference product.
for matrix in (a, b, c_python):
    print(matrix)

Parallel (GPU kernel launched through PyCUDA and checked against the NumPy result)

In [None]:
# Cast the operands to float32 and allocate a zero-filled float32 result
# with the shape of the product (rows of a, columns of b).
a = a.astype(np.float32)
b = b.astype(np.float32)
c = np.zeros((a.shape[0], b.shape[1]), dtype=np.float32)

# Copy the three host arrays to the device, in the order a, b, c.
a_d, b_d, c_d = (gpuarray.to_gpu(host_array) for host_array in (a, b, c))

# Launch configuration: block dimensions come from the Context helper and
# the grid is sized from the larger of the two output dimensions.
block = context.block_dims
grid = context.grid_dims(max(a.shape[0], b.shape[1]))

# After the three buffers the kernel receives the row and column counts of
# a, b and c (in that order) as 32-bit integers.
shape_args = [np.int32(extent) for matrix in (a, b, c) for extent in matrix.shape]
matrix_multiplication(a_d, b_d, c_d, *shape_args, block=block, grid=grid)

# Copy the result back to the host and compare it with the NumPy reference.
c = c_d.get()

print(c)
print("Correct: ", np.allclose(c_python, c))

## Matrix Transpose

Status: the transpose kernel is now working; the test case below checks its output against NumPy's `a.T`.

In [None]:
# Obtain a module for the transpose kernel source through the existing
# Context helper, then look up its 'MatTran' entry point.
matrix_transpose_kernel_path = './kernels/MatrixTranspose.cu'
matrix_transpose_module = context.getSourceModule(matrix_transpose_kernel_path)
matrix_transpose = matrix_transpose_module.get_function('MatTran')


### Test Cases

In [None]:
# Random 40x50 integer matrix with values in [0, 5) and its NumPy transpose,
# which serves as the reference for the GPU kernel.
a = np.random.randint(0, 5, size=(40, 50))
a_t = np.transpose(a)

# Print the matrix, a blank separator line, then the transpose.
print(a, "", a_t, sep="\n")

In [None]:
# Cast the input to float32 and allocate a zero-filled float32 output whose
# shape is the transpose of a's shape.
a = a.astype(np.float32)
b = np.zeros((a.shape[1], a.shape[0]), dtype=np.float32)

# Copy the input and the output buffer to the device.
a_d = gpuarray.to_gpu(a)
b_d = gpuarray.to_gpu(b)

block = context.block_dims
# Size the grid from the larger of a's two dimensions. The earlier expression
# max([a.shape[0], b.shape[1]]) compared a.shape[0] with itself (b.shape[1] is
# a.shape[0] by construction), so a's column count never influenced the grid.
# NOTE(review): assumes grid_dims(n) yields a grid covering n elements per axis
# and that the kernel bounds-checks its indices — confirm in utils.context and
# MatrixTranspose.cu.
grid = context.grid_dims(max(a.shape))

# The kernel receives a's column count followed by its row count as 32-bit ints.
matrix_transpose(a_d, b_d, np.int32(a.shape[1]), np.int32(a.shape[0]), block=block, grid=grid)

# Copy the result back to the host.
b = b_d.get()

print(a.shape)
print(b.shape)

# Show the GPU result next to the NumPy reference and report whether they match.
print(b)
print(a_t)
print("Correct: ", np.allclose(a.T, b))

## Element Wise Multiplication

In [2]:
# This section builds its own Context, so it can be run right after the
# import cell without executing the matrix sections above.
BLOCK_SIZE = 32
context = Context(BLOCK_SIZE)

# Obtain a module for the element-wise kernel source and look up the
# 'MatEleMul' entry point.
element_multiplication_kernel_path = './kernels/ElementWise.cu'
element_wise_mul_module = context.getSourceModule(element_multiplication_kernel_path)
element_multiplication = element_wise_mul_module.get_function('MatEleMul')


In [3]:
# Two random 490000x49 integer matrices with values in [0, 5).
a = np.random.randint(0, 5, size=(490000, 49))
b = np.random.randint(0, 5, size=(490000, 49))

# CPU reference: element-wise product.
c_python = np.multiply(a, b)

# Zero-filled float32 host buffer that will receive the GPU result.
c = np.zeros((a.shape[0], b.shape[1]), dtype=np.float32)


In [4]:
# Cast both operands to float32 before copying them to the device.
a = a.astype(np.float32)
b = b.astype(np.float32)

# Copy the operands and the result buffer to the device, in the order a, b, c.
a_d, b_d, c_d = (gpuarray.to_gpu(host_array) for host_array in (a, b, c))

# Launch configuration: the grid is sized from the larger of a's row count
# and b's column count.
block = context.block_dims
grid = context.grid_dims(max(a.shape[0], b.shape[1]))

# After the three buffers the kernel receives a's row and column counts as
# 32-bit integers.
element_multiplication(a_d, b_d, c_d, *map(np.int32, a.shape), block=block, grid=grid)

# Copy the result back and compare it with the NumPy reference.
c = c_d.get()

print("Correct: ", np.allclose(c, c_python))

Correct:  True


## Element Wise Add

In [5]:
# 'MatEleAdd' is looked up in a module obtained from the same source path as
# the element-wise multiply kernel, which is why that path variable is reused.
element_wise_add_module = context.getSourceModule(element_multiplication_kernel_path)
element_add = element_wise_add_module.get_function('MatEleAdd')


In [6]:
# Two random 490000x49 integer matrices with values in [0, 5).
a = np.random.randint(0, 5, size=(490000, 49))
b = np.random.randint(0, 5, size=(490000, 49))

# CPU reference: element-wise sum.
c_python = np.add(a, b)

# Zero-filled float32 host buffer that will receive the GPU result.
c = np.zeros((a.shape[0], b.shape[1]), dtype=np.float32)


In [7]:
# Cast both operands to float32 before copying them to the device.
a = a.astype(np.float32)
b = b.astype(np.float32)

# Copy the operands and the result buffer to the device, in the order a, b, c.
a_d, b_d, c_d = (gpuarray.to_gpu(host_array) for host_array in (a, b, c))

# Launch configuration: the grid is sized from the larger of a's row count
# and b's column count.
block = context.block_dims
grid = context.grid_dims(max(a.shape[0], b.shape[1]))

# After the three buffers the kernel receives a's row and column counts as
# 32-bit integers.
element_add(a_d, b_d, c_d, *map(np.int32, a.shape), block=block, grid=grid)

# Copy the result back and compare it with the NumPy reference.
c = c_d.get()

print("Correct: ", np.allclose(c, c_python))

Correct:  True
