In [155]:
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

from cusprec.constants import KERNEL_PATH, BLOCK_SIZE

In [164]:
# Load the CUDA source that provides the kernels exercised in this notebook.
with open(KERNEL_PATH / "algorithms" / "ops.cu") as kernel_file:
    kernel = kernel_file.read()

N = 256  # number of elements in each test vector

mod = SourceModule(kernel)
dot_product = mod.get_function("dot")

a = np.random.randint(0, 1000, N).astype(np.int32)
b = np.random.randint(0, 1000, N).astype(np.int32)
# Zero-filled (rather than np.empty) so the value uploaded below is defined.
c = np.zeros(1, dtype=np.int32)

dev_a = cuda.mem_alloc(a.nbytes)
dev_b = cuda.mem_alloc(b.nbytes)
dev_c = cuda.mem_alloc(c.nbytes)

cuda.memcpy_htod(dev_a, a)
cuda.memcpy_htod(dev_b, b)
# Device allocations are not cleared, so give the output a known starting
# value before launching.
# NOTE(review): the "dot" kernel source is not visible from this notebook; if
# it accumulates into its output, a zeroed start is required. If it simply
# overwrites the output, this upload is harmless.
cuda.memcpy_htod(dev_c, c)

# Ceil-divide so the launch has at least N threads along x.
grid = (N + BLOCK_SIZE[0] - 1) // BLOCK_SIZE[0]
dot_product(dev_a, dev_b, dev_c, block=BLOCK_SIZE, grid=(grid, 1))
cuda.memcpy_dtoh(c, dev_c)
# Compare the single device result with NumPy's dot product.
(c == np.dot(a, b))[0]

True

In [165]:
# Element-wise addition: run the "add" kernel and check it against NumPy.
add = mod.get_function("add")

a = np.random.randint(0, 1000, N).astype(np.int32)
b = np.random.randint(0, 1000, N).astype(np.int32)
c = np.empty(N, dtype=np.int32)

# One device buffer per host array, each sized to match its host counterpart.
dev_a, dev_b, dev_c = (cuda.mem_alloc(host.nbytes) for host in (a, b, c))

# Only the two operands are uploaded; dev_c is read back after the launch.
cuda.memcpy_htod(dev_a, a)
cuda.memcpy_htod(dev_b, b)

# Number of blocks along x: ceil(N / BLOCK_SIZE[0]).
grid = (N - 1) // BLOCK_SIZE[0] + 1
add(dev_a, dev_b, dev_c, grid=(grid, 1), block=BLOCK_SIZE)
cuda.memcpy_dtoh(c, dev_c)
np.all(c == a + b)

True

In [166]:
# Element-wise subtraction: run the "sub" kernel and check it against NumPy.
sub = mod.get_function("sub")

a = np.random.randint(0, 1000, N).astype(np.int32)
b = np.random.randint(0, 1000, N).astype(np.int32)
# Host buffer that receives the device result.
c = np.empty(N, dtype=np.int32)

dev_a = cuda.mem_alloc(a.nbytes)
dev_b = cuda.mem_alloc(b.nbytes)
dev_c = cuda.mem_alloc(c.nbytes)

cuda.memcpy_htod(dev_a, a)
cuda.memcpy_htod(dev_b, b)

# Compute the launch grid in this cell rather than relying on a `grid` value
# left in the kernel namespace by a previously executed cell.
grid = (N + BLOCK_SIZE[0] - 1) // BLOCK_SIZE[0]
sub(dev_a, dev_b, dev_c, block=BLOCK_SIZE, grid=(grid, 1))
cuda.memcpy_dtoh(c, dev_c)
(c == np.subtract(a, b)).all()

True

In [167]:
# Scalar multiplication: run the "scalarMultiply" kernel and check it against NumPy.
scalar_multiply = mod.get_function("scalarMultiply")

scalar = 2.5
a = np.random.randint(0, 1000, N).astype(np.float32)
# Host buffer that receives the device result.
c = np.empty(N, dtype=np.float32)

dev_a = cuda.mem_alloc(a.nbytes)
dev_c = cuda.mem_alloc(c.nbytes)

cuda.memcpy_htod(dev_a, a)

# Compute the launch grid in this cell rather than relying on a `grid` value
# left in the kernel namespace by a previously executed cell.
grid = (N + BLOCK_SIZE[0] - 1) // BLOCK_SIZE[0]
scalar_multiply(dev_a, np.float32(scalar), dev_c, block=BLOCK_SIZE, grid=(grid, 1))
cuda.memcpy_dtoh(c, dev_c)
# The host reference uses the same float32 scalar that was passed to the kernel.
# Exact equality is acceptable here: the inputs are integer-valued floats below
# 1000 and 2.5 is exactly representable, so every product is exact in float32.
(c == a * np.float32(scalar)).all()

True