In [2]:
# function for checking accuracy
def check_accuracy(arr_f, arr_s):
    deltas = []
    if np.array_equal(arr_f, arr_s):
        return [1.0, 0.0]
    for i in range(len(arr_f)):
        if arr_f[i] != arr_s[i]:
            deltas.append(max(arr_f[i], arr_s[i]) - min(arr_f[i], arr_s[i]))
    return [(len(arr_f) - len(deltas))/len(arr_f), np.sum(deltas)/len(deltas)]

In [48]:
# Main framework code
import pyopencl as cl
from time import time # Import time tools


class Framework:
    def __init__(self):
        self.f_ctx = cl.create_some_context()
        self.f_queue = cl.CommandQueue(self.f_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
        self.sumOp = ArraySum(self.f_ctx)
        self.ReLU = ReLU(self.f_ctx)
        
    def test(self, inp1, inp2, expected, operation):    
        mf = cl.mem_flags
        f_a_g = cl.Buffer(self.f_ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=inp1)
        f_b_g = cl.Buffer(self.f_ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=inp2)
        f_res_g = cl.Buffer(self.f_ctx, mf.WRITE_ONLY, inp1.nbytes)
        
        if (operation == "sum"):
            f_event = self.sumOp.__call__(self.f_queue, inp1, f_a_g, f_b_g, f_res_g)
            cpu_stats = self.sumOp.sum_on_cpu(inp1, inp2) # CPU
        elif (operation == "relu"):
            f_event = self.ReLU.__call__(self.f_queue, inp1, f_a_g, f_res_g)
            cpu_stats = self.ReLU.relu_on_cpu(inp1) # CPU
        
        f_event.wait()
        elapsed = 1e-9*(f_event.profile.end - f_event.profile.start) # Calculate the time it took to execute the kernel
        mem_bw = (inp1.nbytes + inp2.nbytes)/(elapsed*1024*1024*1024)
        print("GPU Kernel Time: {0}s".format(elapsed) + ", " + str(mem_bw) + " Gb/s") # Print the time it took to execute the kernel
        f_res_np = np.zeros_like(inp1) # TODO: why 'inp1' and not 'expected'? Program crash with 'expected'
        cl.enqueue_copy(self.f_queue, f_res_np, f_res_g)
        accur = check_accuracy(expected, f_res_np)
        print("Testing results:\nResult is %g percent accurate, delta = %f" % (accur[0]*100, accur[1]))
        
        # CPU
        cpu_time = cpu_stats[2] - cpu_stats[1]
        mem_cpu = (inp1.nbytes + inp2.nbytes)/(cpu_time*1024*1024*1024)
        print("CPU Time: {0}s".format(cpu_time) + ", " + str(mem_cpu) + " Gb/s")
        cpu_accur = check_accuracy(expected, cpu_stats[0])
        print("Testing results:\nResult is %g percent accurate, delta = %f" % (cpu_accur[0]*100, cpu_accur[1]))

In [53]:
# Testing of framework
import numpy as np
import pyopencl as cl

framework = Framework()
a_np = np.random.rand(5000000).astype(np.float32)
b_np = np.random.rand(5000000).astype(np.float32)
perfect_res = []
for i in range(len(a_np)):
    perfect_res.append(a_np[i] + b_np[i])
    
relu_np = np.random.rand(15).astype(np.float32) * 5 - 3
perfect_res_relu = ReLU(cl.create_some_context()).relu_on_cpu(relu_np)[0]

print("ArrayAdd:")
framework.test(a_np, b_np, perfect_res, "sum")
print("\nReLU:")
framework.test(relu_np, np.array([0]), perfect_res_relu, "relu") # TODO: np.array([0]) - is stupid, need to be fixed


ArrayAdd:
GPU Kernel Time: 0.032026s, 1.1632081116786093 Gb/s
Testing results:
Result is 100 percent accurate, delta = 0.000000
CPU Time: 1.653979778289795s, 0.022523191319266563 Gb/s
Testing results:
Result is 100 percent accurate, delta = 0.000000

ReLU:
GPU Kernel Time: 1.5e-05s, 0.004221995671590169 Gb/s
Testing results:
Result is 100 percent accurate, delta = 0.000000
CPU Time: 9.34600830078125e-05s, 0.0006776147959183673 Gb/s
Testing results:
Result is 100 percent accurate, delta = 0.000000


In [37]:
# possible operations representation way
class ArraySum:
    def __init__(self, sum_ctx):
        self.sum_prg = cl.Program(sum_ctx, """
        __kernel void sum(
            __global const float *a_g, __global const float *b_g, __global float *res_g)
        {
          int gid = get_global_id(0);
          res_g[gid] = a_g[gid] + b_g[gid];
        }
        """).build()
        
    def __call__(self, sum_queue, sum_a_np, sum_a_g, sum_b_g, sum_res_g):
        return self.sum_prg.sum(sum_queue, sum_a_np.shape, None, sum_a_g, sum_b_g, sum_res_g)
    # CPU
    def sum_on_cpu(self, sum_a_g, sum_b_g):
        res_cpu = np.empty_like(sum_a_g)
        cpu_start_time = time() # Get the CPU start time
        for i in range(len(sum_a_g)):
            res_cpu[i] = sum_a_g[i] + sum_b_g[i]
        cpu_end_time = time()
        return [res_cpu, cpu_start_time, cpu_end_time]
    
class ReLU:
    def __init__(self, relu_ctx):
        self.relu_prg = cl.Program(relu_ctx, """
        __kernel void relu(
            __global const float *a_g, __global float *res_g)
        {
          int gid = get_global_id(0);
          if (a_g[gid] < 0)
              res_g[gid] = 0;
          else res_g[gid] = a_g[gid];
        }
        """).build()
        
    def __call__(self, relu_queue, relu_a_np, relu_a_g, relu_res_g):
        return self.relu_prg.relu(relu_queue, relu_a_np.shape, None, relu_a_g, relu_res_g)
    
    def relu_on_cpu(self, relu_a_g):
        res_cpu = np.empty_like(relu_a_g)
        cpu_start_time = time() # Get the CPU start time
        for i in range(len(relu_a_g)):
            res_cpu[i] = relu_a_g[i] if (relu_a_g[i] > 0) else 0
        cpu_end_time = time()
        return [res_cpu, cpu_start_time, cpu_end_time]