# Let's Start

In [3]:
import pyopencl as cl
import numpy as np 

In [4]:
# Names of all OpenCL platforms reported by cl.get_platforms().
opencl_platforms = [platform.name for platform in cl.get_platforms()]
print(opencl_platforms)

['NVIDIA CUDA', 'AMD Accelerated Parallel Processing', 'OpenCLOn12']


In [7]:
# Select the platform named "NVIDIA CUDA". Fail with a readable message
# (instead of a bare IndexError from `[...][0]`) when it is not present.
nvidia_platform = next(
    (platform for platform in cl.get_platforms() if platform.name == "NVIDIA CUDA"),
    None,
)
if nvidia_platform is None:
    raise RuntimeError(
        "OpenCL platform 'NVIDIA CUDA' not found; available platforms: "
        f"{[platform.name for platform in cl.get_platforms()]}"
    )

# All devices exposed by the selected platform (displayed as the cell output).
nvidia_devices = nvidia_platform.get_devices()
nvidia_devices

[<pyopencl.Device 'NVIDIA GeForce GTX 1650' on 'NVIDIA CUDA' at 0x2878c1421f0>]

### Building Programs 

Use PyOpenCL to create an OpenCL context, then declare the OpenCL kernel source and compile it.

#### The kernel computes a simple element-wise vector sum, c = a + b

In [8]:
# Create an OpenCL context on the devices selected earlier.
nvidia_context = cl.Context(devices = nvidia_devices)

# OpenCL C source of the `sum` kernel: each work-item reads its global id in
# dimension 0 and writes c[gid] = a[gid] + b[gid]. The kernel has no bounds
# check, so the launch size must not exceed the length of the buffers.
program_source = """
    kernel void sum(global float *a, global float *b, global float *c){
        int gid = get_global_id(0);
        c[gid] = a[gid] + b[gid];
    }
"""

# Wrap the source in a Program bound to the context, then build it.
nvidia_program_source = cl.Program(nvidia_context, program_source)
nvidia_program = nvidia_program_source.build()

In [11]:
# Ask the built program which kernel names it contains and print them.
program_kernel_names = nvidia_program.get_info(cl.program_info.KERNEL_NAMES)
print("Kernel Names:", program_kernel_names)

Kernel Names: sum


In [12]:
def run_ocl_kernel(queue, kernel, global_size, input_tuples, output_tuples, local_size = (32,)):
    """Upload the inputs, launch ``kernel``, download the outputs, then wait.

    Parameters
    ----------
    queue
        Command queue on which the copies and the kernel launch are enqueued.
    kernel
        Callable invoked as ``kernel(queue, global_size, local_size, *buffers)``.
    global_size
        Global work size forwarded unchanged to the kernel call.
    input_tuples
        Iterable of ``(host_array, device_buffer)`` pairs; every host array is
        copied into its buffer before the launch.
    output_tuples
        Iterable of ``(host_array, device_buffer)`` pairs; every buffer is
        copied back into its host array after the launch.
    local_size
        Local work size forwarded unchanged to the kernel call.
        Defaults to ``(32,)``.

    Notes
    -----
    The kernel receives all input buffers first, followed by all output
    buffers, each group in the order it was given. Returns ``None``.
    """
    # Host -> device transfers.
    for host_array, device_buffer in input_tuples:
        cl.enqueue_copy(queue, dest = device_buffer, src = host_array)

    # Kernel arguments: input buffers first, output buffers after them.
    device_args = [device_buffer for (_host, device_buffer) in input_tuples]
    device_args.extend(device_buffer for (_host, device_buffer) in output_tuples)
    kernel(queue, global_size, local_size, *device_args)

    # Device -> host transfers.
    for host_array, device_buffer in output_tuples:
        cl.enqueue_copy(queue, dest = host_array, src = device_buffer)

    # Block until every command enqueued above has completed.
    queue.finish()

In [13]:
def check_sum_results(a,b,c):
    """Print whether ``c`` equals the host-computed reference ``a + b``.

    Parameters
    ----------
    a, b : array-like
        The two operands; the reference result is ``a + b`` computed with NumPy.
    c : array-like
        The result to verify (e.g. the array copied back from the device).

    Prints ``"Result Matches"`` when ``c`` has the same shape as ``a + b`` and
    every element is exactly equal, otherwise ``"Result does not match"``.
    Returns ``None``.
    """
    c_ref = a + b
    # Compare element-wise instead of summing |c - c_ref|: a NaN in `c` turns
    # that sum into NaN, and `NaN > 0.0` is False, which used to be reported
    # as a match. array_equal also rejects a shape mismatch rather than
    # letting broadcasting hide it. NaNs count as equal only when they occur
    # at the same positions in both arrays.
    if np.array_equal(c, c_ref, equal_nan=True):
        print("Result Matches")
    else:
        print("Result does not match")

### Preparing Synthetic Data

In [14]:
# Problem size: number of elements in each vector.
N = 2**20

# Seeded generator so the synthetic inputs are identical on every run; the
# values are drawn directly as float32 (no float64 temporary).
rng = np.random.default_rng(0)
a = rng.random(N, dtype=np.float32)
b = rng.random(N, dtype=np.float32)

# Uninitialised host array with the shape and dtype of `a`, to hold the result.
c = np.empty_like(a)

### Device Memory Setup

In [15]:
# Device buffers sized to match the host arrays: the two inputs are allocated
# READ_ONLY and the output WRITE_ONLY.
a_nvidia_buffer = cl.Buffer(
    nvidia_context,
    flags=cl.mem_flags.READ_ONLY,
    size=a.nbytes,
)
b_nvidia_buffer = cl.Buffer(
    nvidia_context,
    flags=cl.mem_flags.READ_ONLY,
    size=b.nbytes,
)
c_nvidia_buffer = cl.Buffer(
    nvidia_context,
    flags=cl.mem_flags.WRITE_ONLY,
    size=c.nbytes,
)

In [16]:
# Command queue on the NVIDIA context; the copies and kernel launches issued by
# run_ocl_kernel are enqueued on it.
nvidia_queue = cl.CommandQueue(nvidia_context)

In [18]:
# Pair every host array with its device buffer. run_ocl_kernel passes the input
# buffers to the kernel first and the output buffers after them, which matches
# the kernel signature sum(a, b, c).
input_tuples = ((a, a_nvidia_buffer), (b, b_nvidia_buffer))
output_tuples = ((c, c_nvidia_buffer),)
# Global size (N,): one work-item per vector element; local size left at its default.
run_ocl_kernel(nvidia_queue, nvidia_program.sum, (N,), input_tuples, output_tuples)

In [19]:
# Compare the array copied back from the device with a + b computed on the host.
check_sum_results(a, b, c)

Result Matches


In [23]:
# Time the whole round trip: host-to-device copies, kernel launch, device-to-host copy.
%timeit run_ocl_kernel(nvidia_queue, nvidia_program.sum, (N,), input_tuples, output_tuples)

4.66 ms ± 26.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [25]:
# Same benchmark as the previous cell, re-run with the laptop charger connected
# (presumably to see whether running on AC power changes the timing).

%timeit run_ocl_kernel(nvidia_queue, nvidia_program.sum, (N,), input_tuples, output_tuples)

4.48 ms ± 209 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Using GLOBAL MEMORY

In [35]:
def create_input_memory(context, input_array):
    """Allocate one READ_ONLY device buffer per host array.

    Parameters
    ----------
    context
        OpenCL context in which the buffers are created.
    input_array
        Iterable of host arrays; each must expose ``nbytes``.

    Returns
    -------
    list
        ``(host_array, buffer)`` tuples in input order. Only the allocation
        happens here; no data is copied.
    """
    paired = []
    for host_array in input_array:
        device_buffer = cl.Buffer(context, flags = cl.mem_flags.READ_ONLY, size = host_array.nbytes)
        paired.append((host_array, device_buffer))
    return paired

In [36]:
def create_output_memory(context, output_array):
    """Allocate one WRITE_ONLY device buffer per host array.

    Parameters
    ----------
    context
        OpenCL context in which the buffers are created.
    output_array
        Iterable of host arrays; each must expose ``nbytes``.

    Returns
    -------
    list
        ``(host_array, buffer)`` tuples in input order. Only the allocation
        happens here; no data is copied.
    """
    paired = []
    for host_array in output_array:
        device_buffer = cl.Buffer(context, flags = cl.mem_flags.WRITE_ONLY, size = host_array.nbytes)
        paired.append((host_array, device_buffer))
    return paired

In [37]:
# Problem size: number of elements in each vector.
N = 2**20

# Seeded generator so the synthetic inputs are identical on every run; the
# values are drawn directly as float32 (no float64 temporary).
rng = np.random.default_rng(0)
a = rng.random(N, dtype=np.float32)
b = rng.random(N, dtype=np.float32)

# Uninitialised host array with the shape and dtype of `a`, to hold the result.
c = np.empty_like(a)

In [38]:
# Allocate device buffers for the host arrays with the helper functions, run
# the `sum` kernel with one work-item per element, then check the result
# against a + b computed on the host.
input_tuples = create_input_memory(nvidia_context, (a, b, ))
output_tuples = create_output_memory(nvidia_context, (c,))
run_ocl_kernel(nvidia_queue, nvidia_program.sum, (N,), input_tuples, output_tuples)
check_sum_results(a,b,c)

Result Matches
