# PyOpenCL

In [1]:
import pyopencl as cl
import pyopencl.array
import numpy as np

In [2]:
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

## Elementwise Kernel

A generalized kernel generator for elementwise operations.  
No need to worry about work group sizes.  
Similar to C. Minimal OpenCL stuff!

### The square problem

In [3]:
from pyopencl.elementwise import ElementwiseKernel

In [4]:
a_g = cl.array.arange(queue, 0, 10, 1, dtype=np.float32)

In [5]:
src = """
    float a_i = a[i];
    a[i] = a_i * a_i;
"""

square = ElementwiseKernel(ctx, "float* a", src)

In [6]:
square(a_g)

<pyopencl.cffi_cl.Event at 0x1152945d0>

In [7]:
print(a_g)

[  0.   1.   4.   9.  16.  25.  36.  49.  64.  81.]


Each operation on an array has a kernel call.  
For example,

In [32]:
a_g = 1 + cl.array.zeros(queue, 10**7, dtype=np.float32)
b_g = 1 + cl.array.zeros(queue, 10**7, dtype=np.float32)

In [33]:
import time

start = time.time()

for i in range(10):
    c_g = 2 * a_g + 3 * b_g

queue.finish()

end = time.time()

print(c_g)
print("Time taken: %f" % (end - start))

[ 5.  5.  5. ...,  5.  5.  5.]
Time taken: 2.237150


In [35]:
lincomb = ElementwiseKernel(ctx, "float a, float* x, float b, float* y, float* res",
                            "res[i] = a * x[i] + b * y[i];")

In [36]:
start = time.time()

c_g = cl.array.empty(queue, 10**7, dtype=np.float32)
for i in range(10):
    lincomb(2, a_g, 3, b_g, c_g)

queue.finish()

end = time.time()

print(c_g)
print("Time taken: %f" % (end - start))

[ 5.  5.  5. ...,  5.  5.  5.]
Time taken: 0.950778
