Writing a naive OpenCL kernel for matmul and comparing that kernel with numpy's CPU implementation.

In [1]:
import pyopencl as cl 
import numpy as np
import time
import timeit

In [2]:
ctx = cl.create_some_context()

# Create the queue with profiling enabled so the kernel's event exposes the
# device timestamps that are read later for timing.
queue = cl.CommandQueue(
    ctx,
    properties=cl.command_queue_properties.PROFILING_ENABLE,
)

In [3]:
# OpenCL C source for a naive matrix multiplication, res = first x second.
#
# - Each work-item is identified by (get_global_id(0), get_global_id(1)) and
#   computes exactly one output element: the dot product of row `idx_row` of
#   `first` with column `idx_col` of `second`.
# - All three matrices are passed as flat float buffers indexed row-major
#   (element (r, c) lives at size*r + c).
# - The matrix side length is not an argument; it is read from
#   get_global_size(0). The kernel therefore only works for square
#   size x size matrices launched with a global range whose first dimension
#   equals that side length.
# - The running sum is kept in the local variable `val` and written to `res`
#   once, after the loop.
krnl = """
__kernel void mat_mul_multi_core(__global float *res, __global float *first, __global float *second){
    int idx_row = get_global_id(0);
    int idx_col = get_global_id(1);
    int size = get_global_size(0);
    // using memory closest to the core is probably better than using global memory. have to think 
    // about this more though. 
    float val = 0; 
    for(int i=0; i<size; ++i){
       //btw, memory is contiguously allocated, so indexing like this. Stride = (size, 1).
        val += first[size*idx_row+i] * second[size*i+idx_col];
    }
    res[size*idx_row+idx_col] = val;
}
"""
# Compile the kernel source for the devices in `ctx`; kernels are then
# reachable as attributes of `krnl_prog`.
krnl_prog = cl.Program(ctx, krnl).build()

In [4]:
mf = cl.mem_flags

# The matrices are sz x sz; you can play with this. The kernel launch further
# down uses 16x16 work-groups, so keep sz a multiple of 16.
sz = 4096

# Seeded generator: every Restart & Run All benchmarks the same inputs.
SEED = 0
rng = np.random.default_rng(SEED)

# Uniform samples in [0, 1), generated directly as float32 (the kernel works on
# `float`), which avoids a temporary float64 array of twice the size.
first_host = rng.random((sz, sz), dtype=np.float32)
second_host = rng.random((sz, sz), dtype=np.float32)

# Device-side inputs: read-only for the kernel, initialised by copying the
# host arrays at buffer creation time.
first_device = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=first_host)
second_device = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=second_host)

# Device-side output: same byte size as one input matrix, written by the kernel.
result = cl.Buffer(ctx, mf.WRITE_ONLY, first_host.nbytes)

See https://man.opencl.org/clGetEventProfilingInfo.html for the OpenCL event profiling options used in the next cell.

In [5]:
krnl_second_obj = krnl_prog.mat_mul_multi_core

# One work-item per output element, grouped into 16x16 work-groups.
global_size = (sz, sz)
local_size = (16, 16)

# Host-side wall clock around "enqueue the kernel and wait for it to finish".
start_time = time.perf_counter()
event = krnl_second_obj(queue, global_size, local_size, result, first_device, second_device)
event.wait()
end_time = time.perf_counter()

# The string literal below is inert (it is not the cell's last expression, so
# nothing is executed or displayed); it only keeps a snippet around for
# copying the result back to the host and printing it.
"""
# if you want to see the result of the computation.
res_np = np.empty_like(first_host)
cl.enqueue_copy(queue, res_np, result)
print(res_np)
"""

# OpenCL profiling timestamps are in nanoseconds; convert to seconds.
NS_TO_S = 1e-9
profile = event.profile

# 1) Wall-clock time measured from Python. This includes whatever overhead the
#    pyopencl wrapper and the enqueue itself add on top of the kernel run.
gpu_speed = end_time - start_time

# 2) Device time between the start and the end of the kernel's execution.
gpu_speed_2 = (profile.end - profile.start) * NS_TO_S

# 3) Time from the command being enqueued until it (and any child commands)
#    has completed. This is the figure used for the CPU comparison later on.
gpu_speed_3 = (profile.complete - profile.queued) * NS_TO_S

# For larger matrices the three measurements are close to each other.
f"GPU speeds measured in different ways = {gpu_speed, gpu_speed_2, gpu_speed_3}"

'GPU speeds measured in different ways = (5.60436337502324, 5.56757392, 5.604012)'

In [6]:
# Time numpy's matrix product of the same two host arrays; this runs on the CPU.
start_time = time.perf_counter()
res = np.matmul(first_host, second_host)
end_time = time.perf_counter()

# Elapsed wall-clock seconds for the CPU matmul.
CPU_speed = end_time - start_time

# Uncomment to inspect numpy's build configuration:
# np.show_config()

f"CPU speed = {CPU_speed}"

'CPU speed = 1.0374637419881765'

In [7]:
# Compare the CPU wall-clock time with the GPU's queued -> complete time.
# The side with the smaller time is the faster one; `speedup` is the slower
# time divided by the faster time, rounded to the nearest integer.
if CPU_speed < gpu_speed_3:
    faster, slower = "CPU", "GPU"
    speedup = round(gpu_speed_3 / CPU_speed)
else:
    faster, slower = "GPU", "CPU"
    speedup = round(CPU_speed / gpu_speed_3)

if speedup == 1:
    # The ratio rounded down to 1, i.e. the two times are within ~1.5x.
    print(f"They are roughly the same speed. {faster} is a bit faster.")
else:
    print(f"{faster} is roughly {speedup}x as fast as the {slower}.")

CPU is roughly 5x as fast as the GPU.


You can play with different matrix sizes, the range of numbers in the matrices, work-group sizes, etc. In general, what you find is that numpy's CPU implementation outperforms the GPU irrespective of what you do, for this kernel at least. Sometimes the CPU timing can be inconsistent, probably because there is a bunch of other stuff running on the CPU while the GPU is mostly idle, but the variation is not significant. In some configurations their speeds are roughly the same, or the GPU wins by a negligible margin, while in most configurations (and practical scenarios) the CPU is significantly faster. Numpy's implementation is just better than this naive kernel. Maybe a different kernel would do better. Will try.