# Debugging OpenCL applications

In [None]:
import numpy as np

from matplotlib import pyplot as plt

%matplotlib widget

# Matrix sizes for the product C = A @ B:
# A is of size (NROWS_C, NCOLS_A)
# B is of size (NCOLS_A, NCOLS_C)
# C is of size (NROWS_C, NCOLS_C)

NCOLS_A = 256
NROWS_C = 520
NCOLS_C = 1032

# Data type
dtype = np.float32

# Seed the random number generator so that every run of this cell
# (including Restart & Run All) produces the same A and B, and hence
# the same reference answer and residual plot.
SEED = 0
rng = np.random.default_rng(SEED)

# Make up the input arrays A and B with uniform random values in [0, 1)
A = rng.random(size = (NROWS_C, NCOLS_A)).astype(dtype)
B = rng.random(size = (NCOLS_A, NCOLS_C)).astype(dtype)

# Make up the reference answer with NumPy
C = np.matmul(A, B, dtype = dtype)

# Write out the input arrays as raw binary files
A.tofile("array_A.dat")
B.tofile("array_B.dat")

In [None]:
!make clean; make; ./mat_mult_buggy.exe

In [5]:
# Import axes machinery
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Load the result written by the OpenCL program and give it the shape of C
C_ocl = np.fromfile("array_C.dat", dtype=dtype).reshape((NROWS_C, NCOLS_C))

# Images to show: the NumPy answer, the OpenCL answer, and their absolute difference
data = [C, C_ocl, np.abs(C-C_ocl)]

# Title for each image, in the same order as the data
labels = ["Numpy", "OpenCL", "Absolute residual"]

# One panel per image, stacked vertically with shared x and y axes
fig, axes = plt.subplots(3, 1, figsize=(6,8), sharex=True, sharey=True)

for panel, image, title in zip(axes, data, labels):
    # Draw the image and title the panel
    mappable = panel.imshow(image)
    panel.set_title(title)
    panel.set_xlabel("Dimension 1 (columns)")
    panel.set_ylabel("Dimension 0 (rows)")

    # Carve a narrow strip off the right-hand side of the panel for the color bar
    colorbar_axes = make_axes_locatable(panel).append_axes("right", size="5%", pad=0.1)
    plt.colorbar(mappable=mappable, cax=colorbar_axes)

fig.tight_layout()
plt.show()

## Debugging with printf

## Debugging with GDB

## Debugging and validation with Oclgrind

In [None]:
# Run the same executable under oclgrind
!oclgrind ./mat_mult_buggy.exe

	               name: Oclgrind Simulator 
	 global memory size: 134 MB
	    max buffer size: 134 MB
	     max local size: (1024,1024,1024)
	     max work-items: 1024

Invalid read of size 4 at global memory address 0x1000000082000
	Kernel: mat_mult
	Entity: Global(520,0,0) Local(8,0,0) Group(32,0,0)
	  %0 = load float, float addrspace(1)* %arrayidx, align 4, !dbg !54, !tbaa !47
	At line 30 (column 19) of input.cl:
	  temp+=A[i0*N1_A+n]*B[n*N1_C+i1]; 
	

Invalid read of size 4 at global memory address 0x1000000082004
	Kernel: mat_mult
	Entity: Global(520,0,0) Local(8,0,0) Group(32,0,0)
	  %0 = load float, float addrspace(1)* %arrayidx, align 4, !dbg !54, !tbaa !47
	At line 30 (column 19) of input.cl:
	  temp+=A[i0*N1_A+n]*B[n*N1_C+i1]; 
	

Invalid read of size 4 at global memory address 0x1000000082008
	Kernel: mat_mult
	Entity: Global(520,0,0) Local(8,0,0) Group(32,0,0)
	  %0 = load float, float addrspace(1)* %arrayidx, align 4, !dbg !54, !tbaa !47
	At line 30 (column 19) of input.cl:


## Exercise