# Introduction to GPU Programming with Python
## Solutions to notebook 4

### Main example: Matrix multiplication using Numba CUDA (in global memory only)

In [None]:
import numpy as np
from numba import cuda

In [None]:
#Part 1: Create matrices A,B,C as numpy arrays. Fill A and B with random numbers.
# One shape constant keeps the three matrices consistent, and a seeded
# generator makes the inputs reproducible across "Restart & Run All".
MATRIX_SHAPE = (512, 512)
rng = np.random.default_rng(42)

# Draw the inputs directly as float32 (no float64 intermediate followed by a cast).
A = rng.random(MATRIX_SHAPE, dtype=np.float32)
B = rng.random(MATRIX_SHAPE, dtype=np.float32)
# Output buffer, zero-initialised and allocated as float32 from the start.
C = np.zeros(MATRIX_SHAPE, dtype=np.float32)

In [None]:
#Part 2: Calculate number of blocks and threads
# Threads per block along each axis (32 x 32 threads per block).
NumThreads=32
# Ceiling division, so the grid still covers C when an extent is not a multiple
# of NumThreads. Each grid axis is sized from its own dimension of C so that a
# non-square C is covered as well; NumBlocks keeps its meaning (blocks along axis 0).
NumBlocks = (C.shape[0]+(NumThreads-1))//NumThreads
NumBlocksCols = (C.shape[1]+(NumThreads-1))//NumThreads
blockdim = (NumThreads,NumThreads)
griddim = (NumBlocks,NumBlocksCols)
print(griddim,blockdim)

In [None]:
#Part 3: Create a CUDA kernel with @cuda.jit decorator
@cuda.jit
def matmul(A,B,C):
    """Compute the matrix product of A and B into C, one thread per element of C.

    Each thread uses its 2D grid position as the (row, col) of the element of
    C it is responsible for and accumulates the dot product of that row of A
    with that column of B.
    """
    row, col = cuda.grid(2)
    # Threads that fall outside C (grid larger than the matrix) do nothing.
    if row >= C.shape[0] or col >= C.shape[1]:
        return
    acc = 0.0
    inner = A.shape[1]
    for k in range(inner):
        acc += A[row, k] * B[k, col]
    C[row, col] = acc

In [None]:
#Part 4: Call the kernel function and time it to get the execution time
%timeit matmul[griddim,blockdim](A,B,C)
print(C.dtype)

In [None]:
#Part 5: Create A,B,C manually on the GPU and copy data to the GPU arrays
# Transfer each host matrix once; the d_ prefix marks the device-side copy.
d_A, d_B, d_C = (cuda.to_device(host_matrix) for host_matrix in (A, B, C))

In [None]:
#Part 6: Call the kernel function and time it to get the execution time. Compare the execution times.
%timeit matmul[griddim,blockdim](d_A,d_B,d_C)

### Exercise: Incrementation of array elements
In the following exercise, each element of an array is incremented by one: `array[i] = array[i] + 1`.

In [None]:
import numpy as np
from numba import cuda

In [None]:
@cuda.jit
def increment(array):
    """Add 1, in place, to the array element at this thread's 1D grid index."""
    idx = cuda.grid(1)
    # Surplus threads past the end of the array have nothing to do.
    if idx >= array.size:
        return
    array[idx] += 1

In [None]:
# Host input: 12800 int32 ones.
data = np.ones(shape=12800, dtype=np.int32)
NumThreads = 16
# Ceiling division: smallest block count with NumBlocks * NumThreads >= data.size.
NumBlocks = -(-data.size // NumThreads)
print(NumBlocks, NumThreads)

In [None]:
# Run the kernel and measure execution time:
%timeit increment[NumBlocks,NumThreads](data)

In [None]:
# Take advatage of excplicit data management and copy an array to GPU before kernel execution. 
# Then measure the execution time again
d_data = cuda.to_device(data)
%timeit increment[NumBlocks,NumThreads](d_data)
print(data)

### Exercise: Reversal of array elements
Here an integer array is sent to the GPU, where the order of its elements is reversed into an output array, i.e. out[0]=in[N-1], out[1]=in[N-2], etc.

In [None]:
# Import required libs
import numpy as np
from numba import cuda, float32

In [None]:
#Part 3: Create a CUDA kernel with @cuda.jit decorator
# Kernel: reverse the array content using appropriate indices. 
# To do so you may need input and output indices. Implement kernel with possibility of multiple thread blocks.
@cuda.jit
def reverseArray(d_out,d_in):
    """Write the elements of d_in into d_out in reverse order.

    Each thread copies one element: d_out[N-1-i] = d_in[i], where i is the
    thread's global 1D index and N = d_in.size. d_out must hold at least N
    elements.
    """
    ind_in = cuda.grid(1)  # global index of the current thread
    # Mirror around the array length, not the number of launched threads:
    # when the grid is rounded up past d_in.size, mirroring around the grid
    # size would write past the end of d_out.
    ind_out = d_in.size - ind_in - 1
    if ind_in < d_in.size:
        d_out[ind_out] = d_in[ind_in]

In [None]:
# Define CUDA grid
# Problem size: 256 * 1000 array elements.
dim = 256 * 1000
NumThreads = 128
# Ceiling division: enough blocks of NumThreads threads to cover all dim elements.
NumBlocks = -(-dim // NumThreads)
print(NumBlocks, NumThreads)

In [None]:
#Part 1: Create arrays on CPU and GPU (if you want to)
# Input: 0, 1, ..., dim-1 as int32.
a = np.arange(dim, dtype=np.int32)
# Output buffer: zeros with the same shape and dtype as the input.
b = np.zeros_like(a)

In [None]:
#Part 2: Initialize host array
# Already initialized

In [None]:
#Part 4: Call the kernel function
%timeit reverseArray[NumBlocks,NumThreads](b,a)

In [None]:
#Part 5: Verify the result
# Check against the NumPy reference instead of relying on reading the print-out.
assert np.array_equal(b, a[::-1]), "b is not the reverse of a"
print(b)

In [None]:
#Part 5: Take advantage of explicit data management
d_a = cuda.to_device(a)
d_b = cuda.device_array_like(b)
%timeit reverseArray[NumBlocks,NumThreads](d_b,d_a)
b = d_b.copy_to_host()
print(b)