# Introduction to GPU Programming with Python
## Solutions to notebook 4

### Exercise 1
Let's do the following exercise, in which each element of an array is incremented: `array[i] = array[i] + 1`

In [None]:
import numpy as np
from numba import cuda

In [None]:
@cuda.jit
def kernel1(array):
    """Increment every element of the 1-D ``array`` in place, one thread per element."""
    idx = cuda.grid(1)
    if idx >= array.size:
        # Surplus threads of the last block have no element to update
        return
    array[idx] += 1

In [None]:
# Host input: 12800 ones stored as 32-bit integers.
# (The module is imported as `np`; the bare name `numpy` is not defined.)
data = np.ones(12800, dtype=np.int32)
# Threads per block
threads = 32
# Ceiling division: smallest number of blocks with blocks * threads >= data.size
blocks = (data.size + (threads - 1)) // threads
print(blocks)

In [None]:
# Run the kernel and measure execution time.
# `data` is a host (NumPy) array, so each launch also pays for the transfer
# to the device and back; that cost is part of the measured time.
# NOTE: %timeit executes the statement many times, so every element of `data`
# is incremented once per execution and the printed values are far larger than 2.
%timeit kernel1[blocks,threads](data)
print(data)

In [None]:
# Take advantage of explicit data management: copy the array to the GPU once, before kernel execution.
# Then measure the execution time again; the launches now operate on the device copy `d_data`.
d_data = cuda.to_device(data)
%timeit kernel1[blocks,threads](d_data)

### Exercise 2
Here an integer array is sent to the GPU, where the order of its elements is reversed, i.e. out[0]=in[N-1], out[1]=in[N-2], etc.

In [None]:
# Import required libs
import numpy as np
from numba import cuda, float32, int32

In [None]:
#Part 3: Create a CUDA kernel with @cuda.jit decorator
# Kernel: reverse the array content using appropriate indices. 
# To do so you may need input and output indices. Implement kernel with possibility of multiple thread blocks.
@cuda.jit
def reverseArrayBlock(d_out,d_in):
    """Copy ``d_in`` into ``d_out`` in reverse order, one element per thread.

    Works with any number of thread blocks; threads beyond the end of
    ``d_in`` do nothing. Assumes ``d_out`` has at least as many elements
    as ``d_in``.
    """
    # Global index of the current thread
    ind_in = cuda.blockDim.x * cuda.blockIdx.x + cuda.threadIdx.x
    if ind_in < d_in.size:
        # Mirror around the array length rather than the total thread count:
        # the grid may be larger than the array when the size is not a
        # multiple of the block size.
        ind_out = d_in.size - ind_in - 1
        d_out[ind_out] = d_in[ind_in]

In [None]:
# Define CUDA grid
# Problem size and launch configuration
dim = 256 * 1000            # number of array elements
NumThreadsPerBlock = 128    # threads in each block
# Round up so that the blocks together provide at least `dim` threads
NumBlocks = -(-dim // NumThreadsPerBlock)
print(NumBlocks)

In [None]:
#Part 1: Create arrays on CPU and GPU (if you want to)
# Host input 0, 1, ..., dim-1 and a zero-filled output buffer of the same shape and dtype
a = np.arange(dim, dtype=np.int32)
b = np.zeros(a.shape, dtype=a.dtype)

In [None]:
#Part 2: Initialize host array
# Already initialized

In [None]:
#Part 4: Call the kernel function
# Output array first, input second, matching reverseArrayBlock(d_out, d_in).
# Both are host arrays here, so the timing includes the host<->device transfers.
%timeit reverseArrayBlock[NumBlocks,NumThreadsPerBlock](b,a)

In [None]:
#Part 5: Verify the result
# `a` holds 0, 1, ..., dim-1, so the reversed output should read dim-1, dim-2, ..., 1, 0
print(b)

In [None]:
#Part 6: Take advantage of explicit data management
# Copy the input to the GPU once and allocate the output directly on the device
d_a = cuda.to_device(a)
d_b = cuda.device_array_like(b)
# The launches now work on device arrays only
%timeit reverseArrayBlock[NumBlocks,NumThreadsPerBlock](d_b,d_a)
# Bring the result back to the host explicitly
b = d_b.copy_to_host()
print(b)

### Hands-on: Matrix multiplication on GPU (with global memory) 

In [None]:
def matmul(A,B,C):
    """Accumulate the matrix product of A and B into C (plain Python reference).

    C is updated in place with ``C[i][j] += A[i][k] * B[k][j]``; whatever C
    already contains is kept and added to. Nothing is returned.
    """
    # walk over the rows of A
    for i, a_row in enumerate(A):
        # walk over the columns of B
        for j in range(len(B[0])):
            # walk over the rows of B (the shared inner dimension)
            for k, b_row in enumerate(B):
                C[i][j] += a_row[k] * b_row[j]
  

In [None]:
#Part 1: Create matrices A,B,C as numpy arrays. Fill A and B with random numbers.
# Two random 128x128 inputs (A is drawn first, then B) and a zero-filled result matrix
A, B = (np.random.rand(128, 128) for _ in range(2))
C = np.zeros((128, 128))

In [None]:
#Part 2: Calculate number of blocks and threads
# Threads per block along each dimension (32 x 32 = 1024 threads per block)
threads = 32
# Ceiling division by the block edge: round up by (threads - 1), not by the
# matrix size, so that exactly enough blocks are launched to cover C.
blocks = (C.shape[0] + (threads - 1)) // threads
blockdim = (threads, threads)
# The same block count is used on both axes; this assumes C is square
griddim = (blocks, blocks)
print(blocks, threads)

In [None]:
#Part 3: Create a CUDA kernel with @cuda.jit decorator
@cuda.jit
def matmul(A,B,C):
    """Compute C = A @ B from global memory, one output element per thread.

    Expects a 2-D launch; thread (row, col) = cuda.grid(2) writes C[row, col].
    """
    row, col = cuda.grid(2)
    if row >= C.shape[0] or col >= C.shape[1]:
        # This thread lies outside of C: nothing to compute
        return
    acc = 0.0
    for k in range(A.shape[1]):
        acc += A[row, k] * B[k, col]
    C[row, col] = acc

In [None]:
#Part 4: Call the kernel function and time it to get the execution time
# A, B and C are host arrays here, so every launch includes the host<->device transfers
%timeit matmul[griddim,blockdim](A,B,C)

In [None]:
#Part 5: Create A,B,C manually on the GPU and copy data to the GPU arrays
# cuda.to_device both allocates the device array and copies the host data into it
d_A=cuda.to_device(A)
d_B=cuda.to_device(B)
d_C=cuda.to_device(C)

In [None]:
#Part 6: Call the kernel function and time it to get the execution time. Compare the execution times.
%timeit matmul[blocks,threads](d_A,d_B,d_C)

### Exercise 3

Here we reuse the code from Exercise 2 and bring shared memory into play.

In [None]:
# Take this code and re-write it in the next cell by using a shared memory 
@cuda.jit
def reverseArrayBlock(d_out,d_in):
    """Copy ``d_in`` into ``d_out`` in reverse order, one element per thread.

    Works with any number of thread blocks; threads beyond the end of
    ``d_in`` do nothing. Assumes ``d_out`` has at least as many elements
    as ``d_in``.
    """
    # Global index of the current thread
    ind_in = cuda.blockDim.x * cuda.blockIdx.x + cuda.threadIdx.x
    if ind_in < d_in.size:
        # Mirror around the array length rather than the total thread count:
        # the grid may be larger than the array when the size is not a
        # multiple of the block size.
        ind_out = d_in.size - ind_in - 1
        d_out[ind_out] = d_in[ind_in]

In [None]:
# Part 2: Here is the code with shared memory
@cuda.jit
def reverseArrayBlock_shared(d_out,d_in):
    """Reverse ``d_in`` into ``d_out`` using a per-block shared-memory buffer.

    Each block reverses its own chunk inside shared memory and then writes it
    to the mirrored position of ``d_out``. With the dynamic declaration below,
    the launch must pass the shared-memory size in bytes (at least
    ``blockDim.x * itemsize``) as the fourth launch parameter.
    Assumes ``d_out`` has at least as many elements as ``d_in``.
    """
    # Declare/allocate array s in shared memory
    # Static shared memory declaration
    # s = cuda.shared.array(2000, dtype=int32) 
    # Dynamic shared memory declaration
    s = cuda.shared.array(0, dtype=int32)
    tid = cuda.threadIdx.x
    block_size = cuda.blockDim.x
    # Global index of the first element handled by this block
    block_start = block_size * cuda.blockIdx.x
    # Create input index (global index of the current thread)
    ind_in = block_start + tid
    # Populate array s from array d_in, reversed within the block.
    # The guard avoids reading past the end of d_in in a partial last block.
    if ind_in < d_in.size:
        s[block_size - tid - 1] = d_in[ind_in]
    # Synchronize threads in each block. Every thread reaches this barrier
    # because no thread returns early.
    cuda.syncthreads()
    # s[tid] now holds the element that came from this global index
    ind_src = block_start + block_size - tid - 1
    if ind_src < d_in.size:
        # Populate output array d_out from shared array s. The index is
        # mirrored around the array length, so it stays correct when the size
        # is not a multiple of the block size.
        d_out[d_in.size - 1 - ind_src] = s[tid]

In [None]:
# Same problem size and launch configuration as in Exercise 2
dim = 256 * 1000
NumThreadsPerBlock = 128
# Number of blocks, rounded up so that all `dim` elements are covered
NumBlocks = (dim - 1) // NumThreadsPerBlock + 1

In [None]:
#Part 1: Create arrays on CPU and GPU (if you want to)
# Host input 0, 1, ..., dim-1 and a zero-filled output buffer of the same shape and dtype
a = np.arange(dim, dtype=np.int32)
b = np.zeros(a.shape, dtype=a.dtype)
# Bytes of shared memory needed per block: one array element per thread
memSize = a.itemsize * NumThreadsPerBlock
print(memSize)

In [None]:
#Part 3: Call the kernel
# Static shared memory call
# NOTE(review): this launch passes no dynamic shared-memory size, so it only
# matches a kernel whose `s` array is declared with a fixed size (the
# commented-out static declaration). With the dynamic declaration
# `cuda.shared.array(0, ...)`, use the Part 4 launch below instead.
reverseArrayBlock_shared[NumBlocks,NumThreadsPerBlock](b,a)

In [None]:
#Part 4: Modify the kernel as well as the call from the host by changing static shared memory declaration to dynamic
# Launch configuration: [grid size, block size, stream, dynamic shared memory in bytes].
# 0 selects the default stream; memSize is NumThreadsPerBlock * itemsize, i.e. one element per thread.
reverseArrayBlock_shared[NumBlocks,NumThreadsPerBlock,0,memSize](b,a)

### Hands-on: Matrix multiplication with shared memory

![](../images/05-matmulshared.png)

In [None]:
import numpy as np
from numba import cuda, float32

In [None]:
#Part 3: Create a CUDA kernel with @cuda.jit decorator

# Controls threads per block and shared memory usage.
# The computation will be done on blocks of TPBxTPB elements.
TPB = 16
@cuda.jit
def fast_matmul(A, B, C):
    """Compute C = A @ B tile by tile, staging TPB x TPB tiles in shared memory.

    Must be launched with 2-D blocks of exactly (TPB, TPB) threads and a grid
    that covers C; thread (x, y) = cuda.grid(2) produces C[x, y]. Matrix
    dimensions do not have to be multiples of TPB: tiles that stick out of
    A or B are padded with zeros.
    """
    # Define an array in the shared memory
    # The size and type of the arrays must be known at compile time
    sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
    sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)

    # Define global and thread indices
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y

    # Number of TPB-wide tiles needed to cover the inner dimension. It is
    # derived from A rather than from the grid size, so non-square inputs work.
    num_tiles = (A.shape[1] + TPB - 1) // TPB

    # No thread returns early: every thread of the block has to take part in
    # the loads and reach each cuda.syncthreads() barrier.
    tmp = 0.
    for i in range(num_tiles):
        a_col = ty + i * TPB
        b_row = tx + i * TPB

        # Preload data into shared memory, zero-padding out-of-range entries
        if x < A.shape[0] and a_col < A.shape[1]:
            sA[tx, ty] = A[x, a_col]
        else:
            sA[tx, ty] = 0.
        if b_row < B.shape[0] and y < B.shape[1]:
            sB[tx, ty] = B[b_row, y]
        else:
            sB[tx, ty] = 0.

        # Wait until all threads finish preloading
        cuda.syncthreads()

        # Computes partial product on the shared memory
        for j in range(TPB):
            tmp += sA[tx, j] * sB[j, ty]

        # Wait until all threads finish computing
        cuda.syncthreads()

    # Put tmp into C matrix, but only for threads that map to a valid element
    if x < C.shape[0] and y < C.shape[1]:
        C[x, y] = tmp

In [None]:
#Part 1: Create matrices A,B,C as numpy arrays (size 128x128,float32). Fill A and B with random numbers.
# Matrix edge length
N = 128
# Random single-precision inputs; B copies the shape and dtype of A
A = np.random.rand(N, N).astype(np.float32)
B = np.random.rand(*A.shape).astype(A.dtype)
# Zero-filled single-precision result matrix
C = np.zeros(A.shape, dtype=A.dtype)

In [None]:
#Part 2: Calculate number of blocks and threads
# Ceiling division so the grid still covers the whole matrix when N is not a
# multiple of TPB (identical to N // TPB when it is).
blocks_per_axis = (N + TPB - 1) // TPB
griddim = (blocks_per_axis, blocks_per_axis)
# The kernel's shared-memory tiles require blocks of exactly TPB x TPB threads
blockdim = (TPB, TPB)

In [None]:
#Part 4: Call the kernel function and time it to get the execution time
# A, B and C are host arrays here, so the measured time includes the host<->device transfers
%timeit fast_matmul[griddim,blockdim](A, B, C)