# Introduction to GPU Programming with Python
## Solutions to notebook 4

### Exercise 1
Let's do the following exercise, where each element of an array is incremented: array[i] = array[i] + 1

In [None]:
import numpy
from numba import cuda

In [None]:
@cuda.jit
def kernel1(array):
    """Increment, in place, the one element of ``array`` handled by this thread.

    The element index is the thread's position in the 1-D launch grid, as
    returned by ``cuda.grid(1)``.
    """
    idx = cuda.grid(1)
    # Threads positioned at or beyond the end of the array have nothing to do.
    if idx >= array.size:
        return
    array[idx] += 1

In [None]:
# Host-side input: 12800 ones stored as 32-bit integers.
# The dtype is spelled numpy.int32 because the package is imported as `numpy`
# at this point; the `np` alias is not defined yet.
data = numpy.ones(12800, dtype=numpy.int32)
threads = 32
# Ceiling division: enough blocks of `threads` threads to cover every element.
blocks = (data.size + (threads - 1)) // threads
print(blocks)

In [None]:
# Run the kernel and measure execution time:
# kernel1 is launched with `blocks` blocks of `threads` threads each and is
# given the host NumPy array `data` directly.
# NOTE(review): %timeit executes the statement repeatedly and kernel1 modifies
# its argument in place, so the values printed below presumably reflect many
# increments rather than a single one - confirm on a live run.
%timeit kernel1[blocks,threads](data)
print(data)

In [None]:
# Take advantage of explicit data management and copy the array to the GPU before kernel execution.
# Then measure the execution time again; the kernel now receives the device array d_data
# instead of the host array.
d_data = cuda.to_device(data)
%timeit kernel1[blocks,threads](d_data)

### Exercise 2
Here an integer array is sent to the GPU, where the order of its elements is reversed, i.e. out[0]=in[N-1], out[1]=in[N-2], etc.

In [None]:
# Import required libs
import numpy as np
from numba import cuda, float32

Here we re-use the code from Ex. 2 and bring shared memory into play.

In [None]:
# Import required libs
import numpy as np
from numba import cuda, float32, int32

In [None]:
# Take this code and rewrite it in the next cell using shared memory
@cuda.jit
def reverseArrayBlock(d_out, d_in):
    """Copy ``d_in`` into ``d_out`` in reverse order, one element per thread.

    The thread with global index ``i`` stores ``d_in[i]`` at
    ``d_out[n - 1 - i]``, where ``n = d_in.size``.

    Args:
        d_out: 1-D output array; must hold at least ``d_in.size`` elements.
        d_in: 1-D input array.
    """
    # Global index of the current thread in the 1-D grid.
    ind_in = cuda.blockDim.x * cuda.blockIdx.x + cuda.threadIdx.x
    if ind_in < d_in.size:
        # Mirror around the array length, not the total thread count: when the
        # grid is larger than the array (size not a multiple of the block
        # size), mirroring around the thread count would write past the end
        # of d_out and to the wrong position.
        d_out[d_in.size - 1 - ind_in] = d_in[ind_in]

In [None]:
# Here is the code with shared memory
@cuda.jit
def reverseArrayBlock_shared(d_out, d_in):
    """Reverse ``d_in`` into ``d_out`` using a per-block shared-memory buffer.

    Each block loads its slice of ``d_in`` into shared memory in reversed
    order, synchronizes, and then every thread writes one buffered element to
    its mirrored position in ``d_out``.

    The kernel must be launched with dynamic shared memory of at least
    ``blockDim.x * itemsize`` bytes (fourth launch-configuration entry).

    Args:
        d_out: 1-D int32 output array; must hold at least ``d_in.size`` elements.
        d_in: 1-D int32 input array.
    """
    # Dynamic shared memory declaration: shape 0, real size given at launch.
    # Static alternative (size fixed at compile time, must be >= threads per block):
    #    s = cuda.shared.array(2000, dtype=int32)
    s = cuda.shared.array(0, dtype=int32)

    n = d_in.size
    tid = cuda.threadIdx.x
    block_len = cuda.blockDim.x
    block_start = block_len * cuda.blockIdx.x

    # Stage this block's slice in reversed order. Only the global read is
    # guarded, so threads past the end of the array still reach the barrier.
    ind_in = block_start + tid
    if ind_in < n:
        s[block_len - 1 - tid] = d_in[ind_in]
    cuda.syncthreads()

    # s[tid] holds d_in[src]; it was written by the thread that loaded src.
    src = block_start + (block_len - 1 - tid)
    if src < n:
        # For n == blockDim.x * gridDim.x this equals
        # blockDim.x * (gridDim.x - 1 - blockIdx.x) + threadIdx.x, and it
        # stays correct when n is not a multiple of the block size.
        d_out[n - 1 - src] = s[tid]

In [None]:
# Problem size and launch geometry for the array-reversal kernels.
NumThreadsPerBlock = 128
dim = 256 * 1000
# Ceiling division written as negated floor division: the number of blocks
# needed so that NumBlocks * NumThreadsPerBlock covers all `dim` elements.
NumBlocks = -(-dim // NumThreadsPerBlock)

In [None]:
# Host buffers for the reversal kernels: `a` holds 0, 1, ..., dim-1 and `b` is
# a zero-filled output buffer with the same shape and dtype.
a = np.arange(dim, dtype=np.int32)
b = np.zeros_like(a)
# Bytes of shared memory needed per block: one element of `a` per thread.
memSize = a.itemsize * NumThreadsPerBlock
print(memSize)

In [None]:
# Static shared memory call
# reverseArrayBlock_shared[NumBlocks,NumThreadsPerBlock](b,a)
# Dynamic shared memory call
# The launch configuration is [blocks, threads per block, stream, dynamic
# shared memory size in bytes]; memSize was computed above as
# NumThreadsPerBlock * itemsize, i.e. one array element per thread of a block.
# b is passed as the output (d_out) and a as the input (d_in).
reverseArrayBlock_shared[NumBlocks,NumThreadsPerBlock,0,memSize](b,a)

### Hands-on: Matrix multiplication with shared memory

![](../images/05-matmulshared.png)

In [None]:
import numpy as np
from numba import cuda, float32

In [None]:
#Part 3: Create a CUDA kernel with @cuda.jit decorator

# Controls threads per block and shared memory usage.
# The computation will be done on blocks of TPBxTPB elements.
TPB = 16


@cuda.jit
def fast_matmul(A, B, C):
    """Compute ``C = A @ B`` tile by tile, staging tiles in shared memory.

    Must be launched with 2-D blocks of ``(TPB, TPB)`` threads. Each thread
    produces one element of ``C``. The tile loop runs ``gridDim.x`` times, so
    the grid's x extent must be at least ``ceil(A.shape[1] / TPB)``; surplus
    iterations load zeros and do not change the result.

    Args:
        A: 2-D float32 array of shape (M, K).
        B: 2-D float32 array of shape (K, N).
        C: 2-D float32 output array of shape (M, N), written in place.
    """
    # Define an array in the shared memory
    # The size and type of the arrays must be known at compile time
    sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
    sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)

    # Define global and thread indices
    x, y = cuda.grid(2)  # x: column of C, y: row of C
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y

    # Define number of blocks per grid
    bpg = cuda.gridDim.x

    tmp = float32(0.)
    for i in range(bpg):
        # Preload data into shared memory
        # Entries outside A or B are zero-filled so that partial tiles
        # contribute nothing to the dot product.
        sA[ty, tx] = 0
        sB[ty, tx] = 0
        if y < A.shape[0] and (tx + i * TPB) < A.shape[1]:
            sA[ty, tx] = A[y, tx + i * TPB]
        if x < B.shape[1] and (ty + i * TPB) < B.shape[0]:
            sB[ty, tx] = B[ty + i * TPB, x]

        # Wait until all threads finish preloading
        cuda.syncthreads()

        # Computes partial product on the shared memory
        for j in range(TPB):
            tmp += sA[ty, j] * sB[j, tx]

        # Wait until all threads finish computing
        # (the tiles are overwritten at the start of the next iteration)
        cuda.syncthreads()

    # Put tmp into C matrix
    if y < C.shape[0] and x < C.shape[1]:
        C[y, x] = tmp

In [None]:
#Part 1: Create matrices A,B,C as numpy arrays (size 128x128,float32). Fill A and B with random numbers.

In [None]:
#Part 2: Calculate number of blocks and threads

In [None]:
#Part 4: Call the kernel function and time it to get the execution time