# Introduction to GPU Programming with Python
## Solutions to notebook 5

### Main example: Matrix multiplication with shared memory

![](../images/05-matmulshared.png)

In [None]:
import numpy as np
from numba import cuda, float32

In [None]:
#Part 3: Create a CUDA kernel with @cuda.jit decorator

# Controls threads per block and shared memory usage.
# The computation will be done on blocks of TPBxTPB elements.
TPB = 32
@cuda.jit
def fast_matmul(A, B, C):
    """Compute C = A @ B one TPBxTPB tile at a time using shared memory.

    Must be launched with (TPB, TPB) thread blocks. The thread at global
    position (x, y) accumulates the single element C[x, y]. Threads that
    fall outside C still take part in loading tiles and in every barrier;
    they simply skip the final store.

    Parameters
    ----------
    A : 2D array, read as A[row, k]
    B : 2D array, read as B[k, col]
    C : 2D array, written as C[row, col]
    """
    # Define an array in the shared memory
    # The size and type of the arrays must be known at compile time
    sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
    sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)

    # Define global and thread indices
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y

    # Number of TPB-wide tiles needed to span the inner (summed) dimension.
    # Derived from A rather than from the grid size so the dot product is
    # complete for any launch configuration that covers C.
    inner = A.shape[1]
    num_tiles = (inner + TPB - 1) // TPB

    tmp = 0.
    for i in range(num_tiles):
        # Preload data into shared memory; positions outside A or B are
        # filled with zeros so they do not contribute to the sum.
        a_col = ty + i * TPB
        b_row = tx + i * TPB
        if x < A.shape[0] and a_col < inner:
            sA[tx, ty] = A[x, a_col]
        else:
            sA[tx, ty] = 0.
        if b_row < B.shape[0] and y < B.shape[1]:
            sB[tx, ty] = B[b_row, y]
        else:
            sB[tx, ty] = 0.

        # Wait until all threads finish preloading
        cuda.syncthreads()

        # Computes partial product on the shared memory
        for j in range(TPB):
            tmp += sA[tx, j] * sB[j, ty]

        # Wait until all threads finish computing
        cuda.syncthreads()

    # Put tmp into C matrix, but only for threads that map to a valid
    # element. No thread returns early, so every thread of the block
    # reaches each syncthreads() above.
    if x < C.shape[0] and y < C.shape[1]:
        C[x, y] = tmp

In [None]:
#Part 1: Create matrices A,B,C as numpy arrays (size 128x128,float32). Fill A and B with random numbers.
N = 128

# Input matrices: random values converted to single precision
A = np.random.rand(N, N).astype(np.float32)
B = np.random.rand(N, N).astype(np.float32)

# Output matrix: zero-initialised, single precision
C = np.zeros((N, N), dtype=np.float32)

In [None]:
#Part 2: Calculate number of blocks and threads
# One thread per element of a TPBxTPB tile
NumThreads = TPB
# Round up so that a partial tile at the matrix edge still gets a block when
# N is not a multiple of NumThreads (same result as N//NumThreads when it is).
# Threads of that last block that fall past the matrix edge must be
# bounds-checked inside the kernel.
NumBlocks = (N + (NumThreads - 1)) // NumThreads
griddim = (NumBlocks,NumBlocks)
blockdim = (NumThreads,NumThreads)
print(blockdim)
print(griddim)

In [None]:
#Part 4: Call the kernel function and time it to get the execution time
# %timeit is an IPython magic, so this cell only runs inside IPython/Jupyter.
# A, B and C are the host NumPy arrays created in Part 1; the measured time
# presumably also includes the host<->device copies made for each call --
# TODO confirm against the Numba documentation.
%timeit fast_matmul[griddim,blockdim](A, B, C)

### Exercise: Array reversal with shared memory

Here we reuse the code from the [previous notebook](../04-numba-cuda.ipynb) and bring shared memory into play.

In [None]:
import numpy as np
from numba import cuda,int32

In [None]:
# Part 2: Here is the code with shared memory
@cuda.jit
def reverseArrayBlock_shared(d_out,d_in):
    """Write the elements of d_in into d_out in reverse order.

    Each block loads its slice of d_in into shared memory in reversed
    order, then every thread writes one element to the mirrored position
    in d_out. Works for any input length, including lengths that are not
    a multiple of the block size.

    Parameters
    ----------
    d_in : 1D array to reverse.
    d_out : 1D array receiving the result; indexed up to d_in.size - 1.

    The static shared array below holds 2000 elements and is indexed up to
    blockDim.x - 1, so the block size must not exceed 2000.
    """
    # Declare/allocate array s in shared memory
    # Static shared memory declaration
    s = cuda.shared.array(2000, dtype=int32)
    # Dynamic shared memory declaration
    #s = cuda.shared.array(0, dtype=int32)
    n = d_in.size
    tid = cuda.threadIdx.x
    block_size = cuda.blockDim.x
    block_start = block_size * cuda.blockIdx.x
    # Create input index
    ind_in = block_start + tid  # Index of the current thread
    # Populate array s from array d_in, reversed within the block.
    # Threads past the end of d_in load nothing.
    if ind_in < n:
        s[block_size - tid - 1] = d_in[ind_in]
    # Synchronize threads in each block (reached by every thread)
    cuda.syncthreads()
    # s[tid] was loaded by thread (block_size - 1 - tid) of this block, i.e.
    # it came from this source index in d_in:
    ind_src = block_start + (block_size - 1 - tid)
    if ind_src < n:
        # Populate output array d_out from shared array s: the element from
        # position ind_src goes to the mirrored position n - 1 - ind_src.
        d_out[n - 1 - ind_src] = s[tid]

In [None]:
# Problem size and launch geometry for the array-reversal kernel
dim = 256 * 1000
NumThreads = 128
# Ceiling division: enough blocks to give every element its own thread
NumBlocks = -(-dim // NumThreads)
print(NumBlocks, NumThreads)

In [None]:
#Part 1: Create arrays on CPU and GPU (if you want to)
# Input holds 0, 1, ..., dim-1 as int32
a = np.arange(dim, dtype=np.int32)
# Output buffer: zeros with the same shape and dtype as the input
b = np.zeros_like(a)
# Size in bytes of NumThreads elements of a's dtype; presumably the per-block
# size for the dynamic shared-memory variant of the kernel -- TODO confirm
memSize = NumThreads * a.itemsize
print(memSize)

In [None]:
#Part 3: Call the kernel
# Static shared memory declaration: the launch configuration carries only
# the number of blocks and the number of threads per block.
launch_config = (NumBlocks, NumThreads)
# b receives the result, a is the input
reverseArrayBlock_shared[launch_config](b, a)
print(b)