
**Authors of the homework:**
+ Mireia Izquierdo
+ Aleksandra Jamróz

In [None]:
#Uncomment next line if you are using Google Colaboratory
!pip install pycuda

In [2]:
import  numpy  as  np
import  pycuda.autoinit
from    pycuda.compiler import SourceModule
import  pycuda.driver as  drv
import  pycuda.gpuarray as  gpuarray
import  pycuda.tools as gtools
from numpy import linalg as la
from IPython import display
import time

# Guide to using tiled algorithms
When we want to use tiled memory based algorithms, we need to analize the following steps:

1) Tiled Memory Size: What information we will share across all the execution threads in a execution block

2) Assign the memory position to each execution thread with memory coalesence

3) Fill the Tiled Memory in parallel

4) Assign to some threads the extra data needed for the algorithms

5) Synchronize the filling memory execution

6) Each thread execute his individual task

7) Synchronize the execution task

8) End the execution

# TILED REDUCTION ARRAY
The algorithm of reduction (which calculates the sum of all elements in an array), works as follows:

![image.png](attachment:d6e8dd16-0624-4d98-af7c-5a8f36bd28c5.png)

* Tiled memory size: The tile will contain twice the times of the number of threads assigned
* Each thread in the execution block copies the data from global memory to the assigned memory place
* No extra data need (for this task)
* In each iteration
    * Add the possition assigned and the next available data (indexed by the stride)
    * This will work until the stride exceeds the block size

In [3]:
tiled_reduction_src = """
__global__ void tiled_reduction( float *v, float *c, int N){

  const int BLOCK_SIZE = 1024;

  __shared__ float partialSum[ 2 * BLOCK_SIZE ]; //The array dimensions MUST be constants

  unsigned int t = threadIdx.x;
  unsigned int start = 2 * blockIdx.x * blockDim.x;

  
  //fill the tile memory
  //each thread will fill the memory position start +t and start+blockDim.x+t
  //each consecutive execution thread (threadIdx.x) will access to coalesced memory in both steps
  
  if ( (start+t) < N) 
    partialSum[t] = v[start+t];
  else 
    partialSum[t] = 0;

  if ((start+blockDim.x+t) < N)
    partialSum[blockDim.x+t] = v[start+blockDim.x+t];
  else
    partialSum[blockDim.x+t] = 0;

  //Here we will wait until all execution threads fill te memory
  __syncthreads();

  for ( unsigned int stride = 1; stride <=blockDim.x; stride*=2 ) {
    __syncthreads();
    if ( t % stride == 0)
      partialSum[2*t]+=partialSum[2*t+stride];
  }
  c[blockIdx.x] = partialSum[0];
}
"""

Here we cannot automatically edit the source template to use the string % function to replace character chains in the string variable by other values. 

The problem when using modern strings formating (format method or f-strings) is the collision of the use of {} symbols, and in the previos source code, the presence of modulus operator (%) collides with the string subtitution.

In further codes we will use it to be able to substitute constants values for external variables.

In [4]:
#Set block size to 1024, the maximum
BLOCK_SIZE = 1024

In [5]:
#prints the code from above 
print(tiled_reduction_src)


__global__ void tiled_reduction( float *v, float *c, int N){

  const int BLOCK_SIZE = 1024;

  __shared__ float partialSum[ 2 * BLOCK_SIZE ]; //The array dimensions MUST be constants

  unsigned int t = threadIdx.x;
  unsigned int start = 2 * blockIdx.x * blockDim.x;

  
  //fill the tile memory
  //each thread will fill the memory position start +t and start+blockDim.x+t
  //each consecutive execution thread (threadIdx.x) will access to coalesced memory in both steps
  
  if ( (start+t) < N) 
    partialSum[t] = v[start+t];
  else 
    partialSum[t] = 0;

  if ((start+blockDim.x+t) < N)
    partialSum[blockDim.x+t] = v[start+blockDim.x+t];
  else
    partialSum[blockDim.x+t] = 0;

  //Here we will wait until all execution threads fill te memory
  __syncthreads();

  for ( unsigned int stride = 1; stride <=blockDim.x; stride*=2 ) {
    __syncthreads();
    if ( t % stride == 0)
      partialSum[2*t]+=partialSum[2*t+stride];
  }
  c[blockIdx.x] = partialSum[0];
}



In [6]:
#use SourceModule function from PyCUDA to compile raw inline CUDA C code into usable kernels that we can launch from Python
mod = SourceModule(tiled_reduction_src) 

In [7]:
#create array, datatype int32
datasize = np.int32(1000000)
#output of datasize = 1000000  

In [8]:
tiled_reduction = mod.get_function("tiled_reduction")

In [9]:
#create array filled with random floats sampled from a univariate standard normal distribution
data = np.random.randn(datasize).astype(np.float32)

#return a GPUArray that is an exact copy of the numpy array data, i.e. transfer the array to the GPU
data_gpu = gpuarray.to_gpu(data)

In [10]:
#define the size for the blocks
block_size = (int(BLOCK_SIZE),1,1)

In [11]:
#define the number of blocks by dividing the size of data by the previously defined blocksize
numblocks = int(np.ceil(datasize/BLOCK_SIZE))

In [12]:
#allocate an empty array of float32 values on the GPU
c_gpu = gpuarray.empty((numblocks,1),np.float32)

In [13]:
#define the grid size. Here: grid size of n*1
grid_size = (numblocks,1)

In [14]:
#data_gpu: GPU array to store the input image
#c_gpu: empty CPU array
#datasize: specifies how many pixels are in each tile of the grid
#grid/block: specify how large each tile/block is 
start_t = time.time()
tiled_reduction(data_gpu,
                c_gpu,
                datasize,
                grid = grid_size,
                block = block_size)
end_t = time.time()

In [15]:
c = c_gpu.get()

In [16]:
sum(data)
#output is different each time, for instance:
#1380.650449003166 ; then 2325.0763068947194

1882.9878161986908

In [17]:
np.sum(c)
#output is different each time, for instance:
#1380.6504 ; then 2325.0764

1882.9878

In [18]:
#compute & print the time difference between start & end time, i.e. time taken to process
time_diff = end_t - start_t
print(time_diff)
#output: approx 0.000210 ; meaning it takes very little time. It is efficient

0.0026144981384277344


# 1D Convolution

The algorithm to implement will calculate the convolution between 2 arrays.

The shortest array, called system mask, system response, represents the exit of a system to a special signal called Dirac's delta (signal of infinite height, but limited area under the curve).

The second array (the longest one) is the signal to be shaped by our system.

Based on this mathematical operation the filter works.

The Image Filter algorithms are 2 dimensional convolutions.<br>

![image.png](attachment:17f3b373-5760-4edb-b6b3-a233aef56fe3.png)

The problem with the tile algorithms is that we need extra data to calculate the correct convolution (halos)<br>

![image.png](attachment:e3bc106b-14eb-4935-8de3-549c7a225578.png)

The steps to implement the algorithm are:

* Tiled memory size: The tile will contains not only the block size elements, but also the system mask length - 1, to store the halos. Also, we need to store in memory the shared memory the system mask. 
* Each thread in the execution block copy the data from global memory to the assigned memory place, and few of them will fill the halos.<br>

![image.png](attachment:3a76556c-c652-49c4-821a-c5be890b13aa.png)

<br/>

![image.png](attachment:83d342d6-57dc-439a-8dc7-190a5dc8a10b.png)
<br/>

![image.png](attachment:5d4f3aa2-d27b-4639-94fc-e9b3ae706820.png)

<br/>
* Once the assigned memory positions are filled, we have to wait for the other tasks (\_\_syncthreads())
* Now, we have to calculate the convolution between the system mask and the assigned memory position in the signal vector.

You have to implement the algorithms in the following cell.



In [73]:
#example of a convolution operation of a 3x3 matrix with a 1x1 matrix
convolution_src = """

//declare variables needed for convolution operation
__global__ void convolution( float *v, 
                             float *c, 
                             float *conv,
                             int N,
                             int c_size){
                           
//declare a tile and mask memory to be shared across all threads
  __shared__ float tile[ HERE SHOULD INDICATES THE STRIDE LENGTH ];
  __shared__ float mask[ HERE SHOULD INDICATES THE MASK LENGTH ]

  unsigned int t = threadIdx.x; //assign the threadIdx.x value to t, then increment it by 1

  //fill the tile memory with tiles 
  __syncthreads();

  float accu = 0;

  __syncthreads();
  c[blockIdx.x] = accu; //store the value of blockIdx.x in the variable accu 
}
"""

In [74]:

convolution_tmp = """
__global__ void convolution( float *v, 
                             float *c, 
                             float *conv,
                             int N,
                             int c_size){

//declare variables title size and mask size 
//tile: array of float values to store the output of each thread in a block
//mask: array of float values to calculate which pixels should have their value set to 0
  __shared__ float tile[ %(TILE_SIZE)s ];
  __shared__ float mask[ %(MASK_SIZE)s ];

//declare variables t, start, offset
//t: integer representing the thread index
//start: integer representing the block index
//offset: specifies where each tile should be stored, based on its position relative to other tiles
  unsigned int t = threadIdx.x; // store the value of threadIdx.x in t (index of the current thread)
  unsigned int start = blockIdx.x * blockDim.x;
  unsigned int offset = %(MASK_SIZE)s / 2;

//check if there is enough space left on either side of t to calculate tiles for both blocks at the same time
//iterate through all tiles in the array and set each tile's value to either 0.0 or v[start + t]
  if (start + t < N) {
    tile[t + offset] = v[start + t];
    //if there isn't, then only one block can be calculated at a time 
  } else { 
    tile[t + offset] = 0.0;
  }


  // analyzing beginning of the vector
  if (t < offset)
    if (blockIdx.x > 0)
      tile[t] = v[start+t-2]; // if t==0, we will fetch memory position start-2, if t==1, we will fetch position start-1
    else
      tile[t] = 0.0;




  // analyzing the end of the vector
  if (t > (blockDim.x - offset - 1)){
    if ((t + start + offset) < N)
      tile[t + offset + offset] = v[start + t + offset];
    else
      tile[t + offset + offset] = 0.0; // will point to element in next block
  }

// filling the mask
  if ( t < %(MASK_SIZE)s)
    mask[t] = conv[t];



  //fill the tile memory
  //IN THIS SECTION SHOULD FILL THE TILE MEMORY AND MAYBE THE MASK
  // all variables in chip memory

  __syncthreads();
  // calculate the convolution
  if ((start + t) < N) {
    float accu = 0;
    c[start+t] = accu;
  }
  
  __syncthreads();
 // c[blockIdx.x] = accu;
}
"""

In [75]:
#create a filter mask on the CPU and then compiles it to a GPU-friendly data type
filtermask = np.array([1,1,3,1,1],dtype = np.float32)
#prints: [1. 1. 3. 1. 1.]
filtermask_gpu = gpuarray.to_gpu(filtermask) #converts filtermask to a GPU array object
#prints: [1. 1. 3. 1. 1.]
filtermask_size = np.int32(5)
#prints: 5

In [76]:
convolved_gpu = gpuarray.empty((datasize,1),np.float32) #create a new GPU array

In [77]:
#create a 256x256 tile map with 5 tiles per block
BLOCKSIZE = 256
MASK_SIZE = 5
TILE_SIZE = BLOCK_SIZE + MASK_SIZE - 1

In [78]:
#create a convolution kernel that is the same size as the input image
convolution_src = convolution_tmp%{
'TILE_SIZE': BLOCK_SIZE,
'MASK_SIZE': MASK_SIZE,
}

In [79]:
#create a convolution_src module, i.e. the first step in creating an image
mod2 = SourceModule(convolution_src)

#this line kept failing bc it stated that "identifier accu is undefined, altough it is defined in the previous code snippet"
#issue solved by commenting out  // c[blockIdx.x] = accu; at the bottom of convolution_tmp

In [80]:
convolution = mod2.get_function("convolution") #important to remember that it is mod2 now

In [81]:
n = np.int32(1000000)

#define block_size (this is NOT a definition for grid_size!!)
block_size = (BLOCK_SIZE,1,1)

In [66]:
#check the data type if necessary 
#type(BLOCK_SIZE)

int

In [82]:
#calculate the number of blocks & the gridsize
numblocks = np.int(np.ceil(n/BLOCK_SIZE))
grid_size = (numblocks,1)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


In [68]:
#check the data type if necessary 
#type(numblocks)

int

In [83]:
#compute the convolution operation on a GPU by computing the convolved_gpu matrix with data_gpu & then applying a filter to it (filtermask_gpu)
start_t = time.time()
convolution(data_gpu,
            convolved_gpu,
            filtermask_gpu,
            n,
            filtermask_size,
            grid=grid_size,
            block=block_size)
end_t = time.time

In [84]:
#take the data and filter it with the filter mask
local_convolved = np.convolve(data,filtermask,mode='full')

In [85]:
convolved = convolved_gpu.get()
print(convolved)

[[0.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]


# **Conclusion**
Tiling is helpful to partition the data into subsets (tiles) so that each tile can fit into the shared memory. We do this because when using CUDA device memories there is a tradeoff: large but slow global memory, and small but fast shared memory. We solve this by partitioning the data to reduce memory traffic. 

It is crucial to be aware of the data sizes, block sizes, grid sizes, and the element types in each of these arrays. If they are not correctly set, it will not compile correctly. 