<a href="https://colab.research.google.com/github/mmmovania/CUDA_Spring_2024/blob/main/Week11/Conv1D_Tiled.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc4jupyter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-qlw1cedb
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-qlw1cedb
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4304 sha256=7599776fbb7d0c2e49b139df96ae3b4523eb280f3202cb2916501f636fcefcb5
  Stored in directory: /tmp/pip-ephem-wheel-cache-uthtw17u/wheels/f3/08/cc/e2b5b0e1c92df07dbb50a6f024a68ce090f5e7b2316b41756d
Successfully built NVCCPlugin
Installing collecte

In [None]:
%%cuda
#include <stdio.h>


// Reports a failed CUDA runtime call on stderr and hands the status back.
//   err: status code returned by the CUDA runtime call being checked.
//   msg: label naming the call site; it is embedded in the diagnostic.
// Returns err unchanged, so the caller decides how to react (this helper
// never aborts).
inline cudaError_t checkCudaErr(cudaError_t err, const char* msg) {
  if (err == cudaSuccess) {
    return err;
  }
  fprintf(stderr, "CUDA Runtime error at %s: %s\n", msg, cudaGetErrorString(err));
  return err;
}

// Number of taps in the convolution mask.
#define MASK_WIDTH 5
// Half the mask width (integer division). Despite the name, the tiled kernel
// uses this as the number of halo elements on each side of a block's tile.
// Parenthesized so the macro expands correctly inside larger expressions
// (e.g. `x % TILE_SIZE` would otherwise parse as `(x % MASK_WIDTH) / 2`).
#define TILE_SIZE (MASK_WIDTH/2)

// Convolution mask coefficients in constant memory; the host fills them with
// cudaMemcpyToSymbol before launching the kernels.
__constant__ float M[MASK_WIDTH];

// Direct 1D convolution: each thread computes one output element by reading
// its MASK_WIDTH input neighbours straight from N.
//   N:     input array of Width floats.
//   P:     output array of Width floats.
//   Width: number of elements in N and P.
// Launch layout: 1D grid of 1D blocks, one thread per output element. The grid
// may over-cover the data; surplus threads exit without touching memory.
// Input positions outside [0, Width) are treated as zero.
__global__ void convolution_1D_basic_kernel(float* N, float* P,  int Width)
{
  int i = blockIdx.x*blockDim.x+threadIdx.x;

  // Guard the grid tail: without it, threads past the end would write P[i]
  // out of bounds whenever Width is not a multiple of the block size.
  if (i >= Width)
  {
      return;
  }

  float Pvalue = 0.0f;
  int N_start_point = i-(MASK_WIDTH/2);

  for (int j=0; j<MASK_WIDTH; j++)
  {
      int src = N_start_point + j;
      // Skip taps that fall off either end of the input (zero contribution).
      if(src >= 0 && src < Width)
      {
        Pvalue += N[src]*M[j];
      }
  }
  P[i]=Pvalue;
}

// Tiled 1D convolution: each block first stages the inputs it needs (its own
// blockDim.x elements plus MASK_WIDTH/2 halo elements on each side) in shared
// memory, then every thread computes one output element from that tile.
//   N:     input array of Width floats.
//   P:     output array of Width floats.
//   Width: number of elements in N and P.
// Launch layout: 1D grid of 1D blocks with blockDim.x <= 1024, one thread per
// output element. The grid may over-cover the data.
// Shared memory: a static tile of (1024 + MASK_WIDTH - 1) floats.
// Input positions outside [0, Width) are treated as zero.
__global__ void convolution_1D_basic_kernel_tiled(float* N, float* P, int Width)
{
  // The tile must hold blockDim.x + MASK_WIDTH - 1 elements. blockDim.x is
  // only known at run time, so size the static array for the largest block
  // (1024 threads is the per-block limit on current CUDA devices).
  const int kMaxThreadsPerBlock = 1024;
  __shared__ float N_ds[kMaxThreadsPerBlock + MASK_WIDTH - 1];

  const int halo = MASK_WIDTH / 2;
  const int block_dim = (int)blockDim.x;
  const int tid = (int)threadIdx.x;
  const int block_start = (int)blockIdx.x * block_dim;
  const int i = block_start + tid;
  const int tile_elems = block_dim + MASK_WIDTH - 1;

  // Cooperative, strided load of the whole tile (left halo, interior, right
  // halo). Tile slot k holds input element block_start - halo + k; slots that
  // map outside the input are zero-filled. The stride makes this correct for
  // any block size, including blocks narrower than the halo.
  for (int k = tid; k < tile_elems; k += block_dim)
  {
      int src = block_start - halo + k;
      N_ds[k] = (src >= 0 && src < Width) ? N[src] : 0.0f;
  }

  // Every thread of the block must reach the barrier, so threads beyond Width
  // are not allowed to return before this point.
  __syncthreads();

  if (i < Width)
  {
      float Pvalue = 0.0f;
      for (int j=0; j<MASK_WIDTH; j++)
      {
         Pvalue += N_ds[tid + j]*M[j];
      }
      P[i]=Pvalue;
  }
}

// Returns true when a CUDA runtime call succeeded; failures are logged to
// stderr by checkCudaErr.
static bool cudaOk(cudaError_t err, const char* msg) {
    return checkCudaErr(err, msg) == cudaSuccess;
}

// Host driver: fills a 16-element input, runs the direct and the tiled 1D
// convolution kernels on it, times each with CUDA events and prints both
// result columns side by side.
// Returns 0 on success and 1 if any CUDA call or kernel launch failed.
int main() {
    float *a = NULL, *c = NULL, *tiled_c = NULL;
    const int N = 16;
    const int threadsPerBlock = 4;
    const size_t bytes = N * sizeof(float);

    float h_M[MASK_WIDTH]={3,4,5,4,3};

    // Allocate Unified Memory -- accessible from CPU or GPU.
    // Stop before touching the buffers if any allocation failed
    // (cudaFree accepts NULL, so the cleanup is safe for partial failures).
    if (!cudaOk(cudaMallocManaged(&a, bytes), "cudaMallocManaged a") ||
        !cudaOk(cudaMallocManaged(&c, bytes), "cudaMallocManaged c") ||
        !cudaOk(cudaMallocManaged(&tiled_c, bytes), "cudaMallocManaged tiled_c")) {
        cudaFree(a);
        cudaFree(c);
        cudaFree(tiled_c);
        return 1;
    }

    // fill in the memory with data
    for (int i=0; i<N; i++) {
        a[i] = i+1;
        c[i] = 0;
        tiled_c[i] = 0;
    }

    // Upload the mask coefficients to the __constant__ array M.
    bool ok = cudaOk(cudaMemcpyToSymbol(M, h_M, MASK_WIDTH*sizeof(float)),
                     "cudaMemcpyToSymbol M");

    // Ceil-division so a partial last block is still launched when N is not
    // a multiple of the block size.
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Prefetch the input to the GPU. Only the managed buffer is prefetched:
    // M is a __constant__ symbol, not managed memory, so it must not be
    // passed to cudaMemPrefetchAsync.
    int device = -1;
    ok = cudaOk(cudaGetDevice(&device), "cudaGetDevice") && ok;
    if (!cudaOk(cudaMemPrefetchAsync(a, bytes, device, NULL), "cudaMemPrefetchAsync a")) {
        // Prefetching is only a performance hint; clear the recorded error so
        // it is not misreported as a kernel launch failure below.
        cudaGetLastError();
    }

    // lets time the conv1D kernels
    cudaEvent_t start, stop;
    float gpu_elapsed_time_ms=0;
    float gpu_elapsed_time_tiled_ms=0;

    const bool haveStart = cudaOk(cudaEventCreate(&start), "cudaEventCreate start");
    const bool haveStop  = cudaOk(cudaEventCreate(&stop), "cudaEventCreate stop");
    ok = ok && haveStart && haveStop;

    if (ok) {
        // Direct kernel. The stop event is recorded right after the launch so
        // the measured interval covers the kernel only, not a host-side sync.
        ok = cudaOk(cudaEventRecord(start, 0), "cudaEventRecord start (basic)") && ok;
        convolution_1D_basic_kernel<<<blocksPerGrid,threadsPerBlock>>>(a, c, N);
        ok = cudaOk(cudaGetLastError(), "convolution_1D_basic_kernel launch") && ok;
        ok = cudaOk(cudaEventRecord(stop, 0), "cudaEventRecord stop (basic)") && ok;
        ok = cudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize stop (basic)") && ok;
        ok = cudaOk(cudaEventElapsedTime(&gpu_elapsed_time_ms, start, stop),
                    "cudaEventElapsedTime (basic)") && ok;
    }

    if (ok) {
        // Tiled kernel, timed the same way.
        ok = cudaOk(cudaEventRecord(start, 0), "cudaEventRecord start (tiled)") && ok;
        convolution_1D_basic_kernel_tiled<<<blocksPerGrid,threadsPerBlock>>>(a, tiled_c, N);
        ok = cudaOk(cudaGetLastError(), "convolution_1D_basic_kernel_tiled launch") && ok;
        ok = cudaOk(cudaEventRecord(stop, 0), "cudaEventRecord stop (tiled)") && ok;
        ok = cudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize stop (tiled)") && ok;
        ok = cudaOk(cudaEventElapsedTime(&gpu_elapsed_time_tiled_ms, start, stop),
                    "cudaEventElapsedTime (tiled)") && ok;
    }

    // Make sure all device work has finished (and surface any execution
    // error) before the host reads the managed result buffers.
    ok = cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize") && ok;

    if (ok) {
        //output the result
        printf("Results:\nConv1D: %3.3f msecs: \nConv1D Tiled: %3.3f msecs\n", gpu_elapsed_time_ms, gpu_elapsed_time_tiled_ms);
        printf(" p[i]  | conv[i]| conv_tiled[i]\n");
        printf("-------+--------+--------------\n");
        for(int i=0; i<N; ++i)
          printf(" p[%2d] | %6.2f | %6.2f\n", i, c[i], tiled_c[i]);

        printf("-------+--------+--------------\n");
    }

    // release the timing events
    if (haveStart) {
        ok = cudaOk(cudaEventDestroy(start), "cudaEventDestroy start") && ok;
    }
    if (haveStop) {
        ok = cudaOk(cudaEventDestroy(stop), "cudaEventDestroy stop") && ok;
    }

    // free memory on the gpu side
    ok = cudaOk( cudaFree( a ) , "cudaFree a") && ok;
    ok = cudaOk( cudaFree( c ) , "cudaFree c") && ok;
    ok = cudaOk( cudaFree( tiled_c ) , "cudaFree tiled_c") && ok;
    ok = cudaOk( cudaDeviceReset(), "cudaDeviceReset") && ok;

    return ok ? 0 : 1;
}

Results:
Conv1D: 0.030 msecs: 
Conv1D Tiled: 0.012 msecs
 p[i]  | conv[i]| conv_tiled[i]
-------+--------+--------------
 p[ 0] |  22.00 |  22.00
 p[ 1] |  38.00 |  38.00
 p[ 2] |  57.00 |  57.00
 p[ 3] |  76.00 |  76.00
 p[ 4] |  95.00 |  95.00
 p[ 5] | 114.00 | 114.00
 p[ 6] | 133.00 | 133.00
 p[ 7] | 152.00 | 152.00
 p[ 8] | 171.00 | 171.00
 p[ 9] | 190.00 | 190.00
 p[10] | 209.00 | 209.00
 p[11] | 228.00 | 228.00
 p[12] | 247.00 | 247.00
 p[13] | 266.00 | 266.00
 p[14] | 234.00 | 234.00
 p[15] | 182.00 | 182.00
-------+--------+--------------

