<a href="https://colab.research.google.com/github/mmmovania/CUDA_Spring_2024/blob/main/Week11/Conv1D_Modified.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc4jupyter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-y_7eregx
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-y_7eregx
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [None]:
%%cuda
#include <stdio.h>


// Reports a failed CUDA runtime call on stderr and hands the status back.
//
// err: status code produced by a CUDA runtime call.
// msg: caller-chosen label naming the call site in the diagnostic.
// Returns err unchanged so the call can be wrapped inline. This helper only
// logs; it never aborts, so callers decide how to react to a failure.
inline cudaError_t checkCudaErr(cudaError_t err, const char* msg) {
  // Fast path: nothing to report.
  if (err == cudaSuccess) {
    return err;
  }

  fprintf(stderr, "CUDA Runtime error at %s: %s\n", msg, cudaGetErrorString(err));
  return err;
}

#define MASK_WIDTH 5
__constant__ float M[MASK_WIDTH];

// 1D convolution of N with the mask held in the __constant__ array M.
//
// Each thread produces one output element P[i], where i is the thread's flat
// global index. The mask is centred on element i; taps that would read
// outside [0, Width) are skipped, i.e. the input is treated as zero-padded.
//
// N:          input array; elements [0, Width) are read.
// P:          output array; elements [0, Width) are written.
// Mask_Width: number of taps read from M. Must not exceed MASK_WIDTH (the
//             capacity of M); larger values would read past the end of M.
// Width:      number of elements in N and P.
//
// Launch layout: 1D grid of 1D blocks. The grid may contain more threads than
// Width; surplus threads return without touching memory.
__global__ void convolution_1D_basic_kernel(float* N, float* P, int Mask_Width, int Width)
{
  int i = blockIdx.x*blockDim.x+threadIdx.x;

  // Guard the grid tail: without this, threads with i >= Width would write
  // past the end of P whenever the grid over-covers the data.
  if (i >= Width)
  {
    return;
  }

  float Pvalue = 0.0f;
  int N_start_point = i-(Mask_Width/2);

  for (int j=0; j<Mask_Width; j++)
  {
      int idx = N_start_point+j;
      // Skip "ghost" elements that lie outside the input array.
      if (idx>=0 && idx<Width)
      {
        Pvalue += N[idx]*M[j];
      }
  }
  P[i]=Pvalue;
}


// Logs a failed CUDA call via checkCudaErr and converts the status to a bool.
static bool cudaCallOk(cudaError_t err, const char* what) {
    return checkCudaErr(err, what) == cudaSuccess;
}

// Host driver for the 1D convolution demo.
//
// Allocates a 16-element input and output in unified memory, uploads the
// 5-tap mask to constant memory, times one launch of
// convolution_1D_basic_kernel with CUDA events and prints the result.
//
// Returns 0 on success and 1 if any required CUDA call failed. Once a
// required step has failed, the remaining GPU work is skipped and only the
// cleanup at the end still runs.
int main() {
    const int N = 16;
    const int threadsPerBlock = 4;
    // Ceil-div so a partial tail block is still launched when N is not a
    // multiple of threadsPerBlock.
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    const float h_M[MASK_WIDTH] = {3, 4, 5, 4, 3};

    float *a = 0, *c = 0;
    cudaEvent_t start = 0, stop = 0;
    float gpu_elapsed_time_ms = 0;

    // Allocate Unified Memory -- accessible from CPU or GPU
    bool ok = cudaCallOk(cudaMallocManaged(&a, N*sizeof(float)), "cudaMallocManaged(a)");
    ok = ok && cudaCallOk(cudaMallocManaged(&c, N*sizeof(float)), "cudaMallocManaged(c)");

    // fill in the memory with data (only if both allocations succeeded)
    if (ok) {
        for (int i = 0; i < N; i++) {
            a[i] = (float)(i + 1);
            c[i] = 0.0f;
        }
    }

    // Upload the mask into the __constant__ array M.
    ok = ok && cudaCallOk(cudaMemcpyToSymbol(M, h_M, sizeof(h_M)), "cudaMemcpyToSymbol(M)");

    // Prefetch the input to the GPU. This is a best-effort optimisation: a
    // failure is logged but does not abort the run. Only the managed buffer
    // is prefetched; M is a __constant__ symbol, not managed memory, so it
    // is not a valid prefetch target.
    if (ok) {
        int device = -1;
        if (cudaCallOk(cudaGetDevice(&device), "cudaGetDevice")) {
            checkCudaErr(cudaMemPrefetchAsync(a, N*sizeof(float), device, NULL),
                         "cudaMemPrefetchAsync(a)");
        }
        // Clear any error left by the optional prefetch so it is not
        // misreported as a kernel-launch failure below.
        (void)cudaGetLastError();
    }

    // lets time the conv1D kernel
    ok = ok && cudaCallOk(cudaEventCreate(&start), "cudaEventCreate(start)");
    ok = ok && cudaCallOk(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // start to count execution time
    ok = ok && cudaCallOk(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    if (ok) {
        convolution_1D_basic_kernel<<<blocksPerGrid, threadsPerBlock>>>(a, c, MASK_WIDTH, N);
        // Launch-configuration errors are only visible through cudaGetLastError.
        ok = cudaCallOk(cudaGetLastError(), "convolution_1D_basic_kernel launch");
    }

    // time counting terminate: record the stop event right behind the kernel
    // so the interval does not include host-side synchronisation latency.
    ok = ok && cudaCallOk(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    ok = ok && cudaCallOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

    // Full device sync before the host reads managed memory; this also
    // surfaces any error raised while the kernel was executing.
    ok = ok && cudaCallOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    // compute time elapse on GPU computing
    ok = ok && cudaCallOk(cudaEventElapsedTime(&gpu_elapsed_time_ms, start, stop),
                          "cudaEventElapsedTime");

    // output the result
    if (ok) {
        printf("Conv1D Result calculated in %3.3f msecs: \n", gpu_elapsed_time_ms);
        for (int i = 0; i < N; ++i)
            printf(" p[%d] = %3.3f\n", i, c[i]);
    }

    // Release events (only those that were actually created).
    if (start) checkCudaErr(cudaEventDestroy(start), "cudaEventDestroy(start)");
    if (stop)  checkCudaErr(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

    // free memory on the gpu side (cudaFree on a null pointer is a no-op)
    checkCudaErr(cudaFree(a), "cudaFree(a)");
    checkCudaErr(cudaFree(c), "cudaFree(c)");
    checkCudaErr(cudaDeviceReset(), "cudaDeviceReset");

    return ok ? 0 : 1;
}

Conv1D Result calculated in 0.033 msecs: 
 p[0] = 22.000
 p[1] = 38.000
 p[2] = 57.000
 p[3] = 76.000
 p[4] = 95.000
 p[5] = 114.000
 p[6] = 133.000
 p[7] = 152.000
 p[8] = 171.000
 p[9] = 190.000
 p[10] = 209.000
 p[11] = 228.000
 p[12] = 247.000
 p[13] = 266.000
 p[14] = 234.000
 p[15] = 182.000

