<a href="https://colab.research.google.com/github/mmmovania/CUDA_Spring_2024/blob/main/Week12/PrefixSum_Correct.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc4jupyter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-s0qj0lci
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-s0qj0lci
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4304 sha256=3ea7c77559fe49479e98ea4a701933d8dfd4f3770b1f9b544cf34923acf1ef88
  Stored in directory: /tmp/pip-ephem-wheel-cache-lrdvn9ks/wheels/f3/08/cc/e2b5b0e1c92df07dbb50a6f024a68ce090f5e7b2316b41756d
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin

In [None]:
%%cuda
#include <stdio.h>

// Capacity (in ints) of the shared-memory buffer the scan kernel declares.
// Every thread of a block indexes that buffer with threadIdx.x, so this value
// must be >= the number of threads per block. main() launches a single block
// of 16 threads; the previous value of 4 made threads 4..15 write past the end
// of the shared array.
#define SECTION_SIZE 16

// Reports a failed CUDA runtime call on stderr, tagged with `msg`, and hands
// the status code back unchanged so the caller can still branch on it.
// A successful status produces no output.
inline cudaError_t checkCudaErr(cudaError_t err, const char* msg) {
  if (err == cudaSuccess) {
    return err;
  }
  fprintf(stderr, "CUDA Runtime error at %s: %s\n", msg, cudaGetErrorString(err));
  return err;
}

// CPU reference: inclusive prefix sum.
// Writes y[i] = x[0] + x[1] + ... + x[i] for every i in [0, N).
//   x : input array with at least N elements
//   y : output array with at least N elements (may alias x)
//   N : number of elements; N <= 0 is a no-op, neither array is touched
void sequential_scan(int* x, int* y, int N)
{
  // Guard the empty case: the first element must not be read or written
  // when there are no elements at all.
  if (N <= 0)
  {
    return;
  }

  int running = 0;
  for (int i = 0; i < N; i++)
  {
    running += x[i];
    y[i] = running;
  }
}

// Work-inefficient inclusive scan of one section per thread block.
//
// Launch layout: 1D grid, 1D blocks, one thread per element.
// Shared memory: a static buffer of SECTION_SIZE ints per block.
// Preconditions:
//   - blockDim.x must not exceed SECTION_SIZE (each thread owns XY[threadIdx.x]).
//   - X and Y must each hold at least N ints.
// Each block scans only its own section; there is no carry between blocks, so
// Y is the prefix sum of the whole array only when a single block covers all
// N elements.
__global__ void work_inefficient_inc_scan_kernel(int *X, int *Y, int N) {
  __shared__ int XY[SECTION_SIZE];
  const unsigned int tid = threadIdx.x;
  const int i = blockIdx.x*blockDim.x + tid;

  // Threads past the end of the input load 0 (the additive identity) so the
  // shared buffer never holds uninitialised values.
  XY[tid] = (i < N) ? X[i] : 0;

  // Iterative scan on XY. The loop bound depends only on blockDim.x, so every
  // thread runs the same number of iterations and reaches every barrier;
  // a bound based on threadIdx.x would make __syncthreads() divergent.
  for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
  {
    // Wait until all writes from the previous step (or the load) are visible.
    __syncthreads();

    int partial = 0;
    if (tid >= stride) {
      partial = XY[tid] + XY[tid - stride];
    }

    // Wait until every thread has finished reading before anyone overwrites
    // its slot; otherwise a neighbour could read an already-updated value.
    __syncthreads();

    if (tid >= stride) {
      XY[tid] = partial;
    }
  }

  // Each thread reads back only the slot it wrote itself, so no further
  // barrier is needed. Guard the store for partially filled blocks.
  if (i < N) {
    Y[i] = XY[tid];
  }
}

// Host driver for the prefix-sum demo.
//
// Steps:
//   1. Allocate X (input) and Y (GPU output) in unified memory and fill X
//      with 1..N.
//   2. Run and time the CPU reference scan into a host-side array.
//   3. Prefetch X and Y to the current device, then launch the scan kernel as
//      one block of N threads and time it with CUDA events.
//   4. Print the timings and the X/Y table, then compare the GPU output with
//      the CPU reference.
//
// The kernel is launched with a single block of N threads, so the kernel's
// shared buffer must be able to hold N elements.
//
// Returns 0 on success, 1 if allocation, kernel execution or verification fails.
int main()
{
    const int N = 16;
    const size_t bytes = N * sizeof(int);
    int *X = NULL;
    int *Y = NULL;

    // Allocate Unified Memory -- accessible from CPU or GPU.
    // Bail out on failure instead of dereferencing a null pointer below.
    if (checkCudaErr(cudaMallocManaged(&X, bytes), "cudaMallocManaged1") != cudaSuccess)
    {
        return 1;
    }
    if (checkCudaErr(cudaMallocManaged(&Y, bytes), "cudaMallocManaged2") != cudaSuccess)
    {
        checkCudaErr(cudaFree(X), "cudaFree1");
        return 1;
    }

    // fill in the memory with data
    for (int i = 0; i < N; i++)
    {
        X[i] = i + 1;
        Y[i] = 0;
    }

    cudaEvent_t start, stop;
    float gpu_elapsed_time_ms = 0, cpu_elapsed_time_ms = 0;
    checkCudaErr(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCudaErr(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // Time the CPU code. The reference goes into its own host array so it is
    // still available for verifying the GPU result afterwards.
    int expected[N];
    checkCudaErr(cudaEventRecord(start, 0), "cudaEventRecord(cpu start)");
    sequential_scan(X, expected, N);
    checkCudaErr(cudaEventRecord(stop, 0), "cudaEventRecord(cpu stop)");
    checkCudaErr(cudaEventSynchronize(stop), "cudaEventSynchronize(cpu stop)");
    checkCudaErr(cudaEventElapsedTime(&cpu_elapsed_time_ms, start, stop),
                 "cudaEventElapsedTime(cpu)");

    // Prefetch the data to the GPU only now that the CPU is done with it;
    // prefetching before the CPU pass would just migrate the pages back.
    // Prefetching is a performance hint, so a failure is reported but not fatal.
    int device = -1;
    if (checkCudaErr(cudaGetDevice(&device), "cudaGetDevice") == cudaSuccess)
    {
        const cudaError_t prefetchX =
            checkCudaErr(cudaMemPrefetchAsync(X, bytes, device, NULL), "cudaMemPrefetchAsync(X)");
        const cudaError_t prefetchY =
            checkCudaErr(cudaMemPrefetchAsync(Y, bytes, device, NULL), "cudaMemPrefetchAsync(Y)");
        if (prefetchX != cudaSuccess || prefetchY != cudaSuccess)
        {
            // Clear the recorded error so it is not mistaken for a launch failure.
            cudaGetLastError();
        }
    }

    // Time the GPU code. The stop event is recorded right after the launch so
    // the measurement covers the kernel rather than host-side synchronisation.
    checkCudaErr(cudaEventRecord(start, 0), "cudaEventRecord(gpu start)");
    work_inefficient_inc_scan_kernel<<<1, N>>>(X, Y, N);
    const cudaError_t launchErr = checkCudaErr(cudaGetLastError(), "kernel launch");
    checkCudaErr(cudaEventRecord(stop, 0), "cudaEventRecord(gpu stop)");
    checkCudaErr(cudaEventSynchronize(stop), "cudaEventSynchronize(gpu stop)");

    // Surfaces any asynchronous execution error and guarantees the kernel has
    // finished before the host reads Y.
    const cudaError_t syncErr = checkCudaErr(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    int status = 0;
    if (launchErr != cudaSuccess || syncErr != cudaSuccess)
    {
        // The device results are not trustworthy; skip reading them.
        status = 1;
    }
    else
    {
        // compute time elapsed on GPU computing
        checkCudaErr(cudaEventElapsedTime(&gpu_elapsed_time_ms, start, stop),
                     "cudaEventElapsedTime(gpu)");

        // output the result
        puts("Prefix Scan Results:");
        printf("CPU Time: %3.3f msecs, GPU Time: %3.3f\n", cpu_elapsed_time_ms, gpu_elapsed_time_ms);
        puts("X[i]\t| Y[i]");
        puts("--------+-------");

        for (int i = 0; i < N; ++i)
        {
            printf("%3d\t| %3d\n", X[i], Y[i]);
        }

        // Verify the GPU output against the CPU reference.
        for (int i = 0; i < N; ++i)
        {
            if (Y[i] != expected[i])
            {
                fprintf(stderr, "Mismatch at index %d: GPU %d, CPU %d\n", i, Y[i], expected[i]);
                status = 1;
            }
        }
    }

    // release events and free memory on the gpu side
    checkCudaErr(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCudaErr(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    checkCudaErr(cudaFree(X), "cudaFree1");
    checkCudaErr(cudaFree(Y), "cudaFree2");
    checkCudaErr(cudaDeviceReset(), "cudaDeviceReset");

    return status;
}

Prefix Scan Results:
CPU Time: 0.019 msecs, GPU Time: 0.270
X[i]	| Y[i]
--------+-------
  1	|   1
  2	|   3
  3	|   6
  4	|  10
  5	|  15
  6	|  21
  7	|  28
  8	|  36
  9	|  45
 10	|  55
 11	|  66
 12	|  78
 13	|  91
 14	| 105
 15	| 120
 16	| 136

