<a href="https://colab.research.google.com/github/mmmovania/CUDA_Spring_2024/blob/main/Week13/Histogram_Strategy_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git --quiet
%load_ext nvcc4jupyter

  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
  Building wheel for nvcc4jupyter (pyproject.toml) ... done
Source files will be saved in "/tmp/tmpezia7p3t".


In [2]:
%%cuda
#include <stdio.h>
#include <cstdlib> //rand() function


// Reports a failed CUDA runtime call on stderr and hands the status back.
//   err : status code returned by a CUDA runtime call
//   msg : caller-supplied label that identifies the call in the log line
// Returns err unchanged, so the call can be wrapped in-line. Nothing is
// printed when err is cudaSuccess, and the function never aborts.
inline cudaError_t checkCudaErr(cudaError_t err, const char* msg) {
	if (err == cudaSuccess)
		return err;

	// Failure path: log the label plus the runtime's description of the error.
	fprintf(stderr, "CUDA Runtime error at %s: %s\n", msg, cudaGetErrorString(err));
	return err;
}

// Sequential reference histogram.
//   data      : N input values; each value is used directly as a bin index
//   N         : number of elements in data
//   histogram : Nbins counters; counts are ADDED to the existing contents,
//               so the caller must zero the array before the first call
//   Nbins     : number of counters in histogram
// Values outside [0, Nbins) are ignored instead of writing out of bounds.
void histogram_CPU(int *data, const int N, int* histogram, const int Nbins)
{
	// Nothing to do for missing buffers or an empty bin range.
	if (!data || !histogram || Nbins <= 0)
		return;

	for(int i=0; i<N; ++i)
	{
		const int bin = data[i];

		// Guard the write: an out-of-range value would corrupt memory.
		if (bin < 0 || bin >= Nbins)
			continue;

		histogram[bin]++;
	}
}


// Histogram kernel: per-block privatized bins in shared memory combined with
// per-thread aggregation of consecutive equal values.
//
//   data     : N input values; each value is used directly as a bin index
//   N        : number of elements in data
//   histo    : num_bins global counters; block results are ADDED atomically,
//              so the caller must zero the array before the launch
//   num_bins : number of bins
//
// Launch requirements:
//   * 1D grid and 1D blocks; any grid size works because the input is walked
//     with a grid-stride loop.
//   * Dynamic shared memory of at least num_bins * sizeof(int) bytes
//     (third launch parameter) for the block-private histogram.
//
// Values outside [0, num_bins) are ignored. Without that guard they would
// index shared memory out of bounds, and the value -1 would additionally be
// mistaken for the "no previous bin" sentinel.
__global__ void histogram_GPU_4( int *data, const int N, int *histo, const int num_bins)
{
	// 64-bit index arithmetic so large grids / large N cannot overflow int.
	const long long stride = (long long)blockDim.x * gridDim.x;
	const long long tid    = (long long)blockIdx.x * blockDim.x + threadIdx.x;

	//Privatized bins
	extern __shared__ int histo_s[];

	// Cooperatively clear the block-private bins.
	for (int binIdx = threadIdx.x; binIdx < num_bins; binIdx += blockDim.x)
		histo_s[binIdx] = 0;

	// Every bin must be zero before any thread starts adding to it.
	__syncthreads();

	int prev_index  = -1;   // bin of the current run; -1 means "no run yet"
	int accumulator = 0;    // length of the current run of equal values

	//Histogram: runs of equal values seen by this thread are folded into a
	//single shared-memory atomic instead of one atomic per element.
	for (long long i = tid; i < N; i += stride) {
		const int curr_index = data[i];

		// Skip values that do not map to a valid bin.
		if (curr_index < 0 || curr_index >= num_bins)
			continue;

		if (curr_index == prev_index) {
			++accumulator;
		}
		else {
			// The run ended: flush it (accumulator is 0 only before the first run).
			if (accumulator > 0)
				atomicAdd(&histo_s[prev_index], accumulator);
			accumulator = 1;
			prev_index  = curr_index;
		}
	}

	// Flush the last open run. accumulator > 0 implies prev_index is a valid bin.
	if (accumulator > 0)
		atomicAdd(&histo_s[prev_index], accumulator);

	// All shared-memory updates must be complete before the merge reads them.
	__syncthreads();

	//Commit to global memory (merge step); empty bins need no global atomic.
	for (int binIdx = threadIdx.x; binIdx < num_bins; binIdx += blockDim.x) {
		const int count = histo_s[binIdx];
		if (count > 0)
			atomicAdd(&(histo[binIdx]), count);
	}
}

// Driver: builds a random data set, computes its histogram on the CPU and
// with histogram_GPU_4 on the GPU, times both with CUDA events, and prints a
// bin-by-bin comparison.
// Returns 0 after the comparison has run (whether or not the bins match) and
// 1 when a CUDA call needed to reach the comparison fails.
int main()
{
	int *data = NULL;
	int *histogram = NULL;
	int *histogramGPU = NULL;
	const int Nbins = 256;
	const int N = 1024 * 1024;
	const int threadsPerBlock = 256;

	// Arguments are parenthesized so expression arguments expand correctly.
	#define imin(a,b) ((a)<(b)?(a):(b))

	// Cap the grid at 32 blocks; the kernel's grid-stride loop covers the rest.
	const int blocksPerGrid =  imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );

	// Allocate Unified Memory -- accessible from CPU or GPU.
	// The buffers are dereferenced below, so a failed allocation is fatal.
	if (checkCudaErr(cudaMallocManaged(&data, N*sizeof(int)), "cudaMallocManaged1 data") != cudaSuccess)
		return 1;
	if (checkCudaErr(cudaMallocManaged(&histogram, Nbins*sizeof(int)), "cudaMallocManaged2 histogram") != cudaSuccess)
		return 1;
	if (checkCudaErr(cudaMallocManaged(&histogramGPU, Nbins*sizeof(int)), "cudaMallocManaged2 histogramGPU") != cudaSuccess)
		return 1;

	// fill in the data with random values between 0-255
	for (int i=0; i<N; i++) {
		data[i] = (rand() % Nbins);
	}

	// Both histogram routines only add to their output, so both outputs must
	// start from zero; do not rely on the allocator's initial contents.
	for (int i=0; i<Nbins; i++) {
		histogram[i] = 0;
		histogramGPU[i] = 0;
	}

	// some events to count the execution time
	cudaEvent_t start, stop;
	float cpu_elapsed_time_ms = 0.0f, gpu_4_elapsed_time_ms = 0.0f;

	if (checkCudaErr(cudaEventCreate(&start), "cudaEventCreate start") != cudaSuccess)
		return 1;
	if (checkCudaErr(cudaEventCreate(&stop), "cudaEventCreate stop") != cudaSuccess)
		return 1;

	//start to count execution time of CPU version
	checkCudaErr(cudaEventRecord(start, 0), "cudaEventRecord start (CPU)");

	//calculate histogram on the CPU
	histogram_CPU(data, N, histogram, Nbins);

	// time counting terminate
	checkCudaErr(cudaEventRecord(stop, 0), "cudaEventRecord stop (CPU)");
	checkCudaErr(cudaEventSynchronize(stop), "cudaEventSynchronize (CPU)");

	//compute time elapsed on CPU
	checkCudaErr(cudaEventElapsedTime(&cpu_elapsed_time_ms, start, stop), "cudaEventElapsedTime (CPU)");
	printf("Histogram(CPU) - Time: %f ms.\n", cpu_elapsed_time_ms);

	//start to count execution time of GPU version (histogram strategy 4)
	checkCudaErr(cudaEventRecord(start, 0), "cudaEventRecord start (GPU)");

	//launch kernel; the third launch parameter sizes the block-private bins
	histogram_GPU_4<<<blocksPerGrid,threadsPerBlock, Nbins*sizeof(int)>>>( data, N, histogramGPU, Nbins );

	// A launch returns no status itself; configuration errors show up here.
	const cudaError_t launchErr = checkCudaErr(cudaGetLastError(), "histogram_GPU_4 launch");

	// time counting terminate
	checkCudaErr(cudaEventRecord(stop, 0), "cudaEventRecord stop (GPU)");
	checkCudaErr(cudaEventSynchronize(stop), "cudaEventSynchronize (GPU)");

	// compute time elapse on GPU computing
	checkCudaErr(cudaEventElapsedTime(&gpu_4_elapsed_time_ms, start, stop), "cudaEventElapsedTime (GPU)");
	printf("Histogram_4(GPU) - Time: %f ms.\n", gpu_4_elapsed_time_ms);

	// Errors raised while the kernel was executing are reported by the next
	// synchronizing call; the host must not read the results if it failed.
	const cudaError_t syncErr = checkCudaErr(cudaDeviceSynchronize(), "histogram_GPU_4 execution");
	if (launchErr != cudaSuccess || syncErr != cudaSuccess)
		return 1;

	int allOK = 1;

	printf("+------------------+-------------------+\n");
	printf("| CPU Histogram[i] |  GPU Histogram[i] |\n");
	printf("+------------------+-------------------+\n");

	//compare results on CPU and GPU; stop at the first differing bin
	for(int i=0 ; i<Nbins; ++i)
	{
		printf("|      %5d       |      %5d        |\n", histogram[i], histogramGPU[i]);

		if(histogram[i]!=histogramGPU[i])
		{
			printf("|\t Histogram mismatch!!!         |\n");
			allOK=0;
			break;
		}
	}
	printf("+------------------+-------------------+\n");

	if(allOK==1)
		printf("Results correct on both CPU and GPU");

	// release the timing events
	checkCudaErr( cudaEventDestroy( start ) , "cudaEventDestroy start");
	checkCudaErr( cudaEventDestroy( stop ) , "cudaEventDestroy stop");

	// free the unified-memory buffers
	checkCudaErr( cudaFree( data ) , "cudaFree1");
	checkCudaErr( cudaFree( histogram ) , "cudaFree2");
	checkCudaErr( cudaFree( histogramGPU ) , "cudaFree3");
	checkCudaErr( cudaDeviceReset(), "cudaDeviceReset");

	return 0;
}

Histogram(CPU) - Time: 3.727008 ms.
Histogram_4(GPU) - Time: 185.039780 ms.
+------------------+-------------------+
| CPU Histogram[i] |  GPU Histogram[i] |
+------------------+-------------------+
|       4083       |       4083        |
|       4074       |       4074        |
|       4073       |       4073        |
|       4107       |       4107        |
|       3910       |       3910        |
|       4082       |       4082        |
|       4173       |       4173        |
|       4074       |       4074        |
|       3957       |       3957        |
|       4073       |       4073        |
|       4072       |       4072        |
|       4077       |       4077        |
|       4098       |       4098        |
|       4105       |       4105        |
|       4112       |       4112        |
|       4070       |       4070        |
|       4152       |       4152        |
|       4061       |       4061        |
|       4101       |       4101        |
|       4105       |  