<a href="https://colab.research.google.com/github/mmmovania/CUDA_Spring_2024/blob/main/Week13/Histogram_Strategy_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc4jupyter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-2f5mdzm2
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-2f5mdzm2
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4304 sha256=ed638ce0faeb72722372865737cdb53ec3899cdb9987ea9e2ebd0f5e261a8908
  Stored in directory: /tmp/pip-ephem-wheel-cache-_741cujj/wheels/db/c1/1f/a2bb07bbb4a1ce3c43921252aeafaa6205f08637e292496f04
Successfully built NVCCPlugin
Installing collecte

In [None]:
%%cuda
#include <stdio.h>
#include <cstdlib> //rand() function


// Logs a diagnostic to stderr when a CUDA runtime call did not succeed.
//   err - status code returned by the CUDA runtime call being checked
//   msg - caller-supplied label identifying the call site in the log line
// Returns `err` unchanged so the caller can still branch on it; this helper
// never terminates the program by itself.
inline cudaError_t checkCudaErr(cudaError_t err, const char* msg) {
	if (err == cudaSuccess) {
		return err;
	}

	const char* description = cudaGetErrorString(err);
	fprintf(stderr, "CUDA Runtime error at %s: %s\n", msg, description);
	return err;
}

// Host reference implementation: accumulates a histogram of `data`.
//   data      - N input values; each value is used directly as a bin index
//   N         - number of elements in `data`
//   histogram - Nbins counters; counts are ADDED to the existing contents,
//               so the caller must zero the array before the first call
//   Nbins     - number of counters available in `histogram`
// Values outside [0, Nbins) are skipped instead of being used as an index,
// which would otherwise write outside the histogram array.
void histogram_CPU(int *data, const int N, int* histogram, const int Nbins)
{
	for (int i = 0; i < N; ++i)
	{
		const int bin = data[i];

		// Guard the index: only values that name a real bin are counted.
		if (bin >= 0 && bin < Nbins)
		{
			histogram[bin]++;
		}
	}
}

// Histogram strategy 1: block partitioning of the input.
// The input is cut into one contiguous section per thread; every thread walks
// its own section of `section_size` consecutive elements and increments the
// matching bin in global memory with atomicAdd.
//   data  - N input values, each used directly as an index into `histo`
//   N     - number of elements in `data`
//   histo - bin counters in global memory; counts are added to the existing
//           contents, so the caller must zero them before the launch
// Expects a 1D grid of 1D blocks (only the .x components are used).
// NOTE(review): bin values are not range-checked because the number of bins is
// not a parameter of this kernel; the caller must guarantee valid values.
__global__ void histogram_GPU_1( int *data, const int N, int *histo)
{
	// Nothing to count. Without this guard, (N-1) would be converted to
	// unsigned in the division below and yield a huge bogus section size.
	if (N <= 0)
	{
		return;
	}

	const int total_threads = blockDim.x * gridDim.x;
	const int i = threadIdx.x + blockIdx.x * blockDim.x;

	// ceil(N / total_threads): number of consecutive elements per thread
	const int section_size = (N - 1) / total_threads + 1;

	// 64-bit arithmetic: i * section_size can exceed INT_MAX for large N.
	const long long start = (long long)i * section_size;
	long long end = start + section_size;
	if (end > N)
	{
		end = N; // the last sections may be partially or completely past the data
	}

	for (long long k = start; k < end; ++k)
	{
		atomicAdd(&(histo[data[k]]), 1);
	}
}

// Histogram strategy 2: interleaved partitioning of the input.
// Each thread starts at its global thread index and then advances by the total
// number of launched threads (blockDim.x * gridDim.x), so in every pass the
// threads of the grid together cover one run of consecutive elements.
//   data  - N input values, each used directly as an index into `histo`
//   N     - number of elements in `data`
//   histo - bin counters in global memory, updated with atomicAdd
__global__ void histogram_GPU_2( int *data, const int N, int *histo)
{
	// Distance between two elements handled by the same thread
	const unsigned int stride = blockDim.x * gridDim.x;

	int idx = threadIdx.x + blockIdx.x * blockDim.x;

	while (idx < N)
	{
		int *bin = &histo[data[idx]];
		atomicAdd(bin, 1);
		idx += stride;
	}
}

// Histogram strategy 3: privatization in shared memory.
// Every block accumulates into its own copy of the histogram held in shared
// memory, then merges that copy into the global histogram once at the end.
//   data     - N input values, each used as a bin index
//   N        - number of elements in `data`
//   histo    - global bin counters (num_bins entries); counts are added to the
//              existing contents, so the caller must zero them before launch
//   num_bins - number of bins in both the private and the global histogram
// Launch requirements: 1D grid of 1D blocks, and a dynamic shared memory size
// of at least num_bins * sizeof(unsigned int) bytes as third launch argument.
// Input values outside [0, num_bins) are ignored; they would otherwise index
// past the end of the shared-memory histogram.
__global__ void histogram_GPU_3( int *data, const int N, int *histo, const int num_bins)
{
	//Privatized bins
	extern __shared__ unsigned int histo_s[];

	const int tid = threadIdx.x + blockIdx.x * blockDim.x;
	const int stride = blockDim.x * gridDim.x;

	// Shared memory starts uninitialized: the threads of the block clear it
	// cooperatively. Signed loop counters keep a negative num_bins from being
	// interpreted as a huge unsigned bound.
	for (int binIdx = threadIdx.x; binIdx < num_bins; binIdx += blockDim.x)
	{
		histo_s[binIdx] = 0u;
	}

	// All bins must be zero before any thread of the block starts counting.
	__syncthreads();

	//Histogram: interleaved walk over the input, counting into the private copy.
	// A 64-bit index avoids overflow of i + stride and handles N <= 0 correctly.
	for (long long i = tid; i < N; i += stride)
	{
		const int bin = data[i];
		if (bin >= 0 && bin < num_bins)
		{
			atomicAdd(&(histo_s[bin]), 1u);
		}
	}

	// The private histogram must be complete before it is merged.
	__syncthreads();

	//Commit to global memory (merge step). Empty bins are skipped so that no
	// global atomic is issued when there is nothing to add.
	for (int binIdx = threadIdx.x; binIdx < num_bins; binIdx += blockDim.x)
	{
		const unsigned int count = histo_s[binIdx];
		if (count > 0u)
		{
			atomicAdd(&(histo[binIdx]), static_cast<int>(count));
		}
	}
}

// Reports a failed CUDA call through checkCudaErr and terminates the program,
// since no later step can produce meaningful results after a failure.
static void requireCuda(cudaError_t err, const char* msg)
{
	if (checkCudaErr(err, msg) != cudaSuccess)
	{
		exit(EXIT_FAILURE);
	}
}

// Sets all Nbins counters of `histogram` to zero.
static void clearHistogram(int* histogram, const int Nbins)
{
	for (int i = 0; i < Nbins; ++i)
	{
		histogram[i] = 0;
	}
}

// Returns true when both histograms hold identical counts in all Nbins bins.
static bool histogramsMatch(const int* expected, const int* actual, const int Nbins)
{
	for (int i = 0; i < Nbins; ++i)
	{
		if (expected[i] != actual[i])
		{
			return false;
		}
	}
	return true;
}

// Records `stop` on the default stream, waits until it has completed and
// returns the time in milliseconds elapsed since `start` was recorded.
// Waiting on `stop` also surfaces errors raised by a kernel that ran before it.
static float stopTimer(cudaEvent_t start, cudaEvent_t stop, const char* label)
{
	float elapsed_ms = 0.0f;
	requireCuda(cudaEventRecord(stop, 0), label);
	requireCuda(cudaEventSynchronize(stop), label);
	requireCuda(cudaEventElapsedTime(&elapsed_ms, start, stop), label);
	return elapsed_ms;
}

// Compares one GPU result against the CPU reference and reports a mismatch.
// Returns 1 when the histograms agree, 0 otherwise.
static int verifyStrategy(const int* reference, const int* result, const int Nbins, const int strategy)
{
	if (histogramsMatch(reference, result, Nbins))
	{
		return 1;
	}
	printf("Histogram mismatch!!! (GPU strategy %d)\n", strategy);
	return 0;
}

// Builds a histogram of random data once on the CPU and once with each of the
// three GPU strategies, prints the time of every run and checks every GPU
// result against the CPU reference.
// Returns EXIT_SUCCESS when all GPU results match, EXIT_FAILURE otherwise.
int main()
{
	int *data;
	int *histogram;
	int *histogramGPU;
	const int Nbins = 256;
	const int N = 1024 * 1024;
	const int threadsPerBlock = 256;

	// Enough blocks to cover the data with one element per thread, capped at 32
	const int blocksNeeded = (N + threadsPerBlock - 1) / threadsPerBlock;
	const int blocksPerGrid = (blocksNeeded < 32) ? blocksNeeded : 32;

	// Allocate Unified Memory -- accessible from CPU or GPU
	requireCuda(cudaMallocManaged(&data, N*sizeof(int)), "cudaMallocManaged1 data");
	requireCuda(cudaMallocManaged(&histogram, Nbins*sizeof(int)), "cudaMallocManaged2 histogram");
	requireCuda(cudaMallocManaged(&histogramGPU, Nbins*sizeof(int)), "cudaMallocManaged3 histogramGPU");

	// Both histograms are accumulators and freshly allocated memory is not
	// guaranteed to be zero, so clear them before the first use.
	clearHistogram(histogram, Nbins);
	clearHistogram(histogramGPU, Nbins);

	// fill in the data with random values between 0 and Nbins-1
	for (int i = 0; i < N; i++)
	{
		data[i] = (rand() % Nbins);
	}

	// some events to count the execution time
	cudaEvent_t start, stop;
	requireCuda(cudaEventCreate(&start), "cudaEventCreate start");
	requireCuda(cudaEventCreate(&stop), "cudaEventCreate stop");

	int allOK = 1;

	// ---- CPU reference -------------------------------------------------
	requireCuda(cudaEventRecord(start, 0), "cudaEventRecord start (CPU)");
	histogram_CPU(data, N, histogram, Nbins);
	const float cpu_elapsed_time_ms = stopTimer(start, stop, "timing CPU histogram");
	printf("Histogram(CPU) - Time: %f ms.\n", cpu_elapsed_time_ms);

	// ---- GPU strategy 1: block partitioning -----------------------------
	requireCuda(cudaEventRecord(start, 0), "cudaEventRecord start (GPU 1)");
	histogram_GPU_1<<<blocksPerGrid,threadsPerBlock>>>( data, N, histogramGPU );
	// A launch does not return a status; fetch launch errors explicitly.
	requireCuda(cudaGetLastError(), "histogram_GPU_1 launch");
	const float gpu_1_elapsed_time_ms = stopTimer(start, stop, "histogram_GPU_1 execution");
	printf("Histogram_1(GPU) - Time: %f ms.\n", gpu_1_elapsed_time_ms);
	allOK &= verifyStrategy(histogram, histogramGPU, Nbins, 1);

	// ---- GPU strategy 2: interleaved partitioning -----------------------
	//clear old gpu histogram before computing with strategy 2
	clearHistogram(histogramGPU, Nbins);
	requireCuda(cudaEventRecord(start, 0), "cudaEventRecord start (GPU 2)");
	histogram_GPU_2<<<blocksPerGrid,threadsPerBlock>>>( data, N, histogramGPU );
	requireCuda(cudaGetLastError(), "histogram_GPU_2 launch");
	const float gpu_2_elapsed_time_ms = stopTimer(start, stop, "histogram_GPU_2 execution");
	printf("Histogram_2(GPU) - Time: %f ms.\n", gpu_2_elapsed_time_ms);
	allOK &= verifyStrategy(histogram, histogramGPU, Nbins, 2);

	// ---- GPU strategy 3: privatized bins in shared memory ---------------
	//clear old gpu histogram before computing with strategy 3
	clearHistogram(histogramGPU, Nbins);
	requireCuda(cudaEventRecord(start, 0), "cudaEventRecord start (GPU 3)");
	// third launch argument: one unsigned int of dynamic shared memory per bin
	histogram_GPU_3<<<blocksPerGrid,threadsPerBlock, Nbins*sizeof(unsigned int)>>>( data, N, histogramGPU, Nbins );
	requireCuda(cudaGetLastError(), "histogram_GPU_3 launch");
	const float gpu_3_elapsed_time_ms = stopTimer(start, stop, "histogram_GPU_3 execution");
	printf("Histogram_3(GPU) - Time: %f ms.\n", gpu_3_elapsed_time_ms);
	allOK &= verifyStrategy(histogram, histogramGPU, Nbins, 3);

	if (allOK == 1)
		printf("Results correct on both CPU and GPU");

	// release the timing events
	checkCudaErr( cudaEventDestroy( start ) , "cudaEventDestroy start");
	checkCudaErr( cudaEventDestroy( stop ) , "cudaEventDestroy stop");

	// free memory on the gpu side
	checkCudaErr( cudaFree( data ) , "cudaFree1");
	checkCudaErr( cudaFree( histogram ) , "cudaFree2");
	checkCudaErr( cudaFree( histogramGPU ) , "cudaFree3");
	checkCudaErr( cudaDeviceReset(), "cudaDeviceReset");

	return (allOK == 1) ? EXIT_SUCCESS : EXIT_FAILURE;
}

Histogram(CPU) - Time: 2.592768 ms.
Histogram_1(GPU) - Time: 2.088224 ms.
Histogram_2(GPU) - Time: 0.735232 ms.
Histogram_3(GPU) - Time: 0.147936 ms.
Results correct on both CPU and GPU
