<a href="https://colab.research.google.com/github/mmmovania/CUDA_Spring_2024/blob/main/Week7/CUDA_Streams_Breadth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc4jupyter

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-3xy1f95u
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-3xy1f95u
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 781ff5b76ba6c4c2d80dcbbec9983e147613cc71
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Source files will be saved in "/tmp/tmph7gencha".


In [3]:
%%cuda

#include <stdio.h>

#define N 1000000000
#define NSTREAM 4

/*
 * Reports a failed CUDA runtime call on stderr.
 *
 *   err  status code returned by the CUDA call being checked
 *   msg  short label printed in the message to identify the call site
 *
 * Returns err unchanged so the caller can still branch on it. The function
 * only reports; it never aborts the program.
 */
inline cudaError_t checkCudaErr(cudaError_t err, const char* msg) {
	// Fast path: nothing to report.
	if (err == cudaSuccess) {
		return err;
	}

	fprintf(stderr, "CUDA Runtime error at %s: %s\n", msg, cudaGetErrorString(err));
	return err;
}

/*
 * Busy-work kernel used only to occupy a stream.
 *
 * Takes no arguments and stores nothing: every thread runs N iterations of a
 * double-precision accumulation into a local variable that is then discarded.
 * No particular grid/block layout is required.
 *
 * NOTE(review): since the accumulator is never written anywhere, the compiler
 * may be free to remove the loop entirely - confirm in the generated code if
 * the kernel's duration matters.
 */
__global__ void kernel_1()
{
	double acc = 0.0;
	int iter = 0;
	while (iter < N)
	{
		acc += acc + tan(0.1) * tan(0.1);
		++iter;
	}
}

/*
 * Busy-work kernel used only to occupy a stream.
 *
 * Takes no arguments and stores nothing: every thread runs N iterations of a
 * double-precision accumulation into a local variable that is then discarded.
 * No particular grid/block layout is required.
 *
 * NOTE(review): since the accumulator is never written anywhere, the compiler
 * may be free to remove the loop entirely - confirm in the generated code if
 * the kernel's duration matters.
 */
__global__ void kernel_2()
{
	double acc = 0.0;
	int iter = 0;
	while (iter < N)
	{
		acc += acc + tan(0.1) * tan(0.1);
		++iter;
	}
}

/*
 * Busy-work kernel used only to occupy a stream.
 *
 * Takes no arguments and stores nothing: every thread runs N iterations of a
 * double-precision accumulation into a local variable that is then discarded.
 * No particular grid/block layout is required.
 *
 * NOTE(review): since the accumulator is never written anywhere, the compiler
 * may be free to remove the loop entirely - confirm in the generated code if
 * the kernel's duration matters.
 */
__global__ void kernel_3()
{
	double acc = 0.0;
	int iter = 0;
	while (iter < N)
	{
		acc += acc + tan(0.1) * tan(0.1);
		++iter;
	}
}

/*
 * Busy-work kernel used only to occupy a stream.
 *
 * Takes no arguments and stores nothing: every thread runs N iterations of a
 * double-precision accumulation into a local variable that is then discarded.
 * No particular grid/block layout is required.
 *
 * NOTE(review): since the accumulator is never written anywhere, the compiler
 * may be free to remove the loop entirely - confirm in the generated code if
 * the kernel's duration matters.
 */
__global__ void kernel_4()
{
	double acc = 0.0;
	int iter = 0;
	while (iter < N)
	{
		acc += acc + tan(0.1) * tan(0.1);
		++iter;
	}
}

/*
 * Times the four busy-work kernels issued across NSTREAM streams.
 *
 * The launches are issued breadth-first: kernel_1 is queued on every stream,
 * then kernel_2 on every stream, and so on. The whole batch is bracketed by a
 * pair of events and the elapsed time is printed in seconds.
 *
 * Returns 0 on success and 1 if any CUDA call or kernel launch reported an
 * error (each error is also printed to stderr by checkCudaErr).
 */
int main()
{
	float elapsed_time = 0.0f;
	int bigcase = 1;
	int isize = 1;
	int iblock = 1;
	int failed = 0;	// set to 1 as soon as any CUDA call reports an error

	// NSTREAM is a compile-time constant, so the handles can live on the
	// stack; no heap allocation (and no unchecked malloc) is needed.
	const int n_streams = NSTREAM;
	cudaStream_t streams[NSTREAM];

	for (int i = 0; i < n_streams; i++)
	{
		if (checkCudaErr(cudaStreamCreate(&streams[i]), "stream creation") != cudaSuccess)
		{
			// Launching on an uninitialised stream handle is invalid, so stop
			// here. cudaDeviceReset releases any streams already created.
			checkCudaErr(cudaDeviceReset(), "cudaDeviceReset");
			return 1;
		}
	}

	// run kernel with more threads
	if (bigcase == 1)
	{
		iblock = 512;
		isize = 1 << 12;
	}

	// set up execution configuration
	dim3 block (iblock);
	dim3 grid  (isize / iblock);

	// create events
	cudaEvent_t start, stop;
	if (checkCudaErr(cudaEventCreate(&start), "event create (start)") != cudaSuccess ||
	    checkCudaErr(cudaEventCreate(&stop), "event create (stop)") != cudaSuccess)
	{
		// Without both events there is nothing to measure.
		checkCudaErr(cudaDeviceReset(), "cudaDeviceReset");
		return 1;
	}

	// record start event
	// NOTE(review): both events are recorded on stream 0. The measurement
	// relies on stream 0 being the legacy default stream, which synchronizes
	// with streams made by cudaStreamCreate - confirm if the build ever uses
	// per-thread default streams.
	if (checkCudaErr(cudaEventRecord(start, 0), "cudaEventRecord(start)") != cudaSuccess)
		failed = 1;

	// Breadth-first issue order. A kernel launch returns no status itself, so
	// cudaGetLastError is queried after each one to catch launch failures.
	for (int i = 0; i < n_streams; ++i)
	{
		kernel_1 <<< grid, block, 0, streams[i] >>> ();
		if (checkCudaErr(cudaGetLastError(), "kernel_1 launch") != cudaSuccess)
			failed = 1;
	}
	for (int i = 0; i < n_streams; ++i)
	{
		kernel_2 <<< grid, block, 0, streams[i] >>> ();
		if (checkCudaErr(cudaGetLastError(), "kernel_2 launch") != cudaSuccess)
			failed = 1;
	}
	for (int i = 0; i < n_streams; ++i)
	{
		kernel_3 <<< grid, block, 0, streams[i] >>> ();
		if (checkCudaErr(cudaGetLastError(), "kernel_3 launch") != cudaSuccess)
			failed = 1;
	}
	for (int i = 0; i < n_streams; ++i)
	{
		kernel_4 <<< grid, block, 0, streams[i] >>> ();
		if (checkCudaErr(cudaGetLastError(), "kernel_4 launch") != cudaSuccess)
			failed = 1;
	}

	// record stop event and wait for it
	if (checkCudaErr(cudaEventRecord(stop, 0), "cudaEventRecord") != cudaSuccess)
		failed = 1;
	if (checkCudaErr(cudaEventSynchronize(stop), "cudaEventSynchronize") != cudaSuccess)
		failed = 1;

	// calculate elapsed time; only report it when the query itself succeeded,
	// otherwise the printed value would be meaningless
	if (checkCudaErr(cudaEventElapsedTime(&elapsed_time, start, stop), "cudaEventElapsedTime") == cudaSuccess)
	{
		// cudaEventElapsedTime reports milliseconds; convert to seconds.
		printf("Measured time for parallel execution = %.3fs\n", elapsed_time / 1000.0f);
	}
	else
	{
		failed = 1;
	}

	// release all streams
	for (int i = 0; i < n_streams; i++)
	{
		if (checkCudaErr(cudaStreamDestroy(streams[i]), "cudaStreamDestroy") != cudaSuccess)
			failed = 1;
	}

	// destroy events
	if (checkCudaErr(cudaEventDestroy(start), "cudaEventDestroy(start)") != cudaSuccess)
		failed = 1;
	if (checkCudaErr(cudaEventDestroy(stop), "cudaEventDestroy(stop)") != cudaSuccess)
		failed = 1;
	if (checkCudaErr(cudaDeviceReset(), "cudaDeviceReset") != cudaSuccess)
		failed = 1;

	return failed;
}

Measured time for parallel execution = 0.104s

