<a href="https://colab.research.google.com/github/mmmovania/CUDA_Spring_2024/blob/main/Week6/CUDA_DynamicParallelism.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc4jupyter

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-vhbtxkxv
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-vhbtxkxv
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 781ff5b76ba6c4c2d80dcbbec9983e147613cc71
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: nvcc4jupyter
  Building wheel for nvcc4jupyter (pyproject.toml) ... [?25l[?25hdone
  Created wheel for nvcc4jupyter: filename=nvcc4jupyter-1.1.0-py3-none-any.whl size=8011 sha256=e2b2acbb79d726227ace2ffdeb18b901e4debb9cf9419b13a35a389a655ee4ac
  Stored in directory: /tmp/pip-ephem-wheel-cache-6a7q6x9f/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully bui

In [4]:
%%writefile dp.cu
//example given here: https://stackoverflow.com/questions/64516177/call-kernel-inside-cuda-kernel


#include <stdio.h>
const int N = 100;//33 * 1024;
const int threadsPerBlock = 32;// 256;

#define imin(a,b) (a<b?a:b)

const int blocksPerGrid =  imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );

inline cudaError_t checkCudaErr(cudaError_t err, const char* msg) {
	if (err != cudaSuccess) {
		fprintf(stderr, "CUDA Runtime error at %s: %s\n", msg, cudaGetErrorString(err));
	}
	return err;
}

__global__ void kernel( float *a, float *b, int N )
{
	int i = threadIdx.x + blockDim.x*blockIdx.x;
	if(i < N)
	{
			for(int j=i; j< min(i+5, N); j++)
			{
					b[i] += a[j];
			}
	}
}

__global__ void kernel_child(int start, int end, float *a, float *b)
{
	int j = start + threadIdx.x + blockDim.x*blockIdx.x;
	if(j < end)
	{
			b[j] += a[j];
	}
}

__global__ void kernel_parent( float *a, float *b, int start, int end )
{
	int i = threadIdx.x + blockDim.x*blockIdx.x;

	b[i] = a[i];
	kernel_child<<< (end-start)/32, 32>>>(start, end, a, b);
}



int main()
{
	float   *a = 0;
  float   *b = 0;
	float   *c = 0;

	// Allocate Unified Memory -- accessible from CPU or GPU
	checkCudaErr(cudaMallocManaged(&a, N*sizeof(float)), "cudaMallocManaged1");
	checkCudaErr(cudaMallocManaged(&b, N*sizeof(float)), "cudaMallocManaged2");
	checkCudaErr(cudaMallocManaged(&c, N*sizeof(float)), "cudaMallocManaged3");

	// fill in the memory with data
	for (int i=0; i<N; i++) {
		a[i] = i+1;
		b[i] = 0;
		c[i] = 0;
	}

	kernel<<<blocksPerGrid,threadsPerBlock>>>( a, b, N );

	cudaDeviceSynchronize();

	kernel_parent<<<blocksPerGrid,threadsPerBlock>>>( a, b, 0, N-1 );

	for(int i=0; i<N;++i)
		printf("%f\n", b[i]);

	// free memory on the gpu side
	checkCudaErr( cudaFree( a ) , "cudaFree1");
	checkCudaErr( cudaFree( b ) , "cudaFree2");
	checkCudaErr( cudaFree( c ) , "cudaFree3");
	checkCudaErr( cudaDeviceReset(), "cudaDeviceReset");

	return 0;
}

Overwriting dp.cu


In [5]:
!nvcc -rdc=true dp.cu -o dp
!./dp

1.000000
2.000000
3.000000
4.000000
5.000000
6.000000
7.000000
8.000000
9.000000
10.000000
11.000000
12.000000
13.000000
14.000000
15.000000
16.000000
17.000000
18.000000
19.000000
20.000000
21.000000
22.000000
23.000000
24.000000
25.000000
26.000000
27.000000
28.000000
29.000000
30.000000
31.000000
32.000000
33.000000
34.000000
35.000000
36.000000
37.000000
38.000000
39.000000
40.000000
41.000000
42.000000
43.000000
44.000000
45.000000
46.000000
47.000000
48.000000
49.000000
50.000000
51.000000
52.000000
53.000000
54.000000
55.000000
56.000000
57.000000
58.000000
59.000000
60.000000
61.000000
62.000000
126.000000
128.000000
130.000000
132.000000
134.000000
136.000000
138.000000
140.000000
142.000000
144.000000
146.000000
148.000000
150.000000
152.000000
154.000000
156.000000
158.000000
240.000000
243.000000
246.000000
249.000000
252.000000
255.000000
258.000000
261.000000
264.000000
267.000000
270.000000
273.000000
276.000000
279.000000
282.000000
285.000000
288.000000
97.000000
98.00