In [1]:
%%writefile divergence_test.cu

#include <cuda_runtime.h>
#include <stdio.h>

/*
 * Each thread reports, through device-side printf, whether its threadIdx.x
 * is even or odd. Neighbouring threads therefore take different sides of
 * the if/else, which is the branch divergence this test is meant to show.
 */
__global__ void divergence_test_ker()
{
	const unsigned int tid = threadIdx.x;
	const bool is_even = (tid % 2 == 0);

	if (is_even)
	{
		printf("threadIdx.x %d : This is an even thread.\n", tid);
	}
	else
	{
		printf("threadIdx.x %d : This is an odd thread.\n", tid);
	}
}

/*
 * Print a diagnostic on stderr when a CUDA runtime call did not succeed.
 *
 * status: value returned by the CUDA runtime call.
 * what:   short label for the call, used in the message.
 * Returns true when status is an error, false on cudaSuccess.
 */
static bool cuda_failed(cudaError_t status, const char *what)
{
	if (status == cudaSuccess)
		return false;
	fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
	return true;
}

/*
 * Host entry point: selects device 0, launches divergence_test_ker on one
 * block of 32 threads, waits for it to finish and resets the device.
 * Returns 0 on success and 1 as soon as any CUDA call reports an error.
 */
__host__ int main()
{
	if (cuda_failed(cudaSetDevice(0), "cudaSetDevice"))
		return 1;

	divergence_test_ker<<<1, 32>>>();
	// A kernel launch returns nothing itself; launch-time errors (bad
	// configuration, no kernel image for this GPU) are read back here.
	if (cuda_failed(cudaGetLastError(), "divergence_test_ker launch"))
		return 1;

	// Wait for the kernel to finish; errors raised while it ran surface here.
	if (cuda_failed(cudaDeviceSynchronize(), "cudaDeviceSynchronize"))
		return 1;

	if (cuda_failed(cudaDeviceReset(), "cudaDeviceReset"))
		return 1;

	return 0;
}

Writing divergence_test.cu


Let's compile the program. We need to call nvcc with a shell command. In Jupyter, shell commands start with !. We need to pass the directories for the CUDA include files (-I) and the CUDA libraries (-L). Let's also link the cuBLAS and cuSolver libraries, since you might need them at some point. We don't specify the name of the executable, so it will be the default, a.out. If everything goes well, executing the cell does not give any output. If the compiler has problems, you will see the error messages:

In [2]:
# -I adds a header search directory, -L a library search directory (lib64 on a standard Linux install); the output is the default a.out.
!nvcc -I /usr/local/cuda/samples/common/inc/ -L/usr/local/cuda/lib64 -lcublas -lcusolver -arch=sm_35 -Wno-deprecated-gpu-targets divergence_test.cu

In [3]:
!./a.out

threadIdx.x 1 : This is an odd thread.
threadIdx.x 3 : This is an odd thread.
threadIdx.x 5 : This is an odd thread.
threadIdx.x 7 : This is an odd thread.
threadIdx.x 9 : This is an odd thread.
threadIdx.x 11 : This is an odd thread.
threadIdx.x 13 : This is an odd thread.
threadIdx.x 15 : This is an odd thread.
threadIdx.x 17 : This is an odd thread.
threadIdx.x 19 : This is an odd thread.
threadIdx.x 21 : This is an odd thread.
threadIdx.x 23 : This is an odd thread.
threadIdx.x 25 : This is an odd thread.
threadIdx.x 27 : This is an odd thread.
threadIdx.x 29 : This is an odd thread.
threadIdx.x 31 : This is an odd thread.
threadIdx.x 0 : This is an even thread.
threadIdx.x 2 : This is an even thread.
threadIdx.x 4 : This is an even thread.
threadIdx.x 6 : This is an even thread.
threadIdx.x 8 : This is an even thread.
threadIdx.x 10 : This is an even thread.
threadIdx.x 12 : This is an even thread.
threadIdx.x 14 : This is an even thread.
threadIdx.x 16 : This is an even thread.
t

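**The odd and even threads run in separate groups, not in thread-index order!** <br> This happens because of the warp's lockstep property. All 32 threads in a warp must run the same instruction at the same time, so when the threads take different branches the warp executes one branch after the other, with the threads that did not take the current branch sitting idle.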

In [21]:
 %%writefile divergence_test2.cu

#include <cuda_runtime.h>
#include <stdio.h>

/*
 * Each thread reports, through device-side printf, the remainder of its
 * threadIdx.x divided by 3. The three-way if / else-if / else sends
 * neighbouring threads down three different branches.
 */
__global__ void divergence_test_ker()
{
	const unsigned int tid = threadIdx.x;
	const unsigned int remainder = tid % 3;

	if (remainder == 0)
	{
		printf("threadIdx.x %d : mod 0. \n", tid);
	}
	else if (remainder == 1)
	{
		printf("threadIdx.x %d : mod 1.\n", tid);
	}
	else
	{
		printf("threadIdx.x %d : mod 2.\n", tid);
	}
}

/*
 * Print a diagnostic on stderr when a CUDA runtime call did not succeed.
 *
 * status: value returned by the CUDA runtime call.
 * what:   short label for the call, used in the message.
 * Returns true when status is an error, false on cudaSuccess.
 */
static bool cuda_failed(cudaError_t status, const char *what)
{
	if (status == cudaSuccess)
		return false;
	fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
	return true;
}

/*
 * Host entry point: selects device 0, launches divergence_test_ker on one
 * block of 300 threads, waits for it to finish and resets the device.
 * Returns 0 on success and 1 as soon as any CUDA call reports an error.
 */
__host__ int main()
{
	if (cuda_failed(cudaSetDevice(0), "cudaSetDevice"))
		return 1;

	divergence_test_ker<<<1, 300>>>();
	// A kernel launch returns nothing itself; launch-time errors (bad
	// configuration, no kernel image for this GPU) are read back here.
	if (cuda_failed(cudaGetLastError(), "divergence_test_ker launch"))
		return 1;

	// Wait for the kernel to finish; errors raised while it ran surface here.
	if (cuda_failed(cudaDeviceSynchronize(), "cudaDeviceSynchronize"))
		return 1;

	if (cuda_failed(cudaDeviceReset(), "cudaDeviceReset"))
		return 1;

	return 0;
}

Overwriting divergence_test2.cu


In [22]:
# -I adds a header search directory, -L a library search directory (lib64 on a standard Linux install); the output is the default a.out.
!nvcc -I /usr/local/cuda/samples/common/inc/ -L/usr/local/cuda/lib64 -lcublas -lcusolver -arch=sm_35 -Wno-deprecated-gpu-targets divergence_test2.cu

In [23]:
!./a.out

threadIdx.x 288 : mod 0. 
threadIdx.x 291 : mod 0. 
threadIdx.x 294 : mod 0. 
threadIdx.x 297 : mod 0. 
threadIdx.x 96 : mod 0. 
threadIdx.x 99 : mod 0. 
threadIdx.x 102 : mod 0. 
threadIdx.x 105 : mod 0. 
threadIdx.x 108 : mod 0. 
threadIdx.x 111 : mod 0. 
threadIdx.x 114 : mod 0. 
threadIdx.x 117 : mod 0. 
threadIdx.x 120 : mod 0. 
threadIdx.x 123 : mod 0. 
threadIdx.x 126 : mod 0. 
threadIdx.x 258 : mod 0. 
threadIdx.x 261 : mod 0. 
threadIdx.x 264 : mod 0. 
threadIdx.x 267 : mod 0. 
threadIdx.x 270 : mod 0. 
threadIdx.x 273 : mod 0. 
threadIdx.x 276 : mod 0. 
threadIdx.x 279 : mod 0. 
threadIdx.x 282 : mod 0. 
threadIdx.x 285 : mod 0. 
threadIdx.x 192 : mod 0. 
threadIdx.x 195 : mod 0. 
threadIdx.x 198 : mod 0. 
threadIdx.x 201 : mod 0. 
threadIdx.x 204 : mod 0. 
threadIdx.x 207 : mod 0. 
threadIdx.x 210 : mod 0. 
threadIdx.x 213 : mod 0. 
threadIdx.x 216 : mod 0. 
threadIdx.x 219 : mod 0. 
threadIdx.x 222 : mod 0. 
threadIdx.x 162 : mod 0. 
threadIdx.x 165 : mod 0. 
threadIdx.x 16