In [13]:
! ls /usr/local/

bin    cuda	cuda-11.8  games	       include	lib64	   man	 share
colab  cuda-11	etc	   _gcs_config_ops.so  lib	licensing  sbin  src


In [14]:
! nvcc --version # nvcc compiler version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [15]:
# Understanding Thread Indexing in CUDA
%%writefile thread_indexing.cu
#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>

// Kernel Function
__global__ void idThreads_kernel(int *block_dev, int *threadLocal_dev, int *warp_dev, int *threadGlobal_dev){
  int threadGlobal_idx = (blockIdx.x*blockDim.x) + threadIdx.x;
  block_dev[threadGlobal_idx] = blockIdx.x;
  threadLocal_dev[threadGlobal_idx] = threadGlobal_idx / warpSize;
  threadGlobal_dev[threadGlobal_idx] = threadGlobal_idx;

  printf("Global Thread %d -- Block %d -- Warp %d -- Local Thread %d\n", threadGlobal_dev[threadGlobal_idx], block_dev[threadGlobal_idx], warp_dev[threadGlobal_idx], threadLocal_dev[threadGlobal_idx]);
}

#define TAM 128

// Main code executed by the host
int main(void){
  int num_blocks = 4; // 4 blocks each one with 32 threads
  int num_threads = 32;

  int block_host[TAM];
  int threadLocal_host[TAM];
  int warp_host[TAM];
  int threadGlobal_host[TAM];

  int *block_dev;
  int *threadLocal_dev;
  int *warp_dev;
  int *threadGlobal_dev;

  size_t TAM_bytes_int = TAM*sizeof(int);
  cudaMalloc((void**)&block_dev, TAM_bytes_int);
  cudaMalloc((void**)&threadLocal_dev, TAM_bytes_int);
  cudaMalloc((void**)&warp_dev, TAM_bytes_int);
  cudaMalloc((void**)&threadGlobal_dev, TAM_bytes_int);

  cudaMemset(threadGlobal_dev, -1, TAM_bytes_int);
  cudaMemset(threadLocal_dev, -1, TAM_bytes_int);
  cudaMemset(warp_dev, -1, TAM_bytes_int);
  cudaMemset(block_dev, -1, TAM_bytes_int);

  idThreads_kernel<<<num_blocks, num_threads>>>(block_dev, threadLocal_dev, warp_dev, threadGlobal_dev);

  cudaMemcpy(block_host, block_dev, TAM_bytes_int, cudaMemcpyDeviceToHost); // copy data from device back to host
  cudaMemcpy(threadLocal_host, threadLocal_dev, TAM_bytes_int, cudaMemcpyDeviceToHost);
  cudaMemcpy(warp_host, warp_dev, TAM_bytes_int, cudaMemcpyDeviceToHost);
  cudaMemcpy(threadGlobal_host, threadGlobal_dev, TAM_bytes_int, cudaMemcpyDeviceToHost);

  cudaFree(block_dev); // free memory device
  cudaFree(threadLocal_dev);
  cudaFree(warp_dev);
  cudaFree(threadGlobal_dev);

}

Overwriting thread_indexing.cu


In [16]:
! nvcc thread_indexing.cu -o test

In [17]:
! ./test

Global Thread 64 -- Block 2 -- Warp -1 -- Local Thread 2
Global Thread 65 -- Block 2 -- Warp -1 -- Local Thread 2
Global Thread 66 -- Block 2 -- Warp -1 -- Local Thread 2
Global Thread 67 -- Block 2 -- Warp -1 -- Local Thread 2
Global Thread 68 -- Block 2 -- Warp -1 -- Local Thread 2
Global Thread 69 -- Block 2 -- Warp -1 -- Local Thread 2
Global Thread 70 -- Block 2 -- Warp -1 -- Local Thread 2
Global Thread 71 -- Block 2 -- Warp -1 -- Local Thread 2
Global Thread 72 -- Block 2 -- Warp -1 -- Local Thread 2
Global Thread 73 -- Block 2 -- Warp -1 -- Local Thread 2
Global Thread 74 -- Block 2 -- Warp -1 -- Local Thread 2
Global Thread 75 -- Block 2 -- Warp -1 -- Local Thread 2
Global Thread 76 -- Block 2 -- Warp -1 -- Local Thread 2
Global Thread 77 -- Block 2 -- Warp -1 -- Local Thread 2
Global Thread 78 -- Block 2 -- Warp -1 -- Local Thread 2
Global Thread 79 -- Block 2 -- Warp -1 -- Local Thread 2
Global Thread 80 -- Block 2 -- Warp -1 -- Local Thread 2
Global Thread 81 -- Block 2 -- 