**Thread Organization in CUDA**

In [None]:
!which nvcc
!/usr/local/cuda/bin/nvcc --version
!nvidia-smi

/usr/local/cuda/bin/nvcc
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
Wed May 28 07:14:51 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   47C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |

**Kernel Function (launch parameters) & Multiple Dimensions using dim3**

In [None]:
%%writefile two.cu

#include "/usr/local/cuda/include/cuda_runtime.h"
#include "/usr/local/cuda/include/device_launch_parameters.h"

#include <stdio.h>

// Kernel: every launched thread prints the same fixed line once using
// device-side printf. It takes no arguments and does not read any
// thread or block index, so the number of lines printed equals the
// total number of threads in the launch.
__global__ void hello_cuda()
{
    printf("Aniruddha Shete\n");
}

// Host entry point: launches hello_cuda on a 1D grid and reports both
// launch-time and execution-time failures.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main(){
  const dim3 block(4);  // 4 threads per block
  const dim3 grid(8);   // 8 blocks, so 8 * 4 = 32 threads print one line each

  hello_cuda<<<grid, block>>>();  // kernel launch parameters: <<<grid, block>>>

  // A kernel launch returns nothing; configuration/toolchain errors are
  // only visible through cudaGetLastError().
  cudaError_t cudaStatus = cudaGetLastError();
  if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "hello_cuda launch failed: %s\n", cudaGetErrorString(cudaStatus));
    return 1;  // Indicate an error
  }

  // The launch is asynchronous: wait for the kernel to finish (this also
  // flushes device printf output) and check for errors raised while it ran.
  cudaStatus = cudaDeviceSynchronize();
  if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "hello_cuda execution failed: %s\n", cudaGetErrorString(cudaStatus));
    cudaDeviceReset();
    return 1;
  }

  cudaDeviceReset();
  return 0;
}

Writing two.cu


In [None]:
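# !nvcc two.cu -o two
# Without -arch the kernel launch fails with "the provided PTX was compiled with an unsupported toolchain":
# nvcc here is release 12.5 but the driver only supports CUDA 12.4, so it cannot JIT-compile the embedded PTX.
# Building native code for the Tesla T4 (compute capability 7.5) with -arch=sm_75 avoids the JIT step.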
!nvcc -arch=sm_75 two.cu -o two

In [None]:
!./two

Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete
Aniruddha Shete


**threadIdx**

In [None]:
%%writefile three.cu

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

// Kernel: each thread prints its own index within its block along the
// x, y and z dimensions. Block and grid indices are not printed, so
// threads from different blocks produce identical lines.
__global__ void print_threadIds()
{
    const unsigned int tx = threadIdx.x;
    const unsigned int ty = threadIdx.y;
    const unsigned int tz = threadIdx.z;

    printf("threadIdx.x : %d, threadIdx.y : %d, threadIdx.z : %d \n", tx, ty, tz);
}

// Host entry point: launches print_threadIds on a 2D grid of 2D blocks
// and reports launch-time and execution-time failures.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main(){
    // Logical 2D extent; used only to derive the grid dimensions below.
    const int nx = 16;
    const int ny = 16;

    const dim3 block(8, 8);                       // 8 x 8 threads per block
    const dim3 grid(nx / block.x, ny / block.y);  // 16/8 = 2 blocks along x and y

    print_threadIds<<<grid, block>>>();

    // A kernel launch returns nothing; launch errors are only visible
    // through cudaGetLastError().
    cudaError_t cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "print_threadIds launch failed: %s\n", cudaGetErrorString(cudaStatus));
        return 1;
    }

    // Wait for the kernel to finish (flushes device printf output) and
    // pick up any error raised while it was running.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "print_threadIds execution failed: %s\n", cudaGetErrorString(cudaStatus));
        cudaDeviceReset();
        return 1;
    }

    cudaDeviceReset();
    return 0;
}

Overwriting three.cu


In [None]:
!nvcc -arch=sm_75 three.cu -o three

In [None]:
!./three

block.x : 8, block.y : 8 
threadIdx.x : 0, threadIdx.y : 0, threadIdx.z : 0 
threadIdx.x : 1, threadIdx.y : 0, threadIdx.z : 0 
threadIdx.x : 2, threadIdx.y : 0, threadIdx.z : 0 
threadIdx.x : 3, threadIdx.y : 0, threadIdx.z : 0 
threadIdx.x : 4, threadIdx.y : 0, threadIdx.z : 0 
threadIdx.x : 5, threadIdx.y : 0, threadIdx.z : 0 
threadIdx.x : 6, threadIdx.y : 0, threadIdx.z : 0 
threadIdx.x : 7, threadIdx.y : 0, threadIdx.z : 0 
threadIdx.x : 0, threadIdx.y : 1, threadIdx.z : 0 
threadIdx.x : 1, threadIdx.y : 1, threadIdx.z : 0 
threadIdx.x : 2, threadIdx.y : 1, threadIdx.z : 0 
threadIdx.x : 3, threadIdx.y : 1, threadIdx.z : 0 
threadIdx.x : 4, threadIdx.y : 1, threadIdx.z : 0 
threadIdx.x : 5, threadIdx.y : 1, threadIdx.z : 0 
threadIdx.x : 6, threadIdx.y : 1, threadIdx.z : 0 
threadIdx.x : 7, threadIdx.y : 1, threadIdx.z : 0 
threadIdx.x : 0, threadIdx.y : 2, threadIdx.z : 0 
threadIdx.x : 1, threadIdx.y : 2, threadIdx.z : 0 
threadIdx.x : 2, threadIdx.y : 2, threadIdx.z : 0 
threa

**blockIdx, blockDim, gridDim**

In [None]:
%%writefile four.cu

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

// Kernel: each thread prints, in a single printf call, the index of its
// block (blockIdx), the block dimensions (blockDim) and the grid
// dimensions (gridDim). The thread's own index is not printed, so all
// threads of one block emit the same three lines.
__global__ void print_details()
{
    // Local copies of the built-in variables, purely for readability.
    const uint3 blk  = blockIdx;
    const dim3  bdim = blockDim;
    const dim3  gdim = gridDim;

    printf("blockIdx.x : %d, blockIdx.y : %d, blockIdx.z : %d \n"
           "blockDim.x : %d, blockDim.y : %d, blockDim.z : %d \n"
           "gridDim.x: %d, gridDim.y : %d, gridDim.z : %d \n",
           blk.x,  blk.y,  blk.z,
           bdim.x, bdim.y, bdim.z,
           gdim.x, gdim.y, gdim.z);
}

// Host entry point: launches print_details on a 2D grid of 2D blocks
// and reports launch-time and execution-time failures.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main(){
    // Logical 2D extent; used only to derive the grid dimensions below.
    const int nx = 16;
    const int ny = 16;

    const dim3 block(8, 8);                       // 8 x 8 threads per block
    const dim3 grid(nx / block.x, ny / block.y);  // 16/8 = 2 blocks along x and y

    print_details<<<grid, block>>>();

    // A kernel launch returns nothing; launch errors are only visible
    // through cudaGetLastError().
    cudaError_t cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "print_details launch failed: %s\n", cudaGetErrorString(cudaStatus));
        return 1;
    }

    // Wait for the kernel to finish (flushes device printf output) and
    // pick up any error raised while it was running.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "print_details execution failed: %s\n", cudaGetErrorString(cudaStatus));
        cudaDeviceReset();
        return 1;
    }

    cudaDeviceReset();
    return 0;
}

Overwriting four.cu


In [None]:
!nvcc -arch=sm_75 four.cu -o four

In [None]:
!./four

blockIdx.x : 1, blockIdx.y : 0, blockIdx.z : 0 
blockDim.x : 8, blockDim.y : 8, blockDim.z : 1 
gridDim.x: 2, gridDim.y : 2, gridDim.z : 1 
blockIdx.x : 1, blockIdx.y : 0, blockIdx.z : 0 
blockDim.x : 8, blockDim.y : 8, blockDim.z : 1 
gridDim.x: 2, gridDim.y : 2, gridDim.z : 1 
blockIdx.x : 1, blockIdx.y : 0, blockIdx.z : 0 
blockDim.x : 8, blockDim.y : 8, blockDim.z : 1 
gridDim.x: 2, gridDim.y : 2, gridDim.z : 1 
blockIdx.x : 1, blockIdx.y : 0, blockIdx.z : 0 
blockDim.x : 8, blockDim.y : 8, blockDim.z : 1 
gridDim.x: 2, gridDim.y : 2, gridDim.z : 1 
blockIdx.x : 1, blockIdx.y : 0, blockIdx.z : 0 
blockDim.x : 8, blockDim.y : 8, blockDim.z : 1 
gridDim.x: 2, gridDim.y : 2, gridDim.z : 1 
blockIdx.x : 1, blockIdx.y : 0, blockIdx.z : 0 
blockDim.x : 8, blockDim.y : 8, blockDim.z : 1 
gridDim.x: 2, gridDim.y : 2, gridDim.z : 1 
blockIdx.x : 1, blockIdx.y : 0, blockIdx.z : 0 
blockDim.x : 8, blockDim.y : 8, blockDim.z : 1 
gridDim.x: 2, gridDim.y : 2, gridDim.z : 1 
blockIdx.x : 1, bloc

In [None]:
%%writefile five.cu

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>

// Kernel: each thread reads input[threadIdx.x] and prints the index and
// the value. Only the block-local x index is used, so with more than one
// block every block reads the same leading elements of `input`.
//   input : array indexed by threadIdx.x; there is no bounds check, so it
//           must hold at least blockDim.x elements.
__global__ void unique_idx_calc_threadIdx(int * input)
{
    const int local_id = static_cast<int>(threadIdx.x);
    const int element  = input[local_id];

    printf("threadIdx : %d, value : %d \n", local_id, element);
}

// Kernel: computes a global index for a 1D launch as
// blockIdx.x * blockDim.x + threadIdx.x, then prints the block index,
// thread index, global index and input[global index].
//   input : array indexed by the global index; there is no bounds check,
//           so it must hold at least gridDim.x * blockDim.x elements.
// Only the x components are used; y/z dimensions are ignored.
__global__ void unique_gid_calculation(int * input)
{
    const int local_id   = static_cast<int>(threadIdx.x);
    const int block_base = static_cast<int>(blockIdx.x * blockDim.x);  // first index owned by this block
    const int global_id  = block_base + local_id;

    printf("blockIdx.x : %d, threadIdx.x : %d, gid : %d, value : %d \n",
           blockIdx.x, local_id, global_id, input[global_id]);
}

// Host entry point: prints a small host array, copies it to the device,
// and launches unique_gid_calculation so that each thread prints the
// element at its global index.
// Every CUDA call is checked; the first failure is reported on stderr,
// later steps are skipped, and the device buffer is still released.
// Returns 0 on success, 1 if any CUDA step failed.
int main(){
  const int data[] = {13,21,34,51,23,31,19,12,32,91,43,44,54,45,15,29};
  // Derive the sizes from the initializer so they cannot drift apart.
  const int array_size = sizeof(data) / sizeof(data[0]);
  const size_t array_byte_size = sizeof(data);

  for(int i = 0; i < array_size; i++){
    printf("%d ", data[i]);
  }
  printf("\n \n");

  const dim3 block(4);
  // The kernel has no bounds check, so launch only whole blocks that fit
  // inside the array (16 / 4 = 4 blocks here).
  const dim3 grid(array_size / block.x);

  int * d_data = NULL;
  const char * step = "cudaMalloc";  // name of the step being attempted, for the error message
  cudaError_t status = cudaMalloc((void**)&d_data, array_byte_size);

  if (status == cudaSuccess) {
    step = "cudaMemcpy (host to device)";
    status = cudaMemcpy(d_data, data, array_byte_size, cudaMemcpyHostToDevice);
  }

  if (status == cudaSuccess) {
    step = "unique_gid_calculation launch";
    unique_gid_calculation<<<grid, block>>>(d_data);
    // A kernel launch returns nothing; launch errors are only visible
    // through cudaGetLastError().
    status = cudaGetLastError();
  }

  if (status == cudaSuccess) {
    // Wait for the kernel (flushes device printf output) and pick up
    // any error raised while it was running.
    step = "cudaDeviceSynchronize";
    status = cudaDeviceSynchronize();
  }

  if (status != cudaSuccess) {
    fprintf(stderr, "%s failed: %s\n", step, cudaGetErrorString(status));
  }

  cudaFree(d_data);  // no-op when d_data is still NULL
  cudaDeviceReset();
  return status == cudaSuccess ? 0 : 1;
}


Overwriting five.cu


In [None]:
!nvcc -arch=sm_75 five.cu -o five

In [None]:
!./five

13 21 34 51 23 31 19 12 32 91 43 44 54 45 15 29 
 
blockIdx.x : 1, threadIdx.x : 0, gid : 4, value : 23 
blockIdx.x : 1, threadIdx.x : 1, gid : 5, value : 31 
blockIdx.x : 1, threadIdx.x : 2, gid : 6, value : 19 
blockIdx.x : 1, threadIdx.x : 3, gid : 7, value : 12 
blockIdx.x : 0, threadIdx.x : 0, gid : 0, value : 13 
blockIdx.x : 0, threadIdx.x : 1, gid : 1, value : 21 
blockIdx.x : 0, threadIdx.x : 2, gid : 2, value : 34 
blockIdx.x : 0, threadIdx.x : 3, gid : 3, value : 51 
blockIdx.x : 2, threadIdx.x : 0, gid : 8, value : 32 
blockIdx.x : 2, threadIdx.x : 1, gid : 9, value : 91 
blockIdx.x : 2, threadIdx.x : 2, gid : 10, value : 43 
blockIdx.x : 2, threadIdx.x : 3, gid : 11, value : 44 
blockIdx.x : 3, threadIdx.x : 0, gid : 12, value : 54 
blockIdx.x : 3, threadIdx.x : 1, gid : 13, value : 45 
blockIdx.x : 3, threadIdx.x : 2, gid : 14, value : 15 
blockIdx.x : 3, threadIdx.x : 3, gid : 15, value : 29 
