In [None]:
!nvidia-smi

Fri May 30 06:59:51 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   66C    P8             13W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

**CUDA Device Properties**

In [None]:
%%writefile two.cu

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cstring>

void query_device(){
  int deviceCount = 0;
  cudaError_t error = cudaGetDeviceCount(&deviceCount);

  if (error != cudaSuccess) {
    printf("CUDA Error: %s\n", cudaGetErrorString(error));
    return;
  }

  printf("Number of CUDA devices: %d\n", deviceCount);

  if (deviceCount == 0){
    printf("There are no available CUDA devices.\n");
    printf("Make sure you have:\n");
    printf("1. Enabled GPU runtime in Colab (Runtime -> Change runtime type -> GPU)\n");
    printf("2. Restarted the runtime after enabling GPU\n");
    return;
  }

  for (int devNo = 0; devNo < deviceCount; devNo++) {
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, devNo);

    printf("\nDevice %d : %s \n", devNo, deviceProp.name);
    printf("Number of Multiprocessors : %d\n", deviceProp.multiProcessorCount);
    printf("Compute Capability : %d.%d\n", deviceProp.major, deviceProp.minor);
    printf("Total Global Memory : %.2f GB\n", deviceProp.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
    printf("Max Threads per Block : %d\n", deviceProp.maxThreadsPerBlock);
    printf("Warp Size : %d\n", deviceProp.warpSize);
  }
}

// Entry point of two.cu: prints the CUDA device report and exits with
// a success status.
int main(void){
  query_device();
  return EXIT_SUCCESS;
}

Writing two.cu


In [None]:
!nvcc -arch=sm_75 two.cu -o two

In [None]:
!./two

Number of CUDA devices: 1

Device 0 : Tesla T4 
Number of Multiprocessors : 40
Compute Capability : 7.5
Total Global Memory : 14.74 GB
Max Threads per Block : 1024
Warp Size : 32


**Sum Array Example**

In [None]:
%%writefile common.h
// Array comparison.
//
// Compares the first `size` elements of `a` and `b` and prints the verdict
// to stdout: "Arrays are different" as soon as one mismatching pair is
// found, otherwise "Arrays are same" (also printed when size <= 0, because
// the loop body never runs).
//
// Declared `static inline` because this definition lives in a header: a
// plain external definition would cause multiple-definition link errors
// if the header were included from more than one translation unit.
//
// NOTE: this header does not include <stdio.h> itself; the including file
// must make printf visible before including it.
static inline void compare_arrays(int *a, int *b, int size){
  for(int i=0 ; i<size ; i++){
    if(a[i] != b[i]){
      printf("Arrays are different \n");
      return;
    }
  }
  // Newline-terminated so the verdict is a complete line, like the
  // mismatch message above.
  printf("Arrays are same\n");
}

Writing common.h


In [None]:
%%writefile one.cu

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include <cstring>
#include <time.h>
#include "common.h"

// Element-wise addition kernel: c[i] = a[i] + b[i].
//
// Each thread derives one index from its block and thread coordinates
// along x and handles that single element. Threads whose index is not
// below `size` exit without touching memory.
__global__ void sum_array(int *a, int *b, int *c, int size){
  const int idx = blockDim.x * blockIdx.x + threadIdx.x;

  // Guard clause for the threads past the end of the arrays.
  if(idx >= size){
    return;
  }

  c[idx] = a[idx] + b[idx];
}

// Host-side reference: stores a[k] + b[k] into c[k] for every k in
// [0, size). Does nothing when size <= 0.
void sum_array_cpu(int *a, int *b, int *c, int size){
  int k = 0;
  while(k < size){
    c[k] = a[k] + b[k];
    ++k;
  }
}

int main(){
  int size = 10000;
  int block_size = 128;

  int NO_BYTES = size * sizeof(int);

  //host pointers
  int *h_a, *h_b, *gpu_results, *h_c;

  // allocate memory for host pointers
  h_a = (int*)malloc(NO_BYTES);
  h_b = (int*)malloc(NO_BYTES);
  gpu_results = (int*)malloc(NO_BYTES);
  h_c = (int*)malloc(NO_BYTES);

  // initialize host pointers
  time_t t;
  srand((unsigned) time(&t));
  for(int i=0 ; i<size ; i++){
    h_a[i] = rand();
    h_b[i] = rand();
  }

  memset(gpu_results, 0, NO_BYTES);
  memset(h_c, 0, NO_BYTES);

  sum_array_cpu(h_a, h_b, h_c, size);

  //device pointers
  int *d_a, *d_b, *d_results;
  cudaMalloc((int **)&d_b, NO_BYTES);
  cudaMalloc((int **)&d_a, NO_BYTES);
  cudaMalloc((int **)&d_results, NO_BYTES);

  //memory transfer from host to device
  cudaMemcpy(d_a, h_a, NO_BYTES, cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, h_b, NO_BYTES, cudaMemcpyHostToDevice);

  //launching the grid
  dim3 block(block_size);
  dim3 grid((size + block.x - 1) / block.x);

  //calling the kernel
  sum_array<<<grid, block>>>(d_a, d_b, d_results, size);
  cudaDeviceSynchronize();

  //memory transfer from device to host
  cudaMemcpy(gpu_results, d_results, NO_BYTES, cudaMemcpyDeviceToHost);

  //array comparison
  compare_arrays(gpu_results, h_c, size);

  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_results);

  free(gpu_results);
  free(h_a);
  free(h_b);
  free(h_c);

  return 0;
}

Overwriting one.cu


In [None]:
!nvcc -arch=sm_75 one.cu -o one

In [None]:
!./one

Arrays are same