In [1]:
%%writefile vecadd.cu

//Vector Addition on GPU using CUDA

#include <cstdlib>
#include <cassert>
#include <iostream>

using namespace std;

// Element-wise vector addition kernel: c[i] = a[i] + b[i] for i in [0, N).
// Each thread handles at most one element, selected by its flat index along
// the x dimension of the launch; threads whose index is >= N do nothing.
__global__
void vecAdd(int *a, int *b, int *c, int N){
  const int idx = threadIdx.x + blockDim.x*blockIdx.x;

  // Guard: the launch may contain more threads than there are elements.
  if(idx >= N){
    return;
  }

  c[idx] = a[idx] + b[idx];
}

// Host-side check of the result: asserts that c[i] equals a[i] + b[i] for
// every index i in [0, N), visiting the indices in ascending order.
void verifyAdd(int *a, int *b, int *c, int N){
  int i = 0;
  while(i < N){
    assert(c[i] == a[i] + b[i]);
    ++i;
  }
}

// Fills a[0..N-1] with pseudo-random integers in the range 0-99, drawing one
// rand() value per element in ascending index order.
void initMatrix(int* a, int N){
  int idx = 0;
  while(idx < N){
    const int value = rand() % 100; // number from 0-99
    a[idx] = value;
    ++idx;
  }
}


// Terminates the program with a diagnostic on stderr when a CUDA runtime call
// reports anything other than cudaSuccess. `what` names the failing step.
static void checkCuda(cudaError_t status, const char *what){
  if(status != cudaSuccess){
    std::cerr << "CUDA error during " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    std::exit(EXIT_FAILURE);
  }
}

// Adds two vectors of 1024 random ints on the GPU and verifies the result on
// the host. Returns 0 on success; exits with EXIT_FAILURE if a host
// allocation or any CUDA runtime call fails.
int main(){
  const int N = 1 << 10;
  const size_t sz = static_cast<size_t>(N)*sizeof(int);

  //allocating and initializing vectors
  int *h_a = static_cast<int*>(std::malloc(sz));
  int *h_b = static_cast<int*>(std::malloc(sz));
  int *h_c = static_cast<int*>(std::malloc(sz));
  if(!h_a || !h_b || !h_c){
    std::cerr << "Host allocation of " << sz << " bytes failed" << std::endl;
    // free() accepts null pointers, so releasing all three is safe here.
    std::free(h_a);
    std::free(h_b);
    std::free(h_c);
    return EXIT_FAILURE;
  }

  int *d_a, *d_b, *d_c;
  checkCuda(cudaMalloc(&d_a, sz), "cudaMalloc(d_a)");
  checkCuda(cudaMalloc(&d_b, sz), "cudaMalloc(d_b)");
  checkCuda(cudaMalloc(&d_c, sz), "cudaMalloc(d_c)");

  initMatrix(h_a, N);
  initMatrix(h_b, N);

  checkCuda(cudaMemcpy(d_a, h_a, sz, cudaMemcpyHostToDevice), "copy of a to device");
  checkCuda(cudaMemcpy(d_b, h_b, sz, cudaMemcpyHostToDevice), "copy of b to device");

  // Round the block count up so every element gets a thread even when N is
  // not a multiple of the block size; the kernel guards the excess threads.
  int threads = 256;
  int blocks = (N + threads - 1) / threads;
  dim3 grid_size(blocks, 1, 1);
  dim3 block_size(threads, 1, 1);

  //Launching Kernel
  vecAdd<<<grid_size, block_size>>>(d_a, d_b, d_c, N);
  // A kernel launch returns no status itself; an invalid launch configuration
  // is only visible through cudaGetLastError().
  checkCuda(cudaGetLastError(), "vecAdd launch");
  // Synchronize so that a fault raised while the kernel runs is reported here
  // rather than being attributed to the copy below.
  checkCuda(cudaDeviceSynchronize(), "vecAdd execution");
  checkCuda(cudaMemcpy(h_c, d_c, sz, cudaMemcpyDeviceToHost), "copy of c to host");

  //Verifying the result
  verifyAdd(h_a, h_b, h_c, N);

  // Release device and host buffers.
  checkCuda(cudaFree(d_a), "cudaFree(d_a)");
  checkCuda(cudaFree(d_b), "cudaFree(d_b)");
  checkCuda(cudaFree(d_c), "cudaFree(d_c)");
  std::free(h_a);
  std::free(h_b);
  std::free(h_c);

  return 0;
}

Writing vecadd.cu


In [2]:
!nvcc -o vecadd vecadd.cu

In [3]:
!./vecadd

In [4]:
!nvprof --print-gpu-trace ./vecadd

==389== NVPROF is profiling process 389, command: ./vecadd
==389== Profiling application: ./vecadd
==389== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
408.25ms  2.1120us                    -               -         -         -         -  4.0000KB  1.8062GB/s    Pageable      Device     Tesla T4 (0)         1         7  [CUDA memcpy HtoD]
408.27ms  1.6960us                    -               -         -         -         -  4.0000KB  2.2492GB/s    Pageable      Device     Tesla T4 (0)         1         7  [CUDA memcpy HtoD]
408.30ms  4.9600us              (4 1 1)       (256 1 1)        16        0B        0B         -           -           -           -     Tesla T4 (0)         1         7  vecAdd(int*, int*, int*, int) [117]
408.33ms  2.2710us                    -               -         -         -         -  4.0000KB  1.6797GB/s      Dev