<a href="https://colab.research.google.com/github/awanm2/osprey/blob/main/gpu/jupyter/vector_add_basic_001.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This is an example from Chapter 2 of *Programming Massively Parallel Processors* (3rd edition).
Compile the code and run it.

In [1]:

%%writefile prog_vector_add.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// Kernel: element-wise vector addition, C[k] = A[k] + B[k] for 0 <= k < n.
//
// Each thread handles at most one element, selected by its flattened 1-D
// index (blockIdx.x * blockDim.x + threadIdx.x). The launch therefore has to
// provide at least n threads in total along x for every element of C to be
// written; surplus threads exit without touching memory.
//
// A, B and C are dereferenced inside the kernel, so they must point to memory
// the device can access and that holds at least n floats each.
__global__ void vecAddKernel(float* A, float* B, float* C, int n)
{
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;

  // Threads that fall past the end of the arrays have nothing to do.
  if (idx >= n) {
    return;
  }

  C[idx] = A[idx] + B[idx];
}

// Reports a failed CUDA runtime call on stderr.
// Returns 1 when status is cudaSuccess, 0 otherwise.
static int vecAddCheck(cudaError_t status, const char* what)
{
  if (status == cudaSuccess) {
    return 1;
  }
  fprintf(stderr, "vecAdd: %s failed: %s\n", what, cudaGetErrorString(status));
  return 0;
}

// Wrapper around the kernel: computes C[i] = A[i] + B[i] for 0 <= i < n.
//
// A, B : host input arrays holding at least n floats each.
// C    : host output array with room for at least n floats.
// n    : number of elements; n <= 0 is a no-op.
//
// The inputs are copied to freshly allocated device buffers, vecAddKernel is
// launched with 256 threads per block and enough blocks to cover n, and the
// result is copied back into C. All device buffers are released before
// returning, including on failure.
//
// If any CUDA call fails, a message is written to stderr, the remaining steps
// are skipped, and the contents of C must not be relied upon.
void vecAdd(float* A, float* B, float* C, int n)
{
  // Nothing to add; also avoids launching a grid with zero blocks.
  if (n <= 0) {
    return;
  }

  // Widen before multiplying so the byte count is computed in size_t.
  const size_t size = (size_t)n * sizeof(float);

  const int threadsPerBlock = 256;
  // Integer ceil-div; written this way so it cannot overflow for large n.
  const int numBlocks = (n - 1) / threadsPerBlock + 1;

  float *d_A = NULL;
  float *d_B = NULL;
  float *d_C = NULL;

  // && short-circuits, so the first failure stops the remaining setup steps.
  int ok = vecAddCheck(cudaMalloc((void**) &d_A, size), "cudaMalloc(d_A)")
        && vecAddCheck(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice),
                       "cudaMemcpy(A -> d_A)")
        && vecAddCheck(cudaMalloc((void**) &d_B, size), "cudaMalloc(d_B)")
        && vecAddCheck(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice),
                       "cudaMemcpy(B -> d_B)")
        && vecAddCheck(cudaMalloc((void**) &d_C, size), "cudaMalloc(d_C)");

  if (ok) {
    vecAddKernel<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, n);

    // A kernel launch returns nothing itself: launch-configuration errors are
    // read back with cudaGetLastError(); the copy back to the host then
    // reports any error raised while the kernel was running.
    ok = vecAddCheck(cudaGetLastError(), "vecAddKernel launch")
      && vecAddCheck(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost),
                     "cudaMemcpy(d_C -> C)");
  }

  // Pointers that were never allocated are still NULL here; cudaFree is
  // documented to perform no operation for a null pointer.
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);

}

// Fills Arr[0 .. n-1] with pseudo-random floats taken from rand().
//
// Every element is rand() divided by (RAND_MAX / 1000), i.e. the raw rand()
// value rescaled so that RAND_MAX maps to roughly 1000. The sequence depends
// on the global rand() state, so seed it with srand() beforehand if needed.
// Nothing is written when n <= 0.
void fill_random_floats(float* Arr, int n){
  const float upper_bound = 1000.0;
  // The divisor does not depend on the element, so compute it once.
  const float divisor = (float)(RAND_MAX/upper_bound);

  int k = 0;
  while (k < n) {
    const float sample = (float)rand();
    Arr[k] = sample/divisor;
    ++k;
  }
}


// Driver: builds two random host vectors of n floats, adds them through
// vecAdd, and compares every element of the result with the host-side sum.
//
// Returns EXIT_SUCCESS when all elements match, EXIT_FAILURE when a host
// allocation fails or the first mismatching element is found.
int main()
{
     // Seed rand() from the wall clock so each run uses different inputs.
     srand((unsigned) time(NULL));
     printf("%d\n", rand() % 50);

     const int n = 6000;

     // A and B are fully overwritten by fill_random_floats below, so only C
     // is zero-initialised: it then holds a known value if vecAdd fails.
     float* A = (float*) malloc(sizeof(float) * n);
     float* B = (float*) malloc(sizeof(float) * n);
     float* C = (float*) calloc(n, sizeof(float));

     if (A == NULL || B == NULL || C == NULL) {
       fprintf(stderr, "main: host allocation of %d floats failed\n", n);
       // free(NULL) is a no-op, so this is safe whichever allocation failed.
       free(A);
       free(B);
       free(C);
       return EXIT_FAILURE;
     }

     fill_random_floats(A, n);
     fill_random_floats(B, n);

     vecAdd(A, B, C,  n);

     // Check with exact equality -- not the best floating point compare; a
     // tolerance-based comparison would be needed for anything beyond a
     // single addition per element.
     int status = EXIT_SUCCESS;
     for (int i = 0; i < n; i++)
     {
       const float expected = A[i] + B[i];
       if (expected != C[i]) {
         // Report only the first mismatch and signal failure to the caller.
         printf("A:%f, B:%f C:%f\n", A[i], B[i], C[i]);
         status = EXIT_FAILURE;
         break;
       }
     }

     free(A);
     free(B);
     free(C);

  return status;
}




Writing prog_vector_add.cu


In [2]:
!nvcc -I /usr/local/cuda/samples/common/inc/ -L/usr/local/cuda/include -lcublas -lcusolver -Wno-deprecated-gpu-targets prog_vector_add.cu

Run it


In [4]:
!./a.out

46
