In [49]:
%%file sumArraysOnGPU.cu

#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <stdio.h>
#include <cuda_runtime.h>


#define CHECK(call)                                                            \
{                                                                              \
    const cudaError_t error = call;                                            \
    if (error != cudaSuccess)                                                  \
    {                                                                          \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
                cudaGetErrorString(error));                                    \
        exit(1);                                                               \
    }                                                                          \
}


__global__ void sumArraysOnDevice(float *A, float *B, float *C){
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    C[idx] = A[idx] + B[idx];

}


void initialData(float *ip, int size){
    // generate different seed for random number 
    time_t t;
    srand((unsigned int) time (&t));
    
    for (int i=0; i<size; i++){
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
}


void sumArraysOnHost(float *A, float *B, float *C, const int N){
    for (int idx=0; idx<N; idx++){
        C[idx] = A[idx] + B[idx];
    }
}



void checkResult(float *hostRef, float *gpuRef, const int N){
    double epsilon = 1.0E-8;
    int match = 1;
    for (int i = 0; i < N; i++){
        if (abs(hostRef[i] - gpuRef[i]) > epsilon){
            match = 0;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n",
                   hostRef[i], gpuRef[i], i);
            break;
        }
    }
    if (match) printf("Arrays match. \n\n");
}


int main(int argc, char **argv){
    
    printf("%s Starting...\n", argv[0]);
    
    // malloc host memory
    int nElem = 10000;
    size_t nBytes = nElem * sizeof(float);
    
    
    // initialize data at host side
    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A = (float *)malloc(nBytes);
    h_B = (float *)malloc(nBytes);
    hostRef = (float *)malloc(nBytes);
    gpuRef = (float *)malloc(nBytes);
    
    // initialize data at host side
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);
    
    // malloc device global memory 
    float *d_A, *d_B, *d_C;
    cudaMalloc((float**)&d_A, nBytes);
    cudaMalloc((float**)&d_B, nBytes);
    cudaMalloc((float**)&d_C, nBytes);
    
    // Use cudaMemcpy to transfer the data from the host memory to the GPU global memory with the
    // parameter cudaMemcpyHostToDevice specifying the transfer direction.
    
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));
    
    // invoke kernel at host side
    dim3 block(100);
    dim3 grid(nElem / block.x);
    
    sumArraysOnDevice<<<grid, block>>>(d_A, d_B, d_C);
    printf("Execution configuration <<<%d, %d>>>\n", grid.x, block.x);
    
    // copy kernel result back to host side 
    cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost);
    
    // add vector at host side for result checks
    sumArraysOnHost(h_A, h_B, hostRef, nElem);
    
    for (int i=0; i<10; i++){
         printf("%f + %f = %f \n", h_A[i], h_B[i], hostRef[i]);

    }
    
    // check device results
    checkResult(hostRef, gpuRef, nElem);
    
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);
    
    // use cudaFree to release the memory used on the GPU
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaDeviceReset();
    
    return (0);
}


Overwriting sumArraysOnGPU.cu


In [50]:
%%bash
nvcc sumArraysOnGPU.cu -o addvector
nvprof --unified-memory-profiling off ./addvector


./addvector Starting...
Execution configuration <<<100, 100>>>
17.600000 + 17.600000 = 35.200001 
16.299999 + 16.299999 = 32.599998 
0.600000 + 0.600000 = 1.200000 
23.200001 + 23.200001 = 46.400002 
16.799999 + 16.799999 = 33.599998 
15.600000 + 15.600000 = 31.200001 
2.200000 + 2.200000 = 4.400000 
19.700001 + 19.700001 = 39.400002 
4.300000 + 4.300000 = 8.600000 
3.200000 + 3.200000 = 6.400000 
Arrays match. 



==26284== NVPROF is profiling process 26284, command: ./addvector
==26284== Profiling application: ./addvector
==26284== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   61.39%  41.056us         2  20.528us  19.840us  21.216us  [CUDA memcpy HtoD]
                   32.87%  21.984us         1  21.984us  21.984us  21.984us  [CUDA memcpy DtoH]
                    5.74%  3.8400us         1  3.8400us  3.8400us  3.8400us  sumArraysOnDevice(float*, float*, float*)
      API calls:   67.64%  108.27ms         3  36.090ms  6.2490us  108.25ms  cudaMalloc
                   31.70%  50.742ms         1  50.742ms  50.742ms  50.742ms  cudaDeviceReset
                    0.37%  586.92us        94  6.2430us     177ns  259.83us  cuDeviceGetAttribute
                    0.10%  166.18us         3  55.392us  6.7450us  147.89us  cudaFree
                    0.07%  117.21us         3  39.069us  22.571us  54.160us  cudaMemcpy
               