In [1]:
%%file sumArraysOnHost.c

#include <stdlib.h>
#include <string.h>
#include <time.h>

void sumArraysOnHost(float *A, float *B, float *C, const int N){
    for (int idx=0; idx<N; idx++){
        C[idx] = A[idx] + B[idx];
    }
}


void initialData(float *ip, int size){
    // generate different seed for random number 
    time_t t;
    srand((unsigned int) time (&t));
    
    for (int i=0; i<size; i++){
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
}


int main(int argc, char **argv){
    int nElem = 1024;
    size_t nBytes = nElem * sizeof(float);
    
    float *h_A, *h_B, *h_C;
    h_A = (float *)malloc(nBytes);
    h_B = (float *)malloc(nBytes);
    h_C = (float *)malloc(nBytes);
    
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    
    sumArraysOnHost(h_A, h_B, h_C, nElem);
    
    free(h_A);
    free(h_B);
    free(h_C);
    
    return (0);
}

Overwriting sumArraysOnHost.c


In [2]:
!nvcc -Xcompiler -std=c99 sumArraysOnHost.c -o sum



In [3]:
!./sum

In [4]:
!nvprof ./sum

In [20]:
%%file sumArraysOnDevice.cu

#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <stdio.h>

__global__ void sumArraysOnDevice(float *A, float *B, float *C){
    int idx = threadIdx.x;
    C[idx] = A[idx] + B[idx];

}


void initialData(float *ip, int size){
    // generate different seed for random number 
    time_t t;
    srand((unsigned int) time (&t));
    
    for (int i=0; i<size; i++){
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
}


void sumArraysOnHost(float *A, float *B, float *C, const int N){
    for (int idx=0; idx<N; idx++){
        C[idx] = A[idx] + B[idx];
    }
}



void checkResult(float *h_C, float *result, const int N){
    double epsilon = 1.0E-8;
    int match = 1;
    for (int i = 0; i < N; i++){
        if (abs(h_C[i] - result[i]) > epsilon){
            match = 0;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n",
                   h_C[i], result[i], i);
            break;
        }
    }
    if (match) printf("Arrays match. \n\n");
}


int main(int argc, char **argv){
    int nElem = 1024;
    size_t nBytes = nElem * sizeof(float);
    
    float *h_A, *h_B, *h_C, *result;
    h_A = (float *)malloc(nBytes);
    h_B = (float *)malloc(nBytes);
    h_C = (float *)malloc(nBytes);
    result = (float *)malloc(nBytes);
    
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    
    float *d_A, *d_B, *d_C;
    cudaMalloc((float**)&d_A, nBytes);
    cudaMalloc((float**)&d_B, nBytes);
    cudaMalloc((float**)&d_C, nBytes);
    
    // Use cudaMemcpy to transfer the data from the host memory to the GPU global memory with the
    // parameter cudaMemcpyHostToDevice specifying the transfer direction.
    
    cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);
    
    
    
    sumArraysOnDevice<<<1, nElem>>>(d_A, d_B, d_C);
    sumArraysOnHost(h_A, h_B, result, nElem);
    
    cudaMemcpy(h_C, d_C, nBytes, cudaMemcpyDeviceToHost);
    
    for (int i=0; i<10; i++){
         printf("%f + %f = %f \n", h_A[i], h_B[i], h_C[i]);

    }
    
    checkResult(h_C, result, nElem);
    
    free(h_A);
    free(h_B);
    free(h_C);
    free(result);
    
    // use cudaFree to release the memory used on the GPU
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaDeviceReset();
    
    return (0);
}

Overwriting sumArraysOnDevice.cu


In [21]:
%%bash
nvcc sumArraysOnDevice.cu -o sumgpu
./sumgpu

21.600000 + 21.600000 = 43.200001 
12.200000 + 12.200000 = 24.400000 
3.300000 + 3.300000 = 6.600000 
6.400000 + 6.400000 = 12.800000 
8.600000 + 8.600000 = 17.200001 
11.400000 + 11.400000 = 22.799999 
23.299999 + 23.299999 = 46.599998 
2.700000 + 2.700000 = 5.400000 
2.600000 + 2.600000 = 5.200000 
24.100000 + 24.100000 = 48.200001 
Arrays match. 



In [11]:
!nvprof --unified-memory-profiling off ./sumgpu

==12294== NVPROF is profiling process 12294, command: ./sumgpu
11.300000 + 11.300000 = 22.600000 
23.200001 + 23.200001 = 0.000000 
23.500000 + 23.500000 = 0.000000 
21.500000 + 21.500000 = 0.000000 
16.700001 + 16.700001 = 0.000000 
23.000000 + 23.000000 = 0.000000 
5.900000 + 5.900000 = 0.000000 
3.200000 + 3.200000 = 0.000000 
13.900000 + 13.900000 = 0.000000 
8.200000 + 8.200000 = 0.000000 
Arrays do not match!
host  0.00 gpu 46.40 at current 1
==12294== Profiling application: ./sumgpu
==12294== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   38.31%  3.7760us         1  3.7760us  3.7760us  3.7760us  sumArraysOnDevice(float*, float*, float*)
                   37.34%  3.6800us         2  1.8400us  1.8240us  1.8560us  [CUDA memcpy HtoD]
                   24.35%  2.4000us         1  2.4000us  2.4000us  2.4000us  [CUDA memcpy DtoH]
      API calls:   70.29%  116.93ms         3  38.978ms  4.1810us  116.92ms  cudaMal