<a href="https://colab.research.google.com/github/mmmovania/CUDA_Spring_2024/blob/main/Week8/SumArrayZeroCopy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc4jupyter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-lk8apgq1
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-lk8apgq1
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4304 sha256=8aadf444aa20ca2ab1b958ad5bbf89b193bce828f4b7b04ed39354ef887e155e
  Stored in directory: /tmp/pip-ephem-wheel-cache-w5urbnkv/wheels/f3/08/cc/e2b5b0e1c92df07dbb50a6f024a68ce090f5e7b2316b41756d
Successfully built NVCCPlugin
Installing collecte

In [None]:
%%cuda
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Reports a failed CUDA runtime call on stderr and passes the status through.
//   err: status code returned by the CUDA call being checked
//   msg: label identifying the call site, included in the diagnostic
// Returns `err` unchanged so the caller can still branch on it.
inline cudaError_t checkCudaErr(cudaError_t err, const char* msg) {
  // Nothing to report on success.
  if (err == cudaSuccess) {
    return err;
  }

  const char* description = cudaGetErrorString(err);
  fprintf(stderr, "CUDA Runtime error at %s: %s\n", msg, description);
  return err;
}
// Compares a host-computed reference array against the array copied back
// from the GPU and prints the verdict to stdout.
//   hostRef: reference values computed on the CPU
//   gpuRef:  values produced by the kernel
//   N:       number of elements to compare
// Prints the first mismatching element (if any) and stops there; otherwise
// prints "Results match !!!".
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    const double epsilon = 1.0E-8;

    for (int i = 0; i < N; i++)
    {
        // Use the floating-point fabs explicitly: an unqualified abs() can
        // bind to the integer overload and truncate small differences to 0.
        const double diff = fabs((double)hostRef[i] - (double)gpuRef[i]);

        // Written as !(diff <= epsilon) so that a NaN difference, for which
        // every ordered comparison is false, is reported as a mismatch.
        if (!(diff <= epsilon))
        {
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i],
                    gpuRef[i], i);
            return;
        }
    }

    printf("Results match !!!\n");
    return;
}

// Fills the first `size` entries of `ip` with pseudo-random values.
// Each value is the low byte of rand() divided by 10, i.e. in [0.0, 25.5].
void initialData(float *ip, int size)
{
    int k = 0;

    while (k < size)
    {
        // Keep only the low 8 bits of the random number, then scale down.
        const int lowByte = rand() & 0xFF;
        ip[k] = (float)lowByte / 10.0f;
        ++k;
    }
}

// CPU reference implementation of element-wise addition: C[k] = A[k] + B[k]
// for the first N elements. Used to validate the GPU results.
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    int k = 0;

    while (k < N)
    {
        const float lhs = A[k];
        const float rhs = B[k];
        C[k] = lhs + rhs;
        ++k;
    }
}

// Element-wise vector addition kernel: C[tid] = A[tid] + B[tid].
// Each thread handles the single element at its flat 1D global index
// (blockIdx.x * blockDim.x + threadIdx.x); threads whose index is >= N
// do nothing, so the grid may be larger than the data.
__global__ void sumArrays(float *A, float *B, float *C, const int N)
{
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;

    // Threads past the end of the arrays have no work.
    if (tid >= N) return;

    C[tid] = A[tid] + B[tid];
}

// Element-wise vector addition kernel with the same body as sumArrays;
// main() launches it with A and B obtained from cudaHostGetDevicePointer.
// One element per thread at flat 1D index blockIdx.x * blockDim.x +
// threadIdx.x; indices >= N are skipped.
__global__ void sumArraysZeroCopy(float *A, float *B, float *C, const int N)
{
    const int elem = threadIdx.x + blockDim.x * blockIdx.x;
    const bool inRange = elem < N;

    if (!inRange) return;

    const float lhs = A[elem];
    const float rhs = B[elem];
    C[elem] = lhs + rhs;
}

// Adds two float vectors on the GPU twice and validates each result against
// a CPU reference:
//   part 1: inputs live in device memory (cudaMalloc + cudaMemcpy);
//   part 2: inputs live in mapped host memory (cudaHostAlloc with
//           cudaHostAllocMapped) and are read by the kernel through the
//           pointers returned by cudaHostGetDevicePointer.
// Optional argv[1]: power of two for the vector length (default 10, valid
// range [0, 30]).
// Returns EXIT_SUCCESS when the run completes, EXIT_FAILURE when the device
// cannot be set up, the requested size is invalid, or an allocation fails.
int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    if (checkCudaErr(cudaSetDevice(dev), "cudaSetDevice") != cudaSuccess)
    {
        return EXIT_FAILURE;
    }

    // get device properties
    cudaDeviceProp deviceProp;
    if (checkCudaErr(cudaGetDeviceProperties(&deviceProp, dev), "cudaGetDeviceProperties") != cudaSuccess)
    {
        // deviceProp is not valid here, so it must not be inspected.
        return EXIT_FAILURE;
    }

    // check if support mapped memory
    if (!deviceProp.canMapHostMemory)
    {
        printf("Device %d does not support mapping CPU host memory!\n", dev);
        checkCudaErr(cudaDeviceReset(), "cudaDeviceReset");
        exit(EXIT_SUCCESS);
    }

    printf("Using Device %d: %s ", dev, deviceProp.name);

    // set up data size of vectors
    int ipower = 10;

    if (argc > 1) ipower = atoi(argv[1]);

    // 1 << ipower is only well defined for a non-negative shift that stays
    // below the sign bit of a 32-bit int.
    if (ipower < 0 || ipower > 30)
    {
        fprintf(stderr, "\nInvalid power %d: expected a value in [0, 30]\n", ipower);
        checkCudaErr(cudaDeviceReset(), "cudaDeviceReset");
        return EXIT_FAILURE;
    }

    int nElem = 1 << ipower;
    size_t nBytes = (size_t)nElem * sizeof(float);

    if (ipower < 18)
    {
        printf("Vector size %d power %d  nbytes  %3.0f KB\n", nElem, ipower,
               (float)nBytes / (1024.0f));
    }
    else
    {
        printf("Vector size %d power %d  nbytes  %3.0f MB\n", nElem, ipower,
               (float)nBytes / (1024.0f * 1024.0f));
    }

    // part 1: using device memory
    // malloc host memory
    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A     = (float *)malloc(nBytes);
    h_B     = (float *)malloc(nBytes);
    hostRef = (float *)malloc(nBytes);
    gpuRef  = (float *)malloc(nBytes);

    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", nBytes);
        // free(NULL) is a no-op, so every pointer can be released blindly.
        free(h_A);
        free(h_B);
        free(hostRef);
        free(gpuRef);
        checkCudaErr(cudaDeviceReset(), "cudaDeviceReset");
        return EXIT_FAILURE;
    }

    // initialize data at host side
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    memset(hostRef, 0, nBytes);
    memset(gpuRef,  0, nBytes);

    // add vector at host side for result checks
    sumArraysOnHost(h_A, h_B, hostRef, nElem);

    // malloc device global memory
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    if (checkCudaErr(cudaMalloc((float**)&d_A, nBytes), "cudaMalloc1") != cudaSuccess ||
        checkCudaErr(cudaMalloc((float**)&d_B, nBytes), "cudaMalloc2") != cudaSuccess ||
        checkCudaErr(cudaMalloc((float**)&d_C, nBytes), "cudaMalloc3") != cudaSuccess)
    {
        // Without all three buffers the kernels cannot run; release the host
        // side and let cudaDeviceReset clean up whatever was allocated.
        free(h_A);
        free(h_B);
        free(hostRef);
        free(gpuRef);
        checkCudaErr(cudaDeviceReset(), "cudaDeviceReset");
        return EXIT_FAILURE;
    }

    // transfer data from host to device
    checkCudaErr(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice), "cudaMemcpy");
    checkCudaErr(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice), "cudaMemcpy");

    // set up execution configuration
    int iLen = 512;
    dim3 block (iLen);
    dim3 grid  ((nElem + block.x - 1) / block.x);

    sumArrays<<<grid, block>>>(d_A, d_B, d_C, nElem);
    // A kernel launch returns no status; query it explicitly so a bad
    // launch configuration does not go unnoticed.
    checkCudaErr(cudaGetLastError(), "sumArrays launch");

    // copy kernel result back to host side
    checkCudaErr(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost), "cudaMemcpy");

    // check device results
    checkResult(hostRef, gpuRef, nElem);

    // free device global memory
    checkCudaErr(cudaFree(d_A),"cudaFree");
    checkCudaErr(cudaFree(d_B),"cudaFree");
    d_A = NULL;
    d_B = NULL;

    // free host memory
    free(h_A);
    free(h_B);
    h_A = NULL;
    h_B = NULL;

    // part 2: using zerocopy memory for array A and B
    // allocate zerocpy memory
    if (checkCudaErr(cudaHostAlloc((void **)&h_A, nBytes, cudaHostAllocMapped),"cudaHostAlloc1") != cudaSuccess ||
        checkCudaErr(cudaHostAlloc((void **)&h_B, nBytes, cudaHostAllocMapped),"cudaHostAlloc2") != cudaSuccess)
    {
        // The host arrays are unusable; stop before writing through them.
        free(hostRef);
        free(gpuRef);
        checkCudaErr(cudaDeviceReset(), "cudaDeviceReset");
        return EXIT_FAILURE;
    }

    // initialize data at host side
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    memset(hostRef, 0, nBytes);
    memset(gpuRef,  0, nBytes);

    // pass the pointer to device
    if (checkCudaErr(cudaHostGetDevicePointer((void **)&d_A, (void *)h_A, 0), "cudaHostGetDevicePointer1") != cudaSuccess ||
        checkCudaErr(cudaHostGetDevicePointer((void **)&d_B, (void *)h_B, 0), "cudaHostGetDevicePointer2") != cudaSuccess)
    {
        // Without valid device-side aliases the kernel must not be launched.
        free(hostRef);
        free(gpuRef);
        checkCudaErr(cudaDeviceReset(), "cudaDeviceReset");
        return EXIT_FAILURE;
    }

    // add at host side for result checks
    sumArraysOnHost(h_A, h_B, hostRef, nElem);

    // execute kernel with zero copy memory
    sumArraysZeroCopy<<<grid, block>>>(d_A, d_B, d_C, nElem);
    checkCudaErr(cudaGetLastError(), "sumArraysZeroCopy launch");

    // copy kernel result back to host side
    checkCudaErr(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost), "cudaMemcpy");

    // check device results
    checkResult(hostRef, gpuRef, nElem);

    // free  memory
    checkCudaErr(cudaFree(d_C), "cudaFree1");
    checkCudaErr(cudaFreeHost(h_A), "cudaFree2");
    checkCudaErr(cudaFreeHost(h_B), "cudaFree3");

    free(hostRef);
    free(gpuRef);

    // reset device
    checkCudaErr(cudaDeviceReset(), "cudaDeviceReset");
    return EXIT_SUCCESS;
}

Using Device 0: Tesla T4 Vector size 1024 power 10  nbytes    4 KB
Results match !!!
Results match !!!

