<a href="https://colab.research.google.com/github/YKochura/ac-kpi/blob/main/tutor/lec2/vectAdd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# List the installed pip packages whose name contains "Keras".
!pip list | grep Keras


Keras-Preprocessing           1.1.2


In [None]:
# List the installed pip packages whose name contains "tensorflow".
!pip list | grep tensorflow

tensorflow                    2.7.0
tensorflow-datasets           4.0.1
tensorflow-estimator          2.7.0
tensorflow-gcs-config         2.7.0
tensorflow-hub                0.12.0
tensorflow-io-gcs-filesystem  0.24.0
tensorflow-metadata           1.6.0
tensorflow-probability        0.15.0


In [None]:
# Print every line of cudnn.h that mentions CUDNN_MAJOR plus the two lines
# following it, which shows the cuDNN major/minor/patch version defines.
!cat /usr/include/cudnn.h | grep CUDNN_MAJOR -A 2

#define CUDNN_MAJOR 7
#define CUDNN_MINOR 6
#define CUDNN_PATCHLEVEL 5
--
#define CUDNN_VERSION (CUDNN_MAJOR * 1000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL)

#include "driver_types.h"


In [None]:
# Ask TensorFlow whether the installed build was compiled with CUDA support;
# the bare last expression is displayed as the cell output.
import tensorflow as tf
tf.test.is_built_with_cuda()

True

In [None]:
%%writefile vecAdd.c

#include <stdio.h>
#include <stdlib.h>

// Store num in each of the first n elements of a.
// When n is zero or negative nothing is written.
void initWith(float num, float *a, int n)
{
  int idx = 0;
  while(idx < n)
  {
    a[idx] = num;
    idx++;
  }
}

// Element-wise sum on the host: result[k] = a[k] + b[k] for k in [0, n).
// Elements are processed in ascending index order.
void addVectorsInto(float *result, float *a, float *b, int n)
{
  int k = 0;
  while(k < n)
  {
    const float sum = a[k] + b[k];
    result[k] = sum;
    ++k;
  }
}

// Verify that each of the first n entries of array equals target.
// The first mismatch prints a FAIL line to stdout and terminates the
// process with exit status 1; if there is none, a SUCCESS line is printed.
void checkElementsAre(float target, float *array, int n)
{
  int pos = 0;
  while(pos < n)
  {
    const float value = array[pos];
    if(value != target)
    {
      printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", pos, value, target);
      exit(1);
    }
    pos++;
  }
  printf("SUCCESS! All values added correctly.\n");
}

// Host-only vector addition: allocate three float vectors, fill a with 3,
// b with 4 and c with 0, add a and b into c, and verify every element of c
// equals 7. Returns 0 on success and EXIT_FAILURE when an allocation fails;
// checkElementsAre itself exits with status 1 on a wrong element.
int main()
{
  // https://www.geeksforgeeks.org/left-shift-right-shift-operators-c-cpp/
  // 2<<20 == 2 * 2^20 == 2097152 elements per vector.
  const int N = 2<<20;
  size_t size = N * sizeof(float);

  float *a = (float *)malloc(size);
  float *b = (float *)malloc(size);
  float *c = (float *)malloc(size);

  // malloc returns NULL when the request cannot be satisfied; writing through
  // such a pointer in initWith would be undefined behavior, so bail out early.
  if(a == NULL || b == NULL || c == NULL)
  {
    fprintf(stderr, "FAIL: could not allocate %zu bytes per vector\n", size);
    // free(NULL) is a no-op, so unconditional cleanup is safe here.
    free(a);
    free(b);
    free(c);
    return EXIT_FAILURE;
  }

  initWith(3, a, N);
  initWith(4, b, N);
  initWith(0, c, N);

  addVectorsInto(c, a, b, N);

  checkElementsAre(7, c, N);

  free(a);
  free(b);
  free(c);

  return 0;
}

Writing vecAdd.c


In [None]:
%%shell

# Compile the CPU version with gcc into ./vecAdd, then run the binary.
gcc vecAdd.c -o vecAdd
./vecAdd

SUCCESS! All values added correctly.




## GPU

In [None]:
%%writefile vecAdd.cu

#include <stdio.h>
#include <assert.h>

// Pass-through wrapper for CUDA runtime calls. Any status other than
// cudaSuccess is reported on stderr and then trips an assert (which is
// compiled out when NDEBUG is defined). The status is returned in all cases.
inline cudaError_t checkCuda(cudaError_t result)
{
  if (result == cudaSuccess) {
    return result;
  }

  fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
  assert(result == cudaSuccess);
  return result;
}

// Host-side fill: write num into each of the first n slots of a.
// Nothing is written when n is zero or negative.
void initWith(float num, float *a, int n)
{
  int slot = 0;
  while(slot < n)
  {
    a[slot++] = num;
  }
}

// Kernel: each thread starts at its global thread index
// (threadIdx.x + blockIdx.x * blockDim.x) and advances by the total number
// of launched threads (blockDim.x * gridDim.x), storing
// result[i] = a[i] + b[i] for every index i below n that it visits.
__global__
void addVectorsInto(float *result, float *a, float *b, int n)
{
  const int step = blockDim.x * gridDim.x;
  int pos = threadIdx.x + blockIdx.x * blockDim.x;

  while(pos < n)
  {
    result[pos] = a[pos] + b[pos];
    pos += step;
  }
}

// Host-side verification of the first n entries of array against target.
// Prints a FAIL line and exits with status 1 at the first entry that does
// not compare equal to target; prints a SUCCESS line when all entries match.
void checkElementsAre(float target, float *array, int n)
{
  for(int k = 0; k < n; ++k)
  {
    if(array[k] == target)
    {
      continue;
    }

    printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", k, array[k], target);
    exit(1);
  }
  printf("SUCCESS! All values added correctly.\n");
}

// GPU vector addition driver: allocates three vectors with
// cudaMallocManaged, fills them on the host (a = 3, b = 4, c = 0), launches
// the addVectorsInto kernel, then checks on the host that every element of
// c equals 7 before releasing the buffers. Every CUDA runtime call is routed
// through checkCuda.
int main()
{
  // 2<<20 == 2 * 2^20 == 2097152 elements per vector.
  const int N = 2<<20;
  const size_t size = N * sizeof(float);

  float *a, *b, *c;

  checkCuda( cudaMallocManaged(&a, size) );
  checkCuda( cudaMallocManaged(&b, size) );
  checkCuda( cudaMallocManaged(&c, size) );

  initWith(3, a, N);
  initWith(4, b, N);
  initWith(0, c, N);

  // Launch configuration: 256 threads per block and enough blocks
  // (rounded up) to provide at least one thread per element.
  const size_t blockSize = 256;
  const size_t gridSize = (N + blockSize - 1) / blockSize;

  addVectorsInto<<<gridSize, blockSize>>>(c, a, b, N);

  // First check the launch itself, then wait for the kernel to finish so
  // that the host-side check below reads the completed result.
  checkCuda( cudaGetLastError() );
  checkCuda( cudaDeviceSynchronize() );

  checkElementsAre(7, c, N);

  checkCuda( cudaFree(a) );
  checkCuda( cudaFree(b) );
  checkCuda( cudaFree(c) );
}

Writing vecAdd.cu


In [None]:
%%shell

# Compile the CUDA version with nvcc into ./vecAddKernel, then run it under
# nvprof to print a timing summary of GPU activities and CUDA API calls.
nvcc vecAdd.cu -o vecAddKernel
nvprof ./vecAddKernel

==181== NVPROF is profiling process 181, command: ./vecAddKernel
SUCCESS! All values added correctly.
==181== Profiling application: ./vecAddKernel
==181== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  9.2349ms         1  9.2349ms  9.2349ms  9.2349ms  addVectorsInto(float*, float*, float*, int)
      API calls:   97.06%  421.00ms         3  140.33ms  9.8920us  420.95ms  cudaMallocManaged
                    2.13%  9.2446ms         1  9.2446ms  9.2446ms  9.2446ms  cudaDeviceSynchronize
                    0.51%  2.1943ms         3  731.43us  474.00us  1.1654ms  cudaFree
                    0.15%  666.49us         1  666.49us  666.49us  666.49us  cuDeviceGetPCIBusId
                    0.08%  357.71us         1  357.71us  357.71us  357.71us  cuDeviceTotalMem
                    0.04%  187.23us       101  1.8530us     138ns  75.473us  cuDeviceGetAttribute
                    0.01%  53.108us         1  53.108u

