<a href="https://colab.research.google.com/github/aditya-malte/Simple-Cuda-Colab-Demo/blob/master/CUDA_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Select Runtime as GPU

In [0]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sat_Aug_25_21:08:01_CDT_2018
Cuda compilation tools, release 10.0, V10.0.130


In [0]:
code = """
#include <stdio.h>
#include <stdlib.h>
__global__ void add(int *a, int *b, int *c) {
*c = *a + *b;
}
int main() {
int a, b, c;
// host copies of variables a, b & c
int *d_a, *d_b, *d_c;
// device copies of variables a, b & c
int size = sizeof(int);
// Allocate space for device copies of a, b, c
cudaMalloc((void **)&d_a, size);
cudaMalloc((void **)&d_b, size);
cudaMalloc((void **)&d_c, size);
// Setup input values  
c = 0;
a = 3;
b = 3;
// Copy inputs to device
cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);
// Launch add() kernel on GPU
add<<<1,1>>>(d_a, d_b, d_c);
// Copy result back to host
cudaError err = cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);
  if(err!=cudaSuccess) {
      printf("CUDA error copying to Host: %s", cudaGetErrorString(err));
  }
printf("result is %d",c);
// Cleanup
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
return 0;
}
"""


In [0]:
text_file = open("code.cu", "w")
text_file.write(code)
text_file.close()

In [0]:
!nvcc code.cu

In [0]:
!./a.out

result is 6

In [0]:
!nvprof ./a.out

==192== NVPROF is profiling process 192, command: ./a.out
==192== Profiling application: ./a.out
result is 6==192== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   38.91%  3.8720us         2  1.9360us  1.5680us  2.3040us  [CUDA memcpy HtoD]
                   34.73%  3.4560us         1  3.4560us  3.4560us  3.4560us  add(int*, int*, int*)
                   26.37%  2.6240us         1  2.6240us  2.6240us  2.6240us  [CUDA memcpy DtoH]
      API calls:   99.23%  151.50ms         3  50.499ms  6.1530us  151.48ms  cudaMalloc
                    0.33%  496.81us         1  496.81us  496.81us  496.81us  cuDeviceTotalMem
                    0.22%  342.72us        96  3.5700us     149ns  186.62us  cuDeviceGetAttribute
                    0.08%  129.72us         3  43.241us  5.5010us  111.76us  cudaFree
                    0.07%  110.76us         1  110.76us  110.76us  110.76us  cudaLaunchKernel
                    0.04%  64.318