<a href="https://colab.research.google.com/github/aditya-malte/Simple-Cuda-Colab-Demo/blob/master/CUDA_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Select Runtime as GPU

In Colab: Runtime → Change runtime type → Hardware accelerator → GPU.

In [7]:
# Print the version of the CUDA compiler (nvcc) available in this runtime.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sat_Aug_25_21:08:01_CDT_2018
Cuda compilation tools, release 10.0, V10.0.130


In [0]:
# CUDA C++ source for a one-thread "add two ints" demo, kept as a Python string
# so a later cell can write it to disk and compile it with nvcc.
# Raw string: C escape sequences such as \n must reach nvcc untouched instead
# of being expanded by Python first.
code = r"""
#include <stdio.h>
#include <stdlib.h>

// Abort with a readable message when a CUDA runtime call fails.
static void check(cudaError_t err, const char *what) {
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error %s: %s\n", what, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
}

// Kernel: stores *a + *b into *c.
__global__ void add(int *a, int *b, int *c) {
  *c = *a + *b;
}

int main() {
  // Host copies of variables a, b & c
  int a = 3, b = 3, c = 0;
  // Device copies of variables a, b & c
  int *d_a = NULL, *d_b = NULL, *d_c = NULL;
  const size_t size = sizeof(int);

  // Allocate space for device copies of a, b, c
  check(cudaMalloc((void **)&d_a, size), "allocating d_a");
  check(cudaMalloc((void **)&d_b, size), "allocating d_b");
  check(cudaMalloc((void **)&d_c, size), "allocating d_c");

  // Copy inputs to device
  check(cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice), "copying a to Device");
  check(cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice), "copying b to Device");

  // Launch add() kernel on GPU (one block, one thread)
  add<<<1,1>>>(d_a, d_b, d_c);
  // The launch itself returns nothing, so a failed launch is read back here.
  check(cudaGetLastError(), "launching add()");

  // Copy result back to host; stop here on failure instead of printing a
  // result that was never written.
  check(cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost), "copying to Host");

  // Trailing newline keeps the result on its own line when other tools
  // (e.g. nvprof) write to the same terminal.
  printf("result is %d\n", c);

  // Cleanup
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  return 0;
}
"""


In [0]:
# Write the CUDA source held in `code` to code.cu so nvcc can compile it.
# The context manager closes the file even if the write raises.
with open("code.cu", "w") as text_file:
    text_file.write(code)

In [0]:
# Compile code.cu with nvcc. No -o is given; the next cell runs the
# resulting binary as ./a.out.
!nvcc code.cu

In [11]:
# Run the compiled program; it prints the sum computed by the add() kernel.
!./a.out

result is 6

In [12]:
# Run the same binary under nvprof to get a timing breakdown of GPU
# activities (kernel, memcpy) and CUDA API calls.
!nvprof ./a.out

==265== NVPROF is profiling process 265, command: ./a.out
==265== Profiling application: ./a.out
result is 6==265== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   38.96%  3.8400us         2  1.9200us  1.5680us  2.2720us  [CUDA memcpy HtoD]
                   35.06%  3.4560us         1  3.4560us  3.4560us  3.4560us  add(int*, int*, int*)
                   25.97%  2.5600us         1  2.5600us  2.5600us  2.5600us  [CUDA memcpy DtoH]
      API calls:   99.06%  134.38ms         3  44.792ms  8.0550us  134.34ms  cudaMalloc
                    0.42%  566.34us         1  566.34us  566.34us  566.34us  cuDeviceTotalMem
                    0.25%  337.35us        96  3.5140us     150ns  158.52us  cuDeviceGetAttribute
                    0.10%  138.37us         3  46.122us  4.8830us  116.30us  cudaFree
                    0.10%  132.80us         1  132.80us  132.80us  132.80us  cudaLaunchKernel
                    0.04%  59.923