<a href="https://colab.research.google.com/github/Yugesh299/Getting-started-with-CUDA-on-Google-Colab/blob/master/CUDA_By_Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CUDA Installation Commands**

In [1]:
!apt-get update;
!wget https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda-repo-ubuntu1604-10-0-local-10.0.130-410.48_1.0-1_amd64 -O cuda-repo-ubuntu1604-10-0-local-10.0.130-410.48_1.0-1_amd64.deb
!dpkg -i cuda-repo-ubuntu1604-10-0-local-10.0.130-410.48_1.0-1_amd64.deb
!apt-key add /var/cuda-repo-10-0-local/7fa2af80.pub
!apt-get update
!apt-get -y install gcc-7 g++-7
!apt-get -y install cuda

0% [Working]            Ign:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [Waiting for headers] [Waiting for headers] [Waiting for headers] [Waiting f                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/ InRelease [3,626 B]
0% [Waiting for headers] [Waiting for headers] [2 InRelease 3,626 B/3,626 B 1000% [Waiting for headers] [Waiting for headers] [Waiting for headers] [Waiting f0% [2 InRelease gpgv 3,626 B] [Waiting for headers] [Waiting for headers] [Wait                                                                               Hit:3 http://archive.ubuntu.com/ubuntu bionic InRelease
                                                                               Get:4 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [2 InRelease gpgv 3,626 B] [Waiting for headers] [4 InRelease 14.2 kB/88.7 k                   

In [2]:
!export PATH=/usr/local/cuda/bin${PATH:+:${PATH}}
!export LD_LIBRARY_PATH=/usr/local/cuda/lib64\${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}

Check if CUDA is installed sucessfully

In [3]:
!/usr/local/cuda/bin/nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Wed_May__6_19:09:25_PDT_2020
Cuda compilation tools, release 11.0, V11.0.167
Build cuda_11.0_bu.TC445_37.28358933_0


***Install python-nvcc plugin (Run this commmand if you get an error "Cell magic `%%cu` not found")***

In [7]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-81afjkr6
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-81afjkr6
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4307 sha256=b454b36fa2a25ae35f79b8d5eb33380d932439f492087ac6e4d423539481dcb7
  Stored in directory: /tmp/pip-ephem-wheel-cache-xjea5g1m/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2
created output directory at /content/src
Out bin /content/result.out


In [4]:
!nvidia-smi

Mon Jun 22 06:24:32 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

Sample Program

In [9]:
%%cu
#include <iostream>
#include <math.h>
#include <stdlib.h>
int main(){ 
    std::cout << "Hello there CUDA, looking good on Google colab" << std::endl;
    return 0;
}

Hello there CUDA, looking good on Google colab



# CUDA By Example

Chapter 3

In [18]:
%%cu
#include <iostream>
#include <stdlib.h>
#include <cuda.h>
#include <curand.h>
__global__ void kernel (void) {
    
    printf("Hello, Kernel!\n");
    
}

int main (void){
    kernel<<<1,1>>>();
    printf("Hello, World!\n");
    return 0;
}

Hello, World!



In [4]:
%%cu
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
static void HandleError( cudaError_t err,
                         const char *file,
                         int line ) {
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        exit( EXIT_FAILURE );
    }
}


/*CODE*/

//Addition
__global__ void add(int a, int b, int *c) {
    *c = a + b;
}

int main(void)
{
    int c;
    int *dev_c;
    HANDLE_ERROR(cudaMalloc( (void**)&dev_c, sizeof(int) ));
    add<<<1,1>>>(2, 7, dev_c);
    HANDLE_ERROR(cudaMemcpy( &c, dev_c, sizeof(int),cudaMemcpyDeviceToHost));
    printf("2 + 7 = %d\n", c);
    cudaFree(dev_c);


//GPU specifications
 

cudaDeviceProp prop;
int count;
HANDLE_ERROR( cudaGetDeviceCount( &count ) );
for (int i=0; i< count; i++) {
HANDLE_ERROR( cudaGetDeviceProperties( &prop, i ) );
printf( " --- General Information for device %d ---\n", i );
printf( "Name: %s\n", prop.name );
printf( "Compute capability: %d.%d\n", prop.major, prop.minor );
printf( "Clock rate: %d\n", prop.clockRate );
printf( "Device copy overlap: " );
if (prop.deviceOverlap)
printf( "Enabled\n" );
else
printf( "Disabled\n" );
printf( "Kernel execition timeout : " );
if (prop.kernelExecTimeoutEnabled)
printf( "Enabled\n" );
else
printf( "Disabled\n" );
printf( " --- Memory Information for device %d ---\n", i );
printf( "Total global mem: %ld\n", prop.totalGlobalMem );
printf( "Total constant Mem: %ld\n", prop.totalConstMem );
printf( "Max mem pitch: %ld\n", prop.memPitch );
printf( "Texture Alignment: %ld\n", prop.textureAlignment );
printf( " --- MP Information for device %d ---\n", i );
printf( "Multiprocessor count: %d\n",
prop.multiProcessorCount );
printf( "Shared mem per mp: %ld\n", prop.sharedMemPerBlock );
printf( "Registers per mp: %d\n", prop.regsPerBlock );
printf( "Threads in warp: %d\n", prop.warpSize );
printf( "Max threads per block: %d\n",
prop.maxThreadsPerBlock );
printf( "Max thread dimensions: (%d, %d, %d)\n",
prop.maxThreadsDim[0], prop.maxThreadsDim[1],
prop.maxThreadsDim[2] );
printf( "Max grid dimensions: (%d, %d, %d)\n",
prop.maxGridSize[0], prop.maxGridSize[1],
prop.maxGridSize[2] );
printf( "\n" );
}
 
    return 0;
}

2 + 7 = 9
 --- General Information for device 0 ---
Name: Tesla K80
Compute capability: 3.7
Clock rate: 823500
Device copy overlap: Enabled
Kernel execition timeout : Disabled
 --- Memory Information for device 0 ---
Total global mem: 11996954624
Total constant Mem: 65536
Max mem pitch: 2147483647
Texture Alignment: 512
 --- MP Information for device 0 ---
Multiprocessor count: 13
Shared mem per mp: 49152
Registers per mp: 65536
Threads in warp: 32
Max threads per block: 1024
Max thread dimensions: (1024, 1024, 64)
Max grid dimensions: (2147483647, 65535, 65535)




In [19]:
%%cuda --name my_curand.cu 
/*
 * This program uses the host CURAND API to generate 100 
 * pseudorandom floats.
 */
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <curand.h>

#define CUDA_CALL(x) do { if((x)!=cudaSuccess) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__);\
    return EXIT_FAILURE;}} while(0)
#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__);\
    return EXIT_FAILURE;}} while(0)

int main(int argc, char *argv[])
{
    size_t n = 100;
    size_t i;
    curandGenerator_t gen;
    float *devData, *hostData;

    /* Allocate n floats on host */
    hostData = (float *)calloc(n, sizeof(float));

    /* Allocate n floats on device */
    CUDA_CALL(cudaMalloc((void **)&devData, n*sizeof(float)));

    /* Create pseudo-random number generator */
    CURAND_CALL(curandCreateGenerator(&gen, 
                CURAND_RNG_PSEUDO_DEFAULT));

    /* Set seed */
    CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 
                1234ULL));

    /* Generate n floats on device */
    CURAND_CALL(curandGenerateUniform(gen, devData, n));

    /* Copy device memory to host */
    CUDA_CALL(cudaMemcpy(hostData, devData, n * sizeof(float),
        cudaMemcpyDeviceToHost));

    /* Show result */
    for(i = 0; i < n; i++) {
        printf("%1.4f ", hostData[i]);
    }
    printf("\n");

    /* Cleanup */
    CURAND_CALL(curandDestroyGenerator(gen));
    CUDA_CALL(cudaFree(devData));
    free(hostData);    
    return EXIT_SUCCESS;
}

'File written in /content/src/my_curand.cu'

In [20]:
!nvcc -o /content/src/my_curand /content/src/my_curand.cu -lcurand

In [21]:
!/content/src/my_curand

0.1455 0.8202 0.5504 0.2948 0.9147 0.8690 0.3219 0.7829 0.0113 0.2855 0.7816 0.2338 0.6791 0.2824 0.6299 0.1212 0.4333 0.3831 0.5136 0.2987 0.4166 0.0345 0.0494 0.0467 0.6166 0.6480 0.8685 0.4012 0.0631 0.4972 0.6809 0.9350 0.0704 0.0458 0.1324 0.3785 0.6457 0.9930 0.9952 0.7677 0.3217 0.8210 0.2765 0.2691 0.4579 0.1969 0.9555 0.8739 0.7996 0.3810 0.6662 0.3153 0.9428 0.5006 0.3369 0.1490 0.8637 0.6191 0.6820 0.4573 0.9261 0.5650 0.7117 0.8252 0.8755 0.2216 0.2958 0.4046 0.3896 0.7335 0.7301 0.8154 0.0913 0.0866 0.6974 0.1811 0.5834 0.9255 0.9029 0.0413 0.9522 0.5507 0.7237 0.3976 0.7519 0.4398 0.4638 0.6094 0.7358 0.3272 0.6961 0.4893 0.9698 0.0456 0.2025 0.9491 0.1516 0.0424 0.6149 0.5638 


Chapter 4

In [60]:
%%cu

//Summing Vectors In C language using CPU
#include <iostream>
#include <math.h>

#define N 1<<20
void add( int *a, int *b, int *c ) 
{
int tid = 0; // this is CPU zero, so we start at zero

while (tid < N) 
  {
    c[tid] = a[tid] + b[tid];
    tid += 1; // we have one CPU, so we increment by one
  }
}


int main( void ) 
{
  int a[N], b[N], c[N];
  // fill the arrays 'a' and 'b' on the CPU
  for (int i=0; i<N; i++)   
  {
    a[i] = -i;
    b[i] = i * i; 
  }
 
   add( a, b, c );  //launch the CPU function

// display the results
for (int i=0; i<N; i++) 
{
  printf( "%d + %d = %d\n", a[i], b[i], c[i] );
}
 
return 0;
}




In [61]:
%%cuda --name my_vector.cu 

//Summing Vectors In CUDA language using GPU
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda.h>
//#include "book.h"
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
static void HandleError( cudaError_t err,
                         const char *file,
                         int line ) {
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        exit( EXIT_FAILURE );
    }
}

#define N 1<<20

__global__ void add( int *a, int *b, int *c ) 
{
int tid = blockIdx.x; //handle the data at this index
if (tid < N) 
  {
    c[tid] = a[tid] + b[tid];
  }
}


int main( void ) 
{
  int a[N], b[N], c[N];
  int *dev_a, *dev_b, *dev_c;
 
/*allocate the memory on the GPU*/
HANDLE_ERROR( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_b, N * sizeof(int) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_c, N * sizeof(int) ) );

// fill the arrays 'a' and 'b' on the CPU
  for (int i=0; i<N; i++)   
  {
    a[i] = -i;
    b[i] = i * i; 
  }

//copy the content of arrays 'a' and 'b' to the GPU 
HANDLE_ERROR(cudaMemcpy(dev_a, a, N * sizeof(int),cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(dev_b, b, N * sizeof(int),cudaMemcpyHostToDevice));


//launch the GPU Kernel
add<<<N,1>>>( dev_a, dev_b, dev_c );  
 
 //copy the array 'c' back from the GPU to the CPU
 HANDLE_ERROR(cudaMemcpy(c, dev_c, N * sizeof(int),cudaMemcpyDeviceToHost));

/*display the results*/
for (int i=0; i<N; i++) 
{
  printf( "%d + %d = %d\n", a[i], b[i], c[i] );
}


//Free the memory allocated to the GPU
 cudaFree(dev_a);
 cudaFree(dev_b);
 cudaFree(dev_c);

return 0;
}


'File written in /content/src/my_vector.cu'

In [63]:
!nvprof ./2



In [64]:
!ls


my_vector.cu  sample_data  src
