In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [2]:
!nvidia-smi

Thu May 23 07:11:36 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpdnlyl_1x".


In [4]:
%%cuda
#include <cstdio>
#include <cuda_runtime.h>
//This fctn will run on the device
__global__ void helloWorldKernel(){
    printf("Hello World!\n");
}
int main(){
    //here we launch the kernel
    helloWorldKernel<<<1,1>>>(); //DimGrid is 1 and DimBlock is 1
    //wait for the kernel to finish
    cudaDeviceSynchronize();
    return 0;
}

Hello World!



In [12]:
%%cuda
#include <iostream>
#include <cuda_runtime.h>
__global__ void add(int *d_a, int *d_b, int *d_c, int n){
    int idx = threadIdx.x;
    /*if(i < n){
        d_c[idx] = d_a[idx] + d_c[idx];
    }*/
    //Now this is not good if we do not have enough threads
    //we will change it to a for loop

    for(int i = idx; i < n; i += blockDim.x){
        d_c[i] = d_a[i] + d_b[i];
    }
}

    //Now we are safe if we launch the kernel with n-1 or less threads
__host__
int main(){
        //we will set the constant and the host memory
        const int n = 12;
        int h_a[n], h_b[n], h_c[n]; // these are the host array variables
        //Initialize the arrays
        for(int i =1; i <=n; i++){
            h_a[i] = i;
            h_b[i] = i * 2;
        }

        //Now we will work on the device memory
        int *d_a, *d_b, *d_c;
        cudaMalloc(&d_a, n * sizeof(int));
        cudaMalloc(&d_b, n*sizeof(int));
        cudaMalloc(&d_c, n * sizeof(int));

        //copy from host to device
        cudaMemcpy(d_a, h_a, n * sizeof(int), cudaMemcpyHostToDevice);
        cudaMemcpy(d_b, h_b, n*sizeof(int), cudaMemcpyHostToDevice);
        cudaDeviceSynchronize();
        //Launch the kernel
        add<<<1,n-1>>>(d_a, d_b, d_c, n);
        cudaDeviceSynchronize();
        //Now after the execution of the kernel, we have to copy memory back to the device
        cudaMemcpy(h_c, d_c, n*sizeof(int), cudaMemcpyDeviceToHost);
        cudaDeviceSynchronize();
        //Print the results
        for(int i =0; i < n; i++){
            std::cout << h_c[i] << " ";
        }
        //now we have to free the allocated memeory
        cudaFree(d_a);
        cudaFree(d_b);
        cudaFree(d_c);
        return 0;
        }

12 3 6 9 12 15 18 21 24 27 30 33 
