<a href="https://colab.research.google.com/github/ayanmitra2021/CUDA_Practice/blob/master/Cuda_practice_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [1]:
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpni3nrae3".


In [11]:
%%writefile test.cu

#include <iostream>
#include <cuda_runtime.h>

// A helper function to check for CUDA errors
void checkCudaError(cudaError_t err, const char* message) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA Error: %s - %s\n", message, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

__global__ void helloFromGPU() {
    printf("Hello World from GPU thread %d!\n", threadIdx.x);
}

int main() {
    std::cout << "Hello World from CPU!" << std::endl;
    helloFromGPU<<<1, 5>>>();
    checkCudaError(cudaGetLastError(), "Kernel Launch Failed");
    checkCudaError(cudaDeviceSynchronize(), "cudaDeviceSynchronize Failed");
    std::cout << "\nSuccessfully synchronized with GPU." << std::endl;
    return 0;
}

Overwriting test.cu


In [12]:
!nvcc test.cu -o test_executable -arch=sm_75

In [13]:
!./test_executable

Hello World from CPU!
Hello World from GPU thread 0!
Hello World from GPU thread 1!
Hello World from GPU thread 2!
Hello World from GPU thread 3!
Hello World from GPU thread 4!

Successfully synchronized with GPU.


In [7]:
%%cuda
#include <iostream>

// This kernel will add a value to each element of an array on the GPU
__global__ void add(int *a, int value) {
    a[threadIdx.x] += value;
}

int main() {
    const int N = 5;
    int host_a[N] = {10, 20, 30, 40, 50}; // Data on the CPU
    int *device_a; // Pointer for data on the GPU

    // 1. Allocate memory on the GPU
    cudaMalloc(&device_a, N * sizeof(int));

    // 2. Copy data from CPU to GPU
    cudaMemcpy(device_a, host_a, N * sizeof(int), cudaMemcpyHostToDevice);

    std::cout << "Data on CPU before kernel launch:" << std::endl;
    for (int i = 0; i < N; i++) {
        std::cout << host_a[i] << " ";
    }
    std::cout << std::endl;

    // 3. Launch the kernel on the GPU to add 100 to each element
    add<<<1, N>>>(device_a, 100);
    cudaDeviceSynchronize();

    // 4. Copy the modified data back from GPU to CPU
    cudaMemcpy(host_a, device_a, N * sizeof(int), cudaMemcpyDeviceToHost);

    // 5. Free memory on the GPU
    cudaFree(device_a);

    // 6. Print the result from the CPU
    std::cout << "\nData on CPU after kernel launch:" << std::endl;
    for (int i = 0; i < N; i++) {
        std::cout << host_a[i] << " ";
    }
    std::cout << std::endl;

    return 0;
}

Data on CPU before kernel launch:
10 20 30 40 50 

Data on CPU after kernel launch:
10 20 30 40 50 

