# **CUDA Setup**

In [3]:
!python --version
!nvcc --version
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Python 3.10.12
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpwz3mz6ex".


# **Running a Hello World CUDA code on C**

In [4]:
%%cuda
#include<stdio.h>
__global__ void hello(){
    printf("Hello from block: %u, thread: %u\n", blockIdx.x, threadIdx.x);
}

int main(){
    hello<<<2, 2>>>();
    cudaDeviceSynchronize();
}

Hello from block: 1, thread: 0
Hello from block: 1, thread: 1
Hello from block: 0, thread: 0
Hello from block: 0, thread: 1



# **Running CUDA on C++**

In [5]:
#This CUDA program is a basic extension of the Hello World Program

%%cuda
#include<iostream>
#include<cuda_runtime.h>
#include<stdio.h>

__global__ void hellokernel(){
    printf("Hello from block:%u, and threads:%u\n", blockIdx.x, threadIdx.x);
}

int main(){
    hellokernel<<<4,4>>>();
    cudaDeviceSynchronize();
}

Hello from block:1, and threads:0
Hello from block:1, and threads:1
Hello from block:1, and threads:2
Hello from block:1, and threads:3
Hello from block:0, and threads:0
Hello from block:0, and threads:1
Hello from block:0, and threads:2
Hello from block:0, and threads:3
Hello from block:3, and threads:0
Hello from block:3, and threads:1
Hello from block:3, and threads:2
Hello from block:3, and threads:3
Hello from block:2, and threads:0
Hello from block:2, and threads:1
Hello from block:2, and threads:2
Hello from block:2, and threads:3



In [6]:
#This CUDA program is based on the 3D diffusion of heat model

%%cuda
#include <iostream>
#include <vector>
#include <cuda_runtime.h>

__global__ void heatDiffusionKernel(float *grid, float *newGrid, int N) {
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int z = threadIdx.z + blockIdx.z * blockDim.z;

    if (x > 0 && x < N - 1 && y > 0 && y < N - 1 && z > 0 && z < N - 1) {
        int idx = x + y * N + z * N * N;
        newGrid[idx] = 0.2 * (grid[idx] + grid[idx - 1] + grid[idx + 1] +
                               grid[idx - N] + grid[idx + N] +
                               grid[idx - N * N] + grid[idx + N * N]);
    }
}

class HeatDiffusion {
public:
    HeatDiffusion(int gridSize) : N(gridSize), bytes(N * N * N * sizeof(float)) {
        h_grid.resize(N * N * N, 0.0);
        h_newGrid.resize(N * N * N, 0.0);

        for (int i = 0; i < N * N * N; i++) {
            h_grid[i] = (i % 100 == 0) ? 100.0 : 0.0; // Heat source
        }

        cudaMalloc(&d_grid, bytes);
        cudaMalloc(&d_newGrid, bytes);
        cudaMemcpy(d_grid, h_grid.data(), bytes, cudaMemcpyHostToDevice);
    }

    ~HeatDiffusion() {
        cudaFree(d_grid);
        cudaFree(d_newGrid);
    }

    void simulate(int timeSteps) {
        dim3 threads(8, 8, 8);
        dim3 blocks((N + 7) / 8, (N + 7) / 8, (N + 7) / 8);

        for (int t = 0; t < timeSteps; t++) {
            heatDiffusionKernel<<<blocks, threads>>>(d_grid, d_newGrid, N);
            cudaMemcpy(d_grid, d_newGrid, bytes, cudaMemcpyDeviceToDevice);
        }

        cudaMemcpy(h_newGrid.data(), d_newGrid, bytes, cudaMemcpyDeviceToHost);
    }

    float getCenterHeat() const {
        return h_newGrid[N / 2 * N * N + N / 2 * N + N / 2];
    }

private:
    int N;
    size_t bytes;
    std::vector<float> h_grid, h_newGrid;
    float *d_grid, *d_newGrid;
};

int main() {
    int gridSize = 32;
    int timeSteps = 100;

    HeatDiffusion simulation(gridSize);

    simulation.simulate(timeSteps);

    std::cout << "Final heat at center: " << simulation.getCenterHeat() << std::endl;

    return 0;
}


Final heat at center: 4.003e+14



In [18]:
#This CUDA program enables to add two arrays in a parallel computing process

%%cuda
#include <iostream>
#include <cuda_runtime.h>

#define N (1 << 20) // 1 million elements
#define THREADS_PER_BLOCK 256

// CUDA kernel to perform vector addition
__global__ void vectorAdd(const float* A, const float* B, float* C, int n) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < n) {
        C[idx] = A[idx] + B[idx];
    }
}

int main() {
    // Host vectors
    float *h_A, *h_B, *h_C;

    // Allocate memory on the host
    size_t bytes = N * sizeof(float);
    h_A = (float*)malloc(bytes);
    h_B = (float*)malloc(bytes);
    h_C = (float*)malloc(bytes);

    // Initialize input vectors with large values
    for (int i = 0; i < N; i++) {
        h_A[i] = static_cast<float>(i);
        h_B[i] = static_cast<float>(i * 2);
    }

    // Device vectors
    float *d_A, *d_B, *d_C;

    // Allocate memory on the device
    cudaMalloc(&d_A, bytes);
    cudaMalloc(&d_B, bytes);
    cudaMalloc(&d_C, bytes);

    // Copy data from host to device
    cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice);

    // Launch the kernel
    int blocks = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    vectorAdd<<<blocks, THREADS_PER_BLOCK>>>(d_A, d_B, d_C, N);

    // Wait for GPU to finish
    cudaDeviceSynchronize();

    // Copy the result back to the host
    cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost);

    // Verify the result
    bool success = true;
    for (int i = 0; i < N; i++) {
        if (h_C[i] != h_A[i] + h_B[i]) {
            std::cerr << "Error at index " << i << ": " << h_C[i] << " != " << h_A[i] + h_B[i] << std::endl;
            success = false;
            break;
        }
    }

    if (success) {
        std::cout << "All results are correct!" << std::endl;
    }

    // Print the elements of the result vector
    for (int i = 0; i < 10000; i++) { // Print first 10 elements for brevity
        std::cout << "C[" << i << "] = " << h_C[i] << std::endl;
    }

    // Free memory
    free(h_A);
    free(h_B);
    free(h_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return 0;
}


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
C[5001] = 15003
C[5002] = 15006
C[5003] = 15009
C[5004] = 15012
C[5005] = 15015
C[5006] = 15018
C[5007] = 15021
C[5008] = 15024
C[5009] = 15027
C[5010] = 15030
C[5011] = 15033
C[5012] = 15036
C[5013] = 15039
C[5014] = 15042
C[5015] = 15045
C[5016] = 15048
C[5017] = 15051
C[5018] = 15054
C[5019] = 15057
C[5020] = 15060
C[5021] = 15063
C[5022] = 15066
C[5023] = 15069
C[5024] = 15072
C[5025] = 15075
C[5026] = 15078
C[5027] = 15081
C[5028] = 15084
C[5029] = 15087
C[5030] = 15090
C[5031] = 15093
C[5032] = 15096
C[5033] = 15099
C[5034] = 15102
C[5035] = 15105
C[5036] = 15108
C[5037] = 15111
C[5038] = 15114
C[5039] = 15117
C[5040] = 15120
C[5041] = 15123
C[5042] = 15126
C[5043] = 15129
C[5044] = 15132
C[5045] = 15135
C[5046] = 15138
C[5047] = 15141
C[5048] = 15144
C[5049] = 15147
C[5050] = 15150
C[5051] = 15153
C[5052] = 15156
C[5053] = 15159
C[5054] = 15162
C[5055] = 15165
C[5056] = 15168
C[5057] = 15171
C[5058] = 15174
C[5059]