In [None]:
!python --version
!nvcc --version
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

In [None]:
%%cuda
#include <cstddef>
#include <cstdlib>
#include <iostream>

#include <cuda_runtime.h>

// Element-wise vector addition: C[i] = A[i] + B[i] for every i in [0, n).
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its global index
// and then advances by the total number of launched threads, so any grid size
// (including a single small block) covers all n elements.
//
// Parameters:
//   A, B - input vectors; each must hold at least n ints and be readable from
//          the device.
//   C    - output vector; must hold at least n ints and be writable from the
//          device.
//   n    - number of elements to process; values <= 0 make the kernel a no-op.
//
// No shared memory is used and the kernel contains no synchronization.
__global__ void addVectors(int* A, int* B, int* C, int n)
{
    // Work in size_t so the index arithmetic cannot wrap a 32-bit int when n
    // is close to INT_MAX.
    const size_t count = n > 0 ? static_cast<size_t>(n) : 0;
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;

    for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < count;
         i += stride)
    {
        C[i] = A[i] + B[i];
    }
}

// Terminates the program with a readable message when a CUDA runtime call
// fails. `status` is the value returned by the call and `what` names the
// operation for the error message. Does nothing when status is cudaSuccess.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Prints `header`, a newline, the first `count` values of `data` separated by
// single spaces, and a trailing newline.
static void printPreview(const char* header, const int* data, int count)
{
    std::cout << header;
    std::cout << "\n";
    for (int i = 0; i < count; i++)
    {
        std::cout << data[i] << " ";
    }
    std::cout << "\n";
}

// Demo driver: fills two host vectors, adds them on the GPU with addVectors,
// and prints the first elements of the inputs and of the result.
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
int main()
{
    std::cout<<"This is Atharva Pingale's code";
    std::cout<<"\nPractical 4 : Vector Addition\n";

    const int n = 1000000;
    // Byte count kept in size_t so it cannot overflow an int for larger n.
    const std::size_t size = static_cast<std::size_t>(n) * sizeof(int);
    // Never print past the end of the vectors if n is ever made smaller.
    const int preview = n < 20 ? n : 20;

    // Allocate page-locked memory on the host
    int* A = nullptr;
    int* B = nullptr;
    int* C = nullptr;
    checkCuda(cudaMallocHost(&A, size), "cudaMallocHost(A)");
    checkCuda(cudaMallocHost(&B, size), "cudaMallocHost(B)");
    checkCuda(cudaMallocHost(&C, size), "cudaMallocHost(C)");

    // Initialize the vectors
    for (int i = 0; i < n; i++)
    {
        A[i] = i * 2 * 10;
        B[i] = i * 3 * 12;
    }

    // Printing the vectors
    printPreview("\nPrinting Vector A ( first 20 elements ): ", A, preview);
    printPreview("\nPrinting Vector B ( first 20 elements ) : ", B, preview);

    // Allocate memory on the device
    int* dev_A = nullptr;
    int* dev_B = nullptr;
    int* dev_C = nullptr;
    checkCuda(cudaMalloc(&dev_A, size), "cudaMalloc(dev_A)");
    checkCuda(cudaMalloc(&dev_B, size), "cudaMalloc(dev_B)");
    checkCuda(cudaMalloc(&dev_C, size), "cudaMalloc(dev_C)");

    // Copy data from host to device
    checkCuda(cudaMemcpy(dev_A, A, size, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(dev_B, B, size, cudaMemcpyHostToDevice), "copy B to device");

    // Launch the kernel with enough blocks to cover all n elements (ceil-div).
    const int blockSize = 256;
    const int numBlocks = (n + blockSize - 1) / blockSize;
    addVectors<<<numBlocks, blockSize>>>(dev_A, dev_B, dev_C, n);
    // A kernel launch returns no status itself; launch-configuration errors
    // are reported through cudaGetLastError().
    checkCuda(cudaGetLastError(), "addVectors launch");

    // Copy data from device to host. This blocking copy also reports any
    // error raised while the kernel was executing.
    checkCuda(cudaMemcpy(C, dev_C, size, cudaMemcpyDeviceToHost), "copy C to host");

    // Print the results
    printPreview("\nPrinting Result Vector C ( first 20 elements ): ", C, preview);

    // Free memory
    checkCuda(cudaFree(dev_A), "cudaFree(dev_A)");
    checkCuda(cudaFree(dev_B), "cudaFree(dev_B)");
    checkCuda(cudaFree(dev_C), "cudaFree(dev_C)");
    checkCuda(cudaFreeHost(A), "cudaFreeHost(A)");
    checkCuda(cudaFreeHost(B), "cudaFreeHost(B)");
    checkCuda(cudaFreeHost(C), "cudaFreeHost(C)");

    return 0;
}