In [3]:
%%writefile vector_add.cu
#include <stdio.h>
#include <cuda.h>

__global__ void vectorAdd(float *A, float *B, float *C, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) C[i] = A[i] + B[i];
}

int main() {
    int n = 10;
    int size = n * sizeof(float);

    float h_A[10], h_B[10], h_C[10];

    // Initialize arrays
    for (int i = 0; i < n; i++) {
        h_A[i] = (float)i;
        h_B[i] = (float)(i * 2);
    }

    float *d_A, *d_B, *d_C;

    cudaMalloc(&d_A, size);
    cudaMalloc(&d_B, size);
    cudaMalloc(&d_C, size);

    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    int threads = 256;
    int blocks = (n + threads - 1) / threads;

    vectorAdd<<<blocks, threads>>>(d_A, d_B, d_C, n);

    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    printf("Vector Addition Results:\n");
    for (int i = 0; i < n; i++)
        printf("%f + %f = %f\n", h_A[i], h_B[i], h_C[i]);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return 0;
}


Overwriting vector_add.cu


In [4]:
!nvcc vector_add.cu -o vector_add

In [5]:
!./vector_add

Vector Addition Results:
0.000000 + 0.000000 = 0.000000
1.000000 + 2.000000 = 0.000000
2.000000 + 4.000000 = 0.000000
3.000000 + 6.000000 = 0.000000
4.000000 + 8.000000 = 0.000000
5.000000 + 10.000000 = 0.000000
6.000000 + 12.000000 = 0.000000
7.000000 + 14.000000 = 0.000000
8.000000 + 16.000000 = 0.000000
9.000000 + 18.000000 = 0.000000
