In [26]:
!pip install nvcc4jupyter
%load_ext nvcc4jupyter



In [25]:
%%cuda

#include <iostream>
#include <numeric>

__global__ void vecadd(float *a, float *b, float *c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}

void deviceAlloc(void **addr, size_t size) {
    auto err = cudaMalloc(addr, size);
    if (err != cudaSuccess) {
        std::cerr << "cudaMalloc failed" << err << std::endl;
        exit(-1);
    }
}

int main() {
    constexpr int kN = 100;
    float *h_a = new float[kN];
    float *h_b = new float[kN];
    float *h_c = new float[kN];
    std::iota(h_a, h_a + kN, 0);
    std::iota(h_b, h_b + kN, 0);

    float *d_a, *d_b, *d_c;

    deviceAlloc((void**)&d_a, sizeof(float) * kN);
    deviceAlloc((void**)&d_b, sizeof(float) * kN);
    deviceAlloc((void**)&d_c, sizeof(float) * kN);


    cudaMemcpy(d_a, h_a, sizeof(float) * kN, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, sizeof(float) * kN, cudaMemcpyHostToDevice);

    dim3 threads(kN);
    dim3 blocks(1);

    vecadd<<<blocks, threads>>>(d_a, d_b, d_c, kN);

    cudaMemcpy(h_c, d_c, sizeof(float) * kN, cudaMemcpyDeviceToHost);

    for (int i = 0; i < kN; i++) {
        std::cout << h_c[i] << " ";
    }
}

0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
