In [1]:
%%writefile vector.cu
 #include <cstdlib>
 #include <iostream>
 using namespace std;

 // Element-wise integer vector addition kernel: C[i] = A[i] + B[i].
 //
 // Each thread handles at most one element, at the flat index computed from the
 // x components of its block and thread coordinates. Threads whose index is at
 // or beyond `size` return without touching memory, so the launch may contain
 // more threads than there are elements.
 //
 //   A, B  input arrays, read at indices [0, size)
 //   C     output array, written at indices [0, size)
 //   size  number of elements to process
 __global__ void add(int* A, int* B, int* C, int size) {
     const int idx = threadIdx.x + blockDim.x * blockIdx.x;

     // Guard the grid tail: surplus threads have nothing to do.
     if (idx >= size) {
         return;
     }

     C[idx] = A[idx] + B[idx];
 }

 // Fills the first `size` elements of `vector` with the sequence 1, 2, ..., size.
 // Nothing is written when `size` is zero or negative.
 void initialize(int* vector, int size) {
     int next = 0;
     while (next < size) {
         int* slot = vector + next;
         ++next;          // element at position k receives the value k + 1
         *slot = next;
     }
 }

 // Writes the first `size` elements of `vector` to standard output on a single
 // line, each value followed by one space, then ends the line and flushes.
 // With a zero or negative `size` only the line ending is emitted.
 void print(int* vector, int size) {
     int idx = 0;
     while (idx < size) {
         std::cout << vector[idx] << " ";
         ++idx;
     }
     std::cout << std::endl;
 }

 // Reports a failed CUDA runtime call on standard error and terminates the
 // program with a failure exit code. Does nothing when `status` is cudaSuccess.
 //
 //   status  return value of the CUDA runtime call being checked
 //   what    short description of the operation, used in the error message
 static void checkCuda(cudaError_t status, const char* what) {
     if (status != cudaSuccess) {
         std::cerr << "CUDA error during " << what << ": "
                   << cudaGetErrorString(status) << std::endl;
         std::exit(EXIT_FAILURE);
     }
 }

 // Adds two small integer vectors on the GPU and prints inputs and result.
 //
 // Steps: allocate and fill host vectors A and B, copy them to device buffers,
 // launch the `add` kernel, copy the sum back into C, print it, and release all
 // host and device memory. Every CUDA runtime call is checked so that a missing
 // device or a failed launch aborts with a message instead of printing
 // uninitialised data as the result.
 //
 // Returns 0 on success; exits with EXIT_FAILURE on any CUDA error.
 int main() {
     const int N = 4;
     const size_t vectorBytes = N * sizeof(int);

     // Host-side vectors.
     int* A = new int[N];
     int* B = new int[N];
     int* C = new int[N];

     initialize(A, N);
     initialize(B, N);

     cout << "Vector A: ";
     print(A, N);
     cout << "Vector B: ";
     print(B, N);

     // Device-side counterparts of A, B and C.
     int* X = nullptr;
     int* Y = nullptr;
     int* Z = nullptr;
     checkCuda(cudaMalloc(&X, vectorBytes), "cudaMalloc(X)");
     checkCuda(cudaMalloc(&Y, vectorBytes), "cudaMalloc(Y)");
     checkCuda(cudaMalloc(&Z, vectorBytes), "cudaMalloc(Z)");

     checkCuda(cudaMemcpy(X, A, vectorBytes, cudaMemcpyHostToDevice),
               "cudaMemcpy(A -> X)");
     checkCuda(cudaMemcpy(Y, B, vectorBytes, cudaMemcpyHostToDevice),
               "cudaMemcpy(B -> Y)");

     // Ceiling division so every element is covered by some thread.
     const int threadsPerBlock = 256;
     const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

     add<<<blocksPerGrid, threadsPerBlock>>>(X, Y, Z, N);
     // A kernel launch returns no status itself; launch failures (for example
     // an invalid configuration) have to be fetched explicitly.
     checkCuda(cudaGetLastError(), "add kernel launch");

     // This blocking copy also reports errors raised while the kernel ran.
     checkCuda(cudaMemcpy(C, Z, vectorBytes, cudaMemcpyDeviceToHost),
               "cudaMemcpy(Z -> C)");
     cout << "Addition: ";
     print(C, N);

     delete[] A;
     delete[] B;
     delete[] C;

     checkCuda(cudaFree(X), "cudaFree(X)");
     checkCuda(cudaFree(Y), "cudaFree(Y)");
     checkCuda(cudaFree(Z), "cudaFree(Z)");

     return 0;
 }

Writing vector.cu


In [2]:
!nvcc vector.cu -o vector

In [3]:
!./vector


Vector A: 1 2 3 4 
Vector B: 1 2 3 4 
Addition: 2 4 6 8 
