<a href="https://colab.research.google.com/github/benamiller/cnn.c/blob/main/cnn_c.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Create the project directories. -p makes the cell safe to re-run:
# directories that already exist are left alone instead of raising an error.
!mkdir -p src layers utils

In [None]:
%%writefile vector_add.cu
#include <stdio.h>
#include <stdlib.h>

// Element-wise sum of two float arrays: c[i] = a[i] + b[i] for 0 <= i < n.
//
// Each thread handles at most one element, chosen by its flattened 1D index
// (blockIdx.x * blockDim.x + threadIdx.x). Threads whose index is n or
// greater return without touching memory, so the launch may contain more
// threads than there are elements.
__global__ void vectorAdd(float *a, float *b, float *c, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) {
        return;  // index falls past the end of the arrays
    }
    c[idx] = a[idx] + b[idx];
}

// Terminates the program with a diagnostic on stderr if a CUDA runtime call
// did not return cudaSuccess. `what` names the failed step in the message.
// Buffers are not released on this path; the process is exiting anyway.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what,
                cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Demo driver: fills two host arrays of n floats with values in [0, 1],
// adds them on the GPU with vectorAdd, copies the result back and prints the
// first few sums. Returns 0 on success; exits with EXIT_FAILURE if a host
// allocation or any CUDA call fails.
int main() {
    const int n = 1000000;
    const size_t size = (size_t)n * sizeof(float);

    float *h_a = (float *)malloc(size);
    float *h_b = (float *)malloc(size);
    float *h_c = (float *)malloc(size);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "Failed to allocate %zu bytes of host memory\n", size);
        free(h_a);  // free(NULL) is a no-op, so partial success is handled
        free(h_b);
        free(h_c);
        return EXIT_FAILURE;
    }

    // rand() is left unseeded, so the inputs are the same on every run.
    for (int i = 0; i < n; i++) {
        h_a[i] = rand() / (float)RAND_MAX;
        h_b[i] = rand() / (float)RAND_MAX;
    }

    float *d_a = NULL, *d_b = NULL, *d_c = NULL;
    checkCuda(cudaMalloc(&d_a, size), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc(&d_b, size), "cudaMalloc(d_b)");
    checkCuda(cudaMalloc(&d_c, size), "cudaMalloc(d_c)");

    checkCuda(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice),
              "host-to-device copy of a");
    checkCuda(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice),
              "host-to-device copy of b");

    // Ceil-divide so the grid covers all n elements even when n is not a
    // multiple of the block size; the kernel's bounds check handles the tail.
    const int blockSize = 256;
    const int numBlocks = (n + blockSize - 1) / blockSize;
    vectorAdd<<<numBlocks, blockSize>>>(d_a, d_b, d_c, n);
    // A kernel launch returns no status itself; configuration errors are
    // only visible through cudaGetLastError().
    checkCuda(cudaGetLastError(), "vectorAdd launch");

    // This blocking copy waits for the kernel to finish, so a fault that
    // occurred while the kernel was running is reported here.
    checkCuda(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost),
              "device-to-host copy of c");

    // Print a small sample of the results (never more than n entries).
    const int shown = n < 10 ? n : 10;
    for (int i = 0; i < shown; i++) {
        printf("%f + %f = %f\n", h_a[i], h_b[i], h_c[i]);
    }

    checkCuda(cudaFree(d_a), "cudaFree(d_a)");
    checkCuda(cudaFree(d_b), "cudaFree(d_b)");
    checkCuda(cudaFree(d_c), "cudaFree(d_c)");
    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}

Writing vector_add.cu


In [None]:
# Build the example and run it only if compilation succeeded, so a stale
# binary left over from an earlier build is never executed by mistake.
!nvcc vector_add.cu -o vector_add && ./vector_add

0.840188 + 0.394383 = 1.234571
0.783099 + 0.798440 = 1.581539
0.911647 + 0.197551 = 1.109199
0.335223 + 0.768230 = 1.103452
0.277775 + 0.553970 = 0.831745
0.477397 + 0.628871 = 1.106268
0.364784 + 0.513401 = 0.878185
0.952230 + 0.916195 = 1.868425
0.635712 + 0.717297 = 1.353009
0.141603 + 0.606969 = 0.748571
