In [14]:
! ls /usr/local/

bin    cuda	cuda-11.8  games	       include	lib64	   man	 share
colab  cuda-11	etc	   _gcs_config_ops.so  lib	licensing  sbin  src


In [15]:
! nvcc --version # nvcc compiler version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [21]:
%%writefile vector_sum.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Kernel Function - GPU function that is meant to be called from CPU code
__global__ void Vector_sum(float *c_d,float *a_d,float *b_d, int N)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx<N){
	  c_d[idx] = a_d[idx] + b_d[idx];
  }
}

// Main code executed by the host
int main(void){
	float *a_h,*b_h,*c_h; // pointers host
	float *a_d,*b_d,*c_d; // pointers device
	const int N = 24;

	size_t size=N * sizeof(float);

	a_h = (float *)malloc(size); // allocating memory host; alternative function cudaMallocHost
	b_h = (float *)malloc(size);
	c_h = (float *)malloc(size);

	srand(time(NULL));
	for (int i=0; i<N; i++){
		//a_h[i] = (float)i;b_h[i] = (float)(i+1);
		a_h[i] = rand() % 100 + 1.0;
		b_h[i] = rand() % 100 + 1.0;
	}

	printf("\nArray a:\n");
	for (int i=0; i<N; i++) printf("%f ", a_h[i]);
	printf("\n\nArray b:\n");
	for (int i=0; i<N; i++) printf("%f ", b_h[i]);

	cudaMalloc((void **) &a_d,size); // allocating memory device
	cudaMalloc((void **) &b_d,size);
	cudaMalloc((void **) &c_d,size);

	cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice); // copy data from host to device
	cudaMemcpy(b_d, b_h, size, cudaMemcpyHostToDevice);

	int block_size =8;
	int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);

	Vector_sum<<<n_blocks, block_size>>>(c_d, a_d , b_d, N);

	cudaMemcpy(c_h, c_d, size, cudaMemcpyDeviceToHost); // copy data from device back to host

	printf("\n\nArray c:\n");
	for (int i=0; i<N; i++) printf("%f ", c_h[i]);

	printf("\n\nEnd.\n");
	//system("pause");

	free(a_h); // free memory host
	free(b_h);
	free(c_h);

	cudaFree(a_d); // free memory device
	cudaFree(b_d);
	cudaFree(c_d);

	return(0);
}

Overwriting vector_sum.cu


In [22]:
! nvcc vector_sum.cu -o test

In [23]:
! ./test


Array a:
36.000000 36.000000 7.000000 91.000000 35.000000 75.000000 67.000000 53.000000 38.000000 40.000000 34.000000 35.000000 45.000000 14.000000 97.000000 99.000000 31.000000 25.000000 40.000000 47.000000 22.000000 14.000000 91.000000 45.000000 

Array b:
55.000000 42.000000 55.000000 59.000000 55.000000 36.000000 18.000000 6.000000 66.000000 12.000000 19.000000 51.000000 41.000000 41.000000 77.000000 32.000000 86.000000 37.000000 68.000000 26.000000 22.000000 40.000000 18.000000 80.000000 

Array c:
0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 

End.
