In [None]:
!nvcc --version
!nvidia-smi

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
Thu Jun 19 09:27:46 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   61C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                       

In [None]:
!pip install nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [None]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpx0r4lb_5".


In [None]:
%%cuda
#include <iostream>

// Host-only smoke test: shows that the %%cuda cell compiles and runs plain
// C++ by printing one fixed line to standard output.
int main() {
    const char* banner = "Testing Host Code..\n";
    std::cout << banner;
    return 0;
}

Testing Host Code..



In [None]:
%%cuda

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

// Fills the two input vectors with mirrored ramps:
//   A_h[i] = i          (0, 1, ..., n-1)
//   B_h[i] = n - 1 - i  (n-1, ..., 1, 0)
// so that A_h[i] + B_h[i] equals n - 1 at every index.
// Writes nothing when n <= 0.
void VecInit(float* A_h, float* B_h, int n) {
    int idx = 0;
    while (idx < n) {
        const int mirrored = n - 1 - idx;
        A_h[idx] = (float)idx;
        B_h[idx] = (float)mirrored;
        ++idx;
    }
}

// Sequential reference implementation of vector addition on the host:
// C_h[i] = A_h[i] + B_h[i] for every i in [0, n), visited in ascending order.
// Writes nothing when n <= 0.
void VecAdd(float* A_h, float* B_h, float* C_h, int n){
    const float* lhs = A_h;
    const float* rhs = B_h;
    float* out = C_h;
    for (int remaining = n; remaining > 0; --remaining) {
        *out++ = *lhs++ + *rhs++;
    }
}

// Recomputes A_h[i] + B_h[i] on the host for every i in [0, n) and counts the
// indices where it differs from C_h[i] by more than an absolute tolerance of
// 1e-6. The count is printed to standard output; nothing is returned.
void VecValidate(float* A_h, float* B_h, float* C_h, int n) {
    const float epsilon = 1e-6;
    int error_counter = 0;
    int idx = 0;
    while (idx < n) {
        const float expected = A_h[idx] + B_h[idx];
        error_counter += (fabs(expected - C_h[idx]) > epsilon) ? 1 : 0;
        ++idx;
    }
    printf("Error Counter is %d \n", error_counter);
}

// Driver for the sequential baseline: allocates three host vectors of 10000
// floats, initialises the inputs, times VecAdd with clock(), prints the
// elapsed time, validates the result and releases the buffers.
// Returns 1 if any allocation fails, 0 otherwise.
int main() {
    const int n = 10000;

    float* A_h = (float*)malloc(n*sizeof(float));
    float* B_h = (float*)malloc(n*sizeof(float));
    float* C_h = (float*)malloc(n*sizeof(float));

    if (!A_h || !B_h || !C_h) {
        printf("Memory allocation failed\n");
        return 1;
    }

    VecInit(A_h, B_h, n);

    // Only the addition itself sits between the two clock() samples.
    const clock_t begin = clock();
    VecAdd(A_h, B_h, C_h, n);
    const clock_t end = clock();

    const double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    printf("Time consumed doing sequential VecAdd is %f seconds\n", time_spent);

    VecValidate(A_h, B_h, C_h, n);

    free(A_h);
    free(B_h);
    free(C_h);

    return 0;
}

Time consumed doing sequential VecAdd is 0.000046 seconds
Error Counter is 0 



In [None]:
%%writefile vector_add.cu

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

/*
 * Element-wise vector addition on the device:
 *   C_d[i] = A_d[i] + B_d[i]   for every i in [0, n).
 *
 * Launch layout: 1-D grid of 1-D blocks; no shared memory is used.
 * Each thread starts at its global index and then advances by the total
 * number of launched threads (grid-stride loop), so the result is correct for
 * any grid size, including one that covers n exactly with one element per
 * thread.
 *
 * The index is computed in 64-bit arithmetic: a 32-bit
 * blockIdx.x * blockDim.x + threadIdx.x narrowed to int can wrap to a
 * negative value for n close to INT_MAX and then slip past the bounds check.
 *
 * A_d, B_d, C_d are dereferenced in device code and must each hold at least
 * n floats. Nothing is read or written when n <= 0.
 */
__global__
void vecAddKernel(float* A_d, float* B_d, float* C_d, int n) {
    //printf("Kernel launched to GPU...\n");
    const long long count = n;
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < count;
         i += stride) {
        C_d[i] = A_d[i] + B_d[i];
        //printf("%f, %f, %f\n", A_d[i], B_d[i], C_d[i]);
    }
}

// Initialises the host inputs: A_h counts up from 0 to n-1 while B_h counts
// down from n-1 to 0, so every element pair sums to n - 1.
// Writes nothing when n <= 0.
void VecInit(float* A_h, float* B_h, int n) {
    float* ascending = A_h;
    float* descending = B_h;
    for (int value = 0; value < n; ++value) {
        *ascending++ = (float)value;
        *descending++ = (float)(n - 1 - value);
    }
}

// Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess.
static bool cudaCallOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Adds two host vectors on the GPU: C_h[i] = A_h[i] + B_h[i] for i in [0, n).
 *
 * Steps: allocate three device buffers, copy A_h and B_h to the device,
 * launch vecAddKernel with 256 threads per block and ceil(n / 256) blocks,
 * copy the result back into C_h, and release every device resource.
 *
 * Timing printed to stdout (messages unchanged from the original):
 *   - host-to-device copies and device-to-host copy, measured with clock();
 *   - kernel execution, measured with CUDA events.
 *
 * Parameters:
 *   A_h, B_h  host input vectors, at least n floats each
 *   C_h       host output vector, at least n floats
 *   n         number of elements; the function returns immediately when
 *             n <= 0 (a zero-block grid is not a valid launch configuration)
 *
 * Error handling: every CUDA runtime call is checked. On the first failure a
 * diagnostic is printed, the remaining steps are skipped (C_h may then be
 * left unmodified) and the device buffers and events are still released.
 */
void VecAdd(float* A_h, float* B_h, float* C_h, int n){
    if (n <= 0) {
        return;
    }

    const int threadsPerBlock = 256;
    // Integer ceil-div written so that it cannot overflow for n near INT_MAX.
    const int blocksPerGrid = n / threadsPerBlock + (n % threadsPerBlock != 0 ? 1 : 0);
    // Byte count kept in size_t: n * sizeof(float) does not fit in int for large n.
    const size_t size = (size_t)n * sizeof(float);

    float* A_d = NULL;
    float* B_d = NULL;
    float* C_d = NULL;
    cudaEvent_t start = NULL;
    cudaEvent_t stop = NULL;

    // Single-pass block: `break` jumps to the shared cleanup code below.
    do {
        if (!cudaCallOk(cudaEventCreate(&start), "cudaEventCreate(start)")) break;
        if (!cudaCallOk(cudaEventCreate(&stop), "cudaEventCreate(stop)")) break;

        if (!cudaCallOk(cudaMalloc((void **)&A_d, size), "cudaMalloc(A_d)")) break;
        if (!cudaCallOk(cudaMalloc((void **)&B_d, size), "cudaMalloc(B_d)")) break;
        if (!cudaCallOk(cudaMalloc((void **)&C_d, size), "cudaMalloc(C_d)")) break;

        clock_t begin = clock();

        if (!cudaCallOk(cudaMemcpy(A_d, A_h, size, cudaMemcpyHostToDevice),
                        "cudaMemcpy(A_d <- A_h)")) break;
        if (!cudaCallOk(cudaMemcpy(B_d, B_h, size, cudaMemcpyHostToDevice),
                        "cudaMemcpy(B_d <- B_h)")) break;

        clock_t end = clock();
        double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
        printf("Time consumed doing device input mem copy is %f seconds\n", time_spent);

        if (!cudaCallOk(cudaEventRecord(start), "cudaEventRecord(start)")) break;
        //number of blocks in grid is ceil(n/256) and number of threads per block is 256.
        vecAddKernel<<<blocksPerGrid, threadsPerBlock>>>(A_d, B_d, C_d, n);
        // The stop event is recorded right after the launch so the measured
        // interval brackets the kernel as tightly as possible.
        const cudaError_t stopStatus = cudaEventRecord(stop);

        // A launch does not return a status; configuration errors surface here.
        const cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) {
            printf("Kernel launch failed: %s\n", cudaGetErrorString(err));
            break;
        }
        if (!cudaCallOk(stopStatus, "cudaEventRecord(stop)")) break;

        // Wait for the kernel before starting the copy-back timer; otherwise
        // the blocking cudaMemcpy below would also absorb the kernel's run time.
        // Faults raised while the kernel executes are reported by this call.
        if (!cudaCallOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)")) break;

        begin = clock();
        if (!cudaCallOk(cudaMemcpy(C_h, C_d, size, cudaMemcpyDeviceToHost),
                        "cudaMemcpy(C_h <- C_d)")) break;
        end = clock();
        time_spent = (double)(end - begin) / CLOCKS_PER_SEC;

        printf ("Time consumed doing device otuput memeory copy is %f seconds\n", time_spent);

        float milliseconds = 0;
        if (!cudaCallOk(cudaEventElapsedTime(&milliseconds, start, stop),
                        "cudaEventElapsedTime")) break;

        printf("Elapsed time for the vecAddKernel is %f seconds\n", milliseconds/1000);
    } while (0);

    // Cleanup runs on both the success and the failure path.
    // cudaFree accepts a null pointer, so buffers that were never allocated are fine.
    cudaFree(A_d);
    cudaFree(B_d);
    cudaFree(C_d);
    if (start != NULL) {
        cudaEventDestroy(start);
    }
    if (stop != NULL) {
        cudaEventDestroy(stop);
    }
}

// Checks the result vector against a host-side recomputation: for each i in
// [0, n) the sum A_h[i] + B_h[i] is compared with C_h[i], and an element
// counts as an error when the absolute difference exceeds 1e-6.
// Prints the number of errors to standard output; returns nothing.
void VecValidate(float* A_h, float* B_h, float* C_h, int n) {
    const float tolerance = 1e-6;
    int mismatches = 0;

    const float* a = A_h;
    const float* b = B_h;
    const float* c = C_h;
    for (int remaining = n; remaining > 0; --remaining, ++a, ++b, ++c) {
        const float reference = *a + *b;
        //printf("%f, %f, %f, %f\n", *a, *b, reference, *c);
        if (fabs(reference - *c) > tolerance) {
            ++mismatches;
        }
    }

    printf("Error Counter is %d \n", mismatches);
}

// Driver for the GPU version: allocates three host vectors of 10000 floats,
// initialises the inputs, times the whole VecAdd call (device allocation,
// copies and kernel included) with clock(), prints the elapsed time,
// validates the result on the host and frees the host buffers.
// Returns 1 if any host allocation fails, 0 otherwise.
int main() {
    const int n = 10000;

    float* A_h = (float*)malloc(n*sizeof(float));
    float* B_h = (float*)malloc(n*sizeof(float));
    float* C_h = (float*)malloc(n*sizeof(float));

    const bool allocated = (A_h != NULL) && (B_h != NULL) && (C_h != NULL);
    if (!allocated) {
        printf("Memory allocation failed\n");
        return 1;
    }

    VecInit(A_h, B_h, n);

    const clock_t begin = clock();
    VecAdd(A_h, B_h, C_h, n);
    const clock_t end = clock();

    printf("Time consumed doing parallel VecAdd is %f seconds\n",
           (double)(end - begin) / CLOCKS_PER_SEC);

    VecValidate(A_h, B_h, C_h, n);

    free(A_h);
    free(B_h);
    free(C_h);

    return 0;
}

Writing vector_add.cu


In [None]:
!nvcc -arch=sm_75 vector_add.cu -o vector_add

In [None]:
!nvprof ./vector_add

==2817== NVPROF is profiling process 2817, command: ./vector_add
Time consumed doing device input mem copy is 0.000068 seconds
Time consumed doing device otuput memeory copy is 0.000054 seconds
Elapsed time for the vecAddKernel is 0.000154 seconds
Time consumed doing parallel VecAdd is 0.214536 seconds
Error Counter is 0 
==2817== Profiling application: ./vector_add
==2817== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   59.51%  12.416us         2  6.2080us  6.0160us  6.4000us  [CUDA memcpy HtoD]
                   23.93%  4.9920us         1  4.9920us  4.9920us  4.9920us  [CUDA memcpy DtoH]
                   16.56%  3.4560us         1  3.4560us  3.4560us  3.4560us  vecAddKernel(float*, float*, float*, int)
      API calls:   99.62%  186.61ms         2  93.306ms     775ns  186.61ms  cudaEventCreate
                    0.08%  151.28us       114  1.3270us     107ns  53.947us  cuDeviceGetAttribute
                    