1)Write a CUDA program to perform the vector addition and calculate its execution time using CUDA events. Use dynamic memory allocation for the arrays considered.

In [5]:
!nvcc --version


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [1]:
%%writefile vector_add1.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <time.h>

// CUDA kernel for vector addition
__global__ void vectorAdd(const float *A, const float *B, float *C, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) {
        C[idx] = A[idx] + B[idx];
    }
}

// Macro for checking CUDA errors
#define CHECK_CUDA_ERROR(call)                               \
    do {                                                     \
        cudaError_t error = call;                            \
        if (error != cudaSuccess) {                          \
            fprintf(stderr, "CUDA Error: %s (code %d), %s\n",\
                    cudaGetErrorString(error), error, #call);\
            exit(1);                                         \
        }                                                    \
    } while (0)

int main() {
    int N = 10 * 1024 * 1024;  // Vector size: 10 million elements
    size_t size = N * sizeof(float);

    printf("Vector size: %d elements (%zu MB)\n", N, size / (1024 * 1024));

    // Allocate host memory dynamically
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);

    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        exit(1);
    }

    // Seed random number generator and fill host arrays with random values
    srand(time(NULL));
    for (int i = 0; i < N; i++) {
        h_A[i] = (float)rand() / RAND_MAX;
        h_B[i] = (float)rand() / RAND_MAX;
    }

    // Allocate device memory with error checking
    float *d_A, *d_B, *d_C;
    CHECK_CUDA_ERROR(cudaMalloc((void **)&d_A, size));
    CHECK_CUDA_ERROR(cudaMalloc((void **)&d_B, size));
    CHECK_CUDA_ERROR(cudaMalloc((void **)&d_C, size));

    // Copy data from host to device with error checking
    CHECK_CUDA_ERROR(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
    CHECK_CUDA_ERROR(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

    // Define block and grid sizes
    int blockSize = 256;
    int gridSize = (N + blockSize - 1) / blockSize;

    // Create CUDA events for timing
    cudaEvent_t start, stop;
    CHECK_CUDA_ERROR(cudaEventCreate(&start));
    CHECK_CUDA_ERROR(cudaEventCreate(&stop));

    // Record the start event
    CHECK_CUDA_ERROR(cudaEventRecord(start));

    // Launch the kernel with error checking
    vectorAdd<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);
    CHECK_CUDA_ERROR(cudaGetLastError());  // Check for kernel launch errors

    // Record the stop event and synchronize
    CHECK_CUDA_ERROR(cudaEventRecord(stop));
    CHECK_CUDA_ERROR(cudaEventSynchronize(stop));

    // Calculate and print elapsed time
    float milliseconds = 0;
    CHECK_CUDA_ERROR(cudaEventElapsedTime(&milliseconds, start, stop));
    printf("Execution time: %f ms\n", milliseconds);

    // Copy result from device to host with error checking
    CHECK_CUDA_ERROR(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));

    // Verify the result (optional for large sizes)
    bool error_found = false;
    for (int i = 0; i < N; i++) {
        float expected = h_A[i] + h_B[i];
        if (fabs(h_C[i] - expected) > 1e-5) {
            printf("Error at index %d: %f != %f\n", i, h_C[i], expected);
            error_found = true;
            break;
        }
    }
    if (!error_found) {
        printf("Result verification passed!\n");
    }

    // Free device and host memory
    CHECK_CUDA_ERROR(cudaFree(d_A));
    CHECK_CUDA_ERROR(cudaFree(d_B));
    CHECK_CUDA_ERROR(cudaFree(d_C));
    free(h_A);
    free(h_B);
    free(h_C);

    // Destroy CUDA events
    CHECK_CUDA_ERROR(cudaEventDestroy(start));
    CHECK_CUDA_ERROR(cudaEventDestroy(stop));

    return 0;
}


Writing vector_add1.cu


In [2]:
!nvcc vector_add1.cu -o vector_add1

In [4]:
!./vector_add1



Vector size: 10485760 elements (40 MB)
Execution time: 138.623108 ms
Result verification passed!


2)Write a CUDA program to perform the addition of two matrices and compare parallel and serial time. Use dynamic memory allocation for the arrays considered and CUDA events for the execution time.

In [34]:
%%writefile matrix_add2.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <time.h>

// CUDA kernel for matrix addition
__global__ void matrixAdd(const float *A, const float *B, float *C, int width, int height) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < height && col < width) {
        int idx = row * width + col;
        C[idx] = A[idx] + B[idx];
    }
}

// Function to perform matrix addition serially on the CPU
void matrixAddSerial(const float *A, const float *B, float *C, int width, int height) {
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++) {
            int idx = row * width + col;
            C[idx] = A[idx] + B[idx];
        }
    }
}

int main() {
    int width = 1024;   // Matrix width
    int height = 1024;  // Matrix height
    int N = width * height;  // Total number of elements
    size_t size = N * sizeof(float);

    printf("Matrix size: %d x %d (%d elements)\n", width, height, N);

    // Allocate host memory dynamically
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C_parallel = (float *)malloc(size);
    float *h_C_serial = (float *)malloc(size);

    // Check for successful host memory allocation
    if (h_A == NULL || h_B == NULL || h_C_parallel == NULL || h_C_serial == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        return 1;
    }

    // Seed the random number generator and initialize matrices with random values
    srand(time(NULL));
    for (int i = 0; i < N; i++) {
        h_A[i] = (float)rand() / RAND_MAX;
        h_B[i] = (float)rand() / RAND_MAX;
    }

    // Allocate device memory
    float *d_A, *d_B, *d_C;
    cudaMalloc((void **)&d_A, size);
    cudaMalloc((void **)&d_B, size);
    cudaMalloc((void **)&d_C, size);

    // Copy data from host to device
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    // Define block and grid sizes
    dim3 blockSize(16, 16);
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y);

    // Create CUDA events for timing
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Record the start event for parallel execution
    cudaEventRecord(start);

    // Launch the kernel
    matrixAdd<<<gridSize, blockSize>>>(d_A, d_B, d_C, width, height);

    // Record the stop event for parallel execution
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    // Calculate and print elapsed time for parallel execution
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("Parallel execution time (CUDA): %f ms\n", milliseconds);

    // Copy the result from device to host
    cudaMemcpy(h_C_parallel, d_C, size, cudaMemcpyDeviceToHost);

    // Measure serial execution time using clock()
    clock_t serial_start = clock();
    matrixAddSerial(h_A, h_B, h_C_serial, width, height);
    clock_t serial_end = clock();
    double serial_time = 1000.0 * (serial_end - serial_start) / CLOCKS_PER_SEC;
    printf("Serial execution time (CPU): %f ms\n", serial_time);

    // Verify the result by comparing parallel and serial results
    bool error_found = false;
    for (int i = 0; i < N; i++) {
        if (fabs(h_C_parallel[i] - h_C_serial[i]) > 1e-5) {
            printf("Error at index %d: %f (parallel) != %f (serial)\n", i, h_C_parallel[i], h_C_serial[i]);
            error_found = true;
            break;
        }
    }
    if (!error_found) {
        printf("Result verification passed!\n");
    }

    // Free device and host memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C_parallel);
    free(h_C_serial);

    // Destroy CUDA events
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}


Overwriting matrix_add2.cu


In [35]:
!nvcc matrix_add2.cu -o matrix_add2

In [36]:
!./matrix_add2

Matrix size: 1024 x 1024 (1048576 elements)
Parallel execution time (CUDA): 0.189184 ms
Serial execution time (CPU): 5.639000 ms
Result verification passed!


In [29]:
!./matrix_add2

Matrix size: 2048 x 2048 (4194304 elements)
Parallel execution time (CUDA): 0.345152 ms
Serial execution time (CPU): 25.398000 ms
Result verification passed!


In [22]:
!./matrix_add2

Matrix size: 4096 x 4096 (16777216 elements)
Parallel execution time (CUDA): 1.018976 ms
Serial execution time (CPU): 91.000000 ms
Result verification passed!


In [33]:
!./matrix_add2

Matrix size: 8192 x 8192 (67108864 elements)
Parallel execution time (CUDA): 3.504512 ms
Serial execution time (CPU): 555.287000 ms
Result verification passed!
