<a href="https://colab.research.google.com/github/akash-verma22/MPI_Assignments/blob/main/Assignment5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Question 1.1


In [1]:
%%writefile 1.1.cu
#include <stdio.h>
#include <cuda_runtime.h>

#define N 1048576
__device__ float d_A[N];
__device__ float d_B[N];
__device__ float d_C[N];

__global__ void vectorAdd(int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        d_C[i] = d_A[i] + d_B[i];
    }
}

__global__ void initializeArrays(int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        d_A[i] = i;
        d_B[i] = 2 * i;
    }
}

int main(void)
{
    cudaError_t err = cudaSuccess;

    printf("Vector addition of %d elements\n", N);

    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);

    initializeArrays<<<blocksPerGrid, threadsPerBlock>>>(N);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch initialize kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    cudaDeviceSynchronize();

    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(N);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    cudaDeviceSynchronize();

    float h_C[10]; // Just verify first 10 elements to avoid memory issues
    err = cudaMemcpyFromSymbol(h_C, d_C, 10 * sizeof(float));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy result array from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < 10; i++)
    {
        float expected = i + 2*i;
        if (fabs(h_C[i] - expected) > 1e-5)
        {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n");

    err = cudaDeviceReset();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("Done\n");
    return 0;
}


Writing 1.1.cu


In [2]:
!nvcc 1.1.cu -o 1.1 -arch=sm_75
!./1.1

Vector addition of 1048576 elements
CUDA kernel launch with 4096 blocks of 256 threads
Test PASSED
Done


In [3]:
!nvprof ./1.1

Vector addition of 1048576 elements
CUDA kernel launch with 4096 blocks of 256 threads
==919== NVPROF is profiling process 919, command: ./1.1
Test PASSED
Done
==919== Profiling application: ./1.1
==919== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   57.59%  54.367us         1  54.367us  54.367us  54.367us  vectorAdd(int)
                   39.93%  37.696us         1  37.696us  37.696us  37.696us  initializeArrays(int)
                    2.47%  2.3360us         1  2.3360us  2.3360us  2.3360us  [CUDA memcpy DtoH]
      API calls:   75.15%  85.973ms         2  42.987ms  6.7560us  85.967ms  cudaLaunchKernel
                   23.80%  27.230ms         1  27.230ms  27.230ms  27.230ms  cudaDeviceReset
                    0.84%  957.30us         1  957.30us  957.30us  957.30us  cudaMemcpyFromSymbol
                    0.12%  135.12us       114  1.1850us     102ns  54.764us  cuDeviceGetAttribute
                    0.08%

# Question 1.2


In [4]:
%%writefile 1.2.cu
#include <stdio.h>
#include <cuda_runtime.h>
#define N 1048576
__device__ float d_A[N];
__device__ float d_B[N];
__device__ float d_C[N];

__global__ void vectorAdd(int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        d_C[i] = d_A[i] + d_B[i];
    }
}

__global__ void initializeArrays(int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        d_A[i] = i;
        d_B[i] = 2 * i;
    }
}

int main(void)
{
    cudaError_t err = cudaSuccess;

    printf("Vector addition of %d elements\n", N);

    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);

    initializeArrays<<<blocksPerGrid, threadsPerBlock>>>(N);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch initialize kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    cudaDeviceSynchronize();

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, NULL);

    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(N);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    cudaEventRecord(stop, NULL);

    cudaEventSynchronize(stop);

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("Kernel execution time: %f ms\n", milliseconds);

    float h_C[10]; // Just verify first 10 elements
    err = cudaMemcpyFromSymbol(h_C, d_C, 10 * sizeof(float));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy result array from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < 10; i++)
    {
        float expected = i + 2*i;
        if (fabs(h_C[i] - expected) > 1e-5)
        {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n");

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    err = cudaDeviceReset();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("Done\n");
    return 0;
}

Writing 1.2.cu


In [5]:
!nvcc 1.2.cu -o 1.2 -arch=sm_75
!./1.2

Vector addition of 1048576 elements
CUDA kernel launch with 4096 blocks of 256 threads
Kernel execution time: 0.077824 ms
Test PASSED
Done


In [6]:
!nvprof ./1.2

Vector addition of 1048576 elements
CUDA kernel launch with 4096 blocks of 256 threads
==1231== NVPROF is profiling process 1231, command: ./1.2
Kernel execution time: 0.064480 ms
Test PASSED
Done
==1231== Profiling application: ./1.2
==1231== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   57.85%  54.367us         1  54.367us  54.367us  54.367us  vectorAdd(int)
                   39.60%  37.215us         1  37.215us  37.215us  37.215us  initializeArrays(int)
                    2.55%  2.4000us         1  2.4000us  2.4000us  2.4000us  [CUDA memcpy DtoH]
      API calls:   77.93%  101.23ms         2  50.617ms  7.5140us  101.23ms  cudaLaunchKernel
                   21.83%  28.352ms         1  28.352ms  28.352ms  28.352ms  cudaDeviceReset
                    0.10%  130.02us       114  1.1400us     101ns  52.978us  cuDeviceGetAttribute
                    0.05%  70.918us         1  70.918us  70.918us  70.918us  cudaEve

# Question 1.3

In [7]:
%%writefile 1.3.cu
#include <stdio.h>
#include <cuda_runtime.h>
#define N 1048576
__device__ float d_A[N];
__device__ float d_B[N];
__device__ float d_C[N];

__global__ void vectorAdd(int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        d_C[i] = d_A[i] + d_B[i];
    }
}

__global__ void initializeArrays(int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        d_A[i] = i;
        d_B[i] = 2 * i;
    }
}

int main(void)
{
    cudaError_t err = cudaSuccess;

    printf("Vector addition of %d elements\n", N);

    cudaDeviceProp deviceProp;
    err = cudaGetDeviceProperties(&deviceProp, 0);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to get device properties (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    float memClockRate = deviceProp.memoryClockRate * 1000;
    float memBusWidth = deviceProp.memoryBusWidth;
    float theoreticalBW = (memClockRate * memBusWidth * 2) / 8.0f;
    theoreticalBW = theoreticalBW / 1.0e9f; // Convert to GB/s

    printf("Device: %s\n", deviceProp.name);
    printf("Memory Clock Rate (KHz): %d\n", deviceProp.memoryClockRate);
    printf("Memory Bus Width (bits): %d\n", deviceProp.memoryBusWidth);
    printf("Theoretical Bandwidth (GB/s): %f\n", theoreticalBW);

    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);

    initializeArrays<<<blocksPerGrid, threadsPerBlock>>>(N);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch initialize kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    cudaDeviceSynchronize();

    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(N);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    cudaDeviceSynchronize();

    float h_C[10]; // Just verify first 10 elements
    err = cudaMemcpyFromSymbol(h_C, d_C, 10 * sizeof(float));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy result array from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < 10; i++)
    {
        float expected = i + 2*i;
        if (fabs(h_C[i] - expected) > 1e-5)
        {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n");

    err = cudaDeviceReset();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("Done\n");
    return 0;
}

Writing 1.3.cu


In [8]:
!nvcc 1.3.cu -o 1.3 -arch=sm_75
!./1.3

Vector addition of 1048576 elements
Device: Tesla T4
Memory Clock Rate (KHz): 5001000
Memory Bus Width (bits): 256
Theoretical Bandwidth (GB/s): 45.186092
CUDA kernel launch with 4096 blocks of 256 threads
Test PASSED
Done


In [9]:
!nvprof ./1.3

Vector addition of 1048576 elements
==1509== NVPROF is profiling process 1509, command: ./1.3
Device: Tesla T4
Memory Clock Rate (KHz): 5001000
Memory Bus Width (bits): 256
Theoretical Bandwidth (GB/s): 45.186092
CUDA kernel launch with 4096 blocks of 256 threads
Test PASSED
Done
==1509== Profiling application: ./1.3
==1509== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   57.73%  54.271us         1  54.271us  54.271us  54.271us  vectorAdd(int)
                   39.79%  37.407us         1  37.407us  37.407us  37.407us  initializeArrays(int)
                    2.48%  2.3360us         1  2.3360us  2.3360us  2.3360us  [CUDA memcpy DtoH]
      API calls:   78.64%  104.56ms         2  52.281ms  6.9880us  104.56ms  cudaLaunchKernel
                   21.08%  28.034ms         1  28.034ms  28.034ms  28.034ms  cudaDeviceReset
                    0.10%  130.51us       114  1.1440us     108ns  53.814us  cuDeviceGetAttribute


# Question 1.4


In [10]:
%%writefile 1.4.cu
#include <stdio.h>
#include <cuda_runtime.h>
#define N 1048576
__device__ float d_A[N];
__device__ float d_B[N];
__device__ float d_C[N];

__global__ void vectorAdd(int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        d_C[i] = d_A[i] + d_B[i];
    }
}

__global__ void initializeArrays(int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        d_A[i] = i;
        d_B[i] = 2 * i;
    }
}

int main(void)
{
    cudaError_t err = cudaSuccess;

    printf("Vector addition of %d elements\n", N);

    cudaDeviceProp deviceProp;
    err = cudaGetDeviceProperties(&deviceProp, 0);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to get device properties (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    float memClockRate = deviceProp.memoryClockRate * 1000;
    float memBusWidth = deviceProp.memoryBusWidth;
    float theoreticalBW = (memClockRate * memBusWidth * 2) / 8.0f;
    theoreticalBW = theoreticalBW / 1.0e9f; // Convert to GB/s

    printf("Device: %s\n", deviceProp.name);
    printf("Memory Clock Rate (KHz): %d\n", deviceProp.memoryClockRate);
    printf("Memory Bus Width (bits): %d\n", deviceProp.memoryBusWidth);
    printf("Theoretical Bandwidth (GB/s): %f\n", theoreticalBW);

    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);

    initializeArrays<<<blocksPerGrid, threadsPerBlock>>>(N);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch initialize kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    cudaDeviceSynchronize();

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, NULL);

    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(N);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    cudaEventRecord(stop, NULL);

    cudaEventSynchronize(stop);

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("Kernel execution time: %f ms\n", milliseconds);

    size_t bytesRead = 2 * sizeof(float) * N;
    size_t bytesWritten = sizeof(float) * N;
    float totalBytes = (float)(bytesRead + bytesWritten);
    float seconds = milliseconds / 1000.0f;
    float measuredBW = totalBytes / seconds / 1.0e9f;

    printf("Data read: %.2f MB\n", bytesRead / (1024.0f * 1024.0f));
    printf("Data written: %.2f MB\n", bytesWritten / (1024.0f * 1024.0f));
    printf("Measured Bandwidth (GB/s): %.2f\n", measuredBW);
    printf("Bandwidth utilization: %.2f%%\n", (measuredBW / theoreticalBW) * 100.0f);

    float h_C[10]; // Just verify first 10 elements
    err = cudaMemcpyFromSymbol(h_C, d_C, 10 * sizeof(float));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy result array from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < 10; i++)
    {
        float expected = i + 2*i;
        if (fabs(h_C[i] - expected) > 1e-5)
        {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n");

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    err = cudaDeviceReset();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("Done\n");
    return 0;
}

Writing 1.4.cu


In [11]:
!nvcc 1.4.cu -o 1.4 -arch=sm_75
!./1.4

Vector addition of 1048576 elements
Device: Tesla T4
Memory Clock Rate (KHz): 5001000
Memory Bus Width (bits): 256
Theoretical Bandwidth (GB/s): 45.186092
CUDA kernel launch with 4096 blocks of 256 threads
Kernel execution time: 0.074080 ms
Data read: 8.00 MB
Data written: 4.00 MB
Measured Bandwidth (GB/s): 169.86
Bandwidth utilization: 375.90%
Test PASSED
Done


In [12]:
!nvprof ./1.4

Vector addition of 1048576 elements
==1748== NVPROF is profiling process 1748, command: ./1.4
Device: Tesla T4
Memory Clock Rate (KHz): 5001000
Memory Bus Width (bits): 256
Theoretical Bandwidth (GB/s): 45.186092
CUDA kernel launch with 4096 blocks of 256 threads
Kernel execution time: 0.065888 ms
Data read: 8.00 MB
Data written: 4.00 MB
Measured Bandwidth (GB/s): 190.97
Bandwidth utilization: 422.64%
Test PASSED
Done
==1748== Profiling application: ./1.4
==1748== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   57.14%  54.143us         1  54.143us  54.143us  54.143us  vectorAdd(int)
                   40.32%  38.207us         1  38.207us  38.207us  38.207us  initializeArrays(int)
                    2.53%  2.4000us         1  2.4000us  2.4000us  2.4000us  [CUDA memcpy DtoH]
      API calls:   69.93%  91.187ms         2  45.594ms  8.4240us  91.179ms  cudaLaunchKernel
                   29.75%  38.791ms         1  38.