In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [2]:
!nvidia-smi

Thu Nov 16 23:53:09 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
%%writefile lab2_ex1.cu
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#define TPB 128
#define DataType double
#define DOUBLE_MIN -5
#define DOUBLE_MAX 5

// Element-wise vector addition: out[i] = in1[i] + in2[i] for 0 <= i < len.
//
// Launch layout: 1D grid of 1D blocks, one thread per element. The grid may
// contain more threads than elements (e.g. when the block count is rounded
// up); surplus threads do nothing.
//
// in1, in2, out must each point to at least `len` elements accessible from
// the device.
__global__ void vecAdd(DataType *in1, DataType *in2, DataType *out, int len) {
  //@@ Insert code to implement vector addition here
  // Compute the global index in 64 bits so blockIdx.x * blockDim.x cannot wrap.
  const long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
  // Guard the tail: threads past the end of the data must not touch memory.
  if (i < len) {
    out[i] = in1[i] + in2[i];
  }
}

//@@ Insert code to implement timer start
// Records the current wall-clock time into *start so that a later call to
// timerStop(start) can report the elapsed interval.
void timerStart(struct timeval *start) {
  // No timezone information is requested (second argument is NULL).
  (void)gettimeofday(start, NULL);
}

//@@ Insert code to implement timer stop
// Returns the wall-clock time in milliseconds that has elapsed since the
// instant stored in *start (as captured by gettimeofday).
double timerStop(struct timeval *start) {
  struct timeval now;
  gettimeofday(&now, NULL);

  // Whole seconds and the microsecond remainder are converted to
  // milliseconds separately, then combined.
  const double fromSeconds = (now.tv_sec - start->tv_sec) * 1000.0;
  const double fromMicros = (now.tv_usec - start->tv_usec) / 1000.0;
  return fromSeconds + fromMicros;
}

// Draws one value from rand() and maps it linearly onto the closed interval
// [min, max]: rand() == 0 yields min, rand() == RAND_MAX yields max.
double randDouble(double min, double max) {
  const double unit = rand() / (double)RAND_MAX;  // fraction in [0, 1]
  const double span = max - min;
  return min + unit * span;
}


// Terminates the program with a diagnostic naming the failed step when a
// CUDA runtime call did not return cudaSuccess.
static void checkCuda(cudaError_t status, const char *step) {
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", step, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Adds two randomly initialised vectors on the GPU, compares the result with
// a CPU reference and prints the host-to-device copy, kernel and
// device-to-host copy times in milliseconds.
//
// Usage: <program> <inputLength>
//   inputLength  number of elements per vector; must be a positive integer.
//
// Returns 0 on success; exits with a non-zero status on invalid arguments,
// host allocation failure or any CUDA error.
int main(int argc, char **argv) {

  int inputLength;
  DataType *hostInput1;
  DataType *hostInput2;
  DataType *hostOutput;
  DataType *resultRef;
  DataType *deviceInput1;
  DataType *deviceInput2;
  DataType *deviceOutput;

  struct timeval copyToDevice, copyFromDevice, kernelExecution;
  double copyToDeviceTime, copyFromDeviceTime, kernelExecutionTime;

  //@@ Insert code below to read in inputLength from args
  if (argc < 2) {
    fprintf(stderr, "Usage: %s <inputLength>\n", argc > 0 ? argv[0] : "a.out");
    return EXIT_FAILURE;
  }
  // strtol (unlike atoi) lets us detect non-numeric and out-of-range input.
  char *parseEnd = NULL;
  const long requested = strtol(argv[1], &parseEnd, 10);
  if (parseEnd == argv[1] || *parseEnd != '\0' || requested <= 0 || requested > INT_MAX) {
    fprintf(stderr, "inputLength must be a positive integer, got '%s'\n", argv[1]);
    return EXIT_FAILURE;
  }
  inputLength = (int)requested;

  printf("The input length is %d\n", inputLength);

  // Size of one vector in bytes, computed in size_t to avoid int overflow.
  const size_t bytes = (size_t)inputLength * sizeof(DataType);

  //@@ Insert code below to allocate Host memory for input and output
  hostInput1 = (DataType *)malloc(bytes);
  hostInput2 = (DataType *)malloc(bytes);
  hostOutput = (DataType *)malloc(bytes);
  resultRef = (DataType *)malloc(bytes);
  if (hostInput1 == NULL || hostInput2 == NULL || hostOutput == NULL || resultRef == NULL) {
    fprintf(stderr, "Host allocation of %zu bytes per vector failed\n", bytes);
    free(hostInput1);
    free(hostInput2);
    free(hostOutput);
    free(resultRef);
    return EXIT_FAILURE;
  }


  //@@ Insert code below to initialize hostInput1 and hostInput2 to random numbers, and create reference result in CPU
  for(int i = 0; i < inputLength; ++i) {
    hostInput1[i] = randDouble(DOUBLE_MIN, DOUBLE_MAX);
    hostInput2[i] = randDouble(DOUBLE_MIN, DOUBLE_MAX);
    resultRef[i] = hostInput1[i] + hostInput2[i];
  }


  //@@ Insert code below to allocate GPU memory here
  checkCuda(cudaMalloc((void **)&deviceInput1, bytes), "cudaMalloc(deviceInput1)");
  checkCuda(cudaMalloc((void **)&deviceInput2, bytes), "cudaMalloc(deviceInput2)");
  checkCuda(cudaMalloc((void **)&deviceOutput, bytes), "cudaMalloc(deviceOutput)");


  //@@ Insert code to below to Copy memory to the GPU here
  timerStart(&copyToDevice);
  checkCuda(cudaMemcpy(deviceInput1, hostInput1, bytes, cudaMemcpyHostToDevice),
            "copy of input 1 to device");
  checkCuda(cudaMemcpy(deviceInput2, hostInput2, bytes, cudaMemcpyHostToDevice),
            "copy of input 2 to device");
  copyToDeviceTime = timerStop(&copyToDevice);


  //@@ Initialize the 1D grid and block dimensions here
  // Ceil-divide in size_t so inputLength + TPB - 1 cannot overflow int.
  const unsigned int blockCount = (unsigned int)((bytes / sizeof(DataType) + TPB - 1) / TPB);
  dim3 DimGrid(blockCount, 1, 1);
  dim3 DimBlock(TPB, 1, 1);


  //@@ Launch the GPU Kernel here
  timerStart(&kernelExecution);
  vecAdd<<<DimGrid, DimBlock>>>(deviceInput1, deviceInput2, deviceOutput, inputLength);
  // Launch-configuration errors are only reported through cudaGetLastError.
  checkCuda(cudaGetLastError(), "vecAdd launch");
  // The launch is asynchronous; wait so the timer covers the execution and
  // so faults raised inside the kernel are reported here.
  checkCuda(cudaDeviceSynchronize(), "vecAdd execution");
  kernelExecutionTime = timerStop(&kernelExecution);


  //@@ Copy the GPU memory back to the CPU here
  timerStart(&copyFromDevice);
  checkCuda(cudaMemcpy(hostOutput, deviceOutput, bytes, cudaMemcpyDeviceToHost),
            "copy of output to host");
  copyFromDeviceTime = timerStop(&copyFromDevice);


  //@@ Insert code below to compare the output with the reference
  double diff = 0.0;
  for(int i = 0; i < inputLength; ++i) {
    // fabs is the floating-point absolute value; plain abs may pick the int overload.
    diff += fabs(hostOutput[i] - resultRef[i]);
  }
  printf("Average difference: %f\n\n", diff/(double)inputLength);

  printf("Copy to Device Time: %f ms\n", copyToDeviceTime);
  printf("Kernel Execution Time: %f ms\n", kernelExecutionTime);
  printf("Copy from Device Time: %f ms\n", copyFromDeviceTime);


  //@@ Free the GPU memory here
  checkCuda(cudaFree(deviceInput1), "cudaFree(deviceInput1)");
  checkCuda(cudaFree(deviceInput2), "cudaFree(deviceInput2)");
  checkCuda(cudaFree(deviceOutput), "cudaFree(deviceOutput)");

  //@@ Free the CPU memory here
  free(hostInput1);
  free(hostInput2);
  free(hostOutput);
  free(resultRef);

  return 0;
}

Writing lab2_ex1.cu


In [4]:
!nvcc lab2_ex1.cu
!ls
!./a.out 1024

a.out  lab2_ex1.cu  sample_data
The input length is 1024
Average difference: 0.000000

Copy to Device Time: 0.954000 ms
Kernel Execution Time: 0.074000 ms
Copy from Device Time: 0.030000 ms


In [5]:
!nvprof ./a.out 1024

The input length is 1024
==707== NVPROF is profiling process 707, command: ./a.out 1024
Average difference: 0.000000

Copy to Device Time: 0.033000 ms
Kernel Execution Time: 0.033000 ms
Copy from Device Time: 0.025000 ms
==707== Profiling application: ./a.out 1024
==707== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   41.14%  4.8310us         1  4.8310us  4.8310us  4.8310us  vecAdd(double*, double*, double*, int)
                   37.88%  4.4480us         2  2.2240us  2.0480us  2.4000us  [CUDA memcpy HtoD]
                   20.98%  2.4630us         1  2.4630us  2.4630us  2.4630us  [CUDA memcpy DtoH]
      API calls:   99.37%  215.08ms         3  71.695ms  2.3820us  215.08ms  cudaMalloc
                    0.46%  1.0061ms         1  1.0061ms  1.0061ms  1.0061ms  cuDeviceGetPCIBusId
                    0.06%  120.50us       101  1.1930us     140ns  56.891us  cuDeviceGetAttribute
                    0.05%  115.65us 

In [6]:
!ncu --set default --metrics sm__warps_active.avg.pct_of_peak_sustained_active  ./a.out 1024

The input length is 1024
==PROF== Connected to process 731 (/content/a.out)
==PROF== Profiling "vecAdd" - 0: 0%....50%....100% - 8 passes
Average difference: 0.000000

Copy to Device Time: 0.069000 ms
Kernel Execution Time: 371.724000 ms
Copy from Device Time: 0.055000 ms
==PROF== Disconnected from process 731
[731] a.out@127.0.0.1
  vecAdd(double *, double *, double *, int), 2023-Nov-16 23:53:13, Context 1, Stream 7
    Section: Command line profiler metrics
    ---------------------------------------------------------------------- --------------- ------------------------------
    sm__warps_active.avg.pct_of_peak_sustained_active                                    %                          12.12
    ---------------------------------------------------------------------- --------------- ------------------------------

    Section: GPU Speed Of Light Throughput
    ---------------------------------------------------------------------- --------------- ------------------------------
    

In [7]:
!ncu --set default --metrics sm__warps_active.avg.pct_of_peak_sustained_active  ./a.out 131070

The input length is 131070
==PROF== Connected to process 754 (/content/a.out)
==PROF== Profiling "vecAdd" - 0: 0%....50%....100% - 8 passes
Average difference: 0.000000

Copy to Device Time: 0.662000 ms
Kernel Execution Time: 517.237000 ms
Copy from Device Time: 0.985000 ms
==PROF== Disconnected from process 754
[754] a.out@127.0.0.1
  vecAdd(double *, double *, double *, int), 2023-Nov-16 23:53:15, Context 1, Stream 7
    Section: Command line profiler metrics
    ---------------------------------------------------------------------- --------------- ------------------------------
    sm__warps_active.avg.pct_of_peak_sustained_active                                    %                          77.43
    ---------------------------------------------------------------------- --------------- ------------------------------

    Section: GPU Speed Of Light Throughput
    ---------------------------------------------------------------------- --------------- ------------------------------
  

In [8]:
!./a.out 1024

The input length is 1024
Average difference: 0.000000

Copy to Device Time: 0.042000 ms
Kernel Execution Time: 0.028000 ms
Copy from Device Time: 0.023000 ms


In [9]:
!./a.out 8092

The input length is 8092
Average difference: 0.000000

Copy to Device Time: 0.088000 ms
Kernel Execution Time: 0.029000 ms
Copy from Device Time: 0.070000 ms


In [10]:
!./a.out 65000

The input length is 65000
Average difference: 0.000000

Copy to Device Time: 0.306000 ms
Kernel Execution Time: 0.061000 ms
Copy from Device Time: 0.421000 ms


In [11]:
!./a.out 250000

The input length is 250000
Average difference: 0.000000

Copy to Device Time: 1.047000 ms
Kernel Execution Time: 0.109000 ms
Copy from Device Time: 1.484000 ms


In [12]:
!./a.out 2000000

The input length is 2000000
Average difference: 0.000000

Copy to Device Time: 6.998000 ms
Kernel Execution Time: 0.230000 ms
Copy from Device Time: 12.088000 ms


In [13]:
!./a.out 120000

The input length is 120000
Average difference: 0.000000

Copy to Device Time: 0.570000 ms
Kernel Execution Time: 0.099000 ms
Copy from Device Time: 0.762000 ms


In [14]:
!./a.out 12000

The input length is 12000
Average difference: 0.000000

Copy to Device Time: 0.089000 ms
Kernel Execution Time: 0.027000 ms
Copy from Device Time: 0.082000 ms


In [15]:
!./a.out 4000

The input length is 4000
Average difference: 0.000000

Copy to Device Time: 0.051000 ms
Kernel Execution Time: 0.025000 ms
Copy from Device Time: 0.047000 ms


In [16]:
!./a.out 1

The input length is 1
Average difference: 0.000000

Copy to Device Time: 0.032000 ms
Kernel Execution Time: 0.025000 ms
Copy from Device Time: 0.016000 ms


In [17]:
!./a.out 128

The input length is 128
Average difference: 0.000000

Copy to Device Time: 0.034000 ms
Kernel Execution Time: 0.025000 ms
Copy from Device Time: 0.018000 ms


In [18]:
!./a.out 131070

The input length is 131070
Average difference: 0.000000

Copy to Device Time: 0.728000 ms
Kernel Execution Time: 0.105000 ms
Copy from Device Time: 0.864000 ms
