In [53]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [54]:
!nvidia-smi

Sat Jan  6 08:06:40 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   59C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [55]:
%%writefile lab4_ex3.cu
#include <cuda_runtime_api.h>
#include <math.h>
#include <stdlib.h>
#include <sys/time.h>
#include <cusparse_v2.h>
#include <cublas_v2.h>
#include <thrust/device_ptr.h>
#include <thrust/sequence.h>

// Macro to check the CUDA runtime status.
// Evaluates stmt once. On failure it prints the failing statement, the source
// location and the runtime's description of the error, then terminates the
// program: every later step depends on the failed call, so continuing would
// only produce secondary faults or wrong results.
#define gpuCheck(stmt)                                               \
  do {                                                               \
      cudaError_t gpuCheckErr_ = (stmt);                             \
      if (gpuCheckErr_ != cudaSuccess) {                             \
          printf("ERROR. Failed to run stmt %s\n", #stmt);           \
          printf("       at %s:%d: %s\n", __FILE__, __LINE__,        \
                 cudaGetErrorString(gpuCheckErr_));                  \
          exit(EXIT_FAILURE);                                        \
      }                                                              \
  } while (0)

// Macro to check the cuBLAS status.
// Evaluates stmt once. On failure it prints the failing statement, the source
// location and the numeric cublasStatus_t value, then terminates the program
// because the remaining computation depends on the failed call.
#define cublasCheck(stmt)                                            \
  do {                                                               \
      cublasStatus_t cublasCheckErr_ = (stmt);                       \
      if (cublasCheckErr_ != CUBLAS_STATUS_SUCCESS) {                \
          printf("ERROR. Failed to run cuBLAS stmt %s\n", #stmt);    \
          printf("       at %s:%d: cublasStatus_t = %d\n",           \
                 __FILE__, __LINE__, (int)cublasCheckErr_);          \
          exit(EXIT_FAILURE);                                        \
      }                                                              \
  } while (0)

// Macro to check the cuSPARSE status.
// Evaluates stmt once. On failure it prints the failing statement, the source
// location and cuSPARSE's description of the status, then terminates the
// program because the remaining computation depends on the failed call.
#define cusparseCheck(stmt)                                          \
  do {                                                               \
      cusparseStatus_t cusparseCheckErr_ = (stmt);                   \
      if (cusparseCheckErr_ != CUSPARSE_STATUS_SUCCESS) {            \
          printf("ERROR. Failed to run cuSPARSE stmt %s\n", #stmt);  \
          printf("       at %s:%d: %s\n", __FILE__, __LINE__,        \
                 cusparseGetErrorString(cusparseCheckErr_));         \
          exit(EXIT_FAILURE);                                        \
      }                                                              \
  } while (0)


// Wall-clock timestamps shared by cputimer_start() and cputimer_stop().
struct timeval t_start, t_end;

// Record the current wall-clock time as the beginning of a timed region.
void cputimer_start() {
  gettimeofday(&t_start, nullptr);
}

// Record the current wall-clock time as the end of the timed region and print
// the microseconds elapsed since the last cputimer_start(), labelled with info.
void cputimer_stop(const char* info) {
  gettimeofday(&t_end, nullptr);

  // Whole seconds converted to microseconds, then the sub-second remainder.
  const double wholeSecondsUs = 1000000.0 * (t_end.tv_sec - t_start.tv_sec);
  const double elapsedUs = wholeSecondsUs + t_end.tv_usec - t_start.tv_usec;

  printf("Timing - %s. \t\tElasped %.0f microseconds \n", info, elapsedUs);
}

// Initialize the sparse matrix needed for the heat time step.
//
// Fills a dimX x dimX matrix in CSR format with zero-based indices. Rows
// 1 .. dimX-2 receive the stencil (1, -2, 1) at columns i-1, i, i+1; the
// first and the last row are left empty.
//
// Parameters:
//   A        - output, non-zero values; 3 * (dimX - 2) entries are written
//   ArowPtr  - output, row pointers; entries 0 .. dimX are written
//   AcolIndx - output, column indices; same number of entries as A
//   dimX     - number of rows (and columns) of the matrix
//   alpha    - not used by this routine; kept so existing callers still compile
void matrixInit(double* A, int* ArowPtr, int* AcolIndx, int dimX,
    double alpha) {
  (void)alpha;
  // Stencil from the finite difference discretization of the equation
  const double stencil[] = { 1, -2, 1 };
  // Variable holding the position to insert a new element
  int ptr = 0;
  // A CSR row-pointer array must start at the index base. The caller's buffer
  // is not guaranteed to be zero-filled, so write the first entry explicitly.
  ArowPtr[0] = 0;
  // Nothing else to describe for an empty matrix
  if (dimX < 1) {
    return;
  }
  // Insert a row of zeros at the beginning of the matrix
  ArowPtr[1] = ptr;
  // Fill the non zero entries of the matrix
  for (int i = 1; i < (dimX - 1); ++i) {
    // Insert the elements: A[i][i-1], A[i][i], A[i][i+1]
    for (int k = 0; k < 3; ++k) {
      // Set the value for A[i][i+k-1]
      A[ptr] = stencil[k];
      // Set the column index for A[i][i+k-1]
      AcolIndx[ptr++] = i + k - 1;
    }
    // Row i ends (and row i+1 starts) at the current insert position
    ArowPtr[i + 1] = ptr;
  }
  // Insert a row of zeros at the end of the matrix
  ArowPtr[dimX] = ptr;
}

// Program entry point.
//
// Usage: <program> <dimX> <nsteps> <prefetch>
//   dimX     - number of grid points along the rod (must be at least 3)
//   nsteps   - maximum number of time steps to perform
//   prefetch - non-zero to prefetch the sparse matrix arrays in Unified Memory
//
// Builds the stencil matrix in Unified Memory, repeatedly applies
// temp = temp + alpha * A * temp with cuSPARSE (SpMV) and cuBLAS (axpy), then
// prints the relative difference to the linear profile between the two
// boundary temperatures, together with timings for each phase.
// Returns 0 on success and 1 when the command line is invalid.
int main(int argc, char **argv) {
  int device = 0;            // Device to be used
  int dimX;                  // Dimension of the metal rod
  int nsteps;                // Number of time steps to perform
  double alpha = 0.4;        // Diffusion coefficient
  double* temp;              // Array to store the final time step
  double* A;                 // Sparse matrix A values in the CSR format
  int* ARowPtr;              // Sparse matrix A row pointers in the CSR format
  int* AColIndx;             // Sparse matrix A col values in the CSR format
  int nzv;                   // Number of non zero values in the sparse matrix
  double* tmp;               // Temporal array of dimX for computations
  size_t bufferSize = 0;     // Buffer size needed by some routines
  void* buffer = nullptr;    // Buffer used by some routines in the libraries
  int concurrentAccessQ;     // Check if concurrent access flag is set
  double zero = 0;           // Zero constant
  double one = 1;            // One constant
  double minusOne = -1;      // Minus one constant
  double norm;               // Variable for norm values
  double error;              // Variable for storing the relative error
  double tempLeft = 200.;    // Left heat source applied to the rod
  double tempRight = 300.;   // Right heat source applied to the rod
  cublasHandle_t cublasHandle;      // cuBLAS handle
  cusparseHandle_t cusparseHandle;  // cuSPARSE handle
  cusparseSpMatDescr_t Adescriptor;   // Mat descriptor needed by cuSPARSE
  cusparseDnVecDescr_t tempDescriptor;  // Dense vector descriptor for temp
  cusparseDnVecDescr_t tmpDescriptor;   // Dense vector descriptor for tmp
  int prefetch;              // Non-zero to prefetch the Unified Memory arrays
  struct timeval total_time_start, total_time_end;
  gettimeofday(&total_time_start, 0);

  // Refuse to run without the three required arguments instead of reading
  // past the end of argv
  if (argc < 4) {
    printf("Usage: %s <dimX> <nsteps> <prefetch: 0|1>\n",
        argc > 0 ? argv[0] : "lab4_ex3");
    return 1;
  }

  // Read the arguments from the command line
  dimX = atoi(argv[1]);
  nsteps = atoi(argv[2]);
  prefetch = atoi(argv[3]);

  // With fewer than three grid points there is no interior row: nzv below
  // would be zero or negative and the allocations would be invalid
  if (dimX < 3) {
    printf("ERROR. dimX must be at least 3 (got %d)\n", dimX);
    return 1;
  }

  // Print input arguments
  printf("The X dimension of the grid is %d \n", dimX);
  printf("The number of time steps to perform is %d \n", nsteps);

  // Get if the cudaDevAttrConcurrentManagedAccess flag is set
  gpuCheck(cudaDeviceGetAttribute(&concurrentAccessQ, cudaDevAttrConcurrentManagedAccess, device));

  // Calculate the number of non zero values in the sparse matrix. This number
  // is known from the structure of the sparse matrix
  nzv = 3 * dimX - 6;

  //@@ Insert the code to allocate the temp, tmp and the sparse matrix
  //@@ arrays using Unified Memory
  cputimer_start();
  gpuCheck(cudaMallocManaged(&temp, dimX * sizeof(double)));
  gpuCheck(cudaMallocManaged(&tmp, dimX * sizeof(double)));
  gpuCheck(cudaMallocManaged(&A, nzv * sizeof(double)));
  gpuCheck(cudaMallocManaged(&ARowPtr, (dimX + 1) * sizeof(int)));
  gpuCheck(cudaMallocManaged(&AColIndx, nzv * sizeof(int)));

  cputimer_stop("Allocating device memory");

  // Check if concurrentAccessQ is non zero in order to prefetch memory
  if (concurrentAccessQ && prefetch) {
    cputimer_start();
    //@@ Insert code to prefetch in Unified Memory asynchronously to CPU
    gpuCheck(cudaMemPrefetchAsync(A, nzv * sizeof(double), cudaCpuDeviceId));
    gpuCheck(cudaMemPrefetchAsync(ARowPtr, (dimX + 1) * sizeof(int), cudaCpuDeviceId));
    gpuCheck(cudaMemPrefetchAsync(AColIndx, nzv * sizeof(int), cudaCpuDeviceId));
    // The prefetches are asynchronous: wait for them so that the timer
    // covers the migration itself and not only the enqueue calls
    gpuCheck(cudaDeviceSynchronize());

    cputimer_stop("Prefetching GPU memory to the host");
  }

  // Initialize the sparse matrix
  cputimer_start();
  matrixInit(A, ARowPtr, AColIndx, dimX, alpha);
  cputimer_stop("Initializing the sparse matrix on the host");

  // Initialize the boundary conditions for the heat equation
  cputimer_start();
  memset(temp, 0, sizeof(double) * dimX);
  temp[0] = tempLeft;
  temp[dimX - 1] = tempRight;
  cputimer_stop("Initializing memory on the host");

  if (concurrentAccessQ && prefetch) {
    cputimer_start();
    //@@ Insert code to prefetch in Unified Memory asynchronously to the GPU
    gpuCheck(cudaMemPrefetchAsync(A, nzv * sizeof(double), device));
    gpuCheck(cudaMemPrefetchAsync(ARowPtr, (dimX + 1) * sizeof(int), device));
    gpuCheck(cudaMemPrefetchAsync(AColIndx, nzv * sizeof(int), device));
    // Wait for the migration to finish so that the timer is meaningful
    gpuCheck(cudaDeviceSynchronize());

    cputimer_stop("Prefetching GPU memory to the device");
  }

  //@@ Insert code to create the cuBLAS handle
  cublasCheck(cublasCreate(&cublasHandle));

  //@@ Insert code to create the cuSPARSE handle
  cusparseCheck(cusparseCreate(&cusparseHandle));

  //@@ Insert code to set the cuBLAS pointer mode to CUBLAS_POINTER_MODE_HOST
  //@@ (and the cuSPARSE pointer mode to CUSPARSE_POINTER_MODE_HOST), so that
  //@@ scalars such as alpha and the norm result live in host memory
  cublasCheck(cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_HOST));
  cusparseCheck(cusparseSetPointerMode(cusparseHandle, CUSPARSE_POINTER_MODE_HOST));

  //@@ Insert code to call cusparse api to create the mat descriptor used by cuSPARSE
  cusparseCheck(cusparseCreateCsr(&Adescriptor, dimX, dimX, nzv, ARowPtr, AColIndx, A, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F));
  // Each dense vector descriptor is created exactly once; creating it a
  // second time would overwrite the handle and leak the first descriptor
  cusparseCheck(cusparseCreateDnVec(&tempDescriptor, dimX, temp, CUDA_R_64F));
  cusparseCheck(cusparseCreateDnVec(&tmpDescriptor, dimX, tmp, CUDA_R_64F));

  //@@ Insert code to call cusparse api to get the buffer size needed by the sparse matrix per
  //@@ vector (SMPV) CSR routine of cuSPARSE
  cusparseCheck(cusparseSpMV_bufferSize(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, Adescriptor, tempDescriptor, &zero, tmpDescriptor, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize));


  //@@ Insert code to allocate the buffer needed by cuSPARSE
  gpuCheck(cudaMalloc(&buffer, bufferSize));

  cputimer_start();
  // Perform the time step iterations
  for (int it = 0; it < nsteps; ++it) {
    //@@ Insert code to call cusparse api to compute the SMPV (sparse matrix multiplication) for
    //@@ the CSR matrix using cuSPARSE. This calculation corresponds to:
    //@@ tmp = 1 * A * temp + 0 * tmp
    cusparseCheck(cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, Adescriptor, tempDescriptor, &zero, tmpDescriptor, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, buffer));

    //@@ Insert code to call cublas api to compute the axpy routine using cuBLAS.
    //@@ This calculation corresponds to: temp = alpha * tmp + temp
    cublasCheck(cublasDaxpy_v2(cublasHandle, dimX, &alpha, tmp, 1, temp, 1));

    //@@ Insert code to call cublas api to compute the norm of the vector using cuBLAS
    //@@ This calculation corresponds to: ||temp||
    cublasCheck(cublasDnrm2_v2(cublasHandle, dimX, temp, 1, &norm));

    // Exit the loop when the norm drops below 10^-4.
    // NOTE(review): the original comment spoke of the norm of A*temp (tmp),
    // but the value tested here is ||temp||; confirm which one is intended.
    if (norm < 1e-4)
      break;
  }
  cputimer_stop("Performing the time step iterations");

  // Calculate the exact solution using thrust
  thrust::device_ptr<double> thrustPtr(tmp);
  thrust::sequence(thrustPtr, thrustPtr + dimX, tempLeft,
      (tempRight - tempLeft) / (dimX - 1));

  // Calculate the relative approximation error:
  //@@ Insert the code to call cublas api to compute the difference between the exact solution
  //@@ and the approximation
  //@@ This calculation corresponds to: tmp = -temp + tmp
  cublasCheck(cublasDaxpy_v2(cublasHandle, dimX, &minusOne, temp, 1, tmp, 1));

  //@@ Insert the code to call cublas api to compute the norm of the absolute error
  //@@ This calculation corresponds to: || tmp ||
  cublasCheck(cublasDnrm2_v2(cublasHandle, dimX, tmp, 1, &norm));

  error = norm;
  //@@ Insert the code to call cublas api to compute the norm of temp
  //@@ This calculation corresponds to: || temp ||
  cublasCheck(cublasDnrm2_v2(cublasHandle, dimX, temp, 1, &norm));

  // Calculate the relative error
  error = error / norm;
  printf("The relative error of the approximation is %f\n", error);

  //@@ Insert the code to destroy the mat descriptor
  cusparseCheck(cusparseDestroySpMat(Adescriptor));
  cusparseCheck(cusparseDestroyDnVec(tempDescriptor));
  cusparseCheck(cusparseDestroyDnVec(tmpDescriptor));

  //@@ Insert the code to destroy the cuSPARSE handle
  cusparseCheck(cusparseDestroy(cusparseHandle));

  //@@ Insert the code to destroy the cuBLAS handle
  cublasCheck(cublasDestroy(cublasHandle));

  //@@ Insert the code for deallocating memory
  gpuCheck(cudaFree(temp));
  gpuCheck(cudaFree(tmp));
  gpuCheck(cudaFree(A));
  gpuCheck(cudaFree(buffer));
  gpuCheck(cudaFree(ARowPtr));
  gpuCheck(cudaFree(AColIndx));

  gettimeofday(&total_time_end, 0);
  double total_time = (1000000.0*(total_time_end.tv_sec-total_time_start.tv_sec) + total_time_end.tv_usec-total_time_start.tv_usec);
  printf("Total time: %.0f microseconds \n", total_time);

  return 0;
}

Overwriting lab4_ex3.cu


In [56]:
!nvcc lab4_ex3.cu -lcublas -lcusparse

In [57]:
!./a.out 128 100 1

The X dimension of the grid is 128 
The number of time steps to perform is 100 
Timing - Allocating device memory. 		Elasped 83024 microseconds 
Timing - Prefetching GPU memory to the host. 		Elasped 461 microseconds 
Timing - Initializing the sparse matrix on the host. 		Elasped 4 microseconds 
Timing - Initializing memory on the host. 		Elasped 0 microseconds 
Timing - Prefetching GPU memory to the device. 		Elasped 358 microseconds 
Timing - Performing the time step iterations. 		Elasped 21894 microseconds 
The relative error of the approximation is 3.318021
Total time: 144660 microseconds 


In [58]:
!./a.out 256 100 1

The X dimension of the grid is 256 
The number of time steps to perform is 100 
Timing - Allocating device memory. 		Elasped 73137 microseconds 
Timing - Prefetching GPU memory to the host. 		Elasped 448 microseconds 
Timing - Initializing the sparse matrix on the host. 		Elasped 3 microseconds 
Timing - Initializing memory on the host. 		Elasped 12 microseconds 
Timing - Prefetching GPU memory to the device. 		Elasped 364 microseconds 
Timing - Performing the time step iterations. 		Elasped 25646 microseconds 
The relative error of the approximation is 4.924835
Total time: 138362 microseconds 


In [59]:
!./a.out 512 100 1

The X dimension of the grid is 512 
The number of time steps to perform is 100 
Timing - Allocating device memory. 		Elasped 65147 microseconds 
Timing - Prefetching GPU memory to the host. 		Elasped 453 microseconds 
Timing - Initializing the sparse matrix on the host. 		Elasped 12 microseconds 
Timing - Initializing memory on the host. 		Elasped 16 microseconds 
Timing - Prefetching GPU memory to the device. 		Elasped 431 microseconds 
Timing - Performing the time step iterations. 		Elasped 25984 microseconds 
The relative error of the approximation is 7.123992
Total time: 130214 microseconds 


In [60]:
!./a.out 64 100 1

The X dimension of the grid is 64 
The number of time steps to perform is 100 
Timing - Allocating device memory. 		Elasped 61675 microseconds 
Timing - Prefetching GPU memory to the host. 		Elasped 426 microseconds 
Timing - Initializing the sparse matrix on the host. 		Elasped 1 microseconds 
Timing - Initializing memory on the host. 		Elasped 0 microseconds 
Timing - Prefetching GPU memory to the device. 		Elasped 355 microseconds 
Timing - Performing the time step iterations. 		Elasped 21301 microseconds 
The relative error of the approximation is 2.096374
Total time: 122359 microseconds 


In [61]:
!./a.out 32 100 1

The X dimension of the grid is 32 
The number of time steps to perform is 100 
Timing - Allocating device memory. 		Elasped 63661 microseconds 
Timing - Prefetching GPU memory to the host. 		Elasped 428 microseconds 
Timing - Initializing the sparse matrix on the host. 		Elasped 1 microseconds 
Timing - Initializing memory on the host. 		Elasped 1 microseconds 
Timing - Prefetching GPU memory to the device. 		Elasped 355 microseconds 
Timing - Performing the time step iterations. 		Elasped 22127 microseconds 
The relative error of the approximation is 1.049938
Total time: 126065 microseconds 


In [62]:
!./a.out 16 100 1

The X dimension of the grid is 16 
The number of time steps to perform is 100 
Timing - Allocating device memory. 		Elasped 62930 microseconds 
Timing - Prefetching GPU memory to the host. 		Elasped 440 microseconds 
Timing - Initializing the sparse matrix on the host. 		Elasped 1 microseconds 
Timing - Initializing memory on the host. 		Elasped 1 microseconds 
Timing - Prefetching GPU memory to the device. 		Elasped 384 microseconds 
Timing - Performing the time step iterations. 		Elasped 20536 microseconds 
The relative error of the approximation is 0.168723
Total time: 127398 microseconds 


In [63]:
!./a.out 128 100 1

The X dimension of the grid is 128 
The number of time steps to perform is 100 
Timing - Allocating device memory. 		Elasped 62005 microseconds 
Timing - Prefetching GPU memory to the host. 		Elasped 449 microseconds 
Timing - Initializing the sparse matrix on the host. 		Elasped 4 microseconds 
Timing - Initializing memory on the host. 		Elasped 0 microseconds 
Timing - Prefetching GPU memory to the device. 		Elasped 409 microseconds 
Timing - Performing the time step iterations. 		Elasped 21522 microseconds 
The relative error of the approximation is 3.318021
Total time: 122665 microseconds 


In [64]:
!./a.out 128 200 1

The X dimension of the grid is 128 
The number of time steps to perform is 200 
Timing - Allocating device memory. 		Elasped 64872 microseconds 
Timing - Prefetching GPU memory to the host. 		Elasped 435 microseconds 
Timing - Initializing the sparse matrix on the host. 		Elasped 2 microseconds 
Timing - Initializing memory on the host. 		Elasped 0 microseconds 
Timing - Prefetching GPU memory to the device. 		Elasped 391 microseconds 
Timing - Performing the time step iterations. 		Elasped 36213 microseconds 
The relative error of the approximation is 2.723174
Total time: 140230 microseconds 


In [65]:
!./a.out 128 500 1

The X dimension of the grid is 128 
The number of time steps to perform is 500 
Timing - Allocating device memory. 		Elasped 68528 microseconds 
Timing - Prefetching GPU memory to the host. 		Elasped 445 microseconds 
Timing - Initializing the sparse matrix on the host. 		Elasped 4 microseconds 
Timing - Initializing memory on the host. 		Elasped 0 microseconds 
Timing - Prefetching GPU memory to the device. 		Elasped 374 microseconds 
Timing - Performing the time step iterations. 		Elasped 77897 microseconds 
The relative error of the approximation is 2.002595
Total time: 184664 microseconds 


In [66]:
!./a.out 128 1000 1

The X dimension of the grid is 128 
The number of time steps to perform is 1000 
Timing - Allocating device memory. 		Elasped 69895 microseconds 
Timing - Prefetching GPU memory to the host. 		Elasped 459 microseconds 
Timing - Initializing the sparse matrix on the host. 		Elasped 3 microseconds 
Timing - Initializing memory on the host. 		Elasped 0 microseconds 
Timing - Prefetching GPU memory to the device. 		Elasped 379 microseconds 
Timing - Performing the time step iterations. 		Elasped 151056 microseconds 
The relative error of the approximation is 1.488118
Total time: 261012 microseconds 


In [67]:
!./a.out 128 2000 1

The X dimension of the grid is 128 
The number of time steps to perform is 2000 
Timing - Allocating device memory. 		Elasped 66405 microseconds 
Timing - Prefetching GPU memory to the host. 		Elasped 434 microseconds 
Timing - Initializing the sparse matrix on the host. 		Elasped 2 microseconds 
Timing - Initializing memory on the host. 		Elasped 0 microseconds 
Timing - Prefetching GPU memory to the device. 		Elasped 414 microseconds 
Timing - Performing the time step iterations. 		Elasped 212464 microseconds 
The relative error of the approximation is 0.963428
Total time: 318226 microseconds 


In [68]:
!./a.out 128 5000 1

The X dimension of the grid is 128 
The number of time steps to perform is 5000 
Timing - Allocating device memory. 		Elasped 67547 microseconds 
Timing - Prefetching GPU memory to the host. 		Elasped 434 microseconds 
Timing - Initializing the sparse matrix on the host. 		Elasped 4 microseconds 
Timing - Initializing memory on the host. 		Elasped 0 microseconds 
Timing - Prefetching GPU memory to the device. 		Elasped 233 microseconds 
Timing - Performing the time step iterations. 		Elasped 306223 microseconds 
The relative error of the approximation is 0.337648
Total time: 413433 microseconds 


In [69]:
!./a.out 128 10000 1

The X dimension of the grid is 128 
The number of time steps to perform is 10000 
Timing - Allocating device memory. 		Elasped 70342 microseconds 
Timing - Prefetching GPU memory to the host. 		Elasped 442 microseconds 
Timing - Initializing the sparse matrix on the host. 		Elasped 4 microseconds 
Timing - Initializing memory on the host. 		Elasped 1 microseconds 
Timing - Prefetching GPU memory to the device. 		Elasped 345 microseconds 
Timing - Performing the time step iterations. 		Elasped 607705 microseconds 
The relative error of the approximation is 0.082647
Total time: 716549 microseconds 


In [70]:
!./a.out 128 10000 0

The X dimension of the grid is 128 
The number of time steps to perform is 10000 
Timing - Allocating device memory. 		Elasped 62035 microseconds 
Timing - Initializing the sparse matrix on the host. 		Elasped 410 microseconds 
Timing - Initializing memory on the host. 		Elasped 1 microseconds 
Timing - Performing the time step iterations. 		Elasped 639149 microseconds 
The relative error of the approximation is 0.082647
Total time: 739856 microseconds 


In [71]:
!./a.out 128 1000 0

The X dimension of the grid is 128 
The number of time steps to perform is 1000 
Timing - Allocating device memory. 		Elasped 81778 microseconds 
Timing - Initializing the sparse matrix on the host. 		Elasped 463 microseconds 
Timing - Initializing memory on the host. 		Elasped 0 microseconds 
Timing - Performing the time step iterations. 		Elasped 81245 microseconds 
The relative error of the approximation is 1.488118
Total time: 214911 microseconds 


In [72]:
!./a.out 128 100 0

The X dimension of the grid is 128 
The number of time steps to perform is 100 
Timing - Allocating device memory. 		Elasped 86276 microseconds 
Timing - Initializing the sparse matrix on the host. 		Elasped 472 microseconds 
Timing - Initializing memory on the host. 		Elasped 0 microseconds 
Timing - Performing the time step iterations. 		Elasped 18652 microseconds 
The relative error of the approximation is 3.318021
Total time: 157242 microseconds 


In [73]:
!./a.out 512 100 0

The X dimension of the grid is 512 
The number of time steps to perform is 100 
Timing - Allocating device memory. 		Elasped 86664 microseconds 
Timing - Initializing the sparse matrix on the host. 		Elasped 453 microseconds 
Timing - Initializing memory on the host. 		Elasped 0 microseconds 
Timing - Performing the time step iterations. 		Elasped 21398 microseconds 
The relative error of the approximation is 7.123992
Total time: 160417 microseconds 


In [74]:
!./a.out 32 100 0

The X dimension of the grid is 32 
The number of time steps to perform is 100 
Timing - Allocating device memory. 		Elasped 93601 microseconds 
Timing - Initializing the sparse matrix on the host. 		Elasped 434 microseconds 
Timing - Initializing memory on the host. 		Elasped 1 microseconds 
Timing - Performing the time step iterations. 		Elasped 19215 microseconds 
The relative error of the approximation is 1.049938
Total time: 166002 microseconds 


In [75]:
!./a.out 512 10000 0

The X dimension of the grid is 512 
The number of time steps to perform is 10000 
Timing - Allocating device memory. 		Elasped 81550 microseconds 
Timing - Initializing the sparse matrix on the host. 		Elasped 452 microseconds 
Timing - Initializing memory on the host. 		Elasped 1 microseconds 
Timing - Performing the time step iterations. 		Elasped 993338 microseconds 
The relative error of the approximation is 1.878041
Total time: 1128752 microseconds 


In [76]:
!./a.out 512 10000 1

The X dimension of the grid is 512 
The number of time steps to perform is 10000 
Timing - Allocating device memory. 		Elasped 67536 microseconds 
Timing - Prefetching GPU memory to the host. 		Elasped 437 microseconds 
Timing - Initializing the sparse matrix on the host. 		Elasped 12 microseconds 
Timing - Initializing memory on the host. 		Elasped 16 microseconds 
Timing - Prefetching GPU memory to the device. 		Elasped 280 microseconds 
Timing - Performing the time step iterations. 		Elasped 818097 microseconds 
The relative error of the approximation is 1.878041
Total time: 924366 microseconds 


In [77]:
!./a.out 512 50000 0

The X dimension of the grid is 512 
The number of time steps to perform is 50000 
Timing - Allocating device memory. 		Elasped 68558 microseconds 
Timing - Initializing the sparse matrix on the host. 		Elasped 421 microseconds 
Timing - Initializing memory on the host. 		Elasped 1 microseconds 
Timing - Performing the time step iterations. 		Elasped 3779774 microseconds 
The relative error of the approximation is 0.643318
Total time: 3886221 microseconds 


In [78]:
!./a.out 512 50000 1

The X dimension of the grid is 512 
The number of time steps to perform is 50000 
Timing - Allocating device memory. 		Elasped 63749 microseconds 
Timing - Prefetching GPU memory to the host. 		Elasped 461 microseconds 
Timing - Initializing the sparse matrix on the host. 		Elasped 11 microseconds 
Timing - Initializing memory on the host. 		Elasped 17 microseconds 
Timing - Prefetching GPU memory to the device. 		Elasped 224 microseconds 
Timing - Performing the time step iterations. 		Elasped 3791572 microseconds 
The relative error of the approximation is 0.643318
Total time: 3892902 microseconds 
