<a href="https://colab.research.google.com/github/mmmovania/CUDA_Spring_2024/blob/main/Week9/TestProfilers_nvprof_ncu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This Colab notebook shows you how to profile your CUDA kernel code to see what is going on inside it. Refer here for more details: https://docs.nvidia.com/cuda/profiler-users-guide/index.html#gpu-trace-and-api-trace-modes

https://docs.nvidia.com/nsight-compute/NsightComputeCli/index.html

https://docs.nvidia.com/nsight-compute/NsightComputeCli/index.html#nvprof-guide

In [None]:
%%writefile /content/test.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

/*
 * Straightforward (untiled) square matrix product: d_P = d_M * d_N.
 *
 * All three buffers are indexed as row-major Width x Width float arrays.
 * The launch is 2D: the x dimension selects the output column and the
 * y dimension selects the output row. Every thread writes at most one
 * element of d_P; threads mapped outside the matrix exit without touching
 * memory, so the grid may overshoot Width.
 */
__global__ void MatrixMulKernel(float* d_M, float* d_N, float* d_P, int Width) {
	// Output coordinates owned by this thread
	const int Col = blockIdx.x*blockDim.x + threadIdx.x;
	const int Row = blockIdx.y*blockDim.y + threadIdx.y;

	// Nothing to compute for threads past the right/bottom edge
	if (Row >= Width || Col >= Width)
		return;

	// Walk along row `Row` of d_M and down column `Col` of d_N
	const float* rowOfM = d_M + Row*Width;
	const float* colOfN = d_N + Col;

	float acc = 0;
	for (int k = 0; k < Width; ++k) {
		acc += rowOfM[k] * colOfN[k*Width];
	}

	d_P[Row*Width + Col] = acc;
}

/*
 * Shared-memory tiled square matrix product: d_P = d_M * d_N.
 *
 * All three buffers are indexed as row-major Width x Width float arrays.
 * The kernel stages one 16x16 tile of d_M and one of d_N in shared memory
 * per iteration and accumulates the partial dot product from those tiles.
 *
 * Launch requirements visible from the indexing: the block must be
 * TILE_WIDTH x TILE_WIDTH (16 x 16) threads, because threadIdx.x/y index the
 * tiles directly and the output coordinates are derived from TILE_WIDTH
 * rather than blockDim. Width does not have to be a multiple of 16:
 * out-of-range tile entries are filled with zero and out-of-range threads
 * skip the final store.
 */
__global__ void MatrixMulKernelTiled(float* d_M, float* d_N,
                                     float* d_P, int Width) {
	const int TILE_WIDTH = 16;
	__shared__ float tileM[TILE_WIDTH][TILE_WIDTH];
	__shared__ float tileN[TILE_WIDTH][TILE_WIDTH];

	const int tx = threadIdx.x;
	const int ty = threadIdx.y;

	// Element of d_P this thread is responsible for
	const int Row = blockIdx.y * TILE_WIDTH + ty;
	const int Col = blockIdx.x * TILE_WIDTH + tx;

	// Number of tiles needed to span Width (rounded up)
	const int tileCount = (TILE_WIDTH+Width-1)/TILE_WIDTH;

	float acc = 0;
	for (int m = 0; m < tileCount; ++m) {
		// Column of d_M / row of d_N that this thread loads for tile m
		const int srcCol = m*TILE_WIDTH + tx;
		const int srcRow = m*TILE_WIDTH + ty;

		// Zero-fill anything outside the matrix so it adds nothing to the sum
		tileM[ty][tx] = (srcCol < Width && Row < Width)
		                    ? d_M[Row*Width + srcCol]
		                    : 0.0f;
		tileN[ty][tx] = (srcRow < Width && Col < Width)
		                    ? d_N[srcRow*Width + Col]
		                    : 0.0f;

		// Both tiles must be fully written before any thread reads them
		__syncthreads();

		for (int k = 0; k < TILE_WIDTH; ++k) {
			acc += tileM[ty][k] * tileN[k][tx];
		}

		// Keep the tiles intact until every thread has finished reading them
		__syncthreads();
	}

	if (Row < Width && Col < Width)
		d_P[Row*Width + Col] = acc;
}

/*
 * CPU reference implementation of the square matrix product C = A * B.
 *
 * A, B and C are indexed as row-major N x N float arrays. For each output
 * element the products are accumulated for k = 0 .. N-1 in a single float.
 */
void MatrixMultHost(float* A, float* B, float* C, int N)
{
	for (int row = 0; row < N; ++row)
	{
		const float* aRow = A + row*N;
		float* cRow = C + row*N;

		for (int col = 0; col < N; ++col)
		{
			// Dot product of row `row` of A with column `col` of B
			float dot = 0;
			for (int k = 0; k < N; ++k)
			{
				dot += aRow[k] * B[k*N + col];
			}
			cRow[col] = dot;
		}
	}
}

// Abort with file/line context when a CUDA runtime call does not return
// cudaSuccess. Unchecked failures otherwise surface later as wrong results.
#define CUDA_CHECK(call)                                                  \
	do {                                                                  \
		cudaError_t err_ = (call);                                        \
		if (err_ != cudaSuccess) {                                        \
			fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
			        cudaGetErrorString(err_));                            \
			exit(EXIT_FAILURE);                                           \
		}                                                                 \
	} while (0)

// Returns 1 when the first `count` floats of both buffers compare equal
// element by element, 0 otherwise.
static int buffersMatch(const float* expected, const float* actual, int count)
{
	for (int i = 0; i < count; ++i)
	{
		if (expected[i] != actual[i])
		{
			return 0;
		}
	}
	return 1;
}

/*
 * Benchmark driver: multiplies two random N x N matrices with the untiled
 * kernel, the tiled kernel and the CPU reference, prints the elapsed time of
 * each, checks both GPU results against the CPU result and prints the
 * tiled-vs-untiled speedup.
 *
 * Returns 0 on completion (also when results mismatch; the mismatch is
 * reported on stdout). Exits with EXIT_FAILURE when a host allocation or any
 * CUDA runtime call fails. Command-line arguments are not used.
 */
int main(int argc, char** argv)
{
	(void)argc;
	(void)argv;

	const int N = 1000;
	const int SIZE = N*N;
	const size_t SIZE_IN_BYTES = SIZE * sizeof(float);

	float* h_A = (float*)malloc(SIZE_IN_BYTES);
	float* h_B = (float*)malloc(SIZE_IN_BYTES);
	float* h_C = (float*)malloc(SIZE_IN_BYTES);   // CPU reference result
	float* h_CD = (float*)malloc(SIZE_IN_BYTES);  // result of the untiled kernel
	float* h_CDT = (float*)malloc(SIZE_IN_BYTES); // result of the tiled kernel

	if (!h_A || !h_B || !h_C || !h_CD || !h_CDT)
	{
		fprintf(stderr, "Host memory allocation failed\n");
		// free(NULL) is a no-op, so releasing all five is safe
		free(h_A);
		free(h_B);
		free(h_C);
		free(h_CD);
		free(h_CDT);
		return EXIT_FAILURE;
	}

	// Initialize matrices on the host with integer values in [0, 1023]
	for (int i = 0; i < N; i++) {
		for (int j = 0; j < N; j++) {
			h_A[i*N + j] = (float) (rand() % 1024);
			h_B[i*N + j] = (float) (rand() % 1024);
		}
	}

	float* d_A = NULL;
	float* d_B = NULL;
	float* d_C = NULL;

	CUDA_CHECK(cudaMalloc(&d_A, SIZE_IN_BYTES));
	CUDA_CHECK(cudaMalloc(&d_B, SIZE_IN_BYTES));
	CUDA_CHECK(cudaMalloc(&d_C, SIZE_IN_BYTES));

	CUDA_CHECK(cudaMemcpy(d_A, h_A, SIZE_IN_BYTES, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(d_B, h_B, SIZE_IN_BYTES, cudaMemcpyHostToDevice));

	// 16x16 threads per block (the tiled kernel's tile size is also 16);
	// the grid is rounded up so it covers all N rows and columns.
	dim3    blocksGrid;
	dim3    threadsBlock(16, 16, 1);

	blocksGrid.x = (N + threadsBlock.x - 1) / threadsBlock.x;
	blocksGrid.y = (N + threadsBlock.y - 1) / threadsBlock.y;

	float gpu_elapsed_time_ms, cpu_elapsed_time_ms, gpu_elapsed_time_tiled_ms;

	// Events used to measure elapsed time
	cudaEvent_t start, stop;
	CUDA_CHECK(cudaEventCreate(&start));
	CUDA_CHECK(cudaEventCreate(&stop));

	// --- Untiled kernel -----------------------------------------------------
	// The timed interval covers the kernel and the device-to-host copy.
	CUDA_CHECK(cudaEventRecord(start, 0));

	MatrixMulKernel<<<blocksGrid, threadsBlock>>>(d_A, d_B, d_C, N);
	CUDA_CHECK(cudaGetLastError()); // catches invalid launch configurations

	CUDA_CHECK(cudaMemcpy(h_CD, d_C, SIZE_IN_BYTES, cudaMemcpyDeviceToHost));

	CUDA_CHECK(cudaEventRecord(stop, 0));
	CUDA_CHECK(cudaEventSynchronize(stop));

	CUDA_CHECK(cudaEventElapsedTime(&gpu_elapsed_time_ms, start, stop));
	printf("Time elapsed (GPU): %f ms.\n", gpu_elapsed_time_ms);

	// --- Tiled kernel -------------------------------------------------------
	// Same measurement as above; d_C is reused for the output.
	CUDA_CHECK(cudaEventRecord(start, 0));

	MatrixMulKernelTiled<<<blocksGrid, threadsBlock>>>(d_A, d_B, d_C, N);
	CUDA_CHECK(cudaGetLastError());

	CUDA_CHECK(cudaMemcpy(h_CDT, d_C, SIZE_IN_BYTES, cudaMemcpyDeviceToHost));

	CUDA_CHECK(cudaEventRecord(stop, 0));
	CUDA_CHECK(cudaEventSynchronize(stop));

	CUDA_CHECK(cudaEventElapsedTime(&gpu_elapsed_time_tiled_ms, start, stop));
	printf("Time elapsed (GPU Tiled): %f ms.\n", gpu_elapsed_time_tiled_ms);

	// --- CPU reference ------------------------------------------------------
	// NOTE(review): the host computation is bracketed with CUDA events, as in
	// the original; no GPU work is issued in between, so the reading is only
	// an approximation of host wall-clock time.
	CUDA_CHECK(cudaEventRecord(start, 0));
	MatrixMultHost(h_A, h_B, h_C, N);

	CUDA_CHECK(cudaEventRecord(stop, 0));
	CUDA_CHECK(cudaEventSynchronize(stop));
	CUDA_CHECK(cudaEventElapsedTime(&cpu_elapsed_time_ms, start, stop));
	printf("Time elapsed (CPU): %f ms.\n", cpu_elapsed_time_ms);

	// Validate the untiled GPU result against the CPU result (exact match)
	if (buffersMatch(h_C, h_CD, SIZE))
	{
		printf("All results are correct!!! (CPU vs GPU)\n");
	}
	else
	{
		printf("incorrect results\n");
	}

	// Validate the tiled GPU result against the CPU result (exact match)
	if (buffersMatch(h_C, h_CDT, SIZE))
	{
		printf("All results are correct!!! (CPU vs GPU Tiled)\n");
	}
	else
	{
		printf("incorrect results (CPU vs GPU Tiled)\n");
	}

	// Ratio of the two GPU timings measured above (each includes its copy)
	printf("Speedup: GPU (Tiled) vs GPU (Untiled): %3.3f\n", gpu_elapsed_time_ms/gpu_elapsed_time_tiled_ms);

	free(h_A);
	free(h_B);
	free(h_C);
	free(h_CD);
	free(h_CDT);

	CUDA_CHECK(cudaEventDestroy(start));
	CUDA_CHECK(cudaEventDestroy(stop));

	// cudaFree takes the device pointer itself, not the address of the
	// host variable that holds it.
	CUDA_CHECK(cudaFree(d_A));
	CUDA_CHECK(cudaFree(d_B));
	CUDA_CHECK(cudaFree(d_C));

	CUDA_CHECK(cudaDeviceReset());
	return 0;
}

Writing /content/test.cu


In [None]:
!nvcc /content/test.cu -o /content/test

In [None]:
!nvprof --help

Usage: nvprof [options] [application] [application-arguments]
Options:
       --aggregate-mode <on|off>
                        Turn on/off aggregate mode for events and metrics specified
                        by subsequent "--events" and "--metrics" options. Those
                        event/metric values will be collected for each domain instance,
                        instead of the whole device. Allowed values:
                        	on - turn on aggregate mode (default)
                        	off - turn off aggregate mode

       --analysis-metrics
                        Collect profiling data that can be imported to Visual Profiler's
                        "analysis" mode. Note: Use "--export-profile" to specify
                        an export file.

       --annotate-mpi <off|openmpi|mpich>
                        Automatically annotate MPI calls with NVTX markers. Specify
                        the MPI implementation installed on your machine. Currently,
        

In [None]:
!nvprof --query-metrics

                  Use NVIDIA Nsight Compute for GPU profiling and NVIDIA Nsight Systems for GPU tracing and CPU sampling.
                  Refer https://developer.nvidia.com/tools-overview for more details.

Available Metrics:
                            Name   Description


In [None]:
!nvprof ./test

==318== NVPROF is profiling process 318, command: ./test
Time elapsed (GPU): 9.886432 ms.
Time elapsed (GPU Tiled): 8.612448 ms.
Time elapsed (CPU): 4117.901855 ms.
All results are correct!!! (CPU vs GPU)
All results are correct!!! (CPU vs GPU Tiled)
Speedup: GPU (Tiled) vs GPU (Untiled): 1.148
==318== Profiling application: ./test
==318== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   39.99%  6.9790ms         1  6.9790ms  6.9790ms  6.9790ms  MatrixMulKernel(float*, float*, float*, int)
                   33.55%  5.8545ms         1  5.8545ms  5.8545ms  5.8545ms  MatrixMulKernelTiled(float*, float*, float*, int)
                   18.24%  3.1824ms         2  1.5912ms  1.5040ms  1.6785ms  [CUDA memcpy DtoH]
                    8.23%  1.4363ms         2  718.13us  696.59us  739.66us  [CUDA memcpy HtoD]
      API calls:   86.79%  559.27ms         3  186.42ms  80.504us  559.10ms  cudaMalloc
                    9.84%  63

In [None]:
!nvprof -o results.nvprof --analysis-metrics ./test

                  Use NVIDIA Nsight Compute for GPU profiling and NVIDIA Nsight Systems for GPU tracing and CPU sampling.
                  Refer https://developer.nvidia.com/tools-overview for more details.

==353== NVPROF is profiling process 353, command: ./test
Time elapsed (GPU): 9.747904 ms.
Time elapsed (GPU Tiled): 8.604448 ms.
Time elapsed (CPU): 4041.955078 ms.
All results are correct!!! (CPU vs GPU)
All results are correct!!! (CPU vs GPU Tiled)
Speedup: GPU (Tiled) vs GPU (Untiled): 1.133
==353== Generated result file: /content/results.nvprof


In [None]:
!nvprof --print-gpu-trace ./test

==385== NVPROF is profiling process 385, command: ./test
Time elapsed (GPU): 9.735904 ms.
Time elapsed (GPU Tiled): 8.575360 ms.
Time elapsed (CPU): 4961.250000 ms.
All results are correct!!! (CPU vs GPU)
All results are correct!!! (CPU vs GPU Tiled)
Speedup: GPU (Tiled) vs GPU (Untiled): 1.135
==385== Profiling application: ./test
==385== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
413.46ms  722.93us                    -               -         -         -         -  3.8147MB  5.1531GB/s    Pageable      Device     Tesla T4 (0)         1         7  [CUDA memcpy HtoD]
414.41ms  726.00us                    -               -         -         -         -  3.8147MB  5.1313GB/s    Pageable      Device     Tesla T4 (0)         1         7  [CUDA memcpy HtoD]
415.14ms  6.9992ms            (63 63 1)       (16 16 1)        49        0B        0B  

In [None]:
!nvprof --metrics gld_throughput ./test

                  Use NVIDIA Nsight Compute for GPU profiling and NVIDIA Nsight Systems for GPU tracing and CPU sampling.
                  Refer https://developer.nvidia.com/tools-overview for more details.

==466== NVPROF is profiling process 466, command: ./test
Time elapsed (GPU): 9.723744 ms.
Time elapsed (GPU Tiled): 8.677184 ms.
Time elapsed (CPU): 4070.635254 ms.
All results are correct!!! (CPU vs GPU)
All results are correct!!! (CPU vs GPU Tiled)
Speedup: GPU (Tiled) vs GPU (Untiled): 1.121
==466== Profiling application: ./test
==466== Profiling result:
No events/metrics were profiled.


In [None]:
!ncu -o profile ./test

==PROF== Connected to process 1427 (/content/test)
==PROF== Profiling "MatrixMulKernel" - 1: 0%....50%....100% - 8 passes
Time elapsed (GPU): 726.418762 ms.
==PROF== Profiling "MatrixMulKernelTiled" - 2: 0%....50%....100% - 8 passes
Time elapsed (GPU Tiled): 373.246368 ms.
Time elapsed (CPU): 4975.893066 ms.
All results are correct!!! (CPU vs GPU)
All results are correct!!! (CPU vs GPU Tiled)
Speedup: GPU (Tiled) vs GPU (Untiled): 1.946
==PROF== Disconnected from process 1427
==PROF== Report: /content/profile.ncu-rep


In [None]:
!ncu --query-metrics

Device TU104
--------------------------------------------------------------------------- ----------------------------------------------------------------------------------------------------
Metric Name                                                                 Metric Description                                                                                  
--------------------------------------------------------------------------- ----------------------------------------------------------------------------------------------------
dram__bytes                                                                 # of bytes accessed in DRAM                                                                         
dram__bytes_read                                                            # of bytes read from DRAM                                                                           
dram__bytes_write                                                           # of bytes written to DRAM

In [None]:
!ncu --list-chips

ga100, ga102, ga104, ga106, ga107, gv100, gv11b, tu102, tu104, tu106, tu116, tu117


In [None]:
!ncu --list-sets

---------- --------------------------------------------------------------------------- ------- -----------------
Identifier Sections                                                                    Enabled Estimated Metrics
---------- --------------------------------------------------------------------------- ------- -----------------
default    LaunchStats, Occupancy, SpeedOfLight                                        yes     36               
detailed   ComputeWorkloadAnalysis, InstructionStats, LaunchStats, MemoryWorkloadAnaly no      173              
           sis, Nvlink, Occupancy, SchedulerStats, SourceCounters, SpeedOfLight, Speed                          
           OfLight_RooflineChart, WarpStateStats                                                                
full       ComputeWorkloadAnalysis, InstructionStats, LaunchStats, MemoryWorkloadAnaly no      178              
           sis, MemoryWorkloadAnalysis_Chart, MemoryWorkloadAnalysis_Tables, Nvlink, O          

In [None]:
!ncu --list-sections

--------------------------------- ------------------------------------- ------- --------------------------------------------------
Identifier                        Display Name                          Enabled Filename                                          
--------------------------------- ------------------------------------- ------- --------------------------------------------------
ComputeWorkloadAnalysis           Compute Workload Analysis             no      ...20.3.1/Sections/ComputeWorkloadAnalysis.section
InstructionStats                  Instruction Statistics                no      ...2020.3.1/Sections/InstructionStatistics.section
LaunchStats                       Launch Statistics                     yes     ...pute/2020.3.1/Sections/LaunchStatistics.section
MemoryWorkloadAnalysis            Memory Workload Analysis              no      ...020.3.1/Sections/MemoryWorkloadAnalysis.section
MemoryWorkloadAnalysis_Chart      Memory Workload Analysis Chart        no      ...

In [None]:
!ncu --section=ComputeWorkloadAnalysis ./test

==PROF== Connected to process 5454 (/content/test)
==PROF== Profiling "MatrixMulKernel" - 1: 0%....50%....100% - 6 passes
Time elapsed (GPU): 408.344818 ms.
==PROF== Profiling "MatrixMulKernelTiled" - 2: 0%....50%....100% - 6 passes
Time elapsed (GPU Tiled): 220.979065 ms.
Time elapsed (CPU): 4123.012207 ms.
All results are correct!!! (CPU vs GPU)
All results are correct!!! (CPU vs GPU Tiled)
Speedup: GPU (Tiled) vs GPU (Untiled): 1.848
==PROF== Disconnected from process 5454
[5454] test@127.0.0.1
  MatrixMulKernel(float*, float*, float*, int), 2023-Feb-08 16:11:36, Context 1, Stream 7
    Section: Compute Workload Analysis
    ---------------------------------------------------------------------- --------------- ------------------------------
    Executed Ipc Active                                                         inst/cycle                           0.86
    Executed Ipc Elapsed                                                        inst/cycle                           0.85
  

In [None]:
!ncu --section=Occupancy ./test

==PROF== Connected to process 5699 (/content/test)
==PROF== Profiling "MatrixMulKernel" - 1: 0%....50%....100% - 1 pass
Time elapsed (GPU): 234.239227 ms.
==PROF== Profiling "MatrixMulKernelTiled" - 2: 0%....50%....100% - 1 pass
Time elapsed (GPU Tiled): 29.864288 ms.
Time elapsed (CPU): 5084.123047 ms.
All results are correct!!! (CPU vs GPU)
All results are correct!!! (CPU vs GPU Tiled)
Speedup: GPU (Tiled) vs GPU (Untiled): 7.843
==PROF== Disconnected from process 5699
[5699] test@127.0.0.1
  MatrixMulKernel(float*, float*, float*, int), 2023-Feb-08 16:12:30, Context 1, Stream 7
    Section: Occupancy
    ---------------------------------------------------------------------- --------------- ------------------------------
    Block Limit SM                                                                   block                             16
    Block Limit Registers                                                            block                              4
    Block Limit Shared 

In [None]:
!ncu-ui

qt.qpa.xcb: could not connect to display 
qt.qpa.plugin: Could not load the Qt platform plugin "xcb" in "" even though it was found.
This application failed to start because no Qt platform plugin could be initialized. Reinstalling the application may fix this problem.

Available platform plugins are: xcb.

/opt/nvidia/nsight-compute/2020.3.1/host/linux-desktop-glibc_2_11_3-x64/ncu-ui: line 16:  6950 Aborted                 (core dumped) "$NV_AGORA_PATH/CrashReporter" "NVIDIA Nsight Compute" "NVIDIA Nsight Compute" "2020.3.1.0 (build 29567428) (public-release)" "$NV_AGORA_PATH/ncu-ui.bin" "$@"


In [None]:
!ncu --metrics smsp__inst_executed.sum,inst_executed -s 1 -c 1 ./test

==PROF== Connected to process 7713 (/content/test)
Time elapsed (GPU): 9.928192 ms.
==PROF== Profiling "MatrixMulKernelTiled" - 1 of 1: 0%....50%....100% - 3 passes
Time elapsed (GPU Tiled): 611.888611 ms.
Time elapsed (CPU): 7350.420410 ms.
All results are correct!!! (CPU vs GPU)
All results are correct!!! (CPU vs GPU Tiled)
Speedup: GPU (Tiled) vs GPU (Untiled): 0.016
==PROF== Disconnected from process 7713
[7713] test@127.0.0.1
  MatrixMulKernelTiled(float*, float*, float*, int), 2023-Feb-08 16:20:43, Context 1, Stream 7
    Section: Command line profiler metrics
    ---------------------------------------------------------------------- --------------- ------------------------------
    inst_executed                                                                     inst                    131,134,500
    smsp__inst_executed.sum                                                           inst                    131,134,500
    ---------------------------------------------------------

For the following, I used this YouTube video as a reference. It shows how you can save the profiler output to a file and then launch NVIDIA Nsight to view the detailed report locally.

In [None]:
!nvcc /content/test.cu -o /content/test --generate-line-info

In [None]:
!compute-sanitizer ./test

Time elapsed (GPU): 2754.913086 ms.
Time elapsed (GPU Tiled): 495.155243 ms.
Time elapsed (CPU): 4032.016602 ms.
All results are correct!!! (CPU vs GPU)
All results are correct!!! (CPU vs GPU Tiled)
Speedup: GPU (Tiled) vs GPU (Untiled): 5.564


In [None]:
!nvprof --print-gpu-trace ./test

==8946== NVPROF is profiling process 8946, command: ./test
Time elapsed (GPU): 9.707616 ms.
Time elapsed (GPU Tiled): 8.554048 ms.
Time elapsed (CPU): 4123.003906 ms.
All results are correct!!! (CPU vs GPU)
All results are correct!!! (CPU vs GPU Tiled)
Speedup: GPU (Tiled) vs GPU (Untiled): 1.135
==8946== Profiling application: ./test
==8946== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
411.74ms  734.22us                    -               -         -         -         -  3.8147MB  5.0738GB/s    Pageable      Device     Tesla T4 (0)         1         7  [CUDA memcpy HtoD]
412.72ms  699.95us                    -               -         -         -         -  3.8147MB  5.3222GB/s    Pageable      Device     Tesla T4 (0)         1         7  [CUDA memcpy HtoD]
413.43ms  6.9975ms            (63 63 1)       (16 16 1)        49        0B        

In [None]:
!nsys profile ./test

Collecting data...
Time elapsed (GPU): 9.813440 ms.
Time elapsed (GPU Tiled): 8.823392 ms.
Time elapsed (CPU): 6216.187988 ms.
All results are correct!!! (CPU vs GPU)
All results are correct!!! (CPU vs GPU Tiled)
Speedup: GPU (Tiled) vs GPU (Untiled): 1.112
Processing events...
Capturing symbol files...
Saving temporary "/tmp/nsys-report-db40-b0bc-88e3-1dc9.qdstrm" file to disk...
Creating final output files...

Saved report file to "/tmp/nsys-report-db40-b0bc-88e3-1dc9.qdrep"
Report file moved to "/content/report1.qdrep"
