In [None]:
%%writefile one.cu

#include <chrono>
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <vector>

#include <cuda_runtime.h>

// Reduction 0 - Interleaved Addressing with branch divergence
//
// Each block loads blockDim.x consecutive ints from g_in_data into shared
// memory, sums them with an interleaved tree reduction, and thread 0 writes
// the block's partial sum to g_out_data[blockIdx.x].
//
// Launch requirements:
//   - Indexing uses only the .x components of blockIdx/blockDim/threadIdx.
//   - Dynamic shared memory: blockDim.x * sizeof(int) bytes (third launch argument).
//   - g_in_data must hold at least gridDim.x * blockDim.x ints; the global
//     load below is NOT bounds-checked, so the caller must size/pad the input.
//   - g_out_data must hold at least gridDim.x ints.
__global__ void reduce0(int *g_in_data, int *g_out_data){
    extern __shared__ int sdata[];  // stored in the shared memory, sized at launch

    // Each thread loading one element from global onto shared memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_in_data[i];
    __syncthreads();  // every slot must be written before any thread reads a neighbour's slot

    // Reduction method -- occurs in shared memory
    for(unsigned int s = 1; s < blockDim.x; s *= 2){
        // Only every (2*s)-th thread accumulates at this step. The second
        // condition keeps the partner index inside the block's shared array
        // when blockDim.x is not a power of two (otherwise tid + s can run
        // past the last slot).
        if (tid % (2 * s) == 0 && tid + s < blockDim.x) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();  // kept outside the branch so all threads reach the barrier
    }
    if (tid == 0){
        g_out_data[blockIdx.x] = sdata[0];
    }
}

// I hope to use this main file for all of the reduction files

// Reports a failed CUDA runtime call on stderr.
// `what` names the call for the error message.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *what){
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error: " << cudaGetErrorString(status) << " (" << what << ")" << std::endl;
    return false;
}

// Host driver for the reduction benchmark:
//   1. fills a host array with pseudo-random values in [0, 99] (fixed seed),
//   2. copies it to the device and launches reduce0 with one thread per element,
//   3. times launch + synchronize with a host wall clock,
//   4. sums the per-block partial results on the host and verifies them against
//      a CPU std::accumulate over the same input,
//   5. prints the result, elapsed time and effective bandwidth.
// Returns EXIT_SUCCESS only if every CUDA call succeeded and the GPU sum matches
// the CPU sum; EXIT_FAILURE otherwise.
int main(){
    constexpr int n = 1 << 22;      // Increase to about 4M elements
    constexpr int blockSize = 256;  // number of threads per block
    constexpr int num_blocks = (n + blockSize - 1) / blockSize;

    // reduce0 loads g_in_data[blockIdx.x * blockDim.x + threadIdx.x] without a
    // bounds check, so the grid must cover the input exactly.
    static_assert(n % blockSize == 0, "n must be a multiple of blockSize: reduce0 does not bounds-check its loads");

    const size_t bytes = static_cast<size_t>(n) * sizeof(int);
    const size_t out_bytes = static_cast<size_t>(num_blocks) * sizeof(int);  // one partial sum per block

    // Host/CPU arrays (released automatically on every return path)
    std::vector<int> host_input_data(n);
    std::vector<int> host_output_data(num_blocks);

    // Init data
    std::srand(42); // Fixed seed
    for (int i = 0; i < n; i++){
        host_input_data[i] = std::rand() % 100;
    }

    // Device/GPU arrays; kept null until allocated so cleanup can free them unconditionally
    int *dev_input_data = nullptr;
    int *dev_output_data = nullptr;

    // Allocating memory on GPU and copying our data onto the device (GPU).
    // && short-circuits, so nothing runs after the first failure.
    bool ok = cudaSucceeded(cudaMalloc(&dev_input_data, bytes), "cudaMalloc(dev_input_data)")
           && cudaSucceeded(cudaMalloc(&dev_output_data, out_bytes), "cudaMalloc(dev_output_data)")
           && cudaSucceeded(cudaMemcpy(dev_input_data, host_input_data.data(), bytes, cudaMemcpyHostToDevice),
                            "cudaMemcpy host->device");

    double duration = 0.0;  // milliseconds
    if (ok) {
        auto start = std::chrono::high_resolution_clock::now(); // start timer

        // Launch Kernel and Synchronize threads
        reduce0<<<num_blocks, blockSize, blockSize * sizeof(int)>>>(dev_input_data, dev_output_data);
        const cudaError_t launch_status = cudaGetLastError();    // launch-configuration errors
        const cudaError_t sync_status = cudaDeviceSynchronize(); // errors raised while the kernel ran

        auto stop = std::chrono::high_resolution_clock::now();
        duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count() / 1000.0; // duration in milliseconds with three decimal points

        ok = cudaSucceeded(launch_status, "reduce0 launch")
          && cudaSucceeded(sync_status, "cudaDeviceSynchronize");
    }

    // Copying data back to the host (CPU)
    ok = ok && cudaSucceeded(cudaMemcpy(host_output_data.data(), dev_output_data, out_bytes, cudaMemcpyDeviceToHost),
                             "cudaMemcpy device->host");

    if (!ok) {
        // Pointers that were never allocated are still null here.
        cudaFree(dev_input_data);
        cudaFree(dev_output_data);
        return EXIT_FAILURE;
    }

    // Final reduction on the host (64-bit accumulator so a larger n cannot overflow the total)
    const long long finalResult = std::accumulate(host_output_data.begin(), host_output_data.end(), 0LL);

    // CPU Summation for verification
    const long long cpuResult = std::accumulate(host_input_data.begin(), host_input_data.end(), 0LL);
    const bool verified = (cpuResult == finalResult);
    if (verified) {
        std::cout << "\033[32m"; // Set text color to green
        std::cout << "Verification successful: GPU result matches CPU result.\n";
        std::cout << "GPU Result: " << finalResult << ", CPU Result: " << cpuResult << std::endl;
    } else {
        std::cout << "\033[31m"; // Set text color to red
        std::cout << "Verification failed: GPU result (" << finalResult << ") does not match CPU result (" << cpuResult << ").\n";
        std::cout << "GPU Result: " << finalResult << ", CPU Result: " << cpuResult << std::endl;
    }
    std::cout << "\033[0m"; // Reset text color to default

    // bytes / ms / 1e6 == GB/s; only the input read is counted
    double bandwidth = (duration > 0) ? (bytes / duration / 1e6) : 0; // computed in GB/s, handling zero duration
    std::cout << "Reduced result: " << finalResult << std::endl;
    std::cout << "Time elapsed: " << duration << " ms" << std::endl;
    std::cout << "Effective bandwidth: " << bandwidth << " GB/s" << std::endl;

    // Freeing device memory (host vectors clean up themselves)
    bool released = cudaSucceeded(cudaFree(dev_input_data), "cudaFree(dev_input_data)");
    released = cudaSucceeded(cudaFree(dev_output_data), "cudaFree(dev_output_data)") && released;

    return (verified && released) ? EXIT_SUCCESS : EXIT_FAILURE;
}

Writing one.cu


In [None]:
!nvcc -arch=sm_75 one.cu -o one

In [None]:
!./one

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.794 ms
Effective bandwidth: 21.13 GB/s


In [None]:
!nvcc -arch=sm_75 two.cu -o two -O1 -lineinfo

In [None]:
!./two

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.741 ms
Effective bandwidth: 22.6413 GB/s


In [None]:
!nvcc -arch=sm_75 three.cu -o three -O2 -lineinfo

In [None]:
!./three

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.739 ms
Effective bandwidth: 22.7026 GB/s


In [None]:
!nvcc -arch=sm_75 four.cu -o four -O3 -lineinfo

In [None]:
!./four

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.735 ms
Effective bandwidth: 22.8261 GB/s


In [51]:
! set -x \
&& cd $(mktemp -d) \
&& wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run \
&& sudo sh cuda_12.1.0_530.30.02_linux.run --silent --toolkit \
&& rm cuda_12.1.0_530.30.02_linux.run

++ mktemp -d
+ cd /tmp/tmp.TqsbEfQVTp
+ wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run
--2025-06-06 09:22:41--  https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run
Resolving developer.download.nvidia.com (developer.download.nvidia.com)... 23.59.88.195, 23.59.88.207
Connecting to developer.download.nvidia.com (developer.download.nvidia.com)|23.59.88.195|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4245586997 (4.0G) [application/octet-stream]
Saving to: ‘cuda_12.1.0_530.30.02_linux.run’


2025-06-06 09:23:00 (217 MB/s) - ‘cuda_12.1.0_530.30.02_linux.run’ saved [4245586997/4245586997]

+ sudo sh cuda_12.1.0_530.30.02_linux.run --silent --toolkit
+ rm cuda_12.1.0_530.30.02_linux.run


In [52]:
import os
os.environ['PATH'] = os.environ['PATH'] + ':/usr/local/cuda/bin/'

In [None]:
!ncu --version

NVIDIA (R) Nsight Compute Command Line Profiler
Copyright (c) 2018-2024 NVIDIA Corporation
Version 2024.2.1.0 (build 34372528) (public-release)


In [None]:
!ncu ./one -o reduction_report_no

==PROF== Connected to process 2762 (/content/one)
==PROF== Profiling "reduce0(int *, int *)" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 484.236 ms
Effective bandwidth: 0.0346468 GB/s
==PROF== Disconnected from process 2762
[2762] one@127.0.0.1
  reduce0(int *, int *) (16384, 1, 1)x(256, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
    Section: GPU Speed Of Light Throughput
    ----------------------- ----------- ------------
    Metric Name             Metric Unit Metric Value
    ----------------------- ----------- ------------
    DRAM Frequency                  Ghz         4.99
    SM Frequency                    Mhz       584.99
    Elapsed Cycles                cycle      357,385
    Memory Throughput                 %        50.06
    DRAM Throughput                   %        11.97
    Duration                         us       610.91
    L1

In [None]:
!ncu ./two -o reduction_report_o1

==PROF== Connected to process 2865 (/content/two)
==PROF== Profiling "reduce0(int *, int *)" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 293.884 ms
Effective bandwidth: 0.0570879 GB/s
==PROF== Disconnected from process 2865
[2865] two@127.0.0.1
  reduce0(int *, int *) (16384, 1, 1)x(256, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
    Section: GPU Speed Of Light Throughput
    ----------------------- ----------- ------------
    Metric Name             Metric Unit Metric Value
    ----------------------- ----------- ------------
    DRAM Frequency                  Ghz         5.00
    SM Frequency                    Mhz       585.02
    Elapsed Cycles                cycle      356,707
    Memory Throughput                 %        50.07
    DRAM Throughput                   %        11.98
    Duration                         us       609.73
    L1

In [None]:
!ncu ./three -o reduction_report_o2

==PROF== Connected to process 3080 (/content/three)
==PROF== Profiling "reduce0(int *, int *)" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 294.26 ms
Effective bandwidth: 0.0570149 GB/s
==PROF== Disconnected from process 3080
[3080] three@127.0.0.1
  reduce0(int *, int *) (16384, 1, 1)x(256, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
    Section: GPU Speed Of Light Throughput
    ----------------------- ----------- ------------
    Metric Name             Metric Unit Metric Value
    ----------------------- ----------- ------------
    DRAM Frequency                  Ghz         4.99
    SM Frequency                    Mhz       584.96
    Elapsed Cycles                cycle      357,272
    Memory Throughput                 %        50.05
    DRAM Throughput                   %        11.96
    Duration                         us       610.75
   

In [None]:
!ncu ./four -o reduction_report_o3

==PROF== Connected to process 3189 (/content/four)
==PROF== Profiling "reduce0(int *, int *)" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 297.294 ms
Effective bandwidth: 0.0564331 GB/s
==PROF== Disconnected from process 3189
[3189] four@127.0.0.1
  reduce0(int *, int *) (16384, 1, 1)x(256, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
    Section: GPU Speed Of Light Throughput
    ----------------------- ----------- ------------
    Metric Name             Metric Unit Metric Value
    ----------------------- ----------- ------------
    DRAM Frequency                  Ghz         4.99
    SM Frequency                    Mhz       584.97
    Elapsed Cycles                cycle      357,369
    Memory Throughput                 %        50.05
    DRAM Throughput                   %        11.99
    Duration                         us       610.91
    

In [None]:
!ncu -o reduction_report ./one

==PROF== Connected to process 3434 (/content/one)
==PROF== Profiling "reduce0(int *, int *)" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 302.683 ms
Effective bandwidth: 0.0554283 GB/s
==PROF== Disconnected from process 3434
==PROF== Report: /content/reduction_report.ncu-rep


In [None]:
!nvprof ./one

==4486== NVPROF is profiling process 4486, command: ./one
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.776 ms
Effective bandwidth: 21.6201 GB/s
==4486== Profiling application: ./one
==4486== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   84.40%  3.4077ms         1  3.4077ms  3.4077ms  3.4077ms  [CUDA memcpy HtoD]
                   15.41%  622.14us         1  622.14us  622.14us  622.14us  reduce0(int*, int*)
                    0.19%  7.5520us         1  7.5520us  7.5520us  7.5520us  [CUDA memcpy DtoH]
      API calls:   97.21%  191.67ms         2  95.833ms  74.527us  191.59ms  cudaMalloc
                    1.87%  3.6885ms         2  1.8443ms  72.024us  3.6165ms  cudaMemcpy
                    0.44%  868.63us         2  434.32us  142.14us  726.49us  cudaFree
                    0.32%  622.68us         1  622.

In [None]:
!nvprof --print-gpu-trace ./one

==4603== NVPROF is profiling process 4603, command: ./one
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.769 ms
Effective bandwidth: 21.8169 GB/s
==4603== Profiling application: ./one
==4603== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
323.45ms  3.4735ms                    -               -         -         -         -  16.000MB  4.4984GB/s    Pageable      Device     Tesla T4 (0)         1         7  [CUDA memcpy HtoD]
327.00ms  622.20us          (16384 1 1)       (256 1 1)        16        0B  1.0000KB         -           -           -           -     Tesla T4 (0)         1         7  reduce0(int*, int*) [128]
327.63ms  7.4880us                    -               -         -         -         -  64.000KB  8.1511GB/s      Device    P

In [None]:
!nvprof ./two

==4688== NVPROF is profiling process 4688, command: ./two
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.839 ms
Effective bandwidth: 19.9967 GB/s
==4688== Profiling application: ./two
==4688== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   85.16%  3.6120ms         1  3.6120ms  3.6120ms  3.6120ms  [CUDA memcpy HtoD]
                   14.67%  622.17us         1  622.17us  622.17us  622.17us  reduce0(int*, int*)
                    0.18%  7.4550us         1  7.4550us  7.4550us  7.4550us  [CUDA memcpy DtoH]
      API calls:   97.55%  231.22ms         2  115.61ms  97.432us  231.13ms  cudaMalloc
                    1.65%  3.8997ms         2  1.9499ms  86.409us  3.8133ms  cudaMemcpy
                    0.39%  913.80us         2  456.90us  179.43us  734.37us  cudaFree
                    0.26%  619.02us         1  619.

In [None]:
!nvprof ./three

==4875== NVPROF is profiling process 4875, command: ./three
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.758 ms
Effective bandwidth: 22.1335 GB/s
==4875== Profiling application: ./three
==4875== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   84.58%  3.4526ms         1  3.4526ms  3.4526ms  3.4526ms  [CUDA memcpy HtoD]
                   15.24%  622.04us         1  622.04us  622.04us  622.04us  reduce0(int*, int*)
                    0.19%  7.5840us         1  7.5840us  7.5840us  7.5840us  [CUDA memcpy DtoH]
      API calls:   97.02%  176.56ms         2  88.282ms  85.281us  176.48ms  cudaMalloc
                    2.03%  3.6996ms         2  1.8498ms  74.624us  3.6250ms  cudaMemcpy
                    0.46%  832.41us         2  416.21us  117.89us  714.52us  cudaFree
                    0.34%  624.86us         1  

In [None]:
!nvprof ./four

==5023== NVPROF is profiling process 5023, command: ./four
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.755 ms
Effective bandwidth: 22.2215 GB/s
==5023== Profiling application: ./four
==5023== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   84.40%  3.4081ms         1  3.4081ms  3.4081ms  3.4081ms  [CUDA memcpy HtoD]
                   15.41%  622.24us         1  622.24us  622.24us  622.24us  reduce0(int*, int*)
                    0.19%  7.4880us         1  7.4880us  7.4880us  7.4880us  [CUDA memcpy DtoH]
      API calls:   97.23%  191.95ms         2  95.976ms  79.749us  191.87ms  cudaMalloc
                    1.85%  3.6601ms         2  1.8301ms  70.586us  3.5895ms  cudaMemcpy
                    0.45%  886.04us         2  443.02us  126.56us  759.49us  cudaFree
                    0.32%  623.85us         1  62

In [53]:
!nsys --version

NVIDIA Nsight Systems version 2023.1.2.43-32377213v0


In [56]:
!nsys profile ./one

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.803 ms
Effective bandwidth: 20.8932 GB/s
Generating '/tmp/nsys-report-ead0.qdstrm'
Generated:
    /content/report1.nsys-rep


In [58]:
!nsys stats report1.nsys-rep

Generating SQLite file report1.sqlite from report1.nsys-rep
Processing [report1.sqlite] with [/usr/local/cuda-12.1/nsight-systems-2023.1.2/host-linux-x64/reports/nvtx_sum.py]... 
SKIPPED: report1.sqlite does not contain NV Tools Extension (NVTX) data.

Processing [report1.sqlite] with [/usr/local/cuda-12.1/nsight-systems-2023.1.2/host-linux-x64/reports/osrt_sum.py]... 

 ** OS Runtime Summary (osrt_sum):

 Time (%)  Total Time (ns)  Num Calls    Avg (ns)       Med (ns)     Min (ns)    Max (ns)     StdDev (ns)            Name         
 --------  ---------------  ---------  -------------  -------------  ---------  -----------  -------------  ----------------------
     44.1      544,058,652          2  272,029,326.0  272,029,326.0  1,859,245  542,199,407  382,078,192.7  sem_wait              
     41.6      513,068,390         14   36,647,742.1    2,929,888.5      1,980  312,510,406   84,311,672.0  poll                  
     13.4      165,846,799        536      309,415.7       12,599.5