In [10]:
%%writefile four.cu

#include <chrono>
#include <cstdlib>
#include <iostream>
#include <numeric>

#include<cuda_runtime.h>

// Final, fully unrolled stage of the block reduction: folds the first
// 2*32 (or fewer, depending on blockSize) shared-memory partial sums down
// into sdata[0].
//
// Template parameter:
//   blockSize - number of threads per block; compile-time so that steps
//               which cannot apply are removed by the compiler.
// Parameters:
//   sdata - shared-memory array of per-thread partial sums.
//   tid   - index of the calling thread within the block (expected < 32).
//
// Preconditions: must be called by all 32 lanes of one warp (the caller in
// this file uses `if (tid < 32)`), because __syncwarp() below uses the
// default full-warp mask.
//
// Why the explicit synchronization: within each step lane t reads
// sdata[t + offset] while lane (t + offset) writes that same element.
// Starting with Volta (independent thread scheduling) the lanes of a warp
// are no longer guaranteed to run in lockstep, so `volatile` alone does not
// order those accesses. Each step is therefore split into
// read -> __syncwarp() -> write -> __syncwarp().
template <unsigned int blockSize>
__device__ void warpReduce(volatile int* sdata, unsigned int tid){
    // Running partial sum for this lane, kept in a register between steps.
    int v = sdata[tid];

    if(blockSize >= 64) { v += sdata[tid + 32]; __syncwarp(); sdata[tid] = v; __syncwarp(); }
    if(blockSize >= 32) { v += sdata[tid + 16]; __syncwarp(); sdata[tid] = v; __syncwarp(); }
    if(blockSize >= 16) { v += sdata[tid + 8];  __syncwarp(); sdata[tid] = v; __syncwarp(); }
    if(blockSize >= 8)  { v += sdata[tid + 4];  __syncwarp(); sdata[tid] = v; __syncwarp(); }
    if(blockSize >= 4)  { v += sdata[tid + 2];  __syncwarp(); sdata[tid] = v; __syncwarp(); }
    if(blockSize >= 2)  { v += sdata[tid + 1];  __syncwarp(); sdata[tid] = v; __syncwarp(); }
}

// REDUCTION 6 – Multiple Adds / Threads
//
// Sums the n integers in g_in_data and writes one partial sum per block to
// g_out_data[blockIdx.x]; the caller adds the per-block results together.
//
// Template parameter:
//   blockSize - threads per block. Must equal blockDim.x at launch. The
//               launches in this file use 128, 256 and 512.
// Parameters:
//   g_in_data  - input array of n ints in device memory.
//   g_out_data - output array with at least gridDim.x ints.
//   n          - number of valid elements in g_in_data.
//
// Launch requirements: 1-D grid and block, with blockSize * sizeof(int)
// bytes of dynamic shared memory.
template <int blockSize>
__global__ void reduce6(int *g_in_data, int *g_out_data, unsigned int n){
    extern __shared__ int sdata[];  // one partial sum per thread

    unsigned int tid = threadIdx.x;
    // Each block starts on its own tile of 2*blockSize elements; each thread
    // then advances by the total number of elements covered by the grid.
    unsigned int i = blockIdx.x*(blockSize*2) + tid;
    unsigned int gridSize = blockDim.x * 2 * gridDim.x;

    // Accumulate in a register and store to shared memory once at the end.
    int sum = 0;
    while(i < n) {
      sum += g_in_data[i];
      // The partner element may lie past the end of the array when n is not
      // a multiple of 2*blockSize, so it needs its own bounds check.
      if (i + blockSize < n) {
        sum += g_in_data[i + blockSize];
      }
      i += gridSize;
    }
    sdata[tid] = sum;
    __syncthreads();

    // Tree reduction in shared memory. blockSize is a compile-time constant,
    // so steps that do not apply are removed by the compiler. Every
    // __syncthreads() sits outside the `tid` test so all threads reach it.
    if (blockSize >= 512) {
        if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads();
    }

    // The remaining elements are folded by the first warp only.
    if (tid < 32) warpReduce<blockSize>(sdata, tid);

    // Thread 0 publishes this block's partial sum.
    if (tid == 0){
        g_out_data[blockIdx.x] = sdata[0];
    }
}

// Terminates the program with a diagnostic if a CUDA runtime call failed.
//   status - return code of the CUDA call (or of cudaGetLastError()).
//   what   - short description of the operation, used in the message.
static void checkCuda(cudaError_t status, const char *what){
    if (status != cudaSuccess) {
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Shared driver for the reduction kernels: fills an array with pseudo-random
// ints, reduces it on the GPU with reduce6, finishes the sum on the host,
// verifies it against a CPU sum, and prints timing / effective bandwidth.
// Returns 0 on completion, 1 if the configured block size is unsupported;
// any CUDA failure terminates the program via checkCuda.
int main(){
    int n = 1<<22; // Increase to about 4M elements
    size_t bytes = n * sizeof(int);

    int blockSize = 256; // number of threads per block
    // Only these sizes have a matching template instantiation below.
    if (blockSize != 128 && blockSize != 256 && blockSize != 512) {
        std::cerr << "Unsupported block size: " << blockSize << std::endl;
        return 1;
    }
    // Each thread consumes two elements per pass, so one block covers
    // 2 * blockSize elements.
    int num_blocks = (n + (2 * blockSize) - 1) / (2 * blockSize);
    // The kernel writes exactly one partial sum per block. Size every output
    // buffer from num_blocks so that no uninitialized element is ever read.
    size_t out_bytes = num_blocks * sizeof(int);

    // Host/CPU arrays
    int *host_input_data = new int[n];
    int *host_output_data = new int[num_blocks];

    // Device/GPU arrays
    int *dev_input_data = nullptr, *dev_output_data = nullptr;

    // Init data
    srand(42); // Fixed seed
    for (int i = 0; i < n; i++){
        host_input_data[i] = rand() % 100;
    }

    // Allocating memory on GPU for device arrays
    checkCuda(cudaMalloc(&dev_input_data, bytes), "cudaMalloc(input)");
    checkCuda(cudaMalloc(&dev_output_data, out_bytes), "cudaMalloc(output)");

    // Copying our data onto the device (GPU)
    checkCuda(cudaMemcpy(dev_input_data, host_input_data, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(host to device)");

    auto start = std::chrono::high_resolution_clock::now(); // start timer

    // The block size is a template argument (needed for complete unrolling),
    // so the runtime value is dispatched to a compile-time instantiation.
    switch (blockSize) {
        case 512:
            reduce6<512><<<num_blocks, 512, 512 * sizeof(int)>>>(dev_input_data, dev_output_data, n);
            break;
        case 256:
            reduce6<256><<<num_blocks, 256, 256 * sizeof(int)>>>(dev_input_data, dev_output_data, n);
            break;
        case 128:
            reduce6<128><<<num_blocks, 128, 128 * sizeof(int)>>>(dev_input_data, dev_output_data, n);
            break;
        default:
            // Not reachable: blockSize was validated above.
            break;
    }
    // Launch-configuration errors are only visible through cudaGetLastError();
    // errors raised while the kernel runs surface at the synchronize call.
    checkCuda(cudaGetLastError(), "kernel launch");
    checkCuda(cudaDeviceSynchronize(), "kernel execution");

    auto stop = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count() / 1000.0; // duration in milliseconds with three decimal points

    // Copying data back to the host (CPU)
    checkCuda(cudaMemcpy(host_output_data, dev_output_data, out_bytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy(device to host)");

    // Final reduction on the host over the per-block partial sums
    int finalResult = std::accumulate(host_output_data, host_output_data + num_blocks, 0);

    // CPU Summation for verification
    int cpuResult = std::accumulate(host_input_data, host_input_data + n, 0);
    if (cpuResult == finalResult) {
        std::cout << "\033[32m"; // Set text color to green
        std::cout << "Verification successful: GPU result matches CPU result.\n";
        std::cout << "GPU Result: " << finalResult << ", CPU Result: " << cpuResult << std::endl;
    } else {
        std::cout << "\033[31m"; // Set text color to red
        std::cout << "Verification failed: GPU result (" << finalResult << ") does not match CPU result (" << cpuResult << ").\n";
        std::cout << "GPU Result: " << finalResult << ", CPU Result: " << cpuResult << std::endl;
    }
    std::cout << "\033[0m"; // Reset text color to default

    // bytes / milliseconds / 1e6 == bytes / seconds / 1e9, i.e. GB/s.
    double bandwidth = (duration > 0) ? (bytes / duration / 1e6) : 0; // computed in GB/s, handling zero duration
    std::cout << "Reduced result: " << finalResult << std::endl;
    std::cout << "Time elapsed: " << duration << " ms" << std::endl;
    std::cout << "Effective bandwidth: " << bandwidth << " GB/s" << std::endl;

    // Freeing memory
    checkCuda(cudaFree(dev_input_data), "cudaFree(input)");
    checkCuda(cudaFree(dev_output_data), "cudaFree(output)");
    delete[] host_input_data;
    delete[] host_output_data;
    return 0;
}

Writing four.cu


In [2]:
!nvcc -arch=sm_75 one.cu -o one

In [3]:
!./one

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.284 ms
Effective bandwidth: 59.0747 GB/s


In [5]:
!nvcc -arch=sm_75 two.cu -o two -O1 -lineinfo

In [6]:
!./two

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.326 ms
Effective bandwidth: 51.4639 GB/s


In [8]:
!nvcc -arch=sm_75 three.cu -o three -O2 -lineinfo

In [9]:
!./three

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.284 ms
Effective bandwidth: 59.0747 GB/s


In [11]:
!nvcc -arch=sm_75 four.cu -o four -O3 -lineinfo

In [12]:
!./four

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.862 ms
Effective bandwidth: 19.4631 GB/s


In [13]:
! set -x \
&& cd $(mktemp -d) \
&& wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run \
&& sudo sh cuda_12.1.0_530.30.02_linux.run --silent --toolkit \
&& rm cuda_12.1.0_530.30.02_linux.run

++ mktemp -d
+ cd /tmp/tmp.uhY8BH96A1
+ wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run
--2025-06-12 17:32:04--  https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run
Resolving developer.download.nvidia.com (developer.download.nvidia.com)... 23.43.51.18, 23.43.51.10, 23.43.51.12
Connecting to developer.download.nvidia.com (developer.download.nvidia.com)|23.43.51.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4245586997 (4.0G) [application/octet-stream]
Saving to: ‘cuda_12.1.0_530.30.02_linux.run’


2025-06-12 17:32:47 (95.6 MB/s) - ‘cuda_12.1.0_530.30.02_linux.run’ saved [4245586997/4245586997]

+ sudo sh cuda_12.1.0_530.30.02_linux.run --silent --toolkit
+ rm cuda_12.1.0_530.30.02_linux.run


In [14]:
import os
os.environ['PATH'] = os.environ['PATH'] + ':/usr/local/cuda/bin/'

In [15]:
!ncu ./one -o reduction_report_no

==PROF== Connected to process 3773 (/content/one)
==PROF== Profiling "reduce6" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 505.11 ms
Effective bandwidth: 0.033215 GB/s
==PROF== Disconnected from process 3773
[3773] one@127.0.0.1
  void reduce6<256>(int *, int *, unsigned int) (8192, 1, 1)x(256, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
    Section: GPU Speed Of Light Throughput
    ----------------------- ----------- ------------
    Metric Name             Metric Unit Metric Value
    ----------------------- ----------- ------------
    DRAM Frequency                  Ghz         4.93
    SM Frequency                    Mhz       584.70
    Elapsed Cycles                cycle       78,120
    Memory Throughput                 %        57.17
    DRAM Throughput                   %        52.05
    Duration                         us       133.60

In [16]:
!ncu ./two -o reduction_report_o1

==PROF== Connected to process 3873 (/content/two)
==PROF== Profiling "reduce6" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 311.836 ms
Effective bandwidth: 0.0538014 GB/s
==PROF== Disconnected from process 3873
[3873] two@127.0.0.1
  void reduce6<256>(int *, int *, unsigned int) (8192, 1, 1)x(256, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
    Section: GPU Speed Of Light Throughput
    ----------------------- ----------- ------------
    Metric Name             Metric Unit Metric Value
    ----------------------- ----------- ------------
    DRAM Frequency                  Ghz         4.95
    SM Frequency                    Mhz       584.72
    Elapsed Cycles                cycle       78,312
    Memory Throughput                 %        57.00
    DRAM Throughput                   %        51.59
    Duration                         us       133.

In [17]:
!ncu ./three -o reduction_report_o2

==PROF== Connected to process 3967 (/content/three)
==PROF== Profiling "reduce6" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 308.418 ms
Effective bandwidth: 0.0543977 GB/s
==PROF== Disconnected from process 3967
[3967] three@127.0.0.1
  void reduce6<256>(int *, int *, unsigned int) (8192, 1, 1)x(256, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
    Section: GPU Speed Of Light Throughput
    ----------------------- ----------- ------------
    Metric Name             Metric Unit Metric Value
    ----------------------- ----------- ------------
    DRAM Frequency                  Ghz         4.92
    SM Frequency                    Mhz       584.88
    Elapsed Cycles                cycle       78,333
    Memory Throughput                 %        56.89
    DRAM Throughput                   %        51.67
    Duration                         us       

In [18]:
!ncu ./four -o reduction_report_o3

==PROF== Connected to process 4061 (/content/four)
==PROF== Profiling "reduce6" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 308.84 ms
Effective bandwidth: 0.0543233 GB/s
==PROF== Disconnected from process 4061
[4061] four@127.0.0.1
  void reduce6<256>(int *, int *, unsigned int) (8192, 1, 1)x(256, 1, 1), Context 1, Stream 7, Device 0, CC 7.5
    Section: GPU Speed Of Light Throughput
    ----------------------- ----------- ------------
    Metric Name             Metric Unit Metric Value
    ----------------------- ----------- ------------
    DRAM Frequency                  Ghz         4.95
    SM Frequency                    Mhz       584.81
    Elapsed Cycles                cycle       78,247
    Memory Throughput                 %        57.02
    DRAM Throughput                   %        51.56
    Duration                         us       133

In [19]:
!nvprof ./one

==4186== NVPROF is profiling process 4186, command: ./one
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 1.509 ms
Effective bandwidth: 11.1181 GB/s
==4186== Profiling application: ./one
==4186== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   95.82%  3.5014ms         1  3.5014ms  3.5014ms  3.5014ms  [CUDA memcpy HtoD]
                    3.97%  145.21us         1  145.21us  145.21us  145.21us  void reduce6<int=256>(int*, int*, unsigned int)
                    0.20%  7.4230us         1  7.4230us  7.4230us  7.4230us  [CUDA memcpy DtoH]
      API calls:   94.22%  103.58ms         2  51.789ms  78.952us  103.50ms  cudaMalloc
                    3.48%  3.8295ms         2  1.9147ms  85.689us  3.7438ms  cudaMemcpy
                    1.23%  1.3497ms         1  1.3497ms  1.3497ms  1.3497ms  cudaLaunchKernel
               

In [20]:
!nvprof --print-gpu-trace ./one

==4369== NVPROF is profiling process 4369, command: ./one
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.399 ms
Effective bandwidth: 42.0482 GB/s
==4369== Profiling application: ./one
==4369== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
236.86ms  18.290ms                    -               -         -         -         -  16.000MB  874.79MB/s    Pageable      Device     Tesla T4 (0)         1         7  [CUDA memcpy HtoD]
255.33ms  145.12us           (8192 1 1)       (256 1 1)        16        0B  1.0000KB         -           -           -           -     Tesla T4 (0)         1         7  void reduce6<int=256>(int*, int*, unsigned int) [128]
255.50ms  7.3920us                    -               -         -         -         -  64.000KB 

In [21]:
!nvprof ./two

==4470== NVPROF is profiling process 4470, command: ./two
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.315 ms
Effective bandwidth: 53.261 GB/s
==4470== Profiling application: ./two
==4470== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   95.90%  3.5766ms         1  3.5766ms  3.5766ms  3.5766ms  [CUDA memcpy HtoD]
                    3.90%  145.50us         1  145.50us  145.50us  145.50us  void reduce6<int=256>(int*, int*, unsigned int)
                    0.20%  7.4240us         1  7.4240us  7.4240us  7.4240us  [CUDA memcpy DtoH]
      API calls:   95.25%  103.75ms         2  51.873ms  76.900us  103.67ms  cudaMalloc
                    3.53%  3.8451ms         2  1.9225ms  78.365us  3.7667ms  cudaMemcpy
                    0.79%  860.35us         2  430.18us  142.95us  717.40us  cudaFree
                    0.15

In [22]:
!nvprof ./three

==4507== NVPROF is profiling process 4507, command: ./three
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.314 ms
Effective bandwidth: 53.4306 GB/s
==4507== Profiling application: ./three
==4507== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   95.98%  3.6384ms         1  3.6384ms  3.6384ms  3.6384ms  [CUDA memcpy HtoD]
                    3.82%  144.89us         1  144.89us  144.89us  144.89us  void reduce6<int=256>(int*, int*, unsigned int)
                    0.20%  7.3920us         1  7.3920us  7.3920us  7.3920us  [CUDA memcpy DtoH]
      API calls:   95.53%  111.27ms         2  55.635ms  80.020us  111.19ms  cudaMalloc
                    3.33%  3.8814ms         2  1.9407ms  73.568us  3.8078ms  cudaMemcpy
                    0.74%  858.86us         2  429.43us  148.62us  710.24us  cudaFree
                   

In [23]:
!nvprof ./four

==4544== NVPROF is profiling process 4544, command: ./four
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.301 ms
Effective bandwidth: 55.7383 GB/s
==4544== Profiling application: ./four
==4544== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   95.78%  3.4625ms         1  3.4625ms  3.4625ms  3.4625ms  [CUDA memcpy HtoD]
                    4.02%  145.24us         1  145.24us  145.24us  145.24us  void reduce6<int=256>(int*, int*, unsigned int)
                    0.20%  7.3910us         1  7.3910us  7.3910us  7.3910us  [CUDA memcpy DtoH]
      API calls:   95.36%  103.38ms         2  51.692ms  110.37us  103.27ms  cudaMalloc
                    3.44%  3.7266ms         2  1.8633ms  86.762us  3.6398ms  cudaMemcpy
                    0.78%  850.89us         2  425.45us  136.91us  713.98us  cudaFree
                    0

In [24]:
!nsys --version

NVIDIA Nsight Systems version 2023.1.2.43-32377213v0


In [25]:
!nsys profile ./one

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.3 ms
Effective bandwidth: 55.9241 GB/s
Generating '/tmp/nsys-report-dbb9.qdstrm'
Generated:
    /content/report1.nsys-rep


In [26]:
!nsys stats report1.nsys-rep

Generating SQLite file report1.sqlite from report1.nsys-rep
Processing [report1.sqlite] with [/usr/local/cuda-12.1/nsight-systems-2023.1.2/host-linux-x64/reports/nvtx_sum.py]... 
SKIPPED: report1.sqlite does not contain NV Tools Extension (NVTX) data.

Processing [report1.sqlite] with [/usr/local/cuda-12.1/nsight-systems-2023.1.2/host-linux-x64/reports/osrt_sum.py]... 

 ** OS Runtime Summary (osrt_sum):

 Time (%)  Total Time (ns)  Num Calls    Avg (ns)       Med (ns)     Min (ns)    Max (ns)     StdDev (ns)            Name         
 --------  ---------------  ---------  -------------  -------------  ---------  -----------  -------------  ----------------------
     47.2      435,351,451          2  217,675,725.5  217,675,725.5  2,136,355  433,215,096  304,818,701.0  sem_wait              
     44.9      414,100,651         13   31,853,896.2    2,696,400.0      1,521  313,742,288   85,732,312.4  poll                  
      7.4       68,354,182        536      127,526.5       14,297.0