In [17]:
%%writefile four.cu

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <numeric>

// REDUCTION 1 – Interleaved addressing with a strided index.
//
// Each block sums blockDim.x consecutive ints of g_in_data and writes a single
// partial sum to g_out_data[blockIdx.x].
//
// Launch contract:
//   * 1D grid of 1D blocks; only the .x components are used.
//   * dynamic shared memory: at least blockDim.x * sizeof(int) bytes.
//   * there is NO bounds check on the global load, so g_in_data must hold
//     gridDim.x * blockDim.x elements.
//   * g_out_data must hold gridDim.x elements.
__global__ void reduce1(int *g_in_data, int *g_out_data){
    extern __shared__ int scratch[];  // per-block staging buffer, sized at launch

    const unsigned int lane = threadIdx.x;
    const unsigned int global_idx = blockIdx.x * blockDim.x + lane;

    // Stage one element per thread from global into shared memory.
    scratch[lane] = g_in_data[global_idx];
    __syncthreads();

    // Tree reduction carried out entirely in shared memory. The distance between
    // the two operands doubles on every pass.
    for (unsigned int stride = 1; stride < blockDim.x; stride <<= 1){
        // Thread k owns slot 2*stride*k, so the threads that still have work are
        // always the lowest-numbered ones rather than every (2*stride)-th thread.
        const unsigned int left = 2u * stride * lane;
        const unsigned int right = left + stride;
        if (right < blockDim.x){
            scratch[left] += scratch[right];
        }
        // Barrier sits outside the conditional so every thread in the block reaches it.
        __syncthreads();
    }

    // Slot 0 now holds the block total; a single thread publishes it.
    if (lane == 0){
        g_out_data[blockIdx.x] = scratch[0];
    }
}

// Prints a readable message and terminates the process when a CUDA runtime call
// did not return cudaSuccess. `what` names the step that failed.
static void checkCudaOrExit(cudaError_t status, const char *what){
    if (status != cudaSuccess) {
        std::cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// I hope to use this main file for all of the reduction files
//
// Fills an array with pseudo-random ints, reduces it per block on the GPU with
// reduce1, sums the per-block partials on the host, verifies the total against
// a CPU sum and prints timing / effective bandwidth.
// Returns EXIT_SUCCESS only when the GPU and CPU totals agree.
int main(){
    const int n = 1 << 22;     // about 4M elements
    const int blockSize = 256; // number of threads per block
    const int num_blocks = (n + blockSize - 1) / blockSize;

    // reduce1 loads one element per thread with no bounds check, so a partially
    // filled last block would read past the end of the input buffer.
    static_assert(n % blockSize == 0, "n must be a multiple of blockSize for reduce1");

    const size_t bytes = n * sizeof(int);
    // One partial sum per block; derived from num_blocks so that allocation,
    // copy-back and the host loop can never disagree with the launch config.
    const size_t out_bytes = num_blocks * sizeof(int);

    // Host/CPU arrays
    int *host_input_data = new int[n];
    int *host_output_data = new int[num_blocks];

    // Device/GPU arrays
    int *dev_input_data = nullptr;
    int *dev_output_data = nullptr;

    // Init data
    srand(42); // Fixed seed
    for (int i = 0; i < n; i++){
        host_input_data[i] = rand() % 100;
    }

    // Allocating memory on GPU for device arrays
    checkCudaOrExit(cudaMalloc(&dev_input_data, bytes), "cudaMalloc input");
    checkCudaOrExit(cudaMalloc(&dev_output_data, out_bytes), "cudaMalloc output");

    // Copying our data onto the device (GPU)
    checkCudaOrExit(cudaMemcpy(dev_input_data, host_input_data, bytes, cudaMemcpyHostToDevice),
                    "cudaMemcpy host to device");

    // Host-side wall clock around launch + synchronize: the figure includes
    // launch overhead, not just kernel execution time.
    auto start = std::chrono::high_resolution_clock::now(); // start timer

    // Launch Kernel and Synchronize threads
    reduce1<<<num_blocks, blockSize, blockSize * sizeof(int)>>>(dev_input_data, dev_output_data);
    checkCudaOrExit(cudaGetLastError(), "reduce1 launch");          // bad launch configuration
    checkCudaOrExit(cudaDeviceSynchronize(), "reduce1 execution");  // faults raised while running

    auto stop = std::chrono::high_resolution_clock::now();
    // duration in milliseconds with three decimal points
    const double duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count() / 1000.0;

    // Copying data back to the host (CPU)
    checkCudaOrExit(cudaMemcpy(host_output_data, dev_output_data, out_bytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy device to host");

    // Final reduction on the host: add up the per-block partial sums
    const int finalResult = std::accumulate(host_output_data, host_output_data + num_blocks, 0);

    // CPU Summation for verification
    const int cpuResult = std::accumulate(host_input_data, host_input_data + n, 0);
    const bool verified = (cpuResult == finalResult);
    if (verified) {
        std::cout << "\033[32m"; // Set text color to green
        std::cout << "Verification successful: GPU result matches CPU result.\n";
        std::cout << "GPU Result: " << finalResult << ", CPU Result: " << cpuResult << std::endl;
    } else {
        std::cout << "\033[31m"; // Set text color to red
        std::cout << "Verification failed: GPU result (" << finalResult << ") does not match CPU result (" << cpuResult << ").\n";
        std::cout << "GPU Result: " << finalResult << ", CPU Result: " << cpuResult << std::endl;
    }
    std::cout << "\033[0m"; // Reset text color to default

    // bytes per millisecond divided by 1e6 equals GB/s; guard against a zero duration
    const double bandwidth = (duration > 0) ? (bytes / duration / 1e6) : 0;
    std::cout << "Reduced result: " << finalResult << std::endl;
    std::cout << "Time elapsed: " << duration << " ms" << std::endl;
    std::cout << "Effective bandwidth: " << bandwidth << " GB/s" << std::endl;

    // Freeing memory
    checkCudaOrExit(cudaFree(dev_input_data), "cudaFree input");
    checkCudaOrExit(cudaFree(dev_output_data), "cudaFree output");
    delete[] host_input_data;
    delete[] host_output_data;

    return verified ? EXIT_SUCCESS : EXIT_FAILURE;
}

Writing four.cu


In [4]:
!nvcc -arch=sm_75 one.cu -o one -lineinfo

In [5]:
!./one

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.661 ms
Effective bandwidth: 25.3816 GB/s


In [12]:
!nvcc -arch=sm_75 two.cu -o two -O1 -lineinfo

In [13]:
!./two

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.641 ms
Effective bandwidth: 26.1735 GB/s


In [15]:
!nvcc -arch=sm_75 three.cu -o three -O2 -lineinfo

In [16]:
!./three

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.633 ms
Effective bandwidth: 26.5043 GB/s


In [18]:
!nvcc -arch=sm_75 four.cu -o four -O3 -lineinfo

In [19]:
!./four

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.643 ms
Effective bandwidth: 26.0921 GB/s


In [22]:
! set -x \
&& cd $(mktemp -d) \
&& wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run \
&& sudo sh cuda_12.1.0_530.30.02_linux.run --silent --toolkit \
&& rm cuda_12.1.0_530.30.02_linux.run

++ mktemp -d
+ cd /tmp/tmp.TAG5U5vmg3
+ wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run
--2025-06-06 09:48:35--  https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run
Resolving developer.download.nvidia.com (developer.download.nvidia.com)... 2.16.106.152, 2.16.106.132
Connecting to developer.download.nvidia.com (developer.download.nvidia.com)|2.16.106.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4245586997 (4.0G) [application/octet-stream]
Saving to: ‘cuda_12.1.0_530.30.02_linux.run’


2025-06-06 09:49:01 (165 MB/s) - ‘cuda_12.1.0_530.30.02_linux.run’ saved [4245586997/4245586997]

+ sudo sh cuda_12.1.0_530.30.02_linux.run --silent --toolkit
+ rm cuda_12.1.0_530.30.02_linux.run


In [23]:
import os
os.environ['PATH'] = os.environ['PATH'] + ':/usr/local/cuda/bin/'

In [24]:
!nsys --version

NVIDIA Nsight Systems version 2023.1.2.43-32377213v0


In [25]:
!nsys profile ./one

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 1.782 ms
Effective bandwidth: 9.41482 GB/s
Generating '/tmp/nsys-report-2de9.qdstrm'
Generated:
    /content/report1.nsys-rep


In [38]:
!nsys stats report1.nsys-rep

Generating SQLite file report1.sqlite from report1.nsys-rep
Processing [report1.sqlite] with [/usr/local/cuda-12.1/nsight-systems-2023.1.2/host-linux-x64/reports/nvtx_sum.py]... 
SKIPPED: report1.sqlite does not contain NV Tools Extension (NVTX) data.

Processing [report1.sqlite] with [/usr/local/cuda-12.1/nsight-systems-2023.1.2/host-linux-x64/reports/osrt_sum.py]... 

 ** OS Runtime Summary (osrt_sum):

 Time (%)  Total Time (ns)  Num Calls    Avg (ns)       Med (ns)      Min (ns)     Max (ns)    StdDev (ns)            Name         
 --------  ---------------  ---------  -------------  -------------  -----------  -----------  ------------  ----------------------
     47.8      563,104,470          1  563,104,470.0  563,104,470.0  563,104,470  563,104,470           0.0  sem_wait              
     44.0      518,147,306         14   37,010,521.9    4,355,015.0        2,037  316,758,308  84,794,272.0  poll                  
      7.2       85,359,212        536      159,252.3       16,8

In [None]:
!ncu --version

NVIDIA (R) Nsight Compute Command Line Profiler
Copyright (c) 2018-2024 NVIDIA Corporation
Version 2024.2.1.0 (build 34372528) (public-release)


In [28]:
!ncu -o reduction_report_no ./one

==PROF== Connected to process 6522 (/content/one)
==PROF== Profiling "reduce1(int *, int *)" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 406.358 ms
Effective bandwidth: 0.0412868 GB/s
==PROF== Disconnected from process 6522
==PROF== Report: /content/reduction_report_no.ncu-rep


In [29]:
!ncu -o reduction_report_o1 ./two

==PROF== Connected to process 6616 (/content/two)
==PROF== Profiling "reduce1(int *, int *)" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 321.742 ms
Effective bandwidth: 0.0521449 GB/s
==PROF== Disconnected from process 6616
==PROF== Report: /content/reduction_report_o1.ncu-rep


In [30]:
!ncu -o reduction_report_o2 ./three

==PROF== Connected to process 6716 (/content/three)
==PROF== Profiling "reduce1(int *, int *)" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 319.959 ms
Effective bandwidth: 0.0524355 GB/s
==PROF== Disconnected from process 6716
==PROF== Report: /content/reduction_report_o2.ncu-rep


In [31]:
!ncu -o reduction_report_o3 ./four

==PROF== Connected to process 6810 (/content/four)
==PROF== Profiling "reduce1(int *, int *)" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 305.65 ms
Effective bandwidth: 0.0548903 GB/s
==PROF== Disconnected from process 6810
==PROF== Report: /content/reduction_report_o3.ncu-rep


In [32]:
!nvprof ./one

==6973== NVPROF is profiling process 6973, command: ./one
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.656 ms
Effective bandwidth: 25.575 GB/s
==6973== Profiling application: ./one
==6973== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   87.16%  3.5945ms         1  3.5945ms  3.5945ms  3.5945ms  [CUDA memcpy HtoD]
                   12.65%  521.78us         1  521.78us  521.78us  521.78us  reduce1(int*, int*)
                    0.18%  7.5840us         1  7.5840us  7.5840us  7.5840us  [CUDA memcpy DtoH]
      API calls:   95.06%  108.23ms         2  54.113ms  76.377us  108.15ms  cudaMalloc
                    3.40%  3.8656ms         2  1.9328ms  102.56us  3.7630ms  cudaMemcpy
                    0.77%  875.09us         2  437.55us  144.91us  730.18us  cudaFree
                    0.46%  522.72us         1  522.7

In [33]:
!nvprof --print-gpu-trace ./one

==7050== NVPROF is profiling process 7050, command: ./one
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.759 ms
Effective bandwidth: 22.1044 GB/s
==7050== Profiling application: ./one
==7050== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
226.83ms  4.0439ms                    -               -         -         -         -  16.000MB  3.8638GB/s    Pageable      Device     Tesla T4 (0)         1         7  [CUDA memcpy HtoD]
231.04ms  521.91us          (16384 1 1)       (256 1 1)        16        0B  1.0000KB         -           -           -           -     Tesla T4 (0)         1         7  reduce1(int*, int*) [128]
231.60ms  7.5520us                    -               -         -         -         -  64.000KB  8.0820GB/s      Device    P

In [34]:
!nvprof ./two

==7089== NVPROF is profiling process 7089, command: ./two
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.676 ms
Effective bandwidth: 24.8184 GB/s
==7089== Profiling application: ./two
==7089== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   87.20%  3.6093ms         1  3.6093ms  3.6093ms  3.6093ms  [CUDA memcpy HtoD]
                   12.61%  522.07us         1  522.07us  522.07us  522.07us  reduce1(int*, int*)
                    0.18%  7.5840us         1  7.5840us  7.5840us  7.5840us  [CUDA memcpy DtoH]
      API calls:   94.98%  106.99ms         2  53.495ms  85.684us  106.90ms  cudaMalloc
                    3.45%  3.8811ms         2  1.9406ms  91.547us  3.7896ms  cudaMemcpy
                    0.81%  911.90us         2  455.95us  180.65us  731.25us  cudaFree
                    0.46%  522.99us         1  522.

In [35]:
!nvprof --print-gpu-trace ./two

==7182== NVPROF is profiling process 7182, command: ./two
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.682 ms
Effective bandwidth: 24.6 GB/s
==7182== Profiling application: ./two
==7182== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
235.64ms  3.5499ms                    -               -         -         -         -  16.000MB  4.4015GB/s    Pageable      Device     Tesla T4 (0)         1         7  [CUDA memcpy HtoD]
239.27ms  521.94us          (16384 1 1)       (256 1 1)        16        0B  1.0000KB         -           -           -           -     Tesla T4 (0)         1         7  reduce1(int*, int*) [128]
239.82ms  7.5520us                    -               -         -         -         -  64.000KB  8.0820GB/s      Device    Page

In [36]:
!nvprof ./three

==7231== NVPROF is profiling process 7231, command: ./three
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.673 ms
Effective bandwidth: 24.929 GB/s
==7231== Profiling application: ./three
==7231== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   86.98%  3.5383ms         1  3.5383ms  3.5383ms  3.5383ms  [CUDA memcpy HtoD]
                   12.83%  522.07us         1  522.07us  522.07us  522.07us  reduce1(int*, int*)
                    0.19%  7.5520us         1  7.5520us  7.5520us  7.5520us  [CUDA memcpy DtoH]
      API calls:   94.76%  100.81ms         2  50.406ms  86.626us  100.72ms  cudaMalloc
                    3.61%  3.8368ms         2  1.9184ms  80.612us  3.7562ms  cudaMemcpy
                    0.85%  899.64us         2  449.82us  172.26us  727.39us  cudaFree
                    0.49%  523.81us         1  5

In [37]:
!nvprof ./four

==7268== NVPROF is profiling process 7268, command: ./four
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.7 ms
Effective bandwidth: 23.9675 GB/s
==7268== Profiling application: ./four
==7268== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   87.16%  3.5933ms         1  3.5933ms  3.5933ms  3.5933ms  [CUDA memcpy HtoD]
                   12.66%  521.91us         1  521.91us  521.91us  521.91us  reduce1(int*, int*)
                    0.18%  7.6160us         1  7.6160us  7.6160us  7.6160us  [CUDA memcpy DtoH]
      API calls:   94.94%  105.85ms         2  52.924ms  133.89us  105.71ms  cudaMalloc
                    3.46%  3.8547ms         2  1.9274ms  80.017us  3.7747ms  cudaMemcpy
                    0.82%  909.25us         2  454.63us  171.44us  737.81us  cudaFree
                    0.47%  522.12us         1  522.