In [13]:
%%writefile four.cu

#include <chrono>
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <vector>

#include "cuda_runtime.h"

// Final-warp stage of the unrolled block reduction: folds the first 64 (or
// fewer, depending on blockSize) entries of sdata down into sdata[0].
//
// blockSize is the compile-time block width; every `if (blockSize >= K)` test
// is resolved at compile time, so steps that do not apply are compiled out.
//
// sdata: shared-memory scratch array holding the partial sums.
// tid:   index of the calling thread within its block.
//
// Preconditions: the caller invokes this from the threads with tid < 32 only,
// and all of those threads must reach the call together, because __syncwarp()
// is used with its default full-warp mask.
// NOTE(review): a block narrower than one warp would name lanes that do not
// exist in that mask - confirm no such launch is intended before using
// blockSize < 32.
//
// Why the explicit barriers: on Volta and newer GPUs the lanes of a warp are
// not guaranteed to execute in lockstep, so `sdata[tid] += sdata[tid + k]`
// alone lets one lane read a slot that another lane is concurrently updating.
// Each step therefore reads, synchronizes, writes, and synchronizes again.
template <unsigned int blockSize>
__device__ void warpReduce(volatile int* sdata, int tid){
    // Running partial sum of this lane; only this lane ever writes sdata[tid],
    // so the register copy always equals the value stored there.
    int v = sdata[tid];

    if(blockSize >= 64) { v += sdata[tid + 32]; __syncwarp(); sdata[tid] = v; __syncwarp(); }
    if(blockSize >= 32) { v += sdata[tid + 16]; __syncwarp(); sdata[tid] = v; __syncwarp(); }
    if(blockSize >= 16) { v += sdata[tid + 8];  __syncwarp(); sdata[tid] = v; __syncwarp(); }
    if(blockSize >= 8)  { v += sdata[tid + 4];  __syncwarp(); sdata[tid] = v; __syncwarp(); }
    if(blockSize >= 4)  { v += sdata[tid + 2];  __syncwarp(); sdata[tid] = v; __syncwarp(); }
    if(blockSize >= 2)  { v += sdata[tid + 1];  __syncwarp(); sdata[tid] = v; __syncwarp(); }
}

// REDUCTION 5 - completely unrolled block-level sum.
//
// Every block produces one partial sum and stores it in g_out_data[blockIdx.x].
// Each thread first adds two input elements that lie blockDim.x apart, so one
// block consumes 2 * blockDim.x consecutive ints of g_in_data.
//
// Launch requirements visible from the code:
//   * dynamic shared memory large enough for blockDim.x ints (one slot per thread);
//   * the template argument blockSize selects which unrolled steps are compiled
//     in, so it is expected to equal blockDim.x;
//   * the input index is not bounds-checked: g_in_data must provide
//     gridDim.x * 2 * blockDim.x readable elements.
template <unsigned int blockSize>
__global__ void reduce5(int *g_in_data, int *g_out_data){
    extern __shared__ int sdata[];  // dynamically sized shared scratch array

    const unsigned int tid = threadIdx.x;

    // First add happens during the load: combine this thread's two elements.
    const unsigned int first = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    const int lower = g_in_data[first];
    const int upper = g_in_data[first + blockDim.x];
    sdata[tid] = lower + upper;
    __syncthreads();

    // Unrolled tree steps; each barrier sits outside the tid test so that
    // every thread of the block reaches it.
    if (blockSize >= 512) {
        if (tid < 256) {
            sdata[tid] += sdata[tid + 256];
        }
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128) {
            sdata[tid] += sdata[tid + 128];
        }
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64) {
            sdata[tid] += sdata[tid + 64];
        }
        __syncthreads();
    }

    // The remaining entries are folded by the first 32 threads.
    if (tid < 32) {
        warpReduce<blockSize>(sdata, tid);
    }

    // Only thread 0 publishes the block's partial sum.
    if (tid != 0) {
        return;
    }
    g_out_data[blockIdx.x] = sdata[0];
}

// Host driver intended to be shared by all of the reduction experiments.

// Terminates the program with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char *what){
    if (status != cudaSuccess) {
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Fills an array with pseudo-random ints, reduces it on the GPU with reduce5,
// finishes the sum of the per-block partials on the host, verifies the result
// against a CPU sum and prints timing / bandwidth figures.
// Returns EXIT_SUCCESS when the GPU and CPU sums agree, EXIT_FAILURE otherwise.
int main(){
    const int n = 1<<22; // about 4M elements
    const size_t bytes = static_cast<size_t>(n) * sizeof(int);

    int blockSize = 256; // number of threads per block
    // Each thread consumes two elements, so one block covers 2 * blockSize inputs.
    const int elems_per_block = 2 * blockSize;
    const int num_blocks = (n + elems_per_block - 1) / elems_per_block;

    // The kernel does not bounds-check its loads, so the device input is
    // rounded up to a whole number of blocks and the tail is zero-filled.
    const size_t padded_bytes = static_cast<size_t>(num_blocks) * elems_per_block * sizeof(int);
    // Exactly one partial sum per launched block: sizing the output by anything
    // larger would make the host add device memory the kernel never wrote.
    const size_t out_bytes = static_cast<size_t>(num_blocks) * sizeof(int);

    // Host/CPU arrays
    std::vector<int> host_input_data(n);
    std::vector<int> host_output_data(num_blocks);

    // Init data
    srand(42); // Fixed seed
    for (int i = 0; i < n; i++){
        host_input_data[i] = rand() % 100;
    }

    // Device/GPU arrays
    int *dev_input_data = nullptr;
    int *dev_output_data = nullptr;
    checkCuda(cudaMalloc(&dev_input_data, padded_bytes), "cudaMalloc(input)");
    checkCuda(cudaMalloc(&dev_output_data, out_bytes), "cudaMalloc(output)");

    // Zero the padding (if any), then copy the real data onto the device
    if (padded_bytes > bytes) {
        checkCuda(cudaMemset(dev_input_data + n, 0, padded_bytes - bytes), "cudaMemset(input padding)");
    }
    checkCuda(cudaMemcpy(dev_input_data, host_input_data.data(), bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(host to device)");

    auto start = std::chrono::high_resolution_clock::now(); // start timer

    // The block size is a template argument of the kernel, so the runtime value
    // has to be dispatched to one of the precompiled instantiations.
    switch (blockSize) {
        case 512:
            reduce5<512><<<num_blocks, 512, 512 * sizeof(int)>>>(dev_input_data, dev_output_data);
            break;
        case 256:
            reduce5<256><<<num_blocks, 256, 256 * sizeof(int)>>>(dev_input_data, dev_output_data);
            break;
        case 128:
            reduce5<128><<<num_blocks, 128, 128 * sizeof(int)>>>(dev_input_data, dev_output_data);
            break;
        default:
            // Without this branch an unsupported size would silently launch nothing.
            std::cerr << "Unsupported block size " << blockSize
                      << " (expected 128, 256 or 512)" << std::endl;
            cudaFree(dev_input_data);
            cudaFree(dev_output_data);
            return EXIT_FAILURE;
    }
    checkCuda(cudaGetLastError(), "kernel launch");          // bad launch configuration
    checkCuda(cudaDeviceSynchronize(), "kernel execution");  // faults raised while running

    auto stop = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count() / 1000.0; // duration in milliseconds with three decimal points

    // Copying the per-block partial sums back to the host (CPU)
    checkCuda(cudaMemcpy(host_output_data.data(), dev_output_data, out_bytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy(device to host)");

    // Final reduction on the host
    int finalResult = std::accumulate(host_output_data.begin(), host_output_data.end(), 0);

    // CPU Summation for verification
    int cpuResult = std::accumulate(host_input_data.begin(), host_input_data.end(), 0);
    const bool verified = (cpuResult == finalResult);
    if (verified) {
        std::cout << "\033[32m"; // Set text color to green
        std::cout << "Verification successful: GPU result matches CPU result.\n";
        std::cout << "GPU Result: " << finalResult << ", CPU Result: " << cpuResult << std::endl;
    } else {
        std::cout << "\033[31m"; // Set text color to red
        std::cout << "Verification failed: GPU result (" << finalResult << ") does not match CPU result (" << cpuResult << ").\n";
        std::cout << "GPU Result: " << finalResult << ", CPU Result: " << cpuResult << std::endl;
    }
    std::cout << "\033[0m"; // Reset text color to default

    // bytes / ms / 1e6 == GB/s; a zero duration is reported as zero bandwidth
    double bandwidth = (duration > 0) ? (bytes / duration / 1e6) : 0;
    std::cout << "Reduced result: " << finalResult << std::endl;
    std::cout << "Time elapsed: " << duration << " ms" << std::endl;
    std::cout << "Effective bandwidth: " << bandwidth << " GB/s" << std::endl;

    // Freeing device memory (host vectors release themselves)
    checkCuda(cudaFree(dev_input_data), "cudaFree(input)");
    checkCuda(cudaFree(dev_output_data), "cudaFree(output)");

    return verified ? EXIT_SUCCESS : EXIT_FAILURE;
}

Writing four.cu


In [3]:
!nvcc -arch=sm_75 one.cu -o one

In [4]:
!./one

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.293 ms
Effective bandwidth: 57.2601 GB/s


In [14]:
!nvcc -arch=sm_75 two.cu -o two -O1 -lineinfo

In [15]:
!./two

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.268 ms
Effective bandwidth: 62.6016 GB/s


In [16]:
!nvcc -arch=sm_75 three.cu -o three -O2 -lineinfo

In [19]:
!./three

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.303 ms
Effective bandwidth: 55.3703 GB/s


In [20]:
!nvcc -arch=sm_75 four.cu -o four -O3 -lineinfo

In [21]:
!./four

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.305 ms
Effective bandwidth: 55.0073 GB/s


In [22]:
! set -x \
&& cd $(mktemp -d) \
&& wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run \
&& sudo sh cuda_12.1.0_530.30.02_linux.run --silent --toolkit \
&& rm cuda_12.1.0_530.30.02_linux.run

++ mktemp -d
+ cd /tmp/tmp.3n6ud455iM
+ wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run
--2025-06-12 14:40:42--  https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run
Resolving developer.download.nvidia.com (developer.download.nvidia.com)... 23.43.51.15, 23.43.51.10
Connecting to developer.download.nvidia.com (developer.download.nvidia.com)|23.43.51.15|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4245586997 (4.0G) [application/octet-stream]
Saving to: ‘cuda_12.1.0_530.30.02_linux.run’


2025-06-12 14:41:17 (116 MB/s) - ‘cuda_12.1.0_530.30.02_linux.run’ saved [4245586997/4245586997]

+ sudo sh cuda_12.1.0_530.30.02_linux.run --silent --toolkit
+ rm cuda_12.1.0_530.30.02_linux.run


In [23]:
import os
os.environ['PATH'] = os.environ['PATH'] + ':/usr/local/cuda/bin/'

In [24]:
!ncu -o reduction_report_no ./one

==PROF== Connected to process 6893 (/content/one)
==PROF== Profiling "reduce5" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 508.746 ms
Effective bandwidth: 0.0329776 GB/s
==PROF== Disconnected from process 6893
==PROF== Report: /content/reduction_report_no.ncu-rep


In [25]:
!ncu -o reduction_report_o1 ./two

==PROF== Connected to process 6991 (/content/two)
==PROF== Profiling "reduce5" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 296.269 ms
Effective bandwidth: 0.0566283 GB/s
==PROF== Disconnected from process 6991
==PROF== Report: /content/reduction_report_o1.ncu-rep


In [26]:
!ncu -o reduction_report_o2 ./three

==PROF== Connected to process 7091 (/content/three)
==PROF== Profiling "reduce5" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 320.864 ms
Effective bandwidth: 0.0522876 GB/s
==PROF== Disconnected from process 7091
==PROF== Report: /content/reduction_report_o2.ncu-rep


In [27]:
!ncu -o reduction_report_o3 ./four

==PROF== Connected to process 7215 (/content/four)
==PROF== Profiling "reduce5" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 304.368 ms
Effective bandwidth: 0.0551215 GB/s
==PROF== Disconnected from process 7215
==PROF== Report: /content/reduction_report_o3.ncu-rep


In [28]:
!nsys profile ./one

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 1.214 ms
Effective bandwidth: 13.8198 GB/s
Generating '/tmp/nsys-report-5885.qdstrm'
Generated:
    /content/report1.nsys-rep


In [29]:
!nsys stats report1.nsys-rep

Generating SQLite file report1.sqlite from report1.nsys-rep
Processing [report1.sqlite] with [/usr/local/cuda-12.1/nsight-systems-2023.1.2/host-linux-x64/reports/nvtx_sum.py]... 
SKIPPED: report1.sqlite does not contain NV Tools Extension (NVTX) data.

Processing [report1.sqlite] with [/usr/local/cuda-12.1/nsight-systems-2023.1.2/host-linux-x64/reports/osrt_sum.py]... 

 ** OS Runtime Summary (osrt_sum):

 Time (%)  Total Time (ns)  Num Calls    Avg (ns)       Med (ns)     Min (ns)    Max (ns)     StdDev (ns)            Name         
 --------  ---------------  ---------  -------------  -------------  ---------  -----------  -------------  ----------------------
     46.9      438,493,397          2  219,246,698.5  219,246,698.5  2,157,075  436,336,322  307,011,089.8  sem_wait              
     44.8      419,286,382         13   32,252,798.6    2,700,198.0      2,134  318,486,813   87,013,059.2  poll                  
      7.1       66,459,069        536      123,990.8       15,570.5

In [30]:
!nvprof ./one

==7537== NVPROF is profiling process 7537, command: ./one
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.359 ms
Effective bandwidth: 46.7332 GB/s
==7537== Profiling application: ./one
==7537== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   96.13%  3.7138ms         1  3.7138ms  3.7138ms  3.7138ms  [CUDA memcpy HtoD]
                    3.68%  142.05us         1  142.05us  142.05us  142.05us  void reduce5<unsigned int=256>(int*, int*)
                    0.20%  7.5520us         1  7.5520us  7.5520us  7.5520us  [CUDA memcpy DtoH]
      API calls:   95.08%  103.88ms         2  51.939ms  89.781us  103.79ms  cudaMalloc
                    3.66%  3.9996ms         2  1.9998ms  84.030us  3.9156ms  cudaMemcpy
                    0.79%  867.21us         2  433.61us  140.67us  726.54us  cudaFree
                    0.19%  2

In [31]:
!nvprof ./two

==7574== NVPROF is profiling process 7574, command: ./two
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.313 ms
Effective bandwidth: 53.6013 GB/s
==7574== Profiling application: ./two
==7574== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   95.90%  3.4802ms         1  3.4802ms  3.4802ms  3.4802ms  [CUDA memcpy HtoD]
                    3.90%  141.50us         1  141.50us  141.50us  141.50us  void reduce5<unsigned int=256>(int*, int*)
                    0.21%  7.4550us         1  7.4550us  7.4550us  7.4550us  [CUDA memcpy DtoH]
      API calls:   95.29%  103.91ms         2  51.953ms  78.228us  103.83ms  cudaMalloc
                    3.42%  3.7326ms         2  1.8663ms  76.306us  3.6563ms  cudaMemcpy
                    0.82%  898.14us         2  449.07us  151.19us  746.95us  cudaFree
                    0.16%  1

In [32]:
!nvprof ./three

==7645== NVPROF is profiling process 7645, command: ./three
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.302 ms
Effective bandwidth: 55.5537 GB/s
==7645== Profiling application: ./three
==7645== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   95.95%  3.5349ms         1  3.5349ms  3.5349ms  3.5349ms  [CUDA memcpy HtoD]
                    3.85%  141.89us         1  141.89us  141.89us  141.89us  void reduce5<unsigned int=256>(int*, int*)
                    0.20%  7.4560us         1  7.4560us  7.4560us  7.4560us  [CUDA memcpy DtoH]
      API calls:   95.18%  101.21ms         2  50.604ms  82.568us  101.13ms  cudaMalloc
                    3.55%  3.7762ms         2  1.8881ms  74.602us  3.7016ms  cudaMemcpy
                    0.81%  860.07us         2  430.03us  134.33us  725.74us  cudaFree
                    0.16

In [33]:
!nvprof ./four

==7698== NVPROF is profiling process 7698, command: ./four
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.349 ms
Effective bandwidth: 48.0723 GB/s
==7698== Profiling application: ./four
==7698== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   97.83%  6.7447ms         1  6.7447ms  6.7447ms  6.7447ms  [CUDA memcpy HtoD]
                    2.06%  141.89us         1  141.89us  141.89us  141.89us  void reduce5<unsigned int=256>(int*, int*)
                    0.11%  7.4880us         1  7.4880us  7.4880us  7.4880us  [CUDA memcpy DtoH]
      API calls:   94.78%  154.42ms         2  77.212ms  102.93us  154.32ms  cudaMalloc
                    4.32%  7.0356ms         2  3.5178ms  92.043us  6.9436ms  cudaMemcpy
                    0.56%  907.78us         2  453.89us  159.50us  748.27us  cudaFree
                    0.12% 