In [20]:
%%writefile five.cu
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <vector>

#include "cuda_runtime.h"

// REDUCTION 3 – First Add During Load
//
// Block-level sum reduction. Each block consumes a chunk of 2 * blockDim.x
// input elements and writes a single partial sum to g_out_data[blockIdx.x];
// the caller is responsible for adding the partial sums together.
//
// Launch requirements:
//   * 1D grid, 1D block. Block b covers input indices
//     [b * 2 * blockDim.x, (b + 1) * 2 * blockDim.x).
//   * blockDim.x must be a power of two: the tree loop halves the stride on
//     every step, so any other block size leaves some elements out of the sum.
//   * Dynamic shared memory: blockDim.x * sizeof(int) bytes.
//   * Indices are computed in 32-bit unsigned arithmetic.
//
// Parameters:
//   g_in_data   device pointer to the input values.
//   g_out_data  device pointer with room for gridDim.x partial sums.
//   n           number of valid input elements; indices >= n contribute 0, so
//               the last block may be partially filled. The default leaves
//               every index the grid touches enabled, which is the behaviour
//               of a two-argument launch reduce3<<<...>>>(in, out); in that
//               case the grid must cover exactly the valid elements.
__global__ void reduce3(int *g_in_data, int *g_out_data, unsigned int n = 0xFFFFFFFFu){
    extern __shared__ int sdata[];  // one slot per thread, sized by the launch

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;

    // First add during load: each thread sums its two elements, blockDim.x
    // apart, before anything is stored in shared memory. Both reads are
    // guarded so a partially filled last block never reads past the input.
    int sum = 0;
    if (i < n){
        sum = g_in_data[i];
        // Same test as (i + blockDim.x < n), written so it cannot wrap around.
        if (blockDim.x < n - i){
            sum += g_in_data[i + blockDim.x];
        }
    }
    sdata[tid] = sum;
    __syncthreads();    // all slots must be written before any thread reads another's

    // Tree reduction in shared memory: the stride starts at half the block and
    // halves each step, so the active threads are always the contiguous range
    // [0, s).
    for(unsigned int s = blockDim.x/2; s > 0; s >>= 1){
        if (tid < s){
            sdata[tid] += sdata[tid + s];
        }
        // Kept outside the branch so that every thread of the block reaches it.
        __syncthreads();
    }

    // Thread 0 now holds the block total and publishes it.
    if (tid == 0){
        g_out_data[blockIdx.x] = sdata[0];
    }
}

// I hope to use this main file for all of the reduction files
// Benchmark driver: fills an array with pseudo-random values, reduces it on the
// GPU with reduce3 (one partial sum per block, added up on the host), checks
// the total against a CPU sum, and prints the elapsed time and the effective
// bandwidth.
//
// Returns EXIT_SUCCESS when the GPU and CPU totals agree and EXIT_FAILURE when
// they differ. Any failing CUDA call is reported on stderr and terminates the
// program with EXIT_FAILURE.
int main(){
    constexpr int n = 1<<22;        // about 4M elements
    constexpr int blockSize = 256;  // number of threads per block

    // The in-block tree reduction halves its stride each step, which only
    // visits every element when the block size is a power of two.
    static_assert((blockSize & (blockSize - 1)) == 0,
                  "blockSize must be a power of two");
    // The launch below passes no element count, so every thread reads its two
    // input elements unconditionally; the grid must tile the input exactly.
    static_assert(n % (2 * blockSize) == 0,
                  "n must be a multiple of 2 * blockSize");

    // Each thread consumes two elements, so one block covers 2 * blockSize of them.
    const int num_blocks = (n + (2 * blockSize) - 1) / (2 * blockSize);
    const size_t bytes = n * sizeof(int);
    // Exactly one partial sum per launched block. Sizing the output by anything
    // larger would make the host add device memory the kernel never wrote.
    const size_t out_bytes = num_blocks * sizeof(int);

    // Report a failed CUDA call and stop; continuing would only produce
    // follow-up errors or a meaningless result.
    auto check = [](cudaError_t status, const char *what){
        if (status != cudaSuccess){
            std::cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << std::endl;
            std::exit(EXIT_FAILURE);
        }
    };

    // Host/CPU arrays
    std::vector<int> host_input_data(n);
    std::vector<int> host_output_data(num_blocks);

    // Init data
    std::srand(42); // Fixed seed so every run reduces the same values
    for (int &value : host_input_data){
        value = std::rand() % 100;
    }

    // Device/GPU arrays
    int *dev_input_data = nullptr;
    int *dev_output_data = nullptr;
    check(cudaMalloc(&dev_input_data, bytes), "cudaMalloc input");
    check(cudaMalloc(&dev_output_data, out_bytes), "cudaMalloc output");

    // Copying our data onto the device (GPU)
    check(cudaMemcpy(dev_input_data, host_input_data.data(), bytes, cudaMemcpyHostToDevice),
          "cudaMemcpy host to device");

    auto start = std::chrono::high_resolution_clock::now(); // start timer

    // Launch the kernel and wait for it; the launch is asynchronous, so the
    // timer is only stopped after the synchronize returns.
    reduce3<<<num_blocks, blockSize, blockSize * sizeof(int)>>>(dev_input_data, dev_output_data);
    check(cudaGetLastError(), "reduce3 launch");
    check(cudaDeviceSynchronize(), "reduce3 execution");

    auto stop = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count() / 1000.0; // duration in milliseconds with three decimal points

    // Copying the per-block partial sums back to the host (CPU)
    check(cudaMemcpy(host_output_data.data(), dev_output_data, out_bytes, cudaMemcpyDeviceToHost),
          "cudaMemcpy device to host");

    // Final reduction on the host
    int finalResult = std::accumulate(host_output_data.begin(), host_output_data.end(), 0);

    // CPU Summation for verification
    int cpuResult = std::accumulate(host_input_data.begin(), host_input_data.end(), 0);
    const bool verified = (cpuResult == finalResult);
    if (verified) {
        std::cout << "\033[32m"; // Set text color to green
        std::cout << "Verification successful: GPU result matches CPU result.\n";
        std::cout << "GPU Result: " << finalResult << ", CPU Result: " << cpuResult << std::endl;
    } else {
        std::cout << "\033[31m"; // Set text color to red
        std::cout << "Verification failed: GPU result (" << finalResult << ") does not match CPU result (" << cpuResult << ").\n";
        std::cout << "GPU Result: " << finalResult << ", CPU Result: " << cpuResult << std::endl;
    }
    std::cout << "\033[0m"; // Reset text color to default

    // bytes per millisecond divided by 1e6 is GB/s; a zero duration yields 0.
    double bandwidth = (duration > 0) ? (bytes / duration / 1e6) : 0;
    std::cout << "Reduced result: " << finalResult << std::endl;
    std::cout << "Time elapsed: " << duration << " ms" << std::endl;
    std::cout << "Effective bandwidth: " << bandwidth << " GB/s" << std::endl;

    // Freeing device memory (the host vectors release themselves)
    check(cudaFree(dev_input_data), "cudaFree input");
    check(cudaFree(dev_output_data), "cudaFree output");

    return verified ? EXIT_SUCCESS : EXIT_FAILURE;
}

Writing five.cu


In [7]:
!nvcc -arch=sm_75 two.cu -o two

In [8]:
!./two

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.355 ms
Effective bandwidth: 47.2598 GB/s


In [11]:
!nvcc -arch=sm_75 three.cu -o three -O1 -lineinfo

In [13]:
!./three

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.332 ms
Effective bandwidth: 50.5338 GB/s


In [17]:
!nvcc -arch=sm_75 four.cu -o four -O2 -lineinfo

In [18]:
!./four

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.329 ms
Effective bandwidth: 50.9946 GB/s


In [21]:
!nvcc -arch=sm_75 five.cu -o five -O3 -lineinfo

In [22]:
!./five

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.324 ms
Effective bandwidth: 51.7815 GB/s


In [23]:
! set -x \
&& cd $(mktemp -d) \
&& wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run \
&& sudo sh cuda_12.1.0_530.30.02_linux.run --silent --toolkit \
&& rm cuda_12.1.0_530.30.02_linux.run

++ mktemp -d
+ cd /tmp/tmp.ZfhfY2weaK
+ wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run
--2025-06-12 07:04:16--  https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run
Resolving developer.download.nvidia.com (developer.download.nvidia.com)... 23.59.88.207, 23.59.88.195
Connecting to developer.download.nvidia.com (developer.download.nvidia.com)|23.59.88.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4245586997 (4.0G) [application/octet-stream]
Saving to: ‘cuda_12.1.0_530.30.02_linux.run’


2025-06-12 07:04:40 (165 MB/s) - ‘cuda_12.1.0_530.30.02_linux.run’ saved [4245586997/4245586997]

+ sudo sh cuda_12.1.0_530.30.02_linux.run --silent --toolkit
+ rm cuda_12.1.0_530.30.02_linux.run


In [24]:
import os
os.environ['PATH'] = os.environ['PATH'] + ':/usr/local/cuda/bin/'

In [25]:
!ncu -o reduction_report_no ./two

==PROF== Connected to process 11127 (/content/two)
==PROF== Profiling "reduce3(int *, int *)" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 480.13 ms
Effective bandwidth: 0.0349431 GB/s
==PROF== Disconnected from process 11127
==PROF== Report: /content/reduction_report_no.ncu-rep


In [43]:
!ncu -o reduction_report_o1 ./three

==PROF== Connected to process 12678 (/content/three)
==PROF== Profiling "reduce3(int *, int *)" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 317.704 ms
Effective bandwidth: 0.0528077 GB/s
==PROF== Disconnected from process 12678
==PROF== Report: /content/reduction_report_o1.ncu-rep


In [41]:
!ncu -o reduction_report_o2 ./four

==PROF== Connected to process 12547 (/content/four)
==PROF== Profiling "reduce3(int *, int *)" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 385.163 ms
Effective bandwidth: 0.0435587 GB/s
==PROF== Disconnected from process 12547
==PROF== Report: /content/reduction_report_o2.ncu-rep


In [44]:
!ncu -o reduction_report_o3 ./five

==PROF== Connected to process 12890 (/content/five)
==PROF== Profiling "reduce3(int *, int *)" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 289.578 ms
Effective bandwidth: 0.0579368 GB/s
==PROF== Disconnected from process 12890
==PROF== Report: /content/reduction_report_o3.ncu-rep


In [46]:
!nsys profile ./two

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 1.308 ms
Effective bandwidth: 12.8266 GB/s
Generating '/tmp/nsys-report-10e6.qdstrm'
Generated:
    /content/report1.nsys-rep


In [47]:
!nsys stats report1.nsys-rep

Generating SQLite file report1.sqlite from report1.nsys-rep
Processing [report1.sqlite] with [/usr/local/cuda-12.1/nsight-systems-2023.1.2/host-linux-x64/reports/nvtx_sum.py]... 
SKIPPED: report1.sqlite does not contain NV Tools Extension (NVTX) data.

Processing [report1.sqlite] with [/usr/local/cuda-12.1/nsight-systems-2023.1.2/host-linux-x64/reports/osrt_sum.py]... 

 ** OS Runtime Summary (osrt_sum):

 Time (%)  Total Time (ns)  Num Calls    Avg (ns)       Med (ns)     Min (ns)    Max (ns)     StdDev (ns)            Name         
 --------  ---------------  ---------  -------------  -------------  ---------  -----------  -------------  ----------------------
     44.2      651,742,677         14   46,553,048.4    3,252,315.0     44,681  440,150,267  117,207,672.4  poll                  
     44.0      647,415,878          2  323,707,939.0  323,707,939.0  2,056,475  645,359,403  454,883,862.7  sem_wait              
     11.4      168,542,628        536      314,445.2       16,180.5

In [48]:
!nvprof ./two

==21679== NVPROF is profiling process 21679, command: ./two
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.383 ms
Effective bandwidth: 43.8047 GB/s
==21679== Profiling application: ./two
==21679== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   93.94%  3.5143ms         1  3.5143ms  3.5143ms  3.5143ms  [CUDA memcpy HtoD]
                    5.86%  219.36us         1  219.36us  219.36us  219.36us  reduce3(int*, int*)
                    0.20%  7.3600us         1  7.3600us  7.3600us  7.3600us  [CUDA memcpy DtoH]
      API calls:   97.31%  187.97ms         2  93.983ms  76.222us  187.89ms  cudaMalloc
                    1.95%  3.7651ms         2  1.8826ms  85.406us  3.6797ms  cudaMemcpy
                    0.45%  876.66us         2  438.33us  157.72us  718.95us  cudaFree
                    0.11%  218.76us         1  

In [49]:
!nvprof ./three

==26837== NVPROF is profiling process 26837, command: ./three
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.355 ms
Effective bandwidth: 47.2598 GB/s
==26837== Profiling application: ./three
==26837== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   93.89%  3.4846ms         1  3.4846ms  3.4846ms  3.4846ms  [CUDA memcpy HtoD]
                    5.91%  219.32us         1  219.32us  219.32us  219.32us  reduce3(int*, int*)
                    0.20%  7.4240us         1  7.4240us  7.4240us  7.4240us  [CUDA memcpy DtoH]
      API calls:   97.30%  184.01ms         2  92.003ms  75.700us  183.93ms  cudaMalloc
                    1.98%  3.7442ms         2  1.8721ms  79.860us  3.6644ms  cudaMemcpy
                    0.45%  859.07us         2  429.54us  147.20us  711.87us  cudaFree
                    0.11%  211.49us        

In [50]:
!nvprof ./four

==26913== NVPROF is profiling process 26913, command: ./four
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.356 ms
Effective bandwidth: 47.127 GB/s
==26913== Profiling application: ./four
==26913== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   93.87%  3.4689ms         1  3.4689ms  3.4689ms  3.4689ms  [CUDA memcpy HtoD]
                    5.93%  219.32us         1  219.32us  219.32us  219.32us  reduce3(int*, int*)
                    0.20%  7.3920us         1  7.3920us  7.3920us  7.3920us  [CUDA memcpy DtoH]
      API calls:   97.37%  187.49ms         2  93.747ms  79.636us  187.42ms  cudaMalloc
                    1.93%  3.7201ms         2  1.8601ms  72.473us  3.6477ms  cudaMemcpy
                    0.44%  839.01us         2  419.50us  126.05us  712.95us  cudaFree
                    0.11%  219.61us         1 

In [51]:
!nvprof ./five

==27054== NVPROF is profiling process 27054, command: ./five
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.345 ms
Effective bandwidth: 48.6296 GB/s
==27054== Profiling application: ./five
==27054== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   93.84%  3.4551ms         1  3.4551ms  3.4551ms  3.4551ms  [CUDA memcpy HtoD]
                    5.96%  219.29us         1  219.29us  219.29us  219.29us  reduce3(int*, int*)
                    0.20%  7.3910us         1  7.3910us  7.3910us  7.3910us  [CUDA memcpy DtoH]
      API calls:   97.33%  185.49ms         2  92.746ms  74.936us  185.42ms  cudaMalloc
                    1.95%  3.7146ms         2  1.8573ms  76.147us  3.6385ms  cudaMemcpy
                    0.44%  835.59us         2  417.80us  122.84us  712.76us  cudaFree
                    0.12%  220.42us         1