In [14]:
%%writefile four.cu
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <numeric>

#include<cuda_runtime.h>

// Final-warp stage of the block reduction: folds sdata[0..63] into sdata[0]
// using offsets 32, 16, 8, 4, 2, 1 without any block-wide barrier.
//
// Preconditions:
//   * called by all 32 threads of a single warp, with tid in [0, 32);
//   * sdata holds at least 64 valid ints (the first step reads sdata[tid + 32]).
//
// Only sdata[0] is a meaningful result afterwards; the other slots hold
// intermediate partial sums.
//
// The original version relied on implicit warp-synchronous execution via
// `volatile` alone. On Volta and newer GPUs threads of a warp are scheduled
// independently, so lockstep execution is not guaranteed: one lane could
// overwrite sdata[tid] while another lane is still reading it. Each step is
// therefore split into a read phase and a write phase separated by
// __syncwarp(), which both synchronizes the warp and orders the shared-memory
// accesses among its lanes.
__device__ void warpReduce(volatile int* sdata, int tid){
  // Running partial sum is kept in a register; shared memory is only used to
  // exchange values between lanes.
  int partial = sdata[tid];

  #pragma unroll
  for (int offset = 32; offset > 0; offset >>= 1) {
    partial += sdata[tid + offset];  // read the neighbour's current value
    __syncwarp();                    // every lane has finished reading
    sdata[tid] = partial;            // publish the updated partial sum
    __syncwarp();                    // every lane sees it before the next step
  }
}

// REDUCTION 4 - unrolled last warp.
//
// Each block sums 2 * blockDim.x consecutive ints of g_in_data and writes the
// block total to g_out_data[blockIdx.x].
//
// Launch requirements implied by the code below:
//   * dynamic shared memory of blockDim.x * sizeof(int) bytes;
//   * no bounds checking is done, so g_in_data must hold at least
//     gridDim.x * 2 * blockDim.x elements;
//   * the stride is halved every step, so blockDim.x is expected to be a
//     power of two, and at least 64 because warpReduce reads sdata[tid + 32].
__global__ void reduce4(int *g_in_data, int *g_out_data){
    extern __shared__ int sdata[];  // one partial sum per thread of the block

    const unsigned int local = threadIdx.x;
    const unsigned int first = blockIdx.x*(blockDim.x*2) + local;

    // First add happens while loading: every thread combines two global
    // elements that are blockDim.x apart and stores the sum in shared memory.
    const unsigned int second = first + blockDim.x;
    sdata[local] = g_in_data[first] + g_in_data[second];
    __syncthreads();

    // Tree reduction in shared memory. It stops once the stride reaches 32;
    // the remaining 64 partial sums are handled by warpReduce.
    unsigned int stride = blockDim.x/2;
    while (stride > 32){
        if (local < stride){
            sdata[local] += sdata[local + stride];
        }
        // Barrier is outside the branch so every thread of the block reaches it.
        __syncthreads();
        stride >>= 1;
    }

    // Only the first 32 threads take part in the unrolled tail.
    if (local < 32){
        warpReduce(sdata, local);
    }

    // Thread 0 publishes this block's total.
    if (local == 0){
        g_out_data[blockIdx.x] = sdata[0];
    }
}

// Host driver intended to be shared by all of the reduction variants.
//
// Fills an array with pseudo-random ints (fixed seed), launches reduce4 once,
// adds up the per-block partial sums on the host and verifies the total
// against a CPU std::accumulate. Prints the result, the elapsed wall-clock
// time around the launch and an effective bandwidth figure.
//
// Changes compared to the first version:
//   * The output buffers are sized from num_blocks. Previously they were
//     sized/copied/summed with (n + 255) / 256 entries, twice as many as the
//     kernel writes (each block consumes 2 * blockSize inputs), so the host
//     summed uninitialised device memory.
//   * Every CUDA API call is checked; a failure prints a message and exits
//     with a non-zero status instead of silently continuing.
//   * The kernel's launch preconditions are enforced at compile time.
int main(){
    constexpr int n = 1<<22;        // about 4M elements
    constexpr int blockSize = 256;  // number of threads per block

    // One thread loads two elements, so a block consumes 2 * blockSize inputs.
    constexpr int num_blocks = (n + (2 * blockSize) - 1) / (2 * blockSize);

    // reduce4 performs no bounds checks and its final stage reads
    // sdata[tid + 32], so these must hold for the launch to be valid.
    static_assert(n % (2 * blockSize) == 0,
                  "n must be a multiple of 2 * blockSize (kernel has no bounds check)");
    static_assert(blockSize >= 64 && (blockSize & (blockSize - 1)) == 0,
                  "blockSize must be a power of two and at least 64");

    const size_t bytes = n * sizeof(int);
    const size_t out_bytes = num_blocks * sizeof(int);  // one partial sum per block

    // Aborts with a diagnostic when a CUDA runtime call fails.
    auto check = [](cudaError_t status, const char *what) {
        if (status != cudaSuccess) {
            std::cerr << "CUDA error: " << cudaGetErrorString(status)
                      << " [" << what << "]" << std::endl;
            std::exit(EXIT_FAILURE);
        }
    };

    // Host/CPU arrays
    int *host_input_data = new int[n];
    int *host_output_data = new int[num_blocks];

    // Device/GPU arrays
    int *dev_input_data = nullptr;
    int *dev_output_data = nullptr;

    // Init data
    srand(42); // Fixed seed so every run sums the same values
    for (int i = 0; i < n; i++){
        host_input_data[i] = rand() % 100;
    }

    // Allocating memory on GPU for device arrays
    check(cudaMalloc(&dev_input_data, bytes), "cudaMalloc input");
    check(cudaMalloc(&dev_output_data, out_bytes), "cudaMalloc output");

    // Copying our data onto the device (GPU)
    check(cudaMemcpy(dev_input_data, host_input_data, bytes, cudaMemcpyHostToDevice),
          "cudaMemcpy host to device");

    auto start = std::chrono::high_resolution_clock::now(); // start timer

    // Launch the kernel and wait for it. The status codes are inspected only
    // after the timer has stopped so the checks stay out of the measurement.
    reduce4<<<num_blocks, blockSize, blockSize * sizeof(int)>>>(dev_input_data, dev_output_data);
    const cudaError_t launchStatus = cudaGetLastError();    // bad launch configuration
    const cudaError_t syncStatus = cudaDeviceSynchronize(); // faults during execution

    auto stop = std::chrono::high_resolution_clock::now();
    check(launchStatus, "reduce4 launch");
    check(syncStatus, "reduce4 execution");

    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count() / 1000.0; // duration in milliseconds with three decimal points

    // Copying the per-block partial sums back to the host (CPU)
    check(cudaMemcpy(host_output_data, dev_output_data, out_bytes, cudaMemcpyDeviceToHost),
          "cudaMemcpy device to host");

    // Final reduction on the host over exactly the entries the kernel wrote
    int finalResult = std::accumulate(host_output_data, host_output_data + num_blocks, 0);

    // CPU Summation for verification
    int cpuResult = std::accumulate(host_input_data, host_input_data + n, 0);
    if (cpuResult == finalResult) {
        std::cout << "\033[32m"; // Set text color to green
        std::cout << "Verification successful: GPU result matches CPU result.\n";
        std::cout << "GPU Result: " << finalResult << ", CPU Result: " << cpuResult << std::endl;
    } else {
        std::cout << "\033[31m"; // Set text color to red
        std::cout << "Verification failed: GPU result (" << finalResult << ") does not match CPU result (" << cpuResult << ").\n";
        std::cout << "GPU Result: " << finalResult << ", CPU Result: " << cpuResult << std::endl;
    }
    std::cout << "\033[0m"; // Reset text color to default

    // bytes / milliseconds / 1e6 == GB/s; only the input bytes are counted.
    double bandwidth = (duration > 0) ? (bytes / duration / 1e6) : 0; // handling zero duration
    std::cout << "Reduced result: " << finalResult << std::endl;
    std::cout << "Time elapsed: " << duration << " ms" << std::endl;
    std::cout << "Effective bandwidth: " << bandwidth << " GB/s" << std::endl;

    // Freeing memory
    check(cudaFree(dev_input_data), "cudaFree input");
    check(cudaFree(dev_output_data), "cudaFree output");
    delete[] host_input_data;
    delete[] host_output_data;
}

Writing four.cu


In [2]:
!nvcc -arch=sm_75 one.cu -o one

In [3]:
!./one

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.289 ms
Effective bandwidth: 58.0527 GB/s


In [7]:
!nvcc -arch=sm_75 two.cu -o two -O1 -lineinfo

In [8]:
!./two

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.26 ms
Effective bandwidth: 64.5278 GB/s


In [12]:
!nvcc -arch=sm_75 three.cu -o three -O2 -lineinfo

In [13]:
!./three

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.251 ms
Effective bandwidth: 66.8415 GB/s


In [17]:
!nvcc -arch=sm_75 four.cu -o four -O3 -lineinfo

In [18]:
!./four

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.267 ms
Effective bandwidth: 62.836 GB/s


In [19]:
# Download the CUDA 12.1.0 runfile installer into a fresh temporary directory,
# run it via sudo with the --silent --toolkit flags, then delete the installer.
# `set -x` echoes each command as it runs; the `&&` chain stops at the first failure.
! set -x \
&& cd $(mktemp -d) \
&& wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run \
&& sudo sh cuda_12.1.0_530.30.02_linux.run --silent --toolkit \
&& rm cuda_12.1.0_530.30.02_linux.run

++ mktemp -d
+ cd /tmp/tmp.qNtcgCaA8h
+ wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run
--2025-06-12 10:00:47--  https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run
Resolving developer.download.nvidia.com (developer.download.nvidia.com)... 23.213.43.207, 23.213.43.199
Connecting to developer.download.nvidia.com (developer.download.nvidia.com)|23.213.43.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4245586997 (4.0G) [application/octet-stream]
Saving to: ‘cuda_12.1.0_530.30.02_linux.run’


2025-06-12 10:01:13 (160 MB/s) - ‘cuda_12.1.0_530.30.02_linux.run’ saved [4245586997/4245586997]

+ sudo sh cuda_12.1.0_530.30.02_linux.run --silent --toolkit
+ rm cuda_12.1.0_530.30.02_linux.run


In [20]:
# Append /usr/local/cuda/bin/ to this process's PATH so executables in that
# directory can be resolved by name in later cells.
import os
os.environ['PATH'] = os.environ['PATH'] + ':/usr/local/cuda/bin/'

In [21]:
!ncu -o reduction_report_no ./one

==PROF== Connected to process 15242 (/content/one)
==PROF== Profiling "reduce4(int *, int *)" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 519.042 ms
Effective bandwidth: 0.0323234 GB/s
==PROF== Disconnected from process 15242
==PROF== Report: /content/reduction_report_no.ncu-rep


In [22]:
!ncu -o reduction_report_o1 ./two

==PROF== Connected to process 15430 (/content/two)
==PROF== Profiling "reduce4(int *, int *)" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 306.407 ms
Effective bandwidth: 0.0547547 GB/s
==PROF== Disconnected from process 15430
==PROF== Report: /content/reduction_report_o1.ncu-rep


In [23]:
!ncu -o reduction_report_o2 ./three

==PROF== Connected to process 15562 (/content/three)
==PROF== Profiling "reduce4(int *, int *)" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 306.277 ms
Effective bandwidth: 0.0547779 GB/s
==PROF== Disconnected from process 15562
==PROF== Report: /content/reduction_report_o2.ncu-rep


In [24]:
!ncu -o reduction_report_o3 ./four

==PROF== Connected to process 15688 (/content/four)
==PROF== Profiling "reduce4(int *, int *)" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 311.224 ms
Effective bandwidth: 0.0539072 GB/s
==PROF== Disconnected from process 15688
==PROF== Report: /content/reduction_report_o3.ncu-rep


In [25]:
!nsys profile ./one

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 1.503 ms
Effective bandwidth: 11.1625 GB/s
Generating '/tmp/nsys-report-18d1.qdstrm'
Generated:
    /content/report1.nsys-rep


In [26]:
!nsys stats report1.nsys-rep

Generating SQLite file report1.sqlite from report1.nsys-rep
Processing [report1.sqlite] with [/usr/local/cuda-12.1/nsight-systems-2023.1.2/host-linux-x64/reports/nvtx_sum.py]... 
SKIPPED: report1.sqlite does not contain NV Tools Extension (NVTX) data.

Processing [report1.sqlite] with [/usr/local/cuda-12.1/nsight-systems-2023.1.2/host-linux-x64/reports/osrt_sum.py]... 

 ** OS Runtime Summary (osrt_sum):

 Time (%)  Total Time (ns)  Num Calls    Avg (ns)       Med (ns)     Min (ns)    Max (ns)     StdDev (ns)            Name         
 --------  ---------------  ---------  -------------  -------------  ---------  -----------  -------------  ----------------------
     43.8      555,701,642          2  277,850,821.0  277,850,821.0  2,099,304  553,602,338  389,971,535.2  sem_wait              
     42.1      534,373,948         14   38,169,567.7    2,851,975.5      2,444  333,351,323   89,351,036.7  poll                  
     13.0      165,128,288        536      308,075.2       12,929.0

In [27]:
!nvprof ./one

==16898== NVPROF is profiling process 16898, command: ./one
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.283 ms
Effective bandwidth: 59.2834 GB/s
==16898== Profiling application: ./one
==16898== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   95.80%  3.4669ms         1  3.4669ms  3.4669ms  3.4669ms  [CUDA memcpy HtoD]
                    4.00%  144.77us         1  144.77us  144.77us  144.77us  reduce4(int*, int*)
                    0.20%  7.3920us         1  7.3920us  7.3920us  7.3920us  [CUDA memcpy DtoH]
      API calls:   97.47%  194.50ms         2  97.248ms  76.351us  194.42ms  cudaMalloc
                    1.86%  3.7016ms         2  1.8508ms  74.133us  3.6275ms  cudaMemcpy
                    0.44%  871.24us         2  435.62us  137.39us  733.85us  cudaFree
                    0.08%  164.22us       114  

In [28]:
!nvprof ./two

==16958== NVPROF is profiling process 16958, command: ./two
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.275 ms
Effective bandwidth: 61.0081 GB/s
==16958== Profiling application: ./two
==16958== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   95.76%  3.4418ms         1  3.4418ms  3.4418ms  3.4418ms  [CUDA memcpy HtoD]
                    4.03%  144.89us         1  144.89us  144.89us  144.89us  reduce4(int*, int*)
                    0.21%  7.3920us         1  7.3920us  7.3920us  7.3920us  [CUDA memcpy DtoH]
      API calls:   97.55%  198.24ms         2  99.119ms  85.607us  198.15ms  cudaMalloc
                    1.81%  3.6849ms         2  1.8424ms  78.844us  3.6060ms  cudaMemcpy
                    0.42%  854.63us         2  427.32us  131.55us  723.08us  cudaFree
                    0.07%  146.61us         1  

In [29]:
!nvprof ./three

==16996== NVPROF is profiling process 16996, command: ./three
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.271 ms
Effective bandwidth: 61.9085 GB/s
==16996== Profiling application: ./three
==16996== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   95.70%  3.3853ms         1  3.3853ms  3.3853ms  3.3853ms  [CUDA memcpy HtoD]
                    4.09%  144.67us         1  144.67us  144.67us  144.67us  reduce4(int*, int*)
                    0.21%  7.3600us         1  7.3600us  7.3600us  7.3600us  [CUDA memcpy DtoH]
      API calls:   97.53%  195.04ms         2  97.520ms  73.811us  194.97ms  cudaMalloc
                    1.83%  3.6539ms         2  1.8269ms  72.007us  3.5819ms  cudaMemcpy
                    0.43%  859.07us         2  429.54us  134.99us  724.09us  cudaFree
                    0.07%  144.85us        

In [30]:
!nvprof ./four

==17040== NVPROF is profiling process 17040, command: ./four
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.282 ms
Effective bandwidth: 59.4937 GB/s
==17040== Profiling application: ./four
==17040== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   95.80%  3.4735ms         1  3.4735ms  3.4735ms  3.4735ms  [CUDA memcpy HtoD]
                    3.99%  144.83us         1  144.83us  144.83us  144.83us  reduce4(int*, int*)
                    0.20%  7.3600us         1  7.3600us  7.3600us  7.3600us  [CUDA memcpy DtoH]
      API calls:   97.67%  210.29ms         2  105.14ms  75.740us  210.21ms  cudaMalloc
                    1.72%  3.7113ms         2  1.8556ms  72.996us  3.6383ms  cudaMemcpy
                    0.40%  855.61us         2  427.80us  138.09us  717.52us  cudaFree
                    0.08%  164.27us       114