In [21]:
%%writefile four.cu

#include <iostream>
#include<cuda_runtime.h>
#include <chrono>
#include <numeric>

// REDUCTION 2 – Sequence Addressing
__global__ void reduce2(int *g_in_data, int *g_out_data){
    extern __shared__ int sdata[];  // stored in the shared memory

    // Each thread loading one element from global onto shared memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_in_data[i];
    __syncthreads();

    // Reduction method -- occurs in shared memory
    for(unsigned int s = blockDim.x/2; s > 0; s >>= 1){
        // check out the reverse loop above
        if (tid < s){   // then, we check threadID to do our computation
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    if (tid == 0){
        g_out_data[blockIdx.x] = sdata[0];
    }
}

// I hope to use this main file for all of the reduction files
int main(){
    int n = 1 << 22; // Increase to about 4M elements
    size_t bytes = n * sizeof(int);

    // Host/CPU arrays
    int *host_input_data = new int[n];
    int *host_output_data = new int[(n + 255) / 256]; // to have sufficient size for output array

    // Device/GPU arrays
    int *dev_input_data, *dev_output_data;

    // Init data
    srand(42); // Fixed seed
    for (int i = 0; i < n; i++){
        host_input_data[i] = rand() % 100;
    }

    // Allocating memory on GPU for device arrays
    cudaMalloc(&dev_input_data, bytes);
    cudaMalloc(&dev_output_data, (n + 255) / 256 * sizeof(int));

    // Copying our data onto the device (GPU)
    cudaMemcpy(dev_input_data, host_input_data, bytes, cudaMemcpyHostToDevice);

    int blockSize = 256; // number of threads per block

    auto start = std::chrono::high_resolution_clock::now(); // start timer

    // Launch Kernel and Synchronize threads
    int num_blocks = (n + blockSize - 1) / blockSize;
    cudaError_t err;
    reduce2<<<num_blocks, blockSize, blockSize * sizeof(int)>>>(dev_input_data, dev_output_data);
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
    }
    cudaDeviceSynchronize();

    auto stop = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count() / 1000.0; // duration in milliseconds with three decimal points

    // Copying data back to the host (CPU)
    cudaMemcpy(host_output_data, dev_output_data, (n + 255) / 256 * sizeof(int), cudaMemcpyDeviceToHost);

    // Final reduction on the host
    int finalResult = host_output_data[0];
    for (int i = 1; i < (n + 255) / 256; ++i) {
        finalResult += host_output_data[i];
    }

    // CPU Summation for verification
    int cpuResult = std::accumulate(host_input_data, host_input_data + n, 0);
    if (cpuResult == finalResult) {
        std::cout << "\033[32m"; // Set text color to green
        std::cout << "Verification successful: GPU result matches CPU result.\n";
        std::cout << "GPU Result: " << finalResult << ", CPU Result: " << cpuResult << std::endl;
    } else {
        std::cout << "\033[31m"; // Set text color to red
        std::cout << "Verification failed: GPU result (" << finalResult << ") does not match CPU result (" << cpuResult << ").\n";
        std::cout << "GPU Result: " << finalResult << ", CPU Result: " << cpuResult << std::endl;
    }
    std::cout << "\033[0m"; // Reset text color to default

    double bandwidth = (duration > 0) ? (bytes / duration / 1e6) : 0; // computed in GB/s, handling zero duration
    std::cout << "Reduced result: " << finalResult << std::endl;
    std::cout << "Time elapsed: " << duration << " ms" << std::endl;
    std::cout << "Effective bandwidth: " << bandwidth << " GB/s" << std::endl;

    // Freeing memory
    cudaFree(dev_input_data);
    cudaFree(dev_output_data);
    delete[] host_input_data;
    delete[] host_output_data;
}

Writing four.cu


In [4]:
!nvcc -arch=sm_75 one.cu -o one

In [5]:
!./one

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.529 ms
Effective bandwidth: 31.715 GB/s


In [19]:
!nvcc -arch=sm_75 two.cu -o two -O1 -lineinfo

In [20]:
!./two

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.521 ms
Effective bandwidth: 32.202 GB/s


In [12]:
!nvcc -arch=sm_75 three.cu -o three -O2 -lineinfo

In [13]:
!./three

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.513 ms
Effective bandwidth: 32.7041 GB/s


In [24]:
!nvcc -arch=sm_75 four.cu -o four -O3 -lineinfo

In [25]:
!./four

[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.568 ms
Effective bandwidth: 29.5374 GB/s


In [26]:
! set -x \
&& cd $(mktemp -d) \
&& wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run \
&& sudo sh cuda_12.1.0_530.30.02_linux.run --silent --toolkit \
&& rm cuda_12.1.0_530.30.02_linux.run

++ mktemp -d
+ cd /tmp/tmp.vKkHq9sFRK
+ wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run
--2025-06-06 06:07:37--  https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run
Resolving developer.download.nvidia.com (developer.download.nvidia.com)... 23.52.40.50, 23.52.40.64
Connecting to developer.download.nvidia.com (developer.download.nvidia.com)|23.52.40.50|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4245586997 (4.0G) [application/octet-stream]
Saving to: ‘cuda_12.1.0_530.30.02_linux.run’


2025-06-06 06:08:03 (158 MB/s) - ‘cuda_12.1.0_530.30.02_linux.run’ saved [4245586997/4245586997]

+ sudo sh cuda_12.1.0_530.30.02_linux.run --silent --toolkit
+ rm cuda_12.1.0_530.30.02_linux.run


In [27]:
import os
os.environ['PATH'] = os.environ['PATH'] + ':/usr/local/cuda/bin/'

In [28]:
!ncu --version

NVIDIA (R) Nsight Compute Command Line Profiler
Copyright (c) 2018-2024 NVIDIA Corporation
Version 2024.2.1.0 (build 34372528) (public-release)


In [30]:
!ncu -o reduction_report_no ./one

==PROF== Connected to process 5942 (/content/one)
==PROF== Profiling "reduce2(int *, int *)" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 300.084 ms
Effective bandwidth: 0.0559084 GB/s
==PROF== Disconnected from process 5942
==PROF== Report: /content/reduction_report_no.ncu-rep


In [31]:
!ncu -o reduction_report_o1 ./two

==PROF== Connected to process 6092 (/content/two)
==PROF== Profiling "reduce2(int *, int *)" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 297.156 ms
Effective bandwidth: 0.0564593 GB/s
==PROF== Disconnected from process 6092
==PROF== Report: /content/reduction_report_o1.ncu-rep


In [32]:
!ncu -o reduction_report_o2 ./three

==PROF== Connected to process 6204 (/content/three)
==PROF== Profiling "reduce2(int *, int *)" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 302.64 ms
Effective bandwidth: 0.0554362 GB/s
==PROF== Disconnected from process 6204
==PROF== Report: /content/reduction_report_o2.ncu-rep


In [33]:
!ncu -o reduction_report_o3 ./three

==PROF== Connected to process 6294 (/content/three)
==PROF== Profiling "reduce2(int *, int *)" - 0: 0%....50%....100% - 9 passes
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 301.558 ms
Effective bandwidth: 0.0556351 GB/s
==PROF== Disconnected from process 6294
==PROF== Report: /content/reduction_report_o3.ncu-rep


In [34]:
!nvprof --version

nvprof: NVIDIA (R) Cuda command line profiler
Copyright (c) 2012 - 2023 NVIDIA Corporation
Release version 12.1.55 (21)


In [35]:
!nvprof ./one

==6900== NVPROF is profiling process 6900, command: ./one
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 1.729 ms
Effective bandwidth: 9.70342 GB/s
==6900== Profiling application: ./one
==6900== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   89.44%  3.5865ms         1  3.5865ms  3.5865ms  3.5865ms  [CUDA memcpy HtoD]
                   10.38%  416.06us         1  416.06us  416.06us  416.06us  reduce2(int*, int*)
                    0.19%  7.5190us         1  7.5190us  7.5190us  7.5190us  [CUDA memcpy DtoH]
      API calls:   96.74%  196.17ms         2  98.083ms  79.746us  196.09ms  cudaMalloc
                    1.89%  3.8327ms         2  1.9164ms  76.542us  3.7562ms  cudaMemcpy
                    0.64%  1.3061ms         1  1.3061ms  1.3061ms  1.3061ms  cudaLaunchKernel
                    0.43%  863.83us        

In [36]:
!nvprof --print-gpu-trace ./one

==7079== NVPROF is profiling process 7079, command: ./one
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.562 ms
Effective bandwidth: 29.8527 GB/s
==7079== Profiling application: ./one
==7079== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
333.89ms  3.5764ms                    -               -         -         -         -  16.000MB  4.3689GB/s    Pageable      Device     Tesla T4 (0)         1         7  [CUDA memcpy HtoD]
337.54ms  416.28us          (16384 1 1)       (256 1 1)        16        0B  1.0000KB         -           -           -           -     Tesla T4 (0)         1         7  reduce2(int*, int*) [128]
337.97ms  7.5520us                    -               -         -         -         -  64.000KB  8.0820GB/s      Device    P

In [37]:
!nvprof ./two

==7138== NVPROF is profiling process 7138, command: ./two
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.566 ms
Effective bandwidth: 29.6417 GB/s
==7138== Profiling application: ./two
==7138== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   89.15%  3.4828ms         1  3.4828ms  3.4828ms  3.4828ms  [CUDA memcpy HtoD]
                   10.66%  416.25us         1  416.25us  416.25us  416.25us  reduce2(int*, int*)
                    0.19%  7.5840us         1  7.5840us  7.5840us  7.5840us  [CUDA memcpy DtoH]
      API calls:   97.37%  196.53ms         2  98.266ms  74.663us  196.46ms  cudaMalloc
                    1.85%  3.7301ms         2  1.8650ms  83.383us  3.6467ms  cudaMemcpy
                    0.42%  857.15us         2  428.58us  124.55us  732.61us  cudaFree
                    0.21%  416.80us         1  416.

In [42]:
!nvprof --print-gpu-trace ./two

==7457== NVPROF is profiling process 7457, command: ./two
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.55 ms
Effective bandwidth: 30.504 GB/s
==7457== Profiling application: ./two
==7457== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
327.37ms  3.4773ms                    -               -         -         -         -  16.000MB  4.4934GB/s    Pageable      Device     Tesla T4 (0)         1         7  [CUDA memcpy HtoD]
330.90ms  416.28us          (16384 1 1)       (256 1 1)        16        0B  1.0000KB         -           -           -           -     Tesla T4 (0)         1         7  reduce2(int*, int*) [128]
331.35ms  7.6480us                    -               -         -         -         -  64.000KB  7.9805GB/s      Device    Pag

In [43]:
!nvprof ./three

==7488== NVPROF is profiling process 7488, command: ./three
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.584 ms
Effective bandwidth: 28.7281 GB/s
==7488== Profiling application: ./three
==7488== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   89.39%  3.5683ms         1  3.5683ms  3.5683ms  3.5683ms  [CUDA memcpy HtoD]
                   10.42%  416.09us         1  416.09us  416.09us  416.09us  reduce2(int*, int*)
                    0.19%  7.5520us         1  7.5520us  7.5520us  7.5520us  [CUDA memcpy DtoH]
      API calls:   97.41%  209.16ms         2  104.58ms  122.82us  209.04ms  cudaMalloc
                    1.79%  3.8352ms         2  1.9176ms  93.196us  3.7420ms  cudaMemcpy
                    0.46%  992.63us         2  496.31us  198.29us  794.33us  cudaFree
                    0.19%  417.01us         1  

In [40]:
!nvprof --print-gpu-trace ./three

==7349== NVPROF is profiling process 7349, command: ./three
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.559 ms
Effective bandwidth: 30.0129 GB/s
==7349== Profiling application: ./three
==7349== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
335.16ms  3.5946ms                    -               -         -         -         -  16.000MB  4.3468GB/s    Pageable      Device     Tesla T4 (0)         1         7  [CUDA memcpy HtoD]
338.82ms  416.09us          (16384 1 1)       (256 1 1)        16        0B  1.0000KB         -           -           -           -     Tesla T4 (0)         1         7  reduce2(int*, int*) [128]
339.26ms  7.5190us                    -               -         -         -         -  64.000KB  8.1175GB/s      Device 

In [44]:
!nvprof ./four

==7578== NVPROF is profiling process 7578, command: ./four
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.565 ms
Effective bandwidth: 29.6942 GB/s
==7578== Profiling application: ./four
==7578== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   89.14%  3.4796ms         1  3.4796ms  3.4796ms  3.4796ms  [CUDA memcpy HtoD]
                   10.67%  416.38us         1  416.38us  416.38us  416.38us  reduce2(int*, int*)
                    0.19%  7.5840us         1  7.5840us  7.5840us  7.5840us  [CUDA memcpy DtoH]
      API calls:   97.49%  208.05ms         2  104.03ms  79.381us  207.98ms  cudaMalloc
                    1.75%  3.7357ms         2  1.8679ms  77.613us  3.6581ms  cudaMemcpy
                    0.41%  874.20us         2  437.10us  147.63us  726.57us  cudaFree
                    0.20%  417.08us         1  41

In [45]:
!nvprof --print-gpu-trace ./four

==7630== NVPROF is profiling process 7630, command: ./four
[32mVerification successful: GPU result matches CPU result.
GPU Result: 207451054, CPU Result: 207451054
[0mReduced result: 207451054
Time elapsed: 0.584 ms
Effective bandwidth: 28.7281 GB/s
==7630== Profiling application: ./four
==7630== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
373.85ms  4.2482ms                    -               -         -         -         -  16.000MB  3.6780GB/s    Pageable      Device     Tesla T4 (0)         1         7  [CUDA memcpy HtoD]
378.19ms  416.03us          (16384 1 1)       (256 1 1)        16        0B  1.0000KB         -           -           -           -     Tesla T4 (0)         1         7  reduce2(int*, int*) [128]
378.63ms  7.5190us                    -               -         -         -         -  64.000KB  8.1175GB/s      Device   