In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0


In [2]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-uqyih79t
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-uqyih79t
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4304 sha256=5ad9dfe85545783b5febbe0b7ced2cc183359455d6a44e197bd2c89eea11835c
  Stored in directory: /tmp/pip-ephem-wheel-cache-54s2pww3/wheels/f3/08/cc/e2b5b0e1c92df07dbb50a6f024a68ce090f5e7b2316b41756d
Successfully built NVCCPlugin
Installing collecte

In [3]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [5]:
%%cu
//Global reduction kernel using more than one warp
#include < iostream>    // cout, endl
#include < numeric>     // accumulate
#include < algorithm>   // iota, fill

#define WARPSIZE (32)
#define SDIV(x,y)(((x)+(y)-1)/(y))

using value_t = unsigned int;

constexpr size_t NUM_ELEMENTS = 1UL<<28;
constexpr size_t BLOCK_SIZE = 512;

template <typename T>
__device__
T my_warp_shfl_down(T var, unsigned int delta) {
    return __shfl_down_sync(0xFFFFFFFF, var, delta, 32);
}

///////////////////////////////////////////////////////////////////////////////
// FINISHED KERNEL (you dont have to change anything)
///////////////////////////////////////////////////////////////////////////////

__global__
void global_reduction_kernel(const value_t * input, value_t * output, size_t inputSize)
{
    const size_t thid = blockDim.x*blockIdx.x + threadIdx.x;

    value_t warpAccum = value_t(0); // here we store the warp result

    // store entries in registers
    if (thid < inputSize)
        warpAccum = input[thid];

    // reduce all values within a warp
    for (size_t offset = WARPSIZE/2; offset>0; offset/=2)
        warpAccum += my_warp_shfl_down(warpAccum, offset);

    // first thread of every warp adds to global result
    if (threadIdx.x % 32 == 0) atomicAdd(output, warpAccum);
}

///////////////////////////////////////////////////////////////////////////////
// STUDENTS PART (fill in the gaps)
///////////////////////////////////////////////////////////////////////////////

__global__
void shared_reduction_kernel(const value_t * input, value_t * output, size_t inputSize)
{
    __shared__ value_t blockAccum; // here we store the block result

    const size_t thid = blockDim.x*blockIdx.x + threadIdx.x;

    // TODO: initialize block result
    

    value_t warpAccum = value_t(0); // here we store the warp result

    // store entries in registers
    if (thid < inputSize)
        warpAccum = input[thid];

    // reduce all values within a warp
    for (size_t offset = WARPSIZE/2; offset>0; offset/=2)
        warpAccum += my_warp_shfl_down(warpAccum, offset);

    // TODO: first thread of every warp adds to block result
  
 
    // TODO: first thread of every block adds to global result
 
}

///////////////////////////////////////////////////////////////////////////////
// MAIN PROGRAM
///////////////////////////////////////////////////////////////////////////////

int main () {

    // choose GPU 0 (GTX 1080 (Pascal) 8GB RAM)
    // or GPU 1 (Titan X (Maxwell) 12GB RAM)
    cudaSetDevice(0);                                                     

    // pointers to host arrays
    value_t *h_array = nullptr;
    value_t *h_sum = nullptr;
    // pointers to device arrays
    value_t *d_array = nullptr;
    value_t *d_sum = nullptr;

    const size_t arraySize = sizeof(value_t)*NUM_ELEMENTS;

    // allocate memory
    cudaMallocHost(&h_array, arraySize);                                      
    cudaMallocHost(&h_sum, sizeof(value_t));                                
    cudaMalloc(&d_array, arraySize);                                          
    cudaMalloc(&d_sum, sizeof(value_t));                                     

    std::fill(h_array, h_array+NUM_ELEMENTS, 1);

    h_sum[0] = std::accumulate(h_array, h_array+NUM_ELEMENTS, 0);
 

    // WARNING: this computes incorrect results!
     h_sum[0] = 0;
    for (size_t i = 0; i < NUM_ELEMENTS; i++)
        h_sum[0] += h_array[i];
    std::cout << '\n';

    cudaMemcpy(d_array, h_array, arraySize, cudaMemcpyHostToDevice);      
    cudaMemset(d_sum, 0, sizeof(value_t));                                
 
    // invoke the kernel
    global_reduction_kernel<<< SDIV(NUM_ELEMENTS, BLOCK_SIZE), BLOCK_SIZE >>>
        (d_array, d_sum, NUM_ELEMENTS);                                  
  
    // reset d_sum
    cudaMemset(d_sum, 0, sizeof(value_t));                               

    // invoke the kernel
    shared_reduction_kernel<<< SDIV(NUM_ELEMENTS, BLOCK_SIZE), BLOCK_SIZE >>>
        (d_array, d_sum, NUM_ELEMENTS);                                   
 
     cudaMemcpy(h_sum, d_sum, sizeof(value_t), cudaMemcpyDeviceToHost);    
 
    std::cout << '\n';

    // check if result computed correctly by CUDA
    bool no_errors = true;
    value_t truth = value_t(NUM_ELEMENTS);
    if (h_sum[0] != truth) {
        std::cout << "error: got " << h_sum[0] << " but expected " << truth << '\n';
        no_errors = false;
    }

    // free allocated memory
    cudaFree(d_array);
    cudaFree(d_sum);
    cudaFreeHost(h_array);
    cudaFreeHost(h_sum);

    if(no_errors)
        std::cout << "CUDA programming is fun!\n";
}



error: got 0 but expected 268435456

